add class DatasetArranger #215

Merged · 12 commits · Mar 5, 2024
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -48,7 +48,7 @@ namespaces = true # enable data directory to be identified

 [tool.pytest.ini_options]
 minversion = "6.0"
-addopts = "-ra -n auto" # -ra: show summary info for all test outcomes; -n auto: run tests in parallel
+addopts = "-ra -n 1" # -ra: show summary info for all test outcomes; -n 1: run tests on a single worker (serially)
 testpaths = ["tests"]

 [tool.coverage.run]
452 changes: 452 additions & 0 deletions src/nplinker/arranger.py

Large diffs are not rendered by default.
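The 452-line arranger module is not rendered, but from its usage in nplinker.py below (no constructor arguments, a single arrange() entry point, settings taken from the global config) a minimal sketch might look like the following. Everything other than the class name and arrange() is an illustrative assumption, not the actual implementation:

    class DatasetArranger:
        """Sketch only: put every part of a dataset in place before loading.

        Constructed with no arguments and driven by the global config
        (mode, root_dir, ...), mirroring how nplinker.py uses it below.
        """

        def arrange(self) -> None:
            """Download, generate, or validate each data source in turn."""
            # Hypothetical per-source steps; these method names are
            # illustrative assumptions, not the actual implementation.
            self._arrange_strain_mappings()
            self._arrange_gnps()
            self._arrange_antismash()
            self._arrange_bigscape()

        def _arrange_strain_mappings(self) -> None: ...

        def _arrange_gnps(self) -> None: ...

        def _arrange_antismash(self) -> None: ...

        def _arrange_bigscape(self) -> None: ...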

368 changes: 25 additions & 343 deletions src/nplinker/loader.py

Large diffs are not rendered by default.

67 changes: 11 additions & 56 deletions src/nplinker/nplinker.py
@@ -2,6 +2,7 @@
 import logging
 import sys
 from typing import TYPE_CHECKING
+from .arranger import DatasetArranger
 from .config import config
 from .genomics import BGC
 from .genomics import GCF

@@ -38,12 +39,12 @@ class NPLinker:
     def __init__(self):
         """Initialise an NPLinker instance."""
         # configure logging based on the supplied config params
-        LogConfig.setLogLevelStr(config.loglevel)
-        logfile = config.get("logfile")
+        LogConfig.setLogLevelStr(config.log.level)
+        logfile = config.get("log.file")
         if logfile:
             logfile_dest = logging.FileHandler(logfile)
             # if we want to log to stdout plus logfile, add the new destination
-            if config.get("log_to_stdout"):  # default to True
+            if config.get("log.to_stdout"):  # default to True
                 LogConfig.addLogDestination(logfile_dest)
             else:
                 # otherwise overwrite the default stdout destination
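Aside: this hunk switches the flat config keys (loglevel, logfile, log_to_stdout) to nested ones read in two styles: attribute chains (config.log.level) and dotted-key get() with an implicit default (config.get("log.file")). A minimal sketch of a mapping supporting both styles, purely for illustration (not NPLinker's actual config class):

    class DotConfig(dict):
        """Illustrative stand-in for a nested config object."""

        def __getattr__(self, name):
            # Attribute access: config.log.level
            try:
                value = self[name]
            except KeyError as e:
                raise AttributeError(name) from e
            return DotConfig(value) if isinstance(value, dict) else value

        def get(self, dotted_key, default=None):
            # Dotted lookup: config.get("log.to_stdout")
            node = self
            for part in dotted_key.split("."):
                if not isinstance(node, dict) or part not in node:
                    return default
                node = node[part]
            return node

    config = DotConfig({"log": {"level": "DEBUG", "file": "", "to_stdout": True}})
    assert config.log.level == "DEBUG"
    assert config.get("log.to_stdout") is True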

@@ -124,22 +125,7 @@ def root_dir(self):
         Returns:
             str: the path to the dataset root directory currently in use
         """
-        return self._loader._root
-
-    @property
-    def dataset_id(self):
-        """Returns dataset "ID".
-
-        For local datasets this will just be the last component of the directory path,
-        e.g. /path/to/my_dataset would produce an ID of "my_dataset".
-
-        For datasets loaded from the Paired Omics platform the ID will be the platform
-        project ID, e.g. "MSV000079284"
-
-        Returns:
-            str: the dataset ID
-        """
-        return self._loader.dataset_id
+        return config.root_dir

     @property
     def data_dir(self):

@@ -149,30 +135,13 @@ def data_dir(self):

     @property
     def bigscape_cutoff(self):
         """Returns the current BiGSCAPE clustering cutoff value."""
-        return self._loader._bigscape_cutoff
+        return config.bigscape.cutoff

-    def load_data(self, new_bigscape_cutoff=None):
-        """Loads the basic components of a dataset.
-
-        This method is responsible for loading the various pieces of the supplied dataset into
-        memory and doing any initial parsing/object manipulation required. After it completes,
-        applications can access the lists of GCFs, Spectra, MolecularFamilies and strains
-        using the corresponding properties of the NPLinker class.
-
-        Returns:
-            bool: True if successful, False otherwise
-        """
-        logger.debug("load_data(new_bigscape_cutoff=%s)", new_bigscape_cutoff)
-        if new_bigscape_cutoff is None:
-            self._loader.validate()
-            self._loader.generate_strain_mappings()
-            if not self._loader.load():
-                return False
-        else:
-            # CG: only reload genomics data when changing bigscape cutoff
-            self._loader._bigscape_cutoff = new_bigscape_cutoff
-            # TODO: only need to reload gcfs using load_gcfs()
-            self._loader._load_genomics()
+    def load_data(self):
+        """Loads the basic components of a dataset."""
+        arranger = DatasetArranger()
+        arranger.arrange()
+        self._loader.load()

         self._spectra = self._loader.spectra
         self._molfams = self._loader.molfams

@@ -184,20 +153,6 @@ def load_data(self):
         self._chem_classes = self._loader.chem_classes
         self._class_matches = self._loader.class_matches

-        logger.debug("Generating lookup tables: genomics")
-        self._bgc_lookup = {bgc.bgc_id: bgc for bgc in self._bgcs}
-        self._gcf_lookup = {gcf.gcf_id: gcf for gcf in self._gcfs}
-
-        # don't need to do these two if cutoff changed (indicating genomics data
-        # was reloaded but not metabolomics)
-        if new_bigscape_cutoff is None:
-            logger.debug("Generating lookup tables: metabolomics")
-            self._spec_lookup = {spec.spectrum_id: spec for spec in self._spectra}
-            self._mf_lookup = {mf.family_id: mf for mf in self._molfams}
-
-        logger.debug("load_data: completed")
-        return True

     # TODO CG: refactor this method and update its unit tests
     def get_links(self, input_objects, scoring_methods, and_mode=True):
         """Find links for a set of input objects (BGCs/GCFs/Spectra/MolFams).
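Taken together, the nplinker.py changes reduce dataset preparation to a two-step arrange-then-load pipeline driven entirely by the global config: the old new_bigscape_cutoff reload path and the bool return value are gone, and load_data() now always arranges the dataset files first, then loads them. A hypothetical caller after this PR (the import path and the property comment are assumptions for illustration):

    from nplinker import NPLinker  # assumed public import path

    npl = NPLinker()   # picks up log settings and paths from the global config
    npl.load_data()    # DatasetArranger().arrange(), then self._loader.load()

    # The loaded collections (_spectra, _molfams, _bgcs, _gcfs, ...) populated
    # above are then available behind whatever public properties NPLinker exposes.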
180 changes: 0 additions & 180 deletions src/nplinker/pairedomics/downloader.py

This file was deleted.

56 changes: 2 additions & 54 deletions src/nplinker/pairedomics/runbigscape.py
@@ -1,17 +1,3 @@
-# Copyright 2021 The NPLinker Authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
 import os
 import subprocess
 import sys
@@ -21,7 +7,19 @@

 logger = LogConfig.getLogger(__name__)

+# NOTE: for simplicity this is currently written with assumption it will only be
+# called in context of nplinker Docker image, where bigscape should be available
+PFAM_PATH = os.path.join(sys.prefix, "nplinker_lib")
+
+
 def run_bigscape(
-    bigscape_py_path: str | PathLike,
     antismash_path: str | PathLike,
     output_path: str | PathLike,
-    pfam_path: str | PathLike,
     extra_params: str,
 ):
+    bigscape_py_path = "bigscape.py"
     logger.info(
         f'run_bigscape: input="{antismash_path}", output="{output_path}", extra_params={extra_params}"'
     )

     if os.path.exists(os.path.join(output_path, "completed")):
         logger.info("BiG-SCAPE appears to have been run already, skipping!")
         logger.info("To force re-run, delete {%s}", os.path.join(output_path, "completed"))
         return True

     try:
         subprocess.run([bigscape_py_path, "-h"], capture_output=True, check=True)
     except Exception as e:
@@ -51,7 +29,7 @@ def run_bigscape(
         raise Exception(f'antismash_path "{antismash_path}" does not exist!')

     # configure the IO-related parameters, including pfam_dir
-    args = [bigscape_py_path, "-i", antismash_path, "-o", output_path, "--pfam_dir", pfam_path]
+    args = [bigscape_py_path, "-i", antismash_path, "-o", output_path, "--pfam_dir", PFAM_PATH]

     # append the user supplied params, if any
     if len(extra_params) > 0:
@@ -65,34 +43,4 @@
     # which will indicate to the PODPDownloader module that something went wrong.
     result.check_returncode()

-    # use presence of this file as a quick way to check if a previous run
-    # finished or not
-    with open(os.path.join(output_path, "completed"), "w") as f:
-        f.close()
-
     return True
-
-
-def podp_run_bigscape(
-    project_file_cache: str | PathLike,
-    PFAM_PATH: str | PathLike,
-    do_bigscape: bool,
-    extra_bigscape_parameters,
-):
-    # TODO this currently assumes docker environment, allow customisation?
-    # can check if in container with: https://stackoverflow.com/questions/20010199/how-to-determine-if-a-process-runs-inside-lxc-docker
-    if not do_bigscape:
-        logger.info("BiG-SCAPE disabled by configuration, not running it")
-        return
-
-    logger.info('Running BiG-SCAPE! extra_bigscape_parameters="%s"', extra_bigscape_parameters)
-    try:
-        run_bigscape(
-            "bigscape.py",
-            os.path.join(project_file_cache, "antismash"),
-            os.path.join(project_file_cache, "bigscape"),
-            PFAM_PATH,
-            extra_bigscape_parameters,
-        )
-    except Exception as e:
-        logger.warning('Failed to run BiG-SCAPE on antismash data, error was "%s"', e)
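run_bigscape() is now the module's only entry point: podp_run_bigscape() is deleted, the bigscape.py path is hard-coded, and the Pfam directory comes from the module-level PFAM_PATH. A hypothetical call after this PR (the paths and the extra flag are illustrative, not taken from the diff):

    from nplinker.pairedomics.runbigscape import run_bigscape

    # Only the antiSMASH input dir, the output dir and extra CLI params remain;
    # bigscape.py and PFAM_PATH are resolved inside the module (per the Docker
    # image assumption noted in the diff above).
    run_bigscape(
        antismash_path="/data/my_project/antismash",  # illustrative path
        output_path="/data/my_project/bigscape",      # illustrative path
        extra_params="--mibig",                       # illustrative BiG-SCAPE flag
    )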