diff --git a/pyproject.toml b/pyproject.toml
index 25b6cf08..28cd90b9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -48,7 +48,7 @@ namespaces = true # enable data directory to be identified
 
 [tool.pytest.ini_options]
 minversion = "6.0"
-addopts = "-ra -n auto" # -ra: show summary info for all test outcomes; -n auto: run tests in parallel
+addopts = "-ra -n 1" # -ra: show summary info for all test outcomes; -n 1: run tests sequentially
 testpaths = ["tests"]
 
 [tool.coverage.run]
diff --git a/src/nplinker/arranger.py b/src/nplinker/arranger.py
new file mode 100644
index 00000000..cb3091a5
--- /dev/null
+++ b/src/nplinker/arranger.py
@@ -0,0 +1,452 @@
+import fnmatch
+import json
+import shutil
+from glob import glob
+from pathlib import Path
+from jsonschema import validate
+import nplinker.globals as globals
+from nplinker.config import config
+from nplinker.genomics import generate_mappings_genome_id_bgc_id
+from nplinker.genomics.mibig import download_and_extract_mibig_metadata
+from nplinker.globals import GENOME_BGC_MAPPINGS_FILENAME
+from nplinker.globals import GENOME_STATUS_FILENAME
+from nplinker.globals import STRAIN_MAPPINGS_FILENAME
+from nplinker.metabolomics.gnps import GNPSDownloader
+from nplinker.metabolomics.gnps import GNPSExtractor
+from nplinker.pairedomics import podp_download_and_extract_antismash_data
+from nplinker.pairedomics.runbigscape import run_bigscape
+from nplinker.pairedomics.strain_mappings_generator import podp_generate_strain_mappings
+from nplinker.schemas import STRAIN_MAPPINGS_SCHEMA
+from nplinker.schemas import USER_STRAINS_SCHEMA
+from nplinker.schemas import validate_podp_json
+from nplinker.utils import download_url
+from nplinker.utils import list_dirs
+from nplinker.utils import list_files
+
+
+PODP_PROJECT_URL = "https://pairedomicsdata.bioinformatics.nl/api/projects/{}"
+
+
+class DatasetArranger:
+    def __init__(self) -> None:
+        """Arrange the dataset required by NPLinker.
+
+        This class is used to arrange the datasets required by NPLinker according to the
+        configuration. The datasets include MIBiG, GNPS, antiSMASH, and BiG-SCAPE.
+
+        If `config.mode` is "local", the datasets are validated.
+        If `config.mode` is "podp", the datasets are downloaded or generated.
+
+        It uses the default downloads directory `globals.DOWNLOADS_DEFAULT_PATH` to store the
+        downloaded files. Default data paths for MIBiG, GNPS, antiSMASH, and BiG-SCAPE are defined
+        in `nplinker.globals`.
+        """
+        # Prepare the downloads directory and/or PODP json file which are required for other methods
+        globals.DOWNLOADS_DEFAULT_PATH.mkdir(exist_ok=True)
+        self.arrange_podp_project_json()
+
+    def arrange(self) -> None:
+        """Arrange the datasets according to the configuration.
+
+        The datasets include MIBiG, GNPS, antiSMASH, and BiG-SCAPE.
+        """
+        # The order of arranging the datasets matters, as some datasets depend on others
+        self.arrange_mibig()
+        self.arrange_gnps()
+        self.arrange_antismash()
+        self.arrange_bigscape()
+        self.arrange_strain_mappings()
+        self.arrange_strains_selected()
+
+    def arrange_podp_project_json(self) -> None:
+        """Arrange the PODP project JSON file.
+
+        If `config.mode` is "podp", download the PODP project JSON file if it doesn't exist. Then
+        validate the PODP project JSON file if it exists or is downloaded.
+
+        The validation is controlled by the json schema `schemas/podp_adapted_schema.json`.
+ """ + if config.mode == "podp": + file_name = f"paired_datarecord_{config.podp_id}.json" + podp_file = globals.DOWNLOADS_DEFAULT_PATH / file_name + if not podp_file.exists(): + download_url( + PODP_PROJECT_URL.format(config.podp_id), + globals.DOWNLOADS_DEFAULT_PATH, + file_name, + ) + + with open(podp_file, "r") as f: + json_data = json.load(f) + validate_podp_json(json_data) + + def arrange_mibig(self) -> None: + """Arrange the MIBiG metadata. + + Always download and extract the MIBiG metadata if `config.mibig.to_use` is True. + If the default directory has already existed, it will be removed and re-downloaded to ensure + the latest version is used. So it's not allowed to manually put MIBiG metadata in the + default directory. + """ + if config.mibig.to_use: + if globals.MIBIG_DEFAULT_PATH.exists(): + # remove existing mibig data + shutil.rmtree(globals.MIBIG_DEFAULT_PATH) + download_and_extract_mibig_metadata( + globals.DOWNLOADS_DEFAULT_PATH, + globals.MIBIG_DEFAULT_PATH, + version=config.mibig.version, + ) + + def arrange_gnps(self) -> None: + """Arrange the GNPS data. + + If `config.mode` is "local", validate the GNPS data directory. + If `config.mode` is "podp", download the GNPS data if it doesn't exist or remove the + existing GNPS data and re-download it if it is invalid. + + The validation process includes: + - Check if the GNPS data directory exists. + - Check if the required files exist in the GNPS data directory, including: + - file_mappings.tsv or file_mappings.csv + - spectra.mgf + - molecular_families.tsv + - annotations.tsv + """ + pass_validation = False + if config.mode == "podp": + # retry downloading at most 3 times if downloaded data has problems + for _ in range(3): + try: + validate_gnps(globals.GNPS_DEFAULT_PATH) + pass_validation = True + break + except (FileNotFoundError, ValueError): + # Don't need to remove downloaded archive, as it'll be overwritten + shutil.rmtree(globals.GNPS_DEFAULT_PATH, ignore_errors=True) + self._download_and_extract_gnps() + + if not pass_validation: + validate_gnps(globals.GNPS_DEFAULT_PATH) + + # get the path to file_mappings file (csv or tsv) + self.gnps_file_mappings_file = self._get_gnps_file_mappings_file() + + def _get_gnps_file_mappings_file(self) -> Path: + """Get the GNPS file mappings file. + + The GNPS file mappings file can be either a TSV file or a CSV file. This method checks if + the TSV file or the CSV file exists in the default GNPS directory. + + Returns: + Path: Path to the GNPS file mappings file. + """ + file_mappings_tsv = globals.GNPS_DEFAULT_PATH / globals.GNPS_FILE_MAPPINGS_TSV + file_mappings_csv = globals.GNPS_DEFAULT_PATH / globals.GNPS_FILE_MAPPINGS_CSV + + gnps_file_mappings_file = ( + file_mappings_tsv if file_mappings_tsv.exists() else file_mappings_csv + ) + + return gnps_file_mappings_file + + def _download_and_extract_gnps(self) -> None: + """Download and extract the GNPS data. + + Get the GNPS task ID from the PODP project JSON file, then download and extract the GNPS + data to the default GNPS directory. + """ + podp_file = globals.DOWNLOADS_DEFAULT_PATH / f"paired_datarecord_{config.podp_id}.json" + with open(podp_file, "r") as f: + podp_json_data = json.load(f) + gnps_task_id = podp_json_data["metabolomics"]["project"].get("molecular_network") + + data_archive = ( + GNPSDownloader(gnps_task_id, globals.DOWNLOADS_DEFAULT_PATH) + .download() + .get_download_file() + ) + GNPSExtractor(data_archive, globals.GNPS_DEFAULT_PATH) + + def arrange_antismash(self) -> None: + """Arrange the antiSMASH data. 
+ + If `config.mode` is "local", validate the antiSMASH data directory. + If `config.mode` is "podp", download the antiSMASH data if it doesn't exist or remove the + existing antiSMASH data and re-download it if it is invalid. + + The validation process includes: + - Check if the antiSMASH data directory exists. + - Check if the antiSMASH data directory contains at least one sub-directory, and each + sub-directory contains at least one BGC file (with the suffix ".region???.gbk" where ??? + is a number). + + AntiSMASH BGC directory must follow the structure below: + antismash + ├── genome_id_1 (one AntiSMASH output, e.g. GCF_000514775.1) + │  ├── GCF_000514775.1.gbk + │  ├── NZ_AZWO01000004.region001.gbk + │  └── ... + ├── genome_id_2 + │  ├── ... + └── ... + """ + pass_validation = False + if config.mode == "podp": + for _ in range(3): + try: + validate_antismash(globals.ANTISMASH_DEFAULT_PATH) + pass_validation = True + break + except FileNotFoundError: + shutil.rmtree(globals.ANTISMASH_DEFAULT_PATH, ignore_errors=True) + self._download_and_extract_antismash() + + if not pass_validation: + validate_antismash(globals.ANTISMASH_DEFAULT_PATH) + + def _download_and_extract_antismash(self) -> None: + """Download and extract the antiSMASH data. + + Get the antiSMASH data from the PODP project JSON file, then download and extract the + antiSMASH data to the default antiSMASH directory. + """ + podp_file = globals.DOWNLOADS_DEFAULT_PATH / f"paired_datarecord_{config.podp_id}.json" + with open(podp_file, "r") as f: + podp_json_data = json.load(f) + podp_download_and_extract_antismash_data( + podp_json_data["genomes"], globals.DOWNLOADS_DEFAULT_PATH, config.root_dir + ) + + def arrange_bigscape(self) -> None: + """Arrange the BiG-SCAPE data. + + If `config.mode` is "local", validate the BiG-SCAPE data directory. + If `config.mode` is "podp", run BiG-SCAPE to generate the clustering file if it doesn't + exist or remove the existing BiG-SCAPE data and re-run BiG-SCAPE if it is invalid. + The running output of BiG-SCAPE will be saved to the directory "bigscape_running_output" + in the default BiG-SCAPE directory, and the clustering file "mix_clustering_c{config.bigscape.cutoff}.tsv" + will be copied to the default BiG-SCAPE directory. + + The validation process includes: + - Check if the default BiG-SCAPE data directory exists. + - Check if the clustering file "mix_clustering_c{config.bigscape.cutoff}.tsv" exists in the + BiG-SCAPE data directory. + """ + pass_validation = False + if config.mode == "podp": + for _ in range(3): + try: + validate_bigscape(globals.BIGSCAPE_DEFAULT_PATH) + pass_validation = True + break + except FileNotFoundError: + shutil.rmtree(globals.BIGSCAPE_DEFAULT_PATH, ignore_errors=True) + self._run_bigscape() + + if not pass_validation: + validate_bigscape(globals.BIGSCAPE_DEFAULT_PATH) + + def _run_bigscape(self) -> None: + """Run BiG-SCAPE to generate the clustering file. + + The running output of BiG-SCAPE will be saved to the `BIGSCAPE_RUNNING_OUTPUT_PATH`. + + The clustering file "mix_clustering_c{config.bigscape.cutoff}.tsv" will be copied to the + default BiG-SCAPE directory. 
+ """ + globals.BIGSCAPE_RUNNING_OUTPUT_PATH.mkdir(exist_ok=True, parents=True) + run_bigscape( + globals.ANTISMASH_DEFAULT_PATH, + globals.BIGSCAPE_RUNNING_OUTPUT_PATH, + config.bigscape.parameters, + ) + for f in glob( + str( + globals.BIGSCAPE_RUNNING_OUTPUT_PATH + / "network_files" + / "*" + / "mix" + / "mix_clustering_c*.tsv" + ) + ): + shutil.copy(f, globals.BIGSCAPE_DEFAULT_PATH) + + def arrange_strain_mappings(self) -> None: + """Arrange the strain mappings file. + + If `config.mode` is "local", validate the strain mappings file. + If `config.mode` is "podp", always generate the strain mappings file and validate it. + + The valiation checks if the strain mappings file exists and if it is a valid JSON file + according to the schema defined in `schemas/strain_mappings_schema.json`. + """ + if config.mode == "podp": + self._generate_strain_mappings() + + self._validate_strain_mappings() + + def _validate_strain_mappings(self) -> None: + """Validate the strain mappings file. + + The validation process includes: + - Check if the strain mappings file exists. + - Check if the strain mappings file is a valid JSON file according to the schema defined in + `schemas/strain_mappings_schema.json`. + + Raises: + FileNotFoundError: If the strain mappings file is not found. + ValidationError: If the strain mappings file is not a valid JSON file according to the + schema. + """ + strain_mappings_file = config.root_dir / STRAIN_MAPPINGS_FILENAME + + if not strain_mappings_file.exists(): + raise FileNotFoundError(f"Strain mappings file not found at {strain_mappings_file}") + + with open(strain_mappings_file, "r") as f: + json_data = json.load(f) + # Will raise ValidationError if the JSON file is invalid + validate(instance=json_data, schema=STRAIN_MAPPINGS_SCHEMA) + + def _generate_strain_mappings(self) -> None: + """Generate the strain mappings file for the PODP mode.""" + podp_json_file = globals.DOWNLOADS_DEFAULT_PATH / f"paired_datarecord_{config.podp_id}.json" + genome_status_json_file = globals.DOWNLOADS_DEFAULT_PATH / GENOME_STATUS_FILENAME + genome_bgc_mappings_file = globals.ANTISMASH_DEFAULT_PATH / GENOME_BGC_MAPPINGS_FILENAME + gnps_file_mapping_file = self.gnps_file_mappings_file + strain_mappings_file = config.root_dir / STRAIN_MAPPINGS_FILENAME + + # generate the genome_bgc_mappings_file + generate_mappings_genome_id_bgc_id(globals.ANTISMASH_DEFAULT_PATH) + # generate the strain_mappings_file + podp_generate_strain_mappings( + podp_json_file, + genome_status_json_file, + genome_bgc_mappings_file, + gnps_file_mapping_file, + strain_mappings_file, + ) + + def arrange_strains_selected(self) -> None: + """Arrange the strains selected file. + + Validate the strains selected file if it exists. + The validation checks if the strains selected file is a valid JSON file according to the + schema defined in `schemas/user_strains.json`. + """ + strains_selected_file = config.root_dir / globals.STRAINS_SELECTED_FILENAME + if strains_selected_file.exists(): + with open(strains_selected_file, "r") as f: + json_data = json.load(f) + validate(instance=json_data, schema=USER_STRAINS_SCHEMA) + + +def validate_gnps(gnps_dir: Path) -> None: + """Validate the GNPS data directory and its contents. + + The GNPS data directory must contain the following files: + - file_mappings.tsv or file_mappings.csv + - spectra.mgf + - molecular_families.tsv + - annotations.tsv + + Args: + gnps_dir (Path): Path to the GNPS data directory. 
+
+    Raises:
+        FileNotFoundError: If the GNPS data directory is not found or any of the required files
+            is not found.
+        ValueError: If both file_mappings.tsv and file_mappings.csv are found.
+    """
+    if not gnps_dir.exists():
+        raise FileNotFoundError(f"GNPS data directory not found at {gnps_dir}")
+
+    file_mappings_tsv = gnps_dir / globals.GNPS_FILE_MAPPINGS_TSV
+    file_mappings_csv = gnps_dir / globals.GNPS_FILE_MAPPINGS_CSV
+    if file_mappings_tsv.exists() and file_mappings_csv.exists():
+        raise ValueError(
+            f"Both {file_mappings_tsv.name} and {file_mappings_csv.name} found in GNPS directory "
+            f"{gnps_dir}, only one is allowed."
+        )
+    elif not file_mappings_tsv.exists() and not file_mappings_csv.exists():
+        raise FileNotFoundError(
+            f"Neither {file_mappings_tsv.name} nor {file_mappings_csv.name} found in GNPS directory"
+            f" {gnps_dir}"
+        )
+
+    required_files = [
+        gnps_dir / globals.GNPS_SPECTRA_FILENAME,
+        gnps_dir / globals.GNPS_MOLECULAR_FAMILY_FILENAME,
+        gnps_dir / globals.GNPS_ANNOTATIONS_FILENAME,
+    ]
+    list_not_found = [f.name for f in required_files if not f.exists()]
+    if list_not_found:
+        raise FileNotFoundError(
+            f"Files not found in GNPS directory {gnps_dir}: {', '.join(list_not_found)}"
+        )
+
+
+def validate_antismash(antismash_dir: Path) -> None:
+    """Validate the antiSMASH data directory and its contents.
+
+    The validation only checks the structure of the antiSMASH data directory and file names.
+    It does not check
+    - the content of the BGC files
+    - the consistency between the antiSMASH data and the PODP project JSON file for the PODP
+      mode
+
+    The antiSMASH data directory must exist and contain at least one sub-directory. The name of the
+    sub-directories must not contain any space. Each sub-directory must contain at least one BGC
+    file (with the suffix ".region???.gbk" where ??? is the region number).
+
+    Args:
+        antismash_dir (Path): Path to the antiSMASH data directory.
+
+    Raises:
+        FileNotFoundError: If the antiSMASH data directory is not found, or no sub-directories
+            are found in the antiSMASH data directory, or no BGC files are found in any
+            sub-directory.
+        ValueError: If any sub-directory name contains a space.
+    """
+    if not antismash_dir.exists():
+        raise FileNotFoundError(f"antiSMASH data directory not found at {antismash_dir}")
+
+    sub_dirs = list_dirs(antismash_dir)
+    if not sub_dirs:
+        raise FileNotFoundError(
+            f"No BGC directories found in antiSMASH data directory {antismash_dir}"
+        )
+
+    for sub_dir in sub_dirs:
+        dir_name = Path(sub_dir).name
+        if " " in dir_name:
+            raise ValueError(
+                f"antiSMASH sub-directory name {dir_name} contains a space, which is not allowed"
+            )
+
+        gbk_files = list_files(sub_dir, suffix=".gbk", keep_parent=False)
+        bgc_files = fnmatch.filter(gbk_files, "*.region???.gbk")
+        if not bgc_files:
+            raise FileNotFoundError(f"No BGC files found in antiSMASH sub-directory {sub_dir}")
+
+
+def validate_bigscape(bigscape_dir: Path) -> None:
+    """Validate the BiG-SCAPE data directory and its contents.
+
+    The BiG-SCAPE data directory must exist and contain the clustering file
+    "mix_clustering_c{config.bigscape.cutoff}.tsv" where {config.bigscape.cutoff} is the
+    bigscape cutoff value set in the config file.
+
+    Args:
+        bigscape_dir (Path): Path to the BiG-SCAPE data directory.
+
+    Raises:
+        FileNotFoundError: If the BiG-SCAPE data directory or the clustering file is not found.
+ """ + if not bigscape_dir.exists(): + raise FileNotFoundError(f"BiG-SCAPE data directory not found at {bigscape_dir}") + + clustering_file = bigscape_dir / f"mix_clustering_c{config.bigscape.cutoff}.tsv" + if not clustering_file.exists(): + raise FileNotFoundError(f"BiG-SCAPE clustering file not found: {clustering_file}") diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py index 25ef8711..cfadc0d7 100644 --- a/src/nplinker/loader.py +++ b/src/nplinker/loader.py @@ -1,23 +1,19 @@ -import glob import os -from pathlib import Path from deprecated import deprecated -from nplinker.class_info.chem_classes import ChemClassPredictions -from nplinker.class_info.class_matches import ClassMatches -from nplinker.class_info.runcanopus import run_canopus +from nplinker import globals from nplinker.config import config from nplinker.genomics import add_bgc_to_gcf from nplinker.genomics import add_strain_to_bgc -from nplinker.genomics import generate_mappings_genome_id_bgc_id from nplinker.genomics import get_mibig_from_gcf from nplinker.genomics.antismash import AntismashBGCLoader from nplinker.genomics.bigscape import BigscapeGCFLoader from nplinker.genomics.mibig import MibigLoader -from nplinker.globals import GENOME_BGC_MAPPINGS_FILENAME -from nplinker.globals import GENOME_STATUS_FILENAME -from nplinker.globals import GNPS_FILE_MAPPINGS_FILENAME -from nplinker.globals import PFAM_PATH +from nplinker.globals import GNPS_ANNOTATIONS_FILENAME +from nplinker.globals import GNPS_DEFAULT_PATH +from nplinker.globals import GNPS_MOLECULAR_FAMILY_FILENAME +from nplinker.globals import GNPS_SPECTRA_FILENAME from nplinker.globals import STRAIN_MAPPINGS_FILENAME +from nplinker.globals import STRAINS_SELECTED_FILENAME from nplinker.logconfig import LogConfig from nplinker.metabolomics import add_annotation_to_spectrum from nplinker.metabolomics import add_spectrum_to_mf @@ -25,9 +21,6 @@ from nplinker.metabolomics.gnps import GNPSAnnotationLoader from nplinker.metabolomics.gnps import GNPSMolecularFamilyLoader from nplinker.metabolomics.gnps import GNPSSpectrumLoader -from nplinker.pairedomics.downloader import PODPDownloader -from nplinker.pairedomics.runbigscape import run_bigscape -from nplinker.pairedomics.strain_mappings_generator import podp_generate_strain_mappings from nplinker.strain_collection import StrainCollection from nplinker.strain_loader import load_user_strains @@ -43,138 +36,24 @@ class DatasetLoader: - ANTISMASH_DELIMITERS_DEFAULT = [".", "_", "-"] - ANTISMASH_IGNORE_SPACES_DEFAULT = False - - TABLES_CUTOFF_DEFAULT = 2.0 - - BIGSCAPE_CUTOFF_DEFAULT = 30 - USE_MIBIG_DEFAULT = True - MIBIG_VERSION_DEFAULT = "3.1" - - RUN_BIGSCAPE_DEFAULT = True - - # https://git.wageningenur.nl/medema-group/BiG-SCAPE/-/wikis/parameters#mibig - # BigScape mibig default version is 3.1 - EXTRA_BIGSCAPE_PARAMS_DEFAULT = "--mibig --clans-off --mix --include_singletons" - RUN_CANOPUS_DEFAULT = False EXTRA_CANOPUS_PARAMS_DEFAULT = "--maxmz 600 formula zodiac structure canopus" - # keys for overriding metabolomics data elements - OR_GNPS_NODES = "gnps_nodes_file" - OR_GNPS_EDGES = "gnps_edges_file" - OR_GNPS_MGF = "gnps_mgf_file" - OR_GNPS_ANNOTATIONS = "gnps_annotations_file" - # and the same for genomics data - OR_ANTISMASH = "antismash_dir" - OR_BIGSCAPE = "bigscape_dir" - OR_MIBIG_JSON = "mibig_json_dir" - OR_STRAINS = "strain_mappings_file" - # misc files - OR_INCLUDE_STRAINS = "include_strains_file" # class predictions OR_CANOPUS = "canopus_dir" OR_MOLNETENHANCER = "molnetenhancer_dir" - 
BIGSCAPE_PRODUCT_TYPES = [ - "NRPS", - "Others", - "PKSI", - "PKS-NRP_Hybrids", - "PKSother", - "RiPPs", - "Saccharides", - "Terpene", - ] - def __init__(self): - # load the config data - self._config_docker = config.get("docker", {}) - self._config_webapp = config.get("webapp", {}) - self._config_antismash = config.get("antismash", {}) - self._config_overrides = config.dataset.get("overrides", {}) - # set private attributes - self._antismash_delimiters = self._config_antismash.get( - "antismash_delimiters", self.ANTISMASH_DELIMITERS_DEFAULT - ) - self._antismash_ignore_spaces = self._config_antismash.get( - "ignore_spaces", self.ANTISMASH_IGNORE_SPACES_DEFAULT - ) - self._bigscape_cutoff = config.dataset.get("bigscape_cutoff", self.BIGSCAPE_CUTOFF_DEFAULT) - self._use_mibig = config.dataset.get("use_mibig", self.USE_MIBIG_DEFAULT) - self._mibig_version = config.dataset.get("mibig_version", self.MIBIG_VERSION_DEFAULT) - # TODO: the actual value of self._root is set in _start_downloads() method - self._root = Path(config.dataset["root"]) - self._platform_id = config.dataset["platform_id"] - self._remote_loading = len(self._platform_id) > 0 - # set public attributes - self.dataset_id = ( - os.path.split(self._root)[-1] if not self._remote_loading else self._platform_id - ) self.bgcs, self.gcfs, self.spectra, self.molfams = [], [], [], [] self.mibig_bgcs = [] self.mibig_strains_in_use = StrainCollection() self.product_types = [] self.strains = StrainCollection() - self.webapp_scoring_cutoff = self._config_webapp.get( - "tables_metcalf_threshold", self.TABLES_CUTOFF_DEFAULT - ) + self.class_matches = None self.chem_classes = None - logger.debug( - "DatasetLoader({}, {}, {})".format(self._root, self.dataset_id, self._remote_loading) - ) - - def __repr__(self): - return "Root={}\n MGF={}\n EDGES={}\n NODES={}\n BIGSCAPE={}\n ANTISMASH={}\n".format( - self._root, - self.gnps_mgf_file, - self.gnps_edges_file, - self.gnps_nodes_file, - self.bigscape_dir, - self.antismash_dir, - ) - - def validate(self): - """Download data and build paths for local data.""" - # if remote loading mode, need to download the data here - # CG: for PODP workflow, strain_mappings.json is generated in the download step - if self._remote_loading: - self._start_downloads() - - # construct the paths and filenames required to load everything else and check - # they all seem to exist (but don't parse anything yet) - # TODO CG: the logics of _init_paths and _validate_paths are not clear - # 1. after downloading (manual preparation), some files alreay exist, some not - # 2. get the default, constructed or real path for each file/dir (need refactoring) - # - self._config_overrides.get() - # - os.path.join(self._root, 'strain_mappings.json') - # - find_via_glob() --> actually check if the file/dir exists - # 3. 
check if (some) file/dir exists - self._init_paths() - self._validate_paths() - - def generate_strain_mappings(self): - generate_mappings_genome_id_bgc_id(self._root / "antismash") - - podp_project_json_file = self._root.parent.parent / (self._platform_id + ".json") - genome_status_json_file = ( - self._root.parent.parent / "downloads" / self._platform_id / GENOME_STATUS_FILENAME - ) - genome_bgc_mappings_file = self._root / "antismash" / GENOME_BGC_MAPPINGS_FILENAME - gnps_file_mapping_tsv_file = self._root / GNPS_FILE_MAPPINGS_FILENAME - - podp_generate_strain_mappings( - podp_project_json_file, - genome_status_json_file, - genome_bgc_mappings_file, - gnps_file_mapping_tsv_file, - self.strain_mappings_file, - ) - def load(self): if not self._load_strain_mappings(): return False @@ -189,150 +68,23 @@ def load(self): self.strains = self.strains + self.mibig_strains_in_use if len(self.strains) == 0: - raise Exception(f"Failed to find *ANY* strains, missing {STRAIN_MAPPINGS_FILENAME}?") + raise Exception("Failed to find *ANY* strains.") return True - def _start_downloads(self): - downloader = PODPDownloader(self._platform_id) - # TODO CG: this step generates the real path for _root. Should generate - # it before loading process starts. Otherwise, npl.root_dir will get - # wrong value if loading from local data or not using download. - self._root = Path(downloader.project_results_dir) - logger.debug("remote loading mode, configuring root=%s", self._root) - # CG: to download both MET and GEN data - # CG: Continue to understand how strain_mappings.json is generated - downloader.get( - self._config_docker.get("run_bigscape", self.RUN_BIGSCAPE_DEFAULT), - self._config_docker.get( - "extra_bigscape_parameters", self.EXTRA_BIGSCAPE_PARAMS_DEFAULT - ), - self._use_mibig, - self._mibig_version, - ) - - def _init_paths(self): - # 1. strain mapping are used for everything else so - self.strain_mappings_file = self._config_overrides.get(self.OR_STRAINS) or os.path.join( - self._root, STRAIN_MAPPINGS_FILENAME - ) - - self._init_metabolomics_paths() - - self._init_genomics_paths() - - # 14. MISC: /include_strains.csv / include_strains_file= - self.include_strains_file = self._config_overrides.get( - self.OR_INCLUDE_STRAINS - ) or os.path.join(self._root, "include_strains.csv") - - # 15. CLASS: /canopus / canopus_dir= - self.canopus_dir = self._config_overrides.get(self.OR_CANOPUS) or os.path.join( - self._root, "canopus" - ) - - # 15. 
CLASS: /canopus / canopus_dir= - self.molnetenhancer_dir = self._config_overrides.get( - self.OR_MOLNETENHANCER - ) or os.path.join(self._root, "molnetenhancer") - - def _init_metabolomics_paths(self): - """Initializes the paths for metabolomics data.""" - # GNPS nodes_files is `file_mappings.tsv/csv` file - self.gnps_nodes_file = self._config_overrides.get(self.OR_GNPS_NODES) or find_via_glob_alts( - [ - os.path.join(self._root, "file_mappings.csv"), - os.path.join(self._root, "file_mappings.tsv"), - os.path.join(self._root, "clusterinfo*", "*.tsv"), - os.path.join(self._root, "clusterinfo*", "*.clustersummary"), - ], - self.OR_GNPS_NODES, - ) - - # GNPS edges_file is `molecular_families.tsv` file - self.gnps_edges_file = self._config_overrides.get(self.OR_GNPS_EDGES) or find_via_glob_alts( - [ - os.path.join(self._root, "molecular_families.tsv"), - os.path.join(self._root, "*.pairsinfo"), - os.path.join(self._root, "networkedges_selfloop", "*.pairsinfo"), - os.path.join(self._root, "networkedges_selfloop", "*.selfloop"), - ], - self.OR_GNPS_EDGES, - ) - - # GNPS mgf_file is `spectra.mgf` file - self.gnps_mgf_file = self._config_overrides.get(self.OR_GNPS_MGF) or find_via_glob_alts( - [ - os.path.join("spectra.mgf"), - os.path.join(self._root, "*.mgf"), - os.path.join(self._root, "spectra", "*.mgf"), - ], - self.OR_GNPS_MGF, - ) - - # GNPS annotations_file is `annotations.tsv` file - self.gnps_annotations_file = self._config_overrides.get( - self.OR_GNPS_ANNOTATIONS - ) or find_via_glob_alts( - [ - os.path.join(self._root, "annotations.tsv"), - os.path.join(self._root, "DB_result", "*.tsv"), - os.path.join(self._root, "result_specnets_DB", "*.tsv"), - ], - self.OR_GNPS_ANNOTATIONS, - ) - - def _init_genomics_paths(self): - # 9. GEN: /antismash / antismash_dir= - self.antismash_dir = self._config_overrides.get(self.OR_ANTISMASH) or os.path.join( - self._root, "antismash" - ) - self.antismash_cache = {} - - # 10. GEN: /bigscape / bigscape_dir= - self.bigscape_dir = self._config_overrides.get(self.OR_BIGSCAPE) or os.path.join( - self._root, "bigscape" - ) - # what we really want here is the subdirectory containing the network/annotation files, - # but in case this is the path to the top level bigscape output, try to figure it out automatically - if not os.path.exists(os.path.join(self.bigscape_dir, "mix")): - new_bigscape_dir = find_bigscape_dir(self.bigscape_dir) - if new_bigscape_dir is not None: - logger.info( - "Updating bigscape_dir to discovered location {}".format(new_bigscape_dir) - ) - self.bigscape_dir = new_bigscape_dir - - # 11. GEN: /mibig_json / mibig_json_dir= - self.mibig_json_dir = self._config_overrides.get(self.OR_MIBIG_JSON) or os.path.join( - self._root, "mibig_json" - ) - - def _validate_paths(self): - """Validates that the required files and directories exist before loading starts.""" - required_paths = [ - self.gnps_nodes_file, - self.gnps_edges_file, - self.gnps_mgf_file, - self.antismash_dir, - ] - - for f in required_paths: - if not os.path.exists(str(f)): - raise FileNotFoundError(f'File/directory "{f}" does not exist.') - def _load_strain_mappings(self): # 1. load strain mappings - sc = StrainCollection.read_json(self.strain_mappings_file) + sc = StrainCollection.read_json(config.root_dir / STRAIN_MAPPINGS_FILENAME) for strain in sc: self.strains.add(strain) logger.info("Loaded {} non-MiBIG Strain objects".format(len(self.strains))) # 2. filter user specificied strains (remove all that are not specified by user). 
# It's not allowed to specify empty list of strains, otherwise validation will fail. - if os.path.exists(self.include_strains_file): - logger.info(f"Loading user specified strains from file {self.include_strains_file}.") - user_strains = load_user_strains(self.include_strains_file) + user_strains_file = config.root_dir / STRAINS_SELECTED_FILENAME + if user_strains_file.exists(): + logger.info(f"Loading user specified strains from file {user_strains_file}.") + user_strains = load_user_strains(user_strains_file) logger.info(f"Loaded {len(user_strains)} user specified strains.") self.strains.filter(user_strains) @@ -353,11 +105,15 @@ def _load_metabolomics(self): logger.debug("\nLoading metabolomics data starts...") # Step 1: load all Spectrum objects - raw_spectra = GNPSSpectrumLoader(self.gnps_mgf_file).spectra + raw_spectra = GNPSSpectrumLoader(GNPS_DEFAULT_PATH / GNPS_SPECTRA_FILENAME).spectra # Step 2: load all GNPS annotations - raw_annotations = GNPSAnnotationLoader(self.gnps_annotations_file).annotations + raw_annotations = GNPSAnnotationLoader( + GNPS_DEFAULT_PATH / GNPS_ANNOTATIONS_FILENAME + ).annotations # Step 3: load all MolecularFamily objects - raw_molfams = GNPSMolecularFamilyLoader(self.gnps_edges_file).get_mfs(keep_singleton=False) + raw_molfams = GNPSMolecularFamilyLoader( + GNPS_DEFAULT_PATH / GNPS_MOLECULAR_FAMILY_FILENAME + ).get_mfs(keep_singleton=False) # Step 4: add GNPS annotations to Spectrum.gnps_annotations add_annotation_to_spectrum(raw_annotations, raw_spectra) @@ -389,20 +145,19 @@ def _load_genomics(self): # Step 1: load antismash BGC objects & add strain info logger.debug("Parsing AntiSMASH directory...") - antismash_bgcs = AntismashBGCLoader(self.antismash_dir).get_bgcs() + antismash_bgcs = AntismashBGCLoader(str(globals.ANTISMASH_DEFAULT_PATH)).get_bgcs() antismash_bgcs_with_strain, _ = add_strain_to_bgc(self.strains, antismash_bgcs) # Step 2: load mibig BGC objects (having strain info) - if self._use_mibig: - self.mibig_bgcs = MibigLoader(self.mibig_json_dir).get_bgcs() + if config.mibig.to_use: + self.mibig_bgcs = MibigLoader(str(globals.MIBIG_DEFAULT_PATH)).get_bgcs() # Step 3: get all BGC objects with strain info all_bgcs_with_strain = antismash_bgcs_with_strain + self.mibig_bgcs # Step 4: load all GCF objects - # TODO: create a config for "bigscape_cluster_file" and discard "bigscape_dir" and "bigscape_cutoff"? 
bigscape_cluster_file = ( - Path(self.bigscape_dir) / "mix" / f"mix_clustering_c0.{self._bigscape_cutoff:02d}.tsv" + globals.BIGSCAPE_DEFAULT_PATH / f"mix_clustering_c{config.bigscape.cutoff}.tsv" ) raw_gcfs = BigscapeGCFLoader(bigscape_cluster_file).get_gcfs() @@ -411,7 +166,7 @@ def _load_genomics(self): # Step 6: get mibig bgcs and strains in use from GCFs mibig_strains_in_use = StrainCollection() - if self._use_mibig: + if config.mibig.to_use: mibig_bgcs_in_use, mibig_strains_in_use = get_mibig_from_gcf(all_gcfs_with_bgc) else: mibig_bgcs_in_use = [] @@ -424,48 +179,6 @@ def _load_genomics(self): logger.debug("Loading genomics data completed\n") return True - # TODO CG: run bigscape before loading and after downloading - def _run_bigscape(self): - # Check for spaces in the folder names under /antismash and - # rename them by replacing spaces with underscores - if not self._antismash_ignore_spaces: - logger.debug("Checking for spaces in antiSMASH folder names...") - for root, dirs, _ in os.walk(self.antismash_dir): - for d in dirs: - if d.find(" ") != -1: - os.rename(os.path.join(root, d), os.path.join(root, d.replace(" ", "_"))) - logger.warn( - 'Renaming antiSMASH folder "{}" to "{}" to remove spaces! (suppress with ignore_spaces = true in config file)'.format( - d, d.replace(" ", "_") - ) - ) - - if not os.path.exists(self.bigscape_dir): - should_run_bigscape = self._config_docker.get("run_bigscape", self.RUN_BIGSCAPE_DEFAULT) - extra_bigscape_parameters = self._config_docker.get( - "extra_bigscape_parameters", self.EXTRA_BIGSCAPE_PARAMS_DEFAULT - ) - if should_run_bigscape: - logger.info( - 'Running BiG-SCAPE! extra_bigscape_parameters="{}"'.format( - extra_bigscape_parameters - ) - ) - try: - run_bigscape( - "bigscape.py", - os.path.join(self._root, "antismash"), - os.path.join(self._root, "bigscape"), - PFAM_PATH, - extra_params=extra_bigscape_parameters, - ) - except Exception as e: - logger.warning( - 'Failed to run BiG-SCAPE on antismash data, error was "{}"'.format(e) - ) - - self.bigscape_dir = find_bigscape_dir(self.bigscape_dir) - @deprecated(reason="To be refactored. It was used in the `self.load` method before.") def _load_class_info(self): """Load class match info (based on mibig) and chemical class predictions. 
@@ -522,34 +235,3 @@ def _load_class_info(self): # include them in loader self.chem_classes = chem_classes return True - - -def find_via_glob_alts(paths, file_type, optional=False): - filename = None - for path in paths: - try: - filename = glob.glob(path)[0] - break - except (OSError, IndexError): - continue - - if filename is None and not optional: - raise Exception( - "ERROR: unable to find {} in {} paths: ({})".format(file_type, len(paths), paths) - ) - elif filename is None: - logger.warning( - "WARNING: unable to find {} in {} paths: ({})".format(file_type, len(paths), paths) - ) - - return filename - - -def find_bigscape_dir(broot): - logger.info(f"Trying to discover correct bigscape directory under {broot}") - for root, _, files in os.walk(broot): - if "Network_Annotations_Full.tsv" in files: - logger.info(f"Found network files directory: {root}") - return root - - return None diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index d17df87c..8b8217c4 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -2,6 +2,7 @@ import logging import sys from typing import TYPE_CHECKING +from .arranger import DatasetArranger from .config import config from .genomics import BGC from .genomics import GCF @@ -38,12 +39,12 @@ class NPLinker: def __init__(self): """Initialise an NPLinker instance.""" # configure logging based on the supplied config params - LogConfig.setLogLevelStr(config.loglevel) - logfile = config.get("logfile") + LogConfig.setLogLevelStr(config.log.level) + logfile = config.get("log.file") if logfile: logfile_dest = logging.FileHandler(logfile) # if we want to log to stdout plus logfile, add the new destination - if config.get("log_to_stdout"): # default to True + if config.get("log.to_stdout"): # default to True LogConfig.addLogDestination(logfile_dest) else: # otherwise overwrite the default stdout destination @@ -124,22 +125,7 @@ def root_dir(self): Returns: str: the path to the dataset root directory currently in use """ - return self._loader._root - - @property - def dataset_id(self): - """Returns dataset "ID". - - For local datasets this will just be the last component of the directory path, - e.g. /path/to/my_dataset would produce an ID of "my_dataset". - - For datasets loaded from the Paired Omics platform the ID will be the platform - project ID, e.g. "MSV000079284" - - Returns: - str: the dataset ID - """ - return self._loader.dataset_id + return config.root_dir @property def data_dir(self): @@ -149,30 +135,13 @@ def data_dir(self): @property def bigscape_cutoff(self): """Returns the current BiGSCAPE clustering cutoff value.""" - return self._loader._bigscape_cutoff + return config.bigscape.cutoff - def load_data(self, new_bigscape_cutoff=None): - """Loads the basic components of a dataset. - - This method is responsible for loading the various pieces of the supplied dataset into - memory and doing any initial parsing/object manipulation required. After it completes, - applications can access the lists of GCFs, Spectra, MolecularFamilies and strains - using the corresponding properties of the NPLinker class. 
- - Returns: - bool: True if successful, False otherwise - """ - logger.debug("load_data(new_bigscape_cutoff=%s)", new_bigscape_cutoff) - if new_bigscape_cutoff is None: - self._loader.validate() - self._loader.generate_strain_mappings() - if not self._loader.load(): - return False - else: - # CG: only reload genomics data when changing bigscape cutoff - self._loader._bigscape_cutoff = new_bigscape_cutoff - # TODO: only need to reload gcfs using load_gcfs() - self._loader._load_genomics() + def load_data(self): + """Loads the basic components of a dataset.""" + arranger = DatasetArranger() + arranger.arrange() + self._loader.load() self._spectra = self._loader.spectra self._molfams = self._loader.molfams @@ -184,20 +153,6 @@ def load_data(self, new_bigscape_cutoff=None): self._chem_classes = self._loader.chem_classes self._class_matches = self._loader.class_matches - logger.debug("Generating lookup tables: genomics") - self._bgc_lookup = {bgc.bgc_id: bgc for bgc in self._bgcs} - self._gcf_lookup = {gcf.gcf_id: gcf for gcf in self._gcfs} - - # don't need to do these two if cutoff changed (indicating genomics data - # was reloaded but not metabolomics) - if new_bigscape_cutoff is None: - logger.debug("Generating lookup tables: metabolomics") - self._spec_lookup = {spec.spectrum_id: spec for spec in self._spectra} - self._mf_lookup = {mf.family_id: mf for mf in self._molfams} - - logger.debug("load_data: completed") - return True - # TODO CG: refactor this method and update its unit tests def get_links(self, input_objects, scoring_methods, and_mode=True): """Find links for a set of input objects (BGCs/GCFs/Spectra/MolFams). diff --git a/src/nplinker/pairedomics/downloader.py b/src/nplinker/pairedomics/downloader.py deleted file mode 100644 index 2236a699..00000000 --- a/src/nplinker/pairedomics/downloader.py +++ /dev/null @@ -1,180 +0,0 @@ -import json -import os -import shutil -from os import PathLike -from pathlib import Path -from nplinker.genomics.mibig import download_and_extract_mibig_metadata -from nplinker.globals import PFAM_PATH -from nplinker.logconfig import LogConfig -from nplinker.metabolomics.gnps import GNPSDownloader -from nplinker.metabolomics.gnps import GNPSExtractor -from nplinker.utils import download_url -from . import podp_download_and_extract_antismash_data -from .runbigscape import podp_run_bigscape - - -logger = LogConfig.getLogger(__name__) - -PAIREDOMICS_PROJECT_DATA_ENDPOINT = "https://pairedomicsdata.bioinformatics.nl/api/projects" -PAIREDOMICS_PROJECT_URL = "https://pairedomicsdata.bioinformatics.nl/api/projects/{}" -GNPS_DATA_DOWNLOAD_URL = ( - "https://gnps.ucsd.edu/ProteoSAFe/DownloadResult?task={}&view=download_clustered_spectra" -) - -MIBIG_METADATA_URL = "https://dl.secondarymetabolites.org/mibig/mibig_json_{}.tar.gz" -MIBIG_BGC_METADATA_URL = "https://mibig.secondarymetabolites.org/repository/{}/annotations.json" - - -class PODPDownloader: - def __init__( - self, - podp_platform_id: str, - force_download: bool = False, - root_dir: str | PathLike | None = None, - ): - """Downloader for PODP pipeline. - - The downloader will download the following data: - - GNPS Molecular Network task results - - AntiSMASH results - - MIBiG metadata - - Args: - podp_platform_id(str): The metabolomics project ID of PODP platform, - e.g. GNPS MassIVE ID. - force_download (bool): Re-download data even if it already exists - locally. Defaults to False. - working_dir (str | PathLike | None): The root directory to use for - the project. 
Defaults to None, in which case the default location - is used. - - Raises: - ValueError: If the given ID does not have a corresponding PODP ID, - or if the GNPS Molecular Network task URL does not exist for - the given ID. - """ - self.gnps_massive_id = podp_platform_id - - if root_dir is None: - root_dir = os.path.join(os.getenv("HOME"), "nplinker_data", "pairedomics") - - # TODO CG: init folder structure should be moved out of PODPDownloader - self._init_folder_structure(root_dir) - - # init project json files - if not os.path.exists(self.project_json_file) or force_download: - logger.info("Downloading new copy of platform project data...") - self.all_projects_json_data = self._download_and_load_json( - PAIREDOMICS_PROJECT_DATA_ENDPOINT, self.all_projects_json_file - ) - else: - logger.info("Using existing copy of platform project data") - with open(self.all_projects_json_file, encoding="utf-8") as f: - self.all_projects_json_data = json.load(f) - - # Verify that the given ID has a corresponding PODP ID - self.podp_id = None - for project in self.all_projects_json_data["data"]: - if self.gnps_massive_id == project["metabolite_id"]: - self.podp_id = project["_id"] - logger.debug( - "Given ID %s matched to PODP ID %s", self.gnps_massive_id, self.podp_id - ) - break - if self.podp_id is None: - raise ValueError(f"Failed to find PODP ID for given ID {self.gnps_massive_id}") - - # now get the project JSON data - logger.info("Found project, retrieving JSON data...") - self.project_json_data = self._download_and_load_json( - PAIREDOMICS_PROJECT_URL.format(self.podp_id), self.project_json_file - ) - - self.gnps_task_id = self.project_json_data["metabolomics"]["project"].get( - "molecular_network" - ) - if self.gnps_task_id is None: - raise ValueError( - f"GNPS Molecular Network task URL not exist for " - f"given ID {self.gnps_massive_id}. Please check and" - f"run GNPS Molecular Network task first." 
- ) - - def _init_folder_structure(self, working_dir): - """Create local cache folders and set up paths for various files.""" - # init local cache root - self.working_dir = working_dir - self.downloads_dir = os.path.join(self.working_dir, "downloads") - self.results_dir = os.path.join(self.working_dir, "extracted") - os.makedirs(self.working_dir, exist_ok=True) - logger.info("PODPDownloader for %s, caching to %s", self.gnps_massive_id, self.working_dir) - - # create local cache folders for this dataset - self.project_downloads_dir = os.path.join(self.downloads_dir, self.gnps_massive_id) - os.makedirs(self.project_downloads_dir, exist_ok=True) - - self.project_results_dir = os.path.join(self.results_dir, self.gnps_massive_id) - os.makedirs(self.project_results_dir, exist_ok=True) - - # placeholder directories - for d in ["antismash", "bigscape"]: - os.makedirs(os.path.join(self.project_results_dir, d), exist_ok=True) - - # init project paths - self.all_projects_json_file = os.path.join(self.working_dir, "all_projects.json") - self.project_json_file = os.path.join(self.working_dir, f"{self.gnps_massive_id}.json") - - # download function - def get(self, do_bigscape, extra_bigscape_parameters, use_mibig, mibig_version): - logger.info("Going to download the metabolomics data file") - - self._download_metabolomics_zipfile(self.gnps_task_id) - - # TODO CG: this function will modify the project_json['genomes'], - # this should be done in a better way - podp_download_and_extract_antismash_data( - self.project_json_data["genomes"], self.project_downloads_dir, self.project_results_dir - ) - - if use_mibig: - self._download_mibig_json(mibig_version) - podp_run_bigscape( - self.project_results_dir, PFAM_PATH, do_bigscape, extra_bigscape_parameters - ) - - def _download_mibig_json(self, version): - output_path = os.path.join(self.project_results_dir, "mibig_json") - - # Override existing mibig json files - if os.path.exists(output_path): - shutil.rmtree(output_path) - - os.makedirs(output_path) - - download_and_extract_mibig_metadata(self.project_downloads_dir, output_path, version) - - self._create_completed_file(output_path) - - return True - - @staticmethod - def _create_completed_file(output_path): - with open(os.path.join(output_path, "completed"), "w", encoding="utf-8"): - pass - - def _download_metabolomics_zipfile(self, gnps_task_id): - archive = ( - GNPSDownloader(gnps_task_id, self.project_downloads_dir).download().get_download_file() - ) - GNPSExtractor(archive, self.project_results_dir) - - def _download_and_load_json(self, url: str, output_file: str | PathLike) -> dict: - """Download a JSON file from a URL and return the parsed JSON data.""" - fpath = Path(output_file) - download_url(url, fpath.parent, fpath.name) - logger.debug("Downloaded %s to %s", url, output_file) - - with open(output_file, "r") as f: - data = json.load(f) - - return data diff --git a/src/nplinker/pairedomics/runbigscape.py b/src/nplinker/pairedomics/runbigscape.py index 59cc39c0..b0bb0859 100644 --- a/src/nplinker/pairedomics/runbigscape.py +++ b/src/nplinker/pairedomics/runbigscape.py @@ -1,17 +1,3 @@ -# Copyright 2021 The NPLinker Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - import os import subprocess import sys @@ -21,27 +7,19 @@ logger = LogConfig.getLogger(__name__) -# NOTE: for simplicity this is currently written with assumption it will only be -# called in context of nplinker Docker image, where bigscape should be available PFAM_PATH = os.path.join(sys.prefix, "nplinker_lib") def run_bigscape( - bigscape_py_path: str | PathLike, antismash_path: str | PathLike, output_path: str | PathLike, - pfam_path: str | PathLike, extra_params: str, ): + bigscape_py_path = "bigscape.py" logger.info( f'run_bigscape: input="{antismash_path}", output="{output_path}", extra_params={extra_params}"' ) - if os.path.exists(os.path.join(output_path, "completed")): - logger.info("BiG-SCAPE appears to have been run already, skipping!") - logger.info("To force re-run, delete {%s}", os.path.join(output_path, "completed")) - return True - try: subprocess.run([bigscape_py_path, "-h"], capture_output=True, check=True) except Exception as e: @@ -51,7 +29,7 @@ def run_bigscape( raise Exception(f'antismash_path "{antismash_path}" does not exist!') # configure the IO-related parameters, including pfam_dir - args = [bigscape_py_path, "-i", antismash_path, "-o", output_path, "--pfam_dir", pfam_path] + args = [bigscape_py_path, "-i", antismash_path, "-o", output_path, "--pfam_dir", PFAM_PATH] # append the user supplied params, if any if len(extra_params) > 0: @@ -65,34 +43,4 @@ def run_bigscape( # which will indicate to the PODPDownloader module that something went wrong. result.check_returncode() - # use presence of this file as a quick way to check if a previous run - # finished or not - with open(os.path.join(output_path, "completed"), "w") as f: - f.close() - return True - - -def podp_run_bigscape( - project_file_cache: str | PathLike, - PFAM_PATH: str | PathLike, - do_bigscape: bool, - extra_bigscape_parameters, -): - # TODO this currently assumes docker environment, allow customisation? - # can check if in container with: https://stackoverflow.com/questions/20010199/how-to-determine-if-a-process-runs-inside-lxc-docker - if not do_bigscape: - logger.info("BiG-SCAPE disabled by configuration, not running it") - return - - logger.info('Running BiG-SCAPE! extra_bigscape_parameters="%s"', extra_bigscape_parameters) - try: - run_bigscape( - "bigscape.py", - os.path.join(project_file_cache, "antismash"), - os.path.join(project_file_cache, "bigscape"), - PFAM_PATH, - extra_bigscape_parameters, - ) - except Exception as e: - logger.warning('Failed to run BiG-SCAPE on antismash data, error was "%s"', e) diff --git a/tests/conftest.py b/tests/conftest.py index 7bb93bbf..974265ea 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,9 +1,18 @@ import os import tempfile +import zipfile from . import DATA_DIR -# Specify the config file via environment variable before importing nplinker in any test. -os.environ["NPLINKER_CONFIG_FILE"] = str(DATA_DIR / "nplinker_demo1.toml") +# Prepare dataset for local mode testing +# ⚠️ Multiple temp dirs will be created if using parallel testing. 
+temp_dir = tempfile.mkdtemp(prefix="nplinker_") +nplinker_root_dir = os.path.join(temp_dir, "local_mode_example") +with zipfile.ZipFile(DATA_DIR / "local_mode_example.zip", "r") as zip_ref: + zip_ref.extractall(temp_dir) + # NPLinker setting `root_dir` must be a path that exists, so setting it to a temporary directory. -os.environ["NPLINKER_ROOT_DIR"] = tempfile.mkdtemp(prefix="nplinker_") +os.environ["NPLINKER_ROOT_DIR"] = nplinker_root_dir + +# Specify the config file via environment variable before importing nplinker in any test. +os.environ["NPLINKER_CONFIG_FILE"] = str(DATA_DIR / "nplinker_local_mode.toml") diff --git a/tests/data/nplinker_demo1.toml b/tests/data/nplinker_demo1.toml deleted file mode 100644 index 0fd30554..00000000 --- a/tests/data/nplinker_demo1.toml +++ /dev/null @@ -1,3 +0,0 @@ -root_dir = "@format {env[NPLINKER_ROOT_DIR]}" -mode = "podp" -podp_id = "4b29ddc3-26d0-40d7-80c5-44fb6631dbf9.4" diff --git a/tests/data/nplinker_local_mode.toml b/tests/data/nplinker_local_mode.toml new file mode 100644 index 00000000..174ee852 --- /dev/null +++ b/tests/data/nplinker_local_mode.toml @@ -0,0 +1,7 @@ +dynaconf_merge = true # merge with the default settings, provided by the Dynaconf library + +root_dir = "@format {env[NPLINKER_ROOT_DIR]}" +mode = "local" + +[log] +level = "DEBUG" diff --git a/tests/pairedomics/test_downloader.py b/tests/pairedomics/test_downloader.py deleted file mode 100644 index 9dc64ebd..00000000 --- a/tests/pairedomics/test_downloader.py +++ /dev/null @@ -1,45 +0,0 @@ -import os -from pathlib import Path -import httpx -import pytest -from pytest_lazyfixture import lazy_fixture -from nplinker.pairedomics.downloader import PODPDownloader - - -@pytest.mark.parametrize( - "expected", [Path(os.getenv("HOME"), "nplinker_data", "pairedomics"), lazy_fixture("tmp_path")] -) -def test_default(expected: Path): - gnps_id = "MSV000079284" - - sut = PODPDownloader(gnps_id, root_dir=str(expected)) - - assert sut.gnps_massive_id == gnps_id - assert sut.working_dir == str(expected) - - assert sut.downloads_dir == str(expected / "downloads") - assert sut.project_downloads_dir == str(expected / "downloads" / gnps_id) - - assert sut.results_dir == str(expected / "extracted") - assert sut.project_results_dir == str(expected / "extracted" / gnps_id) - assert os.path.exists(str(expected / "extracted" / gnps_id / "antismash")) - assert os.path.exists(str(expected / "extracted" / gnps_id / "bigscape")) - - assert sut.all_projects_json_file == str(expected / "all_projects.json") - assert sut.project_json_file == str(expected / f"{gnps_id}.json") - - -def test_download_metabolomics_zipfile(tmp_path): - sut = PODPDownloader("MSV000079284", root_dir=tmp_path) - try: - sut._download_metabolomics_zipfile("c22f44b14a3d450eb836d607cb9521bb") - expected_path = os.path.join( - sut.project_downloads_dir, "METABOLOMICS-SNETS-c22f44b14a3d450eb836d607cb9521bb.zip" - ) - - assert os.path.exists(expected_path) - assert (Path(sut.project_results_dir) / "molecular_families.tsv").is_file() - assert (Path(sut.project_results_dir) / "file_mappings.tsv").is_file() - assert (Path(sut.project_results_dir) / "spectra.mgf").is_file() - except httpx.TimeoutException: - pytest.skip("GNPS is down") diff --git a/tests/scoring/conftest.py b/tests/scoring/conftest.py index 3d1567c9..1f279c49 100644 --- a/tests/scoring/conftest.py +++ b/tests/scoring/conftest.py @@ -94,9 +94,6 @@ def npl(gcfs, spectra, mfs, strains, tmp_path_factory) -> NPLinker: npl._gcf_lookup = {gcf.gcf_id: gcf for gcf in gcfs} 
npl._mf_lookup = {mf.family_id: mf for mf in mfs} npl._spec_lookup = {spec.spectrum_id: spec for spec in spectra} - # tmp path to store 'metcalf/metcalf_scores.pckl' file - # Must use `tmp_path_factory` (session scope) instead of `tmp_path` (function scope) - npl._loader._root = tmp_path_factory.mktemp("npl_test") return npl diff --git a/tests/test_config.py b/tests/test_config.py index 262d7191..45ae92a1 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,17 +1,15 @@ from nplinker.config import config -def test_config_demo1(): - """Test loading the default config file (nplinker_demo1.toml).""" - # The file "nplinker_demo1.toml" is set in ./conftest.py - - assert config.mode == "podp" - assert config.podp_id == "4b29ddc3-26d0-40d7-80c5-44fb6631dbf9.4" +def test_config(): + """Test loading the default config file.""" + # The default config file is set in "./conftest.py", which is "data/nplinker_local_mode.toml" + assert config.mode == "local" + assert config.log.level == "DEBUG" + assert config["log.level"] == "DEBUG" + assert config.get("log.level") == "DEBUG" # The following are default values from nplinker_default.toml - assert config.log.level == "INFO" - assert config["log.level"] == "INFO" - assert config.get("log.level") == "INFO" assert config.get("log.file") is None assert config.log.to_stdout is True diff --git a/tests/test_nplinker_local.py b/tests/test_nplinker_local.py index 879f5afe..3d8b4636 100644 --- a/tests/test_nplinker_local.py +++ b/tests/test_nplinker_local.py @@ -1,12 +1,10 @@ import hashlib -import os from pathlib import Path import pytest from nplinker.nplinker import NPLinker -# NOTE: This file only contains tests that run locally and are skipped on CI. -# Basically, only tests related to data loading should be put here. +# Only tests related to data arranging and loading should be put here. # For tests on scoring/links, add them to `scoring/test_nplinker_scoring.py`. @@ -27,13 +25,6 @@ def get_file_hash(file_path): def npl() -> NPLinker: npl = NPLinker() npl.load_data() - hash_proj_file = get_file_hash( - os.path.join(npl._loader._root.parent.parent, npl._loader._platform_id + ".json") - ) - if hash_proj_file != "97f31f13f7a4c87c0b7648e2a2bad5ab2f96c38f92c304a5dc17299b44e698c7": - pytest.exit( - "PoDP project file has changed, please clean your local cache folder and rerun the tests." - ) # remove cached score results before running tests root_dir = Path(npl.root_dir) score_cache = root_dir / "metcalf" / "metcalf_scores.pckl" @@ -65,7 +56,6 @@ def npl() -> NPLinker: # --------------------------------------------------------------------------------------------------- -@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip when running on CI") def test_load_data(npl: NPLinker): assert len(npl.bgcs) == 390 assert len(npl.gcfs) == 64
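
Editor's note: the snippet below is not part of the patch; it is a minimal usage sketch of the arrange-then-load flow this diff introduces (DatasetArranger driven from NPLinker.load_data), assuming a local-mode dataset and config file like those prepared in tests/conftest.py and tests/data/nplinker_local_mode.toml. The two paths are placeholders, not real files.

    # Minimal sketch of the reworked loading flow in "local" mode.
    # The env vars must be set before importing nplinker, as done in tests/conftest.py.
    import os

    os.environ["NPLINKER_ROOT_DIR"] = "/path/to/local_mode_example"  # placeholder dataset dir
    os.environ["NPLINKER_CONFIG_FILE"] = "/path/to/nplinker_local_mode.toml"  # placeholder config

    from nplinker.nplinker import NPLinker

    npl = NPLinker()
    npl.load_data()  # DatasetArranger().arrange() validates the dataset, then the loader loads it
    print(len(npl.bgcs), len(npl.gcfs))  # e.g. 390 and 64 for the local_mode_example test dataset

In "podp" mode the same call would instead download or generate any missing GNPS, antiSMASH, BiG-SCAPE and strain-mappings data, as described in the DatasetArranger docstrings.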