From 187a7b0d2e43108e01763c22d4f3737136076cfd Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Fri, 28 Jun 2024 14:11:00 +0100 Subject: [PATCH] Deployed b7e852a to dev with MkDocs 1.6.0 and mike 2.1.1 --- dev/404.html | 21 + dev/api/antismash/index.html | 21 + dev/api/arranger/index.html | 21 + dev/api/bigscape/index.html | 21 + dev/api/genomics/index.html | 21 + dev/api/genomics_abc/index.html | 21 + dev/api/genomics_utils/index.html | 21 + dev/api/gnps/index.html | 21 + dev/api/loader/index.html | 21 + dev/api/metabolomics/index.html | 21 + dev/api/metabolomics_abc/index.html | 21 + dev/api/metabolomics_utils/index.html | 21 + dev/api/mibig/index.html | 21 + dev/api/nplinker/index.html | 23 +- dev/api/schema/index.html | 21 + dev/api/scoring/index.html | 21 + dev/api/scoring_abc/index.html | 21 + dev/api/scoring_methods/index.html | 21 + dev/api/scoring_utils/index.html | 21 + dev/api/strain/index.html | 21 + dev/api/strain_utils/index.html | 21 + dev/api/utils/index.html | 21 + dev/concepts/bigscape/index.html | 21 + dev/concepts/config_file/index.html | 21 + dev/concepts/gnps_data/index.html | 21 + dev/concepts/working_dir_structure/index.html | 21 + dev/diagrams/arranger/index.html | 23 +- dev/diagrams/loader/index.html | 1464 +++++++++++++++++ dev/figure/data_loading_pipeline.svg | 4 + dev/index.html | 21 + dev/install/index.html | 21 + dev/logging/index.html | 21 + dev/quickstart/index.html | 21 + dev/search/search_index.json | 2 +- dev/sitemap.xml | 65 +- dev/sitemap.xml.gz | Bin 411 -> 416 bytes latest | 2 +- versions.json | 8 +- 38 files changed, 2162 insertions(+), 38 deletions(-) create mode 100644 dev/diagrams/loader/index.html create mode 100644 dev/figure/data_loading_pipeline.svg diff --git a/dev/404.html b/dev/404.html index 9d3688eb7..59a14c3ba 100644 --- a/dev/404.html +++ b/dev/404.html @@ -641,6 +641,27 @@ + + + + + + +
    Dataset Loading Pipeline


The DatasetLoader is implemented following the pipeline shown below.


    Data Loading

[Figure: data_loading_pipeline.svg. The figure distinguishes inputs/outputs from functions and shows three sub-pipelines.]

Genomics load pipeline: antiSMASH .gbk files -> BGC loader -> BGC objects; BigScape clustering .tsv file -> GCF loader -> GCF objects; "map strain to bgc" updates BGC.strain; "map bgc to gcf" updates GCF.bgcs and GCF.strains. MiBIG branch: MiBIG BGC files -> mibig loader -> BGC objects (with BGC.strain updated) and MiBIG strains; "get mibig from gcfs" keeps only the MiBIG BGC and Strain objects used in GCFs, yielding antismash BGCs + mibig BGCs and input strains + mibig strains.

Metabolomics load pipeline: GNPS mgf file -> Spectrum loader -> Spec objects; GNPS annotation file -> annotation loader -> annotation dict; "add annotation to spec" updates spec.gnps_annotations; GNPS edges file -> MF loader -> MF objects; "add spec to MF" updates mf.spec and mf.strains; strain mappings file -> "add strains to Spec" updates Spec.strains.

Strains load pipeline: strain mappings file -> read_json loader -> StrainCollection.
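In practice you trigger this pipeline through NPLinker.load_data() (see the Quickstart). A minimal sketch, with the config file path as a placeholder and the loaded objects matching the outputs in the figure:

from nplinker import NPLinker

npl = NPLinker("nplinker.toml")  # placeholder path to your config file
npl.load_data()                  # runs the data loading pipeline shown above

# Outputs of the three sub-pipelines in the figure:
print(npl.bgcs)     # BGC objects (genomics pipeline)
print(npl.gcfs)     # GCF objects (genomics pipeline)
print(npl.spectra)  # Spectrum objects (metabolomics pipeline)
print(npl.mfs)      # molecular family (MF) objects (metabolomics pipeline)
print(npl.strains)  # strains (strains pipeline)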
NPLinker

NPLinker is a Python framework for data mining microbial natural products by integrating genomics and metabolomics data.

    For a deep understanding of NPLinker, please refer to the original paper.

    Under Development

    NPLinker v2 is under active development (see its pre-releases). The documentation is not complete yet. If you have any questions, please contact us via GitHub Issues.

    "},{"location":"install/","title":"Installation","text":"Requirements

NPLinker is a Python package that has both PyPI and non-PyPI packages as dependencies. It requires ~4.5GB of disk space to install all the dependencies.

Install the nplinker package as follows:

    Install nplinker package
# Check python version (≥3.9)
python --version

# Create a new virtual environment
python -m venv env          # (1)!
source env/bin/activate

# install nplinker package (requiring ~300MB of disk space)
pip install nplinker==2.0.0a2 # (2)!

# install nplinker non-pypi dependencies and databases (~4GB)
install-nplinker-deps
1. A virtual environment is required to install the non-PyPI dependencies. You can also use conda to create a new environment, but NPLinker is not available on conda yet.
2. NPLinker v2 is still under development and released as pre-releases. To install a pre-release, you have to explicitly specify the version. The command pip install nplinker will install the legacy NPLinker (v1.3.2), which is not recommended.
    "},{"location":"install/#install-from-source-code","title":"Install from source code","text":"

    You can also install NPLinker from source code:

    Install from latest source code
pip install git+https://github.com/nplinker/nplinker@dev  # (1)!
install-nplinker-deps
1. @dev is the branch name. You can replace it with another branch name, a commit, or a tag.
    "},{"location":"logging/","title":"Logging","text":"

NPLinker uses the standard library logging module for managing log messages and the Python library rich to colorize them. Depending on how you use NPLinker, you can set up logging in different ways.

    "},{"location":"logging/#nplinker-as-an-application","title":"NPLinker as an application","text":"

    If you're using NPLinker as an application, you're running the whole workflow of NPLinker as described in the Quickstart. In this case, you can set up logging in the nplinker configuration file nplinker.toml.

    "},{"location":"logging/#nplinker-as-a-library","title":"NPLinker as a library","text":"

    If you're using NPLinker as a library, you're using only some functions and classes of NPLinker in your script. By default, NPLinker will not log any messages. However, you can set up logging in your script to log messages.

    Set up logging in 'your_script.py'
# Set up logging configuration first
from nplinker import setup_logging

setup_logging(level="DEBUG", file="nplinker.log", use_console=True) # (1)!

# Your business code here
# e.g. download and extract nplinker example data
from nplinker.utils import download_and_extract_archive

download_and_extract_archive(
    url="https://zenodo.org/records/10822604/files/nplinker_local_mode_example.zip",
    download_root=".",
)
    1. The setup_logging function sets up the logging configuration. The level argument sets the logging level. The file argument sets the log file. The use_console argument sets whether to log messages to the console.

    The log messages will be written to the log file nplinker.log and displayed in the console with a format like this: [Date Time] Level Log-message Module:Line.

    Run your script in a terminal
# Run your script
$ python your_script.py
Downloading nplinker_local_mode_example.zip ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100.0% • 195.3/195.3 MB • 2.6 MB/s • 0:00:00 • 0:01:02 # (1)!
[2024-05-10 15:14:48] INFO     Extracting nplinker_local_mode_example.zip to .                      utils.py:401

# Check the log file
$ cat nplinker.log
[2024-05-10 15:14:48] INFO     Extracting nplinker_local_mode_example.zip to .                      utils.py:401
1. This is a progress bar, not a log message.
    "},{"location":"quickstart/","title":"Quickstart","text":"

    NPLinker allows you to run in two modes:


    The local mode assumes that the data required by NPLinker is available on your local machine.

The required input data includes GNPS data, antiSMASH data, BigScape data (optional), and a strain mappings file; see step 2 below.

The podp mode assumes that you use an identifier from the Paired Omics Data Platform (PODP) as the input for NPLinker. NPLinker will then download and prepare all necessary data based on the PODP ID, which refers to the metadata of the dataset.

    So, which mode will you use? The answer is important for the next steps.

    "},{"location":"quickstart/#1-create-a-working-directory","title":"1. Create a working directory","text":"

    The working directory is used to store all input and output data for NPLinker. You can name this directory as you like, for example nplinker_quickstart:

    Create a working directory
mkdir nplinker_quickstart

    Important

    Before going to the next step, make sure you get familiar with how NPLinker organizes data in the working directory, see Working Directory Structure page.
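For orientation, a sketch of how the working directory typically ends up organized (subdirectory names as used elsewhere in these docs; see the Working Directory Structure page for the authoritative layout):

nplinker_quickstart
    ├── nplinker.toml            # config file (step 3)
    ├── strain_mappings.json     # strain mappings file (local mode, step 2)
    ├── downloads                # downloaded archives
    ├── gnps                     # GNPS data (step 2)
    ├── antismash                # AntiSMASH BGC data (step 2)
    ├── bigscape                 # BigScape output (optional, step 2)
    └── mibig                    # MIBiG metadata (downloaded automatically)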

    "},{"location":"quickstart/#2-prepare-input-data-local-mode-only","title":"2. Prepare input data (local mode only)","text":"Details

    Skip this step if you choose to use the podp mode.

    If you choose to use the local mode, meaning you have input data of NPLinker stored on your local machine, you need to move the input data to the working directory created in the previous step.

    "},{"location":"quickstart/#gnps-data","title":"GNPS data","text":"

    NPLinker accepts data from the output of the following GNPS workflows:

NPLinker provides the tools GNPSDownloader and GNPSExtractor to download and extract GNPS data with ease. All you need to provide is a valid GNPS task ID referring to a task of one of the GNPS workflows supported by NPLinker.

    GNPS task id and workflow

Given an example GNPS task at https://gnps.ucsd.edu/ProteoSAFe/status.jsp?task=c22f44b14a3d450eb836d607cb9521bb, the task ID is the last part of this URL, i.e. c22f44b14a3d450eb836d607cb9521bb. Open this link and you can find the workflow info in the "Workflow" row of the "Job Status" table; in this case, it is METABOLOMICS-SNETS.
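If you want to extract the task ID from such a URL in a script, plain string handling is enough (the URL is the example above):

# Extract the GNPS task ID from a task URL
url = "https://gnps.ucsd.edu/ProteoSAFe/status.jsp?task=c22f44b14a3d450eb836d607cb9521bb"
task_id = url.split("task=")[-1]
print(task_id)  # c22f44b14a3d450eb836d607cb9521bb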

    Download & Extract GNPS data
from nplinker.metabolomics.gnps import GNPSDownloader, GNPSExtractor

# Run this script from the working directory, i.e. `cd nplinker_quickstart` first

# Download GNPS data & get the path to the downloaded archive
downloader = GNPSDownloader("gnps_task_id", "downloads") # (1)!
downloaded_archive = downloader.download().get_download_file()

# Extract GNPS data to `gnps` directory
extractor = GNPSExtractor(downloaded_archive, "gnps") # (2)!
    1. If you already have the downloaded archive of GNPS data, you can skip the download steps.
2. Replace downloaded_archive with the actual path to your GNPS data archive if you skipped the download steps.

    The required data for NPLinker will be extracted to the gnps subdirectory of the working directory.

    Info

    Not all GNPS data are required by NPLinker, and only the necessary data will be extracted. During the extraction, these data will be renamed to the standard names used by NPLinker. See the page GNPS Data for more information.

    Prepare GNPS data manually

If you have GNPS data but it is not in the archive format as downloaded from GNPS, it's recommended to re-download the data from GNPS.

    If (re-)downloading is not possible, you could manually prepare data for the gnps directory. In this case, you must make sure that the data is organized as expected by NPLinker. See the page GNPS Data for examples of how to prepare the data.

    "},{"location":"quickstart/#antismash-data","title":"AntiSMASH data","text":"

    NPLinker requires AntiSMASH BGC data as input, which are organized in the antismash subdirectory of the working directory.

For each AntiSMASH run output, the BGC data must be stored in a subdirectory named after the NCBI accession number (e.g. GCF_000514975.1). Only the *.region*.gbk files are required by NPLinker.

    When manually preparing AntiSMASH data for NPLinker, you must make sure that the data is organized as expected by NPLinker. See the page Working Directory Structure for more information.
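Concretely, the antismash subdirectory should look like this (structure taken from the loader docs; accession and file names are examples):

antismash
    ├── GCF_000514975.1 (one AntiSMASH output)
    │  ├── xxxx.region001.gbk
    │  └── ...
    ├── GCF_000514775.1
    │  ├── ...
    └── ...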

    "},{"location":"quickstart/#bigscape-data-optional","title":"BigScape data (optional)","text":"

    It is optional to provide the output of BigScape to NPLinker. If the output of BigScape is not provided, NPLinker will run BigScape automatically to generate the data using the AntiSMASH BGC data.

    If you have the output of BigScape, you can put its mix_clustering_c{cutoff}.tsv file in the bigscape subdirectory of the NPLinker working directory, where {cutoff} is the cutoff value used in the BigScape run.

    "},{"location":"quickstart/#strain-mappings-file","title":"Strain mappings file","text":"

The strain mappings file strain_mappings.json is required by NPLinker to map strains to genomics and metabolomics data.

    `strain_mappings.json` example
    {\n    \"strain_mappings\": [\n        {\n            \"strain_id\": \"strain_id_1\", # (1)!\n            \"strain_alias\": [\"bgc_id_1\", \"spectrum_id_1\", ...] # (2)!\n        },\n        {\n            \"strain_id\": \"strain_id_2\",\n            \"strain_alias\": [\"bgc_id_2\", \"spectrum_id_2\", ...]\n        },\n        ...\n    ],\n    \"version\": \"1.0\" # (3)!\n}\n
    1. strain_id is the unique identifier of the strain.
    2. strain_alias is a list of aliases of the strain, which are the identifiers of the BGCs and spectra of the strain.
    3. version is the schema version of this file. It is recommended to use the latest version of the schema. The current latest version is 1.0.

The BGC id is the same as the name of the BGC file in the antismash directory; for example, given a BGC file xxxx.region001.gbk, the BGC id is xxxx.region001.

The spectrum id is the same as the scan number in the spectra.mgf file in the gnps directory; for example, given a spectrum in the mgf file with SCANS=1, the spectrum id is 1.

    If you labelled the mzXML files (input for GNPS) with the strain id, you may need the function extract_mappings_ms_filename_spectrum_id to extract the mappings from mzXML files to the spectrum ids.

    For the local mode, you need to create this file manually and put it in the working directory. It takes some effort to prepare this file manually, especially when you have a large number of strains.
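To reduce that manual effort, you can also build the file programmatically. A minimal sketch, assuming the Strain and StrainCollection classes expose the add_alias, add and to_json methods used below; verify against the Strain API page:

from nplinker.strain import Strain, StrainCollection

# One Strain per strain id; aliases link it to its BGC and spectrum ids
strain = Strain("strain_id_1")
strain.add_alias("xxxx.region001")  # a BGC id (gbk file name without extension)
strain.add_alias("1")               # a spectrum id (SCANS value in spectra.mgf)

collection = StrainCollection()
collection.add(strain)

# Write the strain mappings file into the working directory
collection.to_json("strain_mappings.json")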

    "},{"location":"quickstart/#3-prepare-config-file","title":"3. Prepare config file","text":"

    The configuration file nplinker.toml is required by NPLinker to specify the working directory, mode, and other settings for the run of NPLinker. You can put the nplinker.toml file in any place, but it is recommended to put it in the working directory created in step 2.

    The details of all settings can be found at this page Config File.

    To keep it simple, default settings will be used automatically by NPLinker if you don't set them in your nplinker.toml config file.

What you need to do is set the root_dir and mode in the nplinker.toml file.

local mode

nplinker.toml
    root_dir = \"absolute/path/to/working/directory\" # (1)!\nmode = \"local\"\n# and other settings you want to override the default settings \n
    1. Replace absolute/path/to/working/directory with the absolute path to the working directory created in step 2.
podp mode

nplinker.toml
    root_dir = \"absolute/path/to/working/directory\" # (1)!\nmode = \"podp\"\npodp_id = \"podp_id\" # (2)!\n# and other settings you want to override the default settings \n
    1. Replace absolute/path/to/working/directory with the absolute path to the working directory created in step 2.
    2. Replace podp_id with the identifier of the dataset in the Paired Omics Data Platform (PODP).
    "},{"location":"quickstart/#4-run-nplinker","title":"4. Run NPLinker","text":"

    Before running NPLinker, make sure your working directory has the correct directory structure and names as described in the Working Directory Structure page.

    Run NPLinker in your working directory
from nplinker import NPLinker

# create an instance of NPLinker
npl = NPLinker("nplinker.toml") # (1)!

# load data
npl.load_data()

# check loaded data
print(npl.bgcs)
print(npl.gcfs)
print(npl.spectra)
print(npl.mfs)
print(npl.strains)

# compute the links for the first 3 GCFs using metcalf scoring method
link_graph = npl.get_links(npl.gcfs[:3], "metcalf")  # (2)!

# get links as a list of tuples
link_graph.links

# get the link data between two objects or entities
link_graph.get_link_data(npl.gcfs[0], npl.spectra[0])

# Save data to a pickle file
npl.save_data("npl.pkl", link_graph)
    1. Replace nplinker.toml with the actual path to your configuration file.
2. The get_links method returns a LinkGraph object that represents the calculated links between the GCFs and other entities as a graph.

    For more info about the classes and methods, see the API Documentation.

    "},{"location":"api/antismash/","title":"AntiSMASH","text":""},{"location":"api/antismash/#nplinker.genomics.antismash","title":"antismash","text":""},{"location":"api/antismash/#nplinker.genomics.antismash.AntismashBGCLoader","title":"AntismashBGCLoader","text":"
    AntismashBGCLoader(data_dir: str | PathLike)\n

    Bases: BGCLoaderBase

    Build a loader for AntiSMASH BGC genbank (.gbk) files.

    Note

    AntiSMASH BGC directory must follow the structure below:

antismash
    ├── genome_id_1 (one AntiSMASH output, e.g. GCF_000514775.1)
    │  ├── GCF_000514775.1.gbk
    │  ├── NZ_AZWO01000004.region001.gbk
    │  └── ...
    ├── genome_id_2
    │  ├── ...
    └── ...

    Parameters:

    Name Type Description Default data_dir str | PathLike

    Path to AntiSMASH directory that contains a collection of AntiSMASH outputs.

    required Source code in src/nplinker/genomics/antismash/antismash_loader.py
    def __init__(self, data_dir: str | PathLike) -> None:\n    \"\"\"Initialize the AntiSMASH BGC loader.\n\n    Args:\n        data_dir: Path to AntiSMASH directory that contains a\n            collection of AntiSMASH outputs.\n    \"\"\"\n    self.data_dir = str(data_dir)\n    self._file_dict = self._parse_data_dir(self.data_dir)\n    self._bgcs = self._parse_bgcs(self._file_dict)\n
    "},{"location":"api/antismash/#nplinker.genomics.antismash.AntismashBGCLoader.data_dir","title":"data_dir instance-attribute","text":"
    data_dir = str(data_dir)\n
    "},{"location":"api/antismash/#nplinker.genomics.antismash.AntismashBGCLoader.get_bgc_genome_mapping","title":"get_bgc_genome_mapping","text":"
    get_bgc_genome_mapping() -> dict[str, str]\n

    Get the mapping from BGC to genome.

    Note that the directory name of the gbk file is treated as genome id.

    Returns:

Type Description dict[str, str]

The key is BGC name (gbk file name) and value is genome id (the directory name of the gbk file).

    Source code in src/nplinker/genomics/antismash/antismash_loader.py
    def get_bgc_genome_mapping(self) -> dict[str, str]:\n    \"\"\"Get the mapping from BGC to genome.\n\n    Note that the directory name of the gbk file is treated as genome id.\n\n    Returns:\n        The key is BGC name (gbk file name) and value is genome id (the directory name of the\n        gbk file).\n    \"\"\"\n    return {\n        bid: os.path.basename(os.path.dirname(bpath)) for bid, bpath in self._file_dict.items()\n    }\n
    "},{"location":"api/antismash/#nplinker.genomics.antismash.AntismashBGCLoader.get_files","title":"get_files","text":"
    get_files() -> dict[str, str]\n

    Get BGC gbk files.

    Returns:

    Type Description dict[str, str]

    The key is BGC name (gbk file name) and value is path to the gbk file.

    Source code in src/nplinker/genomics/antismash/antismash_loader.py
    def get_files(self) -> dict[str, str]:\n    \"\"\"Get BGC gbk files.\n\n    Returns:\n        The key is BGC name (gbk file name) and value is path to the gbk file.\n    \"\"\"\n    return self._file_dict\n
    "},{"location":"api/antismash/#nplinker.genomics.antismash.AntismashBGCLoader.get_bgcs","title":"get_bgcs","text":"
    get_bgcs() -> list[BGC]\n

    Get all BGC objects.

    Returns:

    Type Description list[BGC]

    A list of BGC objects

    Source code in src/nplinker/genomics/antismash/antismash_loader.py
    def get_bgcs(self) -> list[BGC]:\n    \"\"\"Get all BGC objects.\n\n    Returns:\n        A list of BGC objects\n    \"\"\"\n    return self._bgcs\n
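Putting the documented methods together, a small usage sketch (the data directory path is a placeholder):

from nplinker.genomics.antismash import AntismashBGCLoader

loader = AntismashBGCLoader("path/to/antismash")   # placeholder path
bgcs = loader.get_bgcs()                           # list of BGC objects
files = loader.get_files()                         # BGC name -> path to the gbk file
genome_map = loader.get_bgc_genome_mapping()       # BGC name -> genome id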
    "},{"location":"api/antismash/#nplinker.genomics.antismash.GenomeStatus","title":"GenomeStatus","text":"
    GenomeStatus(\n    original_id: str,\n    resolved_refseq_id: str = \"\",\n    resolve_attempted: bool = False,\n    bgc_path: str = \"\",\n)\n

    A class to represent the status of a single genome.

    The status of genomes is tracked in a JSON file which has a name defined in variable GENOME_STATUS_FILENAME.

    Parameters:

    Name Type Description Default original_id str

    The original ID of the genome.

    required resolved_refseq_id str

    The resolved RefSeq ID of the genome. Defaults to \"\".

    '' resolve_attempted bool

    A flag indicating whether an attempt to resolve the RefSeq ID has been made. Defaults to False.

    False bgc_path str

    The path to the downloaded BGC file for the genome. Defaults to \"\".

    '' Source code in src/nplinker/genomics/antismash/podp_antismash_downloader.py
    def __init__(\n    self,\n    original_id: str,\n    resolved_refseq_id: str = \"\",\n    resolve_attempted: bool = False,\n    bgc_path: str = \"\",\n):\n    \"\"\"Initialize a GenomeStatus object for the given genome.\n\n    Args:\n        original_id: The original ID of the genome.\n        resolved_refseq_id: The resolved RefSeq ID of the\n            genome. Defaults to \"\".\n        resolve_attempted: A flag indicating whether an\n            attempt to resolve the RefSeq ID has been made. Defaults to False.\n        bgc_path: The path to the downloaded BGC file for\n            the genome. Defaults to \"\".\n    \"\"\"\n    self.original_id = original_id\n    self.resolved_refseq_id = \"\" if resolved_refseq_id == \"None\" else resolved_refseq_id\n    self.resolve_attempted = resolve_attempted\n    self.bgc_path = bgc_path\n
    "},{"location":"api/antismash/#nplinker.genomics.antismash.GenomeStatus.original_id","title":"original_id instance-attribute","text":"
    original_id = original_id\n
    "},{"location":"api/antismash/#nplinker.genomics.antismash.GenomeStatus.resolved_refseq_id","title":"resolved_refseq_id instance-attribute","text":"
    resolved_refseq_id = (\n    \"\"\n    if resolved_refseq_id == \"None\"\n    else resolved_refseq_id\n)\n
    "},{"location":"api/antismash/#nplinker.genomics.antismash.GenomeStatus.resolve_attempted","title":"resolve_attempted instance-attribute","text":"
    resolve_attempted = resolve_attempted\n
    "},{"location":"api/antismash/#nplinker.genomics.antismash.GenomeStatus.bgc_path","title":"bgc_path instance-attribute","text":"
    bgc_path = bgc_path\n
    "},{"location":"api/antismash/#nplinker.genomics.antismash.GenomeStatus.read_json","title":"read_json staticmethod","text":"
    read_json(\n    file: str | PathLike,\n) -> dict[str, \"GenomeStatus\"]\n

    Get a dict of GenomeStatus objects by loading given genome status file.

    Note that an empty dict is returned if the given file doesn't exist.

    Parameters:

    Name Type Description Default file str | PathLike

    Path to genome status file.

    required

    Returns:

    Type Description dict[str, 'GenomeStatus']

    Dict keys are genome original id and values are GenomeStatus objects. An empty dict is returned if the given file doesn't exist.

    Source code in src/nplinker/genomics/antismash/podp_antismash_downloader.py
    @staticmethod\ndef read_json(file: str | PathLike) -> dict[str, \"GenomeStatus\"]:\n    \"\"\"Get a dict of GenomeStatus objects by loading given genome status file.\n\n    Note that an empty dict is returned if the given file doesn't exist.\n\n    Args:\n        file: Path to genome status file.\n\n    Returns:\n        Dict keys are genome original id and values are GenomeStatus\n            objects. An empty dict is returned if the given file doesn't exist.\n    \"\"\"\n    genome_status_dict = {}\n    if Path(file).exists():\n        with open(file, \"r\") as f:\n            data = json.load(f)\n\n        # validate json data before using it\n        validate(data, schema=GENOME_STATUS_SCHEMA)\n\n        genome_status_dict = {\n            gs[\"original_id\"]: GenomeStatus(**gs) for gs in data[\"genome_status\"]\n        }\n    return genome_status_dict\n
    "},{"location":"api/antismash/#nplinker.genomics.antismash.GenomeStatus.to_json","title":"to_json staticmethod","text":"
    to_json(\n    genome_status_dict: Mapping[str, \"GenomeStatus\"],\n    file: str | PathLike | None = None,\n) -> str | None\n

    Convert the genome status dictionary to a JSON string.

    If a file path is provided, the JSON string is written to the file. If the file already exists, it is overwritten.

    Parameters:

    Name Type Description Default genome_status_dict Mapping[str, 'GenomeStatus']

    A dictionary of genome status objects. The keys are the original genome IDs and the values are GenomeStatus objects.

    required file str | PathLike | None

    The path to the output JSON file. If None, the JSON string is returned but not written to a file.

    None

    Returns:

    Type Description str | None

    The JSON string if file is None, otherwise None.

    Source code in src/nplinker/genomics/antismash/podp_antismash_downloader.py
    @staticmethod\ndef to_json(\n    genome_status_dict: Mapping[str, \"GenomeStatus\"], file: str | PathLike | None = None\n) -> str | None:\n    \"\"\"Convert the genome status dictionary to a JSON string.\n\n    If a file path is provided, the JSON string is written to the file. If\n    the file already exists, it is overwritten.\n\n    Args:\n        genome_status_dict: A dictionary of genome\n            status objects. The keys are the original genome IDs and the values\n            are GenomeStatus objects.\n        file: The path to the output JSON file.\n            If None, the JSON string is returned but not written to a file.\n\n    Returns:\n        The JSON string if `file` is None, otherwise None.\n    \"\"\"\n    gs_list = [gs._to_dict() for gs in genome_status_dict.values()]\n    json_data = {\"genome_status\": gs_list, \"version\": \"1.0\"}\n\n    # validate json object before dumping\n    validate(json_data, schema=GENOME_STATUS_SCHEMA)\n\n    if file is not None:\n        with open(file, \"w\") as f:\n            json.dump(json_data, f)\n        return None\n    return json.dumps(json_data)\n
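A small round-trip sketch using the two static methods above; the file name is a placeholder (the real one is defined by GENOME_STATUS_FILENAME):

from nplinker.genomics.antismash import GenomeStatus

# Load existing statuses; an empty dict is returned if the file doesn't exist
gs_dict = GenomeStatus.read_json("genome_status.json")  # placeholder file name

# Record a genome whose RefSeq id has been resolved
gs_dict["genome_1"] = GenomeStatus(
    "genome_1", resolved_refseq_id="GCF_000514775.1", resolve_attempted=True
)

# Persist the updated statuses back to the JSON file (overwritten if it exists)
GenomeStatus.to_json(gs_dict, "genome_status.json")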
    "},{"location":"api/antismash/#nplinker.genomics.antismash.download_and_extract_antismash_data","title":"download_and_extract_antismash_data","text":"
    download_and_extract_antismash_data(\n    antismash_id: str,\n    download_root: str | PathLike,\n    extract_root: str | PathLike,\n) -> None\n

    Download and extract antiSMASH BGC archive for a specified genome.

The antiSMASH database (https://antismash-db.secondarymetabolites.org/) is used to download the BGC archive. antiSMASH uses the RefSeq assembly ID of a genome as the ID of the archive.

    Parameters:

    Name Type Description Default antismash_id str

    The id used to download BGC archive from antiSMASH database. If the id is versioned (e.g., \"GCF_004339725.1\") please be sure to specify the version as well.

    required download_root str | PathLike

    Path to the directory to place downloaded archive in.

    required extract_root str | PathLike

    Path to the directory data files will be extracted to. Note that an antismash directory will be created in the specified extract_root if it doesn't exist. The files will be extracted to <extract_root>/antismash/<antismash_id> directory.

    required

    Raises:

    Type Description ValueError

    if download_root and extract_root dirs are the same.

    ValueError

    if <extract_root>/antismash/<refseq_assembly_id> dir is not empty.

    Examples:

    >>> download_and_extract_antismash_metadata(\"GCF_004339725.1\", \"/data/download\", \"/data/extracted\")\n
    Source code in src/nplinker/genomics/antismash/antismash_downloader.py
    def download_and_extract_antismash_data(\n    antismash_id: str, download_root: str | PathLike, extract_root: str | PathLike\n) -> None:\n    \"\"\"Download and extract antiSMASH BGC archive for a specified genome.\n\n    The antiSMASH database (https://antismash-db.secondarymetabolites.org/)\n    is used to download the BGC archive. And antiSMASH use RefSeq assembly id\n    of a genome as the id of the archive.\n\n    Args:\n        antismash_id: The id used to download BGC archive from antiSMASH database.\n            If the id is versioned (e.g., \"GCF_004339725.1\") please be sure to\n            specify the version as well.\n        download_root: Path to the directory to place downloaded archive in.\n        extract_root: Path to the directory data files will be extracted to.\n            Note that an `antismash` directory will be created in the specified `extract_root` if\n            it doesn't exist. The files will be extracted to `<extract_root>/antismash/<antismash_id>` directory.\n\n    Raises:\n        ValueError: if `download_root` and `extract_root` dirs are the same.\n        ValueError: if `<extract_root>/antismash/<refseq_assembly_id>` dir is not empty.\n\n    Examples:\n        >>> download_and_extract_antismash_metadata(\"GCF_004339725.1\", \"/data/download\", \"/data/extracted\")\n    \"\"\"\n    download_root = Path(download_root)\n    extract_root = Path(extract_root)\n    extract_path = extract_root / \"antismash\" / antismash_id\n    _check_roots(download_root, extract_root)\n\n    try:\n        if extract_path.exists():\n            _check_extract_path(extract_path)\n        else:\n            extract_path.mkdir(parents=True, exist_ok=True)\n\n        for base_url in [ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL]:\n            url = base_url.format(antismash_id, antismash_id + \".zip\")\n            download_and_extract_archive(url, download_root, extract_path, antismash_id + \".zip\")\n            break\n\n        # delete subdirs\n        for subdir_path in list_dirs(extract_path):\n            shutil.rmtree(subdir_path)\n\n        # delete unnecessary files\n        files_to_keep = list_files(extract_path, suffix=(\".json\", \".gbk\"))\n        for file in list_files(extract_path):\n            if file not in files_to_keep:\n                os.remove(file)\n\n        logger.info(\"antiSMASH BGC data of %s is downloaded and extracted.\", antismash_id)\n\n    except Exception as e:\n        shutil.rmtree(extract_path)\n        logger.warning(e)\n        raise e\n
    "},{"location":"api/antismash/#nplinker.genomics.antismash.parse_bgc_genbank","title":"parse_bgc_genbank","text":"
    parse_bgc_genbank(file: str | PathLike) -> BGC\n

    Parse a single BGC gbk file to BGC object.

    Parameters:

    Name Type Description Default file str | PathLike

    Path to BGC gbk file

    required

    Returns:

    Type Description BGC

    BGC object

    Examples:

    >>> bgc = AntismashBGCLoader.parse_bgc(\n...    \"/data/antismash/GCF_000016425.1/NC_009380.1.region001.gbk\")\n
    Source code in src/nplinker/genomics/antismash/antismash_loader.py
    def parse_bgc_genbank(file: str | PathLike) -> BGC:\n    \"\"\"Parse a single BGC gbk file to BGC object.\n\n    Args:\n        file: Path to BGC gbk file\n\n    Returns:\n        BGC object\n\n    Examples:\n        >>> bgc = AntismashBGCLoader.parse_bgc(\n        ...    \"/data/antismash/GCF_000016425.1/NC_009380.1.region001.gbk\")\n    \"\"\"\n    file = Path(file)\n    fname = file.stem\n\n    record = SeqIO.read(file, format=\"genbank\")\n    description = record.description  # \"DEFINITION\" in gbk file\n    antismash_id = record.id  # \"VERSION\" in gbk file\n    features = _parse_antismash_genbank(record)\n    product_prediction = features.get(\"product\")\n    if product_prediction is None:\n        raise ValueError(f\"Not found product prediction in antiSMASH Genbank file {file}\")\n\n    # init BGC\n    bgc = BGC(fname, *product_prediction)\n    bgc.description = description\n    bgc.antismash_id = antismash_id\n    bgc.antismash_file = str(file)\n    bgc.antismash_region = features.get(\"region_number\")\n    bgc.smiles = features.get(\"smiles\")\n    bgc.strain = Strain(fname)\n    return bgc\n
    "},{"location":"api/antismash/#nplinker.genomics.antismash.get_best_available_genome_id","title":"get_best_available_genome_id","text":"
    get_best_available_genome_id(\n    genome_id_data: Mapping[str, str]\n) -> str | None\n

    Get the best available ID from genome_id_data dict.

    Parameters:

    Name Type Description Default genome_id_data Mapping[str, str]

    dictionary containing information for each genome record present.

    required

    Returns:

    Type Description str | None

    ID for the genome, if present, otherwise None.

    Source code in src/nplinker/genomics/antismash/podp_antismash_downloader.py
    def get_best_available_genome_id(genome_id_data: Mapping[str, str]) -> str | None:\n    \"\"\"Get the best available ID from genome_id_data dict.\n\n    Args:\n        genome_id_data: dictionary containing information for each genome record present.\n\n    Returns:\n        ID for the genome, if present, otherwise None.\n    \"\"\"\n    if \"RefSeq_accession\" in genome_id_data:\n        best_id = genome_id_data[\"RefSeq_accession\"]\n    elif \"GenBank_accession\" in genome_id_data:\n        best_id = genome_id_data[\"GenBank_accession\"]\n    elif \"JGI_Genome_ID\" in genome_id_data:\n        best_id = genome_id_data[\"JGI_Genome_ID\"]\n    else:\n        best_id = None\n\n    if best_id is None or len(best_id) == 0:\n        logger.warning(f\"Failed to get valid genome ID in genome data: {genome_id_data}\")\n        return None\n    return best_id\n
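For example, following the preference order in the source above (RefSeq, then GenBank, then JGI):

from nplinker.genomics.antismash import get_best_available_genome_id

genome_id_data = {
    "GenBank_accession": "GCA_000514775.1",
    "RefSeq_accession": "GCF_000514775.1",
}
best_id = get_best_available_genome_id(genome_id_data)
print(best_id)  # GCF_000514775.1; RefSeq is preferred when present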
    "},{"location":"api/antismash/#nplinker.genomics.antismash.podp_download_and_extract_antismash_data","title":"podp_download_and_extract_antismash_data","text":"
    podp_download_and_extract_antismash_data(\n    genome_records: Sequence[\n        Mapping[str, Mapping[str, str]]\n    ],\n    project_download_root: str | PathLike,\n    project_extract_root: str | PathLike,\n)\n

    Download and extract antiSMASH BGC archive for the given genome records.

    Parameters:

    Name Type Description Default genome_records Sequence[Mapping[str, Mapping[str, str]]]

    list of dicts representing genome records. The dict of each genome record contains - key(str): \"genome_ID\" - value(dict[str, str]): a dict containing information about genome type, label and accession ids (RefSeq, GenBank, and/or JGI).

    required project_download_root str | PathLike

    Path to the directory to place downloaded archive in.

    required project_extract_root str | PathLike

    Path to the directory downloaded archive will be extracted to. Note that an antismash directory will be created in the specified extract_root if it doesn't exist. The files will be extracted to <extract_root>/antismash/<antismash_id> directory.

    required Source code in src/nplinker/genomics/antismash/podp_antismash_downloader.py
    def podp_download_and_extract_antismash_data(\n    genome_records: Sequence[Mapping[str, Mapping[str, str]]],\n    project_download_root: str | PathLike,\n    project_extract_root: str | PathLike,\n):\n    \"\"\"Download and extract antiSMASH BGC archive for the given genome records.\n\n    Args:\n        genome_records: list of dicts\n            representing genome records. The dict of each genome record contains\n                - key(str): \"genome_ID\"\n                - value(dict[str, str]): a dict containing information about genome\n                type, label and accession ids (RefSeq, GenBank, and/or JGI).\n        project_download_root: Path to the directory to place\n            downloaded archive in.\n        project_extract_root: Path to the directory downloaded archive\n            will be extracted to.\n            Note that an `antismash` directory will be created in the specified\n            `extract_root` if it doesn't exist. The files will be extracted to\n            `<extract_root>/antismash/<antismash_id>` directory.\n    \"\"\"\n    if not Path(project_download_root).exists():\n        # otherwise in case of failed first download, the folder doesn't exist and\n        # genome_status_file can't be written\n        Path(project_download_root).mkdir(parents=True, exist_ok=True)\n\n    gs_file = Path(project_download_root, GENOME_STATUS_FILENAME)\n    gs_dict = GenomeStatus.read_json(gs_file)\n\n    for i, genome_record in enumerate(genome_records):\n        # get the best available ID from the dict\n        genome_id_data = genome_record[\"genome_ID\"]\n        raw_genome_id = get_best_available_genome_id(genome_id_data)\n        if raw_genome_id is None or len(raw_genome_id) == 0:\n            logger.warning(\n                f'Ignoring genome record \"{genome_record}\" due to missing genome ID field'\n            )\n            continue\n\n        # check if genome ID exist in the genome status file\n        if raw_genome_id not in gs_dict:\n            gs_dict[raw_genome_id] = GenomeStatus(raw_genome_id)\n\n        gs_obj = gs_dict[raw_genome_id]\n\n        logger.info(\n            f\"Checking for antismash data {i + 1}/{len(genome_records)}, \"\n            f\"current genome ID={raw_genome_id}\"\n        )\n        # first, check if BGC data is downloaded\n        if gs_obj.bgc_path and Path(gs_obj.bgc_path).exists():\n            logger.info(f\"Genome ID {raw_genome_id} already downloaded to {gs_obj.bgc_path}\")\n            continue\n        # second, check if lookup attempted previously\n        if gs_obj.resolve_attempted:\n            logger.info(f\"Genome ID {raw_genome_id} skipped due to previous failure\")\n            continue\n\n        # if not downloaded or lookup attempted, then try to resolve the ID\n        # and download\n        logger.info(f\"Beginning lookup process for genome ID {raw_genome_id}\")\n        gs_obj.resolved_refseq_id = _resolve_refseq_id(genome_id_data)\n        gs_obj.resolve_attempted = True\n\n        if gs_obj.resolved_refseq_id == \"\":\n            # give up on this one\n            logger.warning(f\"Failed lookup for genome ID {raw_genome_id}\")\n            continue\n\n        # if resolved id is valid, try to download and extract antismash data\n        try:\n            download_and_extract_antismash_data(\n                gs_obj.resolved_refseq_id, project_download_root, project_extract_root\n            )\n\n            gs_obj.bgc_path = str(\n                Path(project_download_root, 
gs_obj.resolved_refseq_id + \".zip\").absolute()\n            )\n\n            output_path = Path(project_extract_root, \"antismash\", gs_obj.resolved_refseq_id)\n            if output_path.exists():\n                Path.touch(output_path / \"completed\", exist_ok=True)\n\n        except Exception:\n            gs_obj.bgc_path = \"\"\n\n    missing = len([gs for gs in gs_dict.values() if not gs.bgc_path])\n    logger.info(\n        f\"Dataset has {missing} missing sets of antiSMASH data \"\n        f\" (from a total of {len(genome_records)}).\"\n    )\n\n    # save updated genome status to json file\n    GenomeStatus.to_json(gs_dict, gs_file)\n\n    if missing == len(genome_records):\n        raise ValueError(\"No antiSMASH data found for any genome\")\n
    "},{"location":"api/arranger/","title":"Dataset Arranger","text":""},{"location":"api/arranger/#nplinker.arranger","title":"arranger","text":""},{"location":"api/arranger/#nplinker.arranger.PODP_PROJECT_URL","title":"PODP_PROJECT_URL module-attribute","text":"
PODP_PROJECT_URL = "https://pairedomicsdata.bioinformatics.nl/api/projects/{}"
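The template is filled with a PODP project ID, as done in arrange_podp_project_json below (the ID here is a placeholder):

from nplinker.arranger import PODP_PROJECT_URL

url = PODP_PROJECT_URL.format("podp_id")  # placeholder PODP project ID
print(url)  # https://pairedomicsdata.bioinformatics.nl/api/projects/podp_id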
    "},{"location":"api/arranger/#nplinker.arranger.DatasetArranger","title":"DatasetArranger","text":"
    DatasetArranger(config: Dynaconf)\n

    Arrange the dataset required by NPLinker.

    This class is used to arrange the datasets required by NPLinker according to the configuration. The datasets include MIBiG, GNPS, antiSMASH, and BiG-SCAPE.

    If self.config.mode is \"local\", the datasets are validated. If self.config.mode is \"podp\", the datasets are downloaded or generated.

    Attributes:

    Name Type Description config

    A Dynaconf object that contains the configuration settings. Check nplinker.config module for more details.

    root_dir

    The root directory of the datasets.

    downloads_dir

    The directory to store downloaded files.

    mibig_dir

    The directory to store MIBiG metadata.

    gnps_dir

    The directory to store GNPS data.

    antismash_dir

    The directory to store antiSMASH data.

    bigscape_dir

    The directory to store BiG-SCAPE data.

    bigscape_running_output_dir

    The directory to store the running output of BiG-SCAPE.

    Parameters:

    Name Type Description Default config Dynaconf

    A Dynaconf object that contains the configuration settings. Check nplinker.config module for more details.

    required Source code in src/nplinker/arranger.py
    def __init__(self, config: Dynaconf) -> None:\n    \"\"\"Initialize the DatasetArranger.\n\n    Args:\n        config: A Dynaconf object that contains the configuration settings. Check `nplinker.config`\n            module for more details.\n    \"\"\"\n    self.config = config\n    self.root_dir = config.root_dir\n    self.downloads_dir = self.root_dir / defaults.DOWNLOADS_DIRNAME\n    self.downloads_dir.mkdir(exist_ok=True)\n\n    self.mibig_dir = self.root_dir / defaults.MIBIG_DIRNAME\n    self.gnps_dir = self.root_dir / defaults.GNPS_DIRNAME\n    self.antismash_dir = self.root_dir / defaults.ANTISMASH_DIRNAME\n    self.bigscape_dir = self.root_dir / defaults.BIGSCAPE_DIRNAME\n    self.bigscape_running_output_dir = (\n        self.bigscape_dir / defaults.BIGSCAPE_RUNNING_OUTPUT_DIRNAME\n    )\n\n    self.arrange_podp_project_json()\n
    "},{"location":"api/arranger/#nplinker.arranger.DatasetArranger.config","title":"config instance-attribute","text":"
    config = config\n
    "},{"location":"api/arranger/#nplinker.arranger.DatasetArranger.root_dir","title":"root_dir instance-attribute","text":"
    root_dir = root_dir\n
    "},{"location":"api/arranger/#nplinker.arranger.DatasetArranger.downloads_dir","title":"downloads_dir instance-attribute","text":"
    downloads_dir = root_dir / DOWNLOADS_DIRNAME\n
    "},{"location":"api/arranger/#nplinker.arranger.DatasetArranger.mibig_dir","title":"mibig_dir instance-attribute","text":"
    mibig_dir = root_dir / MIBIG_DIRNAME\n
    "},{"location":"api/arranger/#nplinker.arranger.DatasetArranger.gnps_dir","title":"gnps_dir instance-attribute","text":"
    gnps_dir = root_dir / GNPS_DIRNAME\n
    "},{"location":"api/arranger/#nplinker.arranger.DatasetArranger.antismash_dir","title":"antismash_dir instance-attribute","text":"
    antismash_dir = root_dir / ANTISMASH_DIRNAME\n
    "},{"location":"api/arranger/#nplinker.arranger.DatasetArranger.bigscape_dir","title":"bigscape_dir instance-attribute","text":"
    bigscape_dir = root_dir / BIGSCAPE_DIRNAME\n
    "},{"location":"api/arranger/#nplinker.arranger.DatasetArranger.bigscape_running_output_dir","title":"bigscape_running_output_dir instance-attribute","text":"
    bigscape_running_output_dir = (\n    bigscape_dir / BIGSCAPE_RUNNING_OUTPUT_DIRNAME\n)\n
    "},{"location":"api/arranger/#nplinker.arranger.DatasetArranger.arrange","title":"arrange","text":"
    arrange() -> None\n

    Arrange the datasets according to the configuration.

    The datasets include MIBiG, GNPS, antiSMASH, and BiG-SCAPE.

    Source code in src/nplinker/arranger.py
    def arrange(self) -> None:\n    \"\"\"Arrange the datasets according to the configuration.\n\n    The datasets include MIBiG, GNPS, antiSMASH, and BiG-SCAPE.\n    \"\"\"\n    # The order of arranging the datasets matters, as some datasets depend on others\n    self.arrange_mibig()\n    self.arrange_gnps()\n    self.arrange_antismash()\n    self.arrange_bigscape()\n    self.arrange_strain_mappings()\n    self.arrange_strains_selected()\n
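A minimal usage sketch for the class. How the Dynaconf config is obtained depends on your setup; load_config below is an assumed helper name, so check the nplinker.config module referenced in the docstring for the actual entry point:

from nplinker.arranger import DatasetArranger
from nplinker.config import load_config  # assumed helper name; verify in nplinker.config

config = load_config("nplinker.toml")  # placeholder path to your config file
arranger = DatasetArranger(config)
arranger.arrange()  # local mode: validate datasets; podp mode: download/generate them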
    "},{"location":"api/arranger/#nplinker.arranger.DatasetArranger.arrange_podp_project_json","title":"arrange_podp_project_json","text":"
    arrange_podp_project_json() -> None\n

    Arrange the PODP project JSON file.

    If self.config.mode is \"podp\", download the PODP project JSON file if it doesn't exist. Then validate the PODP project JSON file if it exists or is downloaded.

    The validation is controlled by the json schema schemas/podp_adapted_schema.json.

    Source code in src/nplinker/arranger.py
    def arrange_podp_project_json(self) -> None:\n    \"\"\"Arrange the PODP project JSON file.\n\n    If `self.config.mode` is \"podp\", download the PODP project JSON file if it doesn't exist. Then\n    validate the PODP project JSON file if it exists or is downloaded.\n\n    The validation is controlled by the json schema `schemas/podp_adapted_schema.json`.\n    \"\"\"\n    if self.config.mode == \"podp\":\n        file_name = f\"paired_datarecord_{self.config.podp_id}.json\"\n        podp_file = self.downloads_dir / file_name\n        if not podp_file.exists():\n            download_url(\n                PODP_PROJECT_URL.format(self.config.podp_id),\n                self.downloads_dir,\n                file_name,\n            )\n\n        with open(podp_file, \"r\") as f:\n            json_data = json.load(f)\n        validate_podp_json(json_data)\n
    "},{"location":"api/arranger/#nplinker.arranger.DatasetArranger.arrange_mibig","title":"arrange_mibig","text":"
    arrange_mibig() -> None\n

    Arrange the MIBiG metadata.

Always download and extract the MIBiG metadata if self.config.mibig.to_use is True. If the default directory already exists, it will be removed and the metadata re-downloaded to ensure the latest version is used. Therefore, it's not allowed to manually put MIBiG metadata in the default directory.

    Source code in src/nplinker/arranger.py
    def arrange_mibig(self) -> None:\n    \"\"\"Arrange the MIBiG metadata.\n\n    Always download and extract the MIBiG metadata if `self.config.mibig.to_use` is True.\n    If the default directory has already existed, it will be removed and re-downloaded to ensure\n    the latest version is used. So it's not allowed to manually put MIBiG metadata in the\n    default directory.\n    \"\"\"\n    if self.config.mibig.to_use:\n        if self.mibig_dir.exists():\n            # remove existing mibig data\n            shutil.rmtree(self.mibig_dir)\n        download_and_extract_mibig_metadata(\n            self.downloads_dir,\n            self.mibig_dir,\n            version=self.config.mibig.version,\n        )\n
    "},{"location":"api/arranger/#nplinker.arranger.DatasetArranger.arrange_gnps","title":"arrange_gnps","text":"
    arrange_gnps() -> None\n

    Arrange the GNPS data.

    If self.config.mode is \"local\", validate the GNPS data directory. If self.config.mode is \"podp\", download the GNPS data if it doesn't exist or remove the existing GNPS data and re-download it if it is invalid.

    The validation process includes:

    - Check if the GNPS data directory exists.
    - Check if the required files exist in the GNPS data directory, including:
        - file_mappings.tsv or file_mappings.csv
        - spectra.mgf
        - molecular_families.tsv
        - annotations.tsv

    Source code in src/nplinker/arranger.py
    def arrange_gnps(self) -> None:\n    \"\"\"Arrange the GNPS data.\n\n    If `self.config.mode` is \"local\", validate the GNPS data directory.\n    If `self.config.mode` is \"podp\", download the GNPS data if it doesn't exist or remove the\n    existing GNPS data and re-download it if it is invalid.\n\n    The validation process includes:\n\n    - Check if the GNPS data directory exists.\n    - Check if the required files exist in the GNPS data directory, including:\n        - file_mappings.tsv or file_mappings.csv\n        - spectra.mgf\n        - molecular_families.tsv\n        - annotations.tsv\n    \"\"\"\n    pass_validation = False\n    if self.config.mode == \"podp\":\n        # retry downloading at most 3 times if downloaded data has problems\n        for _ in range(3):\n            try:\n                validate_gnps(self.gnps_dir)\n                pass_validation = True\n                break\n            except (FileNotFoundError, ValueError):\n                # Don't need to remove downloaded archive, as it'll be overwritten\n                shutil.rmtree(self.gnps_dir, ignore_errors=True)\n                self._download_and_extract_gnps()\n\n    if not pass_validation:\n        validate_gnps(self.gnps_dir)\n\n    # get the path to file_mappings file (csv or tsv)\n    self.gnps_file_mappings_file = self._get_gnps_file_mappings_file()\n
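
    The same retry logic reappears in arrange_antismash and arrange_bigscape below. A generic sketch of the pattern, with caller-supplied validate and fetch callables standing in for NPLinker's functions:

    import shutil\nfrom pathlib import Path\nfrom typing import Callable\n\n\ndef validate_with_retries(\n    data_dir: Path,\n    validate_fn: Callable[[Path], None],\n    fetch_fn: Callable[[], None],\n    retries: int = 3,\n) -> None:\n    \"\"\"Validate data, re-fetching it after each failed attempt.\"\"\"\n    for _ in range(retries):\n        try:\n            validate_fn(data_dir)\n            return  # data is valid, stop retrying\n        except (FileNotFoundError, ValueError):\n            shutil.rmtree(data_dir, ignore_errors=True)  # discard bad data\n            fetch_fn()\n    validate_fn(data_dir)  # final check; raises if the data is still invalid\n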
    "},{"location":"api/arranger/#nplinker.arranger.DatasetArranger.arrange_antismash","title":"arrange_antismash","text":"
    arrange_antismash() -> None\n

    Arrange the antiSMASH data.

    If self.config.mode is \"local\", validate the antiSMASH data directory. If self.config.mode is \"podp\", download the antiSMASH data if it doesn't exist or remove the existing antiSMASH data and re-download it if it is invalid.

    The validation process includes:

    - Check if the antiSMASH data directory exists.
    - Check if the antiSMASH data directory contains at least one sub-directory, and each sub-directory contains at least one BGC file (with the suffix \".region???.gbk\" where ??? is a number).

    AntiSMASH BGC directory must follow the structure below:

    antismash\n    \u251c\u2500\u2500 genome_id_1 (one AntiSMASH output, e.g. GCF_000514775.1)\n    \u2502\u00a0 \u251c\u2500\u2500 GCF_000514775.1.gbk\n    \u2502\u00a0 \u251c\u2500\u2500 NZ_AZWO01000004.region001.gbk\n    \u2502\u00a0 \u2514\u2500\u2500 ...\n    \u251c\u2500\u2500 genome_id_2\n    \u2502\u00a0 \u251c\u2500\u2500 ...\n    \u2514\u2500\u2500 ...\n
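
    A toy check of the BGC file-name pattern described above; fnmatch's \"?\" matches exactly one character, so \"*.region???.gbk\" requires a three-digit region number:

    import fnmatch\n\nfiles = [\"GCF_000514775.1.gbk\", \"NZ_AZWO01000004.region001.gbk\"]\nprint(fnmatch.filter(files, \"*.region???.gbk\"))\n# ['NZ_AZWO01000004.region001.gbk']\n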

    Source code in src/nplinker/arranger.py
    def arrange_antismash(self) -> None:\n    \"\"\"Arrange the antiSMASH data.\n\n    If `self.config.mode` is \"local\", validate the antiSMASH data directory.\n    If `self.config.mode` is \"podp\", download the antiSMASH data if it doesn't exist or remove the\n    existing antiSMASH data and re-download it if it is invalid.\n\n    The validation process includes:\n    - Check if the antiSMASH data directory exists.\n    - Check if the antiSMASH data directory contains at least one sub-directory, and each\n        sub-directory contains at least one BGC file (with the suffix \".region???.gbk\" where ???\n        is a number).\n\n    AntiSMASH BGC directory must follow the structure below:\n    ```\n    antismash\n        \u251c\u2500\u2500 genome_id_1 (one AntiSMASH output, e.g. GCF_000514775.1)\n        \u2502\u00a0 \u251c\u2500\u2500 GCF_000514775.1.gbk\n        \u2502\u00a0 \u251c\u2500\u2500 NZ_AZWO01000004.region001.gbk\n        \u2502\u00a0 \u2514\u2500\u2500 ...\n        \u251c\u2500\u2500 genome_id_2\n        \u2502\u00a0 \u251c\u2500\u2500 ...\n        \u2514\u2500\u2500 ...\n    ```\n    \"\"\"\n    pass_validation = False\n    if self.config.mode == \"podp\":\n        for _ in range(3):\n            try:\n                validate_antismash(self.antismash_dir)\n                pass_validation = True\n                break\n            except FileNotFoundError:\n                shutil.rmtree(self.antismash_dir, ignore_errors=True)\n                self._download_and_extract_antismash()\n\n    if not pass_validation:\n        validate_antismash(self.antismash_dir)\n
    "},{"location":"api/arranger/#nplinker.arranger.DatasetArranger.arrange_bigscape","title":"arrange_bigscape","text":"
    arrange_bigscape() -> None\n

    Arrange the BiG-SCAPE data.

    If self.config.mode is \"local\", validate the BiG-SCAPE data directory. If self.config.mode is \"podp\", run BiG-SCAPE to generate the clustering file if it doesn't exist or remove the existing BiG-SCAPE data and re-run BiG-SCAPE if it is invalid. The running output of BiG-SCAPE will be saved to the directory \"bigscape_running_output\" in the default BiG-SCAPE directory, and the clustering file \"mix_clustering_c{self.config.bigscape.cutoff}.tsv\" will be copied to the default BiG-SCAPE directory.

    The validation process includes:

    - Check if the default BiG-SCAPE data directory exists.
    - Check if the clustering file \"mix_clustering_c{self.config.bigscape.cutoff}.tsv\" exists in the BiG-SCAPE data directory.
    - Check if the \"data_sqlite.db\" file exists in the BiG-SCAPE data directory.

    Source code in src/nplinker/arranger.py
    def arrange_bigscape(self) -> None:\n    \"\"\"Arrange the BiG-SCAPE data.\n\n    If `self.config.mode` is \"local\", validate the BiG-SCAPE data directory.\n    If `self.config.mode` is \"podp\", run BiG-SCAPE to generate the clustering file if it doesn't\n    exist or remove the existing BiG-SCAPE data and re-run BiG-SCAPE if it is invalid.\n    The running output of BiG-SCAPE will be saved to the directory \"bigscape_running_output\"\n    in the default BiG-SCAPE directory, and the clustering file\n    \"mix_clustering_c{self.config.bigscape.cutoff}.tsv\" will be copied to the default BiG-SCAPE\n    directory.\n\n    The validation process includes:\n\n    - Check if the default BiG-SCAPE data directory exists.\n    - Check if the clustering file \"mix_clustering_c{self.config.bigscape.cutoff}.tsv\" exists in the\n            BiG-SCAPE data directory.\n    - Check if the 'data_sqlite.db' file exists in the BiG-SCAPE data directory.\n    \"\"\"\n    pass_validation = False\n    if self.config.mode == \"podp\":\n        for _ in range(3):\n            try:\n                validate_bigscape(self.bigscape_dir, self.config.bigscape.cutoff)\n                pass_validation = True\n                break\n            except FileNotFoundError:\n                shutil.rmtree(self.bigscape_dir, ignore_errors=True)\n                self._run_bigscape()\n\n    if not pass_validation:\n        validate_bigscape(self.bigscape_dir, self.config.bigscape.cutoff)\n
    "},{"location":"api/arranger/#nplinker.arranger.DatasetArranger.arrange_strain_mappings","title":"arrange_strain_mappings","text":"
    arrange_strain_mappings() -> None\n

    Arrange the strain mappings file.

    If self.config.mode is \"local\", validate the strain mappings file. If self.config.mode is \"podp\", always generate the strain mappings file and validate it.

    The validation checks if the strain mappings file exists and if it is a valid JSON file according to the schema defined in schemas/strain_mappings_schema.json.

    Source code in src/nplinker/arranger.py
    def arrange_strain_mappings(self) -> None:\n    \"\"\"Arrange the strain mappings file.\n\n    If `self.config.mode` is \"local\", validate the strain mappings file.\n    If `self.config.mode` is \"podp\", always generate the strain mappings file and validate it.\n\n    The validation checks if the strain mappings file exists and if it is a valid JSON file\n    according to the schema defined in `schemas/strain_mappings_schema.json`.\n    \"\"\"\n    if self.config.mode == \"podp\":\n        self._generate_strain_mappings()\n\n    self._validate_strain_mappings()\n
    "},{"location":"api/arranger/#nplinker.arranger.DatasetArranger.arrange_strains_selected","title":"arrange_strains_selected","text":"
    arrange_strains_selected() -> None\n

    Arrange the strains selected file.

    Validate the strains selected file if it exists. The validation checks if the strains selected file is a valid JSON file according to the schema defined in schemas/user_strains.json.

    Source code in src/nplinker/arranger.py
    def arrange_strains_selected(self) -> None:\n    \"\"\"Arrange the strains selected file.\n\n    Validate the strains selected file if it exists.\n    The validation checks if the strains selected file is a valid JSON file according to the\n    schema defined in `schemas/user_strains.json`.\n    \"\"\"\n    strains_selected_file = self.root_dir / defaults.STRAINS_SELECTED_FILENAME\n    if strains_selected_file.exists():\n        with open(strains_selected_file, \"r\") as f:\n            json_data = json.load(f)\n        validate(instance=json_data, schema=USER_STRAINS_SCHEMA)\n
    "},{"location":"api/arranger/#nplinker.arranger.validate_gnps","title":"validate_gnps","text":"
    validate_gnps(gnps_dir: str | PathLike) -> None\n

    Validate the GNPS data directory and its contents.

    The GNPS data directory must contain the following files:

    - file_mappings.tsv or file_mappings.csv
    - spectra.mgf
    - molecular_families.tsv
    - annotations.tsv

    Parameters:

    - gnps_dir (str | PathLike): Path to the GNPS data directory. Required.

    Raises:

    - FileNotFoundError: If the GNPS data directory is not found or any of the required files is not found.
    - ValueError: If both file_mappings.tsv and file_mappings.csv are found.

    Source code in src/nplinker/arranger.py
    def validate_gnps(gnps_dir: str | PathLike) -> None:\n    \"\"\"Validate the GNPS data directory and its contents.\n\n    The GNPS data directory must contain the following files:\n\n    - file_mappings.tsv or file_mappings.csv\n    - spectra.mgf\n    - molecular_families.tsv\n    - annotations.tsv\n\n    Args:\n        gnps_dir: Path to the GNPS data directory.\n\n    Raises:\n        FileNotFoundError: If the GNPS data directory is not found or any of the required files\n            is not found.\n        ValueError: If both file_mappings.tsv and file_mappings.csv are found.\n    \"\"\"\n    gnps_dir = Path(gnps_dir)\n    if not gnps_dir.exists():\n        raise FileNotFoundError(f\"GNPS data directory not found at {gnps_dir}\")\n\n    file_mappings_tsv = gnps_dir / defaults.GNPS_FILE_MAPPINGS_TSV\n    file_mappings_csv = gnps_dir / defaults.GNPS_FILE_MAPPINGS_CSV\n    if file_mappings_tsv.exists() and file_mappings_csv.exists():\n        raise ValueError(\n            f\"Both {file_mappings_tsv.name} and {file_mappings_csv.name} found in GNPS directory \"\n            f\"{gnps_dir}, only one is allowed.\"\n        )\n    elif not file_mappings_tsv.exists() and not file_mappings_csv.exists():\n        raise FileNotFoundError(\n            f\"Neither {file_mappings_tsv.name} nor {file_mappings_csv.name} found in GNPS directory\"\n            f\" {gnps_dir}\"\n        )\n\n    required_files = [\n        gnps_dir / defaults.GNPS_SPECTRA_FILENAME,\n        gnps_dir / defaults.GNPS_MOLECULAR_FAMILY_FILENAME,\n        gnps_dir / defaults.GNPS_ANNOTATIONS_FILENAME,\n    ]\n    list_not_found = [f.name for f in required_files if not f.exists()]\n    if list_not_found:\n        raise FileNotFoundError(\n            f\"Files not found in GNPS directory {gnps_dir}: {', '.join(list_not_found)}\"\n        )\n
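
    A short usage sketch (the directory path is an assumption; the import path follows the module location documented above):

    from nplinker.arranger import validate_gnps\n\ntry:\n    validate_gnps(\"path/to/gnps\")\nexcept (FileNotFoundError, ValueError) as e:\n    print(f\"GNPS data is not ready: {e}\")\n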
    "},{"location":"api/arranger/#nplinker.arranger.validate_antismash","title":"validate_antismash","text":"
    validate_antismash(antismash_dir: str | PathLike) -> None\n

    Validate the antiSMASH data directory and its contents.

    The validation only checks the structure of the antiSMASH data directory and file names. It does not check:

    - the content of the BGC files
    - the consistency between the antiSMASH data and the PODP project JSON file for the PODP mode

    The antiSMASH data directory must exist and contain at least one sub-directory. The names of the sub-directories must not contain any spaces. Each sub-directory must contain at least one BGC file (with the suffix \".region???.gbk\" where ??? is the region number).

    Parameters:

    - antismash_dir (str | PathLike): Path to the antiSMASH data directory. Required.

    Raises:

    - FileNotFoundError: If the antiSMASH data directory is not found, or no sub-directories are found in the antiSMASH data directory, or no BGC files are found in any sub-directory.
    - ValueError: If any sub-directory name contains a space.

    Source code in src/nplinker/arranger.py
    def validate_antismash(antismash_dir: str | PathLike) -> None:\n    \"\"\"Validate the antiSMASH data directory and its contents.\n\n    The validation only checks the structure of the antiSMASH data directory and file names.\n    It does not check\n\n    - the content of the BGC files\n    - the consistency between the antiSMASH data and the PODP project JSON file for the PODP\n        mode\n\n    The antiSMASH data directory must exist and contain at least one sub-directory. The name of the\n    sub-directories must not contain any space. Each sub-directory must contain at least one BGC\n    file (with the suffix \".region???.gbk\" where ??? is the region number).\n\n    Args:\n        antismash_dir: Path to the antiSMASH data directory.\n\n    Raises:\n        FileNotFoundError: If the antiSMASH data directory is not found, or no sub-directories\n            are found in the antiSMASH data directory, or no BGC files are found in any\n            sub-directory.\n        ValueError: If any sub-directory name contains a space.\n    \"\"\"\n    antismash_dir = Path(antismash_dir)\n    if not antismash_dir.exists():\n        raise FileNotFoundError(f\"antiSMASH data directory not found at {antismash_dir}\")\n\n    sub_dirs = list_dirs(antismash_dir)\n    if not sub_dirs:\n        raise FileNotFoundError(\n            f\"No BGC directories found in antiSMASH data directory {antismash_dir}\"\n        )\n\n    for sub_dir in sub_dirs:\n        dir_name = Path(sub_dir).name\n        if \" \" in dir_name:\n            raise ValueError(\n                f\"antiSMASH sub-directory name {dir_name} contains space, which is not allowed\"\n            )\n\n        gbk_files = list_files(sub_dir, suffix=\".gbk\", keep_parent=False)\n        bgc_files = fnmatch.filter(gbk_files, \"*.region???.gbk\")\n        if not bgc_files:\n            raise FileNotFoundError(f\"No BGC files found in antiSMASH sub-directory {sub_dir}\")\n
    "},{"location":"api/arranger/#nplinker.arranger.validate_bigscape","title":"validate_bigscape","text":"
    validate_bigscape(\n    bigscape_dir: str | PathLike, cutoff: str\n) -> None\n

    Validate the BiG-SCAPE data directory and its contents.

    The BiG-SCAPE data directory must exist and contain the clustering file \"mix_clustering_c{cutoff}.tsv\", where {cutoff} is the BiG-SCAPE cutoff value set in the config file.

    Alternatively, the directory can contain the BiG-SCAPE database file generated by BiG-SCAPE v2. At the moment, all the family assignments in the database will be used, so this database should contain results from a single run with the desired cutoff.

    Parameters:

    - bigscape_dir (str | PathLike): Path to the BiG-SCAPE data directory. Required.
    - cutoff (str): The BiG-SCAPE cutoff value. Required.

    Raises:

    - FileNotFoundError: If the BiG-SCAPE data directory or the clustering file is not found.

    Source code in src/nplinker/arranger.py
    def validate_bigscape(bigscape_dir: str | PathLike, cutoff: str) -> None:\n    \"\"\"Validate the BiG-SCAPE data directory and its contents.\n\n    The BiG-SCAPE data directory must exist and contain the clustering file\n    \"mix_clustering_c{self.config.bigscape.cutoff}.tsv\" where {self.config.bigscape.cutoff} is the\n    bigscape cutoff value set in the config file.\n\n    Alternatively, the directory can contain the BiG-SCAPE database file generated by BiG-SCAPE v2.\n    At the moment, all the family assignments in the database will be used, so this database should\n    contain results from a single run with the desired cutoff.\n\n    Args:\n        bigscape_dir: Path to the BiG-SCAPE data directory.\n        cutoff: The BiG-SCAPE cutoff value.\n\n    Raises:\n        FileNotFoundError: If the BiG-SCAPE data directory or the clustering file is not found.\n    \"\"\"\n    bigscape_dir = Path(bigscape_dir)\n    if not bigscape_dir.exists():\n        raise FileNotFoundError(f\"BiG-SCAPE data directory not found at {bigscape_dir}\")\n\n    clustering_file = bigscape_dir / f\"mix_clustering_c{cutoff}.tsv\"\n    database_file = bigscape_dir / \"data_sqlite.db\"\n    if not clustering_file.exists() and not database_file.exists():\n        raise FileNotFoundError(f\"BiG-SCAPE data not found in {clustering_file} or {database_file}\")\n
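
    A matching usage sketch for the BiG-SCAPE check (the directory path and cutoff value are assumptions):

    from nplinker.arranger import validate_bigscape\n\n# expects either mix_clustering_c0.30.tsv or data_sqlite.db in the directory\nvalidate_bigscape(\"path/to/bigscape\", cutoff=\"0.30\")\n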
    "},{"location":"api/bigscape/","title":"BigScape","text":""},{"location":"api/bigscape/#nplinker.genomics.bigscape","title":"bigscape","text":""},{"location":"api/bigscape/#nplinker.genomics.bigscape.BigscapeGCFLoader","title":"BigscapeGCFLoader","text":"
    BigscapeGCFLoader(cluster_file: str | PathLike)\n

    Bases: GCFLoaderBase

    Build a loader for BiG-SCAPE GCF cluster file.

    Attributes:

    - cluster_file (str): Path to the BiG-SCAPE cluster file.

    Parameters:

    - cluster_file (str | PathLike): Path to the BiG-SCAPE cluster file; the filename follows the pattern \"<class>_clustering_c0.xx.tsv\". Required.

    Source code in src/nplinker/genomics/bigscape/bigscape_loader.py

    def __init__(self, cluster_file: str | PathLike, /) -> None:\n    \"\"\"Initialize the BiG-SCAPE GCF loader.\n\n    Args:\n        cluster_file: Path to the BiG-SCAPE cluster file,\n            the filename has a pattern of \"<class>_clustering_c0.xx.tsv\".\n    \"\"\"\n    self.cluster_file: str = str(cluster_file)\n    self._gcf_list = self._parse_gcf(self.cluster_file)\n
    "},{"location":"api/bigscape/#nplinker.genomics.bigscape.BigscapeGCFLoader.cluster_file","title":"cluster_file instance-attribute","text":"
    cluster_file: str = str(cluster_file)\n
    "},{"location":"api/bigscape/#nplinker.genomics.bigscape.BigscapeGCFLoader.get_gcfs","title":"get_gcfs","text":"
    get_gcfs(\n    keep_mibig_only: bool = False,\n    keep_singleton: bool = False,\n) -> list[GCF]\n

    Get all GCF objects.

    Parameters:

    - keep_mibig_only (bool): True to keep GCFs that contain only MIBiG BGCs. Defaults to False.
    - keep_singleton (bool): True to keep singleton GCFs. A singleton GCF is a GCF that contains only one BGC. Defaults to False.

    Returns:

    - list[GCF]: A list of GCF objects.

    Source code in src/nplinker/genomics/bigscape/bigscape_loader.py
    def get_gcfs(self, keep_mibig_only: bool = False, keep_singleton: bool = False) -> list[GCF]:\n    \"\"\"Get all GCF objects.\n\n    Args:\n        keep_mibig_only: True to keep GCFs that contain only MIBiG\n            BGCs.\n        keep_singleton: True to keep singleton GCFs. A singleton GCF\n            is a GCF that contains only one BGC.\n\n    Returns:\n        A list of GCF objects.\n    \"\"\"\n    gcf_list = self._gcf_list\n    if not keep_mibig_only:\n        gcf_list = [gcf for gcf in gcf_list if not gcf.has_mibig_only()]\n    if not keep_singleton:\n        gcf_list = [gcf for gcf in gcf_list if not gcf.is_singleton()]\n    return gcf_list\n
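
    A usage sketch for the loader (the cluster file path is an assumption; the import path follows the anchors above):

    from nplinker.genomics.bigscape import BigscapeGCFLoader\n\nloader = BigscapeGCFLoader(\"bigscape/mix_clustering_c0.30.tsv\")\n# drop MIBiG-only and singleton GCFs, matching the default arguments\ngcfs = loader.get_gcfs(keep_mibig_only=False, keep_singleton=False)\nprint(len(gcfs))\n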
    "},{"location":"api/bigscape/#nplinker.genomics.bigscape.BigscapeV2GCFLoader","title":"BigscapeV2GCFLoader","text":"
    BigscapeV2GCFLoader(db_file: str | PathLike)\n

    Bases: GCFLoaderBase

    Build a loader for BiG-SCAPE v2 database file.

    Attributes:

    - db_file (str): Path to the BiG-SCAPE database file.

    Parameters:

    - db_file (str | PathLike): Path to the BiG-SCAPE v2 database file. Required.

    Source code in src/nplinker/genomics/bigscape/bigscape_loader.py
    def __init__(self, db_file: str | PathLike, /) -> None:\n    \"\"\"Initialize the BiG-SCAPE v2 GCF loader.\n\n    Args:\n        db_file: Path to the BiG-SCAPE v2 database file\n    \"\"\"\n    self.db_file = str(db_file)\n    self._gcf_list = self._parse_gcf(self.db_file)\n
    "},{"location":"api/bigscape/#nplinker.genomics.bigscape.BigscapeV2GCFLoader.db_file","title":"db_file instance-attribute","text":"
    db_file = str(db_file)\n
    "},{"location":"api/bigscape/#nplinker.genomics.bigscape.BigscapeV2GCFLoader.get_gcfs","title":"get_gcfs","text":"
    get_gcfs(\n    keep_mibig_only: bool = False,\n    keep_singleton: bool = False,\n) -> list[GCF]\n

    Get all GCF objects.

    Parameters:

    - keep_mibig_only (bool): True to keep GCFs that contain only MIBiG BGCs. Defaults to False.
    - keep_singleton (bool): True to keep singleton GCFs. A singleton GCF is a GCF that contains only one BGC. Defaults to False.

    Returns:

    - list[GCF]: A list of GCF objects.

    Source code in src/nplinker/genomics/bigscape/bigscape_loader.py
    def get_gcfs(self, keep_mibig_only: bool = False, keep_singleton: bool = False) -> list[GCF]:\n    \"\"\"Get all GCF objects.\n\n    Args:\n        keep_mibig_only: True to keep GCFs that contain only MIBiG\n            BGCs.\n        keep_singleton: True to keep singleton GCFs. A singleton GCF\n            is a GCF that contains only one BGC.\n\n    Returns:\n        a list of GCF objects.\n    \"\"\"\n    gcf_list = self._gcf_list\n    if not keep_mibig_only:\n        gcf_list = [gcf for gcf in gcf_list if not gcf.has_mibig_only()]\n    if not keep_singleton:\n        gcf_list = [gcf for gcf in gcf_list if not gcf.is_singleton()]\n    return gcf_list\n
    "},{"location":"api/bigscape/#nplinker.genomics.bigscape.run_bigscape","title":"run_bigscape","text":"
    run_bigscape(\n    antismash_path: str | PathLike,\n    output_path: str | PathLike,\n    extra_params: str,\n)\n
    Source code in src/nplinker/genomics/bigscape/runbigscape.py
    def run_bigscape(\n    antismash_path: str | PathLike,\n    output_path: str | PathLike,\n    extra_params: str,\n):\n    bigscape_py_path = \"bigscape.py\"\n    logger.info(\n        f'run_bigscape: input=\"{antismash_path}\", output=\"{output_path}\", extra_params=\"{extra_params}\"'\n    )\n\n    try:\n        subprocess.run([bigscape_py_path, \"-h\"], capture_output=True, check=True)\n    except Exception as e:\n        raise Exception(f\"Failed to find/run bigscape.py (path={bigscape_py_path}, err={e})\") from e\n\n    if not os.path.exists(antismash_path):\n        raise Exception(f'antismash_path \"{antismash_path}\" does not exist!')\n\n    # configure the IO-related parameters, including pfam_dir\n    args = [bigscape_py_path, \"-i\", antismash_path, \"-o\", output_path, \"--pfam_dir\", PFAM_PATH]\n\n    # append the user supplied params, if any\n    if len(extra_params) > 0:\n        args.extend(extra_params.split(\" \"))\n\n    logger.info(f\"BiG-SCAPE command: {args}\")\n    result = subprocess.run(args, stdout=sys.stdout, stderr=sys.stderr, check=True)\n    logger.info(f\"BiG-SCAPE completed with return code {result.returncode}\")\n    # use subprocess.CompletedProcess.check_returncode() to test if the BiG-SCAPE\n    # process exited successfully. This throws an exception for non-zero returncodes\n    # which will indicate to the PODPDownloader module that something went wrong.\n    result.check_returncode()\n\n    return True\n
    "},{"location":"api/bigscape/#nplinker.genomics.bigscape.run_bigscape","title":"run_bigscape","text":"
    run_bigscape(\n    antismash_path: str | PathLike,\n    output_path: str | PathLike,\n    extra_params: str,\n)\n
    Source code in src/nplinker/genomics/bigscape/runbigscape.py
    def run_bigscape(\n    antismash_path: str | PathLike,\n    output_path: str | PathLike,\n    extra_params: str,\n):\n    bigscape_py_path = \"bigscape.py\"\n    logger.info(\n        f'run_bigscape: input=\"{antismash_path}\", output=\"{output_path}\", extra_params={extra_params}\"'\n    )\n\n    try:\n        subprocess.run([bigscape_py_path, \"-h\"], capture_output=True, check=True)\n    except Exception as e:\n        raise Exception(f\"Failed to find/run bigscape.py (path={bigscape_py_path}, err={e})\") from e\n\n    if not os.path.exists(antismash_path):\n        raise Exception(f'antismash_path \"{antismash_path}\" does not exist!')\n\n    # configure the IO-related parameters, including pfam_dir\n    args = [bigscape_py_path, \"-i\", antismash_path, \"-o\", output_path, \"--pfam_dir\", PFAM_PATH]\n\n    # append the user supplied params, if any\n    if len(extra_params) > 0:\n        args.extend(extra_params.split(\" \"))\n\n    logger.info(f\"BiG-SCAPE command: {args}\")\n    result = subprocess.run(args, stdout=sys.stdout, stderr=sys.stderr, check=True)\n    logger.info(f\"BiG-SCAPE completed with return code {result.returncode}\")\n    # use subprocess.CompletedProcess.check_returncode() to test if the BiG-SCAPE\n    # process exited successfully. This throws an exception for non-zero returncodes\n    # which will indicate to the PODPDownloader module that something went wrong.\n    result.check_returncode()\n\n    return True\n
    "},{"location":"api/genomics/","title":"Data Models","text":""},{"location":"api/genomics/#nplinker.genomics","title":"genomics","text":""},{"location":"api/genomics/#nplinker.genomics.BGC","title":"BGC","text":"
    BGC(id: str, /, *product_prediction: str)\n

    Class to model BGC (biosynthetic gene cluster) data.

    BGC data include both annotations and sequence data. This class is mainly designed to model the annotations or metadata.

    The raw BGC data is stored in GenBank format (.gbk). Additional GenBank features could be added to the GenBank file to annotate BGCs, e.g. antiSMASH has some self-defined features (like region) in its output GenBank files.

    The annotations of BGC can be stored in JSON format, which is defined and used by MIBiG.

    Attributes:

    - id: BGC identifier, e.g. MIBiG accession, GenBank accession.
    - product_prediction: A tuple of (predicted) natural products or product classes of the BGC. For antiSMASH GenBank data, the feature \"region /product\" gives product information. For MIBiG metadata, its biosynthetic class provides such info.
    - mibig_bgc_class (tuple[str] | None): A tuple of MIBiG biosynthetic classes to which the BGC belongs. Defaults to None. MIBiG defines 6 major biosynthetic classes for natural products, including \"NRP\", \"Polyketide\", \"RiPP\", \"Terpene\", \"Saccharide\" and \"Alkaloid\". Note that natural products created by all other biosynthetic mechanisms fall under the category \"Other\". For more details see the publication: https://doi.org/10.1186/s40793-018-0318-y.
    - description (str | None): Brief description of the BGC. Defaults to None.
    - smiles (tuple[str] | None): A tuple of SMILES formulas of the BGC's products. Defaults to None.
    - antismash_file (str | None): The path to the antiSMASH GenBank file. Defaults to None.
    - antismash_id (str | None): Identifier of the antiSMASH BGC, referring to the feature VERSION of the GenBank file. Defaults to None.
    - antismash_region (int | None): The antiSMASH BGC region number, referring to the feature region of the GenBank file. Defaults to None.
    - parents (set[GCF]): The set of GCFs that contain the BGC.
    - strain (Strain | None): The strain of the BGC.

    Parameters:

    - id (str): BGC identifier, e.g. MIBiG accession, GenBank accession. Required.
    - product_prediction (str): BGC's (predicted) natural products or product classes. Defaults to ().

    Source code in src/nplinker/genomics/bgc.py
    def __init__(self, id: str, /, *product_prediction: str):\n    \"\"\"Initialize the BGC object.\n\n    Args:\n        id: BGC identifier, e.g. MIBiG accession, GenBank accession.\n        product_prediction: BGC's (predicted) natural products or product classes.\n    \"\"\"\n    # BGC metadata\n    self.id = id\n    self.product_prediction = product_prediction\n\n    self.mibig_bgc_class: tuple[str] | None = None\n    self.description: str | None = None\n    self.smiles: tuple[str] | None = None\n\n    # antismash related attributes\n    self.antismash_file: str | None = None\n    self.antismash_id: str | None = None  # version in .gbk, id in SeqRecord\n    self.antismash_region: int | None = None  # antismash region number\n\n    # other attributes\n    self.parents: set[GCF] = set()\n    self._strain: Strain | None = None\n
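
    A toy construction of a BGC object, following the positional-only signature shown above:

    from nplinker.genomics import BGC\n\nbgc = BGC(\"BGC0000001\", \"Polyketide\")\nprint(bgc.id, bgc.product_prediction)  # BGC0000001 ('Polyketide',)\nprint(bgc.is_mibig())  # True, because the id starts with \"BGC\"\n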
    "},{"location":"api/genomics/#nplinker.genomics.BGC.id","title":"id instance-attribute","text":"
    id = id\n
    "},{"location":"api/genomics/#nplinker.genomics.BGC.product_prediction","title":"product_prediction instance-attribute","text":"
    product_prediction = product_prediction\n
    "},{"location":"api/genomics/#nplinker.genomics.BGC.mibig_bgc_class","title":"mibig_bgc_class instance-attribute","text":"
    mibig_bgc_class: tuple[str] | None = None\n
    "},{"location":"api/genomics/#nplinker.genomics.BGC.description","title":"description instance-attribute","text":"
    description: str | None = None\n
    "},{"location":"api/genomics/#nplinker.genomics.BGC.smiles","title":"smiles instance-attribute","text":"
    smiles: tuple[str] | None = None\n
    "},{"location":"api/genomics/#nplinker.genomics.BGC.antismash_file","title":"antismash_file instance-attribute","text":"
    antismash_file: str | None = None\n
    "},{"location":"api/genomics/#nplinker.genomics.BGC.antismash_id","title":"antismash_id instance-attribute","text":"
    antismash_id: str | None = None\n
    "},{"location":"api/genomics/#nplinker.genomics.BGC.antismash_region","title":"antismash_region instance-attribute","text":"
    antismash_region: int | None = None\n
    "},{"location":"api/genomics/#nplinker.genomics.BGC.parents","title":"parents instance-attribute","text":"
    parents: set[GCF] = set()\n
    "},{"location":"api/genomics/#nplinker.genomics.BGC.strain","title":"strain property writable","text":"
    strain: Strain | None\n

    Get the strain of the BGC.

    "},{"location":"api/genomics/#nplinker.genomics.BGC.bigscape_classes","title":"bigscape_classes property","text":"
    bigscape_classes: set[str | None]\n

    Get BiG-SCAPE's BGC classes.

    BiG-SCAPE's BGC classes are similar to those defined in MIBiG but have more categories (7 classes). For more details see: https://doi.org/10.1038%2Fs41589-019-0400-9.

    "},{"location":"api/genomics/#nplinker.genomics.BGC.aa_predictions","title":"aa_predictions property","text":"
    aa_predictions: list\n

    Amino acids as predicted monomers of the product.

    Returns:

    - list: A list of dicts with amino acid as key and prediction probability as value.

    "},{"location":"api/genomics/#nplinker.genomics.BGC.add_parent","title":"add_parent","text":"
    add_parent(gcf: GCF) -> None\n

    Add a parent GCF to the BGC.

    Parameters:

    - gcf (GCF): Gene cluster family. Required.

    Source code in src/nplinker/genomics/bgc.py
    def add_parent(self, gcf: GCF) -> None:\n    \"\"\"Add a parent GCF to the BGC.\n\n    Args:\n        gcf: gene cluster family\n    \"\"\"\n    gcf.add_bgc(self)\n
    "},{"location":"api/genomics/#nplinker.genomics.BGC.detach_parent","title":"detach_parent","text":"
    detach_parent(gcf: GCF) -> None\n

    Remove a parent GCF.

    Source code in src/nplinker/genomics/bgc.py
    def detach_parent(self, gcf: GCF) -> None:\n    \"\"\"Remove a parent GCF.\"\"\"\n    gcf.detach_bgc(self)\n
    "},{"location":"api/genomics/#nplinker.genomics.BGC.is_mibig","title":"is_mibig","text":"
    is_mibig() -> bool\n

    Check if the BGC is MIBiG reference BGC or not.

    Note:

    This method evaluates MIBiG BGC based on the pattern that MIBiG BGC names start with \"BGC\". It might give false positive results.

    Returns:

    - bool: True if it's a MIBiG reference BGC.

    Source code in src/nplinker/genomics/bgc.py
    def is_mibig(self) -> bool:\n    \"\"\"Check if the BGC is MIBiG reference BGC or not.\n\n    Note:\n        This method evaluates MIBiG BGC based on the pattern that MIBiG\n        BGC names start with \"BGC\". It might give false positive result.\n\n    Returns:\n        True if it's MIBiG reference BGC\n    \"\"\"\n    return self.id.startswith(\"BGC\")\n
    "},{"location":"api/genomics/#nplinker.genomics.GCF","title":"GCF","text":"
    GCF(id: str)\n

    Class to model gene cluster family (GCF).

    A GCF is a group of similar BGCs, generated by clustering BGCs with tools such as BiG-SCAPE and BiG-SLICE.

    Attributes:

    - id: Id of the GCF object.
    - bgc_ids (set[str]): A set of BGC ids that belong to the GCF.
    - bigscape_class (str | None): BiG-SCAPE's BGC class. BiG-SCAPE's BGC classes are similar to those defined in MIBiG but have more categories (7 classes). For more details see: https://doi.org/10.1038%2Fs41589-019-0400-9.

    Parameters:

    - id (str): Id of the GCF object. Required.

    Source code in src/nplinker/genomics/gcf.py
    def __init__(self, id: str, /) -> None:\n    \"\"\"Initialize the GCF object.\n\n    Args:\n        id: id of the GCF object.\n    \"\"\"\n    self.id = id\n    self.bgc_ids: set[str] = set()\n    self.bigscape_class: str | None = None\n    self._bgcs: set[BGC] = set()\n    self._strains: StrainCollection = StrainCollection()\n
    "},{"location":"api/genomics/#nplinker.genomics.GCF.id","title":"id instance-attribute","text":"
    id = id\n
    "},{"location":"api/genomics/#nplinker.genomics.GCF.bgc_ids","title":"bgc_ids instance-attribute","text":"
    bgc_ids: set[str] = set()\n
    "},{"location":"api/genomics/#nplinker.genomics.GCF.bigscape_class","title":"bigscape_class instance-attribute","text":"
    bigscape_class: str | None = None\n
    "},{"location":"api/genomics/#nplinker.genomics.GCF.bgcs","title":"bgcs property","text":"
    bgcs: set[BGC]\n

    Get the BGC objects.

    "},{"location":"api/genomics/#nplinker.genomics.GCF.strains","title":"strains property","text":"
    strains: StrainCollection\n

    Get the strains in the GCF.

    "},{"location":"api/genomics/#nplinker.genomics.GCF.add_bgc","title":"add_bgc","text":"
    add_bgc(bgc: BGC) -> None\n

    Add a BGC object to the GCF.

    Source code in src/nplinker/genomics/gcf.py
    def add_bgc(self, bgc: BGC) -> None:\n    \"\"\"Add a BGC object to the GCF.\"\"\"\n    bgc.parents.add(self)\n    self._bgcs.add(bgc)\n    self.bgc_ids.add(bgc.id)\n    if bgc.strain is not None:\n        self._strains.add(bgc.strain)\n    else:\n        logger.warning(\"No strain specified for the BGC %s\", bgc.id)\n
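
    A small sketch of the parent-child linkage; note that add_bgc logs a warning here because the toy BGC has no strain assigned:

    from nplinker.genomics import BGC, GCF\n\nbgc = BGC(\"BGC0000001\", \"Polyketide\")\ngcf = GCF(\"1\")\ngcf.add_bgc(bgc)  # also registers gcf in bgc.parents\nprint(gcf.has_mibig_only())  # True: the only member id starts with \"BGC\"\nprint(gcf.is_singleton())  # True: exactly one BGC id\n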
    "},{"location":"api/genomics/#nplinker.genomics.GCF.detach_bgc","title":"detach_bgc","text":"
    detach_bgc(bgc: BGC) -> None\n

    Remove a child BGC object.

    Source code in src/nplinker/genomics/gcf.py
    def detach_bgc(self, bgc: BGC) -> None:\n    \"\"\"Remove a child BGC object.\"\"\"\n    bgc.parents.remove(self)\n    self._bgcs.remove(bgc)\n    self.bgc_ids.remove(bgc.id)\n    if bgc.strain is not None:\n        for other_bgc in self._bgcs:\n            if other_bgc.strain == bgc.strain:\n                return\n        self._strains.remove(bgc.strain)\n
    "},{"location":"api/genomics/#nplinker.genomics.GCF.has_strain","title":"has_strain","text":"
    has_strain(strain: Strain) -> bool\n

    Check if the given strain exists.

    Parameters:

    - strain (Strain): Strain object. Required.

    Returns:

    - bool: True when the given strain exists.

    Source code in src/nplinker/genomics/gcf.py
    def has_strain(self, strain: Strain) -> bool:\n    \"\"\"Check if the given strain exists.\n\n    Args:\n        strain: `Strain` object.\n\n    Returns:\n        True when the given strain exist.\n    \"\"\"\n    return strain in self._strains\n
    "},{"location":"api/genomics/#nplinker.genomics.GCF.has_mibig_only","title":"has_mibig_only","text":"
    has_mibig_only() -> bool\n

    Check if the GCF's children are only MIBiG BGCs.

    Returns:

    - bool: True if GCF.bgc_ids contains only MIBiG BGC ids.

    Source code in src/nplinker/genomics/gcf.py
    def has_mibig_only(self) -> bool:\n    \"\"\"Check if the GCF's children are only MIBiG BGCs.\n\n    Returns:\n        True if `GCF.bgc_ids` are only MIBiG BGC ids.\n    \"\"\"\n    return all(map(lambda id: id.startswith(\"BGC\"), self.bgc_ids))\n
    "},{"location":"api/genomics/#nplinker.genomics.GCF.is_singleton","title":"is_singleton","text":"
    is_singleton() -> bool\n

    Check if the GCF contains only one BGC.

    Returns:

    - bool: True if GCF.bgc_ids contains only one BGC id.

    Source code in src/nplinker/genomics/gcf.py
    def is_singleton(self) -> bool:\n    \"\"\"Check if the GCF contains only one BGC.\n\n    Returns:\n        True if `GCF.bgc_ids` contains only one BGC id.\n    \"\"\"\n    return len(self.bgc_ids) == 1\n
    "},{"location":"api/genomics_abc/","title":"Abstract Base Classes","text":""},{"location":"api/genomics_abc/#nplinker.genomics.abc","title":"abc","text":""},{"location":"api/genomics_abc/#nplinker.genomics.abc.BGCLoaderBase","title":"BGCLoaderBase","text":"
    BGCLoaderBase(data_dir: str | PathLike)\n

    Bases: ABC

    Abstract base class for BGC loader.

    Parameters:

    - data_dir (str | PathLike): Path to the directory that contains BGC metadata files (.json) or full-data GenBank files (.gbk). Required.

    Source code in src/nplinker/genomics/abc.py
    def __init__(self, data_dir: str | PathLike) -> None:\n    \"\"\"Initialize the BGC loader.\n\n    Args:\n        data_dir: Path to directory that contains BGC metadata files\n            (.json) or full data genbank files (.gbk).\n    \"\"\"\n    self.data_dir = str(data_dir)\n
    "},{"location":"api/genomics_abc/#nplinker.genomics.abc.BGCLoaderBase.data_dir","title":"data_dir instance-attribute","text":"
    data_dir = str(data_dir)\n
    "},{"location":"api/genomics_abc/#nplinker.genomics.abc.BGCLoaderBase.get_files","title":"get_files abstractmethod","text":"
    get_files() -> dict[str, str]\n

    Get path to BGC files.

    Returns:

    - dict[str, str]: The key is the BGC name and the value is the path to the BGC file.

    Source code in src/nplinker/genomics/abc.py
    @abstractmethod\ndef get_files(self) -> dict[str, str]:\n    \"\"\"Get path to BGC files.\n\n    Returns:\n        The key is BGC name and value is path to BGC file\n    \"\"\"\n
    "},{"location":"api/genomics_abc/#nplinker.genomics.abc.BGCLoaderBase.get_bgcs","title":"get_bgcs abstractmethod","text":"
    get_bgcs() -> list[BGC]\n

    Get BGC objects.

    Returns:

    - list[BGC]: A list of BGC objects.

    Source code in src/nplinker/genomics/abc.py
    @abstractmethod\ndef get_bgcs(self) -> list[BGC]:\n    \"\"\"Get BGC objects.\n\n    Returns:\n        A list of BGC objects\n    \"\"\"\n
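
    A minimal concrete subclass sketch; the globbing logic is illustrative only, not NPLinker's actual loader:

    from pathlib import Path\n\nfrom nplinker.genomics import BGC\nfrom nplinker.genomics.abc import BGCLoaderBase\n\n\nclass GbkLoader(BGCLoaderBase):\n    \"\"\"Toy loader that treats every .gbk file in data_dir as one BGC.\"\"\"\n\n    def get_files(self) -> dict[str, str]:\n        return {p.stem: str(p) for p in Path(self.data_dir).glob(\"*.gbk\")}\n\n    def get_bgcs(self) -> list[BGC]:\n        return [BGC(name) for name in self.get_files()]\n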
    "},{"location":"api/genomics_abc/#nplinker.genomics.abc.GCFLoaderBase","title":"GCFLoaderBase","text":"

    Bases: ABC

    Abstract base class for GCF loader.

    "},{"location":"api/genomics_abc/#nplinker.genomics.abc.GCFLoaderBase.get_gcfs","title":"get_gcfs abstractmethod","text":"
    get_gcfs(\n    keep_mibig_only: bool, keep_singleton: bool\n) -> list[GCF]\n

    Get GCF objects.

    Parameters:

    - keep_mibig_only (bool): True to keep GCFs that contain only MIBiG BGCs. Required.
    - keep_singleton (bool): True to keep singleton GCFs. A singleton GCF is a GCF that contains only one BGC. Required.

    Returns:

    - list[GCF]: A list of GCF objects.

    Source code in src/nplinker/genomics/abc.py
    @abstractmethod\ndef get_gcfs(self, keep_mibig_only: bool, keep_singleton: bool) -> list[GCF]:\n    \"\"\"Get GCF objects.\n\n    Args:\n        keep_mibig_only: True to keep GCFs that contain only MIBiG\n            BGCs.\n        keep_singleton: True to keep singleton GCFs. A singleton GCF\n            is a GCF that contains only one BGC.\n\n    Returns:\n        A list of GCF objects\n    \"\"\"\n
    "},{"location":"api/genomics_utils/","title":"Utilities","text":""},{"location":"api/genomics_utils/#nplinker.genomics.utils","title":"utils","text":""},{"location":"api/genomics_utils/#nplinker.genomics.utils.logger","title":"logger module-attribute","text":"
    logger = getLogger(__name__)\n
    "},{"location":"api/genomics_utils/#nplinker.genomics.utils.generate_mappings_genome_id_bgc_id","title":"generate_mappings_genome_id_bgc_id","text":"
    generate_mappings_genome_id_bgc_id(\n    bgc_dir: str | PathLike,\n    output_file: str | PathLike | None = None,\n) -> None\n

    Generate a file that maps genome id to BGC id.

    Note that the output_file will be overwritten if it already exists.

    Parameters:

    - bgc_dir (str | PathLike): The directory with one layer of subfolders, where each subfolder contains BGC files in .gbk format. It assumes that the subfolder name is the genome id (e.g. refseq) and the BGC file name is the BGC id. Required.
    - output_file (str | PathLike | None): The path to the output file. Note that the file will be overwritten if it already exists. Defaults to None, in which case the output file will be placed in the directory bgc_dir with a file name defined in the global variable GENOME_BGC_MAPPINGS_FILENAME.

    Source code in src/nplinker/genomics/utils.py
    def generate_mappings_genome_id_bgc_id(\n    bgc_dir: str | PathLike, output_file: str | PathLike | None = None\n) -> None:\n    \"\"\"Generate a file that maps genome id to BGC id.\n\n    Note that the `output_file` will be overwritten if it already exists.\n\n    Args:\n        bgc_dir: The directory has one-layer of subfolders and\n            each subfolder contains BGC files in `.gbk` format.\n            It assumes that\n            - the subfolder name is the genome id (e.g. refseq),\n            - the BGC file name is the BGC id.\n        output_file: The path to the output file. Note\n            that the file will be overwritten if it already exists.\n            Defaults to None, in which case the output file will be placed in\n            the directory `bgc_dir` with a file name defined in global variable\n            `GENOME_BGC_MAPPINGS_FILENAME`.\n    \"\"\"\n    bgc_dir = Path(bgc_dir)\n    genome_bgc_mappings = {}\n\n    for subdir in list_dirs(bgc_dir):\n        genome_id = Path(subdir).name\n        bgc_files = list_files(subdir, suffix=(\".gbk\"), keep_parent=False)\n        bgc_ids = [bgc_id for f in bgc_files if (bgc_id := Path(f).stem) != genome_id]\n        if bgc_ids:\n            genome_bgc_mappings[genome_id] = bgc_ids\n        else:\n            logger.warning(\"No BGC files found in %s\", subdir)\n\n    # sort mappings by genome_id and construct json data\n    genome_bgc_mappings = dict(sorted(genome_bgc_mappings.items()))\n    json_data_mappings = [{\"genome_ID\": k, \"BGC_ID\": v} for k, v in genome_bgc_mappings.items()]\n    json_data = {\"mappings\": json_data_mappings, \"version\": \"1.0\"}\n\n    # validate json data\n    validate(instance=json_data, schema=GENOME_BGC_MAPPINGS_SCHEMA)\n\n    if output_file is None:\n        output_file = bgc_dir / GENOME_BGC_MAPPINGS_FILENAME\n    with open(output_file, \"w\") as f:\n        json.dump(json_data, f)\n    logger.info(\"Generated genome-BGC mappings file: %s\", output_file)\n
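
    Usage sketch (the antiSMASH directory path is an assumption):

    from nplinker.genomics.utils import generate_mappings_genome_id_bgc_id\n\n# writes the genome-to-BGC mappings JSON into the antiSMASH directory\ngenerate_mappings_genome_id_bgc_id(\"path/to/antismash\")\n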
    "},{"location":"api/genomics_utils/#nplinker.genomics.utils.add_strain_to_bgc","title":"add_strain_to_bgc","text":"
    add_strain_to_bgc(\n    strains: StrainCollection, bgcs: Sequence[BGC]\n) -> tuple[list[BGC], list[BGC]]\n

    Assign a Strain object to BGC.strain for input BGCs.

    BGC id is used to find the corresponding Strain object. It's possible that no Strain object is found for a BGC id.

    Note that the input list bgcs will be changed in place.

    Parameters:

    - strains (StrainCollection): A collection of all strain objects. Required.
    - bgcs (Sequence[BGC]): A list of BGC objects. Required.

    Returns:

    - tuple[list[BGC], list[BGC]]: A tuple of two lists of BGC objects: the first list contains BGC objects that are updated with a Strain object; the second list contains BGC objects that are not updated because no Strain object was found.

    Raises:

    - ValueError: Multiple strain objects found for a BGC id.

    Source code in src/nplinker/genomics/utils.py
    def add_strain_to_bgc(\n    strains: StrainCollection, bgcs: Sequence[BGC]\n) -> tuple[list[BGC], list[BGC]]:\n    \"\"\"Assign a Strain object to `BGC.strain` for input BGCs.\n\n    BGC id is used to find the corresponding Strain object. It's possible that\n    no Strain object is found for a BGC id.\n\n    Note that the input list `bgcs` will be changed in place.\n\n    Args:\n        strains: A collection of all strain objects.\n        bgcs: A list of BGC objects.\n\n    Returns:\n        A tuple of two lists of BGC objects,\n\n            - the first list contains BGC objects that are updated with Strain object;\n            - the second list contains BGC objects that are not updated with\n                Strain object because no Strain object is found.\n\n    Raises:\n        ValueError: Multiple strain objects found for a BGC id.\n    \"\"\"\n    bgc_with_strain = []\n    bgc_without_strain = []\n    for bgc in bgcs:\n        try:\n            strain_list = strains.lookup(bgc.id)\n        except ValueError:\n            bgc_without_strain.append(bgc)\n            continue\n        if len(strain_list) > 1:\n            raise ValueError(\n                f\"Multiple strain objects found for BGC id '{bgc.id}'.\"\n                f\"BGC object accept only one strain.\"\n            )\n        bgc.strain = strain_list[0]\n        bgc_with_strain.append(bgc)\n\n    logger.info(\n        f\"{len(bgc_with_strain)} BGC objects updated with Strain object.\\n\"\n        f\"{len(bgc_without_strain)} BGC objects not updated with Strain object.\"\n    )\n    return bgc_with_strain, bgc_without_strain\n
    "},{"location":"api/genomics_utils/#nplinker.genomics.utils.add_bgc_to_gcf","title":"add_bgc_to_gcf","text":"
    add_bgc_to_gcf(\n    bgcs: Sequence[BGC], gcfs: Sequence[GCF]\n) -> tuple[list[GCF], list[GCF], dict[GCF, set[str]]]\n

    Add BGC objects to GCF object based on GCF's BGC ids.

    The GCF.bgc_ids attribute contains the ids of BGC objects. These ids are used to find BGC objects in the input bgcs list. The found BGC objects are added to the bgcs attribute of the GCF object. It is possible that some BGC ids are not found in the input bgcs list, in which case their BGC objects are missing from the GCF object.

    This method changes the lists bgcs and gcfs in place.

    Parameters:

    - bgcs (Sequence[BGC]): A list of BGC objects. Required.
    - gcfs (Sequence[GCF]): A list of GCF objects. Required.

    Returns:

    - tuple[list[GCF], list[GCF], dict[GCF, set[str]]]: A tuple of two lists and a dictionary: the first list contains GCF objects that are updated with BGC objects; the second list contains GCF objects that are not updated because no BGC objects were found; the dictionary has GCF objects as keys and sets of missing BGC ids as values.

    Source code in src/nplinker/genomics/utils.py
    def add_bgc_to_gcf(\n    bgcs: Sequence[BGC], gcfs: Sequence[GCF]\n) -> tuple[list[GCF], list[GCF], dict[GCF, set[str]]]:\n    \"\"\"Add BGC objects to GCF object based on GCF's BGC ids.\n\n    The attribute of `GCF.bgc_ids` contains the ids of BGC objects. These ids\n    are used to find BGC objects from the input `bgcs` list. The found BGC\n    objects are added to the `bgcs` attribute of GCF object. It is possible that\n    some BGC ids are not found in the input `bgcs` list, and so their BGC\n    objects are missing in the GCF object.\n\n    This method changes the lists `bgcs` and `gcfs` in place.\n\n    Args:\n        bgcs: A list of BGC objects.\n        gcfs: A list of GCF objects.\n\n    Returns:\n        A tuple of two lists and a dictionary,\n\n            - The first list contains GCF objects that are updated with BGC objects;\n            - The second list contains GCF objects that are not updated with BGC objects\n                because no BGC objects are found;\n            - The dictionary contains GCF objects as keys and a set of ids of missing\n                BGC objects as values.\n    \"\"\"\n    bgc_dict = {bgc.id: bgc for bgc in bgcs}\n    gcf_with_bgc = []\n    gcf_without_bgc = []\n    gcf_missing_bgc: dict[GCF, set[str]] = {}\n    for gcf in gcfs:\n        for bgc_id in gcf.bgc_ids:\n            try:\n                bgc = bgc_dict[bgc_id]\n            except KeyError:\n                if gcf not in gcf_missing_bgc:\n                    gcf_missing_bgc[gcf] = {bgc_id}\n                else:\n                    gcf_missing_bgc[gcf].add(bgc_id)\n                continue\n            gcf.add_bgc(bgc)\n\n        if gcf.bgcs:\n            gcf_with_bgc.append(gcf)\n        else:\n            gcf_without_bgc.append(gcf)\n\n    logger.info(\n        f\"{len(gcf_with_bgc)} GCF objects updated with BGC objects.\\n\"\n        f\"{len(gcf_without_bgc)} GCF objects not updated with BGC objects.\\n\"\n        f\"{len(gcf_missing_bgc)} GCF objects have missing BGC objects.\"\n    )\n    return gcf_with_bgc, gcf_without_bgc, gcf_missing_bgc\n
    "},{"location":"api/genomics_utils/#nplinker.genomics.utils.get_mibig_from_gcf","title":"get_mibig_from_gcf","text":"
    get_mibig_from_gcf(\n    gcfs: Sequence[GCF],\n) -> tuple[list[BGC], StrainCollection]\n

    Get MIBiG BGCs and strains from GCF objects.

    Parameters:

    - gcfs (Sequence[GCF]): A list of GCF objects. Required.

    Returns:

    - tuple[list[BGC], StrainCollection]: A tuple of two objects: the first is a list of MIBiG BGC objects used in the GCFs; the second is a StrainCollection object that contains all Strain objects used in the GCFs.

    Source code in src/nplinker/genomics/utils.py
    def get_mibig_from_gcf(gcfs: Sequence[GCF]) -> tuple[list[BGC], StrainCollection]:\n    \"\"\"Get MIBiG BGCs and strains from GCF objects.\n\n    Args:\n        gcfs: A list of GCF objects.\n\n    Returns:\n        A tuple of two objects,\n\n            - the first is a list of MIBiG BGC objects used in the GCFs;\n            - the second is a StrainCollection object that contains all Strain objects used in the\n            GCFs.\n    \"\"\"\n    mibig_bgcs_in_use = []\n    mibig_strains_in_use = StrainCollection()\n    for gcf in gcfs:\n        for bgc in gcf.bgcs:\n            if bgc.is_mibig():\n                mibig_bgcs_in_use.append(bgc)\n                if bgc.strain is not None:\n                    mibig_strains_in_use.add(bgc.strain)\n    return mibig_bgcs_in_use, mibig_strains_in_use\n
    "},{"location":"api/genomics_utils/#nplinker.genomics.utils.extract_mappings_strain_id_original_genome_id","title":"extract_mappings_strain_id_original_genome_id","text":"
    extract_mappings_strain_id_original_genome_id(\n    podp_project_json_file: str | PathLike,\n) -> dict[str, set[str]]\n

    Extract mappings \"strain id <-> original genome id\".

    Parameters:

    - podp_project_json_file (str | PathLike): The path to the PODP project JSON file. Required.

    Returns:

    - dict[str, set[str]]: Key is strain id and value is a set of original genome ids.

    Notes

    The podp_project_json_file is the project JSON file downloaded from PODP platform. For example, for project MSV000079284, its json file is https://pairedomicsdata.bioinformatics.nl/api/projects/4b29ddc3-26d0-40d7-80c5-44fb6631dbf9.4.

    Source code in src/nplinker/genomics/utils.py
    def extract_mappings_strain_id_original_genome_id(\n    podp_project_json_file: str | PathLike,\n) -> dict[str, set[str]]:\n    \"\"\"Extract mappings \"strain id <-> original genome id\".\n\n    Args:\n        podp_project_json_file: The path to the PODP project\n            JSON file.\n\n    Returns:\n        Key is strain id and value is a set of original genome ids.\n\n    Notes:\n        The `podp_project_json_file` is the project JSON file downloaded from\n        PODP platform. For example, for project MSV000079284, its json file is\n        https://pairedomicsdata.bioinformatics.nl/api/projects/4b29ddc3-26d0-40d7-80c5-44fb6631dbf9.4.\n    \"\"\"\n    mappings_dict: dict[str, set[str]] = {}\n    with open(podp_project_json_file, \"r\") as f:\n        json_data = json.load(f)\n\n    validate_podp_json(json_data)\n\n    for record in json_data[\"genomes\"]:\n        strain_id = record[\"genome_label\"]\n        genome_id = get_best_available_genome_id(record[\"genome_ID\"])\n        if genome_id is None:\n            logger.warning(\"Failed to extract genome ID from genome with label %s\", strain_id)\n            continue\n        if strain_id in mappings_dict:\n            mappings_dict[strain_id].add(genome_id)\n        else:\n            mappings_dict[strain_id] = {genome_id}\n    return mappings_dict\n
    "},{"location":"api/genomics_utils/#nplinker.genomics.utils.extract_mappings_original_genome_id_resolved_genome_id","title":"extract_mappings_original_genome_id_resolved_genome_id","text":"
    extract_mappings_original_genome_id_resolved_genome_id(\n    genome_status_json_file: str | PathLike,\n) -> dict[str, str]\n

    Extract mappings \"original_genome_id <-> resolved_genome_id\".

    Parameters:

    - genome_status_json_file (str | PathLike): The path to the genome status JSON file. Required.

    Returns:

    - dict[str, str]: Key is original genome id and value is resolved genome id.

    Notes

    The genome_status_json_file is usually generated by the podp_download_and_extract_antismash_data function with a default file name defined in nplinker.defaults.GENOME_STATUS_FILENAME.

    Source code in src/nplinker/genomics/utils.py
    def extract_mappings_original_genome_id_resolved_genome_id(\n    genome_status_json_file: str | PathLike,\n) -> dict[str, str]:\n    \"\"\"Extract mappings \"original_genome_id <-> resolved_genome_id\".\n\n    Args:\n        genome_status_json_file: The path to the genome status\n            JSON file.\n\n    Returns:\n        Key is original genome id and value is resolved genome id.\n\n    Notes:\n        The `genome_status_json_file` is usually generated by the\n        `podp_download_and_extract_antismash_data` function with\n        a default file name defined in `nplinker.defaults.GENOME_STATUS_FILENAME`.\n    \"\"\"\n    gs_mappings_dict = GenomeStatus.read_json(genome_status_json_file)\n    return {gs.original_id: gs.resolved_refseq_id for gs in gs_mappings_dict.values()}\n
    "},{"location":"api/genomics_utils/#nplinker.genomics.utils.extract_mappings_resolved_genome_id_bgc_id","title":"extract_mappings_resolved_genome_id_bgc_id","text":"
    extract_mappings_resolved_genome_id_bgc_id(\n    genome_bgc_mappings_file: str | PathLike,\n) -> dict[str, set[str]]\n

    Extract mappings \"resolved_genome_id <-> bgc_id\".

    Parameters:

    - genome_bgc_mappings_file (str | PathLike): The path to the genome BGC mappings JSON file. Required.

    Returns:

    - dict[str, set[str]]: Key is resolved genome id and value is a set of BGC ids.

    Notes

    The genome_bgc_mappings_file is usually generated by the generate_mappings_genome_id_bgc_id function with a default file name defined in nplinker.defaults.GENOME_BGC_MAPPINGS_FILENAME.

    Source code in src/nplinker/genomics/utils.py
    def extract_mappings_resolved_genome_id_bgc_id(\n    genome_bgc_mappings_file: str | PathLike,\n) -> dict[str, set[str]]:\n    \"\"\"Extract mappings \"resolved_genome_id <-> bgc_id\".\n\n    Args:\n        genome_bgc_mappings_file: The path to the genome BGC\n            mappings JSON file.\n\n    Returns:\n        Key is resolved genome id and value is a set of BGC ids.\n\n    Notes:\n        The `genome_bgc_mappings_file` is usually generated by the\n        `generate_mappings_genome_id_bgc_id` function with a default file name\n        defined in `nplinker.defaults.GENOME_BGC_MAPPINGS_FILENAME`.\n    \"\"\"\n    with open(genome_bgc_mappings_file, \"r\") as f:\n        json_data = json.load(f)\n\n    # validate the JSON data\n    validate(json_data, GENOME_BGC_MAPPINGS_SCHEMA)\n\n    return {mapping[\"genome_ID\"]: set(mapping[\"BGC_ID\"]) for mapping in json_data[\"mappings\"]}\n
    "},{"location":"api/genomics_utils/#nplinker.genomics.utils.get_mappings_strain_id_bgc_id","title":"get_mappings_strain_id_bgc_id","text":"
    get_mappings_strain_id_bgc_id(\n    mappings_strain_id_original_genome_id: Mapping[\n        str, set[str]\n    ],\n    mappings_original_genome_id_resolved_genome_id: Mapping[\n        str, str\n    ],\n    mappings_resolved_genome_id_bgc_id: Mapping[\n        str, set[str]\n    ],\n) -> dict[str, set[str]]\n

    Get mappings \"strain_id <-> bgc_id\".

    Parameters:

    Name Type Description Default mappings_strain_id_original_genome_id Mapping[str, set[str]]

    Mappings \"strain_id <-> original_genome_id\".

    required mappings_original_genome_id_resolved_genome_id Mapping[str, str]

    Mappings \"original_genome_id <-> resolved_genome_id\".

    required mappings_resolved_genome_id_bgc_id Mapping[str, set[str]]

    Mappings \"resolved_genome_id <-> bgc_id\".

    required

    Returns:

    Type Description dict[str, set[str]]

    Key is strain id and value is a set of BGC ids.

    See Also

      • extract_mappings_strain_id_original_genome_id: Extract mappings \"strain_id <-> original_genome_id\".
      • extract_mappings_original_genome_id_resolved_genome_id: Extract mappings \"original_genome_id <-> resolved_genome_id\".
      • extract_mappings_resolved_genome_id_bgc_id: Extract mappings \"resolved_genome_id <-> bgc_id\".

    Source code in src/nplinker/genomics/utils.py
    def get_mappings_strain_id_bgc_id(\n    mappings_strain_id_original_genome_id: Mapping[str, set[str]],\n    mappings_original_genome_id_resolved_genome_id: Mapping[str, str],\n    mappings_resolved_genome_id_bgc_id: Mapping[str, set[str]],\n) -> dict[str, set[str]]:\n    \"\"\"Get mappings \"strain_id <-> bgc_id\".\n\n    Args:\n        mappings_strain_id_original_genome_id: Mappings\n            \"strain_id <-> original_genome_id\".\n        mappings_original_genome_id_resolved_genome_id: Mappings\n            \"original_genome_id <-> resolved_genome_id\".\n        mappings_resolved_genome_id_bgc_id: Mappings\n            \"resolved_genome_id <-> bgc_id\".\n\n    Returns:\n        Key is strain id and value is a set of BGC ids.\n\n    See Also:\n        - `extract_mappings_strain_id_original_genome_id`: Extract mappings\n            \"strain_id <-> original_genome_id\".\n        - `extract_mappings_original_genome_id_resolved_genome_id`: Extract mappings\n            \"original_genome_id <-> resolved_genome_id\".\n        - `extract_mappings_resolved_genome_id_bgc_id`: Extract mappings\n            \"resolved_genome_id <-> bgc_id\".\n    \"\"\"\n    mappings_dict = {}\n    for strain_id, original_genome_ids in mappings_strain_id_original_genome_id.items():\n        bgc_ids = set()\n        for original_genome_id in original_genome_ids:\n            resolved_genome_id = mappings_original_genome_id_resolved_genome_id[original_genome_id]\n            if (bgc_id := mappings_resolved_genome_id_bgc_id.get(resolved_genome_id)) is not None:\n                bgc_ids.update(bgc_id)\n        if bgc_ids:\n            mappings_dict[strain_id] = bgc_ids\n    return mappings_dict\n
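    For context, a minimal usage sketch chaining the three extract_* helpers into this function (the file names below are placeholders, not library defaults):

    >>> mappings_1 = extract_mappings_strain_id_original_genome_id('podp_project.json')  # placeholder path
    >>> mappings_2 = extract_mappings_original_genome_id_resolved_genome_id('genome_status.json')  # placeholder path
    >>> mappings_3 = extract_mappings_resolved_genome_id_bgc_id('genome_bgc_mappings.json')  # placeholder path
    >>> get_mappings_strain_id_bgc_id(mappings_1, mappings_2, mappings_3)
    {'strain_1': {'BGC_1', 'BGC_2'}, ...}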
    "},{"location":"api/gnps/","title":"GNPS","text":""},{"location":"api/gnps/#nplinker.metabolomics.gnps","title":"gnps","text":""},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSFormat","title":"GNPSFormat","text":"

    Bases: Enum

    Enum class for GNPS format (workflow).

    The GNPS format refers to the GNPS workflow. The name of the enum is a simple short name for the workflow, and the value of the enum is the actual name of the workflow on the GNPS website.
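    For example, both names are available through the standard Enum attributes:

    >>> GNPSFormat.SNETS.name
    'SNETS'
    >>> GNPSFormat.SNETS.value
    'METABOLOMICS-SNETS'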

    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSFormat.SNETS","title":"SNETS class-attribute instance-attribute","text":"
    SNETS = 'METABOLOMICS-SNETS'\n
    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSFormat.SNETSV2","title":"SNETSV2 class-attribute instance-attribute","text":"
    SNETSV2 = 'METABOLOMICS-SNETS-V2'\n
    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSFormat.FBMN","title":"FBMN class-attribute instance-attribute","text":"
    FBMN = 'FEATURE-BASED-MOLECULAR-NETWORKING'\n
    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSFormat.Unknown","title":"Unknown class-attribute instance-attribute","text":"
    Unknown = 'Unknown-GNPS-Workflow'\n
    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSDownloader","title":"GNPSDownloader","text":"
    GNPSDownloader(task_id: str, download_root: str | PathLike)\n

    Download GNPS zip archive for the given task id.

    Note that only GNPS workflows listed in the GNPSFormat enum are supported.

    Attributes:

    Name Type Description GNPS_DATA_DOWNLOAD_URL str

    URL template for downloading GNPS data.

    GNPS_DATA_DOWNLOAD_URL_FBMN str

    URL template for downloading GNPS data for FBMN.

    Parameters:

    Name Type Description Default task_id str

    GNPS task id, identifying the data to be downloaded.

    required download_root str | PathLike

    Path where the downloaded archive will be stored.

    required

    Raises:

    Type Description ValueError

    If the given task id does not correspond to a supported GNPS workflow.

    Examples:

    >>> GNPSDownloader(\"c22f44b14a3d450eb836d607cb9521bb\", \"~/downloads\")\n
    Source code in src/nplinker/metabolomics/gnps/gnps_downloader.py
    def __init__(self, task_id: str, download_root: str | PathLike):\n    \"\"\"Initialize the GNPSDownloader.\n\n    Args:\n        task_id: GNPS task id, identifying the data to be downloaded.\n        download_root: Path where to store the downloaded archive.\n\n    Raises:\n        ValueError: If the given task id does not correspond to a supported\n            GNPS workflow.\n\n    Examples:\n        >>> GNPSDownloader(\"c22f44b14a3d450eb836d607cb9521bb\", \"~/downloads\")\n    \"\"\"\n    gnps_format = gnps_format_from_task_id(task_id)\n    if gnps_format == GNPSFormat.Unknown:\n        raise ValueError(\n            f\"Unknown workflow type for GNPS task '{task_id}'.\"\n            f\"Supported GNPS workflows are described in the GNPSFormat enum, \"\n            f\"including such as 'METABOLOMICS-SNETS', 'METABOLOMICS-SNETS-V2' \"\n            f\"and 'FEATURE-BASED-MOLECULAR-NETWORKING'.\"\n        )\n\n    self._task_id = task_id\n    self._download_root: Path = Path(download_root)\n    self._gnps_format = gnps_format\n    self._file_name = gnps_format.value + \"-\" + self._task_id + \".zip\"\n
    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSDownloader.GNPS_DATA_DOWNLOAD_URL","title":"GNPS_DATA_DOWNLOAD_URL class-attribute instance-attribute","text":"
    GNPS_DATA_DOWNLOAD_URL: str = (\n    \"https://gnps.ucsd.edu/ProteoSAFe/DownloadResult?task={}&view=download_clustered_spectra\"\n)\n
    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSDownloader.GNPS_DATA_DOWNLOAD_URL_FBMN","title":"GNPS_DATA_DOWNLOAD_URL_FBMN class-attribute instance-attribute","text":"
    GNPS_DATA_DOWNLOAD_URL_FBMN: str = (\n    \"https://gnps.ucsd.edu/ProteoSAFe/DownloadResult?task={}&view=download_cytoscape_data\"\n)\n
    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSDownloader.gnps_format","title":"gnps_format property","text":"
    gnps_format: GNPSFormat\n

    Get the GNPS workflow type.

    Returns:

    Type Description GNPSFormat

    GNPS workflow type.

    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSDownloader.download","title":"download","text":"
    download() -> 'Self'\n

    Execute the downloading process.

    Note: GNPS data is downloaded using the POST method (empty payload is OK).

    Source code in src/nplinker/metabolomics/gnps/gnps_downloader.py
    def download(self) -> \"Self\":\n    \"\"\"Execute the downloading process.\n\n    Note: GNPS data is downloaded using the POST method (empty payload is OK).\n    \"\"\"\n    download_url(\n        self.get_url(), self._download_root, filename=self._file_name, http_method=\"POST\"\n    )\n    return self\n
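    Because download returns the downloader itself, it can be chained with get_download_file to obtain the path of the downloaded archive; a short sketch reusing the task id from the example above:

    >>> downloader = GNPSDownloader('c22f44b14a3d450eb836d607cb9521bb', '~/downloads')
    >>> zip_path = downloader.download().get_download_file()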
    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSDownloader.get_download_file","title":"get_download_file","text":"
    get_download_file() -> str\n

    Get the path to the zip file.

    Returns:

    Type Description str

    Download path as string.

    Source code in src/nplinker/metabolomics/gnps/gnps_downloader.py
    def get_download_file(self) -> str:\n    \"\"\"Get the path to the zip file.\n\n    Returns:\n        Download path as string\n    \"\"\"\n    return str(Path(self._download_root) / self._file_name)\n
    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSDownloader.get_task_id","title":"get_task_id","text":"
    get_task_id() -> str\n

    Get the GNPS task id.

    Returns:

    Type Description str

    Task id as string.

    Source code in src/nplinker/metabolomics/gnps/gnps_downloader.py
    def get_task_id(self) -> str:\n    \"\"\"Get the GNPS task id.\n\n    Returns:\n        Task id as string.\n    \"\"\"\n    return self._task_id\n
    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSDownloader.get_url","title":"get_url","text":"
    get_url() -> str\n

    Get the full URL linking to GNPS data to be downloaded.

    Returns:

    Type Description str

    URL pointing to the GNPS data to be downloaded.

    Source code in src/nplinker/metabolomics/gnps/gnps_downloader.py
    def get_url(self) -> str:\n    \"\"\"Get the full URL linking to GNPS data to be downloaded.\n\n    Returns:\n        URL pointing to the GNPS data to be downloaded.\n    \"\"\"\n    if self.gnps_format == GNPSFormat.FBMN:\n        return GNPSDownloader.GNPS_DATA_DOWNLOAD_URL_FBMN.format(self._task_id)\n    return GNPSDownloader.GNPS_DATA_DOWNLOAD_URL.format(self._task_id)\n
    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSExtractor","title":"GNPSExtractor","text":"
    GNPSExtractor(\n    file: str | PathLike, extract_dir: str | PathLike\n)\n

    Class to extract files from a GNPS molecular networking archive (.zip).

    Four files are extracted and renamed to the following names:

      • file_mappings(.tsv/.csv)
      • spectra.mgf
      • molecular_families.tsv
      • annotations.tsv

    The files to be extracted are selected based on the GNPS workflow type, as described below (in the order of the files above):

    1. METABOLOMICS-SNETS
      • clusterinfosummarygroup_attributes_withIDs_withcomponentID/*.tsv
      • METABOLOMICS-SNETS*.mgf
      • networkedges_selfloop/*.pairsinfo
      • result_specnets_DB/*.tsv
    2. METABOLOMICS-SNETS-V2
      • clusterinfosummarygroup_attributes_withIDs_withcomponentID/*.clustersummary
      • METABOLOMICS-SNETS-V2*.mgf
      • networkedges_selfloop/*.selfloop
  • result_specnets_DB/*.tsv
    3. FEATURE-BASED-MOLECULAR-NETWORKING
  • quantification_table/*.csv
      • spectra/*.mgf
      • networkedges_selfloop/*.selfloop
      • DB_result/*.tsv

    Parameters:

    Name Type Description Default file str | PathLike

    The path to the GNPS zip file.

    required extract_dir str | PathLike

    Path to the directory to which the files are extracted.

    required

    Raises:

    Type Description ValueError

    If the given file is an invalid GNPS archive.

    Examples:

    >>> gnps_extractor = GNPSExtractor(\"path/to/gnps_archive.zip\", \"path/to/extract_dir\")\n>>> gnps_extractor.gnps_format\n<GNPSFormat.SNETS: 'METABOLOMICS-SNETS'>\n>>> gnps_extractor.extract_dir\n'path/to/extract_dir'\n
    Source code in src/nplinker/metabolomics/gnps/gnps_extractor.py
    def __init__(self, file: str | PathLike, extract_dir: str | PathLike):\n    \"\"\"Initialize the GNPSExtractor.\n\n    Args:\n        file: The path to the GNPS zip file.\n        extract_dir: path to the directory where to extract the files to.\n\n    Raises:\n        ValueError: If the given file is an invalid GNPS archive.\n\n    Examples:\n        >>> gnps_extractor = GNPSExtractor(\"path/to/gnps_archive.zip\", \"path/to/extract_dir\")\n        >>> gnps_extractor.gnps_format\n        <GNPSFormat.SNETS: 'METABOLOMICS-SNETS'>\n        >>> gnps_extractor.extract_dir\n        'path/to/extract_dir'\n    \"\"\"\n    gnps_format = gnps_format_from_archive(file)\n    if gnps_format == GNPSFormat.Unknown:\n        raise ValueError(\n            f\"Unknown workflow type for GNPS archive '{file}'.\"\n            f\"Supported GNPS workflows are described in the GNPSFormat enum, \"\n            f\"including such as 'METABOLOMICS-SNETS', 'METABOLOMICS-SNETS-V2' \"\n            f\"and 'FEATURE-BASED-MOLECULAR-NETWORKING'.\"\n        )\n\n    self._file = Path(file)\n    self._extract_path = Path(extract_dir)\n    self._gnps_format = gnps_format\n    # the order of filenames matters\n    self._target_files = [\n        \"file_mappings\",\n        \"spectra.mgf\",\n        \"molecular_families.tsv\",\n        \"annotations.tsv\",\n    ]\n\n    self._extract()\n
    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSExtractor.gnps_format","title":"gnps_format property","text":"
    gnps_format: GNPSFormat\n

    Get the GNPS workflow type.

    Returns:

    Type Description GNPSFormat

    GNPS workflow type.

    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSExtractor.extract_dir","title":"extract_dir property","text":"
    extract_dir: str\n

    Get the path to which the files are extracted.

    Returns:

    Type Description str

    Path where files are extracted, as a string.

    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSSpectrumLoader","title":"GNPSSpectrumLoader","text":"
    GNPSSpectrumLoader(file: str | PathLike)\n

    Bases: SpectrumLoaderBase

    Class to load mass spectra from the given GNPS MGF file.

    The MGF file is from the GNPS output archive, as described below for each GNPS workflow type:

    1. METABOLOMICS-SNETS
      • METABOLOMICS-SNETS*.mgf
    2. METABOLOMICS-SNETS-V2
      • METABOLOMICS-SNETS-V2*.mgf
    3. FEATURE-BASED-MOLECULAR-NETWORKING
      • spectra/*.mgf

    Parameters:

    Name Type Description Default file str | PathLike

    path to the MGF file.

    required

    Raises:

    Type Description ValueError

    Raises ValueError if the file is not valid.

    Examples:

    >>> loader = GNPSSpectrumLoader(\"gnps_spectra.mgf\")\n>>> print(loader.spectra[0])\n
    Source code in src/nplinker/metabolomics/gnps/gnps_spectrum_loader.py
    def __init__(self, file: str | PathLike):\n    \"\"\"Initialize the GNPSSpectrumLoader.\n\n    Args:\n        file: path to the MGF file.\n\n    Raises:\n        ValueError: Raises ValueError if the file is not valid.\n\n    Examples:\n        >>> loader = GNPSSpectrumLoader(\"gnps_spectra.mgf\")\n        >>> print(loader.spectra[0])\n    \"\"\"\n    self._file = str(file)\n    self._spectra: list[Spectrum] = []\n\n    self._validate()\n    self._load()\n
    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSSpectrumLoader.spectra","title":"spectra property","text":"
    spectra: list[Spectrum]\n

    Get the list of Spectrum objects.

    Returns:

    Type Description list[Spectrum]

    The loaded spectra as a list of Spectrum objects.

    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSMolecularFamilyLoader","title":"GNPSMolecularFamilyLoader","text":"
    GNPSMolecularFamilyLoader(file: str | PathLike)\n

    Bases: MolecularFamilyLoaderBase

    Class to load molecular families from GNPS output file.

    The molecular family file is from the GNPS output archive, as described below for each GNPS workflow type:

    1. METABOLOMICS-SNETS
      • networkedges_selfloop/*.pairsinfo
    2. METABOLOMICS-SNETS-V2
      • networkedges_selfloop/*.selfloop
    3. FEATURE-BASED-MOLECULAR-NETWORKING
      • networkedges_selfloop/*.selfloop

    The \"ComponentIndex\" column in the GNPS molecular family's file is treated as family id. But for molecular families that have only one member (i.e. spectrum), named singleton molecular families, their files have the same value of \"-1\" in the \"ComponentIndex\" column. To make the family id unique,the spectrum id plus a prefix singleton- is used as the family id of singleton molecular families.

    Parameters:

    Name Type Description Default file str | PathLike

    Path to the GNPS molecular family file.

    required

    Raises:

    Type Description ValueError

    Raises ValueError if the file is not valid.

    Examples:

    >>> loader = GNPSMolecularFamilyLoader(\"gnps_molecular_families.tsv\")\n>>> print(loader.families)\n[<MolecularFamily 1>, <MolecularFamily 2>, ...]\n>>> print(loader.families[0].spectra_ids)\n{'1', '3', '7', ...}\n
    Source code in src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py
    def __init__(self, file: str | PathLike):\n    \"\"\"Initialize the GNPSMolecularFamilyLoader.\n\n    Args:\n        file: Path to the GNPS molecular family file.\n\n    Raises:\n        ValueError: Raises ValueError if the file is not valid.\n\n    Examples:\n        >>> loader = GNPSMolecularFamilyLoader(\"gnps_molecular_families.tsv\")\n        >>> print(loader.families)\n        [<MolecularFamily 1>, <MolecularFamily 2>, ...]\n        >>> print(loader.families[0].spectra_ids)\n        {'1', '3', '7', ...}\n    \"\"\"\n    self._mfs: list[MolecularFamily] = []\n    self._file = file\n\n    self._validate()\n    self._load()\n
    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSMolecularFamilyLoader.get_mfs","title":"get_mfs","text":"
    get_mfs(\n    keep_singleton: bool = False,\n) -> list[MolecularFamily]\n

    Get MolecularFamily objects.

    Parameters:

    Name Type Description Default keep_singleton bool

    True to keep singleton molecular families. A singleton molecular family is a molecular family that contains only one spectrum.

    False

    Returns:

    Type Description list[MolecularFamily]

    A list of MolecularFamily objects with their spectra ids.

    Source code in src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py
    def get_mfs(self, keep_singleton: bool = False) -> list[MolecularFamily]:\n    \"\"\"Get MolecularFamily objects.\n\n    Args:\n        keep_singleton: True to keep singleton molecular families. A\n            singleton molecular family is a molecular family that contains\n            only one spectrum.\n\n    Returns:\n        A list of MolecularFamily objects with their spectra ids.\n    \"\"\"\n    mfs = self._mfs\n    if not keep_singleton:\n        mfs = [mf for mf in mfs if not mf.is_singleton()]\n    return mfs\n
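    A short usage sketch (file name as in the example above); with keep_singleton=False, singleton families are filtered out:

    >>> loader = GNPSMolecularFamilyLoader('gnps_molecular_families.tsv')
    >>> mfs = loader.get_mfs(keep_singleton=False)
    >>> all(not mf.is_singleton() for mf in mfs)
    True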
    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSAnnotationLoader","title":"GNPSAnnotationLoader","text":"
    GNPSAnnotationLoader(file: str | PathLike)\n

    Bases: AnnotationLoaderBase

    Load annotations from GNPS output file.

    The annotation file is a .tsv file from the GNPS output archive, as described below for each GNPS workflow type:

    1. METABOLOMICS-SNETS
      • result_specnets_DB/*.tsv
    2. METABOLOMICS-SNETS-V2
  • result_specnets_DB/*.tsv
    3. FEATURE-BASED-MOLECULAR-NETWORKING
      • DB_result/*.tsv

    Parameters:

    Name Type Description Default file str | PathLike

    The GNPS annotation file.

    required

    Examples:

    >>> loader = GNPSAnnotationLoader(\"gnps_annotations.tsv\")\n>>> print(loader.annotations[\"100\"])\n{'#Scan#': '100',\n'Adduct': 'M+H',\n'CAS_Number': 'N/A',\n'Charge': '1',\n'Compound_Name': 'MLS002153841-01!Iobenguane sulfate',\n'Compound_Source': 'NIH Pharmacologically Active Library',\n'Data_Collector': 'VP/LMS',\n'ExactMass': '274.992',\n'INCHI': 'N/A',\n'INCHI_AUX': 'N/A',\n'Instrument': 'qTof',\n'IonMode': 'Positive',\n'Ion_Source': 'LC-ESI',\n'LibMZ': '276.003',\n'LibraryName': 'lib-00014.mgf',\n'LibraryQualityString': 'Gold',\n'Library_Class': '1',\n'MQScore': '0.704152',\n'MZErrorPPM': '405416',\n'MassDiff': '111.896',\n'Organism': 'GNPS-NIH-SMALLMOLECULEPHARMACOLOGICALLYACTIVE',\n'PI': 'Dorrestein',\n'Precursor_MZ': '276.003',\n'Pubmed_ID': 'N/A',\n'RT_Query': '795.979',\n'SharedPeaks': '7',\n'Smiles': 'NC(=N)NCc1cccc(I)c1.OS(=O)(=O)O',\n'SpecCharge': '1',\n'SpecMZ': '164.107',\n'SpectrumFile': 'spectra/specs_ms.pklbin',\n'SpectrumID': 'CCMSLIB00000086167',\n'TIC_Query': '986.997',\n'UpdateWorkflowName': 'UPDATE-SINGLE-ANNOTATED-GOLD',\n'tags': ' ',\n'png_url': 'https://metabolomics-usi.gnps2.org/png/?usi1=mzspec:GNPS:GNPS-LIBRARY:accession:CCMSLIB00000086167',\n'json_url': 'https://metabolomics-usi.gnps2.org/json/?usi1=mzspec:GNPS:GNPS-LIBRARY:accession:CCMSLIB00000086167',\n'svg_url': 'https://metabolomics-usi.gnps2.org/svg/?usi1=mzspec:GNPS:GNPS-LIBRARY:accession:CCMSLIB00000086167',\n'spectrum_url': 'https://metabolomics-usi.gnps2.org/spectrum/?usi1=mzspec:GNPS:GNPS-LIBRARY:accession:CCMSLIB00000086167'}\n
    Source code in src/nplinker/metabolomics/gnps/gnps_annotation_loader.py
    def __init__(self, file: str | PathLike):\n    \"\"\"Initialize the GNPSAnnotationLoader.\n\n    Args:\n        file: The GNPS annotation file.\n\n    Examples:\n        >>> loader = GNPSAnnotationLoader(\"gnps_annotations.tsv\")\n        >>> print(loader.annotations[\"100\"])\n        {'#Scan#': '100',\n        'Adduct': 'M+H',\n        'CAS_Number': 'N/A',\n        'Charge': '1',\n        'Compound_Name': 'MLS002153841-01!Iobenguane sulfate',\n        'Compound_Source': 'NIH Pharmacologically Active Library',\n        'Data_Collector': 'VP/LMS',\n        'ExactMass': '274.992',\n        'INCHI': 'N/A',\n        'INCHI_AUX': 'N/A',\n        'Instrument': 'qTof',\n        'IonMode': 'Positive',\n        'Ion_Source': 'LC-ESI',\n        'LibMZ': '276.003',\n        'LibraryName': 'lib-00014.mgf',\n        'LibraryQualityString': 'Gold',\n        'Library_Class': '1',\n        'MQScore': '0.704152',\n        'MZErrorPPM': '405416',\n        'MassDiff': '111.896',\n        'Organism': 'GNPS-NIH-SMALLMOLECULEPHARMACOLOGICALLYACTIVE',\n        'PI': 'Dorrestein',\n        'Precursor_MZ': '276.003',\n        'Pubmed_ID': 'N/A',\n        'RT_Query': '795.979',\n        'SharedPeaks': '7',\n        'Smiles': 'NC(=N)NCc1cccc(I)c1.OS(=O)(=O)O',\n        'SpecCharge': '1',\n        'SpecMZ': '164.107',\n        'SpectrumFile': 'spectra/specs_ms.pklbin',\n        'SpectrumID': 'CCMSLIB00000086167',\n        'TIC_Query': '986.997',\n        'UpdateWorkflowName': 'UPDATE-SINGLE-ANNOTATED-GOLD',\n        'tags': ' ',\n        'png_url': 'https://metabolomics-usi.gnps2.org/png/?usi1=mzspec:GNPS:GNPS-LIBRARY:accession:CCMSLIB00000086167',\n        'json_url': 'https://metabolomics-usi.gnps2.org/json/?usi1=mzspec:GNPS:GNPS-LIBRARY:accession:CCMSLIB00000086167',\n        'svg_url': 'https://metabolomics-usi.gnps2.org/svg/?usi1=mzspec:GNPS:GNPS-LIBRARY:accession:CCMSLIB00000086167',\n        'spectrum_url': 'https://metabolomics-usi.gnps2.org/spectrum/?usi1=mzspec:GNPS:GNPS-LIBRARY:accession:CCMSLIB00000086167'}\n    \"\"\"\n    self._file = Path(file)\n    self._annotations: dict[str, dict] = {}\n\n    self._validate()\n    self._load()\n
    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSAnnotationLoader.annotations","title":"annotations property","text":"
    annotations: dict[str, dict]\n

    Get annotations.

    Returns:

    Type Description dict[str, dict]

    Keys are spectrum ids (\"#Scan#\" in annotation file) and values are the annotations dict for each spectrum.

    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSFileMappingLoader","title":"GNPSFileMappingLoader","text":"
    GNPSFileMappingLoader(file: str | PathLike)\n

    Bases: FileMappingLoaderBase

    Class to load file mappings from GNPS output file.

    \"File mappings\" refers to the mapping from a spectrum id to the files in which that spectrum occurs.

    The file mappings file is from the GNPS output archive, as described below for each GNPS workflow type:

    1. METABOLOMICS-SNETS
      • clusterinfosummarygroup_attributes_withIDs_withcomponentID/*.tsv
    2. METABOLOMICS-SNETS-V2
      • clusterinfosummarygroup_attributes_withIDs_withcomponentID/*.clustersummary
    3. FEATURE-BASED-MOLECULAR-NETWORKING
  • quantification_table/*.csv

    Parameters:

    Name Type Description Default file str | PathLike

    Path to the GNPS file mappings file.

    required

    Raises:

    Type Description ValueError

    Raises ValueError if the file is not valid.

    Examples:

    >>> loader = GNPSFileMappingLoader(\"gnps_file_mappings.tsv\")\n>>> print(loader.mappings[\"1\"])\n['26c.mzXML']\n>>> print(loader.mapping_reversed[\"26c.mzXML\"])\n{'1', '3', '7', ...}\n
    Source code in src/nplinker/metabolomics/gnps/gnps_file_mapping_loader.py
    def __init__(self, file: str | PathLike):\n    \"\"\"Initialize the GNPSFileMappingLoader.\n\n    Args:\n        file: Path to the GNPS file mappings file.\n\n    Raises:\n        ValueError: Raises ValueError if the file is not valid.\n\n    Examples:\n        >>> loader = GNPSFileMappingLoader(\"gnps_file_mappings.tsv\")\n        >>> print(loader.mappings[\"1\"])\n        ['26c.mzXML']\n        >>> print(loader.mapping_reversed[\"26c.mzXML\"])\n        {'1', '3', '7', ...}\n    \"\"\"\n    self._gnps_format = gnps_format_from_file_mapping(file)\n    if self._gnps_format is GNPSFormat.Unknown:\n        raise ValueError(\"Unknown workflow type for GNPS file mappings file \")\n\n    self._file = Path(file)\n    self._mapping: dict[str, list[str]] = {}\n\n    self._validate()\n    self._load()\n
    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSFileMappingLoader.mappings","title":"mappings property","text":"
    mappings: dict[str, list[str]]\n

    Return mapping from spectrum id to files in which this spectrum occurs.

    Returns:

    Type Description dict[str, list[str]]

    Mapping from spectrum id to names of all files in which this spectrum occurs.

    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSFileMappingLoader.mapping_reversed","title":"mapping_reversed property","text":"
    mapping_reversed: dict[str, set[str]]\n

    Return mapping from file name to all spectra that occur in this file.

    Returns:

    Type Description dict[str, set[str]]

    Mapping from file name to all spectra ids that occur in this file.

    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.gnps_format_from_archive","title":"gnps_format_from_archive","text":"
    gnps_format_from_archive(\n    zip_file: str | PathLike,\n) -> GNPSFormat\n

    Detect GNPS format from a downloaded GNPS zip archive.

    The detection is based on the filename of the zip file and the names of the files contained in the zip file.

    Parameters:

    Name Type Description Default zip_file str | PathLike

    Path to the downloaded GNPS zip file.

    required

    Returns:

    Type Description GNPSFormat

    The format identified in the GNPS zip file.

    Examples:

    >>> gnps_format_from_archive(\"downloads/ProteoSAFe-METABOLOMICS-SNETS-c22f44b1-download_clustered_spectra.zip\") == GNPSFormat.SNETS\n>>> gnps_format_from_archive(\"downloads/ProteoSAFe-METABOLOMICS-SNETS-V2-189e8bf1-download_clustered_spectra.zip\") == GNPSFormat.SNETSV2\n>>> gnps_format_from_archive(\"downloads/ProteoSAFe-FEATURE-BASED-MOLECULAR-NETWORKING-672d0a53-download_cytoscape_data.zip\") == GNPSFormat.FBMN\n
    Source code in src/nplinker/metabolomics/gnps/gnps_format.py
    def gnps_format_from_archive(zip_file: str | PathLike) -> GNPSFormat:\n    \"\"\"Detect GNPS format from a downloaded GNPS zip archive.\n\n    The detection is based on the filename of the zip file and the names of the\n    files contained in the zip file.\n\n    Args:\n        zip_file: Path to the downloaded GNPS zip file.\n\n    Returns:\n        The format identified in the GNPS zip file.\n\n    Examples:\n        >>> gnps_format_from_archive(\"downloads/ProteoSAFe-METABOLOMICS-SNETS-c22f44b1-download_clustered_spectra.zip\") == GNPSFormat.SNETS\n        >>> gnps_format_from_archive(\"downloads/ProteoSAFe-METABOLOMICS-SNETS-V2-189e8bf1-download_clustered_spectra.zip\") == GNPSFormat.SNETSV2\n        >>> gnps_format_from_archive(\"downloads/ProteoSAFe-FEATURE-BASED-MOLECULAR-NETWORKING-672d0a53-download_cytoscape_data.zip\") == GNPSFormat.FBMN\n    \"\"\"\n    file = Path(zip_file)\n    # Guess the format from the filename of the zip file\n    if GNPSFormat.FBMN.value in file.name:\n        return GNPSFormat.FBMN\n    # the order of the if statements matters for the following two\n    if GNPSFormat.SNETSV2.value in file.name:\n        return GNPSFormat.SNETSV2\n    if GNPSFormat.SNETS.value in file.name:\n        return GNPSFormat.SNETS\n\n    # Guess the format from the names of the files in the zip file\n    with zipfile.ZipFile(file) as archive:\n        filenames = archive.namelist()\n    if any(GNPSFormat.FBMN.value in x for x in filenames):\n        return GNPSFormat.FBMN\n    # the order of the if statements matters for the following two\n    if any(GNPSFormat.SNETSV2.value in x for x in filenames):\n        return GNPSFormat.SNETSV2\n    if any(GNPSFormat.SNETS.value in x for x in filenames):\n        return GNPSFormat.SNETS\n\n    return GNPSFormat.Unknown\n
    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.gnps_format_from_file_mapping","title":"gnps_format_from_file_mapping","text":"
    gnps_format_from_file_mapping(\n    file: str | PathLike,\n) -> GNPSFormat\n

    Detect GNPS format from the given file mapping file.

    The GNPS file mapping file is located in different folders depending on the GNPS workflow. Here are the locations in corresponding GNPS zip archives:

    1. METABOLOMICS-SNETS
      • the .tsv file under folder \"clusterinfosummarygroup_attributes_withIDs_withcomponentID\"
    2. METABOLOMICS-SNETS-V2
      • the .clustersummary file (tsv) under folder \"clusterinfosummarygroup_attributes_withIDs_withcomponentID\"
    3. FEATURE-BASED-MOLECULAR-NETWORKING
      • the .csv file under folder \"quantification_table\"

    Parameters:

    Name Type Description Default file str | PathLike

    Path to the file whose format should be detected.

    required

    Returns:

    Type Description GNPSFormat

    GNPS format identified in the file.

    Source code in src/nplinker/metabolomics/gnps/gnps_format.py
    def gnps_format_from_file_mapping(file: str | PathLike) -> GNPSFormat:\n    \"\"\"Detect GNPS format from the given file mapping file.\n\n    The GNPS file mapping file is located in different folders depending on the\n    GNPS workflow. Here are the locations in corresponding GNPS zip archives:\n\n    - METABOLOMICS-SNETS workflow: the .tsv file under folder \"clusterinfosummarygroup_attributes_withIDs_withcomponentID\"\n    - METABOLOMICS-SNETS-V2 workflow: the .clustersummary file (tsv) under folder \"clusterinfosummarygroup_attributes_withIDs_withcomponentID\"\n    - FEATURE-BASED-MOLECULAR-NETWORKING workflow: the .csv file under folder \"quantification_table\"\n\n    Args:\n        file: Path to the file to peek the format for.\n\n    Returns:\n        GNPS format identified in the file.\n    \"\"\"\n    headers = get_headers(file)\n    if \"AllFiles\" in headers:\n        return GNPSFormat.SNETS\n    if \"UniqueFileSources\" in headers:\n        return GNPSFormat.SNETSV2\n    if \"row ID\" in headers:\n        return GNPSFormat.FBMN\n    return GNPSFormat.Unknown\n
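    A hedged usage sketch (the file name is a placeholder): the returned format follows from the headers found in the file, e.g. a header containing \"AllFiles\" indicates the SNETS workflow:

    >>> gnps_format_from_file_mapping('gnps_file_mappings.tsv')  # placeholder file
    <GNPSFormat.SNETS: 'METABOLOMICS-SNETS'>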
    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.gnps_format_from_task_id","title":"gnps_format_from_task_id","text":"
    gnps_format_from_task_id(task_id: str) -> GNPSFormat\n

    Detect GNPS format for the given task id.

    Parameters:

    Name Type Description Default task_id str

    GNPS task id.

    required

    Returns:

    Type Description GNPSFormat

    The format identified in the GNPS task.

    Examples:

    >>> gnps_format_from_task_id(\"c22f44b14a3d450eb836d607cb9521bb\") == GNPSFormat.SNETS\n>>> gnps_format_from_task_id(\"189e8bf16af145758b0a900f1c44ff4a\") == GNPSFormat.SNETSV2\n>>> gnps_format_from_task_id(\"92036537c21b44c29e509291e53f6382\") == GNPSFormat.FBMN\n>>> gnps_format_from_task_id(\"0ad6535e34d449788f297e712f43068a\") == GNPSFormat.Unknown\n
    Source code in src/nplinker/metabolomics/gnps/gnps_format.py
    def gnps_format_from_task_id(task_id: str) -> GNPSFormat:\n    \"\"\"Detect GNPS format for the given task id.\n\n    Args:\n        task_id: GNPS task id.\n\n    Returns:\n        The format identified in the GNPS task.\n\n    Examples:\n        >>> gnps_format_from_task_id(\"c22f44b14a3d450eb836d607cb9521bb\") == GNPSFormat.SNETS\n        >>> gnps_format_from_task_id(\"189e8bf16af145758b0a900f1c44ff4a\") == GNPSFormat.SNETSV2\n        >>> gnps_format_from_task_id(\"92036537c21b44c29e509291e53f6382\") == GNPSFormat.FBMN\n        >>> gnps_format_from_task_id(\"0ad6535e34d449788f297e712f43068a\") == GNPSFormat.Unknown\n    \"\"\"\n    task_html = httpx.get(GNPS_TASK_URL.format(task_id))\n    soup = BeautifulSoup(task_html.text, features=\"html.parser\")\n    try:\n        # find the td tag that follows the th tag containing 'Workflow'\n        workflow_tag = soup.find(\"th\", string=\"Workflow\").find_next_sibling(\"td\")  # type: ignore\n        workflow_format = workflow_tag.contents[0].strip()  # type: ignore\n    except AttributeError:\n        return GNPSFormat.Unknown\n\n    if workflow_format == GNPSFormat.FBMN.value:\n        return GNPSFormat.FBMN\n    if workflow_format == GNPSFormat.SNETSV2.value:\n        return GNPSFormat.SNETSV2\n    if workflow_format == GNPSFormat.SNETS.value:\n        return GNPSFormat.SNETS\n    return GNPSFormat.Unknown\n
    "},{"location":"api/loader/","title":"Dataset Loader","text":""},{"location":"api/loader/#nplinker.loader","title":"loader","text":""},{"location":"api/loader/#nplinker.loader.logger","title":"logger module-attribute","text":"
    logger = getLogger(__name__)\n
    "},{"location":"api/loader/#nplinker.loader.DatasetLoader","title":"DatasetLoader","text":"
    DatasetLoader(config: Dynaconf)\n

    Class to load all data.

    Attributes:

    Name Type Description config

    A Dynaconf object that contains the configuration settings. Check the nplinker.config module for more information.

    bgcs list[BGC]

    A list of BGC objects.

    gcfs list[GCF]

    A list of GCF objects.

    spectra list[Spectrum]

    A list of Spectrum objects.

    mfs list[MolecularFamily]

    A list of MolecularFamily objects.

    mibig_bgcs list[BGC]

    A list of MIBiG BGC objects.

    mibig_strains_in_use StrainCollection

    A StrainCollection object that contains the strains in use from MIBiG.

    product_types list

    A list of product types.

    strains StrainCollection

    A StrainCollection object that contains all strains.

    class_matches

    A ClassMatches object that contains class match info.

    chem_classes

    A ChemClassPredictions object that contains chemical class predictions.

    Parameters:

    Name Type Description Default config Dynaconf

    A Dynaconf object that contains the configuration settings. Check the nplinker.config module for more information.

    required Source code in src/nplinker/loader.py
    def __init__(self, config: Dynaconf):\n    \"\"\"Initialize the DatasetLoader.\n\n    Args:\n        config: A Dynaconf object that contains the configuration settings. Check the\n            `nplinker.config` module for more information.\n    \"\"\"\n    self.config = config\n\n    self.bgcs: list[BGC] = []\n    self.gcfs: list[GCF] = []\n    self.spectra: list[Spectrum] = []\n    self.mfs: list[MolecularFamily] = []\n    self.mibig_bgcs: list[BGC] = []\n    self.mibig_strains_in_use: StrainCollection = StrainCollection()\n    self.product_types: list = []\n    self.strains: StrainCollection = StrainCollection()\n\n    self.class_matches = None\n    self.chem_classes = None\n
    "},{"location":"api/loader/#nplinker.loader.DatasetLoader.RUN_CANOPUS_DEFAULT","title":"RUN_CANOPUS_DEFAULT class-attribute instance-attribute","text":"
    RUN_CANOPUS_DEFAULT = False\n
    "},{"location":"api/loader/#nplinker.loader.DatasetLoader.EXTRA_CANOPUS_PARAMS_DEFAULT","title":"EXTRA_CANOPUS_PARAMS_DEFAULT class-attribute instance-attribute","text":"
    EXTRA_CANOPUS_PARAMS_DEFAULT = (\n    \"--maxmz 600 formula zodiac structure canopus\"\n)\n
    "},{"location":"api/loader/#nplinker.loader.DatasetLoader.OR_CANOPUS","title":"OR_CANOPUS class-attribute instance-attribute","text":"
    OR_CANOPUS = 'canopus_dir'\n
    "},{"location":"api/loader/#nplinker.loader.DatasetLoader.OR_MOLNETENHANCER","title":"OR_MOLNETENHANCER class-attribute instance-attribute","text":"
    OR_MOLNETENHANCER = 'molnetenhancer_dir'\n
    "},{"location":"api/loader/#nplinker.loader.DatasetLoader.config","title":"config instance-attribute","text":"
    config = config\n
    "},{"location":"api/loader/#nplinker.loader.DatasetLoader.bgcs","title":"bgcs instance-attribute","text":"
    bgcs: list[BGC] = []\n
    "},{"location":"api/loader/#nplinker.loader.DatasetLoader.gcfs","title":"gcfs instance-attribute","text":"
    gcfs: list[GCF] = []\n
    "},{"location":"api/loader/#nplinker.loader.DatasetLoader.spectra","title":"spectra instance-attribute","text":"
    spectra: list[Spectrum] = []\n
    "},{"location":"api/loader/#nplinker.loader.DatasetLoader.mfs","title":"mfs instance-attribute","text":"
    mfs: list[MolecularFamily] = []\n
    "},{"location":"api/loader/#nplinker.loader.DatasetLoader.mibig_bgcs","title":"mibig_bgcs instance-attribute","text":"
    mibig_bgcs: list[BGC] = []\n
    "},{"location":"api/loader/#nplinker.loader.DatasetLoader.mibig_strains_in_use","title":"mibig_strains_in_use instance-attribute","text":"
    mibig_strains_in_use: StrainCollection = StrainCollection()\n
    "},{"location":"api/loader/#nplinker.loader.DatasetLoader.product_types","title":"product_types instance-attribute","text":"
    product_types: list = []\n
    "},{"location":"api/loader/#nplinker.loader.DatasetLoader.strains","title":"strains instance-attribute","text":"
    strains: StrainCollection = StrainCollection()\n
    "},{"location":"api/loader/#nplinker.loader.DatasetLoader.class_matches","title":"class_matches instance-attribute","text":"
    class_matches = None\n
    "},{"location":"api/loader/#nplinker.loader.DatasetLoader.chem_classes","title":"chem_classes instance-attribute","text":"
    chem_classes = None\n
    "},{"location":"api/loader/#nplinker.loader.DatasetLoader.load","title":"load","text":"
    load()\n

    Load all data.

    Source code in src/nplinker/loader.py
    def load(self):\n    \"\"\"Load all data.\"\"\"\n    if not self._load_strain_mappings():\n        return False\n\n    if not self._load_metabolomics():\n        return False\n\n    if not self._load_genomics():\n        return False\n\n    # set self.strains with all strains from input plus mibig strains in use\n    self.strains = self.strains + self.mibig_strains_in_use\n\n    if len(self.strains) == 0:\n        raise Exception(\"Failed to find *ANY* strains.\")\n\n    return True\n
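    A minimal usage sketch, assuming config is a Dynaconf settings object prepared as described in the nplinker.config module:

    >>> loader = DatasetLoader(config)  # config: your Dynaconf object
    >>> if loader.load():
    ...     print(len(loader.bgcs), len(loader.spectra), len(loader.strains))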
    "},{"location":"api/metabolomics/","title":"Data Models","text":""},{"location":"api/metabolomics/#nplinker.metabolomics","title":"metabolomics","text":""},{"location":"api/metabolomics/#nplinker.metabolomics.MolecularFamily","title":"MolecularFamily","text":"
    MolecularFamily(id: str)\n

    Class to model molecular family.

    Attributes:

    Name Type Description id str

    Unique id for the molecular family.

    spectra_ids set[str]

    Set of spectrum ids in the molecular family.

    Parameters:

    Name Type Description Default id str

    Unique id for the molecular family.

    required Source code in src/nplinker/metabolomics/molecular_family.py
    def __init__(self, id: str):\n    \"\"\"Initialize the MolecularFamily.\n\n    Args:\n        id: Unique id for the molecular family.\n    \"\"\"\n    self.id: str = id\n    self.spectra_ids: set[str] = set()\n    self._spectra: set[Spectrum] = set()\n    self._strains: StrainCollection = StrainCollection()\n
    "},{"location":"api/metabolomics/#nplinker.metabolomics.MolecularFamily.id","title":"id instance-attribute","text":"
    id: str = id\n
    "},{"location":"api/metabolomics/#nplinker.metabolomics.MolecularFamily.spectra_ids","title":"spectra_ids instance-attribute","text":"
    spectra_ids: set[str] = set()\n
    "},{"location":"api/metabolomics/#nplinker.metabolomics.MolecularFamily.spectra","title":"spectra property","text":"
    spectra: set[Spectrum]\n

    Get Spectrum objects in the molecular family.

    "},{"location":"api/metabolomics/#nplinker.metabolomics.MolecularFamily.strains","title":"strains property","text":"
    strains: StrainCollection\n

    Get strains in the molecular family.

    "},{"location":"api/metabolomics/#nplinker.metabolomics.MolecularFamily.add_spectrum","title":"add_spectrum","text":"
    add_spectrum(spectrum: Spectrum) -> None\n

    Add a Spectrum object to the molecular family.

    Parameters:

    Name Type Description Default spectrum Spectrum

    Spectrum object to add to the molecular family.

    required Source code in src/nplinker/metabolomics/molecular_family.py
    def add_spectrum(self, spectrum: Spectrum) -> None:\n    \"\"\"Add a Spectrum object to the molecular family.\n\n    Args:\n        spectrum: `Spectrum` object to add to the molecular family.\n    \"\"\"\n    self._spectra.add(spectrum)\n    self.spectra_ids.add(spectrum.id)\n    self._strains = self._strains + spectrum.strains\n    # add the molecular family to the spectrum\n    spectrum.family = self\n
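    A short sketch of the two-way link that add_spectrum establishes between family and spectrum (ids and peak values are hypothetical):

    >>> mf = MolecularFamily('1')
    >>> spec = Spectrum(id='42', mz=[100.0], intensity=[1.0], precursor_mz=150.0)  # hypothetical values
    >>> mf.add_spectrum(spec)
    >>> spec.family is mf and '42' in mf.spectra_ids
    True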
    "},{"location":"api/metabolomics/#nplinker.metabolomics.MolecularFamily.detach_spectrum","title":"detach_spectrum","text":"
    detach_spectrum(spectrum: Spectrum) -> None\n

    Remove a Spectrum object from the molecular family.

    Parameters:

    Name Type Description Default spectrum Spectrum

    Spectrum object to remove from the molecular family.

    required Source code in src/nplinker/metabolomics/molecular_family.py
    def detach_spectrum(self, spectrum: Spectrum) -> None:\n    \"\"\"Remove a Spectrum object from the molecular family.\n\n    Args:\n        spectrum: `Spectrum` object to remove from the molecular family.\n    \"\"\"\n    self._spectra.remove(spectrum)\n    self.spectra_ids.remove(spectrum.id)\n    self._strains = self._update_strains()\n    # remove the molecular family from the spectrum\n    spectrum.family = None\n
    "},{"location":"api/metabolomics/#nplinker.metabolomics.MolecularFamily.has_strain","title":"has_strain","text":"
    has_strain(strain: Strain) -> bool\n

    Check if the given strain exists.

    Parameters:

    Name Type Description Default strain Strain

    Strain object.

    required

    Returns:

    Type Description bool

    True when the given strain exists.

    Source code in src/nplinker/metabolomics/molecular_family.py
    def has_strain(self, strain: Strain) -> bool:\n    \"\"\"Check if the given strain exists.\n\n    Args:\n        strain: `Strain` object.\n\n    Returns:\n        True when the given strain exists.\n    \"\"\"\n    return strain in self._strains\n
    "},{"location":"api/metabolomics/#nplinker.metabolomics.MolecularFamily.is_singleton","title":"is_singleton","text":"
    is_singleton() -> bool\n

    Check if the molecular family contains only one spectrum.

    Returns:

    Type Description bool

    True when MolecularFamily.spectra_ids contains only one spectrum id.

    Source code in src/nplinker/metabolomics/molecular_family.py
    def is_singleton(self) -> bool:\n    \"\"\"Check if the molecular family contains only one spectrum.\n\n    Returns:\n        True when `MolecularFamily.spectra_ids` contains only one spectrum id.\n    \"\"\"\n    return len(self.spectra_ids) == 1\n
    "},{"location":"api/metabolomics/#nplinker.metabolomics.Spectrum","title":"Spectrum","text":"
    Spectrum(\n    id: str,\n    mz: list[float],\n    intensity: list[float],\n    precursor_mz: float,\n    rt: float = 0,\n    metadata: dict | None = None,\n)\n

    Class to model MS/MS Spectrum.

    Attributes:

    Name Type Description id

    the spectrum ID.

    mz

    the list of m/z values.

    intensity

    the list of intensity values.

    precursor_mz

    the m/z value of the precursor.

    rt

    the retention time in seconds.

    metadata

    the metadata of the spectrum, i.e. the header information in the MGF file.

    gnps_annotations dict

    the GNPS annotations of the spectrum.

    gnps_id str | None

    the GNPS ID of the spectrum.

    strains StrainCollection

    the strains that this spectrum belongs to.

    family MolecularFamily | None

    the molecular family that this spectrum belongs to.

    peaks ndarray

    2D array of peaks, each row is a peak of (m/z, intensity) values.

    Parameters:

    Name Type Description Default id str

    the spectrum ID.

    required mz list[float]

    the list of m/z values.

    required intensity list[float]

    the list of intensity values.

    required precursor_mz float

    the precursor m/z.

    required rt float

    the retention time in seconds. Defaults to 0.

    0 metadata dict | None

    the metadata of the spectrum, i.e. the header information in the MGF file.

    None Source code in src/nplinker/metabolomics/spectrum.py
    def __init__(\n    self,\n    id: str,\n    mz: list[float],\n    intensity: list[float],\n    precursor_mz: float,\n    rt: float = 0,\n    metadata: dict | None = None,\n) -> None:\n    \"\"\"Initialize the Spectrum.\n\n    Args:\n        id: the spectrum ID.\n        mz: the list of m/z values.\n        intensity: the list of intensity values.\n        precursor_mz: the precursor m/z.\n        rt: the retention time in seconds. Defaults to 0.\n        metadata: the metadata of the spectrum, i.e. the header information\n            in the MGF file.\n    \"\"\"\n    self.id = id\n    self.mz = mz\n    self.intensity = intensity\n    self.precursor_mz = precursor_mz\n    self.rt = rt\n    self.metadata = metadata or {}\n\n    self.gnps_annotations: dict = {}\n    self.gnps_id: str | None = None\n    self.strains: StrainCollection = StrainCollection()\n    self.family: MolecularFamily | None = None\n
    "},{"location":"api/metabolomics/#nplinker.metabolomics.Spectrum.id","title":"id instance-attribute","text":"
    id = id\n
    "},{"location":"api/metabolomics/#nplinker.metabolomics.Spectrum.mz","title":"mz instance-attribute","text":"
    mz = mz\n
    "},{"location":"api/metabolomics/#nplinker.metabolomics.Spectrum.intensity","title":"intensity instance-attribute","text":"
    intensity = intensity\n
    "},{"location":"api/metabolomics/#nplinker.metabolomics.Spectrum.precursor_mz","title":"precursor_mz instance-attribute","text":"
    precursor_mz = precursor_mz\n
    "},{"location":"api/metabolomics/#nplinker.metabolomics.Spectrum.rt","title":"rt instance-attribute","text":"
    rt = rt\n
    "},{"location":"api/metabolomics/#nplinker.metabolomics.Spectrum.metadata","title":"metadata instance-attribute","text":"
    metadata = metadata or {}\n
    "},{"location":"api/metabolomics/#nplinker.metabolomics.Spectrum.gnps_annotations","title":"gnps_annotations instance-attribute","text":"
    gnps_annotations: dict = {}\n
    "},{"location":"api/metabolomics/#nplinker.metabolomics.Spectrum.gnps_id","title":"gnps_id instance-attribute","text":"
    gnps_id: str | None = None\n
    "},{"location":"api/metabolomics/#nplinker.metabolomics.Spectrum.strains","title":"strains instance-attribute","text":"
    strains: StrainCollection = StrainCollection()\n
    "},{"location":"api/metabolomics/#nplinker.metabolomics.Spectrum.family","title":"family instance-attribute","text":"
    family: MolecularFamily | None = None\n
    "},{"location":"api/metabolomics/#nplinker.metabolomics.Spectrum.peaks","title":"peaks cached property","text":"
    peaks: ndarray\n

    Get the peaks, a 2D array with each row containing the values of (m/z, intensity).
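    For example, assuming spectrum is an existing Spectrum instance, the two columns can be sliced out separately:

    >>> spectrum.peaks[:, 0]  # m/z values
    >>> spectrum.peaks[:, 1]  # intensity values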

    "},{"location":"api/metabolomics/#nplinker.metabolomics.Spectrum.has_strain","title":"has_strain","text":"
    has_strain(strain: Strain) -> bool\n

    Check if the given strain exists in the spectrum.

    Parameters:

    Name Type Description Default strain Strain

    Strain object.

    required

    Returns:

    Type Description bool

    True when the given strain exists in the spectrum.

    Source code in src/nplinker/metabolomics/spectrum.py
    def has_strain(self, strain: Strain) -> bool:\n    \"\"\"Check if the given strain exists in the spectrum.\n\n    Args:\n        strain: `Strain` object.\n\n    Returns:\n        True when the given strain exist in the spectrum.\n    \"\"\"\n    return strain in self.strains\n
    "},{"location":"api/metabolomics_abc/","title":"Abstract Base Classes","text":""},{"location":"api/metabolomics_abc/#nplinker.metabolomics.abc","title":"abc","text":""},{"location":"api/metabolomics_abc/#nplinker.metabolomics.abc.SpectrumLoaderBase","title":"SpectrumLoaderBase","text":"

    Bases: ABC

    Abstract base class for SpectrumLoader.

    "},{"location":"api/metabolomics_abc/#nplinker.metabolomics.abc.SpectrumLoaderBase.spectra","title":"spectra abstractmethod property","text":"
    spectra: list[Spectrum]\n

    Get Spectrum objects.

    Returns:

    Type Description list[Spectrum]

    A sequence of Spectrum objects.

    "},{"location":"api/metabolomics_abc/#nplinker.metabolomics.abc.MolecularFamilyLoaderBase","title":"MolecularFamilyLoaderBase","text":"

    Bases: ABC

    Abstract base class for MolecularFamilyLoader.

    "},{"location":"api/metabolomics_abc/#nplinker.metabolomics.abc.MolecularFamilyLoaderBase.get_mfs","title":"get_mfs abstractmethod","text":"
    get_mfs(keep_singleton: bool) -> list[MolecularFamily]\n

    Get MolecularFamily objects.

    Parameters:

    Name Type Description Default keep_singleton bool

    True to keep singleton molecular families. A singleton molecular family is a molecular family that contains only one spectrum.

    required

    Returns:

    Type Description list[MolecularFamily]

    A sequence of MolecularFamily objects.

    Source code in src/nplinker/metabolomics/abc.py
    @abstractmethod\ndef get_mfs(self, keep_singleton: bool) -> list[MolecularFamily]:\n    \"\"\"Get MolecularFamily objects.\n\n    Args:\n        keep_singleton: True to keep singleton molecular families. A\n            singleton molecular family is a molecular family that contains\n            only one spectrum.\n\n    Returns:\n        A sequence of MolecularFamily objects.\n    \"\"\"\n
    "},{"location":"api/metabolomics_abc/#nplinker.metabolomics.abc.FileMappingLoaderBase","title":"FileMappingLoaderBase","text":"

    Bases: ABC

    Abstract base class for FileMappingLoader.

    "},{"location":"api/metabolomics_abc/#nplinker.metabolomics.abc.FileMappingLoaderBase.mappings","title":"mappings abstractmethod property","text":"
    mappings: dict[str, list[str]]\n

    Get file mappings.

    Returns:

    Type Description dict[str, list[str]]

    A mapping from spectrum ID to the names of files where the spectrum occurs.

    "},{"location":"api/metabolomics_abc/#nplinker.metabolomics.abc.AnnotationLoaderBase","title":"AnnotationLoaderBase","text":"

    Bases: ABC

    Abstract base class for AnnotationLoader.

    "},{"location":"api/metabolomics_abc/#nplinker.metabolomics.abc.AnnotationLoaderBase.annotations","title":"annotations abstractmethod property","text":"
    annotations: dict[str, dict]\n

    Get annotations.

    Returns:

    Type Description dict[str, dict]

    A mapping from spectrum ID to its annotations.

    "},{"location":"api/metabolomics_utils/","title":"Utilities","text":""},{"location":"api/metabolomics_utils/#nplinker.metabolomics.utils","title":"utils","text":""},{"location":"api/metabolomics_utils/#nplinker.metabolomics.utils.logger","title":"logger module-attribute","text":"
    logger = getLogger(__name__)\n
    "},{"location":"api/metabolomics_utils/#nplinker.metabolomics.utils.add_annotation_to_spectrum","title":"add_annotation_to_spectrum","text":"
    add_annotation_to_spectrum(\n    annotations: Mapping[str, dict],\n    spectra: Sequence[Spectrum],\n) -> None\n

    Add GNPS annotations to the Spectrum.gnps_annotations attribute for input spectra.

    It is possible that some spectra don't have annotations. Note that the input spectra list is changed in place.

    Parameters:

    Name Type Description Default annotations Mapping[str, dict]

    A dictionary of GNPS annotations, where the keys are spectrum ids and the values are GNPS annotations.

    required spectra Sequence[Spectrum]

    A list of Spectrum objects.

    required Source code in src/nplinker/metabolomics/utils.py
    def add_annotation_to_spectrum(\n    annotations: Mapping[str, dict], spectra: Sequence[Spectrum]\n) -> None:\n    \"\"\"Add GNPS annotations to the `Spectrum.gnps_annotations` attribute for input spectra.\n\n    It is possible that some spectra don't have annotations.\n    Note that the input `spectra` list is changed in place.\n\n    Args:\n        annotations: A dictionary of GNPS annotations, where the keys are\n            spectrum ids and the values are GNPS annotations.\n        spectra: A list of Spectrum objects.\n    \"\"\"\n    for spec in spectra:\n        if spec.id in annotations:\n            spec.gnps_annotations = annotations[spec.id]\n
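    A minimal usage sketch combining the GNPS loaders described earlier (file names as in their examples):

    >>> spectra = GNPSSpectrumLoader('gnps_spectra.mgf').spectra
    >>> annotations = GNPSAnnotationLoader('gnps_annotations.tsv').annotations
    >>> add_annotation_to_spectrum(annotations, spectra)  # spectra are updated in place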
    "},{"location":"api/metabolomics_utils/#nplinker.metabolomics.utils.add_strains_to_spectrum","title":"add_strains_to_spectrum","text":"
    add_strains_to_spectrum(\n    strains: StrainCollection, spectra: Sequence[Spectrum]\n) -> tuple[list[Spectrum], list[Spectrum]]\n

    Add Strain objects to the Spectrum.strains attribute for input spectra.

    Note that the input spectra list is changed in place.

    Parameters:

    Name Type Description Default strains StrainCollection

    A collection of strain objects.

    required spectra Sequence[Spectrum]

    A list of Spectrum objects.

    required

    Returns:

    Type Description tuple[list[Spectrum], list[Spectrum]]

    A tuple of two lists of Spectrum objects,

      • the first list contains Spectrum objects that are updated with Strain objects;
      • the second list contains Spectrum objects that are not updated with Strain objects because no Strain objects are found.

    Source code in src/nplinker/metabolomics/utils.py
    def add_strains_to_spectrum(\n    strains: StrainCollection, spectra: Sequence[Spectrum]\n) -> tuple[list[Spectrum], list[Spectrum]]:\n    \"\"\"Add `Strain` objects to the `Spectrum.strains` attribute for input spectra.\n\n    Note that the input `spectra` list is changed in place.\n\n    Args:\n        strains: A collection of strain objects.\n        spectra: A list of Spectrum objects.\n\n    Returns:\n        A tuple of two lists of Spectrum objects,\n\n            - the first list contains Spectrum objects that are updated with Strain objects;\n            - the second list contains Spectrum objects that are not updated with Strain objects\n            because no Strain objects are found.\n    \"\"\"\n    spectra_with_strains = []\n    spectra_without_strains = []\n    for spec in spectra:\n        try:\n            strain_list = strains.lookup(spec.id)\n        except ValueError:\n            spectra_without_strains.append(spec)\n            continue\n\n        for strain in strain_list:\n            spec.strains.add(strain)\n        spectra_with_strains.append(spec)\n\n    logger.info(\n        f\"{len(spectra_with_strains)} Spectrum objects updated with Strain objects.\\n\"\n        f\"{len(spectra_without_strains)} Spectrum objects not updated with Strain objects.\"\n    )\n\n    return spectra_with_strains, spectra_without_strains\n
    "},{"location":"api/metabolomics_utils/#nplinker.metabolomics.utils.add_spectrum_to_mf","title":"add_spectrum_to_mf","text":"
    add_spectrum_to_mf(\n    spectra: Sequence[Spectrum],\n    mfs: Sequence[MolecularFamily],\n) -> tuple[\n    list[MolecularFamily],\n    list[MolecularFamily],\n    dict[MolecularFamily, set[str]],\n]\n

    Add Spectrum objects to MolecularFamily objects.

    The spectra_ids attribute of a MolecularFamily object contains the ids of Spectrum objects. These ids are used to look up Spectrum objects in the input spectra list, and the Spectrum objects found are added to the spectra attribute of the MolecularFamily object. It is possible that some spectrum ids are not found in the input spectra list; the corresponding Spectrum objects are then missing from the MolecularFamily object.

    Note that the input mfs list is changed in place.

    Parameters:

    Name Type Description Default spectra Sequence[Spectrum]

    A list of Spectrum objects.

    required mfs Sequence[MolecularFamily]

    A list of MolecularFamily objects.

    required

    Returns:

    Type Description tuple[list[MolecularFamily], list[MolecularFamily], dict[MolecularFamily, set[str]]]

    A tuple of three elements,

      • the first list contains MolecularFamily objects that are updated with Spectrum objects;
      • the second list contains MolecularFamily objects that are not updated with Spectrum objects (all Spectrum objects are missing);
      • the third is a dictionary containing MolecularFamily objects as keys and a set of ids of missing Spectrum objects as values.

    Source code in src/nplinker/metabolomics/utils.py
    def add_spectrum_to_mf(\n    spectra: Sequence[Spectrum], mfs: Sequence[MolecularFamily]\n) -> tuple[list[MolecularFamily], list[MolecularFamily], dict[MolecularFamily, set[str]]]:\n    \"\"\"Add Spectrum objects to MolecularFamily objects.\n\n    The attribute of `spectra_ids` of MolecularFamily object contains the ids of Spectrum objects.\n    These ids are used to find Spectrum objects from the input `spectra` list. The found Spectrum\n    objects are added to the `spectra` attribute of MolecularFamily object. It is possible that\n    some spectrum ids are not found in the input `spectra` list, and so their Spectrum objects are\n    missing in the MolecularFamily object.\n\n    Note that the input `mfs` list is changed in place.\n\n    Args:\n        spectra: A list of Spectrum objects.\n        mfs: A list of MolecularFamily objects.\n\n    Returns:\n        A tuple of three elements,\n\n            - the first list contains MolecularFamily objects that are updated with Spectrum objects\n            - the second list contains MolecularFamily objects that are not updated with Spectrum\n            objects (all Spectrum objects are missing).\n            - the third is a dictionary containing MolecularFamily objects as keys and a set of ids\n            of missing Spectrum objects as values.\n    \"\"\"\n    spec_dict = {spec.id: spec for spec in spectra}\n    mf_with_spec = []\n    mf_without_spec = []\n    mf_missing_spec: dict[MolecularFamily, set[str]] = {}\n    for mf in mfs:\n        for spec_id in mf.spectra_ids:\n            try:\n                spec = spec_dict[spec_id]\n            except KeyError:\n                if mf not in mf_missing_spec:\n                    mf_missing_spec[mf] = {spec_id}\n                else:\n                    mf_missing_spec[mf].add(spec_id)\n                continue\n            mf.add_spectrum(spec)\n\n        if mf.spectra:\n            mf_with_spec.append(mf)\n        else:\n            mf_without_spec.append(mf)\n\n    logger.info(\n        f\"{len(mf_with_spec)} MolecularFamily objects updated with Spectrum objects.\\n\"\n        f\"{len(mf_without_spec)} MolecularFamily objects not updated with Spectrum objects.\\n\"\n        f\"{len(mf_missing_spec)} MolecularFamily objects have missing Spectrum objects.\"\n    )\n    return mf_with_spec, mf_without_spec, mf_missing_spec\n
    "},{"location":"api/metabolomics_utils/#nplinker.metabolomics.utils.extract_mappings_strain_id_ms_filename","title":"extract_mappings_strain_id_ms_filename","text":"
    extract_mappings_strain_id_ms_filename(\n    podp_project_json_file: str | PathLike,\n) -> dict[str, set[str]]\n

    Extract mappings \"strain_id <-> MS_filename\".

Parameters:

- podp_project_json_file (str | PathLike, required): The path to the PODP project JSON file.

Returns:

- dict[str, set[str]]: Key is strain id and value is a set of MS filenames.

    Notes

The podp_project_json_file is the project JSON file downloaded from the PODP platform. For example, for project MSV000079284, its json file is https://pairedomicsdata.bioinformatics.nl/api/projects/4b29ddc3-26d0-40d7-80c5-44fb6631dbf9.4.

    Source code in src/nplinker/metabolomics/utils.py
    def extract_mappings_strain_id_ms_filename(\n    podp_project_json_file: str | PathLike,\n) -> dict[str, set[str]]:\n    \"\"\"Extract mappings \"strain_id <-> MS_filename\".\n\n    Args:\n        podp_project_json_file: The path to the PODP project\n            JSON file.\n\n    Returns:\n        Key is strain id and value is a set of MS filenames.\n\n    Notes:\n        The `podp_project_json_file` is the project JSON file downloaded from\n        PODP platform. For example, for project MSV000079284, its json file is\n        https://pairedomicsdata.bioinformatics.nl/api/projects/4b29ddc3-26d0-40d7-80c5-44fb6631dbf9.4.\n    \"\"\"\n    mappings_dict: dict[str, set[str]] = {}\n    with open(podp_project_json_file, \"r\") as f:\n        json_data = json.load(f)\n\n    validate_podp_json(json_data)\n\n    # Extract mappings strain id <-> metabolomics filename\n    for record in json_data[\"genome_metabolome_links\"]:\n        strain_id = record[\"genome_label\"]\n        # get the actual filename of the mzXML URL\n        filename = Path(record[\"metabolomics_file\"]).name\n        if strain_id in mappings_dict:\n            mappings_dict[strain_id].add(filename)\n        else:\n            mappings_dict[strain_id] = {filename}\n    return mappings_dict\n
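Examples:

An illustrative sketch; \"podp_project.json\" is a placeholder for a PODP project JSON file you have downloaded:

>>> from nplinker.metabolomics.utils import extract_mappings_strain_id_ms_filename\n>>> strain_to_files = extract_mappings_strain_id_ms_filename(\"podp_project.json\")\n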
    "},{"location":"api/metabolomics_utils/#nplinker.metabolomics.utils.extract_mappings_ms_filename_spectrum_id","title":"extract_mappings_ms_filename_spectrum_id","text":"
    extract_mappings_ms_filename_spectrum_id(\n    gnps_file_mappings_file: str | PathLike,\n) -> dict[str, set[str]]\n

    Extract mappings \"MS_filename <-> spectrum_id\".

Parameters:

- gnps_file_mappings_file (str | PathLike, required): The path to the GNPS file mappings file (csv or tsv).

Returns:

- dict[str, set[str]]: Key is MS filename and value is a set of spectrum ids.

    Notes

The gnps_file_mappings_file is generated by GNPS molecular networking and downloaded from the GNPS website to a file with a default name defined in GNPS_FILE_MAPPINGS_FILENAME.

    See Also

GNPSFileMappingLoader: A class to load the GNPS file mappings file.

    Source code in src/nplinker/metabolomics/utils.py
    def extract_mappings_ms_filename_spectrum_id(\n    gnps_file_mappings_file: str | PathLike,\n) -> dict[str, set[str]]:\n    \"\"\"Extract mappings \"MS_filename <-> spectrum_id\".\n\n    Args:\n        gnps_file_mappings_file: The path to the GNPS file mappings file (csv or\n            tsv).\n\n    Returns:\n        Key is MS filename and value is a set of spectrum ids.\n\n    Notes:\n        The `gnps_file_mappings_file` is generated by GNPS molecular networking. It's downloaded\n        from GNPS website to a file with a default name defined in `GNPS_FILE_MAPPINGS_FILENAME`.\n\n    See Also:\n        GNPSFileMappingLoader: A class to load GNPS file mappings file.\n    \"\"\"\n    loader = GNPSFileMappingLoader(gnps_file_mappings_file)\n    return loader.mapping_reversed\n
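Examples:

An illustrative sketch; \"file_mappings.tsv\" is a placeholder for a downloaded GNPS file mappings file:

>>> from nplinker.metabolomics.utils import extract_mappings_ms_filename_spectrum_id\n>>> file_to_spectra = extract_mappings_ms_filename_spectrum_id(\"file_mappings.tsv\")\n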
    "},{"location":"api/metabolomics_utils/#nplinker.metabolomics.utils.get_mappings_strain_id_spectrum_id","title":"get_mappings_strain_id_spectrum_id","text":"
    get_mappings_strain_id_spectrum_id(\n    mappings_strain_id_ms_filename: Mapping[str, set[str]],\n    mappings_ms_filename_spectrum_id: Mapping[\n        str, set[str]\n    ],\n) -> dict[str, set[str]]\n

    Get mappings \"strain_id <-> spectrum_id\".

Parameters:

- mappings_strain_id_ms_filename (Mapping[str, set[str]], required): Mappings "strain_id <-> MS_filename".
- mappings_ms_filename_spectrum_id (Mapping[str, set[str]], required): Mappings "MS_filename <-> spectrum_id".

Returns:

- dict[str, set[str]]: Key is strain id and value is a set of spectrum ids.

    See Also

extract_mappings_strain_id_ms_filename: Extract mappings "strain_id <-> MS_filename".

extract_mappings_ms_filename_spectrum_id: Extract mappings "MS_filename <-> spectrum_id".

    Source code in src/nplinker/metabolomics/utils.py
    def get_mappings_strain_id_spectrum_id(\n    mappings_strain_id_ms_filename: Mapping[str, set[str]],\n    mappings_ms_filename_spectrum_id: Mapping[str, set[str]],\n) -> dict[str, set[str]]:\n    \"\"\"Get mappings \"strain_id <-> spectrum_id\".\n\n    Args:\n        mappings_strain_id_ms_filename: Mappings\n            \"strain_id <-> MS_filename\".\n        mappings_ms_filename_spectrum_id: Mappings\n            \"MS_filename <-> spectrum_id\".\n\n    Returns:\n        Key is strain id and value is a set of spectrum ids.\n\n\n    See Also:\n        `extract_mappings_strain_id_ms_filename`: Extract mappings\n            \"strain_id <-> MS_filename\".\n        `extract_mappings_ms_filename_spectrum_id`: Extract mappings\n            \"MS_filename <-> spectrum_id\".\n    \"\"\"\n    mappings_dict = {}\n    for strain_id, ms_filenames in mappings_strain_id_ms_filename.items():\n        spectrum_ids = set()\n        for ms_filename in ms_filenames:\n            if (sid := mappings_ms_filename_spectrum_id.get(ms_filename)) is not None:\n                spectrum_ids.update(sid)\n        if spectrum_ids:\n            mappings_dict[strain_id] = spectrum_ids\n    return mappings_dict\n
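Examples:

A sketch chaining the two extract functions above (both file paths are placeholders):

>>> from nplinker.metabolomics.utils import get_mappings_strain_id_spectrum_id\n>>> strain_to_files = extract_mappings_strain_id_ms_filename(\"podp_project.json\")\n>>> file_to_spectra = extract_mappings_ms_filename_spectrum_id(\"file_mappings.tsv\")\n>>> strain_to_spectra = get_mappings_strain_id_spectrum_id(strain_to_files, file_to_spectra)\n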
    "},{"location":"api/mibig/","title":"MiBIG","text":""},{"location":"api/mibig/#nplinker.genomics.mibig","title":"mibig","text":""},{"location":"api/mibig/#nplinker.genomics.mibig.MibigLoader","title":"MibigLoader","text":"
    MibigLoader(data_dir: str | PathLike)\n

    Bases: BGCLoaderBase

    Parse MIBiG metadata files and return BGC objects.

The MIBiG metadata file (json) contains annotations/metadata for each BGC. See https://mibig.secondarymetabolites.org/download.

The MiBIG accession is used as the BGC id and strain name. The loaded BGC objects have a Strain object as their strain attribute (i.e. BGC.strain).

Parameters:

- data_dir (str | PathLike, required): Path to the directory of MIBiG metadata json files.

Source code in src/nplinker/genomics/mibig/mibig_loader.py
    def __init__(self, data_dir: str | PathLike):\n    \"\"\"Initialize the MIBiG metadata loader.\n\n    Args:\n        data_dir: Path to the directory of MIBiG metadata json files\n    \"\"\"\n    self.data_dir = str(data_dir)\n    self._file_dict = self.parse_data_dir(self.data_dir)\n    self._metadata_dict = self._parse_metadata()\n    self._bgcs = self._parse_bgcs()\n
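Examples:

A minimal sketch; \"/data/mibig_metadata\" is a placeholder for a directory of MIBiG metadata json files:

>>> from nplinker.genomics.mibig import MibigLoader\n>>> loader = MibigLoader(\"/data/mibig_metadata\")\n>>> bgcs = loader.get_bgcs()\n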
    "},{"location":"api/mibig/#nplinker.genomics.mibig.MibigLoader.data_dir","title":"data_dir instance-attribute","text":"
    data_dir = str(data_dir)\n
    "},{"location":"api/mibig/#nplinker.genomics.mibig.MibigLoader.get_files","title":"get_files","text":"
    get_files() -> dict[str, str]\n

    Get the path of all MIBiG metadata json files.

Returns:

- dict[str, str]: The key is the metadata file name (BGC accession), and the value is the path to the metadata json file.

    Source code in src/nplinker/genomics/mibig/mibig_loader.py
    def get_files(self) -> dict[str, str]:\n    \"\"\"Get the path of all MIBiG metadata json files.\n\n    Returns:\n        The key is metadata file name (BGC accession), and the value is path to the metadata\n        json file\n    \"\"\"\n    return self._file_dict\n
    "},{"location":"api/mibig/#nplinker.genomics.mibig.MibigLoader.parse_data_dir","title":"parse_data_dir staticmethod","text":"
    parse_data_dir(data_dir: str | PathLike) -> dict[str, str]\n

    Parse metadata directory and return paths to all metadata json files.

Parameters:

- data_dir (str | PathLike, required): path to the directory of MIBiG metadata json files.

Returns:

- dict[str, str]: The key is the metadata file name (BGC accession), and the value is the path to the metadata json file.

    Source code in src/nplinker/genomics/mibig/mibig_loader.py
    @staticmethod\ndef parse_data_dir(data_dir: str | PathLike) -> dict[str, str]:\n    \"\"\"Parse metadata directory and return paths to all metadata json files.\n\n    Args:\n        data_dir: path to the directory of MIBiG metadata json files\n\n    Returns:\n        The key is metadata file name (BGC accession), and the value is path to the metadata\n        json file\n    \"\"\"\n    file_dict = {}\n    json_files = list_files(data_dir, prefix=\"BGC\", suffix=\".json\")\n    for file in json_files:\n        fname = Path(file).stem\n        file_dict[fname] = file\n    return file_dict\n
    "},{"location":"api/mibig/#nplinker.genomics.mibig.MibigLoader.get_metadata","title":"get_metadata","text":"
    get_metadata() -> dict[str, MibigMetadata]\n

    Get MibigMetadata objects.

Returns:

- dict[str, MibigMetadata]: The key is the BGC accession (file name) and the value is the MibigMetadata object.

    Source code in src/nplinker/genomics/mibig/mibig_loader.py
    def get_metadata(self) -> dict[str, MibigMetadata]:\n    \"\"\"Get MibigMetadata objects.\n\n    Returns:\n        The key is BGC accession (file name) and the value is MibigMetadata object\n    \"\"\"\n    return self._metadata_dict\n
    "},{"location":"api/mibig/#nplinker.genomics.mibig.MibigLoader.get_bgcs","title":"get_bgcs","text":"
    get_bgcs() -> list[BGC]\n

    Get BGC objects.

The BGC objects use the MiBIG accession as id and have a Strain object as their strain attribute (i.e. BGC.strain), where the name of the Strain object is also the MiBIG accession.

Returns:

- list[BGC]: A list of BGC objects.

    Source code in src/nplinker/genomics/mibig/mibig_loader.py
    def get_bgcs(self) -> list[BGC]:\n    \"\"\"Get BGC objects.\n\n    The BGC objects use MiBIG accession as id and have Strain object as\n    their strain attribute (i.e. `BGC.strain`), where the name of the Strain\n    object is also MiBIG accession.\n\n    Returns:\n        A list of BGC objects\n    \"\"\"\n    return self._bgcs\n
    "},{"location":"api/mibig/#nplinker.genomics.mibig.MibigMetadata","title":"MibigMetadata","text":"
    MibigMetadata(file: str | PathLike)\n

    Class to model the BGC metadata/annotations defined in MIBiG.

MIBiG is a specification of BGC metadata and uses a JSON schema to represent BGC metadata. For more details, see: https://mibig.secondarymetabolites.org/download.

Parameters:

- file (str | PathLike, required): Path to the json file of MIBiG BGC metadata.

    Examples:

    >>> metadata = MibigMetadata(\"/data/BGC0000001.json\")\n
    Source code in src/nplinker/genomics/mibig/mibig_metadata.py
    def __init__(self, file: str | PathLike) -> None:\n    \"\"\"Initialize the MIBiG metadata object.\n\n    Args:\n        file: Path to the json file of MIBiG BGC metadata\n\n    Examples:\n        >>> metadata = MibigMetadata(\"/data/BGC0000001.json\")\n    \"\"\"\n    self.file = str(file)\n    with open(self.file, \"rb\") as f:\n        self.metadata = json.load(f)\n\n    self._mibig_accession: str\n    self._biosyn_class: tuple[str]\n    self._parse_metadata()\n
    "},{"location":"api/mibig/#nplinker.genomics.mibig.MibigMetadata.file","title":"file instance-attribute","text":"
    file = str(file)\n
    "},{"location":"api/mibig/#nplinker.genomics.mibig.MibigMetadata.metadata","title":"metadata instance-attribute","text":"
    metadata = load(f)\n
    "},{"location":"api/mibig/#nplinker.genomics.mibig.MibigMetadata.mibig_accession","title":"mibig_accession property","text":"
    mibig_accession: str\n

    Get the value of metadata item 'mibig_accession'.

    "},{"location":"api/mibig/#nplinker.genomics.mibig.MibigMetadata.biosyn_class","title":"biosyn_class property","text":"
    biosyn_class: tuple[str]\n

    Get the value of metadata item 'biosyn_class'.

The 'biosyn_class' is the biosynthetic class(es), namely the type of natural product or secondary metabolite.

MIBiG defines 6 major biosynthetic classes: \"NRP\", \"Polyketide\", \"RiPP\", \"Terpene\", \"Saccharide\" and \"Alkaloid\". Natural products created by all other biosynthetic mechanisms fall under the category \"Other\". For more details, see the publication: https://doi.org/10.1186/s40793-018-0318-y.
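Examples:

A short sketch of reading both properties (the file path is illustrative, and the commented values are only indicative):

>>> metadata = MibigMetadata(\"/data/BGC0000001.json\")\n>>> metadata.mibig_accession  # e.g. 'BGC0000001'\n>>> metadata.biosyn_class  # e.g. ('NRP',)\n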

    "},{"location":"api/mibig/#nplinker.genomics.mibig.download_and_extract_mibig_metadata","title":"download_and_extract_mibig_metadata","text":"
    download_and_extract_mibig_metadata(\n    download_root: str | PathLike,\n    extract_path: str | PathLike,\n    version: str = \"3.1\",\n)\n

    Download and extract MIBiG metadata json files.

Note that it does not matter whether the metadata json files are nested in folders within the archive: all json files will be extracted to the same location, i.e. extract_path. Any nested folders will be removed, so extract_path ends up containing only json files.

Parameters:

- download_root (str | PathLike, required): Path to the directory in which to place the downloaded archive.
- extract_path (str | PathLike, required): Path to an empty directory where the json files will be extracted. The directory must be empty if it exists; if it doesn't exist, it will be created.
- version (str, default \"3.1\"): The version of the MIBiG metadata to download.

    Examples:

    >>> download_and_extract_mibig_metadata(\"/data/download\", \"/data/mibig_metadata\")\n
    Source code in src/nplinker/genomics/mibig/mibig_downloader.py
    def download_and_extract_mibig_metadata(\n    download_root: str | os.PathLike,\n    extract_path: str | os.PathLike,\n    version: str = \"3.1\",\n):\n    \"\"\"Download and extract MIBiG metadata json files.\n\n    Note that it does not matter whether the metadata json files are in nested folders or not in the archive,\n    all json files will be extracted to the same location, i.e. `extract_path`. The nested\n    folders will be removed if they exist. So the `extract_path` will have only json files.\n\n    Args:\n        download_root: Path to the directory in which to place the downloaded archive.\n        extract_path: Path to an empty directory where the json files will be extracted.\n            The directory must be empty if it exists. If it doesn't exist, the directory will be created.\n        version: _description_. Defaults to \"3.1\".\n\n    Examples:\n        >>> download_and_extract_mibig_metadata(\"/data/download\", \"/data/mibig_metadata\")\n    \"\"\"\n    download_root = Path(download_root)\n    extract_path = Path(extract_path)\n\n    if download_root == extract_path:\n        raise ValueError(\"Identical path of download directory and extract directory\")\n\n    # check if extract_path is empty\n    if not extract_path.exists():\n        extract_path.mkdir(parents=True)\n    else:\n        if len(list(extract_path.iterdir())) != 0:\n            raise ValueError(f'Nonempty directory: \"{extract_path}\"')\n\n    # download and extract\n    md5 = _MD5_MIBIG_METADATA[version]\n    download_and_extract_archive(\n        url=MIBIG_METADATA_URL.format(version=version),\n        download_root=download_root,\n        extract_root=extract_path,\n        md5=md5,\n    )\n\n    # After extracting mibig archive, it's either one dir or many json files,\n    # if it's a dir, then move all json files from it to extract_path\n    subdirs = list_dirs(extract_path)\n    if len(subdirs) > 1:\n        raise ValueError(f\"Expected one extracted directory, got {len(subdirs)}\")\n\n    if len(subdirs) == 1:\n        subdir_path = subdirs[0]\n        for fname in list_files(subdir_path, prefix=\"BGC\", suffix=\".json\", keep_parent=False):\n            shutil.move(os.path.join(subdir_path, fname), os.path.join(extract_path, fname))\n        # delete subdir\n        if subdir_path != extract_path:\n            shutil.rmtree(subdir_path)\n
    "},{"location":"api/mibig/#nplinker.genomics.mibig.parse_bgc_metadata_json","title":"parse_bgc_metadata_json","text":"
    parse_bgc_metadata_json(file: str | PathLike) -> BGC\n

    Parse MIBiG metadata file and return BGC object.

Note that the MiBIG accession is used as the BGC id and strain name. The BGC object has a Strain object as its strain attribute.

Parameters:

- file (str | PathLike, required): Path to the MIBiG metadata json file.

Returns:

- BGC: The BGC object.

    Source code in src/nplinker/genomics/mibig/mibig_loader.py
    def parse_bgc_metadata_json(file: str | PathLike) -> BGC:\n    \"\"\"Parse MIBiG metadata file and return BGC object.\n\n    Note that the MiBIG accession is used as the BGC id and strain name. The BGC\n    object has Strain object as its strain attribute.\n\n    Args:\n        file: Path to the MIBiG metadata json file\n\n    Returns:\n        BGC object\n    \"\"\"\n    metadata = MibigMetadata(str(file))\n    mibig_bgc = BGC(metadata.mibig_accession, *metadata.biosyn_class)\n    mibig_bgc.mibig_bgc_class = metadata.biosyn_class\n    mibig_bgc.strain = Strain(metadata.mibig_accession)\n    return mibig_bgc\n
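Examples:

A minimal sketch (the path is a placeholder); per the note above, the BGC id and strain name are both the MiBIG accession:

>>> from nplinker.genomics.mibig import parse_bgc_metadata_json\n>>> bgc = parse_bgc_metadata_json(\"/data/BGC0000001.json\")\n>>> bgc.strain.name == bgc.id\nTrue\n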
    "},{"location":"api/nplinker/","title":"NPLinker","text":""},{"location":"api/nplinker/#nplinker","title":"nplinker","text":""},{"location":"api/nplinker/#nplinker.NPLinker","title":"NPLinker","text":"
    NPLinker(config_file: str | PathLike)\n

    Main class for the NPLinker application.

Attributes:

- config: The configuration object for the current NPLinker application.
- root_dir (str): The path to the root directory of the current NPLinker application.
- output_dir (str): The path to the output directory of the current NPLinker application.
- bgcs (list[BGC]): A list of all BGC objects.
- gcfs (list[GCF]): A list of all GCF objects.
- spectra (list[Spectrum]): A list of all Spectrum objects.
- mfs (list[MolecularFamily]): A list of all MolecularFamily objects.
- mibig_bgcs (list[BGC]): A list of all MiBIG BGC objects.
- strains (StrainCollection): A StrainCollection object containing all Strain objects.
- product_types (list[str]): A list of all BiGSCAPE product types.
- scoring_methods (list[str]): A list of all valid scoring methods.

    Examples:

    To start a NPLinker application:

    >>> from nplinker import NPLinker\n>>> npl = NPLinker(\"path/to/config.toml\")\n

    To load all data into memory:

    >>> npl.load_data()\n

    To check the number of GCF objects:

    >>> len(npl.gcfs)\n

To get the links for all GCF objects using the Metcalf scoring method (the result is a LinkGraph object):

    >>> lg = npl.get_links(npl.gcfs, \"metcalf\")\n

    To get the link data between two objects:

    >>> link_data = lg.get_link_data(npl.gcfs[0], npl.spectra[0])\n{\"metcalf\": Score(\"metcalf\", 1.0, {\"cutoff\": 0, \"standardised\": False})}\n

Parameters:

- config_file (str | PathLike, required): Path to the configuration file to use.

Source code in src/nplinker/nplinker.py
    def __init__(self, config_file: str | PathLike):\n    \"\"\"Initialise an NPLinker instance.\n\n    Args:\n        config_file: Path to the configuration file to use.\n    \"\"\"\n    # Load the configuration file\n    self.config = load_config(config_file)\n\n    # Setup logging for the application\n    setup_logging(\n        level=self.config.log.level,\n        file=self.config.log.get(\"file\", \"\"),\n        use_console=self.config.log.use_console,\n    )\n    logger.info(\n        \"Configuration:\\n %s\", pformat(self.config.as_dict(), width=20, sort_dicts=False)\n    )\n\n    # Setup the output directory\n    self._output_dir = self.config.root_dir / OUTPUT_DIRNAME\n    self._output_dir.mkdir(exist_ok=True)\n\n    # Initialise data containers that will be populated by the `load_data` method\n    self._bgc_dict: dict[str, BGC] = {}\n    self._gcf_dict: dict[str, GCF] = {}\n    self._spec_dict: dict[str, Spectrum] = {}\n    self._mf_dict: dict[str, MolecularFamily] = {}\n    self._mibig_bgcs: list[BGC] = []\n    self._strains: StrainCollection = StrainCollection()\n    self._product_types: list = []\n    self._chem_classes = None  # TODO: to be refactored\n    self._class_matches = None  # TODO: to be refactored\n\n    # Flags to keep track of whether the scoring methods have been set up\n    self._scoring_methods_setup_done = {name: False for name in self._valid_scoring_methods}\n
    "},{"location":"api/nplinker/#nplinker.NPLinker.config","title":"config instance-attribute","text":"
    config = load_config(config_file)\n
    "},{"location":"api/nplinker/#nplinker.NPLinker.root_dir","title":"root_dir property","text":"
    root_dir: str\n

    Get the path to the root directory of the current NPLinker instance.

    "},{"location":"api/nplinker/#nplinker.NPLinker.output_dir","title":"output_dir property","text":"
    output_dir: str\n

    Get the path to the output directory of the current NPLinker instance.

    "},{"location":"api/nplinker/#nplinker.NPLinker.bgcs","title":"bgcs property","text":"
    bgcs: list[BGC]\n

    Get all BGC objects.

    "},{"location":"api/nplinker/#nplinker.NPLinker.gcfs","title":"gcfs property","text":"
    gcfs: list[GCF]\n

    Get all GCF objects.

    "},{"location":"api/nplinker/#nplinker.NPLinker.spectra","title":"spectra property","text":"
    spectra: list[Spectrum]\n

    Get all Spectrum objects.

    "},{"location":"api/nplinker/#nplinker.NPLinker.mfs","title":"mfs property","text":"
    mfs: list[MolecularFamily]\n

    Get all MolecularFamily objects.

    "},{"location":"api/nplinker/#nplinker.NPLinker.mibig_bgcs","title":"mibig_bgcs property","text":"
    mibig_bgcs: list[BGC]\n

    Get all MiBIG BGC objects.

    "},{"location":"api/nplinker/#nplinker.NPLinker.strains","title":"strains property","text":"
    strains: StrainCollection\n

    Get all Strain objects.

    "},{"location":"api/nplinker/#nplinker.NPLinker.product_types","title":"product_types property","text":"
    product_types: list[str]\n

    Get all BiGSCAPE product types.

    "},{"location":"api/nplinker/#nplinker.NPLinker.chem_classes","title":"chem_classes property","text":"
    chem_classes\n

    Returns loaded ChemClassPredictions with the class predictions.

    "},{"location":"api/nplinker/#nplinker.NPLinker.class_matches","title":"class_matches property","text":"
    class_matches\n

    ClassMatches with the matched classes and scoring tables from MIBiG.

    "},{"location":"api/nplinker/#nplinker.NPLinker.scoring_methods","title":"scoring_methods property","text":"
    scoring_methods: list[str]\n

    Get names of all valid scoring methods.

    "},{"location":"api/nplinker/#nplinker.NPLinker.load_data","title":"load_data","text":"
    load_data()\n

    Load all data from local files into memory.

    This method is a convenience function that calls the DatasetArranger and DatasetLoader classes to load all data from the local filesystem into memory. The loaded data is then stored in various private data containers for easy access.

    Source code in src/nplinker/nplinker.py
    def load_data(self):\n    \"\"\"Load all data from local files into memory.\n\n    This method is a convenience function that calls the `DatasetArranger` and `DatasetLoader`\n    classes to load all data from the local filesystem into memory. The loaded data is then\n    stored in various private data containers for easy access.\n    \"\"\"\n    arranger = DatasetArranger(self.config)\n    arranger.arrange()\n    loader = DatasetLoader(self.config)\n    loader.load()\n\n    self._bgc_dict = {bgc.id: bgc for bgc in loader.bgcs}\n    self._gcf_dict = {gcf.id: gcf for gcf in loader.gcfs}\n    self._spec_dict = {spec.id: spec for spec in loader.spectra}\n    self._mf_dict = {mf.id: mf for mf in loader.mfs}\n\n    self._mibig_bgcs = loader.mibig_bgcs\n    self._strains = loader.strains\n    self._product_types = loader.product_types\n    self._chem_classes = loader.chem_classes\n    self._class_matches = loader.class_matches\n
    "},{"location":"api/nplinker/#nplinker.NPLinker.get_links","title":"get_links","text":"
    get_links(\n    objects: (\n        Sequence[BGC]\n        | Sequence[GCF]\n        | Sequence[Spectrum]\n        | Sequence[MolecularFamily]\n    ),\n    scoring_method: str,\n    **scoring_params: Any\n) -> LinkGraph\n

    Get the links for the given objects using the specified scoring method and parameters.

Parameters:

- objects (Sequence[BGC] | Sequence[GCF] | Sequence[Spectrum] | Sequence[MolecularFamily], required): A sequence of objects to get the links for. The objects must all be of the same type, i.e. BGC, GCF, Spectrum or MolecularFamily. For the metcalf scoring method, BGC objects are not supported.
- scoring_method (str, required): The scoring method to use. Must be one of the valid scoring methods in self.scoring_methods, such as \"metcalf\".
- scoring_params (Any, default {}): Parameters to pass to the scoring method. If not provided, the default parameters for the scoring method will be used.

Returns:

- LinkGraph: A LinkGraph object containing the links for the given objects.

Raises:

- ValueError: If input objects are empty or if the scoring method is invalid.
- TypeError: If the input objects are not of the same type or if the object type is invalid.

    Source code in src/nplinker/nplinker.py
    def get_links(\n    self,\n    objects: Sequence[BGC] | Sequence[GCF] | Sequence[Spectrum] | Sequence[MolecularFamily],\n    scoring_method: str,\n    **scoring_params: Any,\n) -> LinkGraph:\n    \"\"\"Get the links for the given objects using the specified scoring method and parameters.\n\n    Args:\n        objects: A sequence of objects to get the links for. The objects must be of the same\n            type, i.e. `BGC`, `GCF`, `Spectrum` or `MolecularFamily` type.\n            For scoring method `metcalf`, the BGC objects are not supported.\n        scoring_method: The scoring method to use. Must be one of the valid scoring methods\n            `self.scoring_methods`, such as \"metcalf\".\n        scoring_params: Parameters to pass to the scoring method. If not provided, the default\n            parameters for the scoring method will be used.\n\n    Returns:\n        A LinkGraph object containing the links for the given objects.\n\n    Raises:\n        ValueError: If input objects are empty or if the scoring method is invalid.\n        TypeError: If the input objects are not of the same type or if the object type is invalid.\n    \"\"\"\n    # Validate objects\n    if len(objects) == 0:\n        raise ValueError(\"No objects provided to get links for\")\n    # check if all objects are of the same type\n    types = {type(i) for i in objects}\n    if len(types) > 1:\n        raise TypeError(\"Input objects must be of the same type.\")\n    # check if the object type is valid\n    obj_type = next(iter(types))\n    if obj_type not in (BGC, GCF, Spectrum, MolecularFamily):\n        raise TypeError(\n            f\"Invalid type {obj_type}. Input objects must be BGC, GCF, Spectrum or MolecularFamily objects.\"\n        )\n\n    # Validate scoring method\n    if scoring_method not in self._valid_scoring_methods:\n        raise ValueError(f\"Invalid scoring method {scoring_method}.\")\n\n    # Check if the scoring method has been set up\n    if not self._scoring_methods_setup_done[scoring_method]:\n        self._valid_scoring_methods[scoring_method].setup(self)\n        self._scoring_methods_setup_done[scoring_method] = True\n\n    # Initialise the scoring method\n    scoring = self._valid_scoring_methods[scoring_method]()\n\n    return scoring.get_links(*objects, **scoring_params)\n
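For example, to pass scoring parameters explicitly (cutoff and standardised are the Metcalf parameter names that appear in the Score output shown in the class examples; treat this call as a sketch):

>>> lg = npl.get_links(npl.gcfs, \"metcalf\", cutoff=0, standardised=False)\n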
    "},{"location":"api/nplinker/#nplinker.NPLinker.lookup_bgc","title":"lookup_bgc","text":"
    lookup_bgc(id: str) -> BGC | None\n

    Get the BGC object with the given ID.

Parameters:

- id (str, required): the ID of the BGC to look up.

Returns:

- BGC | None: The BGC object with the given ID, or None if no such object exists.

    Source code in src/nplinker/nplinker.py
    def lookup_bgc(self, id: str) -> BGC | None:\n    \"\"\"Get the BGC object with the given ID.\n\n    Args:\n        id: the ID of the BGC to look up.\n\n    Returns:\n        The BGC object with the given ID, or None if no such object exists.\n    \"\"\"\n    return self._bgc_dict.get(id, None)\n
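Examples:

A small sketch, assuming data has been loaded; the ids are illustrative:

>>> bgc = npl.lookup_bgc(\"BGC0000001\")\n>>> npl.lookup_bgc(\"nonexistent-id\") is None\nTrue\n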
    "},{"location":"api/nplinker/#nplinker.NPLinker.lookup_gcf","title":"lookup_gcf","text":"
    lookup_gcf(id: str) -> GCF | None\n

    Get the GCF object with the given ID.

Parameters:

- id (str, required): the ID of the GCF to look up.

Returns:

- GCF | None: The GCF object with the given ID, or None if no such object exists.

    Source code in src/nplinker/nplinker.py
    def lookup_gcf(self, id: str) -> GCF | None:\n    \"\"\"Get the GCF object with the given ID.\n\n    Args:\n        id: the ID of the GCF to look up.\n\n    Returns:\n        The GCF object with the given ID, or None if no such object exists.\n    \"\"\"\n    return self._gcf_dict.get(id, None)\n
    "},{"location":"api/nplinker/#nplinker.NPLinker.lookup_spectrum","title":"lookup_spectrum","text":"
    lookup_spectrum(id: str) -> Spectrum | None\n

    Get the Spectrum object with the given ID.

Parameters:

- id (str, required): the ID of the Spectrum to look up.

Returns:

- Spectrum | None: The Spectrum object with the given ID, or None if no such object exists.

    Source code in src/nplinker/nplinker.py
    def lookup_spectrum(self, id: str) -> Spectrum | None:\n    \"\"\"Get the Spectrum object with the given ID.\n\n    Args:\n        id: the ID of the Spectrum to look up.\n\n    Returns:\n        The Spectrum object with the given ID, or None if no such object exists.\n    \"\"\"\n    return self._spec_dict.get(id, None)\n
    "},{"location":"api/nplinker/#nplinker.NPLinker.lookup_mf","title":"lookup_mf","text":"
    lookup_mf(id: str) -> MolecularFamily | None\n

    Get the MolecularFamily object with the given ID.

Parameters:

- id (str, required): the ID of the MolecularFamily to look up.

Returns:

- MolecularFamily | None: The MolecularFamily object with the given ID, or None if no such object exists.

    Source code in src/nplinker/nplinker.py
    def lookup_mf(self, id: str) -> MolecularFamily | None:\n    \"\"\"Get the MolecularFamily object with the given ID.\n\n    Args:\n        id: the ID of the MolecularFamily to look up.\n\n    Returns:\n        The MolecularFamily object with the given ID, or None if no such object exists.\n    \"\"\"\n    return self._mf_dict.get(id, None)\n
    "},{"location":"api/nplinker/#nplinker.NPLinker.save_data","title":"save_data","text":"
    save_data(\n    file: str | PathLike, links: LinkGraph | None = None\n) -> None\n

    Pickle data to a file.

    The data to be pickled is a tuple containing the BGCs, GCFs, Spectra, MolecularFamilies, StrainCollection and links, i.e. (bgcs, gcfs, spectra, mfs, strains, links). If the links are not provided, None will be used.

Parameters:

- file (str | PathLike, required): The path to the pickle file to save the data to.
- links (LinkGraph | None, default None): The LinkGraph object to save.

Source code in src/nplinker/nplinker.py
    def save_data(\n    self,\n    file: str | PathLike,\n    links: LinkGraph | None = None,\n) -> None:\n    \"\"\"Pickle data to a file.\n\n    The data to be pickled is a tuple containing the BGCs, GCFs, Spectra, MolecularFamilies,\n    StrainCollection and links, i.e. `(bgcs, gcfs, spectra, mfs, strains, links)`. If the links\n    are not provided, `None` will be used.\n\n    Args:\n        file: The path to the pickle file to save the data to.\n        links: The LinkGraph object to save.\n    \"\"\"\n    data = (self.bgcs, self.gcfs, self.spectra, self.mfs, self.strains, links)\n    with open(file, \"wb\") as f:\n        pickle.dump(data, f)\n
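Examples:

A sketch of saving and reloading the pickled tuple described above (the file name is illustrative):

>>> npl.save_data(\"npl_data.pkl\", links=lg)\n>>> import pickle\n>>> with open(\"npl_data.pkl\", \"rb\") as f:\n...     bgcs, gcfs, spectra, mfs, strains, links = pickle.load(f)\n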
    "},{"location":"api/nplinker/#nplinker.setup_logging","title":"setup_logging","text":"
    setup_logging(\n    level: str = \"INFO\",\n    file: str = \"\",\n    use_console: bool = True,\n) -> None\n

Set up logging configuration for the ancestor logger "nplinker".

Parameters:

- level (str, default 'INFO'): The log level; use the logging module's log level constants. Valid levels are: \"NOTSET\", \"DEBUG\", \"INFO\", \"WARNING\", \"ERROR\", \"CRITICAL\".
- file (str, default ''): The file to write the log to. If the file does not exist, it will be created. The log will be written to the file in append mode. If the file is an empty string (the default), the log will not be written to a file.
- use_console (bool, default True): Whether to log to the console.

Source code in src/nplinker/logger.py
    def setup_logging(level: str = \"INFO\", file: str = \"\", use_console: bool = True) -> None:\n    \"\"\"Setup logging configuration for the ancestor logger \"nplinker\".\n\n    Args:\n        level: The log level, use the logging module's log level constants. Valid levels are:\n            \"NOTSET\", \"DEBUG\", \"INFO\", \"WARNING\", \"ERROR\", \"CRITICAL\".\n        file: The file to write the log to. If the file does not exist, it will be created. The log\n            will be written to the file in append mode. If the file is an empty string (by default),\n            the log will not be written to a file.\n        use_console: Whether to log to the console.\n    \"\"\"\n    # Get the ancestor logger \"nplinker\"\n    logger = logging.getLogger(\"nplinker\")\n    logger.setLevel(level)\n\n    # File handler\n    if file:\n        logger.addHandler(\n            RichHandler(\n                console=Console(file=open(file, \"a\"), width=120),  # force the line width to 120\n                omit_repeated_times=False,\n                rich_tracebacks=True,\n                tracebacks_show_locals=True,\n                log_time_format=\"[%Y-%m-%d %X]\",\n            )\n        )\n\n    # Console handler\n    if use_console:\n        logger.addHandler(\n            RichHandler(\n                omit_repeated_times=False,\n                rich_tracebacks=True,\n                tracebacks_show_locals=True,\n                log_time_format=\"[%Y-%m-%d %X]\",\n            )\n        )\n
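Examples:

A sketch that logs DEBUG messages to both the console and a file (the import path follows this page's module location; the file name is illustrative):

>>> from nplinker import setup_logging\n>>> setup_logging(level=\"DEBUG\", file=\"nplinker.log\", use_console=True)\n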
    "},{"location":"api/nplinker/#nplinker.defaults","title":"defaults","text":""},{"location":"api/nplinker/#nplinker.defaults.NPLINKER_APP_DATA_DIR","title":"NPLINKER_APP_DATA_DIR module-attribute","text":"
    NPLINKER_APP_DATA_DIR: Final = parent / 'data'\n
    "},{"location":"api/nplinker/#nplinker.defaults.STRAIN_MAPPINGS_FILENAME","title":"STRAIN_MAPPINGS_FILENAME module-attribute","text":"
    STRAIN_MAPPINGS_FILENAME: Final = 'strain_mappings.json'\n
    "},{"location":"api/nplinker/#nplinker.defaults.GENOME_BGC_MAPPINGS_FILENAME","title":"GENOME_BGC_MAPPINGS_FILENAME module-attribute","text":"
    GENOME_BGC_MAPPINGS_FILENAME: Final = (\n    \"genome_bgc_mappings.json\"\n)\n
    "},{"location":"api/nplinker/#nplinker.defaults.GENOME_STATUS_FILENAME","title":"GENOME_STATUS_FILENAME module-attribute","text":"
    GENOME_STATUS_FILENAME: Final = 'genome_status.json'\n
    "},{"location":"api/nplinker/#nplinker.defaults.GNPS_SPECTRA_FILENAME","title":"GNPS_SPECTRA_FILENAME module-attribute","text":"
    GNPS_SPECTRA_FILENAME: Final = 'spectra.mgf'\n
    "},{"location":"api/nplinker/#nplinker.defaults.GNPS_MOLECULAR_FAMILY_FILENAME","title":"GNPS_MOLECULAR_FAMILY_FILENAME module-attribute","text":"
    GNPS_MOLECULAR_FAMILY_FILENAME: Final = (\n    \"molecular_families.tsv\"\n)\n
    "},{"location":"api/nplinker/#nplinker.defaults.GNPS_ANNOTATIONS_FILENAME","title":"GNPS_ANNOTATIONS_FILENAME module-attribute","text":"
    GNPS_ANNOTATIONS_FILENAME: Final = 'annotations.tsv'\n
    "},{"location":"api/nplinker/#nplinker.defaults.GNPS_FILE_MAPPINGS_TSV","title":"GNPS_FILE_MAPPINGS_TSV module-attribute","text":"
    GNPS_FILE_MAPPINGS_TSV: Final = 'file_mappings.tsv'\n
    "},{"location":"api/nplinker/#nplinker.defaults.GNPS_FILE_MAPPINGS_CSV","title":"GNPS_FILE_MAPPINGS_CSV module-attribute","text":"
    GNPS_FILE_MAPPINGS_CSV: Final = 'file_mappings.csv'\n
    "},{"location":"api/nplinker/#nplinker.defaults.STRAINS_SELECTED_FILENAME","title":"STRAINS_SELECTED_FILENAME module-attribute","text":"
    STRAINS_SELECTED_FILENAME: Final = 'strains_selected.json'\n
    "},{"location":"api/nplinker/#nplinker.defaults.DOWNLOADS_DIRNAME","title":"DOWNLOADS_DIRNAME module-attribute","text":"
    DOWNLOADS_DIRNAME: Final = 'downloads'\n
    "},{"location":"api/nplinker/#nplinker.defaults.MIBIG_DIRNAME","title":"MIBIG_DIRNAME module-attribute","text":"
    MIBIG_DIRNAME: Final = 'mibig'\n
    "},{"location":"api/nplinker/#nplinker.defaults.GNPS_DIRNAME","title":"GNPS_DIRNAME module-attribute","text":"
    GNPS_DIRNAME: Final = 'gnps'\n
    "},{"location":"api/nplinker/#nplinker.defaults.ANTISMASH_DIRNAME","title":"ANTISMASH_DIRNAME module-attribute","text":"
    ANTISMASH_DIRNAME: Final = 'antismash'\n
    "},{"location":"api/nplinker/#nplinker.defaults.BIGSCAPE_DIRNAME","title":"BIGSCAPE_DIRNAME module-attribute","text":"
    BIGSCAPE_DIRNAME: Final = 'bigscape'\n
    "},{"location":"api/nplinker/#nplinker.defaults.BIGSCAPE_RUNNING_OUTPUT_DIRNAME","title":"BIGSCAPE_RUNNING_OUTPUT_DIRNAME module-attribute","text":"
    BIGSCAPE_RUNNING_OUTPUT_DIRNAME: Final = (\n    \"bigscape_running_output\"\n)\n
    "},{"location":"api/nplinker/#nplinker.defaults.OUTPUT_DIRNAME","title":"OUTPUT_DIRNAME module-attribute","text":"
    OUTPUT_DIRNAME: Final = 'output'\n
    "},{"location":"api/nplinker/#nplinker.config","title":"config","text":""},{"location":"api/nplinker/#nplinker.config.CONFIG_VALIDATORS","title":"CONFIG_VALIDATORS module-attribute","text":"
    CONFIG_VALIDATORS = [\n    Validator(\n        \"root_dir\",\n        required=True,\n        cast=transform_to_full_path,\n        condition=lambda v: is_dir(),\n    ),\n    Validator(\n        \"mode\",\n        required=True,\n        cast=lambda v: lower(),\n        is_in=[\"local\", \"podp\"],\n    ),\n    Validator(\n        \"podp_id\",\n        required=True,\n        when=Validator(\"mode\", eq=\"podp\"),\n    ),\n    Validator(\n        \"podp_id\",\n        required=False,\n        when=Validator(\"mode\", eq=\"local\"),\n    ),\n    Validator(\n        \"log.level\",\n        is_type_of=str,\n        cast=lambda v: upper(),\n        is_in=[\n            \"NOTSET\",\n            \"DEBUG\",\n            \"INFO\",\n            \"WARNING\",\n            \"ERROR\",\n            \"CRITICAL\",\n        ],\n    ),\n    Validator(\"log.file\", is_type_of=str),\n    Validator(\"log.use_console\", is_type_of=bool),\n    Validator(\n        \"mibig.to_use\", required=True, is_type_of=bool\n    ),\n    Validator(\n        \"mibig.version\",\n        required=True,\n        is_type_of=str,\n        when=Validator(\"mibig.to_use\", eq=True),\n    ),\n    Validator(\n        \"bigscape.parameters\", required=True, is_type_of=str\n    ),\n    Validator(\n        \"bigscape.cutoff\", required=True, is_type_of=str\n    ),\n    Validator(\n        \"scoring.methods\",\n        required=True,\n        cast=lambda v: [lower() for i in v],\n        is_type_of=list,\n        len_min=1,\n        condition=lambda v: issubset(\n            {\"metcalf\", \"rosetta\"}\n        ),\n    ),\n]\n
    "},{"location":"api/nplinker/#nplinker.config.load_config","title":"load_config","text":"
    load_config(config_file: str | PathLike) -> Dynaconf\n

    Load and validate the configuration file.

Parameters:

- config_file (str | PathLike, required): Path to the configuration file.

Returns:

- Dynaconf: A Dynaconf object containing the configuration settings.

Raises:

- FileNotFoundError: If the configuration file does not exist.

    Source code in src/nplinker/config.py
    def load_config(config_file: str | PathLike) -> Dynaconf:\n    \"\"\"Load and validate the configuration file.\n\n    Args:\n        config_file: Path to the configuration file.\n\n    Returns:\n        Dynaconf: A Dynaconf object containing the configuration settings.\n\n    Raises:\n        FileNotFoundError: If the configuration file does not exist.\n    \"\"\"\n    config_file = transform_to_full_path(config_file)\n    if not config_file.exists():\n        raise FileNotFoundError(f\"Config file '{config_file}' not found\")\n\n    # Locate the default config file\n    default_config_file = Path(__file__).resolve().parent / \"nplinker_default.toml\"\n\n    # Load config files\n    config = Dynaconf(settings_files=[config_file], preload=[default_config_file])\n\n    # Validate configs\n    config.validators.register(*CONFIG_VALIDATORS)\n    config.validators.validate()\n\n    return config\n
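Examples:

A minimal sketch (the path is a placeholder for your own config file):

>>> from nplinker.config import load_config\n>>> config = load_config(\"path/to/config.toml\")\n>>> config.mode  # 'local' or 'podp'\n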
    "},{"location":"api/schema/","title":"Schemas","text":""},{"location":"api/schema/#nplinker.schemas","title":"schemas","text":""},{"location":"api/schema/#nplinker.schemas.PODP_ADAPTED_SCHEMA","title":"PODP_ADAPTED_SCHEMA module-attribute","text":"
    PODP_ADAPTED_SCHEMA = load(f)\n
    "},{"location":"api/schema/#nplinker.schemas.SCHEMA_DIR","title":"SCHEMA_DIR module-attribute","text":"
    SCHEMA_DIR = parent\n
    "},{"location":"api/schema/#nplinker.schemas.GENOME_STATUS_SCHEMA","title":"GENOME_STATUS_SCHEMA module-attribute","text":"
    GENOME_STATUS_SCHEMA = load(f)\n
    "},{"location":"api/schema/#nplinker.schemas.GENOME_BGC_MAPPINGS_SCHEMA","title":"GENOME_BGC_MAPPINGS_SCHEMA module-attribute","text":"
    GENOME_BGC_MAPPINGS_SCHEMA = load(f)\n
    "},{"location":"api/schema/#nplinker.schemas.STRAIN_MAPPINGS_SCHEMA","title":"STRAIN_MAPPINGS_SCHEMA module-attribute","text":"
    STRAIN_MAPPINGS_SCHEMA = load(f)\n
    "},{"location":"api/schema/#nplinker.schemas.USER_STRAINS_SCHEMA","title":"USER_STRAINS_SCHEMA module-attribute","text":"
    USER_STRAINS_SCHEMA = load(f)\n
    "},{"location":"api/schema/#nplinker.schemas.validate_podp_json","title":"validate_podp_json","text":"
    validate_podp_json(json_data: dict) -> None\n

    Validate a dictionary of JSON data against the PODP JSON schema.

    All validation error messages are collected and raised as a single ValueError.

Parameters:

- json_data (dict, required): The JSON data to validate.

Raises:

- ValueError: If the JSON data does not match the schema.

    Source code in src/nplinker/schemas/utils.py
    def validate_podp_json(json_data: dict) -> None:\n    \"\"\"Validate a dictionary of JSON data against the PODP JSON schema.\n\n    All validation error messages are collected and raised as a single\n    ValueError.\n\n    Args:\n        json_data: The JSON data to validate.\n\n    Raises:\n        ValueError: If the JSON data does not match the schema.\n    \"\"\"\n    validator = Draft7Validator(PODP_ADAPTED_SCHEMA)\n    errors = sorted(validator.iter_errors(json_data), key=lambda e: e.path)\n    if errors:\n        error_messages = [f\"{e.json_path}: {e.message}\" for e in errors]\n        raise ValueError(\n            \"Not match PODP adapted schema, here are the detailed error:\\n  - \"\n            + \"\\n  - \".join(error_messages)\n        )\n
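Examples:

A sketch that validates a downloaded PODP project JSON file (placeholder path) and prints the collected error messages, if any:

>>> import json\n>>> from nplinker.schemas import validate_podp_json\n>>> with open(\"podp_project.json\") as f:\n...     json_data = json.load(f)\n>>> try:\n...     validate_podp_json(json_data)\n... except ValueError as e:\n...     print(e)\n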
    "},{"location":"api/scoring/","title":"Data Models","text":""},{"location":"api/scoring/#nplinker.scoring","title":"scoring","text":""},{"location":"api/scoring/#nplinker.scoring.LinkGraph","title":"LinkGraph","text":"
    LinkGraph()\n

    A class to represent the links between objects in NPLinker.

    This class wraps the networkx.Graph class to provide a more user-friendly interface for working with the links.

    The links between objects are stored as edges in a graph, while the objects themselves are stored as nodes.

    The scoring data for each link (or link data) is stored as the key/value attributes of the edge.

    Examples:

    Create a LinkGraph object:

    >>> lg = LinkGraph()\n

    Add a link between a GCF and a Spectrum object:

    >>> lg.add_link(gcf, spectrum, metcalf=Score(\"metcalf\", 1.0, {\"cutoff\": 0.5}))\n

    Get all links for a given object:

    >>> lg[gcf]\n{spectrum: {\"metcalf\": Score(\"metcalf\", 1.0, {\"cutoff\": 0.5})}}\n

    Get all links:

    >>> lg.links\n[(gcf, spectrum, {\"metcalf\": Score(\"metcalf\", 1.0, {\"cutoff\": 0.5})})]\n

    Check if there is a link between two objects:

    >>> lg.has_link(gcf, spectrum)\nTrue\n

    Get the link data between two objects:

    >>> lg.get_link_data(gcf, spectrum)\n{\"metcalf\": Score(\"metcalf\", 1.0, {\"cutoff\": 0.5})}\n
    Source code in src/nplinker/scoring/link_graph.py
    def __init__(self) -> None:\n    self._g: Graph = Graph()\n
    "},{"location":"api/scoring/#nplinker.scoring.LinkGraph.links","title":"links property","text":"
    links: list[LINK]\n

    Get all links.

Returns:

- list[LINK]: A list of tuples containing the links between objects.

    "},{"location":"api/scoring/#nplinker.scoring.LinkGraph.add_link","title":"add_link","text":"
    add_link(u: Entity, v: Entity, **data: Score) -> None\n

    Add a link between two objects.

    The objects u and v must be different types, i.e. one must be a GCF and the other must be a Spectrum or MolecularFamily.

Parameters:

- u (Entity, required): the first object, either a GCF, Spectrum, or MolecularFamily.
- v (Entity, required): the second object, either a GCF, Spectrum, or MolecularFamily.
- data (Score, default {}): keyword arguments. At least one scoring method and its data must be provided. The key must be the name of the scoring method defined in ScoringMethod, and the value is a Score object, e.g. metcalf=Score(\"metcalf\", 1.0, {\"cutoff\": 0.5}).

Source code in src/nplinker/scoring/link_graph.py
    @validate_uv\ndef add_link(\n    self,\n    u: Entity,\n    v: Entity,\n    **data: Score,\n) -> None:\n    \"\"\"Add a link between two objects.\n\n    The objects `u` and `v` must be different types, i.e. one must be a GCF and the other must be\n    a Spectrum or MolecularFamily.\n\n    Args:\n        u: the first object, either a GCF, Spectrum, or MolecularFamily\n        v: the second object, either a GCF, Spectrum, or MolecularFamily\n        data: keyword arguments. At least one scoring method and its data must be provided.\n            The key must be the name of the scoring method defined in `ScoringMethod`, and the\n            value is a `Score` object, e.g. `metcalf=Score(\"metcalf\", 1.0, {\"cutoff\": 0.5})`.\n    \"\"\"\n    # validate the data\n    if not data:\n        raise ValueError(\"At least one scoring method and its data must be provided.\")\n    for key, value in data.items():\n        if not ScoringMethod.has_value(key):\n            raise ValueError(\n                f\"{key} is not a valid name of scoring method. See `ScoringMethod` for valid names.\"\n            )\n        if not isinstance(value, Score):\n            raise TypeError(f\"{value} is not a Score object.\")\n\n    self._g.add_edge(u, v, **data)\n
    "},{"location":"api/scoring/#nplinker.scoring.LinkGraph.has_link","title":"has_link","text":"
    has_link(u: Entity, v: Entity) -> bool\n

    Check if there is a link between two objects.

Parameters:

- u (Entity, required): the first object, either a GCF, Spectrum, or MolecularFamily.
- v (Entity, required): the second object, either a GCF, Spectrum, or MolecularFamily.

Returns:

- bool: True if there is a link between the two objects, False otherwise.

    Source code in src/nplinker/scoring/link_graph.py
    @validate_uv\ndef has_link(self, u: Entity, v: Entity) -> bool:\n    \"\"\"Check if there is a link between two objects.\n\n    Args:\n        u: the first object, either a GCF, Spectrum, or MolecularFamily\n        v: the second object, either a GCF, Spectrum, or MolecularFamily\n\n    Returns:\n        True if there is a link between the two objects, False otherwise\n    \"\"\"\n    return self._g.has_edge(u, v)\n
    "},{"location":"api/scoring/#nplinker.scoring.LinkGraph.get_link_data","title":"get_link_data","text":"
    get_link_data(u: Entity, v: Entity) -> LINK_DATA | None\n

    Get the data for a link between two objects.

Parameters:

- u (Entity, required): the first object, either a GCF, Spectrum, or MolecularFamily.
- v (Entity, required): the second object, either a GCF, Spectrum, or MolecularFamily.

Returns:

- LINK_DATA | None: A dictionary of scoring methods and their data for the link between the two objects, or None if there is no link between the two objects.

    Source code in src/nplinker/scoring/link_graph.py
    @validate_uv\ndef get_link_data(\n    self,\n    u: Entity,\n    v: Entity,\n) -> LINK_DATA | None:\n    \"\"\"Get the data for a link between two objects.\n\n    Args:\n        u: the first object, either a GCF, Spectrum, or MolecularFamily\n        v: the second object, either a GCF, Spectrum, or MolecularFamily\n\n    Returns:\n        A dictionary of scoring methods and their data for the link between the two objects, or\n        None if there is no link between the two objects.\n    \"\"\"\n    return self._g.get_edge_data(u, v)  # type: ignore\n
    "},{"location":"api/scoring/#nplinker.scoring.Score","title":"Score dataclass","text":"
    Score(name: str, value: float, parameter: dict)\n

    A data class to represent score data.

Attributes:

- name (str): the name of the scoring method. See ScoringMethod for valid values.
- value (float): the score value.
- parameter (dict): the parameters used for the scoring method.

    "},{"location":"api/scoring/#nplinker.scoring.Score.name","title":"name instance-attribute","text":"
    name: str\n
    "},{"location":"api/scoring/#nplinker.scoring.Score.value","title":"value instance-attribute","text":"
    value: float\n
    "},{"location":"api/scoring/#nplinker.scoring.Score.parameter","title":"parameter instance-attribute","text":"
    parameter: dict\n
    "},{"location":"api/scoring_abc/","title":"Abstract Base Classes","text":""},{"location":"api/scoring_abc/#nplinker.scoring.abc","title":"abc","text":""},{"location":"api/scoring_abc/#nplinker.scoring.abc.logger","title":"logger module-attribute","text":"
    logger = getLogger(__name__)\n
    "},{"location":"api/scoring_abc/#nplinker.scoring.abc.ScoringBase","title":"ScoringBase","text":"

    Bases: ABC

    Abstract base class of scoring methods.

Attributes:

- name (str): The name of the scoring method.
- npl (NPLinker | None): The NPLinker object.

    "},{"location":"api/scoring_abc/#nplinker.scoring.abc.ScoringBase.name","title":"name class-attribute instance-attribute","text":"
    name: str = 'ScoringBase'\n
    "},{"location":"api/scoring_abc/#nplinker.scoring.abc.ScoringBase.npl","title":"npl class-attribute instance-attribute","text":"
    npl: NPLinker | None = None\n
    "},{"location":"api/scoring_abc/#nplinker.scoring.abc.ScoringBase.setup","title":"setup abstractmethod classmethod","text":"
    setup(npl: NPLinker)\n

Set up class-level attributes.

    Source code in src/nplinker/scoring/abc.py
    @classmethod\n@abstractmethod\ndef setup(cls, npl: NPLinker):\n    \"\"\"Setup class level attributes.\"\"\"\n
    "},{"location":"api/scoring_abc/#nplinker.scoring.abc.ScoringBase.get_links","title":"get_links abstractmethod","text":"
    get_links(*objects, **parameters) -> LinkGraph\n

    Get links information for the given objects.

Parameters:

- objects (default ()): A list of objects to get links for.
- parameters (default {}): The parameters used for scoring.

Returns:

- LinkGraph: The LinkGraph object.

    Source code in src/nplinker/scoring/abc.py
    @abstractmethod\ndef get_links(\n    self,\n    *objects,\n    **parameters,\n) -> LinkGraph:\n    \"\"\"Get links information for the given objects.\n\n    Args:\n        objects: A list of objects to get links for.\n        parameters: The parameters used for scoring.\n\n    Returns:\n        The LinkGraph object.\n    \"\"\"\n
    "},{"location":"api/scoring_abc/#nplinker.scoring.abc.ScoringBase.format_data","title":"format_data abstractmethod","text":"
    format_data(data) -> str\n

    Format the scoring data to a string.

    Source code in src/nplinker/scoring/abc.py
    @abstractmethod\ndef format_data(self, data) -> str:\n    \"\"\"Format the scoring data to a string.\"\"\"\n
    "},{"location":"api/scoring_abc/#nplinker.scoring.abc.ScoringBase.sort","title":"sort abstractmethod","text":"
    sort(objects, reverse=True) -> list\n

    Sort the given objects based on the scoring data.

    Source code in src/nplinker/scoring/abc.py
    @abstractmethod\ndef sort(self, objects, reverse=True) -> list:\n    \"\"\"Sort the given objects based on the scoring data.\"\"\"\n
    "},{"location":"api/scoring_methods/","title":"Scoring Methods","text":""},{"location":"api/scoring_methods/#nplinker.scoring","title":"scoring","text":""},{"location":"api/scoring_methods/#nplinker.scoring.ScoringMethod","title":"ScoringMethod","text":"

    Bases: Enum

    Enum class for scoring methods.

    "},{"location":"api/scoring_methods/#nplinker.scoring.ScoringMethod.METCALF","title":"METCALF class-attribute instance-attribute","text":"
    METCALF = 'metcalf'\n
    "},{"location":"api/scoring_methods/#nplinker.scoring.ScoringMethod.ROSETTA","title":"ROSETTA class-attribute instance-attribute","text":"
    ROSETTA = 'rosetta'\n
    "},{"location":"api/scoring_methods/#nplinker.scoring.ScoringMethod.NPLCLASS","title":"NPLCLASS class-attribute instance-attribute","text":"
    NPLCLASS = 'nplclass'\n
    "},{"location":"api/scoring_methods/#nplinker.scoring.ScoringMethod.has_value","title":"has_value classmethod","text":"
    has_value(value: str) -> bool\n

    Check if the enum has a value.

    Source code in src/nplinker/scoring/scoring_method.py
    @classmethod\ndef has_value(cls, value: str) -> bool:\n    \"\"\"Check if the enum has a value.\"\"\"\n    return any(value == item.value for item in cls)\n
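Examples:

A quick check against the enum values defined above:

>>> from nplinker.scoring import ScoringMethod\n>>> ScoringMethod.has_value(\"metcalf\")\nTrue\n>>> ScoringMethod.has_value(\"unknown\")\nFalse\n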
    "},{"location":"api/scoring_methods/#nplinker.scoring.MetcalfScoring","title":"MetcalfScoring","text":"

    Bases: ScoringBase

    Metcalf scoring method.

    Attributes:

    Name Type Description name

    The name of this scoring method, set to a fixed value metcalf.

    npl NPLinker | None

    The NPLinker object.

    CACHE str

    The name of the cache file to use for storing the MetcalfScoring.

    presence_gcf_strain DataFrame

    A DataFrame to store presence of gcfs with respect to strains. The index of the DataFrame are the GCF objects and the columns are Strain objects. The values are 1 where the gcf occurs in the strain, 0 otherwise.

    presence_spec_strain DataFrame

    A DataFrame to store presence of spectra with respect to strains. The index of the DataFrame are the Spectrum objects and the columns are Strain objects. The values are 1 where the spectrum occurs in the strain, 0 otherwise.

    presence_mf_strain DataFrame

    A DataFrame to store presence of molecular families with respect to strains. The index of the DataFrame are the MolecularFamily objects and the columns are Strain objects. The values are 1 where the molecular family occurs in the strain, 0 otherwise.

    raw_score_spec_gcf DataFrame

    A DataFrame to store the raw Metcalf scores for spectrum-gcf links. The columns are \"spec\", \"gcf\" and \"score\".

    raw_score_mf_gcf DataFrame

    A DataFrame to store the raw Metcalf scores for molecular family-gcf links. The columns are \"mf\", \"gcf\" and \"score\".

    metcalf_mean ndarray | None

    A numpy array to store the mean value used for standardising Metcalf scores. The array has shape (n_strains+1, n_strains+1), where n_strains is the number of strains.

    metcalf_std ndarray | None

    A numpy array to store the standard deviation value used for standardising Metcalf scores. The array has shape (n_strains+1, n_strains+1), where n_strains is the number of strains.

    "},{"location":"api/scoring_methods/#nplinker.scoring.MetcalfScoring.name","title":"name class-attribute instance-attribute","text":"
    name = METCALF.value\n
    "},{"location":"api/scoring_methods/#nplinker.scoring.MetcalfScoring.npl","title":"npl class-attribute instance-attribute","text":"
    npl: NPLinker | None = None\n
    "},{"location":"api/scoring_methods/#nplinker.scoring.MetcalfScoring.CACHE","title":"CACHE class-attribute instance-attribute","text":"
    CACHE: str = 'cache_metcalf_scoring.pckl'\n
    "},{"location":"api/scoring_methods/#nplinker.scoring.MetcalfScoring.metcalf_weights","title":"metcalf_weights class-attribute instance-attribute","text":"
    metcalf_weights: tuple[int, int, int, int] = (10, -10, 0, 1)\n
    "},{"location":"api/scoring_methods/#nplinker.scoring.MetcalfScoring.presence_gcf_strain","title":"presence_gcf_strain class-attribute instance-attribute","text":"
    presence_gcf_strain: DataFrame = DataFrame()\n
    "},{"location":"api/scoring_methods/#nplinker.scoring.MetcalfScoring.presence_spec_strain","title":"presence_spec_strain class-attribute instance-attribute","text":"
    presence_spec_strain: DataFrame = DataFrame()\n
    "},{"location":"api/scoring_methods/#nplinker.scoring.MetcalfScoring.presence_mf_strain","title":"presence_mf_strain class-attribute instance-attribute","text":"
    presence_mf_strain: DataFrame = DataFrame()\n
    "},{"location":"api/scoring_methods/#nplinker.scoring.MetcalfScoring.raw_score_spec_gcf","title":"raw_score_spec_gcf class-attribute instance-attribute","text":"
    raw_score_spec_gcf: DataFrame = DataFrame(\n    columns=[\"spec\", \"gcf\", \"score\"]\n)\n
    "},{"location":"api/scoring_methods/#nplinker.scoring.MetcalfScoring.raw_score_mf_gcf","title":"raw_score_mf_gcf class-attribute instance-attribute","text":"
    raw_score_mf_gcf: DataFrame = DataFrame(\n    columns=[\"mf\", \"gcf\", \"score\"]\n)\n
    "},{"location":"api/scoring_methods/#nplinker.scoring.MetcalfScoring.metcalf_mean","title":"metcalf_mean class-attribute instance-attribute","text":"
    metcalf_mean: ndarray | None = None\n
    "},{"location":"api/scoring_methods/#nplinker.scoring.MetcalfScoring.metcalf_std","title":"metcalf_std class-attribute instance-attribute","text":"
    metcalf_std: ndarray | None = None\n
    "},{"location":"api/scoring_methods/#nplinker.scoring.MetcalfScoring.setup","title":"setup classmethod","text":"
    setup(npl: NPLinker)\n

    Set up the MetcalfScoring object.

    This method is only called once to set up the MetcalfScoring object.

    Parameters:

    Name Type Description Default npl NPLinker

    The NPLinker object.

    required Source code in src/nplinker/scoring/metcalf_scoring.py
    @classmethod\ndef setup(cls, npl: NPLinker):\n    \"\"\"Set up the MetcalfScoring object.\n\n    This method is only called once to set up the MetcalfScoring object.\n\n    Args:\n        npl: The NPLinker object.\n    \"\"\"\n    if cls.npl is not None:\n        logger.info(\"MetcalfScoring.setup already called, skipping.\")\n        return\n\n    logger.info(\n        f\"MetcalfScoring.setup starts: #bgcs={len(npl.bgcs)}, #gcfs={len(npl.gcfs)}, \"\n        f\"#spectra={len(npl.spectra)}, #mfs={len(npl.mfs)}, #strains={len(npl.strains)}\"\n    )\n    cls.npl = npl\n\n    # calculate presence of gcfs/spectra/mfs with respect to strains\n    cls.presence_gcf_strain = get_presence_gcf_strain(npl.gcfs, npl.strains)\n    cls.presence_spec_strain = get_presence_spec_strain(npl.spectra, npl.strains)\n    cls.presence_mf_strain = get_presence_mf_strain(npl.mfs, npl.strains)\n\n    # calculate raw Metcalf scores for spec-gcf links\n    raw_score_spec_gcf = cls._calc_raw_score(\n        cls.presence_spec_strain, cls.presence_gcf_strain, cls.metcalf_weights\n    )\n    cls.raw_score_spec_gcf = raw_score_spec_gcf.reset_index().melt(id_vars=\"index\")\n    cls.raw_score_spec_gcf.columns = [\"spec\", \"gcf\", \"score\"]  # type: ignore\n\n    # calculate raw Metcalf scores for mf-gcf links\n    raw_score_mf_gcf = cls._calc_raw_score(\n        cls.presence_mf_strain, cls.presence_gcf_strain, cls.metcalf_weights\n    )\n    cls.raw_score_mf_gcf = raw_score_mf_gcf.reset_index().melt(id_vars=\"index\")\n    cls.raw_score_mf_gcf.columns = [\"mf\", \"gcf\", \"score\"]  # type: ignore\n\n    # calculate mean and std for standardising Metcalf scores\n    cls.metcalf_mean, cls.metcalf_std = cls._calc_mean_std(\n        len(npl.strains), cls.metcalf_weights\n    )\n\n    logger.info(\"MetcalfScoring.setup completed\")\n
    "},{"location":"api/scoring_methods/#nplinker.scoring.MetcalfScoring.get_links","title":"get_links","text":"
    get_links(*objects, **parameters)\n

    Get links for the given objects.

    Parameters:

    Name Type Description Default objects

    The objects to get links for. All objects must be of the same type, i.e. GCF, Spectrum or MolecularFamily type. If no objects are provided, all detected objects (npl.gcfs) will be used.

    () parameters

    The scoring parameters to use for the links. The parameters are:

    - cutoff: The minimum score to consider a link (\u2265cutoff). Default is 0.\n- standardised: Whether to use standardised scores. Default is False.\n
    {}

    Returns:

    Type Description

    The LinkGraph object containing the links involving the input objects with the Metcalf scores.

    Raises:

    Type Description TypeError

    If the input objects are not of the same type or the object type is invalid.

    Source code in src/nplinker/scoring/metcalf_scoring.py
    def get_links(self, *objects, **parameters):\n    \"\"\"Get links for the given objects.\n\n    Args:\n        objects: The objects to get links for. All objects must be of the same type, i.e. `GCF`,\n            `Spectrum` or `MolecularFamily` type.\n            If no objects are provided, all detected objects (`npl.gcfs`) will be used.\n        parameters: The scoring parameters to use for the links. The parameters are:\n\n                - cutoff: The minimum score to consider a link (\u2265cutoff). Default is 0.\n                - standardised: Whether to use standardised scores. Default is False.\n\n    Returns:\n        The `LinkGraph` object containing the links involving the input objects with the Metcalf\n            scores.\n\n    Raises:\n        TypeError: If the input objects are not of the same type or the object type is invalid.\n    \"\"\"\n    # validate input objects\n    if len(objects) == 0:\n        objects = self.npl.gcfs\n    # check if all objects are of the same type\n    types = {type(i) for i in objects}\n    if len(types) > 1:\n        raise TypeError(\"Input objects must be of the same type.\")\n    # check if the object type is valid\n    obj_type = next(iter(types))\n    if obj_type not in (GCF, Spectrum, MolecularFamily):\n        raise TypeError(\n            f\"Invalid type {obj_type}. Input objects must be GCF, Spectrum or MolecularFamily objects.\"\n        )\n\n    # validate scoring parameters\n    self._cutoff: float = parameters.get(\"cutoff\", 0)\n    self._standardised: bool = parameters.get(\"standardised\", False)\n    parameters.update({\"cutoff\": self._cutoff, \"standardised\": self._standardised})\n\n    logger.info(\n        f\"MetcalfScoring: #objects={len(objects)}, type={obj_type}, cutoff={self._cutoff}, \"\n        f\"standardised={self._standardised}\"\n    )\n    if not self._standardised:\n        scores_list = self._get_links(*objects, obj_type=obj_type, score_cutoff=self._cutoff)\n    else:\n        if self.metcalf_mean is None or self.metcalf_std is None:\n            raise ValueError(\n                \"MetcalfScoring.metcalf_mean and metcalf_std are not set. Run MetcalfScoring.setup first.\"\n            )\n        # use negative infinity as the score cutoff to ensure we get all links\n        scores_list = self._get_links(*objects, obj_type=obj_type, score_cutoff=-np.inf)\n        scores_list = self._calc_standardised_score(scores_list)\n\n    links = LinkGraph()\n    for score_df in scores_list:\n        for row in score_df.itertuples(index=False):  # row has attributes: spec/mf, gcf, score\n            met = row.spec if score_df.name == LinkType.SPEC_GCF else row.mf\n            links.add_link(\n                row.gcf,\n                met,\n                metcalf=Score(self.name, row.score, parameters),\n            )\n\n    logger.info(f\"MetcalfScoring: completed! Found {len(links.links)} links in total.\")\n    return links\n
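
    A minimal usage sketch, assuming a prepared working directory; the config path is hypothetical and npl.load_data() is assumed to populate the dataset objects:

    from nplinker import NPLinker\nfrom nplinker.scoring import MetcalfScoring\n\nnpl = NPLinker(\"path/to/nplinker.toml\")  # hypothetical config path\nnpl.load_data()  # assumed to populate npl.gcfs, npl.spectra, etc.\n\nMetcalfScoring.setup(npl)  # one-time setup, see above\nmc = MetcalfScoring()\n# links for all GCFs with a raw Metcalf score of at least 2\nlg = mc.get_links(*npl.gcfs, cutoff=2, standardised=False)\n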
    "},{"location":"api/scoring_methods/#nplinker.scoring.MetcalfScoring.format_data","title":"format_data","text":"
    format_data(data)\n

    Format the data for display.

    Source code in src/nplinker/scoring/metcalf_scoring.py
    def format_data(self, data):\n    \"\"\"Format the data for display.\"\"\"\n    # for metcalf the data will just be a floating point value (i.e. the score)\n    return f\"{data:.4f}\"\n
    "},{"location":"api/scoring_methods/#nplinker.scoring.MetcalfScoring.sort","title":"sort","text":"
    sort(objects, reverse=True)\n

    Sort the objects based on the score.

    Source code in src/nplinker/scoring/metcalf_scoring.py
    def sort(self, objects, reverse=True):\n    \"\"\"Sort the objects based on the score.\"\"\"\n    # sort based on score\n    return sorted(objects, key=lambda objlink: objlink[self], reverse=reverse)\n
    "},{"location":"api/scoring_utils/","title":"Utilities","text":""},{"location":"api/scoring_utils/#nplinker.scoring.utils","title":"utils","text":""},{"location":"api/scoring_utils/#nplinker.scoring.utils.get_presence_gcf_strain","title":"get_presence_gcf_strain","text":"
    get_presence_gcf_strain(\n    gcfs: Sequence[GCF], strains: StrainCollection\n) -> DataFrame\n

    Get the occurrence of strains in gcfs.

    The occurrence is a DataFrame with GCF objects as index and Strain objects as columns, and the values are 1 if the gcf occurs in the strain, 0 otherwise.

    Source code in src/nplinker/scoring/utils.py
    def get_presence_gcf_strain(gcfs: Sequence[GCF], strains: StrainCollection) -> pd.DataFrame:\n    \"\"\"Get the occurrence of strains in gcfs.\n\n    The occurrence is a DataFrame with GCF objects as index and Strain objects as columns, and the\n    values are 1 if the gcf occurs in the strain,  0 otherwise.\n    \"\"\"\n    df_gcf_strain = pd.DataFrame(\n        0,\n        index=gcfs,\n        columns=list(strains),\n        dtype=int,\n    )  # type: ignore\n    for gcf in gcfs:\n        for strain in strains:\n            if gcf.has_strain(strain):\n                df_gcf_strain.loc[gcf, strain] = 1\n    return df_gcf_strain  # type: ignore\n
    "},{"location":"api/scoring_utils/#nplinker.scoring.utils.get_presence_spec_strain","title":"get_presence_spec_strain","text":"
    get_presence_spec_strain(\n    spectra: Sequence[Spectrum], strains: StrainCollection\n) -> DataFrame\n

    Get the occurrence of strains in spectra.

    The occurrence is a DataFrame with Spectrum objects as index and Strain objects as columns, and the values are 1 if the spectrum occurs in the strain, 0 otherwise.

    Source code in src/nplinker/scoring/utils.py
    def get_presence_spec_strain(\n    spectra: Sequence[Spectrum], strains: StrainCollection\n) -> pd.DataFrame:\n    \"\"\"Get the occurrence of strains in spectra.\n\n    The occurrence is a DataFrame with Spectrum objects as index and Strain objects as columns, and\n    the values are 1 if the spectrum occurs in the strain, 0 otherwise.\n    \"\"\"\n    df_spec_strain = pd.DataFrame(\n        0,\n        index=spectra,\n        columns=list(strains),\n        dtype=int,\n    )  # type: ignore\n    for spectrum in spectra:\n        for strain in strains:\n            if spectrum.has_strain(strain):\n                df_spec_strain.loc[spectrum, strain] = 1\n    return df_spec_strain  # type: ignore\n
    "},{"location":"api/scoring_utils/#nplinker.scoring.utils.get_presence_mf_strain","title":"get_presence_mf_strain","text":"
    get_presence_mf_strain(\n    mfs: Sequence[MolecularFamily],\n    strains: StrainCollection,\n) -> DataFrame\n

    Get the occurrence of strains in molecular families.

    The occurrence is a DataFrame with MolecularFamily objects as index and Strain objects as columns, and the values are 1 if the molecular family occurs in the strain, 0 otherwise.

    Source code in src/nplinker/scoring/utils.py
    def get_presence_mf_strain(\n    mfs: Sequence[MolecularFamily], strains: StrainCollection\n) -> pd.DataFrame:\n    \"\"\"Get the occurrence of strains in molecular families.\n\n    The occurrence is a DataFrame with MolecularFamily objects as index and Strain objects as\n    columns, and the values are 1 if the molecular family occurs in the strain, 0 otherwise.\n    \"\"\"\n    df_mf_strain = pd.DataFrame(\n        0,\n        index=mfs,\n        columns=list(strains),\n        dtype=int,\n    )  # type: ignore\n    for mf in mfs:\n        for strain in strains:\n            if mf.has_strain(strain):\n                df_mf_strain.loc[mf, strain] = 1\n    return df_mf_strain  # type: ignore\n
    "},{"location":"api/strain/","title":"Data Models","text":""},{"location":"api/strain/#nplinker.strain","title":"strain","text":""},{"location":"api/strain/#nplinker.strain.Strain","title":"Strain","text":"
    Strain(id: str)\n

    To model the mapping between strain id and its aliases.

    It's recommended to use NCBI taxonomy strain id or name as the primary id.

    Parameters:

    Name Type Description Default id str

    the representative id of the strain.

    required Source code in src/nplinker/strain/strain.py
    def __init__(self, id: str) -> None:\n    \"\"\"To model the mapping between strain id and its aliases.\n\n    Args:\n        id: the representative id of the strain.\n    \"\"\"\n    self.id: str = id\n    self._aliases: set[str] = set()\n
    "},{"location":"api/strain/#nplinker.strain.Strain.id","title":"id instance-attribute","text":"
    id: str = id\n
    "},{"location":"api/strain/#nplinker.strain.Strain.names","title":"names property","text":"
    names: set[str]\n

    Get the set of strain names including id and aliases.

    Returns:

    Type Description set[str]

    A set of names associated with the strain.

    "},{"location":"api/strain/#nplinker.strain.Strain.aliases","title":"aliases property","text":"
    aliases: set[str]\n

    Get the set of known aliases.

    Returns:

    Type Description set[str]

    A set of aliases associated with the strain.

    "},{"location":"api/strain/#nplinker.strain.Strain.add_alias","title":"add_alias","text":"
    add_alias(alias: str) -> None\n

    Add an alias to the list of known aliases.

    Parameters:

    Name Type Description Default alias str

    The alias to add to the list of known aliases.

    required Source code in src/nplinker/strain/strain.py
    def add_alias(self, alias: str) -> None:\n    \"\"\"Add an alias to the list of known aliases.\n\n    Args:\n        alias: The alias to add to the list of known aliases.\n    \"\"\"\n    if not isinstance(alias, str):\n        raise TypeError(f\"Expected str, got {type(alias)}\")\n    if len(alias) == 0:\n        logger.warning(\"Refusing to add an empty-string alias to strain {%s}\", self)\n    else:\n        self._aliases.add(alias)\n
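
    For example (a minimal sketch; the strain id and alias are hypothetical):

    from nplinker.strain import Strain\n\nstrain = Strain(\"CNS-205\")  # hypothetical representative id\nstrain.add_alias(\"Salinispora arenicola CNS-205\")\nprint(strain.names)  # the id plus the alias\n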
    "},{"location":"api/strain/#nplinker.strain.StrainCollection","title":"StrainCollection","text":"
    StrainCollection()\n

    A collection of Strain objects.

    Source code in src/nplinker/strain/strain_collection.py
    def __init__(self):\n    # the order of strains is needed for scoring part, so use a list\n    self._strains: list[Strain] = []\n    self._strain_dict_name: dict[str, list[Strain]] = {}\n
    "},{"location":"api/strain/#nplinker.strain.StrainCollection.add","title":"add","text":"
    add(strain: Strain) -> None\n

    Add strain to the collection.

    If the strain already exists, merge the aliases.

    Parameters:

    Name Type Description Default strain Strain

    The strain to add.

    required Source code in src/nplinker/strain/strain_collection.py
    def add(self, strain: Strain) -> None:\n    \"\"\"Add strain to the collection.\n\n    If the strain already exists, merge the aliases.\n\n    Args:\n        strain: The strain to add.\n    \"\"\"\n    if strain in self._strains:\n        # only one strain object per id\n        strain_ref = self._strain_dict_name[strain.id][0]\n        new_aliases = [alias for alias in strain.aliases if alias not in strain_ref.aliases]\n        for alias in new_aliases:\n            strain_ref.add_alias(alias)\n            if alias not in self._strain_dict_name:\n                self._strain_dict_name[alias] = [strain_ref]\n            else:\n                self._strain_dict_name[alias].append(strain_ref)\n    else:\n        self._strains.append(strain)\n        for name in strain.names:\n            if name not in self._strain_dict_name:\n                self._strain_dict_name[name] = [strain]\n            else:\n                self._strain_dict_name[name].append(strain)\n
    "},{"location":"api/strain/#nplinker.strain.StrainCollection.remove","title":"remove","text":"
    remove(strain: Strain)\n

    Remove a strain from the collection.

    It removes the given strain object from the collection by strain id. If the strain id is not found, raise ValueError.

    Parameters:

    Name Type Description Default strain Strain

    The strain to remove.

    required

    Raises:

    Type Description ValueError

    If the strain is not found in the collection.

    Source code in src/nplinker/strain/strain_collection.py
    def remove(self, strain: Strain):\n    \"\"\"Remove a strain from the collection.\n\n    It removes the given strain object from the collection by strain id.\n    If the strain id is not found, raise ValueError.\n\n    Args:\n        strain: The strain to remove.\n\n    Raises:\n        ValueError: If the strain is not found in the collection.\n    \"\"\"\n    if strain in self._strains:\n        self._strains.remove(strain)\n        # only one strain object per id\n        strain_ref = self._strain_dict_name[strain.id][0]\n        for name in strain_ref.names:\n            if name in self._strain_dict_name:\n                new_strain_list = [s for s in self._strain_dict_name[name] if s.id != strain.id]\n                if not new_strain_list:\n                    del self._strain_dict_name[name]\n                else:\n                    self._strain_dict_name[name] = new_strain_list\n    else:\n        raise ValueError(f\"Strain {strain} not found in strain collection.\")\n
    "},{"location":"api/strain/#nplinker.strain.StrainCollection.filter","title":"filter","text":"
    filter(strain_set: set[Strain])\n

    Remove all strains that are not in strain_set from the strain collection.

    Parameters:

    Name Type Description Default strain_set set[Strain]

    Set of strains to keep.

    required Source code in src/nplinker/strain/strain_collection.py
    def filter(self, strain_set: set[Strain]):\n    \"\"\"Remove all strains that are not in strain_set from the strain collection.\n\n    Args:\n        strain_set: Set of strains to keep.\n    \"\"\"\n    # note that we need to copy the list of strains, as we are modifying it\n    for strain in self._strains.copy():\n        if strain not in strain_set:\n            self.remove(strain)\n
    "},{"location":"api/strain/#nplinker.strain.StrainCollection.intersection","title":"intersection","text":"
    intersection(other: StrainCollection) -> StrainCollection\n

    Get the intersection of two strain collections.

    Parameters:

    Name Type Description Default other StrainCollection

    The other strain collection to compare.

    required

    Returns:

    Type Description StrainCollection

    StrainCollection object containing the strains that are in both collections.

    Source code in src/nplinker/strain/strain_collection.py
    def intersection(self, other: StrainCollection) -> StrainCollection:\n    \"\"\"Get the intersection of two strain collections.\n\n    Args:\n        other: The other strain collection to compare.\n\n    Returns:\n        StrainCollection object containing the strains that are in both collections.\n    \"\"\"\n    intersection = StrainCollection()\n    for strain in self:\n        if strain in other:\n            intersection.add(strain)\n    return intersection\n
    "},{"location":"api/strain/#nplinker.strain.StrainCollection.has_name","title":"has_name","text":"
    has_name(name: str) -> bool\n

    Check if the strain collection contains the given strain name (id or alias).

    Parameters:

    Name Type Description Default name str

    Strain name (id or alias) to check.

    required

    Returns:

    Type Description bool

    True if the strain name is in the collection, False otherwise.

    Source code in src/nplinker/strain/strain_collection.py
    def has_name(self, name: str) -> bool:\n    \"\"\"Check if the strain collection contains the given strain name (id or alias).\n\n    Args:\n        name: Strain name (id or alias) to check.\n\n    Returns:\n        True if the strain name is in the collection, False otherwise.\n    \"\"\"\n    return name in self._strain_dict_name\n
    "},{"location":"api/strain/#nplinker.strain.StrainCollection.lookup","title":"lookup","text":"
    lookup(name: str) -> list[Strain]\n

    Lookup a strain by name (id or alias).

    Parameters:

    Name Type Description Default name str

    Strain name (id or alias) to lookup.

    required

    Returns:

    Type Description list[Strain]

    List of Strain objects with the given name.

    Raises:

    Type Description ValueError

    If the strain name is not found.

    Source code in src/nplinker/strain/strain_collection.py
    def lookup(self, name: str) -> list[Strain]:\n    \"\"\"Lookup a strain by name (id or alias).\n\n    Args:\n        name: Strain name (id or alias) to lookup.\n\n    Returns:\n        List of Strain objects with the given name.\n\n    Raises:\n        ValueError: If the strain name is not found.\n    \"\"\"\n    if name in self._strain_dict_name:\n        return self._strain_dict_name[name]\n    raise ValueError(f\"Strain {name} not found in the strain collection.\")\n
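
    A minimal sketch combining add, has_name and lookup (the strain id and alias are hypothetical):

    from nplinker.strain import Strain, StrainCollection\n\nsc = StrainCollection()\nstrain = Strain(\"strain1\")\nstrain.add_alias(\"alias1\")\nsc.add(strain)\n\nsc.has_name(\"alias1\")  # True\nsc.lookup(\"alias1\")  # [strain]; a list, since one name may map to several strains\n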
    "},{"location":"api/strain/#nplinker.strain.StrainCollection.read_json","title":"read_json staticmethod","text":"
    read_json(file: str | PathLike) -> 'StrainCollection'\n

    Read a strain mappings JSON file and return a StrainCollection object.

    Parameters:

    Name Type Description Default file str | PathLike

    Path to the strain mappings JSON file.

    required

    Returns:

    Type Description 'StrainCollection'

    StrainCollection object.

    Source code in src/nplinker/strain/strain_collection.py
    @staticmethod\ndef read_json(file: str | PathLike) -> \"StrainCollection\":\n    \"\"\"Read a strain mappings JSON file and return a StrainCollection object.\n\n    Args:\n        file: Path to the strain mappings JSON file.\n\n    Returns:\n        StrainCollection object.\n    \"\"\"\n    with open(file, \"r\") as f:\n        json_data = json.load(f)\n\n    # validate json data\n    validate(instance=json_data, schema=STRAIN_MAPPINGS_SCHEMA)\n\n    strain_collection = StrainCollection()\n    for data in json_data[\"strain_mappings\"]:\n        strain = Strain(data[\"strain_id\"])\n        for alias in data[\"strain_alias\"]:\n            strain.add_alias(alias)\n        strain_collection.add(strain)\n    return strain_collection\n
    "},{"location":"api/strain/#nplinker.strain.StrainCollection.to_json","title":"to_json","text":"
    to_json(file: str | PathLike | None = None) -> str | None\n

    Convert the StrainCollection object to a JSON string.

    Parameters:

    Name Type Description Default file str | PathLike | None

    Path to output JSON file. If None, return the JSON string instead.

    None

    Returns:

    Type Description str | None

    If file is None, return the JSON string. Otherwise, write the JSON string to the given file and return None.

    Source code in src/nplinker/strain/strain_collection.py
    def to_json(self, file: str | PathLike | None = None) -> str | None:\n    \"\"\"Convert the StrainCollection object to a JSON string.\n\n    Args:\n        file: Path to output JSON file. If None,\n            return the JSON string instead.\n\n    Returns:\n        If `file` is None, return the JSON string. Otherwise, write the JSON string to the given\n        file.\n    \"\"\"\n    data_list = [\n        {\"strain_id\": strain.id, \"strain_alias\": list(strain.aliases)} for strain in self\n    ]\n    json_data = {\"strain_mappings\": data_list, \"version\": \"1.0\"}\n\n    # validate json data\n    validate(instance=json_data, schema=STRAIN_MAPPINGS_SCHEMA)\n\n    if file is not None:\n        with open(file, \"w\") as f:\n            json.dump(json_data, f)\n        return None\n    return json.dumps(json_data)\n
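
    A minimal round-trip sketch with read_json; the file name follows the strain_mappings.json convention used elsewhere in these docs:

    from nplinker.strain import Strain, StrainCollection\n\nsc = StrainCollection()\nsc.add(Strain(\"strain1\"))\n\nsc.to_json(\"strain_mappings.json\")  # write to file, returns None\njson_str = sc.to_json()  # or get the JSON string directly\nsc2 = StrainCollection.read_json(\"strain_mappings.json\")\n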
    "},{"location":"api/strain_utils/","title":"Utilities","text":""},{"location":"api/strain_utils/#nplinker.strain.utils","title":"utils","text":""},{"location":"api/strain_utils/#nplinker.strain.utils.logger","title":"logger module-attribute","text":"
    logger = getLogger(__name__)\n
    "},{"location":"api/strain_utils/#nplinker.strain.utils.load_user_strains","title":"load_user_strains","text":"
    load_user_strains(json_file: str | PathLike) -> set[Strain]\n

    Load user specified strains from a JSON file.

    The JSON file must follow the schema defined in schemas/user_strains.json.

    An example content of the JSON file:
    {\"strain_ids\": [\"strain1\", \"strain2\"]}\n

    Parameters:

    Name Type Description Default json_file str | PathLike

    Path to the JSON file containing user specified strains.

    required

    Returns:

    Type Description set[Strain]

    A set of user specified strains.

    Source code in src/nplinker/strain/utils.py
    def load_user_strains(json_file: str | PathLike) -> set[Strain]:\n    \"\"\"Load user specified strains from a JSON file.\n\n    The JSON file must follow the schema defined in `schemas/user_strains.json`.\n\n    An example content of the JSON file:\n        ```\n        {\"strain_ids\": [\"strain1\", \"strain2\"]}\n        ```\n\n    Args:\n        json_file: Path to the JSON file containing user specified strains.\n\n    Returns:\n        A set of user specified strains.\n    \"\"\"\n    with open(json_file, \"r\") as f:\n        json_data = json.load(f)\n\n    # validate json data\n    validate(instance=json_data, schema=USER_STRAINS_SCHEMA)\n\n    strains = set()\n    for strain_id in json_data[\"strain_ids\"]:\n        strains.add(Strain(strain_id))\n\n    return strains\n
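
    For example, assuming a strains_selected.json file with the content shown above:

    from nplinker.strain.utils import load_user_strains\n\nstrains = load_user_strains(\"strains_selected.json\")\n# strains == {Strain(\"strain1\"), Strain(\"strain2\")}\n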
    "},{"location":"api/strain_utils/#nplinker.strain.utils.podp_generate_strain_mappings","title":"podp_generate_strain_mappings","text":"
    podp_generate_strain_mappings(\n    podp_project_json_file: str | PathLike,\n    genome_status_json_file: str | PathLike,\n    genome_bgc_mappings_file: str | PathLike,\n    gnps_file_mappings_file: str | PathLike,\n    output_json_file: str | PathLike,\n) -> StrainCollection\n

    Generate strain mappings JSON file for PODP pipeline.

    To get the strain mappings, we need to combine the following mappings:

    - strain_id <-> original_genome_id <-> resolved_genome_id <-> bgc_id
    - strain_id <-> MS_filename <-> spectrum_id

    These mappings are extracted from the following files:

    - \"strain_id <-> original_genome_id\" is extracted from podp_project_json_file.
    - \"original_genome_id <-> resolved_genome_id\" is extracted from genome_status_json_file.
    - \"resolved_genome_id <-> bgc_id\" is extracted from genome_bgc_mappings_file.
    - \"strain_id <-> MS_filename\" is extracted from podp_project_json_file.
    - \"MS_filename <-> spectrum_id\" is extracted from gnps_file_mappings_file.

    Parameters:

    Name Type Description Default podp_project_json_file str | PathLike

    The path to the PODP project JSON file.

    required genome_status_json_file str | PathLike

    The path to the genome status JSON file.

    required genome_bgc_mappings_file str | PathLike

    The path to the genome BGC mappings JSON file.

    required gnps_file_mappings_file str | PathLike

    The path to the GNPS file mappings file (csv or tsv).

    required output_json_file str | PathLike

    The path to the output JSON file.

    required

    Returns:

    Type Description StrainCollection

    The strain mappings stored in a StrainCollection object.

    See Also: extract_mappings_strain_id_original_genome_id, extract_mappings_original_genome_id_resolved_genome_id, extract_mappings_resolved_genome_id_bgc_id, get_mappings_strain_id_bgc_id, extract_mappings_strain_id_ms_filename, extract_mappings_ms_filename_spectrum_id and get_mappings_strain_id_spectrum_id.

    Source code in src/nplinker/strain/utils.py
    def podp_generate_strain_mappings(\n    podp_project_json_file: str | PathLike,\n    genome_status_json_file: str | PathLike,\n    genome_bgc_mappings_file: str | PathLike,\n    gnps_file_mappings_file: str | PathLike,\n    output_json_file: str | PathLike,\n) -> StrainCollection:\n    \"\"\"Generate strain mappings JSON file for PODP pipeline.\n\n    To get the strain mappings, we need to combine the following mappings:\n\n    - strain_id <-> original_genome_id <-> resolved_genome_id <-> bgc_id\n    - strain_id <-> MS_filename <-> spectrum_id\n\n    These mappings are extracted from the following files:\n\n    - \"strain_id <-> original_genome_id\" is extracted from `podp_project_json_file`.\n    - \"original_genome_id <-> resolved_genome_id\" is extracted from `genome_status_json_file`.\n    - \"resolved_genome_id <-> bgc_id\" is extracted from `genome_bgc_mappings_file`.\n    - \"strain_id <-> MS_filename\" is extracted from `podp_project_json_file`.\n    - \"MS_filename <-> spectrum_id\" is extracted from `gnps_file_mappings_file`.\n\n    Args:\n        podp_project_json_file: The path to the PODP project\n            JSON file.\n        genome_status_json_file: The path to the genome status\n            JSON file.\n        genome_bgc_mappings_file: The path to the genome BGC\n            mappings JSON file.\n        gnps_file_mappings_file: The path to the GNPS file\n            mappings file (csv or tsv).\n        output_json_file: The path to the output JSON file.\n\n    Returns:\n        The strain mappings stored in a StrainCollection object.\n\n    See Also:\n        - `extract_mappings_strain_id_original_genome_id`: Extract mappings\n            \"strain_id <-> original_genome_id\".\n        - `extract_mappings_original_genome_id_resolved_genome_id`: Extract mappings\n            \"original_genome_id <-> resolved_genome_id\".\n        - `extract_mappings_resolved_genome_id_bgc_id`: Extract mappings\n            \"resolved_genome_id <-> bgc_id\".\n        - `get_mappings_strain_id_bgc_id`: Get mappings \"strain_id <-> bgc_id\".\n        - `extract_mappings_strain_id_ms_filename`: Extract mappings\n            \"strain_id <-> MS_filename\".\n        - `extract_mappings_ms_filename_spectrum_id`: Extract mappings\n            \"MS_filename <-> spectrum_id\".\n        - `get_mappings_strain_id_spectrum_id`: Get mappings \"strain_id <-> spectrum_id\".\n    \"\"\"\n    # Get mappings strain_id <-> original_genome_id <-> resolved_genome_id <-> bgc_id\n    mappings_strain_id_bgc_id = get_mappings_strain_id_bgc_id(\n        extract_mappings_strain_id_original_genome_id(podp_project_json_file),\n        extract_mappings_original_genome_id_resolved_genome_id(genome_status_json_file),\n        extract_mappings_resolved_genome_id_bgc_id(genome_bgc_mappings_file),\n    )\n\n    # Get mappings strain_id <-> MS_filename <-> spectrum_id\n    mappings_strain_id_spectrum_id = get_mappings_strain_id_spectrum_id(\n        extract_mappings_strain_id_ms_filename(podp_project_json_file),\n        extract_mappings_ms_filename_spectrum_id(gnps_file_mappings_file),\n    )\n\n    # Get mappings strain_id <-> bgc_id / spectrum_id\n    mappings = mappings_strain_id_bgc_id.copy()\n    for strain_id, spectrum_ids in mappings_strain_id_spectrum_id.items():\n        if strain_id in mappings:\n            mappings[strain_id].update(spectrum_ids)\n        else:\n            mappings[strain_id] = spectrum_ids.copy()\n\n    # Create StrainCollection\n    sc = StrainCollection()\n    for strain_id, bgc_ids in 
mappings.items():\n        if not sc.has_name(strain_id):\n            strain = Strain(strain_id)\n            for bgc_id in bgc_ids:\n                strain.add_alias(bgc_id)\n            sc.add(strain)\n        else:\n            # strain_list has only one element\n            strain_list = sc.lookup(strain_id)\n            for bgc_id in bgc_ids:\n                strain_list[0].add_alias(bgc_id)\n\n    # Write strain mappings JSON file\n    sc.to_json(output_json_file)\n    logger.info(\"Generated strain mappings JSON file: %s\", output_json_file)\n\n    return sc\n
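
    A call sketch with hypothetical file paths, laid out roughly as in the working directory structure described later in these docs:

    from nplinker.strain.utils import podp_generate_strain_mappings\n\nsc = podp_generate_strain_mappings(\n    podp_project_json_file=\"downloads/paired_datarecord_xxx.json\",  # hypothetical\n    genome_status_json_file=\"downloads/genome_status.json\",\n    genome_bgc_mappings_file=\"antismash/genome_bgc_mappings.json\",  # hypothetical\n    gnps_file_mappings_file=\"gnps/file_mappings.tsv\",\n    output_json_file=\"strain_mappings.json\",\n)\n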
    "},{"location":"api/utils/","title":"General Utilities","text":""},{"location":"api/utils/#nplinker.utils","title":"utils","text":""},{"location":"api/utils/#nplinker.utils.logger","title":"logger module-attribute","text":"
    logger = getLogger(__name__)\n
    "},{"location":"api/utils/#nplinker.utils.calculate_md5","title":"calculate_md5","text":"
    calculate_md5(\n    fpath: str | PathLike, chunk_size: int = 1024 * 1024\n) -> str\n

    Calculate the MD5 checksum of a file.

    Parameters:

    Name Type Description Default fpath str | PathLike

    Path to the file.

    required chunk_size int

    Chunk size for reading the file. Defaults to 1024*1024.

    1024 * 1024

    Returns:

    Type Description str

    MD5 checksum of the file.

    Source code in src/nplinker/utils.py
    def calculate_md5(fpath: str | PathLike, chunk_size: int = 1024 * 1024) -> str:\n    \"\"\"Calculate the MD5 checksum of a file.\n\n    Args:\n        fpath: Path to the file.\n        chunk_size: Chunk size for reading the file. Defaults to 1024*1024.\n\n    Returns:\n        MD5 checksum of the file.\n    \"\"\"\n    if sys.version_info >= (3, 9):\n        md5 = hashlib.md5(usedforsecurity=False)\n    else:\n        md5 = hashlib.md5()\n    with open(fpath, \"rb\") as f:\n        for chunk in iter(lambda: f.read(chunk_size), b\"\"):\n            md5.update(chunk)\n    return md5.hexdigest()\n
    "},{"location":"api/utils/#nplinker.utils.check_md5","title":"check_md5","text":"
    check_md5(fpath: str | PathLike, md5: str) -> bool\n

    Verify the MD5 checksum of a file.

    Parameters:

    Name Type Description Default fpath str | PathLike

    Path to the file.

    required md5 str

    MD5 checksum to verify.

    required

    Returns:

    Type Description bool

    True if the MD5 checksum matches, False otherwise.

    Source code in src/nplinker/utils.py
    def check_md5(fpath: str | PathLike, md5: str) -> bool:\n    \"\"\"Verify the MD5 checksum of a file.\n\n    Args:\n        fpath: Path to the file.\n        md5: MD5 checksum to verify.\n\n    Returns:\n        True if the MD5 checksum matches, False otherwise.\n    \"\"\"\n    return md5 == calculate_md5(fpath)\n
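
    For example (a minimal sketch; the file name is hypothetical):

    from nplinker.utils import calculate_md5, check_md5\n\nmd5 = calculate_md5(\"GCF_000016425.1.zip\")\ncheck_md5(\"GCF_000016425.1.zip\", md5)  # True\n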
    "},{"location":"api/utils/#nplinker.utils.download_and_extract_archive","title":"download_and_extract_archive","text":"
    download_and_extract_archive(\n    url: str,\n    download_root: str | PathLike,\n    extract_root: str | Path | None = None,\n    filename: str | None = None,\n    md5: str | None = None,\n    remove_finished: bool = False,\n) -> None\n

    Download a file from url and extract it.

    This method is a wrapper around the download_url and extract_archive methods.

    Parameters:

    Name Type Description Default url str

    URL to download file from

    required download_root str | PathLike

    Path to the directory to place downloaded file in. If it doesn't exist, it will be created.

    required extract_root str | Path | None

    Path to the directory the file will be extracted to. The given directory will be created if it does not exist. If omitted, the download_root is used.

    None filename str | None

    Name to save the downloaded file under. If None, use the basename of the URL

    None md5 str | None

    MD5 checksum of the download. If None, do not check

    None remove_finished bool

    If True, remove the downloaded file after the extraction. Defaults to False.

    False Source code in src/nplinker/utils.py
    def download_and_extract_archive(\n    url: str,\n    download_root: str | PathLike,\n    extract_root: str | Path | None = None,\n    filename: str | None = None,\n    md5: str | None = None,\n    remove_finished: bool = False,\n) -> None:\n    \"\"\"Download a file from url and extract it.\n\n       This method is a wrapper of `download_url` and `extract_archive` methods.\n\n    Args:\n        url: URL to download file from\n        download_root: Path to the directory to place downloaded\n            file in. If it doesn't exist, it will be created.\n        extract_root: Path to the directory the file\n            will be extracted to. The given directory will be created if not exist.\n            If omitted, the `download_root` is used.\n        filename: Name to save the downloaded file under.\n            If None, use the basename of the URL\n        md5: MD5 checksum of the download. If None, do not check\n        remove_finished: If `True`, remove the downloaded file\n             after the extraction. Defaults to False.\n    \"\"\"\n    download_root = Path(download_root)\n    if extract_root is None:\n        extract_root = download_root\n    else:\n        extract_root = Path(extract_root)\n    if not filename:\n        filename = Path(url).name\n\n    download_url(url, download_root, filename, md5)\n\n    archive = download_root / filename\n    extract_archive(archive, extract_root, remove_finished=remove_finished)\n
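
    A usage sketch with a hypothetical URL and no checksum verification:

    from nplinker.utils import download_and_extract_archive\n\ndownload_and_extract_archive(\n    url=\"https://example.org/GCF_000016425.1.zip\",  # hypothetical URL\n    download_root=\"downloads\",\n    extract_root=\"antismash\",\n    md5=None,  # skip checksum verification\n    remove_finished=True,  # delete the archive after extraction\n)\n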
    "},{"location":"api/utils/#nplinker.utils.download_url","title":"download_url","text":"
    download_url(\n    url: str,\n    root: str | PathLike,\n    filename: str | None = None,\n    md5: str | None = None,\n    http_method: str = \"GET\",\n    allow_http_redirect: bool = True,\n) -> None\n

    Download a file from a url and place it in root.

    Parameters:

    Name Type Description Default url str

    URL to download file from

    required root str | PathLike

    Directory to place downloaded file in. If it doesn't exist, it will be created.

    required filename str | None

    Name to save the file under. If None, use the basename of the URL.

    None md5 str | None

    MD5 checksum of the download. If None, do not check.

    None http_method str

    HTTP request method, e.g. \"GET\", \"POST\". Defaults to \"GET\".

    'GET' allow_http_redirect bool

    If true, enable following redirects for all HTTP (\"http:\") methods.

    True Source code in src/nplinker/utils.py
    def download_url(\n    url: str,\n    root: str | PathLike,\n    filename: str | None = None,\n    md5: str | None = None,\n    http_method: str = \"GET\",\n    allow_http_redirect: bool = True,\n) -> None:\n    \"\"\"Download a file from a url and place it in root.\n\n    Args:\n        url: URL to download file from\n        root: Directory to place downloaded file in. If it doesn't exist, it will be created.\n        filename: Name to save the file under. If None, use the\n            basename of the URL.\n        md5: MD5 checksum of the download. If None, do not check.\n        http_method: HTTP request method, e.g. \"GET\", \"POST\".\n            Defaults to \"GET\".\n        allow_http_redirect: If true, enable following redirects for all HTTP (\"http:\") methods.\n    \"\"\"\n    root = transform_to_full_path(root)\n    # create the download directory if not exist\n    root.mkdir(exist_ok=True)\n    if not filename:\n        filename = Path(url).name\n    fpath = root / filename\n\n    # check if file is already present locally\n    if fpath.is_file() and md5 is not None and check_md5(fpath, md5):\n        logger.info(\"Using downloaded and verified file: \" + str(fpath))\n        return\n\n    # download the file\n    with open(fpath, \"wb\") as fh:\n        with httpx.stream(http_method, url, follow_redirects=allow_http_redirect) as response:\n            if not response.is_success:\n                fpath.unlink(missing_ok=True)\n                raise RuntimeError(\n                    f\"Failed to download url {url} with status code {response.status_code}\"\n                )\n            total = int(response.headers.get(\"Content-Length\", 0))\n\n            with Progress(\n                TextColumn(\"[progress.description]{task.description}\"),\n                BarColumn(bar_width=None),\n                \"[progress.percentage]{task.percentage:>3.1f}%\",\n                \"\u2022\",\n                DownloadColumn(),\n                \"\u2022\",\n                TransferSpeedColumn(),\n                \"\u2022\",\n                TimeRemainingColumn(),\n                \"\u2022\",\n                TimeElapsedColumn(),\n            ) as progress:\n                task = progress.add_task(f\"[hot_pink]Downloading {fpath.name}\", total=total)\n                for chunk in response.iter_bytes():\n                    fh.write(chunk)\n                    progress.update(task, advance=len(chunk))\n\n    # check integrity of downloaded file\n    if md5 is not None and not check_md5(fpath, md5):\n        raise RuntimeError(\"MD5 validation failed.\")\n
    "},{"location":"api/utils/#nplinker.utils.extract_archive","title":"extract_archive","text":"
    extract_archive(\n    from_path: str | PathLike,\n    extract_root: str | PathLike | None = None,\n    members: list | None = None,\n    remove_finished: bool = False,\n) -> str\n

    Extract an archive.

    The archive type and a possible compression are automatically detected from the file name. If the file is compressed but not an archive, the call is dispatched to decompress.

    Parameters:

    Name Type Description Default from_path str | PathLike

    Path to the file to be extracted.

    required extract_root str | PathLike | None

    Path to the directory the file will be extracted to. The given directory will be created if it does not exist. If omitted, the directory of the archive file is used.

    None members list | None

    Optional selection of members to extract. If not specified, all members are extracted. Members must be a subset of the list returned by zipfile.ZipFile.namelist() (or a list of strings) for a zip file, or by tarfile.TarFile.getmembers() for a tar file.

    None remove_finished bool

    If True, remove the file after the extraction.

    False

    Returns:

    Type Description str

    Path to the directory the file was extracted to.

    Source code in src/nplinker/utils.py
    def extract_archive(\n    from_path: str | PathLike,\n    extract_root: str | PathLike | None = None,\n    members: list | None = None,\n    remove_finished: bool = False,\n) -> str:\n    \"\"\"Extract an archive.\n\n    The archive type and a possible compression is automatically detected from\n    the file name. If the file is compressed but not an archive the call is\n    dispatched to :func:`decompress`.\n\n    Args:\n        from_path: Path to the file to be extracted.\n        extract_root: Path to the directory the file will be extracted to.\n            The given directory will be created if not exist.\n            If omitted, the directory of the archive file is used.\n        members: Optional selection of members to extract. If not specified,\n            all members are extracted.\n            Members must be a subset of the list returned by\n            - `zipfile.ZipFile.namelist()` or a list of strings for zip file\n            - `tarfile.TarFile.getmembers()` for tar file\n        remove_finished: If `True`, remove the file after the extraction.\n\n    Returns:\n        Path to the directory the file was extracted to.\n    \"\"\"\n    from_path = Path(from_path)\n\n    if extract_root is None:\n        extract_root = from_path.parent\n    else:\n        extract_root = Path(extract_root)\n\n    # create the extract directory if not exist\n    extract_root.mkdir(exist_ok=True)\n\n    logger.info(f\"Extracting {from_path} to {extract_root}\")\n    suffix, archive_type, compression = _detect_file_type(from_path)\n    if not archive_type:\n        return _decompress(\n            from_path,\n            extract_root / from_path.name.replace(suffix, \"\"),\n            remove_finished=remove_finished,\n        )\n\n    extractor = _ARCHIVE_EXTRACTORS[archive_type]\n\n    extractor(str(from_path), str(extract_root), members, compression)\n    if remove_finished:\n        from_path.unlink()\n\n    return str(extract_root)\n
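
    For example (a minimal sketch; the archive path is hypothetical):

    from nplinker.utils import extract_archive\n\n# extract next to the archive and keep the original file\nextracted_dir = extract_archive(\"downloads/mibig_json_3.1.tar.gz\")\n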
    "},{"location":"api/utils/#nplinker.utils.find_delimiter","title":"find_delimiter","text":"
    find_delimiter(file: str | PathLike) -> str\n

    Detect the delimiter for the given tabular file.

    Parameters:

    Name Type Description Default file str | PathLike

    Path to tabular file.

    required

    Returns:

    Type Description str

    Detected delimiter character.

    Examples:

    >>> delim = find_delimiter(\"~/table.csv\")\n
    Source code in src/nplinker/utils.py
    def find_delimiter(file: str | PathLike) -> str:\n    \"\"\"Detect the delimiter for the given tabular file.\n\n    Args:\n        file: Path to tabular file.\n\n    Returns:\n        Detected delimiter character.\n\n    Examples:\n        >>> delim = find_delimiter(\"~/table.csv\")\n    \"\"\"\n    sniffer = csv.Sniffer()\n    with open(file, mode=\"rt\", encoding=\"utf-8\") as fp:\n        delimiter = sniffer.sniff(fp.read(5000)).delimiter\n    return delimiter\n
    "},{"location":"api/utils/#nplinker.utils.get_headers","title":"get_headers","text":"
    get_headers(file: str | PathLike) -> list[str]\n

    Read headers from the given tabular file.

    Parameters:

    Name Type Description Default file str | PathLike

    Path to the file to read the header from.

    required

    Returns:

    Type Description list[str]

    A list of column names from the header.

    Source code in src/nplinker/utils.py
    def get_headers(file: str | PathLike) -> list[str]:\n    \"\"\"Read headers from the given tabular file.\n\n    Args:\n        file: Path to the file to read the header from.\n\n    Returns:\n        A list of column names from the header.\n    \"\"\"\n    with open(file) as f:\n        headers = f.readline().strip()\n        dl = find_delimiter(file)\n        return headers.split(dl)\n
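
    For example (a sketch; the file path is hypothetical and the delimiter is detected internally via find_delimiter):

    from nplinker.utils import get_headers\n\nheaders = get_headers(\"gnps/file_mappings.tsv\")  # e.g. a list of column names\n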
    "},{"location":"api/utils/#nplinker.utils.is_file_format","title":"is_file_format","text":"
    is_file_format(\n    file: str | PathLike, format: str = \"tsv\"\n) -> bool\n

    Check if the file is in the given format.

    Parameters:

    Name Type Description Default file str | PathLike

    Path to the file to check.

    required format str

    The format to check for, either \"tsv\" or \"csv\".

    'tsv'

    Returns:

    Type Description bool

    True if the file is in the given format, False otherwise.

    Source code in src/nplinker/utils.py
    def is_file_format(file: str | PathLike, format: str = \"tsv\") -> bool:\n    \"\"\"Check if the file is in the given format.\n\n    Args:\n        file: Path to the file to check.\n        format: The format to check for, either \"tsv\" or \"csv\".\n\n    Returns:\n        True if the file is in the given format, False otherwise.\n    \"\"\"\n    try:\n        with open(file, \"rt\") as f:\n            if format == \"tsv\":\n                reader = csv.reader(f, delimiter=\"\\t\")\n            elif format == \"csv\":\n                reader = csv.reader(f, delimiter=\",\")\n            else:\n                raise ValueError(f\"Unknown format '{format}'.\")\n            for _ in reader:\n                pass\n        return True\n    except csv.Error:\n        return False\n
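
    For example (hypothetical file paths):

    from nplinker.utils import is_file_format\n\nis_file_format(\"gnps/file_mappings.tsv\", format=\"tsv\")\nis_file_format(\"gnps/file_mappings.csv\", format=\"csv\")\n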
    "},{"location":"api/utils/#nplinker.utils.list_dirs","title":"list_dirs","text":"
    list_dirs(\n    root: str | PathLike, keep_parent: bool = True\n) -> list[str]\n

    List all directories at a given root.

    Parameters:

    Name Type Description Default root str | PathLike

    Path to directory whose folders need to be listed

    required keep_parent bool

    If true, prepends the path to each result, otherwise only returns the names of the directories found.

    True Source code in src/nplinker/utils.py
    def list_dirs(root: str | PathLike, keep_parent: bool = True) -> list[str]:\n    \"\"\"List all directories at a given root.\n\n    Args:\n        root: Path to directory whose folders need to be listed\n        keep_parent: If true, prepends the path to each result, otherwise\n            only returns the name of the directories found\n    \"\"\"\n    root = transform_to_full_path(root)\n    directories = [str(p) for p in root.iterdir() if p.is_dir()]\n    if not keep_parent:\n        directories = [os.path.basename(d) for d in directories]\n    return directories\n
    "},{"location":"api/utils/#nplinker.utils.list_files","title":"list_files","text":"
    list_files(\n    root: str | PathLike,\n    prefix: str | tuple[str, ...] = \"\",\n    suffix: str | tuple[str, ...] = \"\",\n    keep_parent: bool = True,\n) -> list[str]\n

    List all files at a given root.

    Parameters:

    Name Type Description Default root str | PathLike

    Path to directory whose files need to be listed

    required prefix str | tuple[str, ...]

    Prefix of the file names to match. Defaults to the empty string \"\".

    '' suffix str | tuple[str, ...]

    Suffix of the files to match, e.g. \".png\" or (\".jpg\", \".png\"). Defaults to the empty string \"\".

    '' keep_parent bool

    If true, prepends the parent path to each result, otherwise only returns the names of the files found. Defaults to True.

    True Source code in src/nplinker/utils.py
    def list_files(\n    root: str | PathLike,\n    prefix: str | tuple[str, ...] = \"\",\n    suffix: str | tuple[str, ...] = \"\",\n    keep_parent: bool = True,\n) -> list[str]:\n    \"\"\"List all files at a given root.\n\n    Args:\n        root: Path to directory whose files need to be listed\n        prefix: Prefix of the file names to match,\n            Defaults to empty string '\"\"'.\n        suffix: Suffix of the files to match, e.g. \".png\" or\n            (\".jpg\", \".png\").\n            Defaults to empty string '\"\"'.\n        keep_parent: If true, prepends the parent path to each\n            result, otherwise only returns the name of the files found.\n            Defaults to False.\n    \"\"\"\n    root = Path(root)\n    files = [\n        str(p)\n        for p in root.iterdir()\n        if p.is_file() and p.name.startswith(prefix) and p.name.endswith(suffix)\n    ]\n\n    if not keep_parent:\n        files = [os.path.basename(f) for f in files]\n\n    return files\n
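
    For example, listing antiSMASH region GenBank files (a sketch; the directory is hypothetical):

    from nplinker.utils import list_files\n\ngbk_files = list_files(\"antismash/GCF_000016425.1\", suffix=\".gbk\")\n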
    "},{"location":"api/utils/#nplinker.utils.transform_to_full_path","title":"transform_to_full_path","text":"
    transform_to_full_path(p: str | PathLike) -> Path\n

    Transform a path to a full path.

    The path is expanded (i.e. the ~ will be replaced with the actual path) and converted to an absolute path (i.e. . or .. will be replaced with the actual path).

    Parameters:

    Name Type Description Default p str | PathLike

    The path to transform.

    required

    Returns:

    Type Description Path

    The transformed full path.

    Source code in src/nplinker/utils.py
    def transform_to_full_path(p: str | PathLike) -> Path:\n    \"\"\"Transform a path to a full path.\n\n    The path is expanded (i.e. the `~` will be replaced with actual path) and converted to an\n    absolute path (i.e. `.` or `..` will be replaced with actual path).\n\n    Args:\n        p: The path to transform.\n\n    Returns:\n        The transformed full path.\n    \"\"\"\n    # Multiple calls to `Path` are used to ensure static typing compatibility.\n    p = Path(p).expanduser()\n    p = Path(p).resolve()\n    return Path(p)\n
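
    For example (output depends on the user's home directory and current working directory):

    from nplinker.utils import transform_to_full_path\n\ntransform_to_full_path(\"~/nplinker\")  # e.g. /home/<user>/nplinker\ntransform_to_full_path(\"./data\")  # absolute path of ./data\n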
    "},{"location":"concepts/bigscape/","title":"BigScape","text":"

    NPLinker can run BigScape automatically if the bigscape directory does not exist in the working directory.

    To run BigScape, NPLinker requires the following BigScape parameters: --mix, --include_singletons and --cutoffs.

    And the following parameters are not allowed, because NPLinker configures them automatically: --inputdir, --outputdir and --pfam_dir.

    If the BigScape parameter --mibig is set, make sure to set mibig.to_use to true and mibig.version to the version of mibig used by bigscape in your config file nplinker.toml.

    See the default configurations for the default parameters of BigScape.

    "},{"location":"concepts/config_file/","title":"Config File","text":""},{"location":"concepts/config_file/#configuration-template","title":"Configuration Template","text":"
    #############################\n# NPLinker configuration file\n#############################\n\n# The root directory of the NPLinker project. You need to create it first.\n# The value is required and must be a full path.\nroot_dir = \"<NPLinker root directory>\"\n# The mode for preparing the dataset.\n# The available modes are \"podp\" and \"local\".\n# \"podp\" mode is for using the PODP platform (https://pairedomicsdata.bioinformatics.nl/) to prepare the dataset.\n# \"local\" mode is for preparing the dataset locally. So users do not need to upload their data to the PODP platform.\n# The value is required.\nmode = \"podp\"\n# The PODP project identifier.\n# The value is required if the mode is \"podp\".\npodp_id = \"\"\n\n\n[log]\n# Log level. The available levels are the same as the levels in python package `logging`:\n# \"DEBUG\", \"INFO\", \"WARNING\", \"ERROR\", \"CRITICAL\".\n# The default value is \"INFO\".\nlevel = \"INFO\"\n# The log file to append log messages.\n# The value is optional.\n# If not set or set to an empty string, log messages will not be written to a file.\n# The file will be created if it does not exist. Log messages will be appended to the file if it exists.\nfile = \"path/to/logfile\"\n# Whether to write log messages to console.\n# The default value is true.\nuse_console = true\n\n\n[mibig]\n# Whether to use mibig metadata (json).\n# The default value is true.\nto_use = true\n# The version of mibig metadata.\n# Make sure to use the same version of mibig as in bigscape.\n# The default value is \"3.1\".\nversion = \"3.1\"\n\n\n[bigscape]\n# The parameters to use for running BiG-SCAPE.\n# Required bigscape parameters are `--mix`, `--include_singletons` and `--cutoffs`. NPLinker needs\n# them to run the analysis properly.\n# Parameters that must NOT exist: `--inputdir`, `--outputdir`, `--pfam_dir`. NPLinker will\n# automatically configure them.\n# If parameter `--mibig` is set, make sure to set the config `mibig.to_use` to true and\n# `mibig.version` to the version of mibig used in bigscape.\n# The default value is \"--mibig --clans-off --mix --include_singletons --cutoffs 0.30\".\nparameters = \"--mibig --clans-off --mix --include_singletons --cutoffs 0.30\"\n# Which bigscape cutoff to use for NPLinker analysis.\n# There might be multiple cutoffs in bigscape output.\n# Note that this value must be a string.\n# The default value is \"0.30\".\ncutoff = \"0.30\"\n\n\n[scoring]\n# Scoring methods.\n# Valid values are \"metcalf\" and \"rosetta\".\n# The default value is \"metcalf\".\nmethods = [\"metcalf\"]\n
    "},{"location":"concepts/config_file/#default-configurations","title":"Default Configurations","text":"

    The default configurations are automatically used by NPLinker if you don't set them in your config file.

    # NPLinker default configurations\n\n[log]\nlevel = \"INFO\"\nuse_console = true\n\n[mibig]\nto_use = true\nversion = \"3.1\"\n\n[bigscape]\nparameters = \"--mibig --clans-off --mix --include_singletons --cutoffs 0.30\"\ncutoff = \"0.30\"\n\n[scoring]\nmethods = [\"metcalf\"]\n
    "},{"location":"concepts/config_file/#config-loader","title":"Config loader","text":"

    You can load the configuration file using the load_config function.

    from nplinker.config import load_config\nconfig = load_config('path/to/nplinker.toml')\n

    When you use NPLinker as an application, you can get access to the configuration object directly:

    from nplinker import NPLinker\nnpl = NPLinker('path/to/nplinker.toml')\nprint(npl.config)\n
    "},{"location":"concepts/gnps_data/","title":"GNPS Data","text":"

    NPLinker requires GNPS molecular networking data as input. It currently accepts data from the following GNPS workflows: METABOLOMICS-SNETS, METABOLOMICS-SNETS-V2 and FEATURE-BASED-MOLECULAR-NETWORKING.

    "},{"location":"concepts/gnps_data/#mappings-from-gnps-data-to-nplinker-input","title":"Mappings from GNPS data to NPLinker input","text":"METABOLOMICS-SNETS workflowMETABOLOMICS-SNETS-V2FEATURE-BASED-MOLECULAR-NETWORKING NPLinker input GNPS file in the archive of Download Clustered Spectra as MGF spectra.mgf METABOLOMICS-SNETS*.mgf molecular_families.tsv networkedges_selfloop/*.pairsinfo annotations.tsv result_specnets_DB/*.tsv file_mappings.tsv clusterinfosummarygroup_attributes_withIDs_withcomponentID/*.tsv

    For example, the file METABOLOMICS-SNETS*.mgf from the downloaded zip archive is used as the spectra.mgf input file of NPLinker.

    When manually preparing GNPS data for NPLinker, the METABOLOMICS-SNETS*.mgf must be renamed to spectra.mgf and placed in the gnps sub-directory of the NPLinker working directory.
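
    A minimal sketch of this manual step, assuming the downloaded archive was extracted to a folder named downloaded_gnps_archive (a hypothetical path for illustration):

    Rename and place `spectra.mgf` manually
    import shutil\nfrom pathlib import Path\n\nroot = Path(\"nplinker_quickstart\")  # your NPLinker working directory\n(root / \"gnps\").mkdir(parents=True, exist_ok=True)\n\n# Copy the MGF file into the `gnps` sub-directory under the fixed name\nmgf = next(Path(\"downloaded_gnps_archive\").glob(\"METABOLOMICS-SNETS*.mgf\"))\nshutil.copy(mgf, root / \"gnps\" / \"spectra.mgf\")\n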

    METABOLOMICS-SNETS-V2 workflow (GNPS files from the archive of Download Clustered Spectra as MGF): spectra.mgf comes from METABOLOMICS-SNETS-V2*.mgf; molecular_families.tsv from networkedges_selfloop/*.selfloop; annotations.tsv from result_specnets_DB/*.tsv; file_mappings.tsv from clusterinfosummarygroup_attributes_withIDs_withcomponentID/*.clustersummary. FEATURE-BASED-MOLECULAR-NETWORKING workflow (GNPS files from the archive of Download Cytoscape Data): spectra.mgf comes from spectra/*.mgf; molecular_families.tsv from networkedges_selfloop/*.selfloop; annotations.tsv from DB_result/*.tsv; file_mappings.csv from quantification_table/*.csv.

    Note that file_mappings.csv is a CSV file, not a TSV file, unlike in the other workflows.
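
    If you post-process these mapping files yourself, mind the delimiter; a small sketch using only the standard library:

    Read `file_mappings` with the right delimiter
    import csv\nfrom pathlib import Path\n\ndef read_file_mappings(path: str) -> list[list[str]]:\n    # file_mappings.csv uses commas; the .tsv variants use tabs\n    delimiter = \",\" if Path(path).suffix == \".csv\" else \"\\t\"\n    with open(path, newline=\"\") as f:\n        return list(csv.reader(f, delimiter=delimiter))\n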

    "},{"location":"concepts/working_dir_structure/","title":"Working Directory Structure","text":"

    NPLinker requires a fixed working directory structure with fixed names for the input and output data.

    root_dir # (1)!\n    \u2502\n    \u251c\u2500\u2500 nplinker.toml                           # (2)!\n    \u251c\u2500\u2500 strain_mappings.json                [F] # (3)!\n    \u251c\u2500\u2500 strains_selected.json               [F][O] # (4)!\n    \u2502\n    \u251c\u2500\u2500 gnps                                [F] # (5)!\n    \u2502       \u251c\u2500\u2500 spectra.mgf                 [F]\n    \u2502       \u251c\u2500\u2500 molecular_families.tsv      [F]\n    \u2502       \u251c\u2500\u2500 annotations.tsv             [F]\n    \u2502       \u2514\u2500\u2500 file_mappings.tsv (.csv)    [F] # (6)!\n    \u2502\n    \u251c\u2500\u2500 antismash                           [F] # (7)!\n    \u2502   \u251c\u2500\u2500 GCF_000514975.1\n    \u2502   \u2502   \u251c\u2500\u2500 xxx.region001.gbk\n    \u2502   \u2502   \u2514\u2500\u2500 ...\n    \u2502   \u251c\u2500\u2500 GCF_000016425.1\n    \u2502   \u2502   \u251c\u2500\u2500 xxxx.region001.gbk\n    \u2502   \u2502   \u2514\u2500\u2500 ...\n    \u2502   \u2514\u2500\u2500 ...\n    \u2502\n    \u251c\u2500\u2500 bigscape                            [F][O] # (8)!\n    \u2502   \u251c\u2500\u2500 mix_clustering_c0.30.tsv        [F]    # (9)!\n    \u2502   \u2514\u2500\u2500 bigscape_running_output\n    \u2502       \u2514\u2500\u2500 ...\n    \u2502\n    \u251c\u2500\u2500 downloads                           [F][A] # (10)!\n    \u2502       \u251c\u2500\u2500 paired_datarecord_4b29ddc3-26d0-40d7-80c5-44fb6631dbf9.4.json # (11)!\n    \u2502       \u251c\u2500\u2500 GCF_000016425.1.zip\n    \u2502       \u251c\u2500\u2500 GCF_0000514975.1.zip\n    \u2502       \u251c\u2500\u2500 c22f44b14a3d450eb836d607cb9521bb.zip\n    \u2502       \u251c\u2500\u2500 genome_status.json\n    \u2502       \u2514\u2500\u2500 mibig_json_3.1.tar.gz\n    \u2502\n    \u251c\u2500\u2500 mibig                               [F][A] # (12)!\n    \u2502   \u251c\u2500\u2500 BGC0000001.json\n    \u2502   \u251c\u2500\u2500 BGC0000002.json\n    \u2502   \u2514\u2500\u2500 ...\n    \u2502\n    \u251c\u2500\u2500 output                              [F][A] # (13)!\n    \u2502   \u2514\u2500\u2500 ...\n    \u2502\n    \u2514\u2500\u2500 ...                                        # (14)!\n
    1. root_dir is the working directory you created, used as the root directory for NPLinker.
    2. nplinker.toml is the configuration file (toml format) provided by the user for running NPLinker.
    3. strain_mappings.json contains the mappings from strain to genomics and metabolomics data. It is generated by NPLinker in podp mode; in local mode, users need to create it manually. [F] means the file name is fixed (including the extension) and the file must be named as shown.
    4. strains_selected.json is an optional file containing the list of strains to be used in the analysis. If it is not provided, NPLinker will use all strains detected from the input data. [O] means the file strains_selected.json is optional for users to provide.
    5. gnps directory contains the GNPS data. The files in this directory must be named as shown. See the GNPS Data page for more information about the GNPS data.
    6. This file can be in .tsv or .csv format.
    7. antismash directory contains a collection of AntiSMASH BGC data. The BGC data (*.region*.gbk files) must be stored in subdirectories named after the NCBI accession number (e.g. GCF_000514975.1).
    8. bigscape directory is optional and contains the output of BigScape. If the directory is not provided, NPLinker will run BigScape automatically to generate the data using the AntiSMASH BGC data.
    9. mix_clustering_c0.30.tsv is an example output of BigScape. The file name must follow the pattern mix_clustering_c{cutoff}.tsv, where {cutoff} is the cutoff value used in the BigScape run.
    10. downloads directory is automatically created and managed by NPLinker. It stores the downloaded data from the internet. Users can also use it to store their own downloaded data. [A] means the directory is automatically created and/or managed by NPLinker.
    11. This is an example file; the actual file will differ. The same applies to the other files in the downloads directory.
    12. mibig directory contains the MIBiG metadata, which is automatically created and downloaded by NPLinker. Users should not interfere with this directory and its content.
    13. output directory is automatically created by NPLinker. It stores the output data of NPLinker.
    14. The structure is flexible: you can extend NPLinker by adding other types of data. A sketch for creating this skeleton follows below.
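
    A minimal sketch that creates the user-provided part of this skeleton (the directory names are the fixed ones shown above; the root path is an example):

    Create the working directory skeleton
    from pathlib import Path\n\nroot = Path(\"nplinker_quickstart\")  # the root_dir of your project\nfor sub in (\"gnps\", \"antismash\", \"bigscape\"):\n    (root / sub).mkdir(parents=True, exist_ok=True)\n# `downloads`, `mibig` and `output` are created and managed by NPLinker itself\n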

    Tip

    "},{"location":"diagrams/arranger/","title":"Dataset Arranging Pipeline","text":"

    The DatasetArranger is implemented according to the following flowcharts.

    "},{"location":"diagrams/arranger/#strain-mappings-file","title":"Strain mappings file","text":"
    flowchart TD\n    StrainMappings[`strain_mappings.json`] --> SM{Is the mode PODP?}\n    SM --> |No |SM0[Validate the file]\n    SM --> |Yes|SM1[Generate the file] --> SM0
    "},{"location":"diagrams/arranger/#strain-selection-file","title":"Strain selection file","text":"
    flowchart TD\n    StrainsSelected[`strains_selected.json`] --> S{Does the file exist?}\n    S --> |No | S0[Nothing to do]\n    S --> |Yes| S1[Validate the file]
    "},{"location":"diagrams/arranger/#podp-project-metadata-json-file","title":"PODP project metadata json file","text":"
    flowchart TD\n    podp[PODP project metadata json file] --> A{Is the mode PODP?}\n    A --> |No | A0[Nothing to do]\n    A --> |Yes| P{Does the file exist?}\n    P --> |No | P0[Download the file] --> P1\n    P --> |Yes| P1[Validate the file]
    "},{"location":"diagrams/arranger/#gnps-antismash-and-bigscape","title":"GNPS, AntiSMASH and BigScape","text":"
    flowchart TD\n    ConfigError[Dynaconf config validation error]\n    DataError[Data validation error]\n    UseIt[Use the data]\n    Download[First remove existing data if relevant, then download or generate data]\n\n    A[GNPS, antiSMASH and BiG-SCAPE] --> B{Pass Dynaconf config validation?}\n    B -->|No | ConfigError\n    B -->|Yes| G{Is the mode PODP?}\n\n    G -->|No, local mode| G1{Does data dir exist?}\n    G1 -->|No | DataError\n    G1 -->|Yes| H{Pass data validation?}\n    H --> |No | DataError\n    H --> |Yes| UseIt \n\n    G -->|Yes, podp mode| G2{Does data dir exist?}\n    G2 --> |No | Download\n    G2 --> |Yes | J{Pass data validation?}\n    J -->|No | Download --> |try max 2 times| J\n    J -->|Yes| UseIt
    "},{"location":"diagrams/arranger/#mibig-data","title":"MIBiG Data","text":"

    MIBiG data is always downloaded automatically. Users cannot provide their own MIBiG data.

    flowchart TD\n    Mibig[MIBiG] --> M0{Pass Dynaconf config validation?}\n    M0 -->|No | M01[Dynaconf config validation error]\n    M0 -->|Yes | MibigDownload[First remove existing data if relevant and then download data]
    "}]} \ No newline at end of file +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"NPLinker","text":"

    NPLinker is a python framework for data mining microbial natural products by integrating genomics and metabolomics data.

    For a deep understanding of NPLinker, please refer to the original paper.

    Under Development

    NPLinker v2 is under active development (see its pre-releases). The documentation is not complete yet. If you have any questions, please contact us via GitHub Issues.

    "},{"location":"install/","title":"Installation","text":"Requirements

    NPLinker is a python package that has both pypi packages and non-pypi packages as dependencies. It requires ~4.5GB of disk space to install all the dependencies.

    Install the nplinker package as follows:

    Install nplinker package
    # Check python version (\u22653.9)\npython --version\n\n# Create a new virtual environment\npython -m venv env          # (1)!\nsource env/bin/activate\n\n# install nplinker package (requiring ~300MB of disk space)\npip install nplinker==2.0.0a2 # (2)! \n\n# install nplinker non-pypi dependencies and databases (~4GB)\ninstall-nplinker-deps\n
    1. A virtual environment is required to install the non-pypi dependencies. You can also use conda to create a new environment, but NPLinker is not available on conda yet.
    2. NPLinker v2 is still under development and released as a pre-release. To install the pre-release, you have to explicitly specify the version. The command pip install nplinker will install the legacy NPLinker (v1.3.2), which is not recommended.
    "},{"location":"install/#install-from-source-code","title":"Install from source code","text":"

    You can also install NPLinker from source code:

    Install from latest source code
    pip install git+https://github.com/nplinker/nplinker@dev  # (1)!\ninstall-nplinker-deps\n
    1. @dev refers to the dev branch. You can replace it with another branch name, a commit, or a tag.
    "},{"location":"logging/","title":"Logging","text":"

    NPLinker uses the standard library logging module for managing log messages and the python library rich to colorize the log messages. Depending on how you use NPLinker, you can set up logging in different ways.

    "},{"location":"logging/#nplinker-as-an-application","title":"NPLinker as an application","text":"

    If you're using NPLinker as an application, you're running the whole workflow of NPLinker as described in the Quickstart. In this case, you can set up logging in the nplinker configuration file nplinker.toml.
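
    For example, a [log] section in nplinker.toml could look like this (the settings are the ones documented on the Config File page):

    nplinker.toml
    [log]\nlevel = \"DEBUG\"\nfile = \"nplinker.log\"\nuse_console = true\n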

    "},{"location":"logging/#nplinker-as-a-library","title":"NPLinker as a library","text":"

    If you're using NPLinker as a library, you're using only some functions and classes of NPLinker in your script. By default, NPLinker will not log any messages. However, you can set up logging in your script to log messages.

    Set up logging in 'your_script.py'
    # Set up logging configuration first\nfrom nplinker import setup_logging\n\nsetup_logging(level=\"DEBUG\", file=\"nplinker.log\", use_console=True) # (1)!\n\n# Your business code here\n# e.g. download and extract nplinker example data\nfrom nplinker.utils import download_and_extract_archive\n\ndownload_and_extract_archive(\n    url=\"https://zenodo.org/records/10822604/files/nplinker_local_mode_example.zip\",\n    download_root=\".\",\n)\n
    1. The setup_logging function sets up the logging configuration. The level argument sets the logging level. The file argument sets the log file. The use_console argument sets whether to log messages to the console.

    The log messages will be written to the log file nplinker.log and displayed in the console with a format like this: [Date Time] Level Log-message Module:Line.

    Run your script in a terminal
    # Run your script\n$ python your_script.py\nDownloading nplinker_local_mode_example.zip \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501 100.0% \u2022 195.3/195.3 MB \u2022 2.6 MB/s \u2022 0:00:00 \u2022 0:01:02 # (1)!\n[2024-05-10 15:14:48] INFO     Extracting nplinker_local_mode_example.zip to .                      utils.py:401\n\n# Check the log file\n$ cat nplinker.log\n[2024-05-10 15:14:48] INFO     Extracting nplinker_local_mode_example.zip to .                      utils.py:401\n
    1. This is a progress bar, not a log message.
    "},{"location":"quickstart/","title":"Quickstart","text":"

    NPLinker allows you to run in two modes:

    local mode and podp mode

    The local mode assumes that the data required by NPLinker is available on your local machine.

    The required input data includes: GNPS data, AntiSMASH data, BigScape data (optional), and the strain mappings file.

    The podp mode assumes that you use an identifier of the Paired Omics Data Platform (PODP) as the input for NPLinker. NPLinker will then download and prepare all necessary data based on the PODP id, which refers to the metadata of the dataset.

    So, which mode will you use? The answer is important for the next steps.

    "},{"location":"quickstart/#1-create-a-working-directory","title":"1. Create a working directory","text":"

    The working directory is used to store all input and output data for NPLinker. You can name this directory as you like, for example nplinker_quickstart:

    Create a working directory
    mkdir nplinker_quickstart\n

    Important

    Before going to the next step, make sure you get familiar with how NPLinker organizes data in the working directory, see Working Directory Structure page.

    "},{"location":"quickstart/#2-prepare-input-data-local-mode-only","title":"2. Prepare input data (local mode only)","text":"Details

    Skip this step if you choose to use the podp mode.

    If you choose to use the local mode, meaning you have input data of NPLinker stored on your local machine, you need to move the input data to the working directory created in the previous step.

    "},{"location":"quickstart/#gnps-data","title":"GNPS data","text":"

    NPLinker accepts data from the output of the following GNPS workflows: METABOLOMICS-SNETS, METABOLOMICS-SNETS-V2, and FEATURE-BASED-MOLECULAR-NETWORKING.

    NPLinker provides the tools GNPSDownloader and GNPSExtractor to download and extract the GNPS data with ease. All you need to provide is a valid GNPS task ID, referring to a task of one of the GNPS workflows supported by NPLinker.

    GNPS task id and workflow

    Take the example GNPS task at https://gnps.ucsd.edu/ProteoSAFe/status.jsp?task=c22f44b14a3d450eb836d607cb9521bb: the task id is the last part of this url, i.e. c22f44b14a3d450eb836d607cb9521bb. If you open this link, you can find the workflow info in the row \"Workflow\" of the table \"Job Status\"; in this case, it is METABOLOMICS-SNETS.

    Download & Extract GNPS data
    import os\nfrom nplinker.metabolomics.gnps import GNPSDownloader, GNPSExtractor\n\n# Go to the working directory\nos.chdir(\"nplinker_quickstart\")\n\n# Download GNPS data & get the path to the downloaded archive\ndownloader = GNPSDownloader(\"gnps_task_id\", \"downloads\") # (1)!\ndownloaded_archive = downloader.download().get_download_file()\n\n# Extract GNPS data to the `gnps` directory\nextractor = GNPSExtractor(downloaded_archive, \"gnps\") # (2)!\n
    1. If you already have the downloaded archive of GNPS data, you can skip the download steps.
    2. Replace downloaded_archive with the actual path to your GNPS data archive if you skipped the download steps.

    The required data for NPLinker will be extracted to the gnps subdirectory of the working directory.

    Info

    Not all GNPS data are required by NPLinker, and only the necessary data will be extracted. During the extraction, these data will be renamed to the standard names used by NPLinker. See the page GNPS Data for more information.

    Prepare GNPS data manually

    If you have GNPS data but it is not in the archive format as downloaded from GNPS, it is recommended to re-download the data from GNPS.

    If (re-)downloading is not possible, you could manually prepare data for the gnps directory. In this case, you must make sure that the data is organized as expected by NPLinker. See the page GNPS Data for examples of how to prepare the data.

    "},{"location":"quickstart/#antismash-data","title":"AntiSMASH data","text":"

    NPLinker requires AntiSMASH BGC data as input, which are organized in the antismash subdirectory of the working directory.

    For each AntiSMASH run output, the BGC data must be stored in a subdirectory named after the NCBI accession number (e.g. GCF_000514975.1). Only the *.region*.gbk files are required by NPLinker.

    When manually preparing AntiSMASH data for NPLinker, you must make sure that the data is organized as expected by NPLinker. See the page Working Directory Structure for more information.
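
    A minimal sketch of arranging one AntiSMASH output manually (the source path and the accession are assumptions for illustration):

    Arrange AntiSMASH BGC data
    import shutil\nfrom pathlib import Path\n\nsrc = Path(\"my_antismash_output\")  # one AntiSMASH run output (hypothetical path)\ndst = Path(\"nplinker_quickstart\") / \"antismash\" / \"GCF_000514975.1\"  # NCBI accession\ndst.mkdir(parents=True, exist_ok=True)\n\n# Only the *.region*.gbk files are required by NPLinker\nfor gbk in src.glob(\"*.region*.gbk\"):\n    shutil.copy(gbk, dst)\n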

    "},{"location":"quickstart/#bigscape-data-optional","title":"BigScape data (optional)","text":"

    It is optional to provide the output of BigScape to NPLinker. If the output of BigScape is not provided, NPLinker will run BigScape automatically to generate the data using the AntiSMASH BGC data.

    If you have the output of BigScape, you can put its mix_clustering_c{cutoff}.tsv file in the bigscape subdirectory of the NPLinker working directory, where {cutoff} is the cutoff value used in the BigScape run.
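
    For example (hypothetical source path; the cutoff 0.30 matches the default settings):

    Place the BigScape clustering file
    import shutil\nfrom pathlib import Path\n\nbigscape_dir = Path(\"nplinker_quickstart\") / \"bigscape\"\nbigscape_dir.mkdir(parents=True, exist_ok=True)\nshutil.copy(\"my_bigscape_output/mix_clustering_c0.30.tsv\", bigscape_dir)\n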

    "},{"location":"quickstart/#strain-mappings-file","title":"Strain mappings file","text":"

    The strain mappings file strain_mappings.json is required by NPLinker to map strains to genomics and metabolomics data.

    `strain_mappings.json` example
    {\n    \"strain_mappings\": [\n        {\n            \"strain_id\": \"strain_id_1\", # (1)!\n            \"strain_alias\": [\"bgc_id_1\", \"spectrum_id_1\", ...] # (2)!\n        },\n        {\n            \"strain_id\": \"strain_id_2\",\n            \"strain_alias\": [\"bgc_id_2\", \"spectrum_id_2\", ...]\n        },\n        ...\n    ],\n    \"version\": \"1.0\" # (3)!\n}\n
    1. strain_id is the unique identifier of the strain.
    2. strain_alias is a list of aliases of the strain, which are the identifiers of the BGCs and spectra of the strain.
    3. version is the schema version of this file. It is recommended to use the latest version of the schema. The current latest version is 1.0.

    The BGC id is the same as the name of the BGC file in the antismash directory. For example, given a BGC file xxxx.region001.gbk, the BGC id is xxxx.region001.

    The spectrum id is the same as the scan number in the spectra.mgf file in the gnps directory. For example, given a spectrum in the mgf file with SCANS=1, the spectrum id is 1.

    If you labelled the mzXML files (input for GNPS) with the strain id, you may need the function extract_mappings_ms_filename_spectrum_id to extract the mappings from mzXML files to the spectrum ids.

    For the local mode, you need to create this file manually and put it in the working directory. It takes some effort to prepare this file manually, especially when you have a large number of strains.
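
    A minimal sketch of writing this file programmatically (the ids are placeholders following the rules above):

    Write `strain_mappings.json`
    import json\n\nmappings = {\n    \"strain_mappings\": [\n        # BGC id from xxxx.region001.gbk; spectrum id from SCANS=1\n        {\"strain_id\": \"strain_id_1\", \"strain_alias\": [\"xxxx.region001\", \"1\"]},\n    ],\n    \"version\": \"1.0\",\n}\n\nwith open(\"nplinker_quickstart/strain_mappings.json\", \"w\") as f:\n    json.dump(mappings, f, indent=4)\n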

    "},{"location":"quickstart/#3-prepare-config-file","title":"3. Prepare config file","text":"

    The configuration file nplinker.toml is required by NPLinker to specify the working directory, mode, and other settings for running NPLinker. You can put the nplinker.toml file anywhere, but it is recommended to put it in the working directory created in step 1.

    The details of all settings can be found at this page Config File.

    To keep it simple, default settings will be used automatically by NPLinker if you don't set them in your nplinker.toml config file.

    What you need to do is set the root_dir and mode in the nplinker.toml file.

    For the local mode: nplinker.toml
    root_dir = \"absolute/path/to/working/directory\" # (1)!\nmode = \"local\"\n# and other settings you want to override the default settings \n
    1. Replace absolute/path/to/working/directory with the absolute path to the working directory created in step 1.
    For the podp mode: nplinker.toml
    root_dir = \"absolute/path/to/working/directory\" # (1)!\nmode = \"podp\"\npodp_id = \"podp_id\" # (2)!\n# and other settings you want to override the default settings \n
    1. Replace absolute/path/to/working/directory with the absolute path to the working directory created in step 1.
    2. Replace podp_id with the identifier of the dataset in the Paired Omics Data Platform (PODP).
    "},{"location":"quickstart/#4-run-nplinker","title":"4. Run NPLinker","text":"

    Before running NPLinker, make sure your working directory has the correct directory structure and names as described in the Working Directory Structure page.

    Run NPLinker in your working directory
    from nplinker import NPLinker\n\n# create an instance of NPLinker\nnpl = NPLinker(\"nplinker.toml\") # (1)!\n\n# load data\nnpl.load_data()\n\n# check loaded data\nprint(npl.bgcs)\nprint(npl.gcfs)\nprint(npl.spectra)\nprint(npl.mfs)\nprint(npl.strains)\n\n# compute the links for the first 3 GCFs using metcalf scoring method\nlink_graph = npl.get_links(npl.gcfs[:3], \"metcalf\")  # (2)!\n\n# get links as a list of tuples\nlink_graph.links \n\n# get the link data between two objects or entities\nlink_graph.get_link_data(npl.gcfs[0], npl.spectra[0]) \n\n# Save data to a pickle file\nnpl.save_data(\"npl.pkl\", link_graph)\n
    1. Replace nplinker.toml with the actual path to your configuration file.
    2. The get_links method returns a LinkGraph object that represents the calculated links between the GCFs and other entities as a graph.

    For more info about the classes and methods, see the API Documentation.

    "},{"location":"api/antismash/","title":"AntiSMASH","text":""},{"location":"api/antismash/#nplinker.genomics.antismash","title":"antismash","text":""},{"location":"api/antismash/#nplinker.genomics.antismash.AntismashBGCLoader","title":"AntismashBGCLoader","text":"
    AntismashBGCLoader(data_dir: str | PathLike)\n

    Bases: BGCLoaderBase

    Build a loader for AntiSMASH BGC genbank (.gbk) files.

    Note

    AntiSMASH BGC directory must follow the structure below:

    antismash\n    \u251c\u2500\u2500 genome_id_1 (one AntiSMASH output, e.g. GCF_000514775.1)\n    \u2502\u00a0 \u251c\u2500\u2500 GCF_000514775.1.gbk\n    \u2502\u00a0 \u251c\u2500\u2500 NZ_AZWO01000004.region001.gbk\n    \u2502\u00a0 \u2514\u2500\u2500 ...\n    \u251c\u2500\u2500 genome_id_2\n    \u2502\u00a0 \u251c\u2500\u2500 ...\n    \u2514\u2500\u2500 ...\n

    Parameters:

    Name Type Description Default data_dir str | PathLike

    Path to AntiSMASH directory that contains a collection of AntiSMASH outputs.

    required Source code in src/nplinker/genomics/antismash/antismash_loader.py
    def __init__(self, data_dir: str | PathLike) -> None:\n    \"\"\"Initialize the AntiSMASH BGC loader.\n\n    Args:\n        data_dir: Path to AntiSMASH directory that contains a\n            collection of AntiSMASH outputs.\n    \"\"\"\n    self.data_dir = str(data_dir)\n    self._file_dict = self._parse_data_dir(self.data_dir)\n    self._bgcs = self._parse_bgcs(self._file_dict)\n
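
    A usage sketch (the data directory path is an assumption):

    >>> from nplinker.genomics.antismash import AntismashBGCLoader\n>>> loader = AntismashBGCLoader(\"/data/antismash\")\n>>> bgcs = loader.get_bgcs()  # list of BGC objects\n>>> mapping = loader.get_bgc_genome_mapping()  # BGC name -> genome id\n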
    "},{"location":"api/antismash/#nplinker.genomics.antismash.AntismashBGCLoader.data_dir","title":"data_dir instance-attribute","text":"
    data_dir = str(data_dir)\n
    "},{"location":"api/antismash/#nplinker.genomics.antismash.AntismashBGCLoader.get_bgc_genome_mapping","title":"get_bgc_genome_mapping","text":"
    get_bgc_genome_mapping() -> dict[str, str]\n

    Get the mapping from BGC to genome.

    Note that the directory name of the gbk file is treated as genome id.

    Returns:

    Type Description dict[str, str]

    The key is BGC name (gbk file name) and value is genome id (the directory name of the gbk file).

    Source code in src/nplinker/genomics/antismash/antismash_loader.py
    def get_bgc_genome_mapping(self) -> dict[str, str]:\n    \"\"\"Get the mapping from BGC to genome.\n\n    Note that the directory name of the gbk file is treated as genome id.\n\n    Returns:\n        The key is BGC name (gbk file name) and value is genome id (the directory name of the\n        gbk file).\n    \"\"\"\n    return {\n        bid: os.path.basename(os.path.dirname(bpath)) for bid, bpath in self._file_dict.items()\n    }\n
    "},{"location":"api/antismash/#nplinker.genomics.antismash.AntismashBGCLoader.get_files","title":"get_files","text":"
    get_files() -> dict[str, str]\n

    Get BGC gbk files.

    Returns:

    Type Description dict[str, str]

    The key is BGC name (gbk file name) and value is path to the gbk file.

    Source code in src/nplinker/genomics/antismash/antismash_loader.py
    def get_files(self) -> dict[str, str]:\n    \"\"\"Get BGC gbk files.\n\n    Returns:\n        The key is BGC name (gbk file name) and value is path to the gbk file.\n    \"\"\"\n    return self._file_dict\n
    "},{"location":"api/antismash/#nplinker.genomics.antismash.AntismashBGCLoader.get_bgcs","title":"get_bgcs","text":"
    get_bgcs() -> list[BGC]\n

    Get all BGC objects.

    Returns:

    Type Description list[BGC]

    A list of BGC objects

    Source code in src/nplinker/genomics/antismash/antismash_loader.py
    def get_bgcs(self) -> list[BGC]:\n    \"\"\"Get all BGC objects.\n\n    Returns:\n        A list of BGC objects\n    \"\"\"\n    return self._bgcs\n
    "},{"location":"api/antismash/#nplinker.genomics.antismash.GenomeStatus","title":"GenomeStatus","text":"
    GenomeStatus(\n    original_id: str,\n    resolved_refseq_id: str = \"\",\n    resolve_attempted: bool = False,\n    bgc_path: str = \"\",\n)\n

    A class to represent the status of a single genome.

    The status of genomes is tracked in a JSON file whose name is defined in the variable GENOME_STATUS_FILENAME.

    Parameters:

    Name Type Description Default original_id str

    The original ID of the genome.

    required resolved_refseq_id str

    The resolved RefSeq ID of the genome. Defaults to \"\".

    '' resolve_attempted bool

    A flag indicating whether an attempt to resolve the RefSeq ID has been made. Defaults to False.

    False bgc_path str

    The path to the downloaded BGC file for the genome. Defaults to \"\".

    '' Source code in src/nplinker/genomics/antismash/podp_antismash_downloader.py
    def __init__(\n    self,\n    original_id: str,\n    resolved_refseq_id: str = \"\",\n    resolve_attempted: bool = False,\n    bgc_path: str = \"\",\n):\n    \"\"\"Initialize a GenomeStatus object for the given genome.\n\n    Args:\n        original_id: The original ID of the genome.\n        resolved_refseq_id: The resolved RefSeq ID of the\n            genome. Defaults to \"\".\n        resolve_attempted: A flag indicating whether an\n            attempt to resolve the RefSeq ID has been made. Defaults to False.\n        bgc_path: The path to the downloaded BGC file for\n            the genome. Defaults to \"\".\n    \"\"\"\n    self.original_id = original_id\n    self.resolved_refseq_id = \"\" if resolved_refseq_id == \"None\" else resolved_refseq_id\n    self.resolve_attempted = resolve_attempted\n    self.bgc_path = bgc_path\n
    "},{"location":"api/antismash/#nplinker.genomics.antismash.GenomeStatus.original_id","title":"original_id instance-attribute","text":"
    original_id = original_id\n
    "},{"location":"api/antismash/#nplinker.genomics.antismash.GenomeStatus.resolved_refseq_id","title":"resolved_refseq_id instance-attribute","text":"
    resolved_refseq_id = (\n    \"\"\n    if resolved_refseq_id == \"None\"\n    else resolved_refseq_id\n)\n
    "},{"location":"api/antismash/#nplinker.genomics.antismash.GenomeStatus.resolve_attempted","title":"resolve_attempted instance-attribute","text":"
    resolve_attempted = resolve_attempted\n
    "},{"location":"api/antismash/#nplinker.genomics.antismash.GenomeStatus.bgc_path","title":"bgc_path instance-attribute","text":"
    bgc_path = bgc_path\n
    "},{"location":"api/antismash/#nplinker.genomics.antismash.GenomeStatus.read_json","title":"read_json staticmethod","text":"
    read_json(\n    file: str | PathLike,\n) -> dict[str, \"GenomeStatus\"]\n

    Get a dict of GenomeStatus objects by loading given genome status file.

    Note that an empty dict is returned if the given file doesn't exist.

    Parameters:

    Name Type Description Default file str | PathLike

    Path to genome status file.

    required

    Returns:

    Type Description dict[str, 'GenomeStatus']

    Dict keys are genome original id and values are GenomeStatus objects. An empty dict is returned if the given file doesn't exist.

    Source code in src/nplinker/genomics/antismash/podp_antismash_downloader.py
    @staticmethod\ndef read_json(file: str | PathLike) -> dict[str, \"GenomeStatus\"]:\n    \"\"\"Get a dict of GenomeStatus objects by loading given genome status file.\n\n    Note that an empty dict is returned if the given file doesn't exist.\n\n    Args:\n        file: Path to genome status file.\n\n    Returns:\n        Dict keys are genome original id and values are GenomeStatus\n            objects. An empty dict is returned if the given file doesn't exist.\n    \"\"\"\n    genome_status_dict = {}\n    if Path(file).exists():\n        with open(file, \"r\") as f:\n            data = json.load(f)\n\n        # validate json data before using it\n        validate(data, schema=GENOME_STATUS_SCHEMA)\n\n        genome_status_dict = {\n            gs[\"original_id\"]: GenomeStatus(**gs) for gs in data[\"genome_status\"]\n        }\n    return genome_status_dict\n
    "},{"location":"api/antismash/#nplinker.genomics.antismash.GenomeStatus.to_json","title":"to_json staticmethod","text":"
    to_json(\n    genome_status_dict: Mapping[str, \"GenomeStatus\"],\n    file: str | PathLike | None = None,\n) -> str | None\n

    Convert the genome status dictionary to a JSON string.

    If a file path is provided, the JSON string is written to the file. If the file already exists, it is overwritten.

    Parameters:

    Name Type Description Default genome_status_dict Mapping[str, 'GenomeStatus']

    A dictionary of genome status objects. The keys are the original genome IDs and the values are GenomeStatus objects.

    required file str | PathLike | None

    The path to the output JSON file. If None, the JSON string is returned but not written to a file.

    None

    Returns:

    Type Description str | None

    The JSON string if file is None, otherwise None.

    Source code in src/nplinker/genomics/antismash/podp_antismash_downloader.py
    @staticmethod\ndef to_json(\n    genome_status_dict: Mapping[str, \"GenomeStatus\"], file: str | PathLike | None = None\n) -> str | None:\n    \"\"\"Convert the genome status dictionary to a JSON string.\n\n    If a file path is provided, the JSON string is written to the file. If\n    the file already exists, it is overwritten.\n\n    Args:\n        genome_status_dict: A dictionary of genome\n            status objects. The keys are the original genome IDs and the values\n            are GenomeStatus objects.\n        file: The path to the output JSON file.\n            If None, the JSON string is returned but not written to a file.\n\n    Returns:\n        The JSON string if `file` is None, otherwise None.\n    \"\"\"\n    gs_list = [gs._to_dict() for gs in genome_status_dict.values()]\n    json_data = {\"genome_status\": gs_list, \"version\": \"1.0\"}\n\n    # validate json object before dumping\n    validate(json_data, schema=GENOME_STATUS_SCHEMA)\n\n    if file is not None:\n        with open(file, \"w\") as f:\n            json.dump(json_data, f)\n        return None\n    return json.dumps(json_data)\n
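
    A round-trip sketch (the file path is an assumption):

    >>> from nplinker.genomics.antismash import GenomeStatus\n>>> gs_dict = {\"g1\": GenomeStatus(\"g1\", resolved_refseq_id=\"GCF_000514975.1\")}\n>>> GenomeStatus.to_json(gs_dict, \"genome_status.json\")  # writes the file, returns None\n>>> loaded = GenomeStatus.read_json(\"genome_status.json\")\n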
    "},{"location":"api/antismash/#nplinker.genomics.antismash.download_and_extract_antismash_data","title":"download_and_extract_antismash_data","text":"
    download_and_extract_antismash_data(\n    antismash_id: str,\n    download_root: str | PathLike,\n    extract_root: str | PathLike,\n) -> None\n

    Download and extract antiSMASH BGC archive for a specified genome.

    The antiSMASH database (https://antismash-db.secondarymetabolites.org/) is used to download the BGC archive. antiSMASH uses the RefSeq assembly id of a genome as the id of the archive.

    Parameters:

    Name Type Description Default antismash_id str

    The id used to download BGC archive from antiSMASH database. If the id is versioned (e.g., \"GCF_004339725.1\") please be sure to specify the version as well.

    required download_root str | PathLike

    Path to the directory to place downloaded archive in.

    required extract_root str | PathLike

    Path to the directory data files will be extracted to. Note that an antismash directory will be created in the specified extract_root if it doesn't exist. The files will be extracted to <extract_root>/antismash/<antismash_id> directory.

    required

    Raises:

    Type Description ValueError

    if download_root and extract_root dirs are the same.

    ValueError

    if <extract_root>/antismash/<refseq_assembly_id> dir is not empty.

    Examples:

    >>> download_and_extract_antismash_data(\"GCF_004339725.1\", \"/data/download\", \"/data/extracted\")\n
    Source code in src/nplinker/genomics/antismash/antismash_downloader.py
    def download_and_extract_antismash_data(\n    antismash_id: str, download_root: str | PathLike, extract_root: str | PathLike\n) -> None:\n    \"\"\"Download and extract antiSMASH BGC archive for a specified genome.\n\n    The antiSMASH database (https://antismash-db.secondarymetabolites.org/)\n    is used to download the BGC archive. And antiSMASH use RefSeq assembly id\n    of a genome as the id of the archive.\n\n    Args:\n        antismash_id: The id used to download BGC archive from antiSMASH database.\n            If the id is versioned (e.g., \"GCF_004339725.1\") please be sure to\n            specify the version as well.\n        download_root: Path to the directory to place downloaded archive in.\n        extract_root: Path to the directory data files will be extracted to.\n            Note that an `antismash` directory will be created in the specified `extract_root` if\n            it doesn't exist. The files will be extracted to `<extract_root>/antismash/<antismash_id>` directory.\n\n    Raises:\n        ValueError: if `download_root` and `extract_root` dirs are the same.\n        ValueError: if `<extract_root>/antismash/<refseq_assembly_id>` dir is not empty.\n\n    Examples:\n        >>> download_and_extract_antismash_metadata(\"GCF_004339725.1\", \"/data/download\", \"/data/extracted\")\n    \"\"\"\n    download_root = Path(download_root)\n    extract_root = Path(extract_root)\n    extract_path = extract_root / \"antismash\" / antismash_id\n    _check_roots(download_root, extract_root)\n\n    try:\n        if extract_path.exists():\n            _check_extract_path(extract_path)\n        else:\n            extract_path.mkdir(parents=True, exist_ok=True)\n\n        for base_url in [ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL]:\n            url = base_url.format(antismash_id, antismash_id + \".zip\")\n            download_and_extract_archive(url, download_root, extract_path, antismash_id + \".zip\")\n            break\n\n        # delete subdirs\n        for subdir_path in list_dirs(extract_path):\n            shutil.rmtree(subdir_path)\n\n        # delete unnecessary files\n        files_to_keep = list_files(extract_path, suffix=(\".json\", \".gbk\"))\n        for file in list_files(extract_path):\n            if file not in files_to_keep:\n                os.remove(file)\n\n        logger.info(\"antiSMASH BGC data of %s is downloaded and extracted.\", antismash_id)\n\n    except Exception as e:\n        shutil.rmtree(extract_path)\n        logger.warning(e)\n        raise e\n
    "},{"location":"api/antismash/#nplinker.genomics.antismash.parse_bgc_genbank","title":"parse_bgc_genbank","text":"
    parse_bgc_genbank(file: str | PathLike) -> BGC\n

    Parse a single BGC gbk file to BGC object.

    Parameters:

    Name Type Description Default file str | PathLike

    Path to BGC gbk file

    required

    Returns:

    Type Description BGC

    BGC object

    Examples:

    >>> bgc = parse_bgc_genbank(\n...    \"/data/antismash/GCF_000016425.1/NC_009380.1.region001.gbk\")\n
    Source code in src/nplinker/genomics/antismash/antismash_loader.py
    def parse_bgc_genbank(file: str | PathLike) -> BGC:\n    \"\"\"Parse a single BGC gbk file to BGC object.\n\n    Args:\n        file: Path to BGC gbk file\n\n    Returns:\n        BGC object\n\n    Examples:\n        >>> bgc = AntismashBGCLoader.parse_bgc(\n        ...    \"/data/antismash/GCF_000016425.1/NC_009380.1.region001.gbk\")\n    \"\"\"\n    file = Path(file)\n    fname = file.stem\n\n    record = SeqIO.read(file, format=\"genbank\")\n    description = record.description  # \"DEFINITION\" in gbk file\n    antismash_id = record.id  # \"VERSION\" in gbk file\n    features = _parse_antismash_genbank(record)\n    product_prediction = features.get(\"product\")\n    if product_prediction is None:\n        raise ValueError(f\"Not found product prediction in antiSMASH Genbank file {file}\")\n\n    # init BGC\n    bgc = BGC(fname, *product_prediction)\n    bgc.description = description\n    bgc.antismash_id = antismash_id\n    bgc.antismash_file = str(file)\n    bgc.antismash_region = features.get(\"region_number\")\n    bgc.smiles = features.get(\"smiles\")\n    bgc.strain = Strain(fname)\n    return bgc\n
    "},{"location":"api/antismash/#nplinker.genomics.antismash.get_best_available_genome_id","title":"get_best_available_genome_id","text":"
    get_best_available_genome_id(\n    genome_id_data: Mapping[str, str]\n) -> str | None\n

    Get the best available ID from genome_id_data dict.

    Parameters:

    Name Type Description Default genome_id_data Mapping[str, str]

    dictionary containing information for each genome record present.

    required

    Returns:

    Type Description str | None

    ID for the genome, if present, otherwise None.

    Source code in src/nplinker/genomics/antismash/podp_antismash_downloader.py
    def get_best_available_genome_id(genome_id_data: Mapping[str, str]) -> str | None:\n    \"\"\"Get the best available ID from genome_id_data dict.\n\n    Args:\n        genome_id_data: dictionary containing information for each genome record present.\n\n    Returns:\n        ID for the genome, if present, otherwise None.\n    \"\"\"\n    if \"RefSeq_accession\" in genome_id_data:\n        best_id = genome_id_data[\"RefSeq_accession\"]\n    elif \"GenBank_accession\" in genome_id_data:\n        best_id = genome_id_data[\"GenBank_accession\"]\n    elif \"JGI_Genome_ID\" in genome_id_data:\n        best_id = genome_id_data[\"JGI_Genome_ID\"]\n    else:\n        best_id = None\n\n    if best_id is None or len(best_id) == 0:\n        logger.warning(f\"Failed to get valid genome ID in genome data: {genome_id_data}\")\n        return None\n    return best_id\n
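
    For instance, following the preference order in the source above:

    >>> from nplinker.genomics.antismash import get_best_available_genome_id\n>>> get_best_available_genome_id({\"RefSeq_accession\": \"GCF_000514975.1\", \"JGI_Genome_ID\": \"12345\"})\n'GCF_000514975.1'\n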
    "},{"location":"api/antismash/#nplinker.genomics.antismash.podp_download_and_extract_antismash_data","title":"podp_download_and_extract_antismash_data","text":"
    podp_download_and_extract_antismash_data(\n    genome_records: Sequence[\n        Mapping[str, Mapping[str, str]]\n    ],\n    project_download_root: str | PathLike,\n    project_extract_root: str | PathLike,\n)\n

    Download and extract antiSMASH BGC archive for the given genome records.

    Parameters:

    Name Type Description Default genome_records Sequence[Mapping[str, Mapping[str, str]]]

    list of dicts representing genome records. Each genome record dict contains the key \"genome_ID\" (str), whose value is a dict (dict[str, str]) containing information about genome type, label and accession ids (RefSeq, GenBank, and/or JGI).

    required project_download_root str | PathLike

    Path to the directory to place downloaded archive in.

    required project_extract_root str | PathLike

    Path to the directory downloaded archive will be extracted to. Note that an antismash directory will be created in the specified extract_root if it doesn't exist. The files will be extracted to <extract_root>/antismash/<antismash_id> directory.

    required Source code in src/nplinker/genomics/antismash/podp_antismash_downloader.py
    def podp_download_and_extract_antismash_data(\n    genome_records: Sequence[Mapping[str, Mapping[str, str]]],\n    project_download_root: str | PathLike,\n    project_extract_root: str | PathLike,\n):\n    \"\"\"Download and extract antiSMASH BGC archive for the given genome records.\n\n    Args:\n        genome_records: list of dicts\n            representing genome records. The dict of each genome record contains\n                - key(str): \"genome_ID\"\n                - value(dict[str, str]): a dict containing information about genome\n                type, label and accession ids (RefSeq, GenBank, and/or JGI).\n        project_download_root: Path to the directory to place\n            downloaded archive in.\n        project_extract_root: Path to the directory downloaded archive\n            will be extracted to.\n            Note that an `antismash` directory will be created in the specified\n            `extract_root` if it doesn't exist. The files will be extracted to\n            `<extract_root>/antismash/<antismash_id>` directory.\n    \"\"\"\n    if not Path(project_download_root).exists():\n        # otherwise in case of failed first download, the folder doesn't exist and\n        # genome_status_file can't be written\n        Path(project_download_root).mkdir(parents=True, exist_ok=True)\n\n    gs_file = Path(project_download_root, GENOME_STATUS_FILENAME)\n    gs_dict = GenomeStatus.read_json(gs_file)\n\n    for i, genome_record in enumerate(genome_records):\n        # get the best available ID from the dict\n        genome_id_data = genome_record[\"genome_ID\"]\n        raw_genome_id = get_best_available_genome_id(genome_id_data)\n        if raw_genome_id is None or len(raw_genome_id) == 0:\n            logger.warning(\n                f'Ignoring genome record \"{genome_record}\" due to missing genome ID field'\n            )\n            continue\n\n        # check if genome ID exist in the genome status file\n        if raw_genome_id not in gs_dict:\n            gs_dict[raw_genome_id] = GenomeStatus(raw_genome_id)\n\n        gs_obj = gs_dict[raw_genome_id]\n\n        logger.info(\n            f\"Checking for antismash data {i + 1}/{len(genome_records)}, \"\n            f\"current genome ID={raw_genome_id}\"\n        )\n        # first, check if BGC data is downloaded\n        if gs_obj.bgc_path and Path(gs_obj.bgc_path).exists():\n            logger.info(f\"Genome ID {raw_genome_id} already downloaded to {gs_obj.bgc_path}\")\n            continue\n        # second, check if lookup attempted previously\n        if gs_obj.resolve_attempted:\n            logger.info(f\"Genome ID {raw_genome_id} skipped due to previous failure\")\n            continue\n\n        # if not downloaded or lookup attempted, then try to resolve the ID\n        # and download\n        logger.info(f\"Beginning lookup process for genome ID {raw_genome_id}\")\n        gs_obj.resolved_refseq_id = _resolve_refseq_id(genome_id_data)\n        gs_obj.resolve_attempted = True\n\n        if gs_obj.resolved_refseq_id == \"\":\n            # give up on this one\n            logger.warning(f\"Failed lookup for genome ID {raw_genome_id}\")\n            continue\n\n        # if resolved id is valid, try to download and extract antismash data\n        try:\n            download_and_extract_antismash_data(\n                gs_obj.resolved_refseq_id, project_download_root, project_extract_root\n            )\n\n            gs_obj.bgc_path = str(\n                Path(project_download_root, 
gs_obj.resolved_refseq_id + \".zip\").absolute()\n            )\n\n            output_path = Path(project_extract_root, \"antismash\", gs_obj.resolved_refseq_id)\n            if output_path.exists():\n                Path.touch(output_path / \"completed\", exist_ok=True)\n\n        except Exception:\n            gs_obj.bgc_path = \"\"\n\n    missing = len([gs for gs in gs_dict.values() if not gs.bgc_path])\n    logger.info(\n        f\"Dataset has {missing} missing sets of antiSMASH data \"\n        f\" (from a total of {len(genome_records)}).\"\n    )\n\n    # save updated genome status to json file\n    GenomeStatus.to_json(gs_dict, gs_file)\n\n    if missing == len(genome_records):\n        raise ValueError(\"No antiSMASH data found for any genome\")\n
    "},{"location":"api/arranger/","title":"Dataset Arranger","text":""},{"location":"api/arranger/#nplinker.arranger","title":"arranger","text":""},{"location":"api/arranger/#nplinker.arranger.PODP_PROJECT_URL","title":"PODP_PROJECT_URL module-attribute","text":"
    PODP_PROJECT_URL = \"https://pairedomicsdata.bioinformatics.nl/api/projects/{}\"\n
    "},{"location":"api/arranger/#nplinker.arranger.DatasetArranger","title":"DatasetArranger","text":"
    DatasetArranger(config: Dynaconf)\n

    Arrange the dataset required by NPLinker.

    This class is used to arrange the datasets required by NPLinker according to the configuration. The datasets include MIBiG, GNPS, antiSMASH, and BiG-SCAPE.

    If self.config.mode is \"local\", the datasets are validated. If self.config.mode is \"podp\", the datasets are downloaded or generated.

    Attributes:

    Name Type Description config

    A Dynaconf object that contains the configuration settings. Check nplinker.config module for more details.

    root_dir

    The root directory of the datasets.

    downloads_dir

    The directory to store downloaded files.

    mibig_dir

    The directory to store MIBiG metadata.

    gnps_dir

    The directory to store GNPS data.

    antismash_dir

    The directory to store antiSMASH data.

    bigscape_dir

    The directory to store BiG-SCAPE data.

    bigscape_running_output_dir

    The directory to store the running output of BiG-SCAPE.

    Parameters:

    Name Type Description Default config Dynaconf

    A Dynaconf object that contains the configuration settings. Check nplinker.config module for more details.

    required Source code in src/nplinker/arranger.py
    def __init__(self, config: Dynaconf) -> None:\n    \"\"\"Initialize the DatasetArranger.\n\n    Args:\n        config: A Dynaconf object that contains the configuration settings. Check `nplinker.config`\n            module for more details.\n    \"\"\"\n    self.config = config\n    self.root_dir = config.root_dir\n    self.downloads_dir = self.root_dir / defaults.DOWNLOADS_DIRNAME\n    self.downloads_dir.mkdir(exist_ok=True)\n\n    self.mibig_dir = self.root_dir / defaults.MIBIG_DIRNAME\n    self.gnps_dir = self.root_dir / defaults.GNPS_DIRNAME\n    self.antismash_dir = self.root_dir / defaults.ANTISMASH_DIRNAME\n    self.bigscape_dir = self.root_dir / defaults.BIGSCAPE_DIRNAME\n    self.bigscape_running_output_dir = (\n        self.bigscape_dir / defaults.BIGSCAPE_RUNNING_OUTPUT_DIRNAME\n    )\n\n    self.arrange_podp_project_json()\n
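
    A usage sketch (assuming a configuration loaded with load_config, as shown on the Config File page):

    >>> from nplinker.arranger import DatasetArranger\n>>> from nplinker.config import load_config\n>>> config = load_config(\"path/to/nplinker.toml\")\n>>> arranger = DatasetArranger(config)\n>>> arranger.arrange()  # arranges MIBiG, GNPS, antiSMASH and BiG-SCAPE data\n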
    "},{"location":"api/arranger/#nplinker.arranger.DatasetArranger.config","title":"config instance-attribute","text":"
    config = config\n
    "},{"location":"api/arranger/#nplinker.arranger.DatasetArranger.root_dir","title":"root_dir instance-attribute","text":"
    root_dir = root_dir\n
    "},{"location":"api/arranger/#nplinker.arranger.DatasetArranger.downloads_dir","title":"downloads_dir instance-attribute","text":"
    downloads_dir = root_dir / DOWNLOADS_DIRNAME\n
    "},{"location":"api/arranger/#nplinker.arranger.DatasetArranger.mibig_dir","title":"mibig_dir instance-attribute","text":"
    mibig_dir = root_dir / MIBIG_DIRNAME\n
    "},{"location":"api/arranger/#nplinker.arranger.DatasetArranger.gnps_dir","title":"gnps_dir instance-attribute","text":"
    gnps_dir = root_dir / GNPS_DIRNAME\n
    "},{"location":"api/arranger/#nplinker.arranger.DatasetArranger.antismash_dir","title":"antismash_dir instance-attribute","text":"
    antismash_dir = root_dir / ANTISMASH_DIRNAME\n
    "},{"location":"api/arranger/#nplinker.arranger.DatasetArranger.bigscape_dir","title":"bigscape_dir instance-attribute","text":"
    bigscape_dir = root_dir / BIGSCAPE_DIRNAME\n
    "},{"location":"api/arranger/#nplinker.arranger.DatasetArranger.bigscape_running_output_dir","title":"bigscape_running_output_dir instance-attribute","text":"
    bigscape_running_output_dir = (\n    bigscape_dir / BIGSCAPE_RUNNING_OUTPUT_DIRNAME\n)\n
    "},{"location":"api/arranger/#nplinker.arranger.DatasetArranger.arrange","title":"arrange","text":"
    arrange() -> None\n

    Arrange the datasets according to the configuration.

    The datasets include MIBiG, GNPS, antiSMASH, and BiG-SCAPE.

    Source code in src/nplinker/arranger.py
    def arrange(self) -> None:\n    \"\"\"Arrange the datasets according to the configuration.\n\n    The datasets include MIBiG, GNPS, antiSMASH, and BiG-SCAPE.\n    \"\"\"\n    # The order of arranging the datasets matters, as some datasets depend on others\n    self.arrange_mibig()\n    self.arrange_gnps()\n    self.arrange_antismash()\n    self.arrange_bigscape()\n    self.arrange_strain_mappings()\n    self.arrange_strains_selected()\n
    "},{"location":"api/arranger/#nplinker.arranger.DatasetArranger.arrange_podp_project_json","title":"arrange_podp_project_json","text":"
    arrange_podp_project_json() -> None\n

    Arrange the PODP project JSON file.

    If self.config.mode is \"podp\", download the PODP project JSON file if it doesn't exist. Then validate the PODP project JSON file if it exists or is downloaded.

    The validation is controlled by the json schema schemas/podp_adapted_schema.json.

    Source code in src/nplinker/arranger.py
    def arrange_podp_project_json(self) -> None:\n    \"\"\"Arrange the PODP project JSON file.\n\n    If `self.config.mode` is \"podp\", download the PODP project JSON file if it doesn't exist. Then\n    validate the PODP project JSON file if it exists or is downloaded.\n\n    The validation is controlled by the json schema `schemas/podp_adapted_schema.json`.\n    \"\"\"\n    if self.config.mode == \"podp\":\n        file_name = f\"paired_datarecord_{self.config.podp_id}.json\"\n        podp_file = self.downloads_dir / file_name\n        if not podp_file.exists():\n            download_url(\n                PODP_PROJECT_URL.format(self.config.podp_id),\n                self.downloads_dir,\n                file_name,\n            )\n\n        with open(podp_file, \"r\") as f:\n            json_data = json.load(f)\n        validate_podp_json(json_data)\n
    "},{"location":"api/arranger/#nplinker.arranger.DatasetArranger.arrange_mibig","title":"arrange_mibig","text":"
    arrange_mibig() -> None\n

    Arrange the MIBiG metadata.

    Always download and extract the MIBiG metadata if self.config.mibig.to_use is True. If the default directory already exists, it will be removed and the data re-downloaded to ensure the latest version is used. So it is not allowed to manually put MIBiG metadata in the default directory.

    Source code in src/nplinker/arranger.py
    def arrange_mibig(self) -> None:\n    \"\"\"Arrange the MIBiG metadata.\n\n    Always download and extract the MIBiG metadata if `self.config.mibig.to_use` is True.\n    If the default directory has already existed, it will be removed and re-downloaded to ensure\n    the latest version is used. So it's not allowed to manually put MIBiG metadata in the\n    default directory.\n    \"\"\"\n    if self.config.mibig.to_use:\n        if self.mibig_dir.exists():\n            # remove existing mibig data\n            shutil.rmtree(self.mibig_dir)\n        download_and_extract_mibig_metadata(\n            self.downloads_dir,\n            self.mibig_dir,\n            version=self.config.mibig.version,\n        )\n
    "},{"location":"api/arranger/#nplinker.arranger.DatasetArranger.arrange_gnps","title":"arrange_gnps","text":"
    arrange_gnps() -> None\n

    Arrange the GNPS data.

    If self.config.mode is \"local\", validate the GNPS data directory. If self.config.mode is \"podp\", download the GNPS data if it doesn't exist or remove the existing GNPS data and re-download it if it is invalid.

    The validation process includes checking that the GNPS data directory exists and that it contains the required files: file_mappings.tsv or file_mappings.csv, spectra.mgf, molecular_families.tsv, and annotations.tsv.

    Source code in src/nplinker/arranger.py
    def arrange_gnps(self) -> None:\n    \"\"\"Arrange the GNPS data.\n\n    If `self.config.mode` is \"local\", validate the GNPS data directory.\n    If `self.config.mode` is \"podp\", download the GNPS data if it doesn't exist or remove the\n    existing GNPS data and re-download it if it is invalid.\n\n    The validation process includes:\n\n    - Check if the GNPS data directory exists.\n    - Check if the required files exist in the GNPS data directory, including:\n        - file_mappings.tsv or file_mappings.csv\n        - spectra.mgf\n        - molecular_families.tsv\n        - annotations.tsv\n    \"\"\"\n    pass_validation = False\n    if self.config.mode == \"podp\":\n        # retry downloading at most 3 times if downloaded data has problems\n        for _ in range(3):\n            try:\n                validate_gnps(self.gnps_dir)\n                pass_validation = True\n                break\n            except (FileNotFoundError, ValueError):\n                # Don't need to remove downloaded archive, as it'll be overwritten\n                shutil.rmtree(self.gnps_dir, ignore_errors=True)\n                self._download_and_extract_gnps()\n\n    if not pass_validation:\n        validate_gnps(self.gnps_dir)\n\n    # get the path to file_mappings file (csv or tsv)\n    self.gnps_file_mappings_file = self._get_gnps_file_mappings_file()\n
    "},{"location":"api/arranger/#nplinker.arranger.DatasetArranger.arrange_antismash","title":"arrange_antismash","text":"
    arrange_antismash() -> None\n

    Arrange the antiSMASH data.

    If self.config.mode is \"local\", validate the antiSMASH data directory. If self.config.mode is \"podp\", download the antiSMASH data if it doesn't exist or remove the existing antiSMASH data and re-download it if it is invalid.

    The validation process includes: - Check if the antiSMASH data directory exists. - Check if the antiSMASH data directory contains at least one sub-directory, and each sub-directory contains at least one BGC file (with the suffix \".region???.gbk\" where ??? is a number).

    AntiSMASH BGC directory must follow the structure below:

    antismash\n    \u251c\u2500\u2500 genome_id_1 (one AntiSMASH output, e.g. GCF_000514775.1)\n    \u2502\u00a0 \u251c\u2500\u2500 GCF_000514775.1.gbk\n    \u2502\u00a0 \u251c\u2500\u2500 NZ_AZWO01000004.region001.gbk\n    \u2502\u00a0 \u2514\u2500\u2500 ...\n    \u251c\u2500\u2500 genome_id_2\n    \u2502\u00a0 \u251c\u2500\u2500 ...\n    \u2514\u2500\u2500 ...\n

    Source code in src/nplinker/arranger.py
    def arrange_antismash(self) -> None:\n    \"\"\"Arrange the antiSMASH data.\n\n    If `self.config.mode` is \"local\", validate the antiSMASH data directory.\n    If `self.config.mode` is \"podp\", download the antiSMASH data if it doesn't exist or remove the\n    existing antiSMASH data and re-download it if it is invalid.\n\n    The validation process includes:\n    - Check if the antiSMASH data directory exists.\n    - Check if the antiSMASH data directory contains at least one sub-directory, and each\n        sub-directory contains at least one BGC file (with the suffix \".region???.gbk\" where ???\n        is a number).\n\n    AntiSMASH BGC directory must follow the structure below:\n    ```\n    antismash\n        \u251c\u2500\u2500 genome_id_1 (one AntiSMASH output, e.g. GCF_000514775.1)\n        \u2502\u00a0 \u251c\u2500\u2500 GCF_000514775.1.gbk\n        \u2502\u00a0 \u251c\u2500\u2500 NZ_AZWO01000004.region001.gbk\n        \u2502\u00a0 \u2514\u2500\u2500 ...\n        \u251c\u2500\u2500 genome_id_2\n        \u2502\u00a0 \u251c\u2500\u2500 ...\n        \u2514\u2500\u2500 ...\n    ```\n    \"\"\"\n    pass_validation = False\n    if self.config.mode == \"podp\":\n        for _ in range(3):\n            try:\n                validate_antismash(self.antismash_dir)\n                pass_validation = True\n                break\n            except FileNotFoundError:\n                shutil.rmtree(self.antismash_dir, ignore_errors=True)\n                self._download_and_extract_antismash()\n\n    if not pass_validation:\n        validate_antismash(self.antismash_dir)\n
    "},{"location":"api/arranger/#nplinker.arranger.DatasetArranger.arrange_bigscape","title":"arrange_bigscape","text":"
    arrange_bigscape() -> None\n

    Arrange the BiG-SCAPE data.

    If self.config.mode is \"local\", validate the BiG-SCAPE data directory. If self.config.mode is \"podp\", run BiG-SCAPE to generate the clustering file if it doesn't exist or remove the existing BiG-SCAPE data and re-run BiG-SCAPE if it is invalid. The running output of BiG-SCAPE will be saved to the directory \"bigscape_running_output\" in the default BiG-SCAPE directory, and the clustering file \"mix_clustering_c{self.config.bigscape.cutoff}.tsv\" will be copied to the default BiG-SCAPE directory.

    The validation process includes:

    - Check if the default BiG-SCAPE data directory exists.
    - Check if the clustering file \"mix_clustering_c{self.config.bigscape.cutoff}.tsv\" exists in the BiG-SCAPE data directory.
    - Check if the 'data_sqlite.db' file exists in the BiG-SCAPE data directory.

    Source code in src/nplinker/arranger.py
    def arrange_bigscape(self) -> None:\n    \"\"\"Arrange the BiG-SCAPE data.\n\n    If `self.config.mode` is \"local\", validate the BiG-SCAPE data directory.\n    If `self.config.mode` is \"podp\", run BiG-SCAPE to generate the clustering file if it doesn't\n    exist or remove the existing BiG-SCAPE data and re-run BiG-SCAPE if it is invalid.\n    The running output of BiG-SCAPE will be saved to the directory \"bigscape_running_output\"\n    in the default BiG-SCAPE directory, and the clustering file\n    \"mix_clustering_c{self.config.bigscape.cutoff}.tsv\" will be copied to the default BiG-SCAPE\n    directory.\n\n    The validation process includes:\n\n    - Check if the default BiG-SCAPE data directory exists.\n    - Check if the clustering file \"mix_clustering_c{self.config.bigscape.cutoff}.tsv\" exists in the\n            BiG-SCAPE data directory.\n    - Check if the 'data_sqlite.db' file exists in the BiG-SCAPE data directory.\n    \"\"\"\n    pass_validation = False\n    if self.config.mode == \"podp\":\n        for _ in range(3):\n            try:\n                validate_bigscape(self.bigscape_dir, self.config.bigscape.cutoff)\n                pass_validation = True\n                break\n            except FileNotFoundError:\n                shutil.rmtree(self.bigscape_dir, ignore_errors=True)\n                self._run_bigscape()\n\n    if not pass_validation:\n        validate_bigscape(self.bigscape_dir, self.config.bigscape.cutoff)\n
    "},{"location":"api/arranger/#nplinker.arranger.DatasetArranger.arrange_strain_mappings","title":"arrange_strain_mappings","text":"
    arrange_strain_mappings() -> None\n

    Arrange the strain mappings file.

    If self.config.mode is \"local\", validate the strain mappings file. If self.config.mode is \"podp\", always generate the strain mappings file and validate it.

    The validation checks if the strain mappings file exists and if it is a valid JSON file according to the schema defined in schemas/strain_mappings_schema.json.

    Source code in src/nplinker/arranger.py
    def arrange_strain_mappings(self) -> None:\n    \"\"\"Arrange the strain mappings file.\n\n    If `self.config.mode` is \"local\", validate the strain mappings file.\n    If `self.config.mode` is \"podp\", always generate the strain mappings file and validate it.\n\n    The validation checks if the strain mappings file exists and if it is a valid JSON file\n    according to the schema defined in `schemas/strain_mappings_schema.json`.\n    \"\"\"\n    if self.config.mode == \"podp\":\n        self._generate_strain_mappings()\n\n    self._validate_strain_mappings()\n
    "},{"location":"api/arranger/#nplinker.arranger.DatasetArranger.arrange_strains_selected","title":"arrange_strains_selected","text":"
    arrange_strains_selected() -> None\n

    Arrange the strains selected file.

    Validate the strains selected file if it exists. The validation checks if the strains selected file is a valid JSON file according to the schema defined in schemas/user_strains.json.

    Source code in src/nplinker/arranger.py
    def arrange_strains_selected(self) -> None:\n    \"\"\"Arrange the strains selected file.\n\n    Validate the strains selected file if it exists.\n    The validation checks if the strains selected file is a valid JSON file according to the\n    schema defined in `schemas/user_strains.json`.\n    \"\"\"\n    strains_selected_file = self.root_dir / defaults.STRAINS_SELECTED_FILENAME\n    if strains_selected_file.exists():\n        with open(strains_selected_file, \"r\") as f:\n            json_data = json.load(f)\n        validate(instance=json_data, schema=USER_STRAINS_SCHEMA)\n
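    The same jsonschema-based check can be run by hand on a strains file. A minimal sketch; the file name is illustrative and the import path for USER_STRAINS_SCHEMA is an assumption (the method above uses the name imported inside arranger.py):
    import json\n\nfrom jsonschema import validate\nfrom nplinker.schemas import USER_STRAINS_SCHEMA  # assumed import path\n\n# validate a user-provided strains file against the schema\nwith open(\"strains_selected.json\") as f:  # illustrative file name\n    validate(instance=json.load(f), schema=USER_STRAINS_SCHEMA)\n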
    "},{"location":"api/arranger/#nplinker.arranger.validate_gnps","title":"validate_gnps","text":"
    validate_gnps(gnps_dir: str | PathLike) -> None\n

    Validate the GNPS data directory and its contents.

    The GNPS data directory must contain the following files:

    - file_mappings.tsv or file_mappings.csv
    - spectra.mgf
    - molecular_families.tsv
    - annotations.tsv

    Parameters:

    Name Type Description Default gnps_dir str | PathLike

    Path to the GNPS data directory.

    required

    Raises:

    Type Description FileNotFoundError

    If the GNPS data directory is not found or any of the required files is not found.

    ValueError

    If both file_mappings.tsv and file_mappings.csv are found.

    Source code in src/nplinker/arranger.py
    def validate_gnps(gnps_dir: str | PathLike) -> None:\n    \"\"\"Validate the GNPS data directory and its contents.\n\n    The GNPS data directory must contain the following files:\n\n    - file_mappings.tsv or file_mappings.csv\n    - spectra.mgf\n    - molecular_families.tsv\n    - annotations.tsv\n\n    Args:\n        gnps_dir: Path to the GNPS data directory.\n\n    Raises:\n        FileNotFoundError: If the GNPS data directory is not found or any of the required files\n            is not found.\n        ValueError: If both file_mappings.tsv and file_mappings.csv are found.\n    \"\"\"\n    gnps_dir = Path(gnps_dir)\n    if not gnps_dir.exists():\n        raise FileNotFoundError(f\"GNPS data directory not found at {gnps_dir}\")\n\n    file_mappings_tsv = gnps_dir / defaults.GNPS_FILE_MAPPINGS_TSV\n    file_mappings_csv = gnps_dir / defaults.GNPS_FILE_MAPPINGS_CSV\n    if file_mappings_tsv.exists() and file_mappings_csv.exists():\n        raise ValueError(\n            f\"Both {file_mappings_tsv.name} and {file_mappings_csv.name} found in GNPS directory \"\n            f\"{gnps_dir}, only one is allowed.\"\n        )\n    elif not file_mappings_tsv.exists() and not file_mappings_csv.exists():\n        raise FileNotFoundError(\n            f\"Neither {file_mappings_tsv.name} nor {file_mappings_csv.name} found in GNPS directory\"\n            f\" {gnps_dir}\"\n        )\n\n    required_files = [\n        gnps_dir / defaults.GNPS_SPECTRA_FILENAME,\n        gnps_dir / defaults.GNPS_MOLECULAR_FAMILY_FILENAME,\n        gnps_dir / defaults.GNPS_ANNOTATIONS_FILENAME,\n    ]\n    list_not_found = [f.name for f in required_files if not f.exists()]\n    if list_not_found:\n        raise FileNotFoundError(\n            f\"Files not found in GNPS directory {gnps_dir}: {', '.join(list_not_found)}\"\n        )\n
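    For example, the validator can be called directly to check a local GNPS folder before loading. A minimal sketch; the directory path is illustrative:
    from nplinker.arranger import validate_gnps\n\ntry:\n    validate_gnps(\"path/to/gnps\")  # illustrative path\nexcept FileNotFoundError as e:\n    print(f\"Missing GNPS data: {e}\")\nexcept ValueError as e:\n    print(f\"Conflicting file mappings: {e}\")\n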
    "},{"location":"api/arranger/#nplinker.arranger.validate_antismash","title":"validate_antismash","text":"
    validate_antismash(antismash_dir: str | PathLike) -> None\n

    Validate the antiSMASH data directory and its contents.

    The validation only checks the structure of the antiSMASH data directory and file names. It does not check:

    - the content of the BGC files
    - the consistency between the antiSMASH data and the PODP project JSON file for the PODP mode

    The antiSMASH data directory must exist and contain at least one sub-directory. The name of the sub-directories must not contain any space. Each sub-directory must contain at least one BGC file (with the suffix \".region???.gbk\" where ??? is the region number).

    Parameters:

    Name Type Description Default antismash_dir str | PathLike

    Path to the antiSMASH data directory.

    required

    Raises:

    Type Description FileNotFoundError

    If the antiSMASH data directory is not found, or no sub-directories are found in the antiSMASH data directory, or no BGC files are found in any sub-directory.

    ValueError

    If any sub-directory name contains a space.

    Source code in src/nplinker/arranger.py
    def validate_antismash(antismash_dir: str | PathLike) -> None:\n    \"\"\"Validate the antiSMASH data directory and its contents.\n\n    The validation only checks the structure of the antiSMASH data directory and file names.\n    It does not check\n\n    - the content of the BGC files\n    - the consistency between the antiSMASH data and the PODP project JSON file for the PODP\n        mode\n\n    The antiSMASH data directory must exist and contain at least one sub-directory. The name of the\n    sub-directories must not contain any space. Each sub-directory must contain at least one BGC\n    file (with the suffix \".region???.gbk\" where ??? is the region number).\n\n    Args:\n        antismash_dir: Path to the antiSMASH data directory.\n\n    Raises:\n        FileNotFoundError: If the antiSMASH data directory is not found, or no sub-directories\n            are found in the antiSMASH data directory, or no BGC files are found in any\n            sub-directory.\n        ValueError: If any sub-directory name contains a space.\n    \"\"\"\n    antismash_dir = Path(antismash_dir)\n    if not antismash_dir.exists():\n        raise FileNotFoundError(f\"antiSMASH data directory not found at {antismash_dir}\")\n\n    sub_dirs = list_dirs(antismash_dir)\n    if not sub_dirs:\n        raise FileNotFoundError(\n            f\"No BGC directories found in antiSMASH data directory {antismash_dir}\"\n        )\n\n    for sub_dir in sub_dirs:\n        dir_name = Path(sub_dir).name\n        if \" \" in dir_name:\n            raise ValueError(\n                f\"antiSMASH sub-directory name {dir_name} contains space, which is not allowed\"\n            )\n\n        gbk_files = list_files(sub_dir, suffix=\".gbk\", keep_parent=False)\n        bgc_files = fnmatch.filter(gbk_files, \"*.region???.gbk\")\n        if not bgc_files:\n            raise FileNotFoundError(f\"No BGC files found in antiSMASH sub-directory {sub_dir}\")\n
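    The \".region???.gbk\" suffix check uses shell-style globbing, where each ? matches exactly one character. A small standalone illustration:
    import fnmatch\n\nfiles = [\"GCF_000514775.1.gbk\", \"NZ_AZWO01000004.region001.gbk\"]\nbgc_files = fnmatch.filter(files, \"*.region???.gbk\")\nprint(bgc_files)  # ['NZ_AZWO01000004.region001.gbk']\n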
    "},{"location":"api/arranger/#nplinker.arranger.validate_bigscape","title":"validate_bigscape","text":"
    validate_bigscape(\n    bigscape_dir: str | PathLike, cutoff: str\n) -> None\n

    Validate the BiG-SCAPE data directory and its contents.

    The BiG-SCAPE data directory must exist and contain the clustering file \"mix_clustering_c{self.config.bigscape.cutoff}.tsv\" where {self.config.bigscape.cutoff} is the bigscape cutoff value set in the config file.

    Alternatively, the directory can contain the BiG-SCAPE database file generated by BiG-SCAPE v2. At the moment, all the family assignments in the database will be used, so this database should contain results from a single run with the desired cutoff.

    Parameters:

    Name Type Description Default bigscape_dir str | PathLike

    Path to the BiG-SCAPE data directory.

    required cutoff str

    The BiG-SCAPE cutoff value.

    required

    Raises:

    Type Description FileNotFoundError

    If the BiG-SCAPE data directory or the clustering file is not found.

    Source code in src/nplinker/arranger.py
    def validate_bigscape(bigscape_dir: str | PathLike, cutoff: str) -> None:\n    \"\"\"Validate the BiG-SCAPE data directory and its contents.\n\n    The BiG-SCAPE data directory must exist and contain the clustering file\n    \"mix_clustering_c{self.config.bigscape.cutoff}.tsv\" where {self.config.bigscape.cutoff} is the\n    bigscape cutoff value set in the config file.\n\n    Alternatively, the directory can contain the BiG-SCAPE database file generated by BiG-SCAPE v2.\n    At the moment, all the family assignments in the database will be used, so this database should\n    contain results from a single run with the desired cutoff.\n\n    Args:\n        bigscape_dir: Path to the BiG-SCAPE data directory.\n        cutoff: The BiG-SCAPE cutoff value.\n\n    Raises:\n        FileNotFoundError: If the BiG-SCAPE data directory or the clustering file is not found.\n    \"\"\"\n    bigscape_dir = Path(bigscape_dir)\n    if not bigscape_dir.exists():\n        raise FileNotFoundError(f\"BiG-SCAPE data directory not found at {bigscape_dir}\")\n\n    clustering_file = bigscape_dir / f\"mix_clustering_c{cutoff}.tsv\"\n    database_file = bigscape_dir / \"data_sqlite.db\"\n    if not clustering_file.exists() and not database_file.exists():\n        raise FileNotFoundError(f\"BiG-SCAPE data not found in {clustering_file} or {database_file}\")\n
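    A quick check of an existing BiG-SCAPE output directory might look like this. A minimal sketch; the path and cutoff are illustrative, and the cutoff must match the value set in the config file:
    from nplinker.arranger import validate_bigscape\n\n# raises FileNotFoundError unless mix_clustering_c0.30.tsv or data_sqlite.db exists\nvalidate_bigscape(\"path/to/bigscape\", cutoff=\"0.30\")\n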
    "},{"location":"api/bigscape/","title":"BigScape","text":""},{"location":"api/bigscape/#nplinker.genomics.bigscape","title":"bigscape","text":""},{"location":"api/bigscape/#nplinker.genomics.bigscape.BigscapeGCFLoader","title":"BigscapeGCFLoader","text":"
    BigscapeGCFLoader(cluster_file: str | PathLike)\n

    Bases: GCFLoaderBase

    Build a loader for BiG-SCAPE GCF cluster file.

    Attributes:

    Name Type Description cluster_file str

    path to the BiG-SCAPE cluster file.

    Parameters:

    Name Type Description Default cluster_file str | PathLike

    Path to the BiG-SCAPE cluster file, the filename has a pattern of \"<class>_clustering_c0.xx.tsv\". required Source code in src/nplinker/genomics/bigscape/bigscape_loader.py

    def __init__(self, cluster_file: str | PathLike, /) -> None:\n    \"\"\"Initialize the BiG-SCAPE GCF loader.\n\n    Args:\n        cluster_file: Path to the BiG-SCAPE cluster file,\n            the filename has a pattern of \"<class>_clustering_c0.xx.tsv\".\n    \"\"\"\n    self.cluster_file: str = str(cluster_file)\n    self._gcf_list = self._parse_gcf(self.cluster_file)\n
    "},{"location":"api/bigscape/#nplinker.genomics.bigscape.BigscapeGCFLoader.cluster_file","title":"cluster_file instance-attribute","text":"
    cluster_file: str = str(cluster_file)\n
    "},{"location":"api/bigscape/#nplinker.genomics.bigscape.BigscapeGCFLoader.get_gcfs","title":"get_gcfs","text":"
    get_gcfs(\n    keep_mibig_only: bool = False,\n    keep_singleton: bool = False,\n) -> list[GCF]\n

    Get all GCF objects.

    Parameters:

    Name Type Description Default keep_mibig_only bool

    True to keep GCFs that contain only MIBiG BGCs.

    False keep_singleton bool

    True to keep singleton GCFs. A singleton GCF is a GCF that contains only one BGC.

    False

    Returns:

    Type Description list[GCF]

    A list of GCF objects.

    Source code in src/nplinker/genomics/bigscape/bigscape_loader.py
    def get_gcfs(self, keep_mibig_only: bool = False, keep_singleton: bool = False) -> list[GCF]:\n    \"\"\"Get all GCF objects.\n\n    Args:\n        keep_mibig_only: True to keep GCFs that contain only MIBiG\n            BGCs.\n        keep_singleton: True to keep singleton GCFs. A singleton GCF\n            is a GCF that contains only one BGC.\n\n    Returns:\n        A list of GCF objects.\n    \"\"\"\n    gcf_list = self._gcf_list\n    if not keep_mibig_only:\n        gcf_list = [gcf for gcf in gcf_list if not gcf.has_mibig_only()]\n    if not keep_singleton:\n        gcf_list = [gcf for gcf in gcf_list if not gcf.is_singleton()]\n    return gcf_list\n
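    Putting the loader together, a minimal sketch; the cluster file name is illustrative, and note that cluster_file is positional-only:
    from nplinker.genomics.bigscape import BigscapeGCFLoader\n\nloader = BigscapeGCFLoader(\"mix_clustering_c0.30.tsv\")\ngcfs = loader.get_gcfs()  # drops MIBiG-only and singleton GCFs by default\nall_gcfs = loader.get_gcfs(keep_mibig_only=True, keep_singleton=True)\n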
    "},{"location":"api/bigscape/#nplinker.genomics.bigscape.BigscapeV2GCFLoader","title":"BigscapeV2GCFLoader","text":"
    BigscapeV2GCFLoader(db_file: str | PathLike)\n

    Bases: GCFLoaderBase

    Build a loader for BiG-SCAPE v2 database file.

    Attributes:

    Name Type Description db_file

    Path to the BiG-SCAPE database file.

    Parameters:

    Name Type Description Default db_file str | PathLike

    Path to the BiG-SCAPE v2 database file

    required Source code in src/nplinker/genomics/bigscape/bigscape_loader.py
    def __init__(self, db_file: str | PathLike, /) -> None:\n    \"\"\"Initialize the BiG-SCAPE v2 GCF loader.\n\n    Args:\n        db_file: Path to the BiG-SCAPE v2 database file\n    \"\"\"\n    self.db_file = str(db_file)\n    self._gcf_list = self._parse_gcf(self.db_file)\n
    "},{"location":"api/bigscape/#nplinker.genomics.bigscape.BigscapeV2GCFLoader.db_file","title":"db_file instance-attribute","text":"
    db_file = str(db_file)\n
    "},{"location":"api/bigscape/#nplinker.genomics.bigscape.BigscapeV2GCFLoader.get_gcfs","title":"get_gcfs","text":"
    get_gcfs(\n    keep_mibig_only: bool = False,\n    keep_singleton: bool = False,\n) -> list[GCF]\n

    Get all GCF objects.

    Parameters:

    Name Type Description Default keep_mibig_only bool

    True to keep GCFs that contain only MIBiG BGCs.

    False keep_singleton bool

    True to keep singleton GCFs. A singleton GCF is a GCF that contains only one BGC.

    False

    Returns:

    Type Description list[GCF]

    a list of GCF objects.

    Source code in src/nplinker/genomics/bigscape/bigscape_loader.py
    def get_gcfs(self, keep_mibig_only: bool = False, keep_singleton: bool = False) -> list[GCF]:\n    \"\"\"Get all GCF objects.\n\n    Args:\n        keep_mibig_only: True to keep GCFs that contain only MIBiG\n            BGCs.\n        keep_singleton: True to keep singleton GCFs. A singleton GCF\n            is a GCF that contains only one BGC.\n\n    Returns:\n        a list of GCF objects.\n    \"\"\"\n    gcf_list = self._gcf_list\n    if not keep_mibig_only:\n        gcf_list = [gcf for gcf in gcf_list if not gcf.has_mibig_only()]\n    if not keep_singleton:\n        gcf_list = [gcf for gcf in gcf_list if not gcf.is_singleton()]\n    return gcf_list\n
    "},{"location":"api/bigscape/#nplinker.genomics.bigscape.run_bigscape","title":"run_bigscape","text":"
    run_bigscape(\n    antismash_path: str | PathLike,\n    output_path: str | PathLike,\n    extra_params: str,\n)\n
    Source code in src/nplinker/genomics/bigscape/runbigscape.py
    def run_bigscape(\n    antismash_path: str | PathLike,\n    output_path: str | PathLike,\n    extra_params: str,\n):\n    bigscape_py_path = \"bigscape.py\"\n    logger.info(\n        f'run_bigscape: input=\"{antismash_path}\", output=\"{output_path}\", extra_params={extra_params}'\n    )\n\n    try:\n        subprocess.run([bigscape_py_path, \"-h\"], capture_output=True, check=True)\n    except Exception as e:\n        raise Exception(f\"Failed to find/run bigscape.py (path={bigscape_py_path}, err={e})\") from e\n\n    if not os.path.exists(antismash_path):\n        raise Exception(f'antismash_path \"{antismash_path}\" does not exist!')\n\n    # configure the IO-related parameters, including pfam_dir\n    args = [bigscape_py_path, \"-i\", antismash_path, \"-o\", output_path, \"--pfam_dir\", PFAM_PATH]\n\n    # append the user supplied params, if any\n    if len(extra_params) > 0:\n        args.extend(extra_params.split(\" \"))\n\n    logger.info(f\"BiG-SCAPE command: {args}\")\n    result = subprocess.run(args, stdout=sys.stdout, stderr=sys.stderr, check=True)\n    logger.info(f\"BiG-SCAPE completed with return code {result.returncode}\")\n    # use subprocess.CompletedProcess.check_returncode() to test if the BiG-SCAPE\n    # process exited successfully. This throws an exception for non-zero returncodes\n    # which will indicate to the PODPDownloader module that something went wrong.\n    result.check_returncode()\n\n    return True\n
    "},{"location":"api/bigscape/#nplinker.genomics.bigscape.run_bigscape","title":"run_bigscape","text":"
    run_bigscape(\n    antismash_path: str | PathLike,\n    output_path: str | PathLike,\n    extra_params: str,\n)\n
    Source code in src/nplinker/genomics/bigscape/runbigscape.py
    def run_bigscape(\n    antismash_path: str | PathLike,\n    output_path: str | PathLike,\n    extra_params: str,\n):\n    bigscape_py_path = \"bigscape.py\"\n    logger.info(\n        f'run_bigscape: input=\"{antismash_path}\", output=\"{output_path}\", extra_params={extra_params}\"'\n    )\n\n    try:\n        subprocess.run([bigscape_py_path, \"-h\"], capture_output=True, check=True)\n    except Exception as e:\n        raise Exception(f\"Failed to find/run bigscape.py (path={bigscape_py_path}, err={e})\") from e\n\n    if not os.path.exists(antismash_path):\n        raise Exception(f'antismash_path \"{antismash_path}\" does not exist!')\n\n    # configure the IO-related parameters, including pfam_dir\n    args = [bigscape_py_path, \"-i\", antismash_path, \"-o\", output_path, \"--pfam_dir\", PFAM_PATH]\n\n    # append the user supplied params, if any\n    if len(extra_params) > 0:\n        args.extend(extra_params.split(\" \"))\n\n    logger.info(f\"BiG-SCAPE command: {args}\")\n    result = subprocess.run(args, stdout=sys.stdout, stderr=sys.stderr, check=True)\n    logger.info(f\"BiG-SCAPE completed with return code {result.returncode}\")\n    # use subprocess.CompletedProcess.check_returncode() to test if the BiG-SCAPE\n    # process exited successfully. This throws an exception for non-zero returncodes\n    # which will indicate to the PODPDownloader module that something went wrong.\n    result.check_returncode()\n\n    return True\n
    "},{"location":"api/genomics/","title":"Data Models","text":""},{"location":"api/genomics/#nplinker.genomics","title":"genomics","text":""},{"location":"api/genomics/#nplinker.genomics.BGC","title":"BGC","text":"
    BGC(id: str, /, *product_prediction: str)\n

    Class to model BGC (biosynthetic gene cluster) data.

    BGC data include both annotations and sequence data. This class is mainly designed to model the annotations or metadata.

    The raw BGC data is stored in GenBank format (.gbk). Additional GenBank features could be added to the GenBank file to annotate BGCs, e.g. antiSMASH has some self-defined features (like region) in its output GenBank files.

    The annotations of BGC can be stored in JSON format, which is defined and used by MIBiG.

    Attributes:

    Name Type Description id

    BGC identifier, e.g. MIBiG accession, GenBank accession.

    product_prediction

    A tuple of (predicted) natural products or product classes of the BGC. For antiSMASH's GenBank data, the /product qualifier of the region feature gives product information. For MIBiG metadata, its biosynthetic class provides such info.

    mibig_bgc_class tuple[str] | None

    A tuple of MIBiG biosynthetic classes to which the BGC belongs. Defaults to None. MIBiG defines 6 major biosynthetic classes for natural products, including \"NRP\", \"Polyketide\", \"RiPP\", \"Terpene\", \"Saccharide\" and \"Alkaloid\". Note that natural products created by all other biosynthetic mechanisms fall under the category \"Other\". For more details, see the publication: https://doi.org/10.1186/s40793-018-0318-y.

    description str | None

    Brief description of the BGC. Defaults to None.

    smiles tuple[str] | None

    A tuple of SMILES formulas of the BGC's products. Defaults to None.

    antismash_file str | None

    The path to the antiSMASH GenBank file. Defaults to None.

    antismash_id str | None

    Identifier of the antiSMASH BGC, referring to the feature VERSION of GenBank file. Defaults to None.

    antismash_region int | None

    AntiSMASH BGC region number, referring to the feature region of GenBank file. Defaults to None.

    parents set[GCF]

    The set of GCFs that contain the BGC.

    strain Strain | None

    The strain of the BGC.

    Parameters:

    Name Type Description Default id str

    BGC identifier, e.g. MIBiG accession, GenBank accession.

    required product_prediction str

    BGC's (predicted) natural products or product classes.

    () Source code in src/nplinker/genomics/bgc.py
    def __init__(self, id: str, /, *product_prediction: str):\n    \"\"\"Initialize the BGC object.\n\n    Args:\n        id: BGC identifier, e.g. MIBiG accession, GenBank accession.\n        product_prediction: BGC's (predicted) natural products or product classes.\n    \"\"\"\n    # BGC metadata\n    self.id = id\n    self.product_prediction = product_prediction\n\n    self.mibig_bgc_class: tuple[str] | None = None\n    self.description: str | None = None\n    self.smiles: tuple[str] | None = None\n\n    # antismash related attributes\n    self.antismash_file: str | None = None\n    self.antismash_id: str | None = None  # version in .gbk, id in SeqRecord\n    self.antismash_region: int | None = None  # antismash region number\n\n    # other attributes\n    self.parents: set[GCF] = set()\n    self._strain: Strain | None = None\n
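    A BGC is created from an id plus zero or more product predictions; the id is positional-only. A minimal sketch with an illustrative id:
    from nplinker.genomics import BGC\n\nbgc = BGC(\"BGC0000001\", \"Polyketide\")\nprint(bgc.product_prediction)  # ('Polyketide',)\nprint(bgc.is_mibig())  # True, because the id starts with \"BGC\"\n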
    "},{"location":"api/genomics/#nplinker.genomics.BGC.id","title":"id instance-attribute","text":"
    id = id\n
    "},{"location":"api/genomics/#nplinker.genomics.BGC.product_prediction","title":"product_prediction instance-attribute","text":"
    product_prediction = product_prediction\n
    "},{"location":"api/genomics/#nplinker.genomics.BGC.mibig_bgc_class","title":"mibig_bgc_class instance-attribute","text":"
    mibig_bgc_class: tuple[str] | None = None\n
    "},{"location":"api/genomics/#nplinker.genomics.BGC.description","title":"description instance-attribute","text":"
    description: str | None = None\n
    "},{"location":"api/genomics/#nplinker.genomics.BGC.smiles","title":"smiles instance-attribute","text":"
    smiles: tuple[str] | None = None\n
    "},{"location":"api/genomics/#nplinker.genomics.BGC.antismash_file","title":"antismash_file instance-attribute","text":"
    antismash_file: str | None = None\n
    "},{"location":"api/genomics/#nplinker.genomics.BGC.antismash_id","title":"antismash_id instance-attribute","text":"
    antismash_id: str | None = None\n
    "},{"location":"api/genomics/#nplinker.genomics.BGC.antismash_region","title":"antismash_region instance-attribute","text":"
    antismash_region: int | None = None\n
    "},{"location":"api/genomics/#nplinker.genomics.BGC.parents","title":"parents instance-attribute","text":"
    parents: set[GCF] = set()\n
    "},{"location":"api/genomics/#nplinker.genomics.BGC.strain","title":"strain property writable","text":"
    strain: Strain | None\n

    Get the strain of the BGC.

    "},{"location":"api/genomics/#nplinker.genomics.BGC.bigscape_classes","title":"bigscape_classes property","text":"
    bigscape_classes: set[str | None]\n

    Get BiG-SCAPE's BGC classes.

    BiG-SCAPE's BGC classes are similar to those defined in MIBiG but have more categories (7 classes). For more details, see: https://doi.org/10.1038%2Fs41589-019-0400-9.

    "},{"location":"api/genomics/#nplinker.genomics.BGC.aa_predictions","title":"aa_predictions property","text":"
    aa_predictions: list\n

    Amino acids as predicted monomers of product.

    Returns:

    Type Description list

    A list of dicts with key as amino acid and value as prediction probability.

    "},{"location":"api/genomics/#nplinker.genomics.BGC.add_parent","title":"add_parent","text":"
    add_parent(gcf: GCF) -> None\n

    Add a parent GCF to the BGC.

    Parameters:

    Name Type Description Default gcf GCF

    gene cluster family

    required Source code in src/nplinker/genomics/bgc.py
    def add_parent(self, gcf: GCF) -> None:\n    \"\"\"Add a parent GCF to the BGC.\n\n    Args:\n        gcf: gene cluster family\n    \"\"\"\n    gcf.add_bgc(self)\n
    "},{"location":"api/genomics/#nplinker.genomics.BGC.detach_parent","title":"detach_parent","text":"
    detach_parent(gcf: GCF) -> None\n

    Remove a parent GCF.

    Source code in src/nplinker/genomics/bgc.py
    def detach_parent(self, gcf: GCF) -> None:\n    \"\"\"Remove a parent GCF.\"\"\"\n    gcf.detach_bgc(self)\n
    "},{"location":"api/genomics/#nplinker.genomics.BGC.is_mibig","title":"is_mibig","text":"
    is_mibig() -> bool\n

    Check if the BGC is MIBiG reference BGC or not.

    Note

    This method identifies a MIBiG BGC based on the pattern that MIBiG BGC names start with \"BGC\". It might give a false positive result.

    Returns:

    Type Description bool

    True if it's MIBiG reference BGC

    Source code in src/nplinker/genomics/bgc.py
    def is_mibig(self) -> bool:\n    \"\"\"Check if the BGC is MIBiG reference BGC or not.\n\n    Note:\n        This method evaluates MIBiG BGC based on the pattern that MIBiG\n        BGC names start with \"BGC\". It might give false positive result.\n\n    Returns:\n        True if it's MIBiG reference BGC\n    \"\"\"\n    return self.id.startswith(\"BGC\")\n
    "},{"location":"api/genomics/#nplinker.genomics.GCF","title":"GCF","text":"
    GCF(id: str)\n

    Class to model gene cluster family (GCF).

    GCF is a group of similar BGCs and generated by clustering BGCs with tools such as BiG-SCAPE and BiG-SLICE.

    Attributes:

    Name Type Description id

    id of the GCF object.

    bgc_ids set[str]

    a set of BGC ids that belongs to the GCF.

    bigscape_class str | None

    BiG-SCAPE's BGC class. BiG-SCAPE's BGC classes are similar to those defined in MIBiG but have more categories (7 classes). For more details, see: https://doi.org/10.1038%2Fs41589-019-0400-9.

    Parameters:

    Name Type Description Default id str

    id of the GCF object.

    required Source code in src/nplinker/genomics/gcf.py
    def __init__(self, id: str, /) -> None:\n    \"\"\"Initialize the GCF object.\n\n    Args:\n        id: id of the GCF object.\n    \"\"\"\n    self.id = id\n    self.bgc_ids: set[str] = set()\n    self.bigscape_class: str | None = None\n    self._bgcs: set[BGC] = set()\n    self._strains: StrainCollection = StrainCollection()\n
    "},{"location":"api/genomics/#nplinker.genomics.GCF.id","title":"id instance-attribute","text":"
    id = id\n
    "},{"location":"api/genomics/#nplinker.genomics.GCF.bgc_ids","title":"bgc_ids instance-attribute","text":"
    bgc_ids: set[str] = set()\n
    "},{"location":"api/genomics/#nplinker.genomics.GCF.bigscape_class","title":"bigscape_class instance-attribute","text":"
    bigscape_class: str | None = None\n
    "},{"location":"api/genomics/#nplinker.genomics.GCF.bgcs","title":"bgcs property","text":"
    bgcs: set[BGC]\n

    Get the BGC objects.

    "},{"location":"api/genomics/#nplinker.genomics.GCF.strains","title":"strains property","text":"
    strains: StrainCollection\n

    Get the strains in the GCF.

    "},{"location":"api/genomics/#nplinker.genomics.GCF.add_bgc","title":"add_bgc","text":"
    add_bgc(bgc: BGC) -> None\n

    Add a BGC object to the GCF.

    Source code in src/nplinker/genomics/gcf.py
    def add_bgc(self, bgc: BGC) -> None:\n    \"\"\"Add a BGC object to the GCF.\"\"\"\n    bgc.parents.add(self)\n    self._bgcs.add(bgc)\n    self.bgc_ids.add(bgc.id)\n    if bgc.strain is not None:\n        self._strains.add(bgc.strain)\n    else:\n        logger.warning(\"No strain specified for the BGC %s\", bgc.id)\n
    "},{"location":"api/genomics/#nplinker.genomics.GCF.detach_bgc","title":"detach_bgc","text":"
    detach_bgc(bgc: BGC) -> None\n

    Remove a child BGC object.

    Source code in src/nplinker/genomics/gcf.py
    def detach_bgc(self, bgc: BGC) -> None:\n    \"\"\"Remove a child BGC object.\"\"\"\n    bgc.parents.remove(self)\n    self._bgcs.remove(bgc)\n    self.bgc_ids.remove(bgc.id)\n    if bgc.strain is not None:\n        for other_bgc in self._bgcs:\n            if other_bgc.strain == bgc.strain:\n                return\n        self._strains.remove(bgc.strain)\n
    "},{"location":"api/genomics/#nplinker.genomics.GCF.has_strain","title":"has_strain","text":"
    has_strain(strain: Strain) -> bool\n

    Check if the given strain exists.

    Parameters:

    Name Type Description Default strain Strain

    Strain object.

    required

    Returns:

    Type Description bool

    True when the given strain exists.

    Source code in src/nplinker/genomics/gcf.py
    def has_strain(self, strain: Strain) -> bool:\n    \"\"\"Check if the given strain exists.\n\n    Args:\n        strain: `Strain` object.\n\n    Returns:\n        True when the given strain exist.\n    \"\"\"\n    return strain in self._strains\n
    "},{"location":"api/genomics/#nplinker.genomics.GCF.has_mibig_only","title":"has_mibig_only","text":"
    has_mibig_only() -> bool\n

    Check if the GCF's children are only MIBiG BGCs.

    Returns:

    Type Description bool

    True if GCF.bgc_ids are only MIBiG BGC ids.

    Source code in src/nplinker/genomics/gcf.py
    def has_mibig_only(self) -> bool:\n    \"\"\"Check if the GCF's children are only MIBiG BGCs.\n\n    Returns:\n        True if `GCF.bgc_ids` are only MIBiG BGC ids.\n    \"\"\"\n    return all(map(lambda id: id.startswith(\"BGC\"), self.bgc_ids))\n
    "},{"location":"api/genomics/#nplinker.genomics.GCF.is_singleton","title":"is_singleton","text":"
    is_singleton() -> bool\n

    Check if the GCF contains only one BGC.

    Returns:

    Type Description bool

    True if GCF.bgc_ids contains only one BGC id.

    Source code in src/nplinker/genomics/gcf.py
    def is_singleton(self) -> bool:\n    \"\"\"Check if the GCF contains only one BGC.\n\n    Returns:\n        True if `GCF.bgc_ids` contains only one BGC id.\n    \"\"\"\n    return len(self.bgc_ids) == 1\n
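    Linking the two models, a minimal sketch with illustrative ids; add_bgc updates both sides of the relationship and logs a warning when the BGC has no strain:
    from nplinker.genomics import BGC, GCF\n\nbgc = BGC(\"BGC0000001\", \"Polyketide\")\ngcf = GCF(\"1\")\ngcf.add_bgc(bgc)  # also adds gcf to bgc.parents; warns since bgc.strain is None\nprint(gcf.is_singleton())  # True: only one BGC id\nprint(gcf.has_mibig_only())  # True: the only id starts with \"BGC\"\n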
    "},{"location":"api/genomics_abc/","title":"Abstract Base Classes","text":""},{"location":"api/genomics_abc/#nplinker.genomics.abc","title":"abc","text":""},{"location":"api/genomics_abc/#nplinker.genomics.abc.BGCLoaderBase","title":"BGCLoaderBase","text":"
    BGCLoaderBase(data_dir: str | PathLike)\n

    Bases: ABC

    Abstract base class for BGC loader.

    Parameters:

    Name Type Description Default data_dir str | PathLike

    Path to directory that contains BGC metadata files (.json) or full data genbank files (.gbk).

    required Source code in src/nplinker/genomics/abc.py
    def __init__(self, data_dir: str | PathLike) -> None:\n    \"\"\"Initialize the BGC loader.\n\n    Args:\n        data_dir: Path to directory that contains BGC metadata files\n            (.json) or full data genbank files (.gbk).\n    \"\"\"\n    self.data_dir = str(data_dir)\n
    "},{"location":"api/genomics_abc/#nplinker.genomics.abc.BGCLoaderBase.data_dir","title":"data_dir instance-attribute","text":"
    data_dir = str(data_dir)\n
    "},{"location":"api/genomics_abc/#nplinker.genomics.abc.BGCLoaderBase.get_files","title":"get_files abstractmethod","text":"
    get_files() -> dict[str, str]\n

    Get path to BGC files.

    Returns:

    Type Description dict[str, str]

    The key is the BGC name and the value is the path to the BGC file.

    Source code in src/nplinker/genomics/abc.py
    @abstractmethod\ndef get_files(self) -> dict[str, str]:\n    \"\"\"Get path to BGC files.\n\n    Returns:\n        The key is BGC name and value is path to BGC file\n    \"\"\"\n
    "},{"location":"api/genomics_abc/#nplinker.genomics.abc.BGCLoaderBase.get_bgcs","title":"get_bgcs abstractmethod","text":"
    get_bgcs() -> list[BGC]\n

    Get BGC objects.

    Returns:

    Type Description list[BGC]

    A list of BGC objects

    Source code in src/nplinker/genomics/abc.py
    @abstractmethod\ndef get_bgcs(self) -> list[BGC]:\n    \"\"\"Get BGC objects.\n\n    Returns:\n        A list of BGC objects\n    \"\"\"\n
    "},{"location":"api/genomics_abc/#nplinker.genomics.abc.GCFLoaderBase","title":"GCFLoaderBase","text":"

    Bases: ABC

    Abstract base class for GCF loader.

    "},{"location":"api/genomics_abc/#nplinker.genomics.abc.GCFLoaderBase.get_gcfs","title":"get_gcfs abstractmethod","text":"
    get_gcfs(\n    keep_mibig_only: bool, keep_singleton: bool\n) -> list[GCF]\n

    Get GCF objects.

    Parameters:

    Name Type Description Default keep_mibig_only bool

    True to keep GCFs that contain only MIBiG BGCs.

    required keep_singleton bool

    True to keep singleton GCFs. A singleton GCF is a GCF that contains only one BGC.

    required

    Returns:

    Type Description list[GCF]

    A list of GCF objects

    Source code in src/nplinker/genomics/abc.py
    @abstractmethod\ndef get_gcfs(self, keep_mibig_only: bool, keep_singleton: bool) -> list[GCF]:\n    \"\"\"Get GCF objects.\n\n    Args:\n        keep_mibig_only: True to keep GCFs that contain only MIBiG\n            BGCs.\n        keep_singleton: True to keep singleton GCFs. A singleton GCF\n            is a GCF that contains only one BGC.\n\n    Returns:\n        A list of GCF objects\n    \"\"\"\n
    "},{"location":"api/genomics_utils/","title":"Utilities","text":""},{"location":"api/genomics_utils/#nplinker.genomics.utils","title":"utils","text":""},{"location":"api/genomics_utils/#nplinker.genomics.utils.logger","title":"logger module-attribute","text":"
    logger = getLogger(__name__)\n
    "},{"location":"api/genomics_utils/#nplinker.genomics.utils.generate_mappings_genome_id_bgc_id","title":"generate_mappings_genome_id_bgc_id","text":"
    generate_mappings_genome_id_bgc_id(\n    bgc_dir: str | PathLike,\n    output_file: str | PathLike | None = None,\n) -> None\n

    Generate a file that maps genome id to BGC id.

    Note that the output_file will be overwritten if it already exists.

    Parameters:

    Name Type Description Default bgc_dir str | PathLike

    The directory has one layer of subfolders, and each subfolder contains BGC files in .gbk format. It assumes that:

    - the subfolder name is the genome id (e.g. refseq),
    - the BGC file name is the BGC id.

    required output_file str | PathLike | None

    The path to the output file. Note that the file will be overwritten if it already exists. Defaults to None, in which case the output file will be placed in the directory bgc_dir with a file name defined in global variable GENOME_BGC_MAPPINGS_FILENAME.

    None Source code in src/nplinker/genomics/utils.py
    def generate_mappings_genome_id_bgc_id(\n    bgc_dir: str | PathLike, output_file: str | PathLike | None = None\n) -> None:\n    \"\"\"Generate a file that maps genome id to BGC id.\n\n    Note that the `output_file` will be overwritten if it already exists.\n\n    Args:\n        bgc_dir: The directory has one-layer of subfolders and\n            each subfolder contains BGC files in `.gbk` format.\n            It assumes that\n            - the subfolder name is the genome id (e.g. refseq),\n            - the BGC file name is the BGC id.\n        output_file: The path to the output file. Note\n            that the file will be overwritten if it already exists.\n            Defaults to None, in which case the output file will be placed in\n            the directory `bgc_dir` with a file name defined in global variable\n            `GENOME_BGC_MAPPINGS_FILENAME`.\n    \"\"\"\n    bgc_dir = Path(bgc_dir)\n    genome_bgc_mappings = {}\n\n    for subdir in list_dirs(bgc_dir):\n        genome_id = Path(subdir).name\n        bgc_files = list_files(subdir, suffix=(\".gbk\"), keep_parent=False)\n        bgc_ids = [bgc_id for f in bgc_files if (bgc_id := Path(f).stem) != genome_id]\n        if bgc_ids:\n            genome_bgc_mappings[genome_id] = bgc_ids\n        else:\n            logger.warning(\"No BGC files found in %s\", subdir)\n\n    # sort mappings by genome_id and construct json data\n    genome_bgc_mappings = dict(sorted(genome_bgc_mappings.items()))\n    json_data_mappings = [{\"genome_ID\": k, \"BGC_ID\": v} for k, v in genome_bgc_mappings.items()]\n    json_data = {\"mappings\": json_data_mappings, \"version\": \"1.0\"}\n\n    # validate json data\n    validate(instance=json_data, schema=GENOME_BGC_MAPPINGS_SCHEMA)\n\n    if output_file is None:\n        output_file = bgc_dir / GENOME_BGC_MAPPINGS_FILENAME\n    with open(output_file, \"w\") as f:\n        json.dump(json_data, f)\n    logger.info(\"Generated genome-BGC mappings file: %s\", output_file)\n
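    For a directory laid out as antismash/genome_id/bgc_id.gbk, the mapping file can be generated in one call. A minimal sketch; the path is illustrative:
    from nplinker.genomics.utils import generate_mappings_genome_id_bgc_id\n\n# writes the file named by GENOME_BGC_MAPPINGS_FILENAME into the BGC directory\ngenerate_mappings_genome_id_bgc_id(\"path/to/antismash\")\n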
    "},{"location":"api/genomics_utils/#nplinker.genomics.utils.add_strain_to_bgc","title":"add_strain_to_bgc","text":"
    add_strain_to_bgc(\n    strains: StrainCollection, bgcs: Sequence[BGC]\n) -> tuple[list[BGC], list[BGC]]\n

    Assign a Strain object to BGC.strain for input BGCs.

    BGC id is used to find the corresponding Strain object. It's possible that no Strain object is found for a BGC id.

    Note that the input list bgcs will be changed in place.

    Parameters:

    Name Type Description Default strains StrainCollection

    A collection of all strain objects.

    required bgcs Sequence[BGC]

    A list of BGC objects.

    required

    Returns:

    Type Description tuple[list[BGC], list[BGC]]

    A tuple of two lists of BGC objects,

    - the first list contains BGC objects that are updated with Strain object;
    - the second list contains BGC objects that are not updated with Strain object because no Strain object is found.

    Raises:

    Type Description ValueError

    Multiple strain objects found for a BGC id.

    Source code in src/nplinker/genomics/utils.py
    def add_strain_to_bgc(\n    strains: StrainCollection, bgcs: Sequence[BGC]\n) -> tuple[list[BGC], list[BGC]]:\n    \"\"\"Assign a Strain object to `BGC.strain` for input BGCs.\n\n    BGC id is used to find the corresponding Strain object. It's possible that\n    no Strain object is found for a BGC id.\n\n    Note that the input list `bgcs` will be changed in place.\n\n    Args:\n        strains: A collection of all strain objects.\n        bgcs: A list of BGC objects.\n\n    Returns:\n        A tuple of two lists of BGC objects,\n\n            - the first list contains BGC objects that are updated with Strain object;\n            - the second list contains BGC objects that are not updated with\n                Strain object because no Strain object is found.\n\n    Raises:\n        ValueError: Multiple strain objects found for a BGC id.\n    \"\"\"\n    bgc_with_strain = []\n    bgc_without_strain = []\n    for bgc in bgcs:\n        try:\n            strain_list = strains.lookup(bgc.id)\n        except ValueError:\n            bgc_without_strain.append(bgc)\n            continue\n        if len(strain_list) > 1:\n            raise ValueError(\n                f\"Multiple strain objects found for BGC id '{bgc.id}'.\"\n                f\"BGC object accept only one strain.\"\n            )\n        bgc.strain = strain_list[0]\n        bgc_with_strain.append(bgc)\n\n    logger.info(\n        f\"{len(bgc_with_strain)} BGC objects updated with Strain object.\\n\"\n        f\"{len(bgc_without_strain)} BGC objects not updated with Strain object.\"\n    )\n    return bgc_with_strain, bgc_without_strain\n
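    A sketch of assigning strains to freshly loaded BGCs. The Strain/StrainCollection import path and the Strain constructor are assumptions, and the matching id is contrived so that the strain lookup succeeds:
    from nplinker.genomics import BGC\nfrom nplinker.genomics.utils import add_strain_to_bgc\nfrom nplinker.strain import Strain, StrainCollection  # assumed import path\n\nstrains = StrainCollection()\nstrains.add(Strain(\"NZ_AZWO01000004.region001\"))  # contrived: strain id equals BGC id\nbgcs = [BGC(\"NZ_AZWO01000004.region001\")]\nmatched, unmatched = add_strain_to_bgc(strains, bgcs)\nprint(len(matched), len(unmatched))  # 1 0\n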
    "},{"location":"api/genomics_utils/#nplinker.genomics.utils.add_bgc_to_gcf","title":"add_bgc_to_gcf","text":"
    add_bgc_to_gcf(\n    bgcs: Sequence[BGC], gcfs: Sequence[GCF]\n) -> tuple[list[GCF], list[GCF], dict[GCF, set[str]]]\n

    Add BGC objects to GCF object based on GCF's BGC ids.

    The GCF.bgc_ids attribute contains the ids of BGC objects. These ids are used to find BGC objects in the input bgcs list. The BGC objects found are added to the bgcs attribute of the GCF object. It is possible that some BGC ids are not found in the input bgcs list, in which case their BGC objects are missing from the GCF object.

    This method changes the lists bgcs and gcfs in place.

    Parameters:

    Name Type Description Default bgcs Sequence[BGC]

    A list of BGC objects.

    required gcfs Sequence[GCF]

    A list of GCF objects.

    required

    Returns:

    Type Description tuple[list[GCF], list[GCF], dict[GCF, set[str]]]

    A tuple of two lists and a dictionary,

    - the first list contains GCF objects that are updated with BGC objects;
    - the second list contains GCF objects that are not updated with BGC objects because no BGC objects are found;
    - the dictionary contains GCF objects as keys and a set of ids of missing BGC objects as values.

    Source code in src/nplinker/genomics/utils.py
    def add_bgc_to_gcf(\n    bgcs: Sequence[BGC], gcfs: Sequence[GCF]\n) -> tuple[list[GCF], list[GCF], dict[GCF, set[str]]]:\n    \"\"\"Add BGC objects to GCF object based on GCF's BGC ids.\n\n    The attribute of `GCF.bgc_ids` contains the ids of BGC objects. These ids\n    are used to find BGC objects from the input `bgcs` list. The found BGC\n    objects are added to the `bgcs` attribute of GCF object. It is possible that\n    some BGC ids are not found in the input `bgcs` list, and so their BGC\n    objects are missing in the GCF object.\n\n    This method changes the lists `bgcs` and `gcfs` in place.\n\n    Args:\n        bgcs: A list of BGC objects.\n        gcfs: A list of GCF objects.\n\n    Returns:\n        A tuple of two lists and a dictionary,\n\n            - The first list contains GCF objects that are updated with BGC objects;\n            - The second list contains GCF objects that are not updated with BGC objects\n                because no BGC objects are found;\n            - The dictionary contains GCF objects as keys and a set of ids of missing\n                BGC objects as values.\n    \"\"\"\n    bgc_dict = {bgc.id: bgc for bgc in bgcs}\n    gcf_with_bgc = []\n    gcf_without_bgc = []\n    gcf_missing_bgc: dict[GCF, set[str]] = {}\n    for gcf in gcfs:\n        for bgc_id in gcf.bgc_ids:\n            try:\n                bgc = bgc_dict[bgc_id]\n            except KeyError:\n                if gcf not in gcf_missing_bgc:\n                    gcf_missing_bgc[gcf] = {bgc_id}\n                else:\n                    gcf_missing_bgc[gcf].add(bgc_id)\n                continue\n            gcf.add_bgc(bgc)\n\n        if gcf.bgcs:\n            gcf_with_bgc.append(gcf)\n        else:\n            gcf_without_bgc.append(gcf)\n\n    logger.info(\n        f\"{len(gcf_with_bgc)} GCF objects updated with BGC objects.\\n\"\n        f\"{len(gcf_without_bgc)} GCF objects not updated with BGC objects.\\n\"\n        f\"{len(gcf_missing_bgc)} GCF objects have missing BGC objects.\"\n    )\n    return gcf_with_bgc, gcf_without_bgc, gcf_missing_bgc\n
    "},{"location":"api/genomics_utils/#nplinker.genomics.utils.get_mibig_from_gcf","title":"get_mibig_from_gcf","text":"
    get_mibig_from_gcf(\n    gcfs: Sequence[GCF],\n) -> tuple[list[BGC], StrainCollection]\n

    Get MIBiG BGCs and strains from GCF objects.

    Parameters:

    Name Type Description Default gcfs Sequence[GCF]

    A list of GCF objects.

    required

    Returns:

    Type Description tuple[list[BGC], StrainCollection]

    A tuple of two objects,

    - the first is a list of MIBiG BGC objects used in the GCFs;
    - the second is a StrainCollection object that contains all Strain objects used in the GCFs.

    Source code in src/nplinker/genomics/utils.py
    def get_mibig_from_gcf(gcfs: Sequence[GCF]) -> tuple[list[BGC], StrainCollection]:\n    \"\"\"Get MIBiG BGCs and strains from GCF objects.\n\n    Args:\n        gcfs: A list of GCF objects.\n\n    Returns:\n        A tuple of two objects,\n\n            - the first is a list of MIBiG BGC objects used in the GCFs;\n            - the second is a StrainCollection object that contains all Strain objects used in the\n            GCFs.\n    \"\"\"\n    mibig_bgcs_in_use = []\n    mibig_strains_in_use = StrainCollection()\n    for gcf in gcfs:\n        for bgc in gcf.bgcs:\n            if bgc.is_mibig():\n                mibig_bgcs_in_use.append(bgc)\n                if bgc.strain is not None:\n                    mibig_strains_in_use.add(bgc.strain)\n    return mibig_bgcs_in_use, mibig_strains_in_use\n
    "},{"location":"api/genomics_utils/#nplinker.genomics.utils.extract_mappings_strain_id_original_genome_id","title":"extract_mappings_strain_id_original_genome_id","text":"
    extract_mappings_strain_id_original_genome_id(\n    podp_project_json_file: str | PathLike,\n) -> dict[str, set[str]]\n

    Extract mappings \"strain id <-> original genome id\".

    Parameters:

    Name Type Description Default podp_project_json_file str | PathLike

    The path to the PODP project JSON file.

    required

    Returns:

    Type Description dict[str, set[str]]

    Key is strain id and value is a set of original genome ids.

    Notes

    The podp_project_json_file is the project JSON file downloaded from PODP platform. For example, for project MSV000079284, its json file is https://pairedomicsdata.bioinformatics.nl/api/projects/4b29ddc3-26d0-40d7-80c5-44fb6631dbf9.4.

    Source code in src/nplinker/genomics/utils.py
    def extract_mappings_strain_id_original_genome_id(\n    podp_project_json_file: str | PathLike,\n) -> dict[str, set[str]]:\n    \"\"\"Extract mappings \"strain id <-> original genome id\".\n\n    Args:\n        podp_project_json_file: The path to the PODP project\n            JSON file.\n\n    Returns:\n        Key is strain id and value is a set of original genome ids.\n\n    Notes:\n        The `podp_project_json_file` is the project JSON file downloaded from\n        PODP platform. For example, for project MSV000079284, its json file is\n        https://pairedomicsdata.bioinformatics.nl/api/projects/4b29ddc3-26d0-40d7-80c5-44fb6631dbf9.4.\n    \"\"\"\n    mappings_dict: dict[str, set[str]] = {}\n    with open(podp_project_json_file, \"r\") as f:\n        json_data = json.load(f)\n\n    validate_podp_json(json_data)\n\n    for record in json_data[\"genomes\"]:\n        strain_id = record[\"genome_label\"]\n        genome_id = get_best_available_genome_id(record[\"genome_ID\"])\n        if genome_id is None:\n            logger.warning(\"Failed to extract genome ID from genome with label %s\", strain_id)\n            continue\n        if strain_id in mappings_dict:\n            mappings_dict[strain_id].add(genome_id)\n        else:\n            mappings_dict[strain_id] = {genome_id}\n    return mappings_dict\n
    "},{"location":"api/genomics_utils/#nplinker.genomics.utils.extract_mappings_original_genome_id_resolved_genome_id","title":"extract_mappings_original_genome_id_resolved_genome_id","text":"
    extract_mappings_original_genome_id_resolved_genome_id(\n    genome_status_json_file: str | PathLike,\n) -> dict[str, str]\n

    Extract mappings \"original_genome_id <-> resolved_genome_id\".

    Parameters:

    Name Type Description Default genome_status_json_file str | PathLike

    The path to the genome status JSON file.

    required

    Returns:

    Type Description dict[str, str]

    Key is original genome id and value is resolved genome id.

    Notes

    The genome_status_json_file is usually generated by the podp_download_and_extract_antismash_data function with a default file name defined in nplinker.defaults.GENOME_STATUS_FILENAME.

    Source code in src/nplinker/genomics/utils.py
    def extract_mappings_original_genome_id_resolved_genome_id(\n    genome_status_json_file: str | PathLike,\n) -> dict[str, str]:\n    \"\"\"Extract mappings \"original_genome_id <-> resolved_genome_id\".\n\n    Args:\n        genome_status_json_file: The path to the genome status\n            JSON file.\n\n    Returns:\n        Key is original genome id and value is resolved genome id.\n\n    Notes:\n        The `genome_status_json_file` is usually generated by the\n        `podp_download_and_extract_antismash_data` function with\n        a default file name defined in `nplinker.defaults.GENOME_STATUS_FILENAME`.\n    \"\"\"\n    gs_mappings_dict = GenomeStatus.read_json(genome_status_json_file)\n    return {gs.original_id: gs.resolved_refseq_id for gs in gs_mappings_dict.values()}\n
    "},{"location":"api/genomics_utils/#nplinker.genomics.utils.extract_mappings_resolved_genome_id_bgc_id","title":"extract_mappings_resolved_genome_id_bgc_id","text":"
    extract_mappings_resolved_genome_id_bgc_id(\n    genome_bgc_mappings_file: str | PathLike,\n) -> dict[str, set[str]]\n

    Extract mappings \"resolved_genome_id <-> bgc_id\".

    Parameters:

    Name Type Description Default genome_bgc_mappings_file str | PathLike

    The path to the genome BGC mappings JSON file.

    required

    Returns:

    Type Description dict[str, set[str]]

    Key is resolved genome id and value is a set of BGC ids.

    Notes

    The genome_bgc_mappings_file is usually generated by the generate_mappings_genome_id_bgc_id function with a default file name defined in nplinker.defaults.GENOME_BGC_MAPPINGS_FILENAME.

    Source code in src/nplinker/genomics/utils.py
    def extract_mappings_resolved_genome_id_bgc_id(\n    genome_bgc_mappings_file: str | PathLike,\n) -> dict[str, set[str]]:\n    \"\"\"Extract mappings \"resolved_genome_id <-> bgc_id\".\n\n    Args:\n        genome_bgc_mappings_file: The path to the genome BGC\n            mappings JSON file.\n\n    Returns:\n        Key is resolved genome id and value is a set of BGC ids.\n\n    Notes:\n        The `genome_bgc_mappings_file` is usually generated by the\n        `generate_mappings_genome_id_bgc_id` function with a default file name\n        defined in `nplinker.defaults.GENOME_BGC_MAPPINGS_FILENAME`.\n    \"\"\"\n    with open(genome_bgc_mappings_file, \"r\") as f:\n        json_data = json.load(f)\n\n    # validate the JSON data\n    validate(json_data, GENOME_BGC_MAPPINGS_SCHEMA)\n\n    return {mapping[\"genome_ID\"]: set(mapping[\"BGC_ID\"]) for mapping in json_data[\"mappings\"]}\n
    "},{"location":"api/genomics_utils/#nplinker.genomics.utils.get_mappings_strain_id_bgc_id","title":"get_mappings_strain_id_bgc_id","text":"
    get_mappings_strain_id_bgc_id(\n    mappings_strain_id_original_genome_id: Mapping[\n        str, set[str]\n    ],\n    mappings_original_genome_id_resolved_genome_id: Mapping[\n        str, str\n    ],\n    mappings_resolved_genome_id_bgc_id: Mapping[\n        str, set[str]\n    ],\n) -> dict[str, set[str]]\n

    Get mappings \"strain_id <-> bgc_id\".

    Parameters:

    Name Type Description Default mappings_strain_id_original_genome_id Mapping[str, set[str]]

    Mappings \"strain_id <-> original_genome_id\".

    required mappings_original_genome_id_resolved_genome_id Mapping[str, str]

    Mappings \"original_genome_id <-> resolved_genome_id\".

    required mappings_resolved_genome_id_bgc_id Mapping[str, set[str]]

    Mappings \"resolved_genome_id <-> bgc_id\".

    required

    Returns:

    Type Description dict[str, set[str]]

    Key is strain id and value is a set of BGC ids.

    See Also:

    - extract_mappings_strain_id_original_genome_id: Extract mappings \"strain_id <-> original_genome_id\".
    - extract_mappings_original_genome_id_resolved_genome_id: Extract mappings \"original_genome_id <-> resolved_genome_id\".
    - extract_mappings_resolved_genome_id_bgc_id: Extract mappings \"resolved_genome_id <-> bgc_id\".

    Source code in src/nplinker/genomics/utils.py
    def get_mappings_strain_id_bgc_id(\n    mappings_strain_id_original_genome_id: Mapping[str, set[str]],\n    mappings_original_genome_id_resolved_genome_id: Mapping[str, str],\n    mappings_resolved_genome_id_bgc_id: Mapping[str, set[str]],\n) -> dict[str, set[str]]:\n    \"\"\"Get mappings \"strain_id <-> bgc_id\".\n\n    Args:\n        mappings_strain_id_original_genome_id: Mappings\n            \"strain_id <-> original_genome_id\".\n        mappings_original_genome_id_resolved_genome_id: Mappings\n            \"original_genome_id <-> resolved_genome_id\".\n        mappings_resolved_genome_id_bgc_id: Mappings\n            \"resolved_genome_id <-> bgc_id\".\n\n    Returns:\n        Key is strain id and value is a set of BGC ids.\n\n    See Also:\n        - `extract_mappings_strain_id_original_genome_id`: Extract mappings\n            \"strain_id <-> original_genome_id\".\n        - `extract_mappings_original_genome_id_resolved_genome_id`: Extract mappings\n            \"original_genome_id <-> resolved_genome_id\".\n        - `extract_mappings_resolved_genome_id_bgc_id`: Extract mappings\n            \"resolved_genome_id <-> bgc_id\".\n    \"\"\"\n    mappings_dict = {}\n    for strain_id, original_genome_ids in mappings_strain_id_original_genome_id.items():\n        bgc_ids = set()\n        for original_genome_id in original_genome_ids:\n            resolved_genome_id = mappings_original_genome_id_resolved_genome_id[original_genome_id]\n            if (bgc_id := mappings_resolved_genome_id_bgc_id.get(resolved_genome_id)) is not None:\n                bgc_ids.update(bgc_id)\n        if bgc_ids:\n            mappings_dict[strain_id] = bgc_ids\n    return mappings_dict\n
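    The three extract helpers listed under See Also feed directly into this function. A sketch of the full chain; the file paths are illustrative:
    from nplinker.genomics.utils import (\n    extract_mappings_strain_id_original_genome_id,\n    extract_mappings_original_genome_id_resolved_genome_id,\n    extract_mappings_resolved_genome_id_bgc_id,\n    get_mappings_strain_id_bgc_id,\n)\n\n# compose the three mappings into \"strain_id <-> bgc_id\"\nstrain_to_bgc = get_mappings_strain_id_bgc_id(\n    extract_mappings_strain_id_original_genome_id(\"podp_project.json\"),\n    extract_mappings_original_genome_id_resolved_genome_id(\"genome_status.json\"),\n    extract_mappings_resolved_genome_id_bgc_id(\"genome_bgc_mappings.json\"),\n)\n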
    "},{"location":"api/gnps/","title":"GNPS","text":""},{"location":"api/gnps/#nplinker.metabolomics.gnps","title":"gnps","text":""},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSFormat","title":"GNPSFormat","text":"

    Bases: Enum

    Enum class for GNPS format (workflow).

    The GNPS format refers to the GNPS workflow. The name of the enum is a simple short name for the workflow, and the value of the enum is the actual name of the workflow on the GNPS website.

    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSFormat.SNETS","title":"SNETS class-attribute instance-attribute","text":"
    SNETS = 'METABOLOMICS-SNETS'\n
    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSFormat.SNETSV2","title":"SNETSV2 class-attribute instance-attribute","text":"
    SNETSV2 = 'METABOLOMICS-SNETS-V2'\n
    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSFormat.FBMN","title":"FBMN class-attribute instance-attribute","text":"
    FBMN = 'FEATURE-BASED-MOLECULAR-NETWORKING'\n
    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSFormat.Unknown","title":"Unknown class-attribute instance-attribute","text":"
    Unknown = 'Unknown-GNPS-Workflow'\n
    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSDownloader","title":"GNPSDownloader","text":"
    GNPSDownloader(task_id: str, download_root: str | PathLike)\n

    Download GNPS zip archive for the given task id.

    Note that only GNPS workflows listed in the GNPSFormat enum are supported.

    Attributes:

    Name Type Description GNPS_DATA_DOWNLOAD_URL str

    URL template for downloading GNPS data.

    GNPS_DATA_DOWNLOAD_URL_FBMN str

    URL template for downloading GNPS data for FBMN.

    Parameters:

    Name Type Description Default task_id str

    GNPS task id, identifying the data to be downloaded.

    required download_root str | PathLike

    Path where to store the downloaded archive.

    required

    Raises:

    Type Description ValueError

    If the given task id does not correspond to a supported GNPS workflow.

    Examples:

    >>> GNPSDownloader(\"c22f44b14a3d450eb836d607cb9521bb\", \"~/downloads\")\n
    Source code in src/nplinker/metabolomics/gnps/gnps_downloader.py
    def __init__(self, task_id: str, download_root: str | PathLike):\n    \"\"\"Initialize the GNPSDownloader.\n\n    Args:\n        task_id: GNPS task id, identifying the data to be downloaded.\n        download_root: Path where to store the downloaded archive.\n\n    Raises:\n        ValueError: If the given task id does not correspond to a supported\n            GNPS workflow.\n\n    Examples:\n        >>> GNPSDownloader(\"c22f44b14a3d450eb836d607cb9521bb\", \"~/downloads\")\n    \"\"\"\n    gnps_format = gnps_format_from_task_id(task_id)\n    if gnps_format == GNPSFormat.Unknown:\n        raise ValueError(\n            f\"Unknown workflow type for GNPS task '{task_id}'.\"\n            f\"Supported GNPS workflows are described in the GNPSFormat enum, \"\n            f\"including such as 'METABOLOMICS-SNETS', 'METABOLOMICS-SNETS-V2' \"\n            f\"and 'FEATURE-BASED-MOLECULAR-NETWORKING'.\"\n        )\n\n    self._task_id = task_id\n    self._download_root: Path = Path(download_root)\n    self._gnps_format = gnps_format\n    self._file_name = gnps_format.value + \"-\" + self._task_id + \".zip\"\n
    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSDownloader.GNPS_DATA_DOWNLOAD_URL","title":"GNPS_DATA_DOWNLOAD_URL class-attribute instance-attribute","text":"
    GNPS_DATA_DOWNLOAD_URL: str = (\n    \"https://gnps.ucsd.edu/ProteoSAFe/DownloadResult?task={}&view=download_clustered_spectra\"\n)\n
    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSDownloader.GNPS_DATA_DOWNLOAD_URL_FBMN","title":"GNPS_DATA_DOWNLOAD_URL_FBMN class-attribute instance-attribute","text":"
    GNPS_DATA_DOWNLOAD_URL_FBMN: str = (\n    \"https://gnps.ucsd.edu/ProteoSAFe/DownloadResult?task={}&view=download_cytoscape_data\"\n)\n
    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSDownloader.gnps_format","title":"gnps_format property","text":"
    gnps_format: GNPSFormat\n

    Get the GNPS workflow type.

Returns:

- GNPSFormat: GNPS workflow type.

    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSDownloader.download","title":"download","text":"
    download() -> 'Self'\n

    Execute the downloading process.

    Note: GNPS data is downloaded using the POST method (empty payload is OK).

    Source code in src/nplinker/metabolomics/gnps/gnps_downloader.py
    def download(self) -> \"Self\":\n    \"\"\"Execute the downloading process.\n\n    Note: GNPS data is downloaded using the POST method (empty payload is OK).\n    \"\"\"\n    download_url(\n        self.get_url(), self._download_root, filename=self._file_name, http_method=\"POST\"\n    )\n    return self\n
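Because download() returns the downloader itself, the call can be chained with the getters; a small sketch using the task id from the example above:

>>> downloader = GNPSDownloader("c22f44b14a3d450eb836d607cb9521bb", "~/downloads")
>>> zip_file = downloader.download().get_download_file()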
    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSDownloader.get_download_file","title":"get_download_file","text":"
    get_download_file() -> str\n

    Get the path to the zip file.

Returns:

- str: Download path as string.

    Source code in src/nplinker/metabolomics/gnps/gnps_downloader.py
    def get_download_file(self) -> str:\n    \"\"\"Get the path to the zip file.\n\n    Returns:\n        Download path as string\n    \"\"\"\n    return str(Path(self._download_root) / self._file_name)\n
    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSDownloader.get_task_id","title":"get_task_id","text":"
    get_task_id() -> str\n

    Get the GNPS task id.

Returns:

- str: Task id as string.

    Source code in src/nplinker/metabolomics/gnps/gnps_downloader.py
    def get_task_id(self) -> str:\n    \"\"\"Get the GNPS task id.\n\n    Returns:\n        Task id as string.\n    \"\"\"\n    return self._task_id\n
    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSDownloader.get_url","title":"get_url","text":"
    get_url() -> str\n

    Get the full URL linking to GNPS data to be downloaded.

Returns:

- str: URL pointing to the GNPS data to be downloaded.

    Source code in src/nplinker/metabolomics/gnps/gnps_downloader.py
    def get_url(self) -> str:\n    \"\"\"Get the full URL linking to GNPS data to be downloaded.\n\n    Returns:\n        URL pointing to the GNPS data to be downloaded.\n    \"\"\"\n    if self.gnps_format == GNPSFormat.FBMN:\n        return GNPSDownloader.GNPS_DATA_DOWNLOAD_URL_FBMN.format(self._task_id)\n    return GNPSDownloader.GNPS_DATA_DOWNLOAD_URL.format(self._task_id)\n
    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSExtractor","title":"GNPSExtractor","text":"
    GNPSExtractor(\n    file: str | PathLike, extract_dir: str | PathLike\n)\n

Class to extract files from a GNPS molecular networking archive (.zip).

Four files are extracted and renamed to the following names:

  • file_mappings(.tsv/.csv)
  • spectra.mgf
  • molecular_families.tsv
  • annotations.tsv

The files to be extracted are selected based on the GNPS workflow type, as described below (in the order of the files above):

    1. METABOLOMICS-SNETS
      • clusterinfosummarygroup_attributes_withIDs_withcomponentID/*.tsv
      • METABOLOMICS-SNETS*.mgf
      • networkedges_selfloop/*.pairsinfo
      • result_specnets_DB/*.tsv
    2. METABOLOMICS-SNETS-V2
      • clusterinfosummarygroup_attributes_withIDs_withcomponentID/*.clustersummary
      • METABOLOMICS-SNETS-V2*.mgf
      • networkedges_selfloop/*.selfloop
  • result_specnets_DB/*.tsv
    3. FEATURE-BASED-MOLECULAR-NETWORKING
  • quantification_table/*.csv
      • spectra/*.mgf
      • networkedges_selfloop/*.selfloop
      • DB_result/*.tsv

Parameters:

- file (str | PathLike, required): The path to the GNPS zip file.
- extract_dir (str | PathLike, required): Path to the directory where the files will be extracted.

Raises:

- ValueError: If the given file is an invalid GNPS archive.

    Examples:

    >>> gnps_extractor = GNPSExtractor(\"path/to/gnps_archive.zip\", \"path/to/extract_dir\")\n>>> gnps_extractor.gnps_format\n<GNPSFormat.SNETS: 'METABOLOMICS-SNETS'>\n>>> gnps_extractor.extract_dir\n'path/to/extract_dir'\n
    Source code in src/nplinker/metabolomics/gnps/gnps_extractor.py
    def __init__(self, file: str | PathLike, extract_dir: str | PathLike):\n    \"\"\"Initialize the GNPSExtractor.\n\n    Args:\n        file: The path to the GNPS zip file.\n        extract_dir: path to the directory where to extract the files to.\n\n    Raises:\n        ValueError: If the given file is an invalid GNPS archive.\n\n    Examples:\n        >>> gnps_extractor = GNPSExtractor(\"path/to/gnps_archive.zip\", \"path/to/extract_dir\")\n        >>> gnps_extractor.gnps_format\n        <GNPSFormat.SNETS: 'METABOLOMICS-SNETS'>\n        >>> gnps_extractor.extract_dir\n        'path/to/extract_dir'\n    \"\"\"\n    gnps_format = gnps_format_from_archive(file)\n    if gnps_format == GNPSFormat.Unknown:\n        raise ValueError(\n            f\"Unknown workflow type for GNPS archive '{file}'.\"\n            f\"Supported GNPS workflows are described in the GNPSFormat enum, \"\n            f\"including such as 'METABOLOMICS-SNETS', 'METABOLOMICS-SNETS-V2' \"\n            f\"and 'FEATURE-BASED-MOLECULAR-NETWORKING'.\"\n        )\n\n    self._file = Path(file)\n    self._extract_path = Path(extract_dir)\n    self._gnps_format = gnps_format\n    # the order of filenames matters\n    self._target_files = [\n        \"file_mappings\",\n        \"spectra.mgf\",\n        \"molecular_families.tsv\",\n        \"annotations.tsv\",\n    ]\n\n    self._extract()\n
    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSExtractor.gnps_format","title":"gnps_format property","text":"
    gnps_format: GNPSFormat\n

    Get the GNPS workflow type.

Returns:

- GNPSFormat: GNPS workflow type.

    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSExtractor.extract_dir","title":"extract_dir property","text":"
    extract_dir: str\n

Get the directory to which the files are extracted.

Returns:

- str: Extraction path as string.

    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSSpectrumLoader","title":"GNPSSpectrumLoader","text":"
    GNPSSpectrumLoader(file: str | PathLike)\n

    Bases: SpectrumLoaderBase

    Class to load mass spectra from the given GNPS MGF file.

The mass spectra file (MGF) is from the GNPS output archive; its location is described below for each GNPS workflow type:

    1. METABOLOMICS-SNETS
      • METABOLOMICS-SNETS*.mgf
    2. METABOLOMICS-SNETS-V2
      • METABOLOMICS-SNETS-V2*.mgf
    3. FEATURE-BASED-MOLECULAR-NETWORKING
      • spectra/*.mgf

Parameters:

- file (str | PathLike, required): Path to the MGF file.

Raises:

- ValueError: Raised if the file is not valid.

    Examples:

    >>> loader = GNPSSpectrumLoader(\"gnps_spectra.mgf\")\n>>> print(loader.spectra[0])\n
    Source code in src/nplinker/metabolomics/gnps/gnps_spectrum_loader.py
    def __init__(self, file: str | PathLike):\n    \"\"\"Initialize the GNPSSpectrumLoader.\n\n    Args:\n        file: path to the MGF file.\n\n    Raises:\n        ValueError: Raises ValueError if the file is not valid.\n\n    Examples:\n        >>> loader = GNPSSpectrumLoader(\"gnps_spectra.mgf\")\n        >>> print(loader.spectra[0])\n    \"\"\"\n    self._file = str(file)\n    self._spectra: list[Spectrum] = []\n\n    self._validate()\n    self._load()\n
    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSSpectrumLoader.spectra","title":"spectra property","text":"
    spectra: list[Spectrum]\n

    Get the list of Spectrum objects.

Returns:

- list[Spectrum]: The loaded spectra as a list of Spectrum objects.

    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSMolecularFamilyLoader","title":"GNPSMolecularFamilyLoader","text":"
    GNPSMolecularFamilyLoader(file: str | PathLike)\n

    Bases: MolecularFamilyLoaderBase

    Class to load molecular families from GNPS output file.

    The molecular family file is from GNPS output archive, as described below for each GNPS workflow type:

    1. METABOLOMICS-SNETS
      • networkedges_selfloop/*.pairsinfo
    2. METABOLOMICS-SNETS-V2
      • networkedges_selfloop/*.selfloop
    3. FEATURE-BASED-MOLECULAR-NETWORKING
      • networkedges_selfloop/*.selfloop

    The \"ComponentIndex\" column in the GNPS molecular family's file is treated as family id. But for molecular families that have only one member (i.e. spectrum), named singleton molecular families, their files have the same value of \"-1\" in the \"ComponentIndex\" column. To make the family id unique,the spectrum id plus a prefix singleton- is used as the family id of singleton molecular families.

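As an illustration of the naming rule (a hedged sketch; loader is assumed to be a GNPSMolecularFamilyLoader whose only singleton family contains the spectrum with id "42"):

>>> singletons = [mf for mf in loader.get_mfs(keep_singleton=True) if mf.is_singleton()]
>>> singletons[0].id
'singleton-42'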
Parameters:

- file (str | PathLike, required): Path to the GNPS molecular family file.

Raises:

- ValueError: Raised if the file is not valid.

    Examples:

    >>> loader = GNPSMolecularFamilyLoader(\"gnps_molecular_families.tsv\")\n>>> print(loader.families)\n[<MolecularFamily 1>, <MolecularFamily 2>, ...]\n>>> print(loader.families[0].spectra_ids)\n{'1', '3', '7', ...}\n
    Source code in src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py
    def __init__(self, file: str | PathLike):\n    \"\"\"Initialize the GNPSMolecularFamilyLoader.\n\n    Args:\n        file: Path to the GNPS molecular family file.\n\n    Raises:\n        ValueError: Raises ValueError if the file is not valid.\n\n    Examples:\n        >>> loader = GNPSMolecularFamilyLoader(\"gnps_molecular_families.tsv\")\n        >>> print(loader.families)\n        [<MolecularFamily 1>, <MolecularFamily 2>, ...]\n        >>> print(loader.families[0].spectra_ids)\n        {'1', '3', '7', ...}\n    \"\"\"\n    self._mfs: list[MolecularFamily] = []\n    self._file = file\n\n    self._validate()\n    self._load()\n
    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSMolecularFamilyLoader.get_mfs","title":"get_mfs","text":"
    get_mfs(\n    keep_singleton: bool = False,\n) -> list[MolecularFamily]\n

    Get MolecularFamily objects.

Parameters:

- keep_singleton (bool, default False): True to keep singleton molecular families. A singleton molecular family is a molecular family that contains only one spectrum.

Returns:

- list[MolecularFamily]: A list of MolecularFamily objects with their spectra ids.

    Source code in src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py
    def get_mfs(self, keep_singleton: bool = False) -> list[MolecularFamily]:\n    \"\"\"Get MolecularFamily objects.\n\n    Args:\n        keep_singleton: True to keep singleton molecular families. A\n            singleton molecular family is a molecular family that contains\n            only one spectrum.\n\n    Returns:\n        A list of MolecularFamily objects with their spectra ids.\n    \"\"\"\n    mfs = self._mfs\n    if not keep_singleton:\n        mfs = [mf for mf in mfs if not mf.is_singleton()]\n    return mfs\n
    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSAnnotationLoader","title":"GNPSAnnotationLoader","text":"
    GNPSAnnotationLoader(file: str | PathLike)\n

    Bases: AnnotationLoaderBase

    Load annotations from GNPS output file.

    The annotation file is a .tsv file from GNPS output archive, as described below for each GNPS workflow type:

    1. METABOLOMICS-SNETS
      • result_specnets_DB/*.tsv
    2. METABOLOMICS-SNETS-V2
  • result_specnets_DB/*.tsv
    3. FEATURE-BASED-MOLECULAR-NETWORKING
      • DB_result/*.tsv

Parameters:

- file (str | PathLike, required): The GNPS annotation file.

    Examples:

    >>> loader = GNPSAnnotationLoader(\"gnps_annotations.tsv\")\n>>> print(loader.annotations[\"100\"])\n{'#Scan#': '100',\n'Adduct': 'M+H',\n'CAS_Number': 'N/A',\n'Charge': '1',\n'Compound_Name': 'MLS002153841-01!Iobenguane sulfate',\n'Compound_Source': 'NIH Pharmacologically Active Library',\n'Data_Collector': 'VP/LMS',\n'ExactMass': '274.992',\n'INCHI': 'N/A',\n'INCHI_AUX': 'N/A',\n'Instrument': 'qTof',\n'IonMode': 'Positive',\n'Ion_Source': 'LC-ESI',\n'LibMZ': '276.003',\n'LibraryName': 'lib-00014.mgf',\n'LibraryQualityString': 'Gold',\n'Library_Class': '1',\n'MQScore': '0.704152',\n'MZErrorPPM': '405416',\n'MassDiff': '111.896',\n'Organism': 'GNPS-NIH-SMALLMOLECULEPHARMACOLOGICALLYACTIVE',\n'PI': 'Dorrestein',\n'Precursor_MZ': '276.003',\n'Pubmed_ID': 'N/A',\n'RT_Query': '795.979',\n'SharedPeaks': '7',\n'Smiles': 'NC(=N)NCc1cccc(I)c1.OS(=O)(=O)O',\n'SpecCharge': '1',\n'SpecMZ': '164.107',\n'SpectrumFile': 'spectra/specs_ms.pklbin',\n'SpectrumID': 'CCMSLIB00000086167',\n'TIC_Query': '986.997',\n'UpdateWorkflowName': 'UPDATE-SINGLE-ANNOTATED-GOLD',\n'tags': ' ',\n'png_url': 'https://metabolomics-usi.gnps2.org/png/?usi1=mzspec:GNPS:GNPS-LIBRARY:accession:CCMSLIB00000086167',\n'json_url': 'https://metabolomics-usi.gnps2.org/json/?usi1=mzspec:GNPS:GNPS-LIBRARY:accession:CCMSLIB00000086167',\n'svg_url': 'https://metabolomics-usi.gnps2.org/svg/?usi1=mzspec:GNPS:GNPS-LIBRARY:accession:CCMSLIB00000086167',\n'spectrum_url': 'https://metabolomics-usi.gnps2.org/spectrum/?usi1=mzspec:GNPS:GNPS-LIBRARY:accession:CCMSLIB00000086167'}\n
    Source code in src/nplinker/metabolomics/gnps/gnps_annotation_loader.py
    def __init__(self, file: str | PathLike):\n    \"\"\"Initialize the GNPSAnnotationLoader.\n\n    Args:\n        file: The GNPS annotation file.\n\n    Examples:\n        >>> loader = GNPSAnnotationLoader(\"gnps_annotations.tsv\")\n        >>> print(loader.annotations[\"100\"])\n        {'#Scan#': '100',\n        'Adduct': 'M+H',\n        'CAS_Number': 'N/A',\n        'Charge': '1',\n        'Compound_Name': 'MLS002153841-01!Iobenguane sulfate',\n        'Compound_Source': 'NIH Pharmacologically Active Library',\n        'Data_Collector': 'VP/LMS',\n        'ExactMass': '274.992',\n        'INCHI': 'N/A',\n        'INCHI_AUX': 'N/A',\n        'Instrument': 'qTof',\n        'IonMode': 'Positive',\n        'Ion_Source': 'LC-ESI',\n        'LibMZ': '276.003',\n        'LibraryName': 'lib-00014.mgf',\n        'LibraryQualityString': 'Gold',\n        'Library_Class': '1',\n        'MQScore': '0.704152',\n        'MZErrorPPM': '405416',\n        'MassDiff': '111.896',\n        'Organism': 'GNPS-NIH-SMALLMOLECULEPHARMACOLOGICALLYACTIVE',\n        'PI': 'Dorrestein',\n        'Precursor_MZ': '276.003',\n        'Pubmed_ID': 'N/A',\n        'RT_Query': '795.979',\n        'SharedPeaks': '7',\n        'Smiles': 'NC(=N)NCc1cccc(I)c1.OS(=O)(=O)O',\n        'SpecCharge': '1',\n        'SpecMZ': '164.107',\n        'SpectrumFile': 'spectra/specs_ms.pklbin',\n        'SpectrumID': 'CCMSLIB00000086167',\n        'TIC_Query': '986.997',\n        'UpdateWorkflowName': 'UPDATE-SINGLE-ANNOTATED-GOLD',\n        'tags': ' ',\n        'png_url': 'https://metabolomics-usi.gnps2.org/png/?usi1=mzspec:GNPS:GNPS-LIBRARY:accession:CCMSLIB00000086167',\n        'json_url': 'https://metabolomics-usi.gnps2.org/json/?usi1=mzspec:GNPS:GNPS-LIBRARY:accession:CCMSLIB00000086167',\n        'svg_url': 'https://metabolomics-usi.gnps2.org/svg/?usi1=mzspec:GNPS:GNPS-LIBRARY:accession:CCMSLIB00000086167',\n        'spectrum_url': 'https://metabolomics-usi.gnps2.org/spectrum/?usi1=mzspec:GNPS:GNPS-LIBRARY:accession:CCMSLIB00000086167'}\n    \"\"\"\n    self._file = Path(file)\n    self._annotations: dict[str, dict] = {}\n\n    self._validate()\n    self._load()\n
    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSAnnotationLoader.annotations","title":"annotations property","text":"
    annotations: dict[str, dict]\n

    Get annotations.

Returns:

- dict[str, dict]: Keys are spectrum ids ("#Scan#" in the annotation file) and values are the annotations dict for each spectrum.

    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSFileMappingLoader","title":"GNPSFileMappingLoader","text":"
    GNPSFileMappingLoader(file: str | PathLike)\n

    Bases: FileMappingLoaderBase

    Class to load file mappings from GNPS output file.

    File mappings refers to the mapping from spectrum id to files in which this spectrum occurs.

    The file mappings file is from GNPS output archive, as described below for each GNPS workflow type:

    1. METABOLOMICS-SNETS
      • clusterinfosummarygroup_attributes_withIDs_withcomponentID/*.tsv
    2. METABOLOMICS-SNETS-V2
      • clusterinfosummarygroup_attributes_withIDs_withcomponentID/*.clustersummary
    3. FEATURE-BASED-MOLECULAR-NETWORKING
  • quantification_table/*.csv

Parameters:

- file (str | PathLike, required): Path to the GNPS file mappings file.

Raises:

- ValueError: Raised if the file is not valid.

    Examples:

    >>> loader = GNPSFileMappingLoader(\"gnps_file_mappings.tsv\")\n>>> print(loader.mappings[\"1\"])\n['26c.mzXML']\n>>> print(loader.mapping_reversed[\"26c.mzXML\"])\n{'1', '3', '7', ...}\n
    Source code in src/nplinker/metabolomics/gnps/gnps_file_mapping_loader.py
    def __init__(self, file: str | PathLike):\n    \"\"\"Initialize the GNPSFileMappingLoader.\n\n    Args:\n        file: Path to the GNPS file mappings file.\n\n    Raises:\n        ValueError: Raises ValueError if the file is not valid.\n\n    Examples:\n        >>> loader = GNPSFileMappingLoader(\"gnps_file_mappings.tsv\")\n        >>> print(loader.mappings[\"1\"])\n        ['26c.mzXML']\n        >>> print(loader.mapping_reversed[\"26c.mzXML\"])\n        {'1', '3', '7', ...}\n    \"\"\"\n    self._gnps_format = gnps_format_from_file_mapping(file)\n    if self._gnps_format is GNPSFormat.Unknown:\n        raise ValueError(\"Unknown workflow type for GNPS file mappings file \")\n\n    self._file = Path(file)\n    self._mapping: dict[str, list[str]] = {}\n\n    self._validate()\n    self._load()\n
    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSFileMappingLoader.mappings","title":"mappings property","text":"
    mappings: dict[str, list[str]]\n

    Return mapping from spectrum id to files in which this spectrum occurs.

Returns:

- dict[str, list[str]]: Mapping from spectrum id to the names of all files in which this spectrum occurs.

    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.GNPSFileMappingLoader.mapping_reversed","title":"mapping_reversed property","text":"
    mapping_reversed: dict[str, set[str]]\n

    Return mapping from file name to all spectra that occur in this file.

Returns:

- dict[str, set[str]]: Mapping from file name to all spectrum ids that occur in this file.

    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.gnps_format_from_archive","title":"gnps_format_from_archive","text":"
    gnps_format_from_archive(\n    zip_file: str | PathLike,\n) -> GNPSFormat\n

    Detect GNPS format from a downloaded GNPS zip archive.

    The detection is based on the filename of the zip file and the names of the files contained in the zip file.

Parameters:

- zip_file (str | PathLike, required): Path to the downloaded GNPS zip file.

Returns:

- GNPSFormat: The format identified in the GNPS zip file.

    Examples:

    >>> gnps_format_from_archive(\"downloads/ProteoSAFe-METABOLOMICS-SNETS-c22f44b1-download_clustered_spectra.zip\") == GNPSFormat.SNETS\n>>> gnps_format_from_archive(\"downloads/ProteoSAFe-METABOLOMICS-SNETS-V2-189e8bf1-download_clustered_spectra.zip\") == GNPSFormat.SNETSV2\n>>> gnps_format_from_archive(\"downloads/ProteoSAFe-FEATURE-BASED-MOLECULAR-NETWORKING-672d0a53-download_cytoscape_data.zip\") == GNPSFormat.FBMN\n
    Source code in src/nplinker/metabolomics/gnps/gnps_format.py
    def gnps_format_from_archive(zip_file: str | PathLike) -> GNPSFormat:\n    \"\"\"Detect GNPS format from a downloaded GNPS zip archive.\n\n    The detection is based on the filename of the zip file and the names of the\n    files contained in the zip file.\n\n    Args:\n        zip_file: Path to the downloaded GNPS zip file.\n\n    Returns:\n        The format identified in the GNPS zip file.\n\n    Examples:\n        >>> gnps_format_from_archive(\"downloads/ProteoSAFe-METABOLOMICS-SNETS-c22f44b1-download_clustered_spectra.zip\") == GNPSFormat.SNETS\n        >>> gnps_format_from_archive(\"downloads/ProteoSAFe-METABOLOMICS-SNETS-V2-189e8bf1-download_clustered_spectra.zip\") == GNPSFormat.SNETSV2\n        >>> gnps_format_from_archive(\"downloads/ProteoSAFe-FEATURE-BASED-MOLECULAR-NETWORKING-672d0a53-download_cytoscape_data.zip\") == GNPSFormat.FBMN\n    \"\"\"\n    file = Path(zip_file)\n    # Guess the format from the filename of the zip file\n    if GNPSFormat.FBMN.value in file.name:\n        return GNPSFormat.FBMN\n    # the order of the if statements matters for the following two\n    if GNPSFormat.SNETSV2.value in file.name:\n        return GNPSFormat.SNETSV2\n    if GNPSFormat.SNETS.value in file.name:\n        return GNPSFormat.SNETS\n\n    # Guess the format from the names of the files in the zip file\n    with zipfile.ZipFile(file) as archive:\n        filenames = archive.namelist()\n    if any(GNPSFormat.FBMN.value in x for x in filenames):\n        return GNPSFormat.FBMN\n    # the order of the if statements matters for the following two\n    if any(GNPSFormat.SNETSV2.value in x for x in filenames):\n        return GNPSFormat.SNETSV2\n    if any(GNPSFormat.SNETS.value in x for x in filenames):\n        return GNPSFormat.SNETS\n\n    return GNPSFormat.Unknown\n
    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.gnps_format_from_file_mapping","title":"gnps_format_from_file_mapping","text":"
    gnps_format_from_file_mapping(\n    file: str | PathLike,\n) -> GNPSFormat\n

    Detect GNPS format from the given file mapping file.

The GNPS file mapping file is located in a different folder depending on the GNPS workflow. Here are the locations in the corresponding GNPS zip archives:

- METABOLOMICS-SNETS workflow: the .tsv file under the folder "clusterinfosummarygroup_attributes_withIDs_withcomponentID"
- METABOLOMICS-SNETS-V2 workflow: the .clustersummary file (tsv) under the folder "clusterinfosummarygroup_attributes_withIDs_withcomponentID"
- FEATURE-BASED-MOLECULAR-NETWORKING workflow: the .csv file under the folder "quantification_table"

Parameters:

- file (str | PathLike, required): Path to the file for which to detect the format.

Returns:

- GNPSFormat: GNPS format identified in the file.

    Source code in src/nplinker/metabolomics/gnps/gnps_format.py
    def gnps_format_from_file_mapping(file: str | PathLike) -> GNPSFormat:\n    \"\"\"Detect GNPS format from the given file mapping file.\n\n    The GNPS file mapping file is located in different folders depending on the\n    GNPS workflow. Here are the locations in corresponding GNPS zip archives:\n\n    - METABOLOMICS-SNETS workflow: the .tsv file under folder \"clusterinfosummarygroup_attributes_withIDs_withcomponentID\"\n    - METABOLOMICS-SNETS-V2 workflow: the .clustersummary file (tsv) under folder \"clusterinfosummarygroup_attributes_withIDs_withcomponentID\"\n    - FEATURE-BASED-MOLECULAR-NETWORKING workflow: the .csv file under folder \"quantification_table\"\n\n    Args:\n        file: Path to the file to peek the format for.\n\n    Returns:\n        GNPS format identified in the file.\n    \"\"\"\n    headers = get_headers(file)\n    if \"AllFiles\" in headers:\n        return GNPSFormat.SNETS\n    if \"UniqueFileSources\" in headers:\n        return GNPSFormat.SNETSV2\n    if \"row ID\" in headers:\n        return GNPSFormat.FBMN\n    return GNPSFormat.Unknown\n
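A hedged illustration of the header-based detection; the file path is hypothetical and only the column headers matter:

>>> # a quantification table with a "row ID" column is detected as FBMN
>>> gnps_format_from_file_mapping("quantification_table/quant.csv")
<GNPSFormat.FBMN: 'FEATURE-BASED-MOLECULAR-NETWORKING'>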
    "},{"location":"api/gnps/#nplinker.metabolomics.gnps.gnps_format_from_task_id","title":"gnps_format_from_task_id","text":"
    gnps_format_from_task_id(task_id: str) -> GNPSFormat\n

    Detect GNPS format for the given task id.

Parameters:

- task_id (str, required): GNPS task id.

Returns:

- GNPSFormat: The format identified in the GNPS task.

    Examples:

    >>> gnps_format_from_task_id(\"c22f44b14a3d450eb836d607cb9521bb\") == GNPSFormat.SNETS\n>>> gnps_format_from_task_id(\"189e8bf16af145758b0a900f1c44ff4a\") == GNPSFormat.SNETSV2\n>>> gnps_format_from_task_id(\"92036537c21b44c29e509291e53f6382\") == GNPSFormat.FBMN\n>>> gnps_format_from_task_id(\"0ad6535e34d449788f297e712f43068a\") == GNPSFormat.Unknown\n
    Source code in src/nplinker/metabolomics/gnps/gnps_format.py
    def gnps_format_from_task_id(task_id: str) -> GNPSFormat:\n    \"\"\"Detect GNPS format for the given task id.\n\n    Args:\n        task_id: GNPS task id.\n\n    Returns:\n        The format identified in the GNPS task.\n\n    Examples:\n        >>> gnps_format_from_task_id(\"c22f44b14a3d450eb836d607cb9521bb\") == GNPSFormat.SNETS\n        >>> gnps_format_from_task_id(\"189e8bf16af145758b0a900f1c44ff4a\") == GNPSFormat.SNETSV2\n        >>> gnps_format_from_task_id(\"92036537c21b44c29e509291e53f6382\") == GNPSFormat.FBMN\n        >>> gnps_format_from_task_id(\"0ad6535e34d449788f297e712f43068a\") == GNPSFormat.Unknown\n    \"\"\"\n    task_html = httpx.get(GNPS_TASK_URL.format(task_id))\n    soup = BeautifulSoup(task_html.text, features=\"html.parser\")\n    try:\n        # find the td tag that follows the th tag containing 'Workflow'\n        workflow_tag = soup.find(\"th\", string=\"Workflow\").find_next_sibling(\"td\")  # type: ignore\n        workflow_format = workflow_tag.contents[0].strip()  # type: ignore\n    except AttributeError:\n        return GNPSFormat.Unknown\n\n    if workflow_format == GNPSFormat.FBMN.value:\n        return GNPSFormat.FBMN\n    if workflow_format == GNPSFormat.SNETSV2.value:\n        return GNPSFormat.SNETSV2\n    if workflow_format == GNPSFormat.SNETS.value:\n        return GNPSFormat.SNETS\n    return GNPSFormat.Unknown\n
    "},{"location":"api/loader/","title":"Dataset Loader","text":""},{"location":"api/loader/#nplinker.loader","title":"loader","text":""},{"location":"api/loader/#nplinker.loader.logger","title":"logger module-attribute","text":"
    logger = getLogger(__name__)\n
    "},{"location":"api/loader/#nplinker.loader.DatasetLoader","title":"DatasetLoader","text":"
    DatasetLoader(config: Dynaconf)\n

    Class to load all data.

Attributes:

- config: A Dynaconf object that contains the configuration settings. Check the nplinker.config module for more information.
- bgcs (list[BGC]): A list of BGC objects.
- gcfs (list[GCF]): A list of GCF objects.
- spectra (list[Spectrum]): A list of Spectrum objects.
- mfs (list[MolecularFamily]): A list of MolecularFamily objects.
- mibig_bgcs (list[BGC]): A list of MIBiG BGC objects.
- mibig_strains_in_use (StrainCollection): A StrainCollection object that contains the strains in use from MIBiG.
- product_types (list): A list of product types.
- strains (StrainCollection): A StrainCollection object that contains all strains.
- class_matches: A ClassMatches object that contains class match info.
- chem_classes: A ChemClassPredictions object that contains chemical class predictions.

Parameters:

- config (Dynaconf, required): A Dynaconf object that contains the configuration settings. Check the nplinker.config module for more information.

Source code in src/nplinker/loader.py
    def __init__(self, config: Dynaconf):\n    \"\"\"Initialize the DatasetLoader.\n\n    Args:\n        config: A Dynaconf object that contains the configuration settings. Check the\n            `nplinker.config` module for more information.\n    \"\"\"\n    self.config = config\n\n    self.bgcs: list[BGC] = []\n    self.gcfs: list[GCF] = []\n    self.spectra: list[Spectrum] = []\n    self.mfs: list[MolecularFamily] = []\n    self.mibig_bgcs: list[BGC] = []\n    self.mibig_strains_in_use: StrainCollection = StrainCollection()\n    self.product_types: list = []\n    self.strains: StrainCollection = StrainCollection()\n\n    self.class_matches = None\n    self.chem_classes = None\n
    "},{"location":"api/loader/#nplinker.loader.DatasetLoader.RUN_CANOPUS_DEFAULT","title":"RUN_CANOPUS_DEFAULT class-attribute instance-attribute","text":"
    RUN_CANOPUS_DEFAULT = False\n
    "},{"location":"api/loader/#nplinker.loader.DatasetLoader.EXTRA_CANOPUS_PARAMS_DEFAULT","title":"EXTRA_CANOPUS_PARAMS_DEFAULT class-attribute instance-attribute","text":"
    EXTRA_CANOPUS_PARAMS_DEFAULT = (\n    \"--maxmz 600 formula zodiac structure canopus\"\n)\n
    "},{"location":"api/loader/#nplinker.loader.DatasetLoader.OR_CANOPUS","title":"OR_CANOPUS class-attribute instance-attribute","text":"
    OR_CANOPUS = 'canopus_dir'\n
    "},{"location":"api/loader/#nplinker.loader.DatasetLoader.OR_MOLNETENHANCER","title":"OR_MOLNETENHANCER class-attribute instance-attribute","text":"
    OR_MOLNETENHANCER = 'molnetenhancer_dir'\n
    "},{"location":"api/loader/#nplinker.loader.DatasetLoader.config","title":"config instance-attribute","text":"
    config = config\n
    "},{"location":"api/loader/#nplinker.loader.DatasetLoader.bgcs","title":"bgcs instance-attribute","text":"
    bgcs: list[BGC] = []\n
    "},{"location":"api/loader/#nplinker.loader.DatasetLoader.gcfs","title":"gcfs instance-attribute","text":"
    gcfs: list[GCF] = []\n
    "},{"location":"api/loader/#nplinker.loader.DatasetLoader.spectra","title":"spectra instance-attribute","text":"
    spectra: list[Spectrum] = []\n
    "},{"location":"api/loader/#nplinker.loader.DatasetLoader.mfs","title":"mfs instance-attribute","text":"
    mfs: list[MolecularFamily] = []\n
    "},{"location":"api/loader/#nplinker.loader.DatasetLoader.mibig_bgcs","title":"mibig_bgcs instance-attribute","text":"
    mibig_bgcs: list[BGC] = []\n
    "},{"location":"api/loader/#nplinker.loader.DatasetLoader.mibig_strains_in_use","title":"mibig_strains_in_use instance-attribute","text":"
    mibig_strains_in_use: StrainCollection = StrainCollection()\n
    "},{"location":"api/loader/#nplinker.loader.DatasetLoader.product_types","title":"product_types instance-attribute","text":"
    product_types: list = []\n
    "},{"location":"api/loader/#nplinker.loader.DatasetLoader.strains","title":"strains instance-attribute","text":"
    strains: StrainCollection = StrainCollection()\n
    "},{"location":"api/loader/#nplinker.loader.DatasetLoader.class_matches","title":"class_matches instance-attribute","text":"
    class_matches = None\n
    "},{"location":"api/loader/#nplinker.loader.DatasetLoader.chem_classes","title":"chem_classes instance-attribute","text":"
    chem_classes = None\n
    "},{"location":"api/loader/#nplinker.loader.DatasetLoader.load","title":"load","text":"
    load()\n

    Load all data.

    Source code in src/nplinker/loader.py
    def load(self):\n    \"\"\"Load all data.\"\"\"\n    if not self._load_strain_mappings():\n        return False\n\n    if not self._load_metabolomics():\n        return False\n\n    if not self._load_genomics():\n        return False\n\n    # set self.strains with all strains from input plus mibig strains in use\n    self.strains = self.strains + self.mibig_strains_in_use\n\n    if len(self.strains) == 0:\n        raise Exception(\"Failed to find *ANY* strains.\")\n\n    return True\n
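A minimal usage sketch, assuming config is a valid Dynaconf object (see the nplinker.config module):

>>> loader = DatasetLoader(config)
>>> if loader.load():
...     print(f"{len(loader.bgcs)} BGCs, {len(loader.spectra)} spectra loaded")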
    "},{"location":"api/metabolomics/","title":"Data Models","text":""},{"location":"api/metabolomics/#nplinker.metabolomics","title":"metabolomics","text":""},{"location":"api/metabolomics/#nplinker.metabolomics.MolecularFamily","title":"MolecularFamily","text":"
    MolecularFamily(id: str)\n

    Class to model molecular family.

Attributes:

- id (str): Unique id for the molecular family.
- spectra_ids (set[str]): Set of spectrum ids in the molecular family.

Parameters:

- id (str, required): Unique id for the molecular family.

Source code in src/nplinker/metabolomics/molecular_family.py
    def __init__(self, id: str):\n    \"\"\"Initialize the MolecularFamily.\n\n    Args:\n        id: Unique id for the molecular family.\n    \"\"\"\n    self.id: str = id\n    self.spectra_ids: set[str] = set()\n    self._spectra: set[Spectrum] = set()\n    self._strains: StrainCollection = StrainCollection()\n
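A small sketch of building a molecular family by hand; the ids and peak values are illustrative:

>>> spec = Spectrum("42", mz=[100.0], intensity=[1.0], precursor_mz=150.0)
>>> mf = MolecularFamily("1")
>>> mf.add_spectrum(spec)
>>> mf.spectra_ids
{'42'}
>>> spec.family is mf
True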
    "},{"location":"api/metabolomics/#nplinker.metabolomics.MolecularFamily.id","title":"id instance-attribute","text":"
    id: str = id\n
    "},{"location":"api/metabolomics/#nplinker.metabolomics.MolecularFamily.spectra_ids","title":"spectra_ids instance-attribute","text":"
    spectra_ids: set[str] = set()\n
    "},{"location":"api/metabolomics/#nplinker.metabolomics.MolecularFamily.spectra","title":"spectra property","text":"
    spectra: set[Spectrum]\n

    Get Spectrum objects in the molecular family.

    "},{"location":"api/metabolomics/#nplinker.metabolomics.MolecularFamily.strains","title":"strains property","text":"
    strains: StrainCollection\n

    Get strains in the molecular family.

    "},{"location":"api/metabolomics/#nplinker.metabolomics.MolecularFamily.add_spectrum","title":"add_spectrum","text":"
    add_spectrum(spectrum: Spectrum) -> None\n

    Add a Spectrum object to the molecular family.

Parameters:

- spectrum (Spectrum, required): Spectrum object to add to the molecular family.

Source code in src/nplinker/metabolomics/molecular_family.py
    def add_spectrum(self, spectrum: Spectrum) -> None:\n    \"\"\"Add a Spectrum object to the molecular family.\n\n    Args:\n        spectrum: `Spectrum` object to add to the molecular family.\n    \"\"\"\n    self._spectra.add(spectrum)\n    self.spectra_ids.add(spectrum.id)\n    self._strains = self._strains + spectrum.strains\n    # add the molecular family to the spectrum\n    spectrum.family = self\n
    "},{"location":"api/metabolomics/#nplinker.metabolomics.MolecularFamily.detach_spectrum","title":"detach_spectrum","text":"
    detach_spectrum(spectrum: Spectrum) -> None\n

    Remove a Spectrum object from the molecular family.

Parameters:

- spectrum (Spectrum, required): Spectrum object to remove from the molecular family.

Source code in src/nplinker/metabolomics/molecular_family.py
    def detach_spectrum(self, spectrum: Spectrum) -> None:\n    \"\"\"Remove a Spectrum object from the molecular family.\n\n    Args:\n        spectrum: `Spectrum` object to remove from the molecular family.\n    \"\"\"\n    self._spectra.remove(spectrum)\n    self.spectra_ids.remove(spectrum.id)\n    self._strains = self._update_strains()\n    # remove the molecular family from the spectrum\n    spectrum.family = None\n
    "},{"location":"api/metabolomics/#nplinker.metabolomics.MolecularFamily.has_strain","title":"has_strain","text":"
    has_strain(strain: Strain) -> bool\n

    Check if the given strain exists.

Parameters:

- strain (Strain, required): Strain object.

Returns:

- bool: True when the given strain exists.

    Source code in src/nplinker/metabolomics/molecular_family.py
    def has_strain(self, strain: Strain) -> bool:\n    \"\"\"Check if the given strain exists.\n\n    Args:\n        strain: `Strain` object.\n\n    Returns:\n        True when the given strain exists.\n    \"\"\"\n    return strain in self._strains\n
    "},{"location":"api/metabolomics/#nplinker.metabolomics.MolecularFamily.is_singleton","title":"is_singleton","text":"
    is_singleton() -> bool\n

    Check if the molecular family contains only one spectrum.

Returns:

- bool: True when MolecularFamily.spectra_ids contains only one spectrum id.

    Source code in src/nplinker/metabolomics/molecular_family.py
    def is_singleton(self) -> bool:\n    \"\"\"Check if the molecular family contains only one spectrum.\n\n    Returns:\n        True when `MolecularFamily.spectra_ids` contains only one spectrum id.\n    \"\"\"\n    return len(self.spectra_ids) == 1\n
    "},{"location":"api/metabolomics/#nplinker.metabolomics.Spectrum","title":"Spectrum","text":"
    Spectrum(\n    id: str,\n    mz: list[float],\n    intensity: list[float],\n    precursor_mz: float,\n    rt: float = 0,\n    metadata: dict | None = None,\n)\n

    Class to model MS/MS Spectrum.

Attributes:

- id: The spectrum ID.
- mz: The list of m/z values.
- intensity: The list of intensity values.
- precursor_mz: The m/z value of the precursor.
- rt: The retention time in seconds.
- metadata: The metadata of the spectrum, i.e. the header information in the MGF file.
- gnps_annotations (dict): The GNPS annotations of the spectrum.
- gnps_id (str | None): The GNPS ID of the spectrum.
- strains (StrainCollection): The strains that this spectrum belongs to.
- family (MolecularFamily | None): The molecular family that this spectrum belongs to.
- peaks (ndarray): 2D array of peaks, each row is a peak of (m/z, intensity) values.

Parameters:

- id (str, required): The spectrum ID.
- mz (list[float], required): The list of m/z values.
- intensity (list[float], required): The list of intensity values.
- precursor_mz (float, required): The precursor m/z.
- rt (float, default 0): The retention time in seconds.
- metadata (dict | None, default None): The metadata of the spectrum, i.e. the header information in the MGF file.

Source code in src/nplinker/metabolomics/spectrum.py
    def __init__(\n    self,\n    id: str,\n    mz: list[float],\n    intensity: list[float],\n    precursor_mz: float,\n    rt: float = 0,\n    metadata: dict | None = None,\n) -> None:\n    \"\"\"Initialize the Spectrum.\n\n    Args:\n        id: the spectrum ID.\n        mz: the list of m/z values.\n        intensity: the list of intensity values.\n        precursor_mz: the precursor m/z.\n        rt: the retention time in seconds. Defaults to 0.\n        metadata: the metadata of the spectrum, i.e. the header information\n            in the MGF file.\n    \"\"\"\n    self.id = id\n    self.mz = mz\n    self.intensity = intensity\n    self.precursor_mz = precursor_mz\n    self.rt = rt\n    self.metadata = metadata or {}\n\n    self.gnps_annotations: dict = {}\n    self.gnps_id: str | None = None\n    self.strains: StrainCollection = StrainCollection()\n    self.family: MolecularFamily | None = None\n
    "},{"location":"api/metabolomics/#nplinker.metabolomics.Spectrum.id","title":"id instance-attribute","text":"
    id = id\n
    "},{"location":"api/metabolomics/#nplinker.metabolomics.Spectrum.mz","title":"mz instance-attribute","text":"
    mz = mz\n
    "},{"location":"api/metabolomics/#nplinker.metabolomics.Spectrum.intensity","title":"intensity instance-attribute","text":"
    intensity = intensity\n
    "},{"location":"api/metabolomics/#nplinker.metabolomics.Spectrum.precursor_mz","title":"precursor_mz instance-attribute","text":"
    precursor_mz = precursor_mz\n
    "},{"location":"api/metabolomics/#nplinker.metabolomics.Spectrum.rt","title":"rt instance-attribute","text":"
    rt = rt\n
    "},{"location":"api/metabolomics/#nplinker.metabolomics.Spectrum.metadata","title":"metadata instance-attribute","text":"
    metadata = metadata or {}\n
    "},{"location":"api/metabolomics/#nplinker.metabolomics.Spectrum.gnps_annotations","title":"gnps_annotations instance-attribute","text":"
    gnps_annotations: dict = {}\n
    "},{"location":"api/metabolomics/#nplinker.metabolomics.Spectrum.gnps_id","title":"gnps_id instance-attribute","text":"
    gnps_id: str | None = None\n
    "},{"location":"api/metabolomics/#nplinker.metabolomics.Spectrum.strains","title":"strains instance-attribute","text":"
    strains: StrainCollection = StrainCollection()\n
    "},{"location":"api/metabolomics/#nplinker.metabolomics.Spectrum.family","title":"family instance-attribute","text":"
    family: MolecularFamily | None = None\n
    "},{"location":"api/metabolomics/#nplinker.metabolomics.Spectrum.peaks","title":"peaks cached property","text":"
    peaks: ndarray\n

    Get the peaks, a 2D array with each row containing the values of (m/z, intensity).

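A hedged sketch of reading the peaks array; values are illustrative and the exact repr formatting may differ:

>>> spec = Spectrum("1", mz=[100.0, 200.0], intensity=[10.0, 20.0], precursor_mz=250.0)
>>> spec.peaks.shape
(2, 2)
>>> spec.peaks[0]  # first peak as (m/z, intensity)
array([100.,  10.])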
    "},{"location":"api/metabolomics/#nplinker.metabolomics.Spectrum.has_strain","title":"has_strain","text":"
    has_strain(strain: Strain) -> bool\n

    Check if the given strain exists in the spectrum.

Parameters:

- strain (Strain, required): Strain object.

Returns:

- bool: True when the given strain exists in the spectrum.

    Source code in src/nplinker/metabolomics/spectrum.py
    def has_strain(self, strain: Strain) -> bool:\n    \"\"\"Check if the given strain exists in the spectrum.\n\n    Args:\n        strain: `Strain` object.\n\n    Returns:\n        True when the given strain exist in the spectrum.\n    \"\"\"\n    return strain in self.strains\n
    "},{"location":"api/metabolomics_abc/","title":"Abstract Base Classes","text":""},{"location":"api/metabolomics_abc/#nplinker.metabolomics.abc","title":"abc","text":""},{"location":"api/metabolomics_abc/#nplinker.metabolomics.abc.SpectrumLoaderBase","title":"SpectrumLoaderBase","text":"

    Bases: ABC

    Abstract base class for SpectrumLoader.

    "},{"location":"api/metabolomics_abc/#nplinker.metabolomics.abc.SpectrumLoaderBase.spectra","title":"spectra abstractmethod property","text":"
    spectra: list[Spectrum]\n

    Get Spectrum objects.

Returns:

- list[Spectrum]: A sequence of Spectrum objects.

    "},{"location":"api/metabolomics_abc/#nplinker.metabolomics.abc.MolecularFamilyLoaderBase","title":"MolecularFamilyLoaderBase","text":"

    Bases: ABC

    Abstract base class for MolecularFamilyLoader.

    "},{"location":"api/metabolomics_abc/#nplinker.metabolomics.abc.MolecularFamilyLoaderBase.get_mfs","title":"get_mfs abstractmethod","text":"
    get_mfs(keep_singleton: bool) -> list[MolecularFamily]\n

    Get MolecularFamily objects.

Parameters:

- keep_singleton (bool, required): True to keep singleton molecular families. A singleton molecular family is a molecular family that contains only one spectrum.

Returns:

- list[MolecularFamily]: A sequence of MolecularFamily objects.

    Source code in src/nplinker/metabolomics/abc.py
    @abstractmethod\ndef get_mfs(self, keep_singleton: bool) -> list[MolecularFamily]:\n    \"\"\"Get MolecularFamily objects.\n\n    Args:\n        keep_singleton: True to keep singleton molecular families. A\n            singleton molecular family is a molecular family that contains\n            only one spectrum.\n\n    Returns:\n        A sequence of MolecularFamily objects.\n    \"\"\"\n
    "},{"location":"api/metabolomics_abc/#nplinker.metabolomics.abc.FileMappingLoaderBase","title":"FileMappingLoaderBase","text":"

    Bases: ABC

    Abstract base class for FileMappingLoader.

    "},{"location":"api/metabolomics_abc/#nplinker.metabolomics.abc.FileMappingLoaderBase.mappings","title":"mappings abstractmethod property","text":"
    mappings: dict[str, list[str]]\n

    Get file mappings.

Returns:

- dict[str, list[str]]: A mapping from spectrum ID to the names of files where the spectrum occurs.

    "},{"location":"api/metabolomics_abc/#nplinker.metabolomics.abc.AnnotationLoaderBase","title":"AnnotationLoaderBase","text":"

    Bases: ABC

    Abstract base class for AnnotationLoader.

    "},{"location":"api/metabolomics_abc/#nplinker.metabolomics.abc.AnnotationLoaderBase.annotations","title":"annotations abstractmethod property","text":"
    annotations: dict[str, dict]\n

    Get annotations.

Returns:

- dict[str, dict]: A mapping from spectrum ID to its annotations.

    "},{"location":"api/metabolomics_utils/","title":"Utilities","text":""},{"location":"api/metabolomics_utils/#nplinker.metabolomics.utils","title":"utils","text":""},{"location":"api/metabolomics_utils/#nplinker.metabolomics.utils.logger","title":"logger module-attribute","text":"
    logger = getLogger(__name__)\n
    "},{"location":"api/metabolomics_utils/#nplinker.metabolomics.utils.add_annotation_to_spectrum","title":"add_annotation_to_spectrum","text":"
    add_annotation_to_spectrum(\n    annotations: Mapping[str, dict],\n    spectra: Sequence[Spectrum],\n) -> None\n

    Add GNPS annotations to the Spectrum.gnps_annotations attribute for input spectra.

    It is possible that some spectra don't have annotations. Note that the input spectra list is changed in place.

Parameters:

- annotations (Mapping[str, dict], required): A dictionary of GNPS annotations, where the keys are spectrum ids and the values are GNPS annotations.
- spectra (Sequence[Spectrum], required): A list of Spectrum objects.

Source code in src/nplinker/metabolomics/utils.py
    def add_annotation_to_spectrum(\n    annotations: Mapping[str, dict], spectra: Sequence[Spectrum]\n) -> None:\n    \"\"\"Add GNPS annotations to the `Spectrum.gnps_annotations` attribute for input spectra.\n\n    It is possible that some spectra don't have annotations.\n    Note that the input `spectra` list is changed in place.\n\n    Args:\n        annotations: A dictionary of GNPS annotations, where the keys are\n            spectrum ids and the values are GNPS annotations.\n        spectra: A list of Spectrum objects.\n    \"\"\"\n    for spec in spectra:\n        if spec.id in annotations:\n            spec.gnps_annotations = annotations[spec.id]\n
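A hedged usage sketch, assuming spectra is a list of Spectrum objects one of which has id "100":

>>> annotations = {"100": {"Compound_Name": "Iobenguane sulfate"}}
>>> add_annotation_to_spectrum(annotations, spectra)
>>> next(s for s in spectra if s.id == "100").gnps_annotations
{'Compound_Name': 'Iobenguane sulfate'}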
    "},{"location":"api/metabolomics_utils/#nplinker.metabolomics.utils.add_strains_to_spectrum","title":"add_strains_to_spectrum","text":"
    add_strains_to_spectrum(\n    strains: StrainCollection, spectra: Sequence[Spectrum]\n) -> tuple[list[Spectrum], list[Spectrum]]\n

    Add Strain objects to the Spectrum.strains attribute for input spectra.

    Note that the input spectra list is changed in place.

Parameters:

- strains (StrainCollection, required): A collection of strain objects.
- spectra (Sequence[Spectrum], required): A list of Spectrum objects.

Returns:

- tuple[list[Spectrum], list[Spectrum]]: A tuple of two lists of Spectrum objects; the first list contains Spectrum objects that are updated with Strain objects, and the second list contains Spectrum objects that are not updated with Strain objects because no Strain objects are found.

    Source code in src/nplinker/metabolomics/utils.py
    def add_strains_to_spectrum(\n    strains: StrainCollection, spectra: Sequence[Spectrum]\n) -> tuple[list[Spectrum], list[Spectrum]]:\n    \"\"\"Add `Strain` objects to the `Spectrum.strains` attribute for input spectra.\n\n    Note that the input `spectra` list is changed in place.\n\n    Args:\n        strains: A collection of strain objects.\n        spectra: A list of Spectrum objects.\n\n    Returns:\n        A tuple of two lists of Spectrum objects,\n\n            - the first list contains Spectrum objects that are updated with Strain objects;\n            - the second list contains Spectrum objects that are not updated with Strain objects\n            because no Strain objects are found.\n    \"\"\"\n    spectra_with_strains = []\n    spectra_without_strains = []\n    for spec in spectra:\n        try:\n            strain_list = strains.lookup(spec.id)\n        except ValueError:\n            spectra_without_strains.append(spec)\n            continue\n\n        for strain in strain_list:\n            spec.strains.add(strain)\n        spectra_with_strains.append(spec)\n\n    logger.info(\n        f\"{len(spectra_with_strains)} Spectrum objects updated with Strain objects.\\n\"\n        f\"{len(spectra_without_strains)} Spectrum objects not updated with Strain objects.\"\n    )\n\n    return spectra_with_strains, spectra_without_strains\n
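A hedged sketch, assuming strains is a StrainCollection and spectra a list of Spectrum objects; every input spectrum ends up in exactly one of the two lists:

>>> with_strains, without_strains = add_strains_to_spectrum(strains, spectra)
>>> len(with_strains) + len(without_strains) == len(spectra)
True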
    "},{"location":"api/metabolomics_utils/#nplinker.metabolomics.utils.add_spectrum_to_mf","title":"add_spectrum_to_mf","text":"
    add_spectrum_to_mf(\n    spectra: Sequence[Spectrum],\n    mfs: Sequence[MolecularFamily],\n) -> tuple[\n    list[MolecularFamily],\n    list[MolecularFamily],\n    dict[MolecularFamily, set[str]],\n]\n

    Add Spectrum objects to MolecularFamily objects.

The spectra_ids attribute of a MolecularFamily object contains the ids of its Spectrum objects. These ids are used to find Spectrum objects in the input spectra list, and the found Spectrum objects are added to the spectra attribute of the MolecularFamily object. Some spectrum ids may not be found in the input spectra list, in which case their Spectrum objects are missing from the MolecularFamily object.

    Note that the input mfs list is changed in place.

Parameters:

- spectra (Sequence[Spectrum], required): A list of Spectrum objects.
- mfs (Sequence[MolecularFamily], required): A list of MolecularFamily objects.

Returns:

- tuple[list[MolecularFamily], list[MolecularFamily], dict[MolecularFamily, set[str]]]: A tuple of three elements; the first list contains MolecularFamily objects that are updated with Spectrum objects, the second list contains MolecularFamily objects that are not updated with Spectrum objects (all Spectrum objects are missing), and the third is a dictionary with MolecularFamily objects as keys and sets of ids of missing Spectrum objects as values.

    Source code in src/nplinker/metabolomics/utils.py
    def add_spectrum_to_mf(\n    spectra: Sequence[Spectrum], mfs: Sequence[MolecularFamily]\n) -> tuple[list[MolecularFamily], list[MolecularFamily], dict[MolecularFamily, set[str]]]:\n    \"\"\"Add Spectrum objects to MolecularFamily objects.\n\n    The attribute of `spectra_ids` of MolecularFamily object contains the ids of Spectrum objects.\n    These ids are used to find Spectrum objects from the input `spectra` list. The found Spectrum\n    objects are added to the `spectra` attribute of MolecularFamily object. It is possible that\n    some spectrum ids are not found in the input `spectra` list, and so their Spectrum objects are\n    missing in the MolecularFamily object.\n\n    Note that the input `mfs` list is changed in place.\n\n    Args:\n        spectra: A list of Spectrum objects.\n        mfs: A list of MolecularFamily objects.\n\n    Returns:\n        A tuple of three elements,\n\n            - the first list contains MolecularFamily objects that are updated with Spectrum objects\n            - the second list contains MolecularFamily objects that are not updated with Spectrum\n            objects (all Spectrum objects are missing).\n            - the third is a dictionary containing MolecularFamily objects as keys and a set of ids\n            of missing Spectrum objects as values.\n    \"\"\"\n    spec_dict = {spec.id: spec for spec in spectra}\n    mf_with_spec = []\n    mf_without_spec = []\n    mf_missing_spec: dict[MolecularFamily, set[str]] = {}\n    for mf in mfs:\n        for spec_id in mf.spectra_ids:\n            try:\n                spec = spec_dict[spec_id]\n            except KeyError:\n                if mf not in mf_missing_spec:\n                    mf_missing_spec[mf] = {spec_id}\n                else:\n                    mf_missing_spec[mf].add(spec_id)\n                continue\n            mf.add_spectrum(spec)\n\n        if mf.spectra:\n            mf_with_spec.append(mf)\n        else:\n            mf_without_spec.append(mf)\n\n    logger.info(\n        f\"{len(mf_with_spec)} MolecularFamily objects updated with Spectrum objects.\\n\"\n        f\"{len(mf_without_spec)} MolecularFamily objects not updated with Spectrum objects.\\n\"\n        f\"{len(mf_missing_spec)} MolecularFamily objects have missing Spectrum objects.\"\n    )\n    return mf_with_spec, mf_without_spec, mf_missing_spec\n
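A hedged sketch of inspecting the three returned elements; spectra and mfs are assumed to be already loaded:

>>> mf_with, mf_without, mf_missing = add_spectrum_to_mf(spectra, mfs)
>>> for mf, missing_ids in mf_missing.items():
...     print(mf.id, sorted(missing_ids))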
    "},{"location":"api/metabolomics_utils/#nplinker.metabolomics.utils.extract_mappings_strain_id_ms_filename","title":"extract_mappings_strain_id_ms_filename","text":"
    extract_mappings_strain_id_ms_filename(\n    podp_project_json_file: str | PathLike,\n) -> dict[str, set[str]]\n

    Extract mappings \"strain_id <-> MS_filename\".

Parameters:

- podp_project_json_file (str | PathLike, required): The path to the PODP project JSON file.

Returns:

- dict[str, set[str]]: Key is strain id and value is a set of MS filenames.

    Notes

The podp_project_json_file is the project JSON file downloaded from the PODP platform. For example, for project MSV000079284, its json file is https://pairedomicsdata.bioinformatics.nl/api/projects/4b29ddc3-26d0-40d7-80c5-44fb6631dbf9.4.

    Source code in src/nplinker/metabolomics/utils.py
    def extract_mappings_strain_id_ms_filename(\n    podp_project_json_file: str | PathLike,\n) -> dict[str, set[str]]:\n    \"\"\"Extract mappings \"strain_id <-> MS_filename\".\n\n    Args:\n        podp_project_json_file: The path to the PODP project\n            JSON file.\n\n    Returns:\n        Key is strain id and value is a set of MS filenames.\n\n    Notes:\n        The `podp_project_json_file` is the project JSON file downloaded from\n        PODP platform. For example, for project MSV000079284, its json file is\n        https://pairedomicsdata.bioinformatics.nl/api/projects/4b29ddc3-26d0-40d7-80c5-44fb6631dbf9.4.\n    \"\"\"\n    mappings_dict: dict[str, set[str]] = {}\n    with open(podp_project_json_file, \"r\") as f:\n        json_data = json.load(f)\n\n    validate_podp_json(json_data)\n\n    # Extract mappings strain id <-> metabolomics filename\n    for record in json_data[\"genome_metabolome_links\"]:\n        strain_id = record[\"genome_label\"]\n        # get the actual filename of the mzXML URL\n        filename = Path(record[\"metabolomics_file\"]).name\n        if strain_id in mappings_dict:\n            mappings_dict[strain_id].add(filename)\n        else:\n            mappings_dict[strain_id] = {filename}\n    return mappings_dict\n
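A hedged sketch of the relevant JSON shape and the resulting mapping; the file name and values are illustrative:

>>> # "genome_metabolome_links" entries look like:
>>> # {"genome_label": "strain1", "metabolomics_file": "ftp://example.org/26c.mzXML", ...}
>>> extract_mappings_strain_id_ms_filename("podp_project.json")
{'strain1': {'26c.mzXML'}}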
    "},{"location":"api/metabolomics_utils/#nplinker.metabolomics.utils.extract_mappings_ms_filename_spectrum_id","title":"extract_mappings_ms_filename_spectrum_id","text":"
    extract_mappings_ms_filename_spectrum_id(\n    gnps_file_mappings_file: str | PathLike,\n) -> dict[str, set[str]]\n

    Extract mappings \"MS_filename <-> spectrum_id\".

Parameters:

- gnps_file_mappings_file (str | PathLike, required): The path to the GNPS file mappings file (csv or tsv).

Returns:

- dict[str, set[str]]: Key is MS filename and value is a set of spectrum ids.

    Notes

The gnps_file_mappings_file is generated by GNPS molecular networking. It is downloaded from the GNPS website to a file with a default name defined in GNPS_FILE_MAPPINGS_FILENAME.

    See Also

    GNPSFileMappingLoader: A class to load GNPS file mappings file.

    Source code in src/nplinker/metabolomics/utils.py
    def extract_mappings_ms_filename_spectrum_id(\n    gnps_file_mappings_file: str | PathLike,\n) -> dict[str, set[str]]:\n    \"\"\"Extract mappings \"MS_filename <-> spectrum_id\".\n\n    Args:\n        gnps_file_mappings_file: The path to the GNPS file mappings file (csv or\n            tsv).\n\n    Returns:\n        Key is MS filename and value is a set of spectrum ids.\n\n    Notes:\n        The `gnps_file_mappings_file` is generated by GNPS molecular networking. It's downloaded\n        from GNPS website to a file with a default name defined in `GNPS_FILE_MAPPINGS_FILENAME`.\n\n    See Also:\n        GNPSFileMappingLoader: A class to load GNPS file mappings file.\n    \"\"\"\n    loader = GNPSFileMappingLoader(gnps_file_mappings_file)\n    return loader.mapping_reversed\n
    "},{"location":"api/metabolomics_utils/#nplinker.metabolomics.utils.get_mappings_strain_id_spectrum_id","title":"get_mappings_strain_id_spectrum_id","text":"
    get_mappings_strain_id_spectrum_id(\n    mappings_strain_id_ms_filename: Mapping[str, set[str]],\n    mappings_ms_filename_spectrum_id: Mapping[\n        str, set[str]\n    ],\n) -> dict[str, set[str]]\n

    Get mappings \"strain_id <-> spectrum_id\".

    Parameters:

    Name Type Description Default mappings_strain_id_ms_filename Mapping[str, set[str]]

    Mappings \"strain_id <-> MS_filename\".

    required mappings_ms_filename_spectrum_id Mapping[str, set[str]]

    Mappings \"MS_filename <-> spectrum_id\".

    required

    Returns:

    Type Description dict[str, set[str]]

    Key is strain id and value is a set of spectrum ids.

    See Also

    extract_mappings_strain_id_ms_filename: Extract mappings \"strain_id <-> MS_filename\".

    extract_mappings_ms_filename_spectrum_id: Extract mappings \"MS_filename <-> spectrum_id\".

    Source code in src/nplinker/metabolomics/utils.py
    def get_mappings_strain_id_spectrum_id(\n    mappings_strain_id_ms_filename: Mapping[str, set[str]],\n    mappings_ms_filename_spectrum_id: Mapping[str, set[str]],\n) -> dict[str, set[str]]:\n    \"\"\"Get mappings \"strain_id <-> spectrum_id\".\n\n    Args:\n        mappings_strain_id_ms_filename: Mappings\n            \"strain_id <-> MS_filename\".\n        mappings_ms_filename_spectrum_id: Mappings\n            \"MS_filename <-> spectrum_id\".\n\n    Returns:\n        Key is strain id and value is a set of spectrum ids.\n\n\n    See Also:\n        `extract_mappings_strain_id_ms_filename`: Extract mappings\n            \"strain_id <-> MS_filename\".\n        `extract_mappings_ms_filename_spectrum_id`: Extract mappings\n            \"MS_filename <-> spectrum_id\".\n    \"\"\"\n    mappings_dict = {}\n    for strain_id, ms_filenames in mappings_strain_id_ms_filename.items():\n        spectrum_ids = set()\n        for ms_filename in ms_filenames:\n            if (sid := mappings_ms_filename_spectrum_id.get(ms_filename)) is not None:\n                spectrum_ids.update(sid)\n        if spectrum_ids:\n            mappings_dict[strain_id] = spectrum_ids\n    return mappings_dict\n
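    The three functions above compose into a small pipeline; a sketch with hypothetical input paths:

    >>> from nplinker.metabolomics.utils import (
    ...     extract_mappings_ms_filename_spectrum_id,
    ...     extract_mappings_strain_id_ms_filename,
    ...     get_mappings_strain_id_spectrum_id,
    ... )
    >>> strain_to_files = extract_mappings_strain_id_ms_filename(\"podp_project.json\")
    >>> files_to_spectra = extract_mappings_ms_filename_spectrum_id(\"file_mappings.tsv\")
    >>> # combine the two mappings into \"strain_id <-> spectrum_id\"
    >>> strain_to_spectra = get_mappings_strain_id_spectrum_id(strain_to_files, files_to_spectra)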
    "},{"location":"api/mibig/","title":"MiBIG","text":""},{"location":"api/mibig/#nplinker.genomics.mibig","title":"mibig","text":""},{"location":"api/mibig/#nplinker.genomics.mibig.MibigLoader","title":"MibigLoader","text":"
    MibigLoader(data_dir: str | PathLike)\n

    Bases: BGCLoaderBase

    Parse MIBiG metadata files and return BGC objects.

    A MIBiG metadata file (json) contains annotations/metadata for each BGC. See https://mibig.secondarymetabolites.org/download.

    The MiBIG accession is used as the BGC id and strain name. The loaded BGC objects have a Strain object as their strain attribute (i.e. BGC.strain).

    Parameters:

    Name Type Description Default data_dir str | PathLike

    Path to the directory of MIBiG metadata json files

    required Source code in src/nplinker/genomics/mibig/mibig_loader.py
    def __init__(self, data_dir: str | PathLike):\n    \"\"\"Initialize the MIBiG metadata loader.\n\n    Args:\n        data_dir: Path to the directory of MIBiG metadata json files\n    \"\"\"\n    self.data_dir = str(data_dir)\n    self._file_dict = self.parse_data_dir(self.data_dir)\n    self._metadata_dict = self._parse_metadata()\n    self._bgcs = self._parse_bgcs()\n
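    A minimal usage sketch, assuming a directory of MIBiG metadata json files at a hypothetical path:

    >>> from nplinker.genomics.mibig import MibigLoader
    >>> loader = MibigLoader(\"/data/mibig_metadata\")
    >>> bgcs = loader.get_bgcs()  # list of BGC objects parsed from the json files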
    "},{"location":"api/mibig/#nplinker.genomics.mibig.MibigLoader.data_dir","title":"data_dir instance-attribute","text":"
    data_dir = str(data_dir)\n
    "},{"location":"api/mibig/#nplinker.genomics.mibig.MibigLoader.get_files","title":"get_files","text":"
    get_files() -> dict[str, str]\n

    Get the path of all MIBiG metadata json files.

    Returns:

    Type Description dict[str, str]

    The key is the metadata file name (BGC accession), and the value is the path to the metadata json file.

    Source code in src/nplinker/genomics/mibig/mibig_loader.py
    def get_files(self) -> dict[str, str]:\n    \"\"\"Get the path of all MIBiG metadata json files.\n\n    Returns:\n        The key is metadata file name (BGC accession), and the value is path to the metadata\n        json file\n    \"\"\"\n    return self._file_dict\n
    "},{"location":"api/mibig/#nplinker.genomics.mibig.MibigLoader.parse_data_dir","title":"parse_data_dir staticmethod","text":"
    parse_data_dir(data_dir: str | PathLike) -> dict[str, str]\n

    Parse metadata directory and return paths to all metadata json files.

    Parameters:

    Name Type Description Default data_dir str | PathLike

    path to the directory of MIBiG metadata json files

    required

    Returns:

    Type Description dict[str, str]

    The key is the metadata file name (BGC accession), and the value is the path to the metadata json file.

    Source code in src/nplinker/genomics/mibig/mibig_loader.py
    @staticmethod\ndef parse_data_dir(data_dir: str | PathLike) -> dict[str, str]:\n    \"\"\"Parse metadata directory and return paths to all metadata json files.\n\n    Args:\n        data_dir: path to the directory of MIBiG metadata json files\n\n    Returns:\n        The key is metadata file name (BGC accession), and the value is path to the metadata\n        json file\n    \"\"\"\n    file_dict = {}\n    json_files = list_files(data_dir, prefix=\"BGC\", suffix=\".json\")\n    for file in json_files:\n        fname = Path(file).stem\n        file_dict[fname] = file\n    return file_dict\n
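    The static method can also be called directly; a sketch with a hypothetical directory:

    >>> from nplinker.genomics.mibig import MibigLoader
    >>> file_dict = MibigLoader.parse_data_dir(\"/data/mibig_metadata\")
    >>> # e.g. {\"BGC0000001\": \"/data/mibig_metadata/BGC0000001.json\", ...}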
    "},{"location":"api/mibig/#nplinker.genomics.mibig.MibigLoader.get_metadata","title":"get_metadata","text":"
    get_metadata() -> dict[str, MibigMetadata]\n

    Get MibigMetadata objects.

    Returns:

    Type Description dict[str, MibigMetadata]

    The key is BGC accession (file name) and the value is MibigMetadata object

    Source code in src/nplinker/genomics/mibig/mibig_loader.py
    def get_metadata(self) -> dict[str, MibigMetadata]:\n    \"\"\"Get MibigMetadata objects.\n\n    Returns:\n        The key is BGC accession (file name) and the value is MibigMetadata object\n    \"\"\"\n    return self._metadata_dict\n
    "},{"location":"api/mibig/#nplinker.genomics.mibig.MibigLoader.get_bgcs","title":"get_bgcs","text":"
    get_bgcs() -> list[BGC]\n

    Get BGC objects.

    The BGC objects use the MiBIG accession as their id and have a Strain object as their strain attribute (i.e. BGC.strain), where the name of the Strain object is also the MiBIG accession.

    Returns:

    Type Description list[BGC]

    A list of BGC objects

    Source code in src/nplinker/genomics/mibig/mibig_loader.py
    def get_bgcs(self) -> list[BGC]:\n    \"\"\"Get BGC objects.\n\n    The BGC objects use MiBIG accession as id and have Strain object as\n    their strain attribute (i.e. `BGC.strain`), where the name of the Strain\n    object is also MiBIG accession.\n\n    Returns:\n        A list of BGC objects\n    \"\"\"\n    return self._bgcs\n
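    Continuing the loader sketch above:

    >>> for bgc in loader.get_bgcs():
    ...     print(bgc.id, bgc.strain)  # the id is the MiBIG accession; strain is a Strain object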
    "},{"location":"api/mibig/#nplinker.genomics.mibig.MibigMetadata","title":"MibigMetadata","text":"
    MibigMetadata(file: str | PathLike)\n

    Class to model the BGC metadata/annotations defined in MIBiG.

    MIBiG is a specification of BGC metadata that uses a JSON schema to represent BGC metadata. For more details, see: https://mibig.secondarymetabolites.org/download.

    Parameters:

    Name Type Description Default file str | PathLike

    Path to the json file of MIBiG BGC metadata

    required

    Examples:

    >>> metadata = MibigMetadata(\"/data/BGC0000001.json\")\n
    Source code in src/nplinker/genomics/mibig/mibig_metadata.py
    def __init__(self, file: str | PathLike) -> None:\n    \"\"\"Initialize the MIBiG metadata object.\n\n    Args:\n        file: Path to the json file of MIBiG BGC metadata\n\n    Examples:\n        >>> metadata = MibigMetadata(\"/data/BGC0000001.json\")\n    \"\"\"\n    self.file = str(file)\n    with open(self.file, \"rb\") as f:\n        self.metadata = json.load(f)\n\n    self._mibig_accession: str\n    self._biosyn_class: tuple[str]\n    self._parse_metadata()\n
    "},{"location":"api/mibig/#nplinker.genomics.mibig.MibigMetadata.file","title":"file instance-attribute","text":"
    file = str(file)\n
    "},{"location":"api/mibig/#nplinker.genomics.mibig.MibigMetadata.metadata","title":"metadata instance-attribute","text":"
    metadata = load(f)\n
    "},{"location":"api/mibig/#nplinker.genomics.mibig.MibigMetadata.mibig_accession","title":"mibig_accession property","text":"
    mibig_accession: str\n

    Get the value of metadata item 'mibig_accession'.

    "},{"location":"api/mibig/#nplinker.genomics.mibig.MibigMetadata.biosyn_class","title":"biosyn_class property","text":"
    biosyn_class: tuple[str]\n

    Get the value of metadata item 'biosyn_class'.

    The 'biosyn_class' is the biosynthetic class(es), namely the type of natural product or secondary metabolite.

    MIBiG defines 6 major biosynthetic classes: \"NRP\", \"Polyketide\", \"RiPP\", \"Terpene\", \"Saccharide\" and \"Alkaloid\". Note that natural products created by all other biosynthetic mechanisms fall under the category \"Other\". For more details, see the publication: https://doi.org/10.1186/s40793-018-0318-y.
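    A short sketch of reading both properties; the file path is hypothetical:

    >>> metadata = MibigMetadata(\"/data/BGC0000001.json\")
    >>> metadata.mibig_accession  # e.g. \"BGC0000001\"
    >>> metadata.biosyn_class     # e.g. (\"NRP\",)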

    "},{"location":"api/mibig/#nplinker.genomics.mibig.download_and_extract_mibig_metadata","title":"download_and_extract_mibig_metadata","text":"
    download_and_extract_mibig_metadata(\n    download_root: str | PathLike,\n    extract_path: str | PathLike,\n    version: str = \"3.1\",\n)\n

    Download and extract MIBiG metadata json files.

    Note that it does not matter whether the metadata json files are nested in folders within the archive: all json files are extracted to the same location, i.e. extract_path, any nested folders are removed if they exist, and extract_path ends up containing only json files.

    Parameters:

    Name Type Description Default download_root str | PathLike

    Path to the directory in which to place the downloaded archive.

    required extract_path str | PathLike

    Path to an empty directory where the json files will be extracted. The directory must be empty if it exists. If it doesn't exist, the directory will be created.

    required version str

    The version of the MIBiG metadata to download. Defaults to \"3.1\".

    '3.1'

    Examples:

    >>> download_and_extract_mibig_metadata(\"/data/download\", \"/data/mibig_metadata\")\n
    Source code in src/nplinker/genomics/mibig/mibig_downloader.py
    def download_and_extract_mibig_metadata(\n    download_root: str | os.PathLike,\n    extract_path: str | os.PathLike,\n    version: str = \"3.1\",\n):\n    \"\"\"Download and extract MIBiG metadata json files.\n\n    Note that it does not matter whether the metadata json files are in nested folders or not in the archive,\n    all json files will be extracted to the same location, i.e. `extract_path`. The nested\n    folders will be removed if they exist. So the `extract_path` will have only json files.\n\n    Args:\n        download_root: Path to the directory in which to place the downloaded archive.\n        extract_path: Path to an empty directory where the json files will be extracted.\n            The directory must be empty if it exists. If it doesn't exist, the directory will be created.\n        version: _description_. Defaults to \"3.1\".\n\n    Examples:\n        >>> download_and_extract_mibig_metadata(\"/data/download\", \"/data/mibig_metadata\")\n    \"\"\"\n    download_root = Path(download_root)\n    extract_path = Path(extract_path)\n\n    if download_root == extract_path:\n        raise ValueError(\"Identical path of download directory and extract directory\")\n\n    # check if extract_path is empty\n    if not extract_path.exists():\n        extract_path.mkdir(parents=True)\n    else:\n        if len(list(extract_path.iterdir())) != 0:\n            raise ValueError(f'Nonempty directory: \"{extract_path}\"')\n\n    # download and extract\n    md5 = _MD5_MIBIG_METADATA[version]\n    download_and_extract_archive(\n        url=MIBIG_METADATA_URL.format(version=version),\n        download_root=download_root,\n        extract_root=extract_path,\n        md5=md5,\n    )\n\n    # After extracting mibig archive, it's either one dir or many json files,\n    # if it's a dir, then move all json files from it to extract_path\n    subdirs = list_dirs(extract_path)\n    if len(subdirs) > 1:\n        raise ValueError(f\"Expected one extracted directory, got {len(subdirs)}\")\n\n    if len(subdirs) == 1:\n        subdir_path = subdirs[0]\n        for fname in list_files(subdir_path, prefix=\"BGC\", suffix=\".json\", keep_parent=False):\n            shutil.move(os.path.join(subdir_path, fname), os.path.join(extract_path, fname))\n        # delete subdir\n        if subdir_path != extract_path:\n            shutil.rmtree(subdir_path)\n
    "},{"location":"api/mibig/#nplinker.genomics.mibig.parse_bgc_metadata_json","title":"parse_bgc_metadata_json","text":"
    parse_bgc_metadata_json(file: str | PathLike) -> BGC\n

    Parse MIBiG metadata file and return BGC object.

    Note that the MiBIG accession is used as the BGC id and strain name. The BGC object has a Strain object as its strain attribute.

    Parameters:

    Name Type Description Default file str | PathLike

    Path to the MIBiG metadata json file

    required

    Returns:

    Type Description BGC

    BGC object

    Source code in src/nplinker/genomics/mibig/mibig_loader.py
    def parse_bgc_metadata_json(file: str | PathLike) -> BGC:\n    \"\"\"Parse MIBiG metadata file and return BGC object.\n\n    Note that the MiBIG accession is used as the BGC id and strain name. The BGC\n    object has Strain object as its strain attribute.\n\n    Args:\n        file: Path to the MIBiG metadata json file\n\n    Returns:\n        BGC object\n    \"\"\"\n    metadata = MibigMetadata(str(file))\n    mibig_bgc = BGC(metadata.mibig_accession, *metadata.biosyn_class)\n    mibig_bgc.mibig_bgc_class = metadata.biosyn_class\n    mibig_bgc.strain = Strain(metadata.mibig_accession)\n    return mibig_bgc\n
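    A minimal usage sketch with a hypothetical file path:

    >>> from nplinker.genomics.mibig import parse_bgc_metadata_json
    >>> bgc = parse_bgc_metadata_json(\"/data/BGC0000001.json\")
    >>> print(bgc.id, bgc.strain)  # both derive from the MiBIG accession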
    "},{"location":"api/nplinker/","title":"NPLinker","text":""},{"location":"api/nplinker/#nplinker","title":"nplinker","text":""},{"location":"api/nplinker/#nplinker.NPLinker","title":"NPLinker","text":"
    NPLinker(config_file: str | PathLike)\n

    Main class for the NPLinker application.

    Attributes:

    Name Type Description config

    The configuration object for the current NPLinker application.

    root_dir str

    The path to the root directory of the current NPLinker application.

    output_dir str

    The path to the output directory of the current NPLinker application.

    bgcs list[BGC]

    A list of all BGC objects.

    gcfs list[GCF]

    A list of all GCF objects.

    spectra list[Spectrum]

    A list of all Spectrum objects.

    mfs list[MolecularFamily]

    A list of all MolecularFamily objects.

    mibig_bgcs list[BGC]

    A list of all MiBIG BGC objects.

    strains StrainCollection

    A StrainCollection object containing all Strain objects.

    product_types list[str]

    A list of all BiGSCAPE product types.

    scoring_methods list[str]

    A list of all valid scoring methods.

    Examples:

    To start a NPLinker application:

    >>> from nplinker import NPLinker\n>>> npl = NPLinker(\"path/to/config.toml\")\n

    To load all data into memory:

    >>> npl.load_data()\n

    To check the number of GCF objects:

    >>> len(npl.gcfs)\n

    To get the links for all GCF objects using the Metcalf scoring method, the result is a LinkGraph object:

    >>> lg = npl.get_links(npl.gcfs, \"metcalf\")\n

    To get the link data between two objects:

    >>> link_data = lg.get_link_data(npl.gcfs[0], npl.spectra[0])\n{\"metcalf\": Score(\"metcalf\", 1.0, {\"cutoff\": 0, \"standardised\": False})}\n

    Parameters:

    Name Type Description Default config_file str | PathLike

    Path to the configuration file to use.

    required Source code in src/nplinker/nplinker.py
    def __init__(self, config_file: str | PathLike):\n    \"\"\"Initialise an NPLinker instance.\n\n    Args:\n        config_file: Path to the configuration file to use.\n    \"\"\"\n    # Load the configuration file\n    self.config = load_config(config_file)\n\n    # Setup logging for the application\n    setup_logging(\n        level=self.config.log.level,\n        file=self.config.log.get(\"file\", \"\"),\n        use_console=self.config.log.use_console,\n    )\n    logger.info(\n        \"Configuration:\\n %s\", pformat(self.config.as_dict(), width=20, sort_dicts=False)\n    )\n\n    # Setup the output directory\n    self._output_dir = self.config.root_dir / OUTPUT_DIRNAME\n    self._output_dir.mkdir(exist_ok=True)\n\n    # Initialise data containers that will be populated by the `load_data` method\n    self._bgc_dict: dict[str, BGC] = {}\n    self._gcf_dict: dict[str, GCF] = {}\n    self._spec_dict: dict[str, Spectrum] = {}\n    self._mf_dict: dict[str, MolecularFamily] = {}\n    self._mibig_bgcs: list[BGC] = []\n    self._strains: StrainCollection = StrainCollection()\n    self._product_types: list = []\n    self._chem_classes = None  # TODO: to be refactored\n    self._class_matches = None  # TODO: to be refactored\n\n    # Flags to keep track of whether the scoring methods have been set up\n    self._scoring_methods_setup_done = {name: False for name in self._valid_scoring_methods}\n
    "},{"location":"api/nplinker/#nplinker.NPLinker.config","title":"config instance-attribute","text":"
    config = load_config(config_file)\n
    "},{"location":"api/nplinker/#nplinker.NPLinker.root_dir","title":"root_dir property","text":"
    root_dir: str\n

    Get the path to the root directory of the current NPLinker instance.

    "},{"location":"api/nplinker/#nplinker.NPLinker.output_dir","title":"output_dir property","text":"
    output_dir: str\n

    Get the path to the output directory of the current NPLinker instance.

    "},{"location":"api/nplinker/#nplinker.NPLinker.bgcs","title":"bgcs property","text":"
    bgcs: list[BGC]\n

    Get all BGC objects.

    "},{"location":"api/nplinker/#nplinker.NPLinker.gcfs","title":"gcfs property","text":"
    gcfs: list[GCF]\n

    Get all GCF objects.

    "},{"location":"api/nplinker/#nplinker.NPLinker.spectra","title":"spectra property","text":"
    spectra: list[Spectrum]\n

    Get all Spectrum objects.

    "},{"location":"api/nplinker/#nplinker.NPLinker.mfs","title":"mfs property","text":"
    mfs: list[MolecularFamily]\n

    Get all MolecularFamily objects.

    "},{"location":"api/nplinker/#nplinker.NPLinker.mibig_bgcs","title":"mibig_bgcs property","text":"
    mibig_bgcs: list[BGC]\n

    Get all MiBIG BGC objects.

    "},{"location":"api/nplinker/#nplinker.NPLinker.strains","title":"strains property","text":"
    strains: StrainCollection\n

    Get all Strain objects.

    "},{"location":"api/nplinker/#nplinker.NPLinker.product_types","title":"product_types property","text":"
    product_types: list[str]\n

    Get all BiGSCAPE product types.

    "},{"location":"api/nplinker/#nplinker.NPLinker.chem_classes","title":"chem_classes property","text":"
    chem_classes\n

    Returns loaded ChemClassPredictions with the class predictions.

    "},{"location":"api/nplinker/#nplinker.NPLinker.class_matches","title":"class_matches property","text":"
    class_matches\n

    ClassMatches with the matched classes and scoring tables from MIBiG.

    "},{"location":"api/nplinker/#nplinker.NPLinker.scoring_methods","title":"scoring_methods property","text":"
    scoring_methods: list[str]\n

    Get names of all valid scoring methods.

    "},{"location":"api/nplinker/#nplinker.NPLinker.load_data","title":"load_data","text":"
    load_data()\n

    Load all data from local files into memory.

    This method is a convenience function that calls the DatasetArranger and DatasetLoader classes to load all data from the local filesystem into memory. The loaded data is then stored in various private data containers for easy access.

    Source code in src/nplinker/nplinker.py
    def load_data(self):\n    \"\"\"Load all data from local files into memory.\n\n    This method is a convenience function that calls the `DatasetArranger` and `DatasetLoader`\n    classes to load all data from the local filesystem into memory. The loaded data is then\n    stored in various private data containers for easy access.\n    \"\"\"\n    arranger = DatasetArranger(self.config)\n    arranger.arrange()\n    loader = DatasetLoader(self.config)\n    loader.load()\n\n    self._bgc_dict = {bgc.id: bgc for bgc in loader.bgcs}\n    self._gcf_dict = {gcf.id: gcf for gcf in loader.gcfs}\n    self._spec_dict = {spec.id: spec for spec in loader.spectra}\n    self._mf_dict = {mf.id: mf for mf in loader.mfs}\n\n    self._mibig_bgcs = loader.mibig_bgcs\n    self._strains = loader.strains\n    self._product_types = loader.product_types\n    self._chem_classes = loader.chem_classes\n    self._class_matches = loader.class_matches\n
    "},{"location":"api/nplinker/#nplinker.NPLinker.get_links","title":"get_links","text":"
    get_links(\n    objects: (\n        Sequence[BGC]\n        | Sequence[GCF]\n        | Sequence[Spectrum]\n        | Sequence[MolecularFamily]\n    ),\n    scoring_method: str,\n    **scoring_params: Any\n) -> LinkGraph\n

    Get the links for the given objects using the specified scoring method and parameters.

    Parameters:

    Name Type Description Default objects Sequence[BGC] | Sequence[GCF] | Sequence[Spectrum] | Sequence[MolecularFamily]

    A sequence of objects to get the links for. The objects must be of the same type, i.e. BGC, GCF, Spectrum or MolecularFamily type. For scoring method metcalf, the BGC objects are not supported.

    required scoring_method str

    The scoring method to use. Must be one of the valid scoring methods self.scoring_methods, such as \"metcalf\".

    required scoring_params Any

    Parameters to pass to the scoring method. If not provided, the default parameters for the scoring method will be used.

    {}

    Returns:

    Type Description LinkGraph

    A LinkGraph object containing the links for the given objects.

    Raises:

    Type Description ValueError

    If input objects are empty or if the scoring method is invalid.

    TypeError

    If the input objects are not of the same type or if the object type is invalid.

    Source code in src/nplinker/nplinker.py
    def get_links(\n    self,\n    objects: Sequence[BGC] | Sequence[GCF] | Sequence[Spectrum] | Sequence[MolecularFamily],\n    scoring_method: str,\n    **scoring_params: Any,\n) -> LinkGraph:\n    \"\"\"Get the links for the given objects using the specified scoring method and parameters.\n\n    Args:\n        objects: A sequence of objects to get the links for. The objects must be of the same\n            type, i.e. `BGC`, `GCF`, `Spectrum` or `MolecularFamily` type.\n            For scoring method `metcalf`, the BGC objects are not supported.\n        scoring_method: The scoring method to use. Must be one of the valid scoring methods\n            `self.scoring_methods`, such as \"metcalf\".\n        scoring_params: Parameters to pass to the scoring method. If not provided, the default\n            parameters for the scoring method will be used.\n\n    Returns:\n        A LinkGraph object containing the links for the given objects.\n\n    Raises:\n        ValueError: If input objects are empty or if the scoring method is invalid.\n        TypeError: If the input objects are not of the same type or if the object type is invalid.\n    \"\"\"\n    # Validate objects\n    if len(objects) == 0:\n        raise ValueError(\"No objects provided to get links for\")\n    # check if all objects are of the same type\n    types = {type(i) for i in objects}\n    if len(types) > 1:\n        raise TypeError(\"Input objects must be of the same type.\")\n    # check if the object type is valid\n    obj_type = next(iter(types))\n    if obj_type not in (BGC, GCF, Spectrum, MolecularFamily):\n        raise TypeError(\n            f\"Invalid type {obj_type}. Input objects must be BGC, GCF, Spectrum or MolecularFamily objects.\"\n        )\n\n    # Validate scoring method\n    if scoring_method not in self._valid_scoring_methods:\n        raise ValueError(f\"Invalid scoring method {scoring_method}.\")\n\n    # Check if the scoring method has been set up\n    if not self._scoring_methods_setup_done[scoring_method]:\n        self._valid_scoring_methods[scoring_method].setup(self)\n        self._scoring_methods_setup_done[scoring_method] = True\n\n    # Initialise the scoring method\n    scoring = self._valid_scoring_methods[scoring_method]()\n\n    return scoring.get_links(*objects, **scoring_params)\n
    "},{"location":"api/nplinker/#nplinker.NPLinker.lookup_bgc","title":"lookup_bgc","text":"
    lookup_bgc(id: str) -> BGC | None\n

    Get the BGC object with the given ID.

    Parameters:

    Name Type Description Default id str

    the ID of the BGC to look up.

    required

    Returns:

    Type Description BGC | None

    The BGC object with the given ID, or None if no such object exists.

    Source code in src/nplinker/nplinker.py
    def lookup_bgc(self, id: str) -> BGC | None:\n    \"\"\"Get the BGC object with the given ID.\n\n    Args:\n        id: the ID of the BGC to look up.\n\n    Returns:\n        The BGC object with the given ID, or None if no such object exists.\n    \"\"\"\n    return self._bgc_dict.get(id, None)\n
    "},{"location":"api/nplinker/#nplinker.NPLinker.lookup_gcf","title":"lookup_gcf","text":"
    lookup_gcf(id: str) -> GCF | None\n

    Get the GCF object with the given ID.

    Parameters:

    Name Type Description Default id str

    the ID of the GCF to look up.

    required

    Returns:

    Type Description GCF | None

    The GCF object with the given ID, or None if no such object exists.

    Source code in src/nplinker/nplinker.py
    def lookup_gcf(self, id: str) -> GCF | None:\n    \"\"\"Get the GCF object with the given ID.\n\n    Args:\n        id: the ID of the GCF to look up.\n\n    Returns:\n        The GCF object with the given ID, or None if no such object exists.\n    \"\"\"\n    return self._gcf_dict.get(id, None)\n
    "},{"location":"api/nplinker/#nplinker.NPLinker.lookup_spectrum","title":"lookup_spectrum","text":"
    lookup_spectrum(id: str) -> Spectrum | None\n

    Get the Spectrum object with the given ID.

    Parameters:

    Name Type Description Default id str

    the ID of the Spectrum to look up.

    required

    Returns:

    Type Description Spectrum | None

    The Spectrum object with the given ID, or None if no such object exists.

    Source code in src/nplinker/nplinker.py
    def lookup_spectrum(self, id: str) -> Spectrum | None:\n    \"\"\"Get the Spectrum object with the given ID.\n\n    Args:\n        id: the ID of the Spectrum to look up.\n\n    Returns:\n        The Spectrum object with the given ID, or None if no such object exists.\n    \"\"\"\n    return self._spec_dict.get(id, None)\n
    "},{"location":"api/nplinker/#nplinker.NPLinker.lookup_mf","title":"lookup_mf","text":"
    lookup_mf(id: str) -> MolecularFamily | None\n

    Get the MolecularFamily object with the given ID.

    Parameters:

    Name Type Description Default id str

    the ID of the MolecularFamily to look up.

    required

    Returns:

    Type Description MolecularFamily | None

    The MolecularFamily object with the given ID, or None if no such object exists.

    Source code in src/nplinker/nplinker.py
    def lookup_mf(self, id: str) -> MolecularFamily | None:\n    \"\"\"Get the MolecularFamily object with the given ID.\n\n    Args:\n        id: the ID of the MolecularFamily to look up.\n\n    Returns:\n        The MolecularFamily object with the given ID, or None if no such object exists.\n    \"\"\"\n    return self._mf_dict.get(id, None)\n
    "},{"location":"api/nplinker/#nplinker.NPLinker.save_data","title":"save_data","text":"
    save_data(\n    file: str | PathLike, links: LinkGraph | None = None\n) -> None\n

    Pickle data to a file.

    The data to be pickled is a tuple containing the BGCs, GCFs, Spectra, MolecularFamilies, StrainCollection and links, i.e. (bgcs, gcfs, spectra, mfs, strains, links). If the links are not provided, None will be used.

    Parameters:

    Name Type Description Default file str | PathLike

    The path to the pickle file to save the data to.

    required links LinkGraph | None

    The LinkGraph object to save.

    None Source code in src/nplinker/nplinker.py
    def save_data(\n    self,\n    file: str | PathLike,\n    links: LinkGraph | None = None,\n) -> None:\n    \"\"\"Pickle data to a file.\n\n    The data to be pickled is a tuple containing the BGCs, GCFs, Spectra, MolecularFamilies,\n    StrainCollection and links, i.e. `(bgcs, gcfs, spectra, mfs, strains, links)`. If the links\n    are not provided, `None` will be used.\n\n    Args:\n        file: The path to the pickle file to save the data to.\n        links: The LinkGraph object to save.\n    \"\"\"\n    data = (self.bgcs, self.gcfs, self.spectra, self.mfs, self.strains, links)\n    with open(file, \"wb\") as f:\n        pickle.dump(data, f)\n
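    A sketch of saving and re-loading the pickled tuple; the file name is hypothetical:

    >>> import pickle
    >>> npl.save_data(\"npl_data.pkl\", links=lg)  # lg from npl.get_links, or omit to save None
    >>> with open(\"npl_data.pkl\", \"rb\") as f:
    ...     bgcs, gcfs, spectra, mfs, strains, links = pickle.load(f)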
    "},{"location":"api/nplinker/#nplinker.setup_logging","title":"setup_logging","text":"
    setup_logging(\n    level: str = \"INFO\",\n    file: str = \"\",\n    use_console: bool = True,\n) -> None\n

    Set up the logging configuration for the ancestor logger \"nplinker\".

    Parameters:

    Name Type Description Default level str

    The log level; use the logging module's log level constants. Valid levels are: \"NOTSET\", \"DEBUG\", \"INFO\", \"WARNING\", \"ERROR\", \"CRITICAL\".

    'INFO' file str

    The file to write the log to. If the file does not exist, it will be created. The log will be written to the file in append mode. If the file is an empty string (by default), the log will not be written to a file.

    '' use_console bool

    Whether to log to the console.

    True Source code in src/nplinker/logger.py
    def setup_logging(level: str = \"INFO\", file: str = \"\", use_console: bool = True) -> None:\n    \"\"\"Setup logging configuration for the ancestor logger \"nplinker\".\n\n    Args:\n        level: The log level, use the logging module's log level constants. Valid levels are:\n            \"NOTSET\", \"DEBUG\", \"INFO\", \"WARNING\", \"ERROR\", \"CRITICAL\".\n        file: The file to write the log to. If the file does not exist, it will be created. The log\n            will be written to the file in append mode. If the file is an empty string (by default),\n            the log will not be written to a file.\n        use_console: Whether to log to the console.\n    \"\"\"\n    # Get the ancestor logger \"nplinker\"\n    logger = logging.getLogger(\"nplinker\")\n    logger.setLevel(level)\n\n    # File handler\n    if file:\n        logger.addHandler(\n            RichHandler(\n                console=Console(file=open(file, \"a\"), width=120),  # force the line width to 120\n                omit_repeated_times=False,\n                rich_tracebacks=True,\n                tracebacks_show_locals=True,\n                log_time_format=\"[%Y-%m-%d %X]\",\n            )\n        )\n\n    # Console handler\n    if use_console:\n        logger.addHandler(\n            RichHandler(\n                omit_repeated_times=False,\n                rich_tracebacks=True,\n                tracebacks_show_locals=True,\n                log_time_format=\"[%Y-%m-%d %X]\",\n            )\n        )\n
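    A minimal usage sketch; the log file name is hypothetical:

    >>> from nplinker import setup_logging
    >>> setup_logging(level=\"DEBUG\", file=\"nplinker.log\", use_console=True)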
    "},{"location":"api/nplinker/#nplinker.defaults","title":"defaults","text":""},{"location":"api/nplinker/#nplinker.defaults.NPLINKER_APP_DATA_DIR","title":"NPLINKER_APP_DATA_DIR module-attribute","text":"
    NPLINKER_APP_DATA_DIR: Final = parent / 'data'\n
    "},{"location":"api/nplinker/#nplinker.defaults.STRAIN_MAPPINGS_FILENAME","title":"STRAIN_MAPPINGS_FILENAME module-attribute","text":"
    STRAIN_MAPPINGS_FILENAME: Final = 'strain_mappings.json'\n
    "},{"location":"api/nplinker/#nplinker.defaults.GENOME_BGC_MAPPINGS_FILENAME","title":"GENOME_BGC_MAPPINGS_FILENAME module-attribute","text":"
    GENOME_BGC_MAPPINGS_FILENAME: Final = (\n    \"genome_bgc_mappings.json\"\n)\n
    "},{"location":"api/nplinker/#nplinker.defaults.GENOME_STATUS_FILENAME","title":"GENOME_STATUS_FILENAME module-attribute","text":"
    GENOME_STATUS_FILENAME: Final = 'genome_status.json'\n
    "},{"location":"api/nplinker/#nplinker.defaults.GNPS_SPECTRA_FILENAME","title":"GNPS_SPECTRA_FILENAME module-attribute","text":"
    GNPS_SPECTRA_FILENAME: Final = 'spectra.mgf'\n
    "},{"location":"api/nplinker/#nplinker.defaults.GNPS_MOLECULAR_FAMILY_FILENAME","title":"GNPS_MOLECULAR_FAMILY_FILENAME module-attribute","text":"
    GNPS_MOLECULAR_FAMILY_FILENAME: Final = (\n    \"molecular_families.tsv\"\n)\n
    "},{"location":"api/nplinker/#nplinker.defaults.GNPS_ANNOTATIONS_FILENAME","title":"GNPS_ANNOTATIONS_FILENAME module-attribute","text":"
    GNPS_ANNOTATIONS_FILENAME: Final = 'annotations.tsv'\n
    "},{"location":"api/nplinker/#nplinker.defaults.GNPS_FILE_MAPPINGS_TSV","title":"GNPS_FILE_MAPPINGS_TSV module-attribute","text":"
    GNPS_FILE_MAPPINGS_TSV: Final = 'file_mappings.tsv'\n
    "},{"location":"api/nplinker/#nplinker.defaults.GNPS_FILE_MAPPINGS_CSV","title":"GNPS_FILE_MAPPINGS_CSV module-attribute","text":"
    GNPS_FILE_MAPPINGS_CSV: Final = 'file_mappings.csv'\n
    "},{"location":"api/nplinker/#nplinker.defaults.STRAINS_SELECTED_FILENAME","title":"STRAINS_SELECTED_FILENAME module-attribute","text":"
    STRAINS_SELECTED_FILENAME: Final = 'strains_selected.json'\n
    "},{"location":"api/nplinker/#nplinker.defaults.DOWNLOADS_DIRNAME","title":"DOWNLOADS_DIRNAME module-attribute","text":"
    DOWNLOADS_DIRNAME: Final = 'downloads'\n
    "},{"location":"api/nplinker/#nplinker.defaults.MIBIG_DIRNAME","title":"MIBIG_DIRNAME module-attribute","text":"
    MIBIG_DIRNAME: Final = 'mibig'\n
    "},{"location":"api/nplinker/#nplinker.defaults.GNPS_DIRNAME","title":"GNPS_DIRNAME module-attribute","text":"
    GNPS_DIRNAME: Final = 'gnps'\n
    "},{"location":"api/nplinker/#nplinker.defaults.ANTISMASH_DIRNAME","title":"ANTISMASH_DIRNAME module-attribute","text":"
    ANTISMASH_DIRNAME: Final = 'antismash'\n
    "},{"location":"api/nplinker/#nplinker.defaults.BIGSCAPE_DIRNAME","title":"BIGSCAPE_DIRNAME module-attribute","text":"
    BIGSCAPE_DIRNAME: Final = 'bigscape'\n
    "},{"location":"api/nplinker/#nplinker.defaults.BIGSCAPE_RUNNING_OUTPUT_DIRNAME","title":"BIGSCAPE_RUNNING_OUTPUT_DIRNAME module-attribute","text":"
    BIGSCAPE_RUNNING_OUTPUT_DIRNAME: Final = (\n    \"bigscape_running_output\"\n)\n
    "},{"location":"api/nplinker/#nplinker.defaults.OUTPUT_DIRNAME","title":"OUTPUT_DIRNAME module-attribute","text":"
    OUTPUT_DIRNAME: Final = 'output'\n
    "},{"location":"api/nplinker/#nplinker.config","title":"config","text":""},{"location":"api/nplinker/#nplinker.config.CONFIG_VALIDATORS","title":"CONFIG_VALIDATORS module-attribute","text":"
    CONFIG_VALIDATORS = [\n    Validator(\n        \"root_dir\",\n        required=True,\n        cast=transform_to_full_path,\n        condition=lambda v: is_dir(),\n    ),\n    Validator(\n        \"mode\",\n        required=True,\n        cast=lambda v: lower(),\n        is_in=[\"local\", \"podp\"],\n    ),\n    Validator(\n        \"podp_id\",\n        required=True,\n        when=Validator(\"mode\", eq=\"podp\"),\n    ),\n    Validator(\n        \"podp_id\",\n        required=False,\n        when=Validator(\"mode\", eq=\"local\"),\n    ),\n    Validator(\n        \"log.level\",\n        is_type_of=str,\n        cast=lambda v: upper(),\n        is_in=[\n            \"NOTSET\",\n            \"DEBUG\",\n            \"INFO\",\n            \"WARNING\",\n            \"ERROR\",\n            \"CRITICAL\",\n        ],\n    ),\n    Validator(\"log.file\", is_type_of=str),\n    Validator(\"log.use_console\", is_type_of=bool),\n    Validator(\n        \"mibig.to_use\", required=True, is_type_of=bool\n    ),\n    Validator(\n        \"mibig.version\",\n        required=True,\n        is_type_of=str,\n        when=Validator(\"mibig.to_use\", eq=True),\n    ),\n    Validator(\n        \"bigscape.parameters\", required=True, is_type_of=str\n    ),\n    Validator(\n        \"bigscape.cutoff\", required=True, is_type_of=str\n    ),\n    Validator(\n        \"scoring.methods\",\n        required=True,\n        cast=lambda v: [lower() for i in v],\n        is_type_of=list,\n        len_min=1,\n        condition=lambda v: issubset(\n            {\"metcalf\", \"rosetta\"}\n        ),\n    ),\n]\n
    "},{"location":"api/nplinker/#nplinker.config.load_config","title":"load_config","text":"
    load_config(config_file: str | PathLike) -> Dynaconf\n

    Load and validate the configuration file.

    Parameters:

    Name Type Description Default config_file str | PathLike

    Path to the configuration file.

    required

    Returns:

    Name Type Description Dynaconf Dynaconf

    A Dynaconf object containing the configuration settings.

    Raises:

    Type Description FileNotFoundError

    If the configuration file does not exist.

    Source code in src/nplinker/config.py
    def load_config(config_file: str | PathLike) -> Dynaconf:\n    \"\"\"Load and validate the configuration file.\n\n    Args:\n        config_file: Path to the configuration file.\n\n    Returns:\n        Dynaconf: A Dynaconf object containing the configuration settings.\n\n    Raises:\n        FileNotFoundError: If the configuration file does not exist.\n    \"\"\"\n    config_file = transform_to_full_path(config_file)\n    if not config_file.exists():\n        raise FileNotFoundError(f\"Config file '{config_file}' not found\")\n\n    # Locate the default config file\n    default_config_file = Path(__file__).resolve().parent / \"nplinker_default.toml\"\n\n    # Load config files\n    config = Dynaconf(settings_files=[config_file], preload=[default_config_file])\n\n    # Validate configs\n    config.validators.register(*CONFIG_VALIDATORS)\n    config.validators.validate()\n\n    return config\n
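    A minimal usage sketch; the config file path is hypothetical, and the loaded settings are validated against CONFIG_VALIDATORS:

    >>> from nplinker.config import load_config
    >>> config = load_config(\"nplinker.toml\")
    >>> config.root_dir, config.mode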
    "},{"location":"api/schema/","title":"Schemas","text":""},{"location":"api/schema/#nplinker.schemas","title":"schemas","text":""},{"location":"api/schema/#nplinker.schemas.PODP_ADAPTED_SCHEMA","title":"PODP_ADAPTED_SCHEMA module-attribute","text":"
    PODP_ADAPTED_SCHEMA = load(f)\n
    "},{"location":"api/schema/#nplinker.schemas.SCHEMA_DIR","title":"SCHEMA_DIR module-attribute","text":"
    SCHEMA_DIR = parent\n
    "},{"location":"api/schema/#nplinker.schemas.GENOME_STATUS_SCHEMA","title":"GENOME_STATUS_SCHEMA module-attribute","text":"
    GENOME_STATUS_SCHEMA = load(f)\n
    "},{"location":"api/schema/#nplinker.schemas.GENOME_BGC_MAPPINGS_SCHEMA","title":"GENOME_BGC_MAPPINGS_SCHEMA module-attribute","text":"
    GENOME_BGC_MAPPINGS_SCHEMA = load(f)\n
    "},{"location":"api/schema/#nplinker.schemas.STRAIN_MAPPINGS_SCHEMA","title":"STRAIN_MAPPINGS_SCHEMA module-attribute","text":"
    STRAIN_MAPPINGS_SCHEMA = load(f)\n
    "},{"location":"api/schema/#nplinker.schemas.USER_STRAINS_SCHEMA","title":"USER_STRAINS_SCHEMA module-attribute","text":"
    USER_STRAINS_SCHEMA = load(f)\n
    "},{"location":"api/schema/#nplinker.schemas.validate_podp_json","title":"validate_podp_json","text":"
    validate_podp_json(json_data: dict) -> None\n

    Validate a dictionary of JSON data against the PODP JSON schema.

    All validation error messages are collected and raised as a single ValueError.

    Parameters:

    Name Type Description Default json_data dict

    The JSON data to validate.

    required

    Raises:

    Type Description ValueError

    If the JSON data does not match the schema.

    Source code in src/nplinker/schemas/utils.py
    def validate_podp_json(json_data: dict) -> None:\n    \"\"\"Validate a dictionary of JSON data against the PODP JSON schema.\n\n    All validation error messages are collected and raised as a single\n    ValueError.\n\n    Args:\n        json_data: The JSON data to validate.\n\n    Raises:\n        ValueError: If the JSON data does not match the schema.\n    \"\"\"\n    validator = Draft7Validator(PODP_ADAPTED_SCHEMA)\n    errors = sorted(validator.iter_errors(json_data), key=lambda e: e.path)\n    if errors:\n        error_messages = [f\"{e.json_path}: {e.message}\" for e in errors]\n        raise ValueError(\n            \"Not match PODP adapted schema, here are the detailed error:\\n  - \"\n            + \"\\n  - \".join(error_messages)\n        )\n
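    A sketch of validating a loaded PODP project file and printing the aggregated errors; the file path is hypothetical:

    >>> import json
    >>> from nplinker.schemas import validate_podp_json
    >>> with open(\"podp_project.json\") as f:
    ...     data = json.load(f)
    >>> try:
    ...     validate_podp_json(data)
    ... except ValueError as e:
    ...     print(e)  # all validation errors collected in one exception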
    "},{"location":"api/scoring/","title":"Data Models","text":""},{"location":"api/scoring/#nplinker.scoring","title":"scoring","text":""},{"location":"api/scoring/#nplinker.scoring.LinkGraph","title":"LinkGraph","text":"
    LinkGraph()\n

    A class to represent the links between objects in NPLinker.

    This class wraps the networkx.Graph class to provide a more user-friendly interface for working with the links.

    The links between objects are stored as edges in a graph, while the objects themselves are stored as nodes.

    The scoring data for each link (or link data) is stored as the key/value attributes of the edge.

    Examples:

    Create a LinkGraph object:

    >>> lg = LinkGraph()\n

    Add a link between a GCF and a Spectrum object:

    >>> lg.add_link(gcf, spectrum, metcalf=Score(\"metcalf\", 1.0, {\"cutoff\": 0.5}))\n

    Get all links for a given object:

    >>> lg[gcf]\n{spectrum: {\"metcalf\": Score(\"metcalf\", 1.0, {\"cutoff\": 0.5})}}\n

    Get all links:

    >>> lg.links\n[(gcf, spectrum, {\"metcalf\": Score(\"metcalf\", 1.0, {\"cutoff\": 0.5})})]\n

    Check if there is a link between two objects:

    >>> lg.has_link(gcf, spectrum)\nTrue\n

    Get the link data between two objects:

    >>> lg.get_link_data(gcf, spectrum)\n{\"metcalf\": Score(\"metcalf\", 1.0, {\"cutoff\": 0.5})}\n
    Source code in src/nplinker/scoring/link_graph.py
    def __init__(self) -> None:\n    self._g: Graph = Graph()\n
    "},{"location":"api/scoring/#nplinker.scoring.LinkGraph.links","title":"links property","text":"
    links: list[LINK]\n

    Get all links.

    Returns:

    Type Description list[LINK]

    A list of tuples containing the links between objects.

    "},{"location":"api/scoring/#nplinker.scoring.LinkGraph.add_link","title":"add_link","text":"
    add_link(u: Entity, v: Entity, **data: Score) -> None\n

    Add a link between two objects.

    The objects u and v must be different types, i.e. one must be a GCF and the other must be a Spectrum or MolecularFamily.

    Parameters:

    Name Type Description Default u Entity

    the first object, either a GCF, Spectrum, or MolecularFamily

    required v Entity

    the second object, either a GCF, Spectrum, or MolecularFamily

    required data Score

    keyword arguments. At least one scoring method and its data must be provided. The key must be the name of the scoring method defined in ScoringMethod, and the value is a Score object, e.g. metcalf=Score(\"metcalf\", 1.0, {\"cutoff\": 0.5}).

    {} Source code in src/nplinker/scoring/link_graph.py
    @validate_uv\ndef add_link(\n    self,\n    u: Entity,\n    v: Entity,\n    **data: Score,\n) -> None:\n    \"\"\"Add a link between two objects.\n\n    The objects `u` and `v` must be different types, i.e. one must be a GCF and the other must be\n    a Spectrum or MolecularFamily.\n\n    Args:\n        u: the first object, either a GCF, Spectrum, or MolecularFamily\n        v: the second object, either a GCF, Spectrum, or MolecularFamily\n        data: keyword arguments. At least one scoring method and its data must be provided.\n            The key must be the name of the scoring method defined in `ScoringMethod`, and the\n            value is a `Score` object, e.g. `metcalf=Score(\"metcalf\", 1.0, {\"cutoff\": 0.5})`.\n    \"\"\"\n    # validate the data\n    if not data:\n        raise ValueError(\"At least one scoring method and its data must be provided.\")\n    for key, value in data.items():\n        if not ScoringMethod.has_value(key):\n            raise ValueError(\n                f\"{key} is not a valid name of scoring method. See `ScoringMethod` for valid names.\"\n            )\n        if not isinstance(value, Score):\n            raise TypeError(f\"{value} is not a Score object.\")\n\n    self._g.add_edge(u, v, **data)\n
    "},{"location":"api/scoring/#nplinker.scoring.LinkGraph.has_link","title":"has_link","text":"
    has_link(u: Entity, v: Entity) -> bool\n

    Check if there is a link between two objects.

    Parameters:

    Name Type Description Default u Entity

    the first object, either a GCF, Spectrum, or MolecularFamily

    required v Entity

    the second object, either a GCF, Spectrum, or MolecularFamily

    required

    Returns:

    Type Description bool

    True if there is a link between the two objects, False otherwise

    Source code in src/nplinker/scoring/link_graph.py
    @validate_uv\ndef has_link(self, u: Entity, v: Entity) -> bool:\n    \"\"\"Check if there is a link between two objects.\n\n    Args:\n        u: the first object, either a GCF, Spectrum, or MolecularFamily\n        v: the second object, either a GCF, Spectrum, or MolecularFamily\n\n    Returns:\n        True if there is a link between the two objects, False otherwise\n    \"\"\"\n    return self._g.has_edge(u, v)\n
    "},{"location":"api/scoring/#nplinker.scoring.LinkGraph.get_link_data","title":"get_link_data","text":"
    get_link_data(u: Entity, v: Entity) -> LINK_DATA | None\n

    Get the data for a link between two objects.

    Parameters:

    Name Type Description Default u Entity

    the first object, either a GCF, Spectrum, or MolecularFamily

    required v Entity

    the second object, either a GCF, Spectrum, or MolecularFamily

    required

    Returns:

    Type Description LINK_DATA | None

    A dictionary of scoring methods and their data for the link between the two objects, or None if there is no link between the two objects.

    Source code in src/nplinker/scoring/link_graph.py
    @validate_uv\ndef get_link_data(\n    self,\n    u: Entity,\n    v: Entity,\n) -> LINK_DATA | None:\n    \"\"\"Get the data for a link between two objects.\n\n    Args:\n        u: the first object, either a GCF, Spectrum, or MolecularFamily\n        v: the second object, either a GCF, Spectrum, or MolecularFamily\n\n    Returns:\n        A dictionary of scoring methods and their data for the link between the two objects, or\n        None if there is no link between the two objects.\n    \"\"\"\n    return self._g.get_edge_data(u, v)  # type: ignore\n
    "},{"location":"api/scoring/#nplinker.scoring.Score","title":"Score dataclass","text":"
    Score(name: str, value: float, parameter: dict)\n

    A data class to represent score data.

    Attributes:

    Name Type Description name str

    the name of the scoring method. See ScoringMethod for valid values.

    value float

    the score value.

    parameter dict

    the parameters used for the scoring method.

    "},{"location":"api/scoring/#nplinker.scoring.Score.name","title":"name instance-attribute","text":"
    name: str\n
    "},{"location":"api/scoring/#nplinker.scoring.Score.value","title":"value instance-attribute","text":"
    value: float\n
    "},{"location":"api/scoring/#nplinker.scoring.Score.parameter","title":"parameter instance-attribute","text":"
    parameter: dict\n
    "},{"location":"api/scoring_abc/","title":"Abstract Base Classes","text":""},{"location":"api/scoring_abc/#nplinker.scoring.abc","title":"abc","text":""},{"location":"api/scoring_abc/#nplinker.scoring.abc.logger","title":"logger module-attribute","text":"
    logger = getLogger(__name__)\n
    "},{"location":"api/scoring_abc/#nplinker.scoring.abc.ScoringBase","title":"ScoringBase","text":"

    Bases: ABC

    Abstract base class of scoring methods.

    Attributes:

    Name Type Description name str

    The name of the scoring method.

    npl NPLinker | None

    The NPLinker object.

    "},{"location":"api/scoring_abc/#nplinker.scoring.abc.ScoringBase.name","title":"name class-attribute instance-attribute","text":"
    name: str = 'ScoringBase'\n
    "},{"location":"api/scoring_abc/#nplinker.scoring.abc.ScoringBase.npl","title":"npl class-attribute instance-attribute","text":"
    npl: NPLinker | None = None\n
    "},{"location":"api/scoring_abc/#nplinker.scoring.abc.ScoringBase.setup","title":"setup abstractmethod classmethod","text":"
    setup(npl: NPLinker)\n

    Setup class level attributes.

    Source code in src/nplinker/scoring/abc.py
    @classmethod\n@abstractmethod\ndef setup(cls, npl: NPLinker):\n    \"\"\"Setup class level attributes.\"\"\"\n
    "},{"location":"api/scoring_abc/#nplinker.scoring.abc.ScoringBase.get_links","title":"get_links abstractmethod","text":"
    get_links(*objects, **parameters) -> LinkGraph\n

    Get links information for the given objects.

    Parameters:

    Name Type Description Default objects

    A list of objects to get links for.

    () parameters

    The parameters used for scoring.

    {}

    Returns:

    Type Description LinkGraph

    The LinkGraph object.

    Source code in src/nplinker/scoring/abc.py
    @abstractmethod\ndef get_links(\n    self,\n    *objects,\n    **parameters,\n) -> LinkGraph:\n    \"\"\"Get links information for the given objects.\n\n    Args:\n        objects: A list of objects to get links for.\n        parameters: The parameters used for scoring.\n\n    Returns:\n        The LinkGraph object.\n    \"\"\"\n
    "},{"location":"api/scoring_abc/#nplinker.scoring.abc.ScoringBase.format_data","title":"format_data abstractmethod","text":"
    format_data(data) -> str\n

    Format the scoring data to a string.

    Source code in src/nplinker/scoring/abc.py
    @abstractmethod\ndef format_data(self, data) -> str:\n    \"\"\"Format the scoring data to a string.\"\"\"\n
    "},{"location":"api/scoring_abc/#nplinker.scoring.abc.ScoringBase.sort","title":"sort abstractmethod","text":"
    sort(objects, reverse=True) -> list\n

    Sort the given objects based on the scoring data.

    Source code in src/nplinker/scoring/abc.py
    @abstractmethod\ndef sort(self, objects, reverse=True) -> list:\n    \"\"\"Sort the given objects based on the scoring data.\"\"\"\n
    "},{"location":"api/scoring_methods/","title":"Scoring Methods","text":""},{"location":"api/scoring_methods/#nplinker.scoring","title":"scoring","text":""},{"location":"api/scoring_methods/#nplinker.scoring.ScoringMethod","title":"ScoringMethod","text":"

    Bases: Enum

    Enum class for scoring methods.

    "},{"location":"api/scoring_methods/#nplinker.scoring.ScoringMethod.METCALF","title":"METCALF class-attribute instance-attribute","text":"
    METCALF = 'metcalf'\n
    "},{"location":"api/scoring_methods/#nplinker.scoring.ScoringMethod.ROSETTA","title":"ROSETTA class-attribute instance-attribute","text":"
    ROSETTA = 'rosetta'\n
    "},{"location":"api/scoring_methods/#nplinker.scoring.ScoringMethod.NPLCLASS","title":"NPLCLASS class-attribute instance-attribute","text":"
    NPLCLASS = 'nplclass'\n
    "},{"location":"api/scoring_methods/#nplinker.scoring.ScoringMethod.has_value","title":"has_value classmethod","text":"
    has_value(value: str) -> bool\n

    Check if the enum has a value.

    Source code in src/nplinker/scoring/scoring_method.py
    @classmethod\ndef has_value(cls, value: str) -> bool:\n    \"\"\"Check if the enum has a value.\"\"\"\n    return any(value == item.value for item in cls)\n
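    A short sketch:

    >>> from nplinker.scoring import ScoringMethod
    >>> ScoringMethod.has_value(\"metcalf\")
    True
    >>> ScoringMethod.has_value(\"unknown\")
    False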
    "},{"location":"api/scoring_methods/#nplinker.scoring.MetcalfScoring","title":"MetcalfScoring","text":"

    Bases: ScoringBase

    Metcalf scoring method.

    Attributes:

    Name Type Description name

    The name of this scoring method, set to a fixed value metcalf.

    npl NPLinker | None

    The NPLinker object.

    CACHE str

    The name of the cache file to use for storing the MetcalfScoring.

    presence_gcf_strain DataFrame

    A DataFrame to store presence of gcfs with respect to strains. The index of the DataFrame are the GCF objects and the columns are Strain objects. The values are 1 where the gcf occurs in the strain, 0 otherwise.

    presence_spec_strain DataFrame

    A DataFrame to store presence of spectra with respect to strains. The index of the DataFrame are the Spectrum objects and the columns are Strain objects. The values are 1 where the spectrum occurs in the strain, 0 otherwise.

    presence_mf_strain DataFrame

    A DataFrame to store presence of molecular families with respect to strains. The index of the DataFrame are the MolecularFamily objects and the columns are Strain objects. The values are 1 where the molecular family occurs in the strain, 0 otherwise.

    raw_score_spec_gcf DataFrame

    A DataFrame to store the raw Metcalf scores for spectrum-gcf links. The columns are \"spec\", \"gcf\" and \"score\".

    raw_score_mf_gcf DataFrame

    A DataFrame to store the raw Metcalf scores for molecular family-gcf links. The columns are \"mf\", \"gcf\" and \"score\".

    metcalf_mean ndarray | None

    A numpy array to store the mean value used for standardising Metcalf scores. The array has shape (n_strains+1, n_strains+1), where n_strains is the number of strains.

    metcalf_std ndarray | None

    A numpy array to store the standard deviation value used for standardising Metcalf scores. The array has shape (n_strains+1, n_strains+1), where n_strains is the number of strains.

    "},{"location":"api/scoring_methods/#nplinker.scoring.MetcalfScoring.name","title":"name class-attribute instance-attribute","text":"
    name = METCALF.value\n
    "},{"location":"api/scoring_methods/#nplinker.scoring.MetcalfScoring.npl","title":"npl class-attribute instance-attribute","text":"
    npl: NPLinker | None = None\n
    "},{"location":"api/scoring_methods/#nplinker.scoring.MetcalfScoring.CACHE","title":"CACHE class-attribute instance-attribute","text":"
    CACHE: str = 'cache_metcalf_scoring.pckl'\n
    "},{"location":"api/scoring_methods/#nplinker.scoring.MetcalfScoring.metcalf_weights","title":"metcalf_weights class-attribute instance-attribute","text":"
    metcalf_weights: tuple[int, int, int, int] = (10, -10, 0, 1)\n
    "},{"location":"api/scoring_methods/#nplinker.scoring.MetcalfScoring.presence_gcf_strain","title":"presence_gcf_strain class-attribute instance-attribute","text":"
    presence_gcf_strain: DataFrame = DataFrame()\n
    "},{"location":"api/scoring_methods/#nplinker.scoring.MetcalfScoring.presence_spec_strain","title":"presence_spec_strain class-attribute instance-attribute","text":"
    presence_spec_strain: DataFrame = DataFrame()\n
    "},{"location":"api/scoring_methods/#nplinker.scoring.MetcalfScoring.presence_mf_strain","title":"presence_mf_strain class-attribute instance-attribute","text":"
    presence_mf_strain: DataFrame = DataFrame()\n
    "},{"location":"api/scoring_methods/#nplinker.scoring.MetcalfScoring.raw_score_spec_gcf","title":"raw_score_spec_gcf class-attribute instance-attribute","text":"
    raw_score_spec_gcf: DataFrame = DataFrame(\n    columns=[\"spec\", \"gcf\", \"score\"]\n)\n
    "},{"location":"api/scoring_methods/#nplinker.scoring.MetcalfScoring.raw_score_mf_gcf","title":"raw_score_mf_gcf class-attribute instance-attribute","text":"
    raw_score_mf_gcf: DataFrame = DataFrame(\n    columns=[\"mf\", \"gcf\", \"score\"]\n)\n
    "},{"location":"api/scoring_methods/#nplinker.scoring.MetcalfScoring.metcalf_mean","title":"metcalf_mean class-attribute instance-attribute","text":"
    metcalf_mean: ndarray | None = None\n
    "},{"location":"api/scoring_methods/#nplinker.scoring.MetcalfScoring.metcalf_std","title":"metcalf_std class-attribute instance-attribute","text":"
    metcalf_std: ndarray | None = None\n
    "},{"location":"api/scoring_methods/#nplinker.scoring.MetcalfScoring.setup","title":"setup classmethod","text":"
    setup(npl: NPLinker)\n

    Setup the MetcalfScoring object.

    This method is only called once to setup the MetcalfScoring object.

    Parameters:

    Name Type Description Default npl NPLinker

    The NPLinker object.

    required Source code in src/nplinker/scoring/metcalf_scoring.py
    @classmethod\ndef setup(cls, npl: NPLinker):\n    \"\"\"Setup the MetcalfScoring object.\n\n    This method is only called once to setup the MetcalfScoring object.\n\n    Args:\n        npl: The NPLinker object.\n    \"\"\"\n    if cls.npl is not None:\n        logger.info(\"MetcalfScoring.setup already called, skipping.\")\n        return\n\n    logger.info(\n        f\"MetcalfScoring.setup starts: #bgcs={len(npl.bgcs)}, #gcfs={len(npl.gcfs)}, \"\n        f\"#spectra={len(npl.spectra)}, #mfs={len(npl.mfs)}, #strains={npl.strains}\"\n    )\n    cls.npl = npl\n\n    # calculate presence of gcfs/spectra/mfs with respect to strains\n    cls.presence_gcf_strain = get_presence_gcf_strain(npl.gcfs, npl.strains)\n    cls.presence_spec_strain = get_presence_spec_strain(npl.spectra, npl.strains)\n    cls.presence_mf_strain = get_presence_mf_strain(npl.mfs, npl.strains)\n\n    # calculate raw Metcalf scores for spec-gcf links\n    raw_score_spec_gcf = cls._calc_raw_score(\n        cls.presence_spec_strain, cls.presence_gcf_strain, cls.metcalf_weights\n    )\n    cls.raw_score_spec_gcf = raw_score_spec_gcf.reset_index().melt(id_vars=\"index\")\n    cls.raw_score_spec_gcf.columns = [\"spec\", \"gcf\", \"score\"]  # type: ignore\n\n    # calculate raw Metcalf scores for mf-gcf links\n    raw_score_mf_gcf = cls._calc_raw_score(\n        cls.presence_mf_strain, cls.presence_gcf_strain, cls.metcalf_weights\n    )\n    cls.raw_score_mf_gcf = raw_score_mf_gcf.reset_index().melt(id_vars=\"index\")\n    cls.raw_score_mf_gcf.columns = [\"mf\", \"gcf\", \"score\"]  # type: ignore\n\n    # calculate mean and std for standardising Metcalf scores\n    cls.metcalf_mean, cls.metcalf_std = cls._calc_mean_std(\n        len(npl.strains), cls.metcalf_weights\n    )\n\n    logger.info(\"MetcalfScoring.setup completed\")\n
    "},{"location":"api/scoring_methods/#nplinker.scoring.MetcalfScoring.get_links","title":"get_links","text":"
    get_links(*objects, **parameters)\n

    Get links for the given objects.

    Parameters:

    Name Type Description Default objects

    The objects to get links for. All objects must be of the same type, i.e. GCF, Spectrum or MolecularFamily type. If no objects are provided, all detected objects (npl.gcfs) will be used.

    () parameters

    The scoring parameters to use for the links. The parameters are:

    - cutoff: The minimum score to consider a link (\u2265cutoff). Default is 0.\n- standardised: Whether to use standardised scores. Default is False.\n
    {}

    Returns:

    Type Description

    The LinkGraph object containing the links involving the input objects with the Metcalf scores.

    Raises:

    Type Description TypeError

    If the input objects are not of the same type or the object type is invalid.

    Source code in src/nplinker/scoring/metcalf_scoring.py
    def get_links(self, *objects, **parameters):\n    \"\"\"Get links for the given objects.\n\n    Args:\n        objects: The objects to get links for. All objects must be of the same type, i.e. `GCF`,\n            `Spectrum` or `MolecularFamily` type.\n            If no objects are provided, all detected objects (`npl.gcfs`) will be used.\n        parameters: The scoring parameters to use for the links. The parameters are:\n\n                - cutoff: The minimum score to consider a link (\u2265cutoff). Default is 0.\n                - standardised: Whether to use standardised scores. Default is False.\n\n    Returns:\n        The `LinkGraph` object containing the links involving the input objects with the Metcalf\n            scores.\n\n    Raises:\n        TypeError: If the input objects are not of the same type or the object type is invalid.\n    \"\"\"\n    # validate input objects\n    if len(objects) == 0:\n        objects = self.npl.gcfs\n    # check if all objects are of the same type\n    types = {type(i) for i in objects}\n    if len(types) > 1:\n        raise TypeError(\"Input objects must be of the same type.\")\n    # check if the object type is valid\n    obj_type = next(iter(types))\n    if obj_type not in (GCF, Spectrum, MolecularFamily):\n        raise TypeError(\n            f\"Invalid type {obj_type}. Input objects must be GCF, Spectrum or MolecularFamily objects.\"\n        )\n\n    # validate scoring parameters\n    self._cutoff: float = parameters.get(\"cutoff\", 0)\n    self._standardised: bool = parameters.get(\"standardised\", False)\n    parameters.update({\"cutoff\": self._cutoff, \"standardised\": self._standardised})\n\n    logger.info(\n        f\"MetcalfScoring: #objects={len(objects)}, type={obj_type}, cutoff={self._cutoff}, \"\n        f\"standardised={self._standardised}\"\n    )\n    if not self._standardised:\n        scores_list = self._get_links(*objects, obj_type=obj_type, score_cutoff=self._cutoff)\n    else:\n        if self.metcalf_mean is None or self.metcalf_std is None:\n            raise ValueError(\n                \"MetcalfScoring.metcalf_mean and metcalf_std are not set. Run MetcalfScoring.setup first.\"\n            )\n        # use negative infinity as the score cutoff to ensure we get all links\n        scores_list = self._get_links(*objects, obj_type=obj_type, score_cutoff=-np.inf)\n        scores_list = self._calc_standardised_score(scores_list)\n\n    links = LinkGraph()\n    for score_df in scores_list:\n        for row in score_df.itertuples(index=False):  # row has attributes: spec/mf, gcf, score\n            met = row.spec if score_df.name == LinkType.SPEC_GCF else row.mf\n            links.add_link(\n                row.gcf,\n                met,\n                metcalf=Score(self.name, row.score, parameters),\n            )\n\n    logger.info(f\"MetcalfScoring: completed! Found {len(links.links)} links in total.\")\n    return links\n
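Below is a minimal usage sketch. The config path is a placeholder, the dataset is assumed to be already prepared, and the no-argument construction of MetcalfScoring is an assumption (check the ScoringBase API of your version):

```python
from nplinker import NPLinker
from nplinker.scoring import MetcalfScoring

# Placeholder config; point it at a prepared NPLinker working directory.
npl = NPLinker("path/to/nplinker.toml")
# (If your version requires an explicit data-loading step, run it here.)

# setup() runs once per session: it precomputes the presence matrices,
# the raw spec-gcf / mf-gcf scores, and the mean/std used for standardisation.
MetcalfScoring.setup(npl)

# Assumption: direct no-argument instantiation of the scoring class.
scoring = MetcalfScoring()

# Keep only links whose raw Metcalf score is >= 3, for all detected GCFs.
links = scoring.get_links(*npl.gcfs, cutoff=3, standardised=False)
print(f"found {len(links.links)} links")
```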
    "},{"location":"api/scoring_methods/#nplinker.scoring.MetcalfScoring.format_data","title":"format_data","text":"
    format_data(data)\n

    Format the data for display.

    Source code in src/nplinker/scoring/metcalf_scoring.py
    def format_data(self, data):\n    \"\"\"Format the data for display.\"\"\"\n    # for metcalf the data will just be a floating point value (i.e. the score)\n    return f\"{data:.4f}\"\n
    "},{"location":"api/scoring_methods/#nplinker.scoring.MetcalfScoring.sort","title":"sort","text":"
    sort(objects, reverse=True)\n

    Sort the objects based on the score.

    Source code in src/nplinker/scoring/metcalf_scoring.py
    def sort(self, objects, reverse=True):\n    \"\"\"Sort the objects based on the score.\"\"\"\n    # sort based on score\n    return sorted(objects, key=lambda objlink: objlink[self], reverse=reverse)\n
    "},{"location":"api/scoring_utils/","title":"Utilities","text":""},{"location":"api/scoring_utils/#nplinker.scoring.utils","title":"utils","text":""},{"location":"api/scoring_utils/#nplinker.scoring.utils.get_presence_gcf_strain","title":"get_presence_gcf_strain","text":"
    get_presence_gcf_strain(\n    gcfs: Sequence[GCF], strains: StrainCollection\n) -> DataFrame\n

    Get the occurrence of strains in gcfs.

    The occurrence is a DataFrame with GCF objects as index and Strain objects as columns, and the values are 1 if the gcf occurs in the strain, 0 otherwise.

    Source code in src/nplinker/scoring/utils.py
    def get_presence_gcf_strain(gcfs: Sequence[GCF], strains: StrainCollection) -> pd.DataFrame:\n    \"\"\"Get the occurrence of strains in gcfs.\n\n    The occurrence is a DataFrame with GCF objects as index and Strain objects as columns, and the\n    values are 1 if the gcf occurs in the strain,  0 otherwise.\n    \"\"\"\n    df_gcf_strain = pd.DataFrame(\n        0,\n        index=gcfs,\n        columns=list(strains),\n        dtype=int,\n    )  # type: ignore\n    for gcf in gcfs:\n        for strain in strains:\n            if gcf.has_strain(strain):\n                df_gcf_strain.loc[gcf, strain] = 1\n    return df_gcf_strain  # type: ignore\n
    "},{"location":"api/scoring_utils/#nplinker.scoring.utils.get_presence_spec_strain","title":"get_presence_spec_strain","text":"
    get_presence_spec_strain(\n    spectra: Sequence[Spectrum], strains: StrainCollection\n) -> DataFrame\n

    Get the occurrence of strains in spectra.

    The occurrence is a DataFrame with Spectrum objects as index and Strain objects as columns, and the values are 1 if the spectrum occurs in the strain, 0 otherwise.

    Source code in src/nplinker/scoring/utils.py
    def get_presence_spec_strain(\n    spectra: Sequence[Spectrum], strains: StrainCollection\n) -> pd.DataFrame:\n    \"\"\"Get the occurrence of strains in spectra.\n\n    The occurrence is a DataFrame with Spectrum objects as index and Strain objects as columns, and\n    the values are 1 if the spectrum occurs in the strain, 0 otherwise.\n    \"\"\"\n    df_spec_strain = pd.DataFrame(\n        0,\n        index=spectra,\n        columns=list(strains),\n        dtype=int,\n    )  # type: ignore\n    for spectrum in spectra:\n        for strain in strains:\n            if spectrum.has_strain(strain):\n                df_spec_strain.loc[spectrum, strain] = 1\n    return df_spec_strain  # type: ignore\n
    "},{"location":"api/scoring_utils/#nplinker.scoring.utils.get_presence_mf_strain","title":"get_presence_mf_strain","text":"
    get_presence_mf_strain(\n    mfs: Sequence[MolecularFamily],\n    strains: StrainCollection,\n) -> DataFrame\n

    Get the occurrence of strains in molecular families.

    The occurrence is a DataFrame with MolecularFamily objects as index and Strain objects as columns, and the values are 1 if the molecular family occurs in the strain, 0 otherwise.

    Source code in src/nplinker/scoring/utils.py
    def get_presence_mf_strain(\n    mfs: Sequence[MolecularFamily], strains: StrainCollection\n) -> pd.DataFrame:\n    \"\"\"Get the occurrence of strains in molecular families.\n\n    The occurrence is a DataFrame with MolecularFamily objects as index and Strain objects as\n    columns, and the values are 1 if the molecular family occurs in the strain, 0 otherwise.\n    \"\"\"\n    df_mf_strain = pd.DataFrame(\n        0,\n        index=mfs,\n        columns=list(strains),\n        dtype=int,\n    )  # type: ignore\n    for mf in mfs:\n        for strain in strains:\n            if mf.has_strain(strain):\n                df_mf_strain.loc[mf, strain] = 1\n    return df_mf_strain  # type: ignore\n
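A small sketch of what these presence helpers produce. The FakeGCF stand-in is hypothetical (the real GCF class lives in nplinker.genomics); the helpers only call has_strain() on each object, so any object providing that method works for illustration:

```python
from nplinker.scoring.utils import get_presence_gcf_strain
from nplinker.strain import Strain, StrainCollection

class FakeGCF:
    """Hypothetical stand-in for GCF; only has_strain() is needed here."""
    def __init__(self, name: str, strain_ids: set[str]):
        self.name = name
        self._strain_ids = strain_ids

    def has_strain(self, strain: Strain) -> bool:
        return strain.id in self._strain_ids

strains = StrainCollection()
for sid in ("S1", "S2"):
    strains.add(Strain(sid))

gcfs = [FakeGCF("GCF_A", {"S1"}), FakeGCF("GCF_B", {"S1", "S2"})]
df = get_presence_gcf_strain(gcfs, strains)
# df is a 2x2 DataFrame of 0/1 flags: GCF objects as index, Strain objects as columns.
print(df.to_numpy())  # [[1 0], [1 1]]
```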
    "},{"location":"api/strain/","title":"Data Models","text":""},{"location":"api/strain/#nplinker.strain","title":"strain","text":""},{"location":"api/strain/#nplinker.strain.Strain","title":"Strain","text":"
    Strain(id: str)\n

    To model the mapping between strain id and its aliases.

    It's recommended to use NCBI taxonomy strain id or name as the primary id.

    Parameters:

    Name Type Description Default id str

    the representative id of the strain.

    required Source code in src/nplinker/strain/strain.py
    def __init__(self, id: str) -> None:\n    \"\"\"To model the mapping between strain id and its aliases.\n\n    Args:\n        id: the representative id of the strain.\n    \"\"\"\n    self.id: str = id\n    self._aliases: set[str] = set()\n
    "},{"location":"api/strain/#nplinker.strain.Strain.id","title":"id instance-attribute","text":"
    id: str = id\n
    "},{"location":"api/strain/#nplinker.strain.Strain.names","title":"names property","text":"
    names: set[str]\n

    Get the set of strain names including id and aliases.

    Returns:

    Type Description set[str]

    A set of names associated with the strain.

    "},{"location":"api/strain/#nplinker.strain.Strain.aliases","title":"aliases property","text":"
    aliases: set[str]\n

    Get the set of known aliases.

    Returns:

    Type Description set[str]

    A set of aliases associated with the strain.

    "},{"location":"api/strain/#nplinker.strain.Strain.add_alias","title":"add_alias","text":"
    add_alias(alias: str) -> None\n

    Add an alias to the list of known aliases.

    Parameters:

    Name Type Description Default alias str

    The alias to add to the list of known aliases.

    required Source code in src/nplinker/strain/strain.py
    def add_alias(self, alias: str) -> None:\n    \"\"\"Add an alias to the list of known aliases.\n\n    Args:\n        alias: The alias to add to the list of known aliases.\n    \"\"\"\n    if not isinstance(alias, str):\n        raise TypeError(f\"Expected str, got {type(alias)}\")\n    if len(alias) == 0:\n        logger.warning(\"Refusing to add an empty-string alias to strain {%s}\", self)\n    else:\n        self._aliases.add(alias)\n
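A short, self-contained example of the Strain API described above (the ids are made up):

```python
from nplinker.strain import Strain

strain = Strain("NCBI:123456")      # hypothetical primary id
strain.add_alias("lab_strain_42")   # hypothetical alias

print(strain.id)       # NCBI:123456
print(strain.aliases)  # {'lab_strain_42'}
print(strain.names)    # {'NCBI:123456', 'lab_strain_42'}
```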
    "},{"location":"api/strain/#nplinker.strain.StrainCollection","title":"StrainCollection","text":"
    StrainCollection()\n

    A collection of Strain objects.

    Source code in src/nplinker/strain/strain_collection.py
    def __init__(self):\n    # the order of strains is needed for scoring part, so use a list\n    self._strains: list[Strain] = []\n    self._strain_dict_name: dict[str, list[Strain]] = {}\n
    "},{"location":"api/strain/#nplinker.strain.StrainCollection.add","title":"add","text":"
    add(strain: Strain) -> None\n

    Add strain to the collection.

    If the strain already exists, merge the aliases.

    Parameters:

    Name Type Description Default strain Strain

    The strain to add.

    required Source code in src/nplinker/strain/strain_collection.py
    def add(self, strain: Strain) -> None:\n    \"\"\"Add strain to the collection.\n\n    If the strain already exists, merge the aliases.\n\n    Args:\n        strain: The strain to add.\n    \"\"\"\n    if strain in self._strains:\n        # only one strain object per id\n        strain_ref = self._strain_dict_name[strain.id][0]\n        new_aliases = [alias for alias in strain.aliases if alias not in strain_ref.aliases]\n        for alias in new_aliases:\n            strain_ref.add_alias(alias)\n            if alias not in self._strain_dict_name:\n                self._strain_dict_name[alias] = [strain_ref]\n            else:\n                self._strain_dict_name[alias].append(strain_ref)\n    else:\n        self._strains.append(strain)\n        for name in strain.names:\n            if name not in self._strain_dict_name:\n                self._strain_dict_name[name] = [strain]\n            else:\n                self._strain_dict_name[name].append(strain)\n
    "},{"location":"api/strain/#nplinker.strain.StrainCollection.remove","title":"remove","text":"
    remove(strain: Strain)\n

    Remove a strain from the collection.

    It removes the given strain object from the collection by strain id. If the strain id is not found, raise ValueError.

    Parameters:

    Name Type Description Default strain Strain

    The strain to remove.

    required

    Raises:

    Type Description ValueError

    If the strain is not found in the collection.

    Source code in src/nplinker/strain/strain_collection.py
    def remove(self, strain: Strain):\n    \"\"\"Remove a strain from the collection.\n\n    It removes the given strain object from the collection by strain id.\n    If the strain id is not found, raise ValueError.\n\n    Args:\n        strain: The strain to remove.\n\n    Raises:\n        ValueError: If the strain is not found in the collection.\n    \"\"\"\n    if strain in self._strains:\n        self._strains.remove(strain)\n        # only one strain object per id\n        strain_ref = self._strain_dict_name[strain.id][0]\n        for name in strain_ref.names:\n            if name in self._strain_dict_name:\n                new_strain_list = [s for s in self._strain_dict_name[name] if s.id != strain.id]\n                if not new_strain_list:\n                    del self._strain_dict_name[name]\n                else:\n                    self._strain_dict_name[name] = new_strain_list\n    else:\n        raise ValueError(f\"Strain {strain} not found in strain collection.\")\n
    "},{"location":"api/strain/#nplinker.strain.StrainCollection.filter","title":"filter","text":"
    filter(strain_set: set[Strain])\n

    Remove all strains that are not in strain_set from the strain collection.

    Parameters:

    Name Type Description Default strain_set set[Strain]

    Set of strains to keep.

    required Source code in src/nplinker/strain/strain_collection.py
    def filter(self, strain_set: set[Strain]):\n    \"\"\"Remove all strains that are not in strain_set from the strain collection.\n\n    Args:\n        strain_set: Set of strains to keep.\n    \"\"\"\n    # note that we need to copy the list of strains, as we are modifying it\n    for strain in self._strains.copy():\n        if strain not in strain_set:\n            self.remove(strain)\n
    "},{"location":"api/strain/#nplinker.strain.StrainCollection.intersection","title":"intersection","text":"
    intersection(other: StrainCollection) -> StrainCollection\n

    Get the intersection of two strain collections.

    Parameters:

    Name Type Description Default other StrainCollection

    The other strain collection to compare.

    required

    Returns:

    Type Description StrainCollection

    StrainCollection object containing the strains that are in both collections.

    Source code in src/nplinker/strain/strain_collection.py
    def intersection(self, other: StrainCollection) -> StrainCollection:\n    \"\"\"Get the intersection of two strain collections.\n\n    Args:\n        other: The other strain collection to compare.\n\n    Returns:\n        StrainCollection object containing the strains that are in both collections.\n    \"\"\"\n    intersection = StrainCollection()\n    for strain in self:\n        if strain in other:\n            intersection.add(strain)\n    return intersection\n
    "},{"location":"api/strain/#nplinker.strain.StrainCollection.has_name","title":"has_name","text":"
    has_name(name: str) -> bool\n

    Check if the strain collection contains the given strain name (id or alias).

    Parameters:

    Name Type Description Default name str

    Strain name (id or alias) to check.

    required

    Returns:

    Type Description bool

    True if the strain name is in the collection, False otherwise.

    Source code in src/nplinker/strain/strain_collection.py
    def has_name(self, name: str) -> bool:\n    \"\"\"Check if the strain collection contains the given strain name (id or alias).\n\n    Args:\n        name: Strain name (id or alias) to check.\n\n    Returns:\n        True if the strain name is in the collection, False otherwise.\n    \"\"\"\n    return name in self._strain_dict_name\n
    "},{"location":"api/strain/#nplinker.strain.StrainCollection.lookup","title":"lookup","text":"
    lookup(name: str) -> list[Strain]\n

    Lookup a strain by name (id or alias).

    Parameters:

    Name Type Description Default name str

    Strain name (id or alias) to lookup.

    required

    Returns:

    Type Description list[Strain]

    List of Strain objects with the given name.

    Raises:

    Type Description ValueError

    If the strain name is not found.

    Source code in src/nplinker/strain/strain_collection.py
    def lookup(self, name: str) -> list[Strain]:\n    \"\"\"Lookup a strain by name (id or alias).\n\n    Args:\n        name: Strain name (id or alias) to lookup.\n\n    Returns:\n        List of Strain objects with the given name.\n\n    Raises:\n        ValueError: If the strain name is not found.\n    \"\"\"\n    if name in self._strain_dict_name:\n        return self._strain_dict_name[name]\n    raise ValueError(f\"Strain {name} not found in the strain collection.\")\n
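A sketch tying the methods above together (strain ids are made up; note that lookup returns a list, since one alias may map to several strains):

```python
from nplinker.strain import Strain, StrainCollection

sc = StrainCollection()
s1 = Strain("S1")
s1.add_alias("alias1")
sc.add(s1)
sc.add(Strain("S2"))

print(sc.has_name("alias1"))      # True
print(sc.lookup("alias1")[0].id)  # S1

other = StrainCollection()
other.add(Strain("S2"))
common = sc.intersection(other)   # contains only S2
print(common.has_name("S2"), common.has_name("S1"))  # True False
```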
    "},{"location":"api/strain/#nplinker.strain.StrainCollection.read_json","title":"read_json staticmethod","text":"
    read_json(file: str | PathLike) -> 'StrainCollection'\n

    Read a strain mappings JSON file and return a StrainCollection object.

    Parameters:

    Name Type Description Default file str | PathLike

    Path to the strain mappings JSON file.

    required

    Returns:

    Type Description 'StrainCollection'

    StrainCollection object.

    Source code in src/nplinker/strain/strain_collection.py
    @staticmethod\ndef read_json(file: str | PathLike) -> \"StrainCollection\":\n    \"\"\"Read a strain mappings JSON file and return a StrainCollection object.\n\n    Args:\n        file: Path to the strain mappings JSON file.\n\n    Returns:\n        StrainCollection object.\n    \"\"\"\n    with open(file, \"r\") as f:\n        json_data = json.load(f)\n\n    # validate json data\n    validate(instance=json_data, schema=STRAIN_MAPPINGS_SCHEMA)\n\n    strain_collection = StrainCollection()\n    for data in json_data[\"strain_mappings\"]:\n        strain = Strain(data[\"strain_id\"])\n        for alias in data[\"strain_alias\"]:\n            strain.add_alias(alias)\n        strain_collection.add(strain)\n    return strain_collection\n
    "},{"location":"api/strain/#nplinker.strain.StrainCollection.to_json","title":"to_json","text":"
    to_json(file: str | PathLike | None = None) -> str | None\n

    Convert the StrainCollection object to a JSON string.

    Parameters:

    Name Type Description Default file str | PathLike | None

    Path to output JSON file. If None, return the JSON string instead.

    None

    Returns:

    Type Description str | None

If file is None, return the JSON string. Otherwise, write the JSON string to the given file.

    Source code in src/nplinker/strain/strain_collection.py
    def to_json(self, file: str | PathLike | None = None) -> str | None:\n    \"\"\"Convert the StrainCollection object to a JSON string.\n\n    Args:\n        file: Path to output JSON file. If None,\n            return the JSON string instead.\n\n    Returns:\n        If `file` is None, return the JSON string. Otherwise, write the JSON string to the given\n        file.\n    \"\"\"\n    data_list = [\n        {\"strain_id\": strain.id, \"strain_alias\": list(strain.aliases)} for strain in self\n    ]\n    json_data = {\"strain_mappings\": data_list, \"version\": \"1.0\"}\n\n    # validate json data\n    validate(instance=json_data, schema=STRAIN_MAPPINGS_SCHEMA)\n\n    if file is not None:\n        with open(file, \"w\") as f:\n            json.dump(json_data, f)\n        return None\n    return json.dumps(json_data)\n
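A round-trip sketch (strain id and alias are placeholders); the written file follows the strain_mappings.json schema visible in the source above:

```python
from nplinker.strain import Strain, StrainCollection

sc = StrainCollection()
strain = Strain("strain1")          # placeholder id
strain.add_alias("spectrum_0001")   # placeholder alias
sc.add(strain)

# Writes {"strain_mappings": [...], "version": "1.0"} to the file.
sc.to_json("strain_mappings.json")
sc2 = StrainCollection.read_json("strain_mappings.json")
print(sc2.has_name("spectrum_0001"))  # True
```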
    "},{"location":"api/strain_utils/","title":"Utilities","text":""},{"location":"api/strain_utils/#nplinker.strain.utils","title":"utils","text":""},{"location":"api/strain_utils/#nplinker.strain.utils.logger","title":"logger module-attribute","text":"
    logger = getLogger(__name__)\n
    "},{"location":"api/strain_utils/#nplinker.strain.utils.load_user_strains","title":"load_user_strains","text":"
    load_user_strains(json_file: str | PathLike) -> set[Strain]\n

    Load user specified strains from a JSON file.

    The JSON file must follow the schema defined in schemas/user_strains.json.

Example content of the JSON file:
    {\"strain_ids\": [\"strain1\", \"strain2\"]}\n

    Parameters:

    Name Type Description Default json_file str | PathLike

    Path to the JSON file containing user specified strains.

    required

    Returns:

    Type Description set[Strain]

    A set of user specified strains.

    Source code in src/nplinker/strain/utils.py
    def load_user_strains(json_file: str | PathLike) -> set[Strain]:\n    \"\"\"Load user specified strains from a JSON file.\n\n    The JSON file must follow the schema defined in `schemas/user_strains.json`.\n\n    An example content of the JSON file:\n        ```\n        {\"strain_ids\": [\"strain1\", \"strain2\"]}\n        ```\n\n    Args:\n        json_file: Path to the JSON file containing user specified strains.\n\n    Returns:\n        A set of user specified strains.\n    \"\"\"\n    with open(json_file, \"r\") as f:\n        json_data = json.load(f)\n\n    # validate json data\n    validate(instance=json_data, schema=USER_STRAINS_SCHEMA)\n\n    strains = set()\n    for strain_id in json_data[\"strain_ids\"]:\n        strains.add(Strain(strain_id))\n\n    return strains\n
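A runnable sketch that writes a file in the documented format and loads it back (the ids are placeholders):

```python
import json
from nplinker.strain.utils import load_user_strains

# Write a user strains file following the documented schema.
with open("strains_selected.json", "w") as f:
    json.dump({"strain_ids": ["strain1", "strain2"]}, f)

strains = load_user_strains("strains_selected.json")
print(sorted(s.id for s in strains))  # ['strain1', 'strain2']
```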
    "},{"location":"api/strain_utils/#nplinker.strain.utils.podp_generate_strain_mappings","title":"podp_generate_strain_mappings","text":"
    podp_generate_strain_mappings(\n    podp_project_json_file: str | PathLike,\n    genome_status_json_file: str | PathLike,\n    genome_bgc_mappings_file: str | PathLike,\n    gnps_file_mappings_file: str | PathLike,\n    output_json_file: str | PathLike,\n) -> StrainCollection\n

    Generate strain mappings JSON file for PODP pipeline.

To get the strain mappings, we need to combine the following mappings:

- strain_id <-> original_genome_id <-> resolved_genome_id <-> bgc_id
- strain_id <-> MS_filename <-> spectrum_id

These mappings are extracted from the following files:

- "strain_id <-> original_genome_id" is extracted from podp_project_json_file.
- "original_genome_id <-> resolved_genome_id" is extracted from genome_status_json_file.
- "resolved_genome_id <-> bgc_id" is extracted from genome_bgc_mappings_file.
- "strain_id <-> MS_filename" is extracted from podp_project_json_file.
- "MS_filename <-> spectrum_id" is extracted from gnps_file_mappings_file.

    Parameters:

    Name Type Description Default podp_project_json_file str | PathLike

    The path to the PODP project JSON file.

    required genome_status_json_file str | PathLike

    The path to the genome status JSON file.

    required genome_bgc_mappings_file str | PathLike

    The path to the genome BGC mappings JSON file.

    required gnps_file_mappings_file str | PathLike

    The path to the GNPS file mappings file (csv or tsv).

    required output_json_file str | PathLike

    The path to the output JSON file.

    required

    Returns:

    Type Description StrainCollection

    The strain mappings stored in a StrainCollection object.

See Also:

- extract_mappings_strain_id_original_genome_id: Extract mappings "strain_id <-> original_genome_id".
- extract_mappings_original_genome_id_resolved_genome_id: Extract mappings "original_genome_id <-> resolved_genome_id".
- extract_mappings_resolved_genome_id_bgc_id: Extract mappings "resolved_genome_id <-> bgc_id".
- get_mappings_strain_id_bgc_id: Get mappings "strain_id <-> bgc_id".
- extract_mappings_strain_id_ms_filename: Extract mappings "strain_id <-> MS_filename".
- extract_mappings_ms_filename_spectrum_id: Extract mappings "MS_filename <-> spectrum_id".
- get_mappings_strain_id_spectrum_id: Get mappings "strain_id <-> spectrum_id".

Source code in src/nplinker/strain/utils.py
    def podp_generate_strain_mappings(\n    podp_project_json_file: str | PathLike,\n    genome_status_json_file: str | PathLike,\n    genome_bgc_mappings_file: str | PathLike,\n    gnps_file_mappings_file: str | PathLike,\n    output_json_file: str | PathLike,\n) -> StrainCollection:\n    \"\"\"Generate strain mappings JSON file for PODP pipeline.\n\n    To get the strain mappings, we need to combine the following mappings:\n\n    - strain_id <-> original_genome_id <-> resolved_genome_id <-> bgc_id\n    - strain_id <-> MS_filename <-> spectrum_id\n\n    These mappings are extracted from the following files:\n\n    - \"strain_id <-> original_genome_id\" is extracted from `podp_project_json_file`.\n    - \"original_genome_id <-> resolved_genome_id\" is extracted from `genome_status_json_file`.\n    - \"resolved_genome_id <-> bgc_id\" is extracted from `genome_bgc_mappings_file`.\n    - \"strain_id <-> MS_filename\" is extracted from `podp_project_json_file`.\n    - \"MS_filename <-> spectrum_id\" is extracted from `gnps_file_mappings_file`.\n\n    Args:\n        podp_project_json_file: The path to the PODP project\n            JSON file.\n        genome_status_json_file: The path to the genome status\n            JSON file.\n        genome_bgc_mappings_file: The path to the genome BGC\n            mappings JSON file.\n        gnps_file_mappings_file: The path to the GNPS file\n            mappings file (csv or tsv).\n        output_json_file: The path to the output JSON file.\n\n    Returns:\n        The strain mappings stored in a StrainCollection object.\n\n    See Also:\n        - `extract_mappings_strain_id_original_genome_id`: Extract mappings\n            \"strain_id <-> original_genome_id\".\n        - `extract_mappings_original_genome_id_resolved_genome_id`: Extract mappings\n            \"original_genome_id <-> resolved_genome_id\".\n        - `extract_mappings_resolved_genome_id_bgc_id`: Extract mappings\n            \"resolved_genome_id <-> bgc_id\".\n        - `get_mappings_strain_id_bgc_id`: Get mappings \"strain_id <-> bgc_id\".\n        - `extract_mappings_strain_id_ms_filename`: Extract mappings\n            \"strain_id <-> MS_filename\".\n        - `extract_mappings_ms_filename_spectrum_id`: Extract mappings\n            \"MS_filename <-> spectrum_id\".\n        - `get_mappings_strain_id_spectrum_id`: Get mappings \"strain_id <-> spectrum_id\".\n    \"\"\"\n    # Get mappings strain_id <-> original_genome_id <-> resolved_genome_id <-> bgc_id\n    mappings_strain_id_bgc_id = get_mappings_strain_id_bgc_id(\n        extract_mappings_strain_id_original_genome_id(podp_project_json_file),\n        extract_mappings_original_genome_id_resolved_genome_id(genome_status_json_file),\n        extract_mappings_resolved_genome_id_bgc_id(genome_bgc_mappings_file),\n    )\n\n    # Get mappings strain_id <-> MS_filename <-> spectrum_id\n    mappings_strain_id_spectrum_id = get_mappings_strain_id_spectrum_id(\n        extract_mappings_strain_id_ms_filename(podp_project_json_file),\n        extract_mappings_ms_filename_spectrum_id(gnps_file_mappings_file),\n    )\n\n    # Get mappings strain_id <-> bgc_id / spectrum_id\n    mappings = mappings_strain_id_bgc_id.copy()\n    for strain_id, spectrum_ids in mappings_strain_id_spectrum_id.items():\n        if strain_id in mappings:\n            mappings[strain_id].update(spectrum_ids)\n        else:\n            mappings[strain_id] = spectrum_ids.copy()\n\n    # Create StrainCollection\n    sc = StrainCollection()\n    for strain_id, bgc_ids in 
mappings.items():\n        if not sc.has_name(strain_id):\n            strain = Strain(strain_id)\n            for bgc_id in bgc_ids:\n                strain.add_alias(bgc_id)\n            sc.add(strain)\n        else:\n            # strain_list has only one element\n            strain_list = sc.lookup(strain_id)\n            for bgc_id in bgc_ids:\n                strain_list[0].add_alias(bgc_id)\n\n    # Write strain mappings JSON file\n    sc.to_json(output_json_file)\n    logger.info(\"Generated strain mappings JSON file: %s\", output_json_file)\n\n    return sc\n
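A call sketch with placeholder paths; in a real podp run these files sit in the working directory described on the Working Directory Structure page, and the call will raise if they do not exist:

```python
from nplinker.strain.utils import podp_generate_strain_mappings

sc = podp_generate_strain_mappings(
    podp_project_json_file="downloads/paired_datarecord.json",      # placeholder
    genome_status_json_file="downloads/genome_status.json",
    genome_bgc_mappings_file="antismash/genome_bgc_mappings.json",  # placeholder location
    gnps_file_mappings_file="gnps/file_mappings.tsv",
    output_json_file="strain_mappings.json",
)
# sc is the resulting StrainCollection; it was also written to output_json_file.
```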
    "},{"location":"api/utils/","title":"General Utilities","text":""},{"location":"api/utils/#nplinker.utils","title":"utils","text":""},{"location":"api/utils/#nplinker.utils.logger","title":"logger module-attribute","text":"
    logger = getLogger(__name__)\n
    "},{"location":"api/utils/#nplinker.utils.calculate_md5","title":"calculate_md5","text":"
    calculate_md5(\n    fpath: str | PathLike, chunk_size: int = 1024 * 1024\n) -> str\n

    Calculate the MD5 checksum of a file.

    Parameters:

    Name Type Description Default fpath str | PathLike

    Path to the file.

    required chunk_size int

    Chunk size for reading the file. Defaults to 1024*1024.

    1024 * 1024

    Returns:

    Type Description str

    MD5 checksum of the file.

    Source code in src/nplinker/utils.py
    def calculate_md5(fpath: str | PathLike, chunk_size: int = 1024 * 1024) -> str:\n    \"\"\"Calculate the MD5 checksum of a file.\n\n    Args:\n        fpath: Path to the file.\n        chunk_size: Chunk size for reading the file. Defaults to 1024*1024.\n\n    Returns:\n        MD5 checksum of the file.\n    \"\"\"\n    if sys.version_info >= (3, 9):\n        md5 = hashlib.md5(usedforsecurity=False)\n    else:\n        md5 = hashlib.md5()\n    with open(fpath, \"rb\") as f:\n        for chunk in iter(lambda: f.read(chunk_size), b\"\"):\n            md5.update(chunk)\n    return md5.hexdigest()\n
    "},{"location":"api/utils/#nplinker.utils.check_md5","title":"check_md5","text":"
    check_md5(fpath: str | PathLike, md5: str) -> bool\n

    Verify the MD5 checksum of a file.

    Parameters:

    Name Type Description Default fpath str | PathLike

    Path to the file.

    required md5 str

    MD5 checksum to verify.

    required

    Returns:

    Type Description bool

    True if the MD5 checksum matches, False otherwise.

    Source code in src/nplinker/utils.py
    def check_md5(fpath: str | PathLike, md5: str) -> bool:\n    \"\"\"Verify the MD5 checksum of a file.\n\n    Args:\n        fpath: Path to the file.\n        md5: MD5 checksum to verify.\n\n    Returns:\n        True if the MD5 checksum matches, False otherwise.\n    \"\"\"\n    return md5 == calculate_md5(fpath)\n
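A self-contained example (the file is created on the spot):

```python
from pathlib import Path
from nplinker.utils import calculate_md5, check_md5

Path("example.txt").write_text("hello nplinker\n")
md5 = calculate_md5("example.txt")
print(md5)                            # 32-character hex digest
assert check_md5("example.txt", md5)  # True when the checksum matches
```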
    "},{"location":"api/utils/#nplinker.utils.download_and_extract_archive","title":"download_and_extract_archive","text":"
    download_and_extract_archive(\n    url: str,\n    download_root: str | PathLike,\n    extract_root: str | Path | None = None,\n    filename: str | None = None,\n    md5: str | None = None,\n    remove_finished: bool = False,\n) -> None\n

    Download a file from url and extract it.

This method is a wrapper around the download_url and extract_archive methods.

    Parameters:

    Name Type Description Default url str

    URL to download file from

    required download_root str | PathLike

    Path to the directory to place downloaded file in. If it doesn't exist, it will be created.

    required extract_root str | Path | None

Path to the directory the file will be extracted to. The given directory will be created if it does not exist. If omitted, the download_root is used.

    None filename str | None

    Name to save the downloaded file under. If None, use the basename of the URL

    None md5 str | None

    MD5 checksum of the download. If None, do not check

    None remove_finished bool

    If True, remove the downloaded file after the extraction. Defaults to False.

    False Source code in src/nplinker/utils.py
    def download_and_extract_archive(\n    url: str,\n    download_root: str | PathLike,\n    extract_root: str | Path | None = None,\n    filename: str | None = None,\n    md5: str | None = None,\n    remove_finished: bool = False,\n) -> None:\n    \"\"\"Download a file from url and extract it.\n\n       This method is a wrapper of `download_url` and `extract_archive` methods.\n\n    Args:\n        url: URL to download file from\n        download_root: Path to the directory to place downloaded\n            file in. If it doesn't exist, it will be created.\n        extract_root: Path to the directory the file\n            will be extracted to. The given directory will be created if not exist.\n            If omitted, the `download_root` is used.\n        filename: Name to save the downloaded file under.\n            If None, use the basename of the URL\n        md5: MD5 checksum of the download. If None, do not check\n        remove_finished: If `True`, remove the downloaded file\n             after the extraction. Defaults to False.\n    \"\"\"\n    download_root = Path(download_root)\n    if extract_root is None:\n        extract_root = download_root\n    else:\n        extract_root = Path(extract_root)\n    if not filename:\n        filename = Path(url).name\n\n    download_url(url, download_root, filename, md5)\n\n    archive = download_root / filename\n    extract_archive(archive, extract_root, remove_finished=remove_finished)\n
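A hedged sketch; the URL is a placeholder for a real archive:

```python
from nplinker.utils import download_and_extract_archive

download_and_extract_archive(
    url="https://example.com/dataset.tar.gz",  # placeholder URL
    download_root="downloads",                 # created if missing
    extract_root="data",                       # created if missing
    md5=None,               # set a real checksum string to verify the download
    remove_finished=True,   # delete the archive once extracted
)
```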
    "},{"location":"api/utils/#nplinker.utils.download_url","title":"download_url","text":"
    download_url(\n    url: str,\n    root: str | PathLike,\n    filename: str | None = None,\n    md5: str | None = None,\n    http_method: str = \"GET\",\n    allow_http_redirect: bool = True,\n) -> None\n

    Download a file from a url and place it in root.

    Parameters:

    Name Type Description Default url str

    URL to download file from

    required root str | PathLike

    Directory to place downloaded file in. If it doesn't exist, it will be created.

    required filename str | None

    Name to save the file under. If None, use the basename of the URL.

    None md5 str | None

    MD5 checksum of the download. If None, do not check.

    None http_method str

    HTTP request method, e.g. \"GET\", \"POST\". Defaults to \"GET\".

    'GET' allow_http_redirect bool

    If true, enable following redirects for all HTTP (\"http:\") methods.

    True Source code in src/nplinker/utils.py
    def download_url(\n    url: str,\n    root: str | PathLike,\n    filename: str | None = None,\n    md5: str | None = None,\n    http_method: str = \"GET\",\n    allow_http_redirect: bool = True,\n) -> None:\n    \"\"\"Download a file from a url and place it in root.\n\n    Args:\n        url: URL to download file from\n        root: Directory to place downloaded file in. If it doesn't exist, it will be created.\n        filename: Name to save the file under. If None, use the\n            basename of the URL.\n        md5: MD5 checksum of the download. If None, do not check.\n        http_method: HTTP request method, e.g. \"GET\", \"POST\".\n            Defaults to \"GET\".\n        allow_http_redirect: If true, enable following redirects for all HTTP (\"http:\") methods.\n    \"\"\"\n    root = transform_to_full_path(root)\n    # create the download directory if not exist\n    root.mkdir(exist_ok=True)\n    if not filename:\n        filename = Path(url).name\n    fpath = root / filename\n\n    # check if file is already present locally\n    if fpath.is_file() and md5 is not None and check_md5(fpath, md5):\n        logger.info(\"Using downloaded and verified file: \" + str(fpath))\n        return\n\n    # download the file\n    with open(fpath, \"wb\") as fh:\n        with httpx.stream(http_method, url, follow_redirects=allow_http_redirect) as response:\n            if not response.is_success:\n                fpath.unlink(missing_ok=True)\n                raise RuntimeError(\n                    f\"Failed to download url {url} with status code {response.status_code}\"\n                )\n            total = int(response.headers.get(\"Content-Length\", 0))\n\n            with Progress(\n                TextColumn(\"[progress.description]{task.description}\"),\n                BarColumn(bar_width=None),\n                \"[progress.percentage]{task.percentage:>3.1f}%\",\n                \"\u2022\",\n                DownloadColumn(),\n                \"\u2022\",\n                TransferSpeedColumn(),\n                \"\u2022\",\n                TimeRemainingColumn(),\n                \"\u2022\",\n                TimeElapsedColumn(),\n            ) as progress:\n                task = progress.add_task(f\"[hot_pink]Downloading {fpath.name}\", total=total)\n                for chunk in response.iter_bytes():\n                    fh.write(chunk)\n                    progress.update(task, advance=len(chunk))\n\n    # check integrity of downloaded file\n    if md5 is not None and not check_md5(fpath, md5):\n        raise RuntimeError(\"MD5 validation failed.\")\n
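A minimal sketch with a placeholder URL; a progress bar is shown while the file streams to root/filename:

```python
from nplinker.utils import download_url

download_url(
    "https://example.com/strain_mappings.json",  # placeholder URL
    root="downloads",
    filename="strain_mappings.json",
    md5=None,  # pass a checksum string to enable verification
)
```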
    "},{"location":"api/utils/#nplinker.utils.extract_archive","title":"extract_archive","text":"
    extract_archive(\n    from_path: str | PathLike,\n    extract_root: str | PathLike | None = None,\n    members: list | None = None,\n    remove_finished: bool = False,\n) -> str\n

    Extract an archive.

The archive type and a possible compression are automatically detected from the file name. If the file is compressed but not an archive, the call is dispatched to decompress.

    Parameters:

    Name Type Description Default from_path str | PathLike

    Path to the file to be extracted.

    required extract_root str | PathLike | None

Path to the directory the file will be extracted to. The given directory will be created if it does not exist. If omitted, the directory of the archive file is used.

    None members list | None

Optional selection of members to extract. If not specified, all members are extracted. Members must be a subset of the list returned by:

- zipfile.ZipFile.namelist(), or a list of strings, for a zip file
- tarfile.TarFile.getmembers() for a tar file

    None remove_finished bool

    If True, remove the file after the extraction.

    False

    Returns:

    Type Description str

    Path to the directory the file was extracted to.

    Source code in src/nplinker/utils.py
    def extract_archive(\n    from_path: str | PathLike,\n    extract_root: str | PathLike | None = None,\n    members: list | None = None,\n    remove_finished: bool = False,\n) -> str:\n    \"\"\"Extract an archive.\n\n    The archive type and a possible compression is automatically detected from\n    the file name. If the file is compressed but not an archive the call is\n    dispatched to :func:`decompress`.\n\n    Args:\n        from_path: Path to the file to be extracted.\n        extract_root: Path to the directory the file will be extracted to.\n            The given directory will be created if not exist.\n            If omitted, the directory of the archive file is used.\n        members: Optional selection of members to extract. If not specified,\n            all members are extracted.\n            Members must be a subset of the list returned by\n            - `zipfile.ZipFile.namelist()` or a list of strings for zip file\n            - `tarfile.TarFile.getmembers()` for tar file\n        remove_finished: If `True`, remove the file after the extraction.\n\n    Returns:\n        Path to the directory the file was extracted to.\n    \"\"\"\n    from_path = Path(from_path)\n\n    if extract_root is None:\n        extract_root = from_path.parent\n    else:\n        extract_root = Path(extract_root)\n\n    # create the extract directory if not exist\n    extract_root.mkdir(exist_ok=True)\n\n    logger.info(f\"Extracting {from_path} to {extract_root}\")\n    suffix, archive_type, compression = _detect_file_type(from_path)\n    if not archive_type:\n        return _decompress(\n            from_path,\n            extract_root / from_path.name.replace(suffix, \"\"),\n            remove_finished=remove_finished,\n        )\n\n    extractor = _ARCHIVE_EXTRACTORS[archive_type]\n\n    extractor(str(from_path), str(extract_root), members, compression)\n    if remove_finished:\n        from_path.unlink()\n\n    return str(extract_root)\n
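A sketch assuming an archive downloaded earlier (the path is a placeholder); the archive type is detected from the file name:

```python
from nplinker.utils import extract_archive

dest = extract_archive(
    "downloads/GCF_000016425.1.zip",           # placeholder archive path
    extract_root="antismash/GCF_000016425.1",  # created if missing
)
print(dest)  # directory the archive was extracted to
```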
    "},{"location":"api/utils/#nplinker.utils.find_delimiter","title":"find_delimiter","text":"
    find_delimiter(file: str | PathLike) -> str\n

    Detect the delimiter for the given tabular file.

    Parameters:

    Name Type Description Default file str | PathLike

    Path to tabular file.

    required

    Returns:

    Type Description str

    Detected delimiter character.

    Examples:

    >>> delim = find_delimiter(\"~/table.csv\")\n
    Source code in src/nplinker/utils.py
    def find_delimiter(file: str | PathLike) -> str:\n    \"\"\"Detect the delimiter for the given tabular file.\n\n    Args:\n        file: Path to tabular file.\n\n    Returns:\n        Detected delimiter character.\n\n    Examples:\n        >>> delim = find_delimiter(\"~/table.csv\")\n    \"\"\"\n    sniffer = csv.Sniffer()\n    with open(file, mode=\"rt\", encoding=\"utf-8\") as fp:\n        delimiter = sniffer.sniff(fp.read(5000)).delimiter\n    return delimiter\n
    "},{"location":"api/utils/#nplinker.utils.get_headers","title":"get_headers","text":"
    get_headers(file: str | PathLike) -> list[str]\n

    Read headers from the given tabular file.

    Parameters:

    Name Type Description Default file str | PathLike

    Path to the file to read the header from.

    required

    Returns:

    Type Description list[str]

    A list of column names from the header.

    Source code in src/nplinker/utils.py
    def get_headers(file: str | PathLike) -> list[str]:\n    \"\"\"Read headers from the given tabular file.\n\n    Args:\n        file: Path to the file to read the header from.\n\n    Returns:\n        A list of column names from the header.\n    \"\"\"\n    with open(file) as f:\n        headers = f.readline().strip()\n        dl = find_delimiter(file)\n        return headers.split(dl)\n
    "},{"location":"api/utils/#nplinker.utils.is_file_format","title":"is_file_format","text":"
    is_file_format(\n    file: str | PathLike, format: str = \"tsv\"\n) -> bool\n

    Check if the file is in the given format.

    Parameters:

    Name Type Description Default file str | PathLike

    Path to the file to check.

    required format str

    The format to check for, either \"tsv\" or \"csv\".

    'tsv'

    Returns:

    Type Description bool

    True if the file is in the given format, False otherwise.

    Source code in src/nplinker/utils.py
    def is_file_format(file: str | PathLike, format: str = \"tsv\") -> bool:\n    \"\"\"Check if the file is in the given format.\n\n    Args:\n        file: Path to the file to check.\n        format: The format to check for, either \"tsv\" or \"csv\".\n\n    Returns:\n        True if the file is in the given format, False otherwise.\n    \"\"\"\n    try:\n        with open(file, \"rt\") as f:\n            if format == \"tsv\":\n                reader = csv.reader(f, delimiter=\"\\t\")\n            elif format == \"csv\":\n                reader = csv.reader(f, delimiter=\",\")\n            else:\n                raise ValueError(f\"Unknown format '{format}'.\")\n            for _ in reader:\n                pass\n        return True\n    except csv.Error:\n        return False\n
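A self-contained example covering find_delimiter, get_headers and is_file_format (the table is created on the spot):

```python
from pathlib import Path
from nplinker.utils import find_delimiter, get_headers, is_file_format

Path("table.tsv").write_text("id\tname\n1\tfoo\n2\tbar\n")

print(repr(find_delimiter("table.tsv")))   # '\t'
print(get_headers("table.tsv"))            # ['id', 'name']
print(is_file_format("table.tsv", "tsv"))  # True
```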
    "},{"location":"api/utils/#nplinker.utils.list_dirs","title":"list_dirs","text":"
    list_dirs(\n    root: str | PathLike, keep_parent: bool = True\n) -> list[str]\n

    List all directories at a given root.

    Parameters:

    Name Type Description Default root str | PathLike

    Path to directory whose folders need to be listed

    required keep_parent bool

    If true, prepends the path to each result, otherwise only returns the name of the directories found

    True Source code in src/nplinker/utils.py
    def list_dirs(root: str | PathLike, keep_parent: bool = True) -> list[str]:\n    \"\"\"List all directories at a given root.\n\n    Args:\n        root: Path to directory whose folders need to be listed\n        keep_parent: If true, prepends the path to each result, otherwise\n            only returns the name of the directories found\n    \"\"\"\n    root = transform_to_full_path(root)\n    directories = [str(p) for p in root.iterdir() if p.is_dir()]\n    if not keep_parent:\n        directories = [os.path.basename(d) for d in directories]\n    return directories\n
    "},{"location":"api/utils/#nplinker.utils.list_files","title":"list_files","text":"
    list_files(\n    root: str | PathLike,\n    prefix: str | tuple[str, ...] = \"\",\n    suffix: str | tuple[str, ...] = \"\",\n    keep_parent: bool = True,\n) -> list[str]\n

    List all files at a given root.

    Parameters:

    Name Type Description Default root str | PathLike

    Path to directory whose files need to be listed

    required prefix str | tuple[str, ...]

Prefix of the file names to match. Defaults to empty string "".

    '' suffix str | tuple[str, ...]

    Suffix of the files to match, e.g. \".png\" or (\".jpg\", \".png\"). Defaults to empty string '\"\"'.

    '' keep_parent bool

If true, prepends the parent path to each result, otherwise only returns the name of the files found. Defaults to True.

    True Source code in src/nplinker/utils.py
def list_files(\n    root: str | PathLike,\n    prefix: str | tuple[str, ...] = \"\",\n    suffix: str | tuple[str, ...] = \"\",\n    keep_parent: bool = True,\n) -> list[str]:\n    \"\"\"List all files at a given root.\n\n    Args:\n        root: Path to directory whose files need to be listed\n        prefix: Prefix of the file names to match.\n            Defaults to empty string '\"\"'.\n        suffix: Suffix of the files to match, e.g. \".png\" or\n            (\".jpg\", \".png\").\n            Defaults to empty string '\"\"'.\n        keep_parent: If true, prepends the parent path to each\n            result, otherwise only returns the name of the files found.\n            Defaults to True.\n    \"\"\"\n    root = Path(root)\n    files = [\n        str(p)\n        for p in root.iterdir()\n        if p.is_file() and p.name.startswith(prefix) and p.name.endswith(suffix)\n    ]\n\n    if not keep_parent:\n        files = [os.path.basename(f) for f in files]\n\n    return files\n
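A sketch against the antismash layout described on the Working Directory Structure page (the paths are placeholders):

```python
from nplinker.utils import list_dirs, list_files

# e.g. ['GCF_000514975.1', 'GCF_000016425.1']
print(list_dirs("antismash", keep_parent=False))

# all .gbk region files of one genome, names only
print(list_files("antismash/GCF_000514975.1", suffix=".gbk", keep_parent=False))
```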
    "},{"location":"api/utils/#nplinker.utils.transform_to_full_path","title":"transform_to_full_path","text":"
    transform_to_full_path(p: str | PathLike) -> Path\n

    Transform a path to a full path.

    The path is expanded (i.e. the ~ will be replaced with actual path) and converted to an absolute path (i.e. . or .. will be replaced with actual path).

    Parameters:

    Name Type Description Default p str | PathLike

    The path to transform.

    required

    Returns:

    Type Description Path

    The transformed full path.

    Source code in src/nplinker/utils.py
    def transform_to_full_path(p: str | PathLike) -> Path:\n    \"\"\"Transform a path to a full path.\n\n    The path is expanded (i.e. the `~` will be replaced with actual path) and converted to an\n    absolute path (i.e. `.` or `..` will be replaced with actual path).\n\n    Args:\n        p: The path to transform.\n\n    Returns:\n        The transformed full path.\n    \"\"\"\n    # Multiple calls to `Path` are used to ensure static typing compatibility.\n    p = Path(p).expanduser()\n    p = Path(p).resolve()\n    return Path(p)\n
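A quick example (the output depends on your home directory):

```python
from nplinker.utils import transform_to_full_path

print(transform_to_full_path("~/nplinker_demo"))  # e.g. /home/user/nplinker_demo
print(transform_to_full_path("./data/../data"))   # absolute, normalised Path
```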
    "},{"location":"concepts/bigscape/","title":"BigScape","text":"

    NPLinker can run BigScape automatically if the bigscape directory does not exist in the working directory.

To run BigScape, NPLinker requires the following BigScape parameters: --mix, --include_singletons and --cutoffs.

And the following parameters are not allowed: --inputdir, --outputdir and --pfam_dir, because NPLinker configures them automatically.

If the BigScape parameter --mibig is set, make sure to set mibig.to_use to true in your config file nplinker.toml and mibig.version to the version of MIBiG used by BigScape.

    See the default configurations for the default parameters of BigScape.

    "},{"location":"concepts/config_file/","title":"Config File","text":""},{"location":"concepts/config_file/#configuration-template","title":"Configuration Template","text":"
#############################\n# NPLinker configuration file\n#############################\n\n# The root directory of the NPLinker project. You need to create it first.\n# The value is required and must be a full path.\nroot_dir = \"<NPLinker root directory>\"\n# The mode for preparing the dataset.\n# The available modes are \"podp\" and \"local\".\n# \"podp\" mode is for using the PODP platform (https://pairedomicsdata.bioinformatics.nl/) to prepare the dataset.\n# \"local\" mode is for preparing the dataset locally. So users do not need to upload their data to the PODP platform.\n# The value is required.\nmode = \"podp\"\n# The PODP project identifier.\n# The value is required if the mode is \"podp\".\npodp_id = \"\"\n\n\n[log]\n# Log level. The available levels are the same as the levels in python package `logging`:\n# \"DEBUG\", \"INFO\", \"WARNING\", \"ERROR\", \"CRITICAL\".\n# The default value is \"INFO\".\nlevel = \"INFO\"\n# The log file to append log messages.\n# The value is optional.\n# If not set or set to an empty string, log messages will not be written to a file.\n# The file will be created if it does not exist. Log messages will be appended to the file if it exists.\nfile = \"path/to/logfile\"\n# Whether to write log messages to console.\n# The default value is true.\nuse_console = true\n\n\n[mibig]\n# Whether to use mibig metadata (json).\n# The default value is true.\nto_use = true\n# The version of mibig metadata.\n# Make sure to use the same version of mibig as used by bigscape.\n# The default value is \"3.1\".\nversion = \"3.1\"\n\n\n[bigscape]\n# The parameters to use for running BiG-SCAPE.\n# Required bigscape parameters are `--mix`, `--include_singletons` and `--cutoffs`. NPLinker needs\n# them to run the analysis properly.\n# Parameters that must NOT exist: `--inputdir`, `--outputdir`, `--pfam_dir`. NPLinker will\n# automatically configure them.\n# If parameter `--mibig` is set, make sure to set the config `mibig.to_use` to true and\n# `mibig.version` to the version of mibig used by bigscape.\n# The default value is \"--mibig --clans-off --mix --include_singletons --cutoffs 0.30\".\nparameters = \"--mibig --clans-off --mix --include_singletons --cutoffs 0.30\"\n# Which bigscape cutoff to use for NPLinker analysis.\n# There might be multiple cutoffs in bigscape output.\n# Note that this value must be a string.\n# The default value is \"0.30\".\ncutoff = \"0.30\"\n\n\n[scoring]\n# Scoring methods.\n# Valid values are \"metcalf\" and \"rosetta\".\n# The default value is \"metcalf\".\nmethods = [\"metcalf\"]\n
    "},{"location":"concepts/config_file/#default-configurations","title":"Default Configurations","text":"

    The default configurations are automatically used by NPLinker if you don't set them in your config file.

    # NPLinker default configurations\n\n[log]\nlevel = \"INFO\"\nuse_console = true\n\n[mibig]\nto_use = true\nversion = \"3.1\"\n\n[bigscape]\nparameters = \"--mibig --clans-off --mix --include_singletons --cutoffs 0.30\"\ncutoff = \"0.30\"\n\n[scoring]\nmethods = [\"metcalf\"]\n
    "},{"location":"concepts/config_file/#config-loader","title":"Config loader","text":"

    You can load the configuration file using the load_config function.

    from nplinker.config import load_config\nconfig = load_config('path/to/nplinker.toml')\n

    When you use NPLinker as an application, you can get access to the configuration object directly:

    from nplinker import NPLinker\nnpl = NPLinker('path/to/nplinker.toml')\nprint(npl.config)\n
    "},{"location":"concepts/gnps_data/","title":"GNPS Data","text":"

NPLinker requires GNPS molecular networking data as input. It currently accepts data from the following GNPS workflows:

- METABOLOMICS-SNETS
- METABOLOMICS-SNETS-V2
- FEATURE-BASED-MOLECULAR-NETWORKING

    "},{"location":"concepts/gnps_data/#mappings-from-gnps-data-to-nplinker-input","title":"Mappings from GNPS data to NPLinker input","text":"METABOLOMICS-SNETS workflowMETABOLOMICS-SNETS-V2FEATURE-BASED-MOLECULAR-NETWORKING NPLinker input GNPS file in the archive of Download Clustered Spectra as MGF spectra.mgf METABOLOMICS-SNETS*.mgf molecular_families.tsv networkedges_selfloop/*.pairsinfo annotations.tsv result_specnets_DB/*.tsv file_mappings.tsv clusterinfosummarygroup_attributes_withIDs_withcomponentID/*.tsv

    For example, the file METABOLOMICS-SNETS*.mgf from the downloaded zip archive is used as the spectra.mgf input file of NPLinker.

    When manually preparing GNPS data for NPLinker, the METABOLOMICS-SNETS*.mgf must be renamed to spectra.mgf and placed in the gnps sub-directory of the NPLinker working directory.

METABOLOMICS-SNETS-V2 workflow

| NPLinker input | GNPS file in the archive of Download Clustered Spectra as MGF |
| --- | --- |
| spectra.mgf | METABOLOMICS-SNETS-V2*.mgf |
| molecular_families.tsv | networkedges_selfloop/*.selfloop |
| annotations.tsv | result_specnets_DB/*.tsv |
| file_mappings.tsv | clusterinfosummarygroup_attributes_withIDs_withcomponentID/*.clustersummary |

FEATURE-BASED-MOLECULAR-NETWORKING workflow

| NPLinker input | GNPS file in the archive of Download Cytoscape Data |
| --- | --- |
| spectra.mgf | spectra/*.mgf |
| molecular_families.tsv | networkedges_selfloop/*.selfloop |
| annotations.tsv | DB_result/*.tsv |
| file_mappings.csv | quantification_table/*.csv |

Note that file_mappings.csv is a CSV file rather than a TSV file, unlike in the other workflows.

    "},{"location":"concepts/working_dir_structure/","title":"Working Directory Structure","text":"

NPLinker requires a fixed working directory structure with fixed names for the input and output data.

    root_dir # (1)!\n    \u2502\n    \u251c\u2500\u2500 nplinker.toml                           # (2)!\n    \u251c\u2500\u2500 strain_mappings.json                [F] # (3)!\n    \u251c\u2500\u2500 strains_selected.json               [F][O] # (4)!\n    \u2502\n    \u251c\u2500\u2500 gnps                                [F] # (5)!\n    \u2502       \u251c\u2500\u2500 spectra.mgf                 [F]\n    \u2502       \u251c\u2500\u2500 molecular_families.tsv      [F]\n    \u2502       \u251c\u2500\u2500 annotations.tsv             [F]\n    \u2502       \u2514\u2500\u2500 file_mappings.tsv (.csv)    [F] # (6)!\n    \u2502\n    \u251c\u2500\u2500 antismash                           [F] # (7)!\n    \u2502   \u251c\u2500\u2500 GCF_000514975.1\n    \u2502   \u2502   \u251c\u2500\u2500 xxx.region001.gbk\n    \u2502   \u2502   \u2514\u2500\u2500 ...\n    \u2502   \u251c\u2500\u2500 GCF_000016425.1\n    \u2502   \u2502   \u251c\u2500\u2500 xxxx.region001.gbk\n    \u2502   \u2502   \u2514\u2500\u2500 ...\n    \u2502   \u2514\u2500\u2500 ...\n    \u2502\n    \u251c\u2500\u2500 bigscape                            [F][O] # (8)!\n    \u2502   \u251c\u2500\u2500 mix_clustering_c0.30.tsv        [F]    # (9)!\n    \u2502   \u2514\u2500\u2500 bigscape_running_output\n    \u2502       \u2514\u2500\u2500 ...\n    \u2502\n    \u251c\u2500\u2500 downloads                           [F][A] # (10)!\n    \u2502       \u251c\u2500\u2500 paired_datarecord_4b29ddc3-26d0-40d7-80c5-44fb6631dbf9.4.json # (11)!\n    \u2502       \u251c\u2500\u2500 GCF_000016425.1.zip\n    \u2502       \u251c\u2500\u2500 GCF_0000514975.1.zip\n    \u2502       \u251c\u2500\u2500 c22f44b14a3d450eb836d607cb9521bb.zip\n    \u2502       \u251c\u2500\u2500 genome_status.json\n    \u2502       \u2514\u2500\u2500 mibig_json_3.1.tar.gz\n    \u2502\n    \u251c\u2500\u2500 mibig                               [F][A] # (12)!\n    \u2502   \u251c\u2500\u2500 BGC0000001.json\n    \u2502   \u251c\u2500\u2500 BGC0000002.json\n    \u2502   \u2514\u2500\u2500 ...\n    \u2502\n    \u251c\u2500\u2500 output                              [F][A] # (13)!\n    \u2502   \u2514\u2500\u2500 ...\n    \u2502\n    \u2514\u2500\u2500 ...                                        # (14)!\n
    1. root_dir is the working directory you created, used as the root directory for NPLinker.
    2. nplinker.toml is the configuration file (toml format) provided by the user for running NPLinker.
3. strain_mappings.json contains the mappings from strain to genomics and metabolomics data. It is generated by NPLinker for podp mode; for local mode, users need to create it manually. [F] means the file name is fixed (including the extension) and the file must be named as shown.
    4. strains_selected.json is an optional file containing the list of strains to be used in the analysis. If it is not provided, NPLinker will use all strains detected from the input data. [O] means the file strains_selected.json is optional for users to provide.
5. gnps directory contains the GNPS data. The files in this directory must be named as shown. See the GNPS Data page for more information about the GNPS data.
    6. This file could be .tsv or .csv format.
7. antismash directory contains a collection of AntiSMASH BGC data. The BGC data (*.region*.gbk files) must be stored in subdirectories named after the NCBI accession number (e.g. GCF_000514975.1).
    8. bigscape directory is optional and contains the output of BigScape. If the directory is not provided, NPLinker will run BigScape automatically to generate the data using the AntiSMASH BGC data.
    9. mix_clustering_c0.30.tsv is an example output of BigScape. The file name must follow the pattern mix_clustering_c{cutoff}.tsv, where {cutoff} is the cutoff value used in the BigScape run.
    10. downloads directory is automatically created and managed by NPLinker. It stores data downloaded from the internet. Users can also use it to store their own downloaded data. [A] means the directory is automatically created and/or managed by NPLinker.
    11. This is an example file name; the actual file will differ. The same applies to the other files in the downloads directory.
    12. mibig directory contains the MIBiG metadata, which is automatically downloaded and managed by NPLinker. Users should not modify this directory or its content.
    13. output directory is automatically created by NPLinker and stores its output data.
    14. NPLinker can be flexibly extended by adding other types of data to the working directory.
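
    For illustration, the sketch below creates a minimal strain_mappings.json under a hypothetical working directory. The field names ("strain_mappings", "strain_id", "strain_alias") and the directory path are assumptions for this sketch, not NPLinker's confirmed schema; consult the schema documentation for the authoritative format.

    ```python
    # A minimal sketch, assuming hypothetical field names; not NPLinker's actual API.
    import json
    from pathlib import Path

    root_dir = Path("~/nplinker_demo").expanduser()  # hypothetical working directory
    root_dir.mkdir(parents=True, exist_ok=True)

    strain_mappings = {
        "strain_mappings": [
            {
                "strain_id": "GCF_000514975.1",
                "strain_alias": ["strain_01", "sample_01.mzXML"],
            }
        ],
        "version": "1.0",
    }

    # The file name is fixed ([F]) and must sit directly under root_dir.
    (root_dir / "strain_mappings.json").write_text(json.dumps(strain_mappings, indent=4))
    ```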


    "},{"location":"diagrams/arranger/","title":"Dataset Arranging Pipeline","text":"

    The DatasetArranger is implemented according to the following flowcharts.

    "},{"location":"diagrams/arranger/#strain-mappings-file","title":"Strain mappings file","text":"
    flowchart TD\n    StrainMappings[`strain_mappings.json`] --> SM{Is the mode PODP?}\n    SM --> |No |SM0[Validate the file]\n    SM --> |Yes|SM1[Generate the file] --> SM0
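
    A minimal Python sketch of this branch. The stub generation step and the validation rule are assumptions; the real generation step derives the mappings from the PODP project metadata.

    ```python
    import json
    from pathlib import Path

    def arrange_strain_mappings(mode: str, path: Path) -> None:
        """Sketch of the flowchart above; helper logic is hypothetical."""
        if mode == "podp":
            # podp mode: generate the file (a stub here; the real step derives
            # the mappings from the PODP project metadata).
            path.write_text(json.dumps({"strain_mappings": [], "version": "1.0"}))
        # Both branches end in validation.
        data = json.loads(path.read_text())
        if "strain_mappings" not in data:
            raise ValueError(f"{path} failed validation")
    ```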
    "},{"location":"diagrams/arranger/#strain-selection-file","title":"Strain selection file","text":"
    flowchart TD\n    StrainsSelected[`strains_selected.json`] --> S{Does the file exist?}\n    S --> |No | S0[Nothing to do]\n    S --> |Yes| S1[Validate the file]
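
    The equivalent logic in a short sketch; the JSON parse stands in for the real schema validation.

    ```python
    import json
    from pathlib import Path

    def arrange_selected_strains(path: Path) -> None:
        """Sketch of the flowchart above."""
        if not path.exists():
            return  # the file is optional ([O]); nothing to do
        json.loads(path.read_text())  # stand-in: real validation checks the schema
    ```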
    "},{"location":"diagrams/arranger/#podp-project-metadata-json-file","title":"PODP project metadata json file","text":"
    flowchart TD\n    podp[PODP project metadata json file] --> A{Is the mode PODP?}\n    A --> |No | A0[Nothing to do]\n    A --> |Yes| P{Does the file exist?}\n    P --> |No | P0[Download the file] --> P1\n    P --> |Yes| P1[Validate the file]
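
    A sketch of this branch. The `url` parameter is an assumption made for illustration; the real arranger resolves the download location from the PODP project identifier itself.

    ```python
    import json
    import urllib.request
    from pathlib import Path

    def arrange_podp_metadata(mode: str, path: Path, url: str) -> None:
        """Sketch of the flowchart above; `url` is a hypothetical parameter."""
        if mode != "podp":
            return  # local mode: nothing to do
        if not path.exists():
            urllib.request.urlretrieve(url, path)  # download the metadata file
        json.loads(path.read_text())  # stand-in for schema validation
    ```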
    "},{"location":"diagrams/arranger/#gnps-antismash-and-bigscape","title":"GNPS, AntiSMASH and BigScape","text":"
    flowchart TD\n    ConfigError[Dynaconf config validation error]\n    DataError[Data validation error]\n    UseIt[Use the data]\n    Download[First remove existing data if relevant, then download or generate data]\n\n    A[GNPS, antiSMASH and BigScape] --> B{Pass Dynaconf config validation?}\n    B -->|No | ConfigError\n    B -->|Yes| G{Is the mode PODP?}\n\n    G -->|No, local mode| G1{Does data dir exist?}\n    G1 -->|No | DataError\n    G1 -->|Yes| H{Pass data validation?}\n    H --> |No | DataError\n    H --> |Yes| UseIt\n\n    G -->|Yes, podp mode| G2{Does data dir exist?}\n    G2 --> |No | Download\n    G2 --> |Yes | J{Pass data validation?}\n    J -->|No | Download --> |try max 2 times| J\n    J -->|Yes| UseIt
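
    The podp branch is the interesting one: the download/generate step runs and the data is re-validated at most twice before giving up. A sketch with stand-in callables for the validation and download/generate steps (both are assumptions, not NPLinker's actual API):

    ```python
    from pathlib import Path
    from typing import Callable

    def arrange_data(
        mode: str,
        data_dir: Path,
        is_valid: Callable[[Path], bool],  # stand-in for the data validation step
        refresh: Callable[[Path], None],   # removes stale data, then downloads/generates
    ) -> None:
        """Sketch of the local/podp branches in the flowchart above."""
        if mode == "local":
            # Local mode: the data must already exist and pass validation.
            if not data_dir.exists() or not is_valid(data_dir):
                raise ValueError(f"data validation error: {data_dir}")
            return  # use the data
        # podp mode: try the download/generate step at most 2 times.
        for _ in range(2):
            if data_dir.exists() and is_valid(data_dir):
                return  # use the data
            refresh(data_dir)
        if not is_valid(data_dir):
            raise ValueError(f"data validation error: {data_dir}")
    ```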
    "},{"location":"diagrams/arranger/#mibig-data","title":"MIBiG Data","text":"

    MIBiG data is always downloaded automatically. Users cannot provide their own MIBiG data.

    flowchart TD\n    Mibig[MIBiG] --> M0{Pass Dynaconf config validation?}\n    M0 -->|No | M01[Dynaconf config validation error]\n    M0 -->|Yes | MibigDownload[First remove existing data if relevant and then download data]
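
    A sketch of the MIBiG branch. The download URL below follows the archive name seen in the downloads directory (mibig_json_3.1.tar.gz) but is an assumption, not necessarily the endpoint NPLinker uses.

    ```python
    import shutil
    import tarfile
    import urllib.request
    from pathlib import Path

    def arrange_mibig(mibig_dir: Path, version: str = "3.1") -> None:
        """Sketch of the flowchart above; the URL is an assumption."""
        if mibig_dir.exists():
            shutil.rmtree(mibig_dir)  # first remove existing data
        url = f"https://dl.secondarymetabolites.org/mibig/mibig_json_{version}.tar.gz"
        archive, _ = urllib.request.urlretrieve(url)
        mibig_dir.mkdir(parents=True)
        with tarfile.open(archive) as tar:
            tar.extractall(mibig_dir)  # the real pipeline may flatten the archive layout
    ```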
    "},{"location":"diagrams/loader/","title":"Dataset Loading Pipeline","text":"

    The DatasetLoader is implemented according to the following pipeline.
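
    The pipeline details are in the figure below; as a rough orientation only, the sketch here shows what loading amounts to given the working directory layout described earlier. The file names are the fixed names from that layout, but the return structure is an assumption: the real DatasetLoader builds BGC, GCF, spectrum and molecular-family objects rather than raw paths.

    ```python
    import json
    from pathlib import Path

    def load_dataset(root_dir: Path) -> dict:
        """Conceptual sketch only, not the DatasetLoader API."""
        gnps = root_dir / "gnps"
        return {
            "strain_mappings": json.loads((root_dir / "strain_mappings.json").read_text()),
            "spectra": gnps / "spectra.mgf",
            "molecular_families": gnps / "molecular_families.tsv",
            "annotations": gnps / "annotations.tsv",
            "antismash_dir": root_dir / "antismash",
            "mibig_dir": root_dir / "mibig",
        }
    ```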

    "}]} \ No newline at end of file diff --git a/dev/sitemap.xml b/dev/sitemap.xml index 9283202c5..c9de0b0b1 100644 --- a/dev/sitemap.xml +++ b/dev/sitemap.xml @@ -2,152 +2,157 @@ https://nplinker.github.io/nplinker/latest/ - 2024-06-20 + 2024-06-28 daily https://nplinker.github.io/nplinker/latest/install/ - 2024-06-20 + 2024-06-28 daily https://nplinker.github.io/nplinker/latest/logging/ - 2024-06-20 + 2024-06-28 daily https://nplinker.github.io/nplinker/latest/quickstart/ - 2024-06-20 + 2024-06-28 daily https://nplinker.github.io/nplinker/latest/api/antismash/ - 2024-06-20 + 2024-06-28 daily https://nplinker.github.io/nplinker/latest/api/arranger/ - 2024-06-20 + 2024-06-28 daily https://nplinker.github.io/nplinker/latest/api/bigscape/ - 2024-06-20 + 2024-06-28 daily https://nplinker.github.io/nplinker/latest/api/genomics/ - 2024-06-20 + 2024-06-28 daily https://nplinker.github.io/nplinker/latest/api/genomics_abc/ - 2024-06-20 + 2024-06-28 daily https://nplinker.github.io/nplinker/latest/api/genomics_utils/ - 2024-06-20 + 2024-06-28 daily https://nplinker.github.io/nplinker/latest/api/gnps/ - 2024-06-20 + 2024-06-28 daily https://nplinker.github.io/nplinker/latest/api/loader/ - 2024-06-20 + 2024-06-28 daily https://nplinker.github.io/nplinker/latest/api/metabolomics/ - 2024-06-20 + 2024-06-28 daily https://nplinker.github.io/nplinker/latest/api/metabolomics_abc/ - 2024-06-20 + 2024-06-28 daily https://nplinker.github.io/nplinker/latest/api/metabolomics_utils/ - 2024-06-20 + 2024-06-28 daily https://nplinker.github.io/nplinker/latest/api/mibig/ - 2024-06-20 + 2024-06-28 daily https://nplinker.github.io/nplinker/latest/api/nplinker/ - 2024-06-20 + 2024-06-28 daily https://nplinker.github.io/nplinker/latest/api/schema/ - 2024-06-20 + 2024-06-28 daily https://nplinker.github.io/nplinker/latest/api/scoring/ - 2024-06-20 + 2024-06-28 daily https://nplinker.github.io/nplinker/latest/api/scoring_abc/ - 2024-06-20 + 2024-06-28 daily https://nplinker.github.io/nplinker/latest/api/scoring_methods/ - 2024-06-20 + 2024-06-28 daily https://nplinker.github.io/nplinker/latest/api/scoring_utils/ - 2024-06-20 + 2024-06-28 daily https://nplinker.github.io/nplinker/latest/api/strain/ - 2024-06-20 + 2024-06-28 daily https://nplinker.github.io/nplinker/latest/api/strain_utils/ - 2024-06-20 + 2024-06-28 daily https://nplinker.github.io/nplinker/latest/api/utils/ - 2024-06-20 + 2024-06-28 daily https://nplinker.github.io/nplinker/latest/concepts/bigscape/ - 2024-06-20 + 2024-06-28 daily https://nplinker.github.io/nplinker/latest/concepts/config_file/ - 2024-06-20 + 2024-06-28 daily https://nplinker.github.io/nplinker/latest/concepts/gnps_data/ - 2024-06-20 + 2024-06-28 daily https://nplinker.github.io/nplinker/latest/concepts/working_dir_structure/ - 2024-06-20 + 2024-06-28 daily https://nplinker.github.io/nplinker/latest/diagrams/arranger/ - 2024-06-20 + 2024-06-28 + daily + + + https://nplinker.github.io/nplinker/latest/diagrams/loader/ + 2024-06-28 daily \ No newline at end of file diff --git a/dev/sitemap.xml.gz b/dev/sitemap.xml.gz index 7b53e35b4f7b995867c6a4acafe1fe3ef2ff8d42..970a0adbbd1a1252990760068778876521c9de90 100644 GIT binary patch literal 416 zcmV;R0bl+fiwFqd;C*HS|8r?{Wo=<_E_iKh0M(emZi6rkhVOj}#J!}erm34Kz3mCM zGjc;3tcHZfW^8Z2&~}KOCrxrL36^7ha`;V5dHC#H_F*Dn@caCs*yNeaS9*B z03XqkhTez)w#J7JHHibR;i%<74~$Mwu6-{D)&*z;Rf^6q9D|c3z9qCM0saD)CBhEV zM@0M-_d)nna^Z26FJkJFon*l~DLbQr>pKeH9MO7Bhd3~3qu_U<6&ZBZc z*Gm>kY5eHWA5APQazC{NhMS}uupw5JS{AI=1Mv{LdRuNEHs#}HTc(h!whGfdcxw*P 
zoazqHeN}3uT)@yWUYkU|`lQ`f8S}hNHVswIP?MFGVEh$EmEhAXVba%gq%+Kf!HPEt4O zG&!Jc4(Nn|FI)Yl0|r^xbBM>q@;Z!7>dZ;=6lGMtL$DDB`p179=L;FGzX2r;13_OE F006qa!q)%* diff --git a/latest b/latest index f6b81852e..90012116c 120000 --- a/latest +++ b/latest @@ -1 +1 @@ -2.0.0a2 \ No newline at end of file +dev \ No newline at end of file diff --git a/versions.json b/versions.json index 34d299648..725923177 100644 --- a/versions.json +++ b/versions.json @@ -2,13 +2,13 @@ { "version": "dev", "title": "dev", - "aliases": [] + "aliases": [ + "latest" + ] }, { "version": "2.0.0a2", "title": "2.0.0a2", - "aliases": [ - "latest" - ] + "aliases": [] } ]