Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add methods to export results in tabular format #280

Merged
merged 45 commits into from
Dec 4, 2024
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
6b8d3d3
add print links method to LinkGraph, improve LinkGraph string represe…
liannette Oct 15, 2024
cdd26c3
feat: add a method to print tabular results files
liannette Oct 16, 2024
ec8b8ae
improve method names and docstrings, remove unused method to export g…
liannette Oct 16, 2024
2207df1
improve docstring and typing
liannette Oct 16, 2024
c6e166a
fix a failing test
liannette Oct 16, 2024
32ca3dd
refactor a little bit the spectrum method to convert to dict
liannette Oct 16, 2024
8e7945d
change the output format for gnps_annotations in metabolomics results…
liannette Oct 16, 2024
2592810
fix: convert int to str before using join
liannette Oct 17, 2024
7f53de8
change representation of empty values in output files for improved in…
liannette Oct 17, 2024
ad049c8
refactoring the export methods
liannette Oct 17, 2024
b220fb0
small refactor: specify staticmethod
liannette Oct 18, 2024
f98fa98
add more tests
liannette Oct 18, 2024
a8a8329
correct typing in docstrings
liannette Oct 18, 2024
c6c33e6
typing: changed typings to pass mypy static typing checks
liannette Oct 22, 2024
a260338
refactor: change the order of methods/functions
liannette Oct 22, 2024
3289683
restore the order of already existing functions and methods
liannette Nov 4, 2024
d2272e2
make dicts json compatible
liannette Nov 4, 2024
cb49209
rename functions and variables
liannette Nov 4, 2024
6a4da5f
refactor: changed the place when the index is added to the link dict
liannette Nov 4, 2024
edcc7db
use csv package to write the tabular output files
liannette Nov 4, 2024
05f9f76
make sure all elements of the input list have the same type of data.
liannette Nov 4, 2024
bff7731
shorten too-long docstring lines, correct some docstrings
liannette Nov 4, 2024
d4bf9fb
tests: adapted the test to the changes
liannette Nov 4, 2024
2c05efb
remove a file that was committed by accident
liannette Nov 4, 2024
229a11d
Merge branch 'NPLinker:dev' into output_files
liannette Nov 5, 2024
32d78c3
Improve docstrings
liannette Nov 19, 2024
b04226b
Improve docstrings
liannette Nov 19, 2024
5fd4108
refactor: add method to convert a value to string for tabular output
liannette Nov 19, 2024
8137f7d
Merge branch 'output_files' of https://github.com/liannette/nplinker …
liannette Nov 19, 2024
940eb19
improve docstring, add a comment about key order of bgc dict represent…
liannette Nov 20, 2024
e551dcc
move to_string method to the BGC/Spectrum class, add a to_tabular method
liannette Nov 20, 2024
f9ae9f2
add tests for the to_string method
liannette Nov 20, 2024
1b00262
change to_tabular so it returns a list and not a string
liannette Nov 20, 2024
0d6bec3
refactor: to_tabular returns dict, to_string turned into private func…
liannette Nov 25, 2024
41757c7
fix typing in to_tabular methods
liannette Dec 2, 2024
b94eddf
update docstrings and comments
liannette Dec 2, 2024
94bcb67
ensure 0 and 0.0 are correctly converted to strings, and not to empty…
liannette Dec 2, 2024
16a56c7
change the order of methods
liannette Dec 2, 2024
183bd5f
remove whitespace in blank lines
liannette Dec 2, 2024
e2227df
update and add tests
liannette Dec 2, 2024
642c67c
change variable name to fix mypy error
liannette Dec 2, 2024
7cd675f
test: trying to fix unit test issue where the spectrum rt is a dict i…
liannette Dec 2, 2024
cacd504
Merge branch 'NPLinker:dev' into output_files
liannette Dec 2, 2024
19b6f1e
tests: add precursor charge to the test spectra
liannette Dec 2, 2024
40391fe
Update src/nplinker/metabolomics/spectrum.py
CunliangGeng Dec 4, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/format-typing-check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ jobs:
- name: Install ruff and mypy
run: |
pip install ruff mypy typing_extensions \
types-Deprecated types-beautifulsoup4 types-jsonschema types-networkx pandas-stubs
types-Deprecated types-beautifulsoup4 types-jsonschema types-networkx types-tabulate pandas-stubs
- name: Get all changed python files
id: changed-python-files
uses: tj-actions/changed-files@v44
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ dev = [
"types-beautifulsoup4",
"types-jsonschema",
"types-networkx",
"types-tabulate",
"pandas-stubs",
# docs
"black",
Expand Down
55 changes: 43 additions & 12 deletions src/nplinker/genomics/bgc.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations
import logging
from typing import TYPE_CHECKING
from typing import Any
from deprecated import deprecated
from nplinker.strain import Strain
from .aa_pred import predict_aa
Expand Down Expand Up @@ -116,18 +117,6 @@ def __reduce__(self) -> tuple:
"""Reduce function for pickling."""
return (self.__class__, (self.id, *self.product_prediction), self.__dict__)

def add_parent(self, gcf: GCF) -> None:
"""Add a parent GCF to the BGC.

Args:
gcf: gene cluster family
"""
gcf.add_bgc(self)

def detach_parent(self, gcf: GCF) -> None:
"""Remove a parent GCF."""
gcf.detach_bgc(self)

@property
def strain(self) -> Strain | None:
"""Get the strain of the BGC."""
Expand Down Expand Up @@ -161,6 +150,18 @@ def bigscape_classes(self) -> set[str | None]:
"""
return {p.bigscape_class for p in self.parents}

def add_parent(self, gcf: GCF) -> None:
    """Register this BGC with the given GCF as one of its parents.

    Delegates to the GCF side of the relationship, which records the link.

    Args:
        gcf: The gene cluster family to attach this BGC to.
    """
    gcf.add_bgc(self)

def detach_parent(self, gcf: GCF) -> None:
    """Unregister this BGC from the given parent GCF.

    Args:
        gcf: The gene cluster family to detach this BGC from.
    """
    gcf.detach_bgc(self)

def is_mibig(self) -> bool:
    """Check whether this BGC is a MIBiG reference BGC.

    MIBiG reference BGCs are recognised purely by their ID, which starts
    with the "BGC" prefix (e.g. "BGC0000001").

    Returns:
        True if the BGC ID starts with "BGC", False otherwise.
    """
    mibig_prefix = "BGC"
    return self.id.startswith(mibig_prefix)

def to_dict(self) -> dict[str, Any]:
    """Convert the BGC object to a dictionary for exporting results.

    Compiles the relevant attributes of this BGC into a single dictionary,
    one key-value pair per attribute.

    Returns:
        A dictionary containing the following key-value pairs:
            - GCF_id (set): IDs of the parent GCFs (None values excluded).
            - GCF_bigscape_class (set): BiG-SCAPE classes of the parent GCFs
              (None values excluded).
            - strain_id (str | None): The ID of the strain.
            - description (str | None): A description of the BGC.
            - BGC_name (str): The name of the BGC.
            - product_prediction (tuple): (predicted) natural products or
              product classes of the BGC.
            - mibig_bgc_class (tuple[str] | None): MIBiG biosynthetic classes
              to which the BGC belongs.
            - antismash_id (str | None): The antiSMASH ID.
            - antismash_region (int | None): The antiSMASH region.
    """
    # Drop None entries via set difference; equivalent to filtering inline.
    gcf_ids = {parent.id for parent in self.parents} - {None}
    gcf_bigscape_classes = set(self.bigscape_classes) - {None}
    strain_id = None if self.strain is None else self.strain.id
    return {
        "GCF_id": gcf_ids,
        "GCF_bigscape_class": gcf_bigscape_classes,
        "strain_id": strain_id,
        "description": self.description,
        "BGC_name": self.id,
        "product_prediction": self.product_prediction,
        "mibig_bgc_class": self.mibig_bgc_class,
        "antismash_id": self.antismash_id,
        "antismash_region": self.antismash_region,
    }

# CG: why not providing whole product but only amino acid as product monomer?
# this property is not used in NPLinker core business.
@property
Expand Down
27 changes: 27 additions & 0 deletions src/nplinker/metabolomics/spectrum.py
liannette marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations
from functools import cached_property
from typing import TYPE_CHECKING
from typing import Any
import numpy as np
from nplinker.strain import Strain
from nplinker.strain import StrainCollection
Expand Down Expand Up @@ -97,3 +98,29 @@ def has_strain(self, strain: Strain) -> bool:
True when the given strain exist in the spectrum.
"""
return strain in self.strains

def to_dict(self) -> dict[str, Any]:
    """Convert the Spectrum object to a dictionary for exporting results.

    Compiles the relevant attributes of this Spectrum into a single
    dictionary, one key-value pair per attribute.

    Returns:
        A dictionary containing the following key-value pairs:
            - "spectrum_id" (str): The unique identifier of the spectrum.
            - "num_strains_with_spectrum" (int): The number of strains
              associated with the spectrum.
            - "precursor_mz" (float): The precursor m/z value, rounded to
              four decimal places.
            - "rt" (float): The retention time, rounded to three decimal
              places.
            - "molecular_family" (str | None): The identifier of the
              molecular family.
            - "gnps_id" (str | None): The GNPS identifier.
            - "gnps_annotations" (dict): A dictionary of GNPS annotations.
    """
    # Truthiness (not `is None`) deliberately mirrors the original check.
    family_id = self.family.id if self.family else None
    return {
        "spectrum_id": self.id,
        "num_strains_with_spectrum": len(self.strains),
        "precursor_mz": round(self.precursor_mz, 4),
        "rt": round(self.rt, 3),
        "molecular_family": family_id,
        "gnps_id": self.gnps_id,
        "gnps_annotations": self.gnps_annotations,
    }
95 changes: 70 additions & 25 deletions src/nplinker/nplinker.py
liannette marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -168,34 +168,50 @@ def scoring_methods(self) -> list[str]:
"""Get names of all valid scoring methods."""
return list(self._valid_scoring_methods.keys())

def load_data(self):
"""Load all data from files into memory.

This method is a convenience function that calls the
[`DatasetArranger`][nplinker.arranger.DatasetArranger] class to arrange data files
(download, generate and/or validate data) in the [correct directory structure][working-directory-structure],
and then calls the [`DatasetLoader`][nplinker.loader.DatasetLoader] class to load all data
from the files into memory.
def export_objects(self, objects: Sequence[BGC | Spectrum], filename: str) -> None:
    """Export the data of a list of BGC or Spectrum objects to a tab-separated file.

    The header row is taken from the keys of the first object's `to_dict()`
    representation; each following row holds the stringified values of one
    object. Lists, tuples and sets are flattened to comma-separated strings,
    dicts to comma-separated "key:value" pairs, and `None` becomes an empty
    field. Falsy scalars such as `0`, `0.0` and `False` are written out as
    their string form, not blanked (the original `elif item:` check silently
    turned them into empty cells).

    Args:
        objects: A non-empty sequence of BGC or Spectrum objects to export.
        filename: Name of the file (inside the output directory) to write.

    Raises:
        ValueError: If `objects` is empty, since no header row can be derived.
    """

    def _to_cell(item: Any) -> str:
        """Convert a single value to its tabular string representation."""
        if isinstance(item, (list, tuple, set)):
            return ", ".join(map(str, item))
        if isinstance(item, dict):
            return ", ".join(f"{k}:{v}" for k, v in item.items())
        # Compare against None (not truthiness) so 0, 0.0 and False survive.
        if item is None:
            return ""
        return str(item)

    if not objects:
        raise ValueError("Cannot export an empty sequence of objects.")
    headers = list(objects[0].to_dict().keys())
    with open(self._output_dir / filename, "w") as f:
        f.write("\t".join(headers) + "\n")
        for obj in objects:
            row_data = obj.to_dict()
            formatted_row = [_to_cell(row_data.get(header)) for header in headers]
            f.write("\t".join(formatted_row) + "\n")

def export_results(self, lg: LinkGraph | None = None) -> None:
    """Export the results to the output directory in tab-separated format.

    Writes the genomics data (all BGCs) to 'genomics_data.tsv' and the
    metabolomics data (all spectra) to 'metabolomics_data.tsv' in the output
    directory. When a LinkGraph object is given, its links are additionally
    written to 'links.tsv'.

    Args:
        lg (LinkGraph | None): An optional LinkGraph object. If provided,
            the links data will be exported to 'links.tsv'.
    """
    exports = (
        (self.bgcs, "genomics_data.tsv"),
        (self.spectra, "metabolomics_data.tsv"),
    )
    for objects, filename in exports:
        self.export_objects(objects, filename)
    if lg is not None:
        lg.export_links(self._output_dir / "links.tsv")

@overload
def get_links(
Expand Down Expand Up @@ -281,6 +297,35 @@ def get_links(

return scoring.get_links(*objects, **scoring_params)

def load_data(self):
    """Arrange the dataset files and load their contents into memory.

    Convenience wrapper that first runs the
    [`DatasetArranger`][nplinker.arranger.DatasetArranger] to arrange the data
    files (download, generate and/or validate data) in the
    [correct directory structure][working-directory-structure], and then the
    [`DatasetLoader`][nplinker.loader.DatasetLoader] to load all data from the
    files into memory.

    The loaded data is kept in container attributes for easy access, e.g.
    [`self.bgcs`][nplinker.NPLinker.bgcs] for all BGC objects and
    [`self.strains`][nplinker.NPLinker.strains] for all Strain objects.
    """
    arranger = DatasetArranger(self.config)
    arranger.arrange()

    loader = DatasetLoader(self.config)
    loader.load()

    # Index the core objects by ID for O(1) lookup by the lookup_* methods.
    self._bgc_dict = {b.id: b for b in loader.bgcs}
    self._gcf_dict = {g.id: g for g in loader.gcfs}
    self._spec_dict = {s.id: s for s in loader.spectra}
    self._mf_dict = {m.id: m for m in loader.mfs}

    self._mibig_bgcs = loader.mibig_bgcs
    self._strains = loader.strains
    self._product_types = loader.product_types
    self._chem_classes = loader.chem_classes
    self._class_matches = loader.class_matches

def lookup_bgc(self, id: str) -> BGC | None:
"""Get the BGC object with the given ID.

Expand Down
Loading
Loading