NPLinker · CunliangGeng · Dec 4, 2024 · Oct 15, 2024 · Oct 16, 2024 · Oct 16, 2024
diff --git a/.github/workflows/format-typing-check.yml b/.github/workflows/format-typing-check.yml
@@ -37,7 +37,7 @@ jobs:
       - name: Install ruff and mypy
         run: |
           pip install ruff mypy typing_extensions \
-            types-Deprecated types-beautifulsoup4 types-jsonschema types-networkx pandas-stubs
+            types-Deprecated types-beautifulsoup4 types-jsonschema types-networkx types-tabulate pandas-stubs
       - name: Get all changed python files
         id: changed-python-files
         uses: tj-actions/changed-files@v44

diff --git a/pyproject.toml b/pyproject.toml
@@ -63,6 +63,7 @@ dev = [
     "types-beautifulsoup4",
     "types-jsonschema",
     "types-networkx",
+    "types-tabulate",
     "pandas-stubs",
     # docs
     "black",

diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 import logging
 from typing import TYPE_CHECKING
+from typing import Any
 from deprecated import deprecated
 from nplinker.strain import Strain
 from .aa_pred import predict_aa
@@ -173,6 +174,73 @@ def is_mibig(self) -> bool:
         """
         return self.id.startswith("BGC")
 
+    def to_dict(self) -> dict[str, Any]:
+        """Convert the BGC object to a dictionary for exporting purpose.
+
+        Returns:
+            A dictionary containing the following key-value pairs:
+
+            - GCF_id (list[str]): A list of GCF IDs.
+            - GCF_bigscape_class (list[str]): A list of BiG-SCAPE classes.
+            - strain_id (str | None): The ID of the strain.
+            - description (str | None): A description of the BGC.
+            - BGC_name (str): The name of the BGC.
+            - product_prediction (list[str]): (predicted) products or product classes of the BGC.
+            - mibig_bgc_class (list[str] | None): MIBiG biosynthetic classes.
+            - antismash_id (str | None): The antiSMASH ID.
+            - antismash_region (int | None): The antiSMASH region number.
+        """
+        # Keys are ordered to make the output easier to analyze
+        return {
+            "GCF_id": [gcf.id for gcf in self.parents if gcf.id is not None],
+            "GCF_bigscape_class": [bsc for bsc in self.bigscape_classes if bsc is not None],
+            "strain_id": self.strain.id if self.strain is not None else None,
+            "description": self.description,
+            "BGC_name": self.id,
+            "product_prediction": list(self.product_prediction),
+            "mibig_bgc_class": self.mibig_bgc_class,
+            "antismash_id": self.antismash_id,
+            "antismash_region": self.antismash_region,
+        }
+
+    def to_tabular(self) -> dict[str, str]:
+        """Convert the BGC object to a tabular format.
+
+        Returns:
+            dict: A dictionary representing the BGC object in tabular format.
+                The keys can be treated as headers; values are strings with tabs replaced by spaces.
+                This dict can be exported as a TSV file.
+        """
+        return {
+            key: self._to_string(value).replace("\t", "    ")
+            for key, value in self.to_dict().items()
+        }
+
+    @staticmethod
+    def _to_string(value: Any) -> str:
+        """Convert various types of values to a string.
+
+        Args:
+            value: The value to be converted to a string.
+                Can be a list, dict, or any other JSON-compatible type.
+
+        Returns:
+            A string representation of the input value.
+        """
+        # Convert list to comma-separated string
+        if isinstance(value, list):
+            formatted_value = ", ".join(map(str, value))
+        # Convert dict to comma-separated string
+        elif isinstance(value, dict):
+            formatted_value = ", ".join([f"{k}:{v}" for k, v in value.items()])
+        # Convert None to empty string
+        elif value is None:
+            formatted_value = ""
+        # Convert anything else to string
+        else:
+            formatted_value = str(value)
+        return formatted_value
+
     # CG: why not providing whole product but only amino acid as product monomer?
     # this property is not used in NPLinker core business.
     @property

diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 from functools import cached_property
 from typing import TYPE_CHECKING
+from typing import Any
 import numpy as np
 from nplinker.strain import Strain
 from nplinker.strain import StrainCollection
@@ -108,3 +109,65 @@ def has_strain(self, strain: Strain) -> bool:
             True when the given strain exist in the spectrum.
         """
         return strain in self.strains
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert the Spectrum object to a dictionary for exporting purpose.
+
+        Returns:
+            A dictionary containing the following key-value pairs:
+
+                - "spectrum_id" (str): The unique identifier of the spectrum.
+                - "num_strains_with_spectrum" (int): The number of strains associated with the spectrum.
+                - "precursor_mz" (float): The precursor m/z value, rounded to four decimal places.
+                - "rt" (float): The retention time, rounded to three decimal places.
+                - "molecular_family" (str | None): The identifier of the molecular family.
+                - "gnps_id" (str | None): The GNPS identifier.
+                - "gnps_annotations" (dict[str, str]): A dictionary of GNPS annotations.
+        """
+        return {
+            "spectrum_id": self.id,
+            "num_strains_with_spectrum": len(self.strains),
+            "precursor_mz": round(self.precursor_mz, 4),
+            "rt": round(self.rt, 3),
+            "molecular_family": self.family.id if self.family else None,
+            "gnps_id": self.gnps_id,
+            "gnps_annotations": self.gnps_annotations,
+        }
+
+    def to_tabular(self) -> dict[str, str]:
+        """Convert the Spectrum object to a tabular format.
+
+        Returns:
+            dict: A dictionary representing the Spectrum object in tabular format.
+                The keys can be treated as headers; values are strings with tabs replaced by spaces.
+                This dict can be exported as a TSV file.
+        """
+        return {
+            key: self._to_string(value).replace("\t", "    ")
+            for key, value in self.to_dict().items()
+        }
+
+    @staticmethod
+    def _to_string(value: Any) -> str:
+        """Convert various types of values to a string.
+
+        Args:
+            value: The value to be converted to a string.
+                Can be a list, dict, or any other JSON-compatible type.
+
+        Returns:
+            A string representation of the input value.
+        """
+        # Convert list to comma-separated string
+        if isinstance(value, list):
+            formatted_value = ", ".join(map(str, value))
+        # Convert dict to comma-separated string
+        elif isinstance(value, dict):
+            formatted_value = ", ".join([f"{k}:{v}" for k, v in value.items()])
+        # Convert None to empty string
+        elif value is None:
+            formatted_value = ""
+        # Convert anything else to string
+        else:
+            formatted_value = str(value)
+        return formatted_value
diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py
@@ -1,4 +1,5 @@
 from __future__ import annotations
+import csv
 import logging
 import pickle
 from collections.abc import Sequence
@@ -355,3 +356,43 @@ def save_data(
         data = (self.bgcs, self.gcfs, self.spectra, self.mfs, self.strains, links)
         with open(file, "wb") as f:
             pickle.dump(data, f)
+
+    def objects_to_tsv(self, objects: Sequence[BGC] | Sequence[Spectrum], filename: str) -> None:
+        """Export a list of BGC or Spectrum objects to a TSV file.
+
+        Args:
+            objects (list): A list of BGC or a list of Spectrum objects to be exported.
+            filename (str): The name of the output file.
+        """
+        if not objects:
+            raise ValueError("No objects provided to export")
+
+        # Ensure all elements in the list are of the same type
+        obj_type = type(objects[0])
+        if not all(isinstance(obj, obj_type) for obj in objects):
+            raise TypeError("All objects in the list must be of the same type")
+
+        with open(self._output_dir / filename, "w", newline="") as outfile:
+            headers = objects[0].to_tabular().keys()
+            writer = csv.DictWriter(outfile, fieldnames=headers, delimiter="\t")
+            writer.writeheader()
+            for obj in objects:
+                writer.writerow(obj.to_tabular())
+
+    def to_tsv(self, lg: LinkGraph | None = None) -> None:
+        """Export data to tsv files.
+
+        This method exports the following data to separate TSV files:
+
+         - BGC objects: `genomics_data.tsv`
+         - Spectrum objects: `metabolomics_data.tsv`
+         - LinkGraph object (if given): `links.tsv`
+
+        Args:
+            lg (LinkGraph | None): An optional LinkGraph object. If provided,
+                       the links data will be exported to 'links.tsv'.
+        """
+        self.objects_to_tsv(self.bgcs, "genomics_data.tsv")
+        self.objects_to_tsv(self.spectra, "metabolomics_data.tsv")
+        if lg is not None:
+            lg.to_tsv(self._output_dir / "links.tsv")