add methods to export results in tabular format (#280)

* add print links method to LinkGraph, improve LinkGraph string representation * feat: add a method to print tabular results files * improve method names and docstrings, remove unused method to export gcf file * improve docstring and typing * fix a failing test * slightly refactor the spectrum method that converts to dict * change the output format for gnps_annotations in metabolomics results file, improve docstrings * fix: convert int to str before using join * change representation of empty values in output files for improved integration with Excel * refactoring the export methods * small refactor: specify staticmethod * add more tests * correct typing in docstrings * typing: changed typings to pass mypy static typing checks * refactor: change the order of methods/functions * restore the order of already existing functions and methods * make dicts json compatible * rename functions and variables * refactor: changed the place where the index is added to the link dict * use csv package to write the tabular output files * make sure all elements of the input list have the same type of data.
* shorten too-long docstring lines, correct some docstrings * tests: adapted the test to the changes * remove a file that was committed by accident * Improve docstrings Apply suggestions from code review Co-authored-by: Cunliang Geng <[email protected]> * Improve docstrings Apply suggestions from code review Co-authored-by: Cunliang Geng <[email protected]> * refactor: add method to convert a value to string for tabular output * improve docstring, add a comment about key order of bgc dict representation * move to_string method to the BGC/Spectrum class, add a to_tabular method * add tests for the to_string method * change to_tabular so it returns a list and not a string * refactor: to_tabular returns dict, to_string turned into private func, tabs are replaced in to_tabular * fix typing in to_tabular methods * update docstrings and comments * ensure 0 and 0.0 are correctly converted to strings, and not to empty strings * change the order of methods * remove whitespace in blank lines * update and add tests * change variable name to fix mypy error * test: trying to fix unit test issue where the spectrum rt is a dict instead of numerical * tests: add precursor charge to the test spectra * Update src/nplinker/metabolomics/spectrum.py --------- Co-authored-by: Cunliang Geng <[email protected]>
NPLinker · Dec 4, 2024 · 481a068 · 481a068
1 parent 4d57ccd
commit 481a068
Show file tree

Hide file tree

Showing 10 changed files with 528 additions and 30 deletions.
diff --git a/.github/workflows/format-typing-check.yml b/.github/workflows/format-typing-check.yml
@@ -37,7 +37,7 @@ jobs:
       - name: Install ruff and mypy
         run: |
           pip install ruff mypy typing_extensions \
-            types-Deprecated types-beautifulsoup4 types-jsonschema types-networkx pandas-stubs
+            types-Deprecated types-beautifulsoup4 types-jsonschema types-networkx types-tabulate pandas-stubs 
       - name: Get all changed python files
         id: changed-python-files
         uses: tj-actions/changed-files@v44

diff --git a/pyproject.toml b/pyproject.toml
@@ -63,6 +63,7 @@ dev = [
     "types-beautifulsoup4",
     "types-jsonschema",
     "types-networkx",
+    "types-tabulate",
     "pandas-stubs",
     # docs
     "black",

diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 import logging
 from typing import TYPE_CHECKING
+from typing import Any
 from deprecated import deprecated
 from nplinker.strain import Strain
 from .aa_pred import predict_aa
@@ -173,6 +174,73 @@ def is_mibig(self) -> bool:
         """
         return self.id.startswith("BGC")
 
+    def to_dict(self) -> dict[str, Any]:
+        """Convert the BGC object to a dictionary for exporting purpose.
+
+        Entries that are `None` are left out of the `GCF_id` and
+        `GCF_bigscape_class` lists, and `product_prediction` is copied into a
+        new list. All other values are taken from the attributes as they are.
+
+        Returns:
+            A dictionary containing the following key-value pairs:
+
+            - GCF_id (list[str]): IDs of the GCFs in `self.parents` (`None` IDs are skipped).
+            - GCF_bigscape_class (list[str]): BiG-SCAPE classes (`None` entries are skipped).
+            - strain_id (str | None): The ID of the strain, or `None` if no strain is set.
+            - description (str | None): A description of the BGC.
+            - BGC_name (str): The name of the BGC (its `id`).
+            - product_prediction (list[str]): (predicted) products or product classes of the BGC.
+            - mibig_bgc_class (list[str] | None): MIBiG biosynthetic classes.
+            - antismash_id (str | None): The antiSMASH ID.
+            - antismash_region (int | None): The antiSMASH region number.
+        """
+        # Keys are ordered to make the output easier to analyze
+        return {
+            "GCF_id": [gcf.id for gcf in self.parents if gcf.id is not None],
+            "GCF_bigscape_class": [bsc for bsc in self.bigscape_classes if bsc is not None],
+            "strain_id": self.strain.id if self.strain is not None else None,
+            "description": self.description,
+            "BGC_name": self.id,
+            "product_prediction": list(self.product_prediction),
+            "mibig_bgc_class": self.mibig_bgc_class,
+            "antismash_id": self.antismash_id,
+            "antismash_region": self.antismash_region,
+        }
+
+    def to_tabular(self) -> dict[str, str]:
+        """Convert the BGC object to a tabular format.
+
+        Each value of `to_dict()` is converted to a string with `_to_string`,
+        and every tab character in the result is replaced by four spaces.
+
+        Returns:
+            dict: A dictionary representing the BGC object in tabular format.
+                The keys can be treated as headers and values are strings that
+                contain no tab characters.
+                This dict can be exported as a TSV file.
+        """
+        return {
+            # Tabs would clash with the TSV column delimiter, so they are replaced
+            key: self._to_string(value).replace("\t", "    ")
+            for key, value in self.to_dict().items()
+        }
+
+    @staticmethod
+    def _to_string(value: Any) -> str:
+        """Convert various types of values to a string.
+
+        Conversion rules, checked in this order:
+
+        - list: items are converted with `str` and joined with ", ".
+        - dict: items are rendered as "key:value" and joined with ", ".
+        - None: converted to an empty string.
+        - anything else: converted with `str`, so `0` and `0.0` become
+          "0" and "0.0" rather than empty strings.
+
+        Args:
+            value: The value to be converted to a string.
+                Can be a list, dict, or any other JSON-compatible type.
+
+        Returns:
+            A string representation of the input value.
+        """
+        # Convert list to comma-separated string
+        if isinstance(value, list):
+            formatted_value = ", ".join(map(str, value))
+        # Convert dict to comma-separated string
+        elif isinstance(value, dict):
+            formatted_value = ", ".join([f"{k}:{v}" for k, v in value.items()])
+        # Convert None to empty string
+        elif value is None:
+            formatted_value = ""
+        # Convert anything else to string
+        else:
+            formatted_value = str(value)
+        return formatted_value
+
     # CG: why not providing whole product but only amino acid as product monomer?
     # this property is not used in NPLinker core business.
     @property

diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 from functools import cached_property
 from typing import TYPE_CHECKING
+from typing import Any
 import numpy as np
 from nplinker.strain import Strain
 from nplinker.strain import StrainCollection
@@ -108,3 +109,65 @@ def has_strain(self, strain: Strain) -> bool:
             True when the given strain exist in the spectrum.
         """
         return strain in self.strains
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert the Spectrum object to a dictionary for exporting purpose.
+
+        Returns:
+            A dictionary containing the following key-value pairs:
+
+                - "spectrum_id" (str): The unique identifier of the spectrum.
+                - "num_strains_with_spectrum" (int): The number of strains associated with the spectrum.
+                - "precursor_mz" (float): The precursor m/z value, rounded to four decimal places.
+                - "rt" (float): The retention time, rounded to three decimal places.
+                - "molecular_family" (str | None): The identifier of the molecular family,
+                  or `None` when `self.family` is falsy.
+                - "gnps_id" (str | None): The GNPS identifier.
+                - "gnps_annotations" (dict[str, str]): A dictionary of GNPS annotations.
+                  This is the attribute itself, not a copy.
+        """
+        return {
+            "spectrum_id": self.id,
+            "num_strains_with_spectrum": len(self.strains),
+            "precursor_mz": round(self.precursor_mz, 4),
+            "rt": round(self.rt, 3),
+            "molecular_family": self.family.id if self.family else None,
+            "gnps_id": self.gnps_id,
+            "gnps_annotations": self.gnps_annotations,
+        }
+
+    def to_tabular(self) -> dict[str, str]:
+        """Convert the Spectrum object to a tabular format.
+
+        Each value of `to_dict()` is converted to a string with `_to_string`,
+        and every tab character in the result is replaced by four spaces.
+
+        Returns:
+            dict: A dictionary representing the Spectrum object in tabular format.
+                The keys can be treated as headers and values are strings that
+                contain no tab characters.
+                This dict can be exported as a TSV file.
+        """
+        return {
+            # Tabs would clash with the TSV column delimiter, so they are replaced
+            key: self._to_string(value).replace("\t", "    ")
+            for key, value in self.to_dict().items()
+        }
+
+    @staticmethod
+    def _to_string(value: Any) -> str:
+        """Convert various types of values to a string.
+
+        Conversion rules, checked in this order:
+
+        - list: items are converted with `str` and joined with ", ".
+        - dict: items are rendered as "key:value" and joined with ", ".
+        - None: converted to an empty string.
+        - anything else: converted with `str`, so `0` and `0.0` become
+          "0" and "0.0" rather than empty strings.
+
+        Args:
+            value: The value to be converted to a string.
+                Can be a list, dict, or any other JSON-compatible type.
+
+        Returns:
+            A string representation of the input value.
+        """
+        # Convert list to comma-separated string
+        if isinstance(value, list):
+            formatted_value = ", ".join(map(str, value))
+        # Convert dict to comma-separated string
+        elif isinstance(value, dict):
+            formatted_value = ", ".join([f"{k}:{v}" for k, v in value.items()])
+        # Convert None to empty string
+        elif value is None:
+            formatted_value = ""
+        # Convert anything else to string
+        else:
+            formatted_value = str(value)
+        return formatted_value
diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py
@@ -1,4 +1,5 @@
 from __future__ import annotations
+import csv
 import logging
 import pickle
 from collections.abc import Sequence
@@ -355,3 +356,43 @@ def save_data(
         data = (self.bgcs, self.gcfs, self.spectra, self.mfs, self.strains, links)
         with open(file, "wb") as f:
             pickle.dump(data, f)
+
+    def objects_to_tsv(self, objects: Sequence[BGC] | Sequence[Spectrum], filename: str) -> None:
+        """Exports a list of BGC or Spectrum objects to a tsv file.
+
+        Args:
+            objects (list): A list of BGC or a list of Spectrum objects to be exported.
+            filename (str): The name of the output file.
+        """
+        if not objects:
+            raise ValueError("No objects provided to export")
+
+        # Ensure all elements in the list are of the same type
+        obj_type = type(objects[0])
+        if not all(isinstance(obj, obj_type) for obj in objects):
+            raise TypeError("All objects in the list must be of the same type")
+
+        with open(self._output_dir / filename, "w", newline="") as outfile:
+            headers = objects[0].to_tabular().keys()
+            writer = csv.DictWriter(outfile, fieldnames=headers, delimiter="\t")
+            writer.writeheader()
+            for obj in objects:
+                writer.writerow(obj.to_tabular())
+
+    def to_tsv(self, lg: LinkGraph | None = None) -> None:
+        """Export data to tsv files.
+
+        This method exports the following data to separate TSV files in the
+        output directory:
+
+         - BGC objects: `genomics_data.tsv`
+         - Spectrum objects: `metabolomics_data.tsv`
+         - LinkGraph object (if given): `links.tsv`
+
+        Args:
+            lg (LinkGraph | None): An optional LinkGraph object. If provided,
+                       the links data will be exported to 'links.tsv'.
+
+        Raises:
+            ValueError: If `self.bgcs` or `self.spectra` is empty
+                (raised by `objects_to_tsv`).
+            TypeError: If `self.bgcs` or `self.spectra` contains objects of
+                mixed types (raised by `objects_to_tsv`).
+        """
+        self.objects_to_tsv(self.bgcs, "genomics_data.tsv")
+        self.objects_to_tsv(self.spectra, "metabolomics_data.tsv")
+        if lg is not None:
+            lg.to_tsv(self._output_dir / "links.tsv")