Skip to content

Commit

Permalink
add methods to export results in tabular format (#280)
Browse files Browse the repository at this point in the history
* add print links method to LinkGraph, improve LinkGraph string representation

* feat: add a method to print tabular results files

* improve method names and docstrings, remove unused method to export gcf file

* improve docstring and typing

* fix a failing test

* refactor a little bit the spectrum method to convert to dict

* change the output format for gnps_annotations in metabolomics results file, improve docstrings

* fix: convert int to str before using join

* change representation of empty values in output files for improved integration to excel

* refactoring the export methods

* small refactor: specify staticmethod

* add more tests

* correct typing in docstrings

* typing: changed typings to pass mypy static typing checks

* refactor: change the order of methods/functions

* restore the order of already existing functions and methods

* make dicts json compatible

* rename functions and variables

* refactor: changed the place when the index is added to the link dict

* use csv package to write the tabular output files

* make sure all elements of the input list have the same type of data.

* shorten too-long docstring lines, correct some docstrings

* tests: adapted the test to the changes

* remove a file that was committed by accident

* Improve docstrings

Apply suggestions from code review

Co-authored-by: Cunliang Geng <[email protected]>

* Improve docstrings

Apply suggestions from code review

Co-authored-by: Cunliang Geng <[email protected]>

* refactor: add method to convert a value to string for tabular output

* improve docstring, add a comment about key order of bgc dict representation

* move to_string method to the BGC/Spectrum class, add a to_tabular method

* add tests for the to_string method

* change to_tabular so it returns a list and not a string

* refactor: to_tabular returns dict, to_string turned into private func, tabs are replaced in to_tabular

* fix typing in to_tabular methods

* update docstrings and comments

* ensure 0 and 0.0 are correctly converted to strings, and not to empty strings

* change the order of methods

* remove whitespace in blank lines

* update and add tests

* change variable name to fix mypy error

* test: trying to fix unit test issue where the spectrum rt is a dict instead of numerical

* tests: add precursor charge to the test spectra

* Update src/nplinker/metabolomics/spectrum.py

---------

Co-authored-by: Cunliang Geng <[email protected]>
  • Loading branch information
liannette and CunliangGeng authored Dec 4, 2024
1 parent 4d57ccd commit 481a068
Show file tree
Hide file tree
Showing 10 changed files with 528 additions and 30 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/format-typing-check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ jobs:
- name: Install ruff and mypy
run: |
pip install ruff mypy typing_extensions \
types-Deprecated types-beautifulsoup4 types-jsonschema types-networkx pandas-stubs
types-Deprecated types-beautifulsoup4 types-jsonschema types-networkx types-tabulate pandas-stubs
- name: Get all changed python files
id: changed-python-files
uses: tj-actions/changed-files@v44
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ dev = [
"types-beautifulsoup4",
"types-jsonschema",
"types-networkx",
"types-tabulate",
"pandas-stubs",
# docs
"black",
Expand Down
68 changes: 68 additions & 0 deletions src/nplinker/genomics/bgc.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations
import logging
from typing import TYPE_CHECKING
from typing import Any
from deprecated import deprecated
from nplinker.strain import Strain
from .aa_pred import predict_aa
Expand Down Expand Up @@ -173,6 +174,73 @@ def is_mibig(self) -> bool:
"""
return self.id.startswith("BGC")

def to_dict(self) -> dict[str, Any]:
    """Build a plain-dict view of this BGC for exporting purposes.

    Returns:
        A dictionary with the following keys, in this order:

        - GCF_id (list[str]): IDs of the parent GCFs; GCFs whose ID is
          `None` are skipped.
        - GCF_bigscape_class (list[str]): BiG-SCAPE classes; `None`
          entries are skipped.
        - strain_id (str | None): ID of the strain, or `None` when no
          strain is set.
        - description (str | None): Description of the BGC.
        - BGC_name (str): ID of the BGC.
        - product_prediction (list[str]): (Predicted) products or product
          classes of the BGC.
        - mibig_bgc_class (list[str] | None): MIBiG biosynthetic classes.
        - antismash_id (str | None): The antiSMASH ID.
        - antismash_region (int | None): The antiSMASH region number.
    """
    record: dict[str, Any] = {}

    # Keys are inserted in a deliberate order to keep the output easy to analyze.
    record["GCF_id"] = [parent.id for parent in self.parents if parent.id is not None]
    record["GCF_bigscape_class"] = [
        bigscape_class
        for bigscape_class in self.bigscape_classes
        if bigscape_class is not None
    ]

    strain = self.strain
    record["strain_id"] = None if strain is None else strain.id

    record["description"] = self.description
    record["BGC_name"] = self.id
    record["product_prediction"] = list(self.product_prediction)
    record["mibig_bgc_class"] = self.mibig_bgc_class
    record["antismash_id"] = self.antismash_id
    record["antismash_region"] = self.antismash_region
    return record

def to_tabular(self) -> dict[str, str]:
    """Render the BGC as a flat mapping of column header to cell text.

    Returns:
        A dictionary with the same keys as `to_dict()`. Every value is
        converted to a string with `_to_string`, and each tab character
        in it is replaced by a single space, so the dict can be written
        as one row of a TSV file.
    """
    row: dict[str, str] = {}
    for header, raw_value in self.to_dict().items():
        cell = self._to_string(raw_value)
        # Tabs would be read as column separators in a TSV file.
        row[header] = cell.replace("\t", " ")
    return row

@staticmethod
def _to_string(value: Any) -> str:
    """Convert various types of values to a string for tabular output.

    Args:
        value: The value to be converted to a string.
            Can be a list, tuple, dict, `None`, or any other value that
            `str()` can render.

    Returns:
        A string representation of the input value:

        - list / tuple: items converted with `str()` and joined by ", ".
        - dict: "key:value" pairs joined by ", ".
        - `None`: the empty string.
        - anything else: `str(value)`, so `0` and `0.0` become "0" and
          "0.0" rather than an empty string.
    """
    # Tuples are joined like lists; otherwise they would be rendered as
    # their Python repr, e.g. "('a', 'b')".
    if isinstance(value, (list, tuple)):
        return ", ".join(map(str, value))
    if isinstance(value, dict):
        return ", ".join(f"{k}:{v}" for k, v in value.items())
    # Only None maps to an empty cell; falsy values such as 0 must be kept.
    if value is None:
        return ""
    return str(value)

# CG: why not providing whole product but only amino acid as product monomer?
# this property is not used in NPLinker core business.
@property
Expand Down
63 changes: 63 additions & 0 deletions src/nplinker/metabolomics/spectrum.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations
from functools import cached_property
from typing import TYPE_CHECKING
from typing import Any
import numpy as np
from nplinker.strain import Strain
from nplinker.strain import StrainCollection
Expand Down Expand Up @@ -108,3 +109,65 @@ def has_strain(self, strain: Strain) -> bool:
True when the given strain exist in the spectrum.
"""
return strain in self.strains

def to_dict(self) -> dict[str, Any]:
    """Build a plain-dict view of this Spectrum for exporting purposes.

    Returns:
        A dictionary containing the following key-value pairs:

        - "spectrum_id" (str): The unique identifier of the spectrum.
        - "num_strains_with_spectrum" (int): The number of strains
          associated with the spectrum.
        - "precursor_mz" (float): The precursor m/z value, rounded to
          four decimal places.
        - "rt" (float): The retention time, rounded to three decimal places.
        - "molecular_family" (str | None): The identifier of the molecular
          family, or `None` when the spectrum has no (truthy) family.
        - "gnps_id" (str | None): The GNPS identifier.
        - "gnps_annotations" (dict[str, str]): A dictionary of GNPS annotations.
    """
    spectrum_id = self.id
    strain_count = len(self.strains)
    rounded_mz = round(self.precursor_mz, 4)
    rounded_rt = round(self.rt, 3)

    family = self.family
    if family:
        family_id = family.id
    else:
        family_id = None

    return {
        "spectrum_id": spectrum_id,
        "num_strains_with_spectrum": strain_count,
        "precursor_mz": rounded_mz,
        "rt": rounded_rt,
        "molecular_family": family_id,
        "gnps_id": self.gnps_id,
        "gnps_annotations": self.gnps_annotations,
    }

def to_tabular(self) -> dict[str, str]:
    """Render the Spectrum as a flat mapping of column header to cell text.

    Returns:
        A dictionary with the same keys as `to_dict()`. Every value is
        converted to a string with `_to_string`, and each tab character
        in it is replaced by a single space, so the dict can be written
        as one row of a TSV file.
    """
    fields = self.to_dict().items()
    # Tabs would be read as column separators in a TSV file.
    return dict(
        (header, self._to_string(raw_value).replace("\t", " "))
        for header, raw_value in fields
    )

@staticmethod
def _to_string(value: Any) -> str:
    """Convert various types of values to a string for tabular output.

    Args:
        value: The value to be converted to a string.
            Can be a list, tuple, dict, `None`, or any other value that
            `str()` can render.

    Returns:
        A string representation of the input value:

        - list / tuple: items converted with `str()` and joined by ", ".
        - dict: "key:value" pairs joined by ", ".
        - `None`: the empty string.
        - anything else: `str(value)`, so `0` and `0.0` become "0" and
          "0.0" rather than an empty string.
    """
    # Tuples are joined like lists; otherwise they would be rendered as
    # their Python repr, e.g. "('a', 'b')".
    if isinstance(value, (list, tuple)):
        return ", ".join(str(item) for item in value)
    if isinstance(value, dict):
        return ", ".join(f"{k}:{v}" for k, v in value.items())
    # Only None maps to an empty cell; falsy values such as 0 must be kept.
    if value is None:
        return ""
    return str(value)
41 changes: 41 additions & 0 deletions src/nplinker/nplinker.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from __future__ import annotations
import csv
import logging
import pickle
from collections.abc import Sequence
Expand Down Expand Up @@ -355,3 +356,43 @@ def save_data(
data = (self.bgcs, self.gcfs, self.spectra, self.mfs, self.strains, links)
with open(file, "wb") as f:
pickle.dump(data, f)

def objects_to_tsv(self, objects: Sequence[BGC] | Sequence[Spectrum], filename: str) -> None:
    """Export a list of BGC or Spectrum objects to a TSV file.

    The file is created inside `self._output_dir`. The header row is taken
    from the keys of the first object's `to_tabular()` dict, and one row is
    written per object.

    Args:
        objects: A list of BGC objects or a list of Spectrum objects to be
            exported. All elements must be instances of the type of the
            first element.
        filename: The name of the output file, relative to the output
            directory.

    Raises:
        ValueError: If `objects` is empty.
        TypeError: If the elements of `objects` are not all of the same type.
    """
    if not objects:
        raise ValueError("No objects provided to export")

    # Ensure all elements in the list are of the same type
    obj_type = type(objects[0])
    if not all(isinstance(obj, obj_type) for obj in objects):
        raise TypeError("All objects in the list must be of the same type")

    # Convert every object exactly once, and before the file is opened, so a
    # failing conversion does not leave a truncated file behind.
    rows = [obj.to_tabular() for obj in objects]
    headers = list(rows[0])

    # An explicit encoding keeps the output independent of the platform's
    # locale; newline="" lets the csv module control line endings itself.
    with open(self._output_dir / filename, "w", newline="", encoding="utf-8") as outfile:
        writer = csv.DictWriter(outfile, fieldnames=headers, delimiter="\t")
        writer.writeheader()
        writer.writerows(rows)

def to_tsv(self, lg: LinkGraph | None = None) -> None:
    """Export data to TSV files.

    This method exports the following data to separate TSV files:

    - BGC objects: `genomics_data.tsv`
    - Spectrum objects: `metabolomics_data.tsv`
    - LinkGraph object (if given): `links.tsv`

    Args:
        lg: An optional LinkGraph object. If provided, the links data
            is exported to `links.tsv` in the output directory.
    """
    # BGCs first, then spectra; both go through objects_to_tsv.
    self.objects_to_tsv(self.bgcs, "genomics_data.tsv")
    self.objects_to_tsv(self.spectra, "metabolomics_data.tsv")

    # Links are optional: nothing more to do without a link graph.
    if lg is None:
        return

    links_file = self._output_dir / "links.tsv"
    lg.to_tsv(links_file)
Loading

0 comments on commit 481a068

Please sign in to comment.