Skip to content

Commit

Permalink
Use a uniform attribute name `id` (#253)
Browse files Browse the repository at this point in the history
* change Strain parameter `primary_id` to `id`

* change MolecularFamily attribute `family_id` to `id`

* change Spectrum attribute `spectrum_id` to `id`

* change BGC attribute `bgc_id` to `id`

* change GCF attribute `gcf_id` to `id`
  • Loading branch information
CunliangGeng authored Jun 12, 2024
1 parent c7d2be8 commit ccda2d8
Show file tree
Hide file tree
Showing 31 changed files with 135 additions and 143 deletions.
9 changes: 5 additions & 4 deletions notebooks/npclassscore_linking/prospecting/class_linking.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#!/usr/bin/env python3
"""Initial code for NPClassScore."""

import glob
import os
import sys
Expand Down Expand Up @@ -541,7 +542,7 @@ def class_linking_score(self, obj, target):

if is_spectrum:
# list of list of tuples/None - todo: add to spectrum object
spec_like_classes = self.canopus.spectra_classes.get(str(spec_like.spectrum_id))
spec_like_classes = self.canopus.spectra_classes.get(str(spec_like.id))
spec_like_classes_names_inds = self.canopus.spectra_classes_names_inds
else: # molfam
spec_like_classes = self.canopus.molfam_classes.get(str(spec_like.family_id))
Expand Down Expand Up @@ -654,7 +655,7 @@ def npclass_score(self, obj, target, method="main"):
if is_spectrum:
# list of list of tuples/None - todo: add to spectrum object
# take only 'best' (first) classification per ontology level
all_classes = self.canopus.spectra_classes.get(str(spec_like.spectrum_id))
all_classes = self.canopus.spectra_classes.get(str(spec_like.id))
if all_classes:
spec_like_classes = [
cls_per_lvl
Expand All @@ -675,7 +676,7 @@ def npclass_score(self, obj, target, method="main"):
spec_like_classes_names_inds = self.canopus.molfam_classes_names_inds
if use_mne and not spec_like_classes: # if mne or when main/canopus does not get classes
if is_spectrum:
spec_like_classes = self.molnetenhancer.spectra_classes(spec_like.spectrum_id)
spec_like_classes = self.molnetenhancer.spectra_classes(spec_like.id)
else: # molfam
spec_like_classes = self.molnetenhancer.molfam_classes.get(str(spec_like.family_id))
# classes are same for molfam and spectrum so names are irrespective of is_spectrum
Expand Down Expand Up @@ -777,4 +778,4 @@ def _get_bgc_like_classes(self, bgc_like, is_bgc):
return bgc_like_classes_dict

def _get_bgc_like_gcf(self, bgc_like):
return [gcf for gcf in self.gcfs if bgc_like.bgc_id in [b.bgc_id for b in gcf.bgcs]][0]
return [gcf for gcf in self.gcfs if bgc_like.bgc_id in [b.id for b in gcf.bgcs]][0]
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@
# 2. check chemical compound predictions from canopus and molnetenhancer
test_spec = list(npl.spectra)[500]

print(npl.canopus.spectra_classes.get(str(test_spec.spectrum_id)))
print(npl.molnetenhancer.spectra_classes(str(test_spec.spectrum_id)))
print(npl.canopus.spectra_classes.get(str(test_spec.id)))
print(npl.molnetenhancer.spectra_classes(str(test_spec.id)))

# 3. example of a good score, (predicted) NRP linking to a (predicted) peptide like spectrum
print(npl.class_linking_score(list(npl.gcfs)[0], test_spec))
Expand Down
6 changes: 3 additions & 3 deletions src/nplinker/class_info/chem_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -396,17 +396,17 @@ class prediction for a level. When no class is present, instead of Tuple it will
molfam_classes = {}

for molfam in molfams:
fid = molfam.family_id # the key
fid = molfam.id # the key
spectra = molfam.spectra
# if singleton family, format like 'fid_spectrum-id'
if fid.startswith("singleton-"):
spec_id = spectra[0].spectrum_id
spec_id = spectra[0].id
fid += f"_{spec_id}"
len_molfam = len(spectra)

classes_per_spectra = []
for spec in spectra:
spec_classes = self.spectra_classes.get(spec.spectrum_id)
spec_classes = self.spectra_classes.get(spec.id)
if spec_classes: # account for spectra without prediction
classes_per_spectra.append(spec_classes)

Expand Down
20 changes: 9 additions & 11 deletions src/nplinker/genomics/bgc.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class BGC:
and used by MIBiG.
Attributes:
bgc_id: BGC identifier, e.g. MIBiG accession, GenBank accession.
id: BGC identifier, e.g. MIBiG accession, GenBank accession.
product_prediction: A tuple of (predicted) natural
products or product classes of the BGC.
For antiSMASH's GenBank data, the feature `region /product`
Expand Down Expand Up @@ -59,15 +59,15 @@ class BGC:
strain: The strain of the BGC.
"""

def __init__(self, bgc_id: str, /, *product_prediction: str):
def __init__(self, id: str, /, *product_prediction: str):
"""Initialize the BGC object.
Args:
bgc_id: BGC identifier, e.g. MIBiG accession, GenBank accession.
id: BGC identifier, e.g. MIBiG accession, GenBank accession.
product_prediction: BGC's (predicted) natural products or product classes.
"""
# BGC metadata
self.bgc_id = bgc_id
self.id = id
self.product_prediction = product_prediction

self.mibig_bgc_class: tuple[str] | None = None
Expand All @@ -87,23 +87,21 @@ def __repr__(self):
return str(self)

def __str__(self):
return "{}(bgc_id={}, strain={}, asid={}, region={})".format(
return "{}(id={}, strain={}, asid={}, region={})".format(
self.__class__.__name__,
self.bgc_id,
self.id,
self.strain,
self.antismash_id,
self.antismash_region,
)

def __eq__(self, other) -> bool:
if isinstance(other, BGC):
return (
self.bgc_id == other.bgc_id and self.product_prediction == other.product_prediction
)
return self.id == other.id and self.product_prediction == other.product_prediction
return NotImplemented

def __hash__(self) -> int:
return hash((self.bgc_id, self.product_prediction))
return hash((self.id, self.product_prediction))

def add_parent(self, gcf: GCF) -> None:
"""Add a parent GCF to the BGC.
Expand Down Expand Up @@ -146,7 +144,7 @@ def is_mibig(self) -> bool:
Returns:
True if it's MIBiG reference BGC
"""
return self.bgc_id.startswith("BGC")
return self.id.startswith("BGC")

# CG: why not providing whole product but only amino acid as product monomer?
# this property is not used in NPLinker core business.
Expand Down
20 changes: 10 additions & 10 deletions src/nplinker/genomics/gcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,29 +18,29 @@ class GCF:
tools such as BiG-SCAPE and BiG-SLICE.
Attributes:
gcf_id: id of the GCF object.
id: id of the GCF object.
bgc_ids: a set of BGC ids that belongs to the GCF.
bigscape_class: BiG-SCAPE's BGC class.
BiG-SCAPE's BGC classes are similar to those defined in MIBiG
but have more categories (7 classes). More details see:
https://doi.org/10.1038%2Fs41589-019-0400-9.
"""

def __init__(self, gcf_id: str, /) -> None:
def __init__(self, id: str, /) -> None:
"""Initialize the GCF object.
Args:
gcf_id: id of the GCF object.
id: id of the GCF object.
"""
self.gcf_id = gcf_id
self.id = id
self.bgc_ids: set[str] = set()
self.bigscape_class: str | None = None
self._bgcs: set[BGC] = set()
self._strains: StrainCollection = StrainCollection()

def __str__(self) -> str:
return (
f"GCF(id={self.gcf_id}, #BGC_objects={len(self.bgcs)}, #bgc_ids={len(self.bgc_ids)},"
f"GCF(id={self.id}, #BGC_objects={len(self.bgcs)}, #bgc_ids={len(self.bgc_ids)},"
f"#strains={len(self._strains)})."
)

Expand All @@ -49,7 +49,7 @@ def __repr__(self) -> str:

def __eq__(self, other) -> bool:
if isinstance(other, GCF):
return self.gcf_id == other.gcf_id and self.bgcs == other.bgcs
return self.id == other.id and self.bgcs == other.bgcs
return NotImplemented

def __hash__(self) -> int:
Expand All @@ -58,7 +58,7 @@ def __hash__(self) -> int:
Note that the GCF class is a mutable container. We only hash the GCF id so
that the hash value does not change when `self._bgcs` is updated.
"""
return hash(self.gcf_id)
return hash(self.id)

@property
def bgcs(self) -> set[BGC]:
Expand All @@ -74,17 +74,17 @@ def add_bgc(self, bgc: BGC) -> None:
"""Add a BGC object to the GCF."""
bgc.parents.add(self)
self._bgcs.add(bgc)
self.bgc_ids.add(bgc.bgc_id)
self.bgc_ids.add(bgc.id)
if bgc.strain is not None:
self._strains.add(bgc.strain)
else:
logger.warning("No strain specified for the BGC %s", bgc.bgc_id)
logger.warning("No strain specified for the BGC %s", bgc.id)

def detach_bgc(self, bgc: BGC) -> None:
"""Remove a child BGC object."""
bgc.parents.remove(self)
self._bgcs.remove(bgc)
self.bgc_ids.remove(bgc.bgc_id)
self.bgc_ids.remove(bgc.id)
if bgc.strain is not None:
for other_bgc in self._bgcs:
if other_bgc.strain == bgc.strain:
Expand Down
6 changes: 3 additions & 3 deletions src/nplinker/genomics/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,13 +91,13 @@ def add_strain_to_bgc(strains: StrainCollection, bgcs: list[BGC]) -> tuple[list[
bgc_without_strain = []
for bgc in bgcs:
try:
strain_list = strains.lookup(bgc.bgc_id)
strain_list = strains.lookup(bgc.id)
except ValueError:
bgc_without_strain.append(bgc)
continue
if len(strain_list) > 1:
raise ValueError(
f"Multiple strain objects found for BGC id '{bgc.bgc_id}'."
f"Multiple strain objects found for BGC id '{bgc.id}'."
f"BGC object accept only one strain."
)
bgc.strain = strain_list[0]
Expand Down Expand Up @@ -136,7 +136,7 @@ def add_bgc_to_gcf(
- The dictionary contains GCF objects as keys and a set of ids of missing
BGC objects as values.
"""
bgc_dict = {bgc.bgc_id: bgc for bgc in bgcs}
bgc_dict = {bgc.id: bgc for bgc in bgcs}
gcf_with_bgc = []
gcf_without_bgc = []
gcf_missing_bgc: dict[GCF, set[str]] = {}
Expand Down
16 changes: 8 additions & 8 deletions src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,22 +98,22 @@ def _load(self) -> None:
for row in reader:
spec1_id = row["CLUSTERID1"]
spec2_id = row["CLUSTERID2"]
family_id = row["ComponentIndex"]
if family_id not in family_dict:
family_dict[family_id] = set([spec1_id, spec2_id])
mf_id = row["ComponentIndex"]
if mf_id not in family_dict:
family_dict[mf_id] = set([spec1_id, spec2_id])
else:
family_dict[family_id].add(spec1_id)
family_dict[family_id].add(spec2_id)
family_dict[mf_id].add(spec1_id)
family_dict[mf_id].add(spec2_id)
# convert dict to list of MolecularFamily objects
for family_id, spectra_ids in family_dict.items():
if family_id == "-1": # "-1" is from GNPS, it means the singleton molecular family
for mf_id, spectra_ids in family_dict.items():
if mf_id == "-1": # "-1" is from GNPS, it means the singleton molecular family
for spectrum_id in spectra_ids:
# family id must be unique, so using "singleton-" + spectrum id as family id
family = MolecularFamily("singleton-" + str(spectrum_id))
family.spectra_ids = set([spectrum_id])
self._mfs.append(family)
else:
# for regular molecular families, use the value of "ComponentIndex" as family id
family = MolecularFamily(family_id)
family = MolecularFamily(mf_id)
family.spectra_ids = spectra_ids
self._mfs.append(family)
2 changes: 1 addition & 1 deletion src/nplinker/metabolomics/gnps/gnps_spectrum_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def _load(self):
rt = spec["params"].get("rtinseconds", 0)

spectrum = Spectrum(
spectrum_id=spectrum_id,
id=spectrum_id,
mz=list(spec["m/z array"]),
intensity=list(spec["intensity array"]),
precursor_mz=precursor_mz,
Expand Down
18 changes: 9 additions & 9 deletions src/nplinker/metabolomics/molecular_family.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,24 +12,24 @@ class MolecularFamily:
"""Class to model molecular family.
Attributes:
family_id: Unique id for the molecular family.
id: Unique id for the molecular family.
spectra_ids: Set of spectrum ids in the molecular family.
"""

def __init__(self, family_id: str):
def __init__(self, id: str):
"""Initialize the MolecularFamily.
Args:
family_id: Unique id for the molecular family.
id: Unique id for the molecular family.
"""
self.family_id: str = family_id
self.id: str = id
self.spectra_ids: set[str] = set()
self._spectra: set[Spectrum] = set()
self._strains: StrainCollection = StrainCollection()

def __str__(self) -> str:
return (
f"MolecularFamily(family_id={self.family_id}, #Spectrum_objects={len(self._spectra)}, "
f"MolecularFamily(id={self.id}, #Spectrum_objects={len(self._spectra)}, "
f"#spectrum_ids={len(self.spectra_ids)}, #strains={len(self._strains)})"
)

Expand All @@ -38,11 +38,11 @@ def __repr__(self) -> str:

def __eq__(self, other) -> bool:
if isinstance(other, MolecularFamily):
return self.family_id == other.family_id
return self.id == other.id
return NotImplemented

def __hash__(self) -> int:
return hash(self.family_id)
return hash(self.id)

@property
def spectra(self) -> set[Spectrum]:
Expand All @@ -61,7 +61,7 @@ def add_spectrum(self, spectrum: Spectrum) -> None:
spectrum: `Spectrum` object to add to the molecular family.
"""
self._spectra.add(spectrum)
self.spectra_ids.add(spectrum.spectrum_id)
self.spectra_ids.add(spectrum.id)
self._strains = self._strains + spectrum.strains
# add the molecular family to the spectrum
spectrum.family = self
Expand All @@ -73,7 +73,7 @@ def detach_spectrum(self, spectrum: Spectrum) -> None:
spectrum: `Spectrum` object to remove from the molecular family.
"""
self._spectra.remove(spectrum)
self.spectra_ids.remove(spectrum.spectrum_id)
self.spectra_ids.remove(spectrum.id)
self._strains = self._update_strains()
# remove the molecular family from the spectrum
spectrum.family = None
Expand Down
14 changes: 7 additions & 7 deletions src/nplinker/metabolomics/spectrum.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ class Spectrum:
"""Class to model MS/MS Spectrum.
Attributes:
spectrum_id: the spectrum ID.
id: the spectrum ID.
mz: the list of m/z values.
intensity: the list of intensity values.
precursor_mz: the m/z value of the precursor.
Expand All @@ -30,7 +30,7 @@ class Spectrum:

def __init__(
self,
spectrum_id: str,
id: str,
mz: list[float],
intensity: list[float],
precursor_mz: float,
Expand All @@ -40,15 +40,15 @@ def __init__(
"""Initialize the Spectrum.
Args:
spectrum_id: the spectrum ID.
id: the spectrum ID.
mz: the list of m/z values.
intensity: the list of intensity values.
precursor_mz: the precursor m/z.
rt: the retention time in seconds. Defaults to 0.
metadata: the metadata of the spectrum, i.e. the header information
in the MGF file.
"""
self.spectrum_id = spectrum_id
self.id = id
self.mz = mz
self.intensity = intensity
self.precursor_mz = precursor_mz
Expand All @@ -61,18 +61,18 @@ def __init__(
self.family: MolecularFamily | None = None

def __str__(self) -> str:
return f"Spectrum(spectrum_id={self.spectrum_id}, #strains={len(self.strains)})"
return f"Spectrum(id={self.id}, #strains={len(self.strains)})"

def __repr__(self) -> str:
return str(self)

def __eq__(self, other) -> bool:
if isinstance(other, Spectrum):
return self.spectrum_id == other.spectrum_id and self.precursor_mz == other.precursor_mz
return self.id == other.id and self.precursor_mz == other.precursor_mz
return NotImplemented

def __hash__(self) -> int:
return hash((self.spectrum_id, self.precursor_mz))
return hash((self.id, self.precursor_mz))

@cached_property
def peaks(self) -> np.ndarray:
Expand Down
Loading

0 comments on commit ccda2d8

Please sign in to comment.