use uniform attribute name `id` (#253)

* change Strain parameter `primary_id` to `id`
* change MolecularFamily attribute `family_id` to `id`
* change Spectrum attribute `spectrum_id` to `id`
* change BGC attribute `bgc_id` to `id`
* change GCF attribute `gcf_id` to `id`
NPLinker · Jun 12, 2024 · ccda2d8 · ccda2d8
1 parent c7d2be8
commit ccda2d8
Show file tree

Hide file tree

Showing 31 changed files with 135 additions and 143 deletions.
diff --git a/notebooks/npclassscore_linking/prospecting/class_linking.py b/notebooks/npclassscore_linking/prospecting/class_linking.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3
 """Initial code for NPClassScore."""
+
 import glob
 import os
 import sys
@@ -541,7 +542,7 @@ def class_linking_score(self, obj, target):
 
         if is_spectrum:
             # list of list of tuples/None - todo: add to spectrum object
-            spec_like_classes = self.canopus.spectra_classes.get(str(spec_like.spectrum_id))
+            spec_like_classes = self.canopus.spectra_classes.get(str(spec_like.id))
             spec_like_classes_names_inds = self.canopus.spectra_classes_names_inds
         else:  # molfam
             spec_like_classes = self.canopus.molfam_classes.get(str(spec_like.family_id))
@@ -654,7 +655,7 @@ def npclass_score(self, obj, target, method="main"):
             if is_spectrum:
                 # list of list of tuples/None - todo: add to spectrum object
                 # take only 'best' (first) classification per ontology level
-                all_classes = self.canopus.spectra_classes.get(str(spec_like.spectrum_id))
+                all_classes = self.canopus.spectra_classes.get(str(spec_like.id))
                 if all_classes:
                     spec_like_classes = [
                         cls_per_lvl
@@ -675,7 +676,7 @@ def npclass_score(self, obj, target, method="main"):
                 spec_like_classes_names_inds = self.canopus.molfam_classes_names_inds
         if use_mne and not spec_like_classes:  # if mne or when main/canopus does not get classes
             if is_spectrum:
-                spec_like_classes = self.molnetenhancer.spectra_classes(spec_like.spectrum_id)
+                spec_like_classes = self.molnetenhancer.spectra_classes(spec_like.id)
             else:  # molfam
                 spec_like_classes = self.molnetenhancer.molfam_classes.get(str(spec_like.family_id))
             # classes are same for molfam and spectrum so names are irrespective of is_spectrum
@@ -777,4 +778,4 @@ def _get_bgc_like_classes(self, bgc_like, is_bgc):
         return bgc_like_classes_dict
 
     def _get_bgc_like_gcf(self, bgc_like):
-        return [gcf for gcf in self.gcfs if bgc_like.bgc_id in [b.bgc_id for b in gcf.bgcs]][0]
+        return [gcf for gcf in self.gcfs if bgc_like.id in [b.id for b in gcf.bgcs]][0]
diff --git a/notebooks/npclassscore_linking/prospecting/class_linking_test.py b/notebooks/npclassscore_linking/prospecting/class_linking_test.py
@@ -35,8 +35,8 @@
     # 2. check chemical compound predictions from canopus and molnetenhancer
     test_spec = list(npl.spectra)[500]
 
-    print(npl.canopus.spectra_classes.get(str(test_spec.spectrum_id)))
-    print(npl.molnetenhancer.spectra_classes(str(test_spec.spectrum_id)))
+    print(npl.canopus.spectra_classes.get(str(test_spec.id)))
+    print(npl.molnetenhancer.spectra_classes(str(test_spec.id)))
 
     # 3. example of a good score, (predicted) NRP linking to a (predicted) peptide like spectrum
     print(npl.class_linking_score(list(npl.gcfs)[0], test_spec))

diff --git a/src/nplinker/class_info/chem_classes.py b/src/nplinker/class_info/chem_classes.py
@@ -396,17 +396,17 @@ class prediction for a level. When no class is present, instead of Tuple it will
         molfam_classes = {}
 
         for molfam in molfams:
-            fid = molfam.family_id  # the key
+            fid = molfam.id  # the key
             spectra = molfam.spectra
             # if singleton family, format like 'fid_spectrum-id'
             if fid.startswith("singleton-"):
-                spec_id = spectra[0].spectrum_id
+                spec_id = spectra[0].id
                 fid += f"_{spec_id}"
             len_molfam = len(spectra)
 
             classes_per_spectra = []
             for spec in spectra:
-                spec_classes = self.spectra_classes.get(spec.spectrum_id)
+                spec_classes = self.spectra_classes.get(spec.id)
                 if spec_classes:  # account for spectra without prediction
                     classes_per_spectra.append(spec_classes)
 

diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py
@@ -28,7 +28,7 @@ class BGC:
     and used by MIBiG.
 
     Attributes:
-        bgc_id: BGC identifier, e.g. MIBiG accession, GenBank accession.
+        id: BGC identifier, e.g. MIBiG accession, GenBank accession.
         product_prediction: A tuple of (predicted) natural
             products or product classes of the BGC.
             For antiSMASH's GenBank data, the feature `region /product`
@@ -59,15 +59,15 @@ class BGC:
         strain: The strain of the BGC.
     """
 
-    def __init__(self, bgc_id: str, /, *product_prediction: str):
+    def __init__(self, id: str, /, *product_prediction: str):
         """Initialize the BGC object.
 
         Args:
-            bgc_id: BGC identifier, e.g. MIBiG accession, GenBank accession.
+            id: BGC identifier, e.g. MIBiG accession, GenBank accession.
             product_prediction: BGC's (predicted) natural products or product classes.
         """
         # BGC metadata
-        self.bgc_id = bgc_id
+        self.id = id
         self.product_prediction = product_prediction
 
         self.mibig_bgc_class: tuple[str] | None = None
@@ -87,23 +87,21 @@ def __repr__(self):
         return str(self)
 
     def __str__(self):
-        return "{}(bgc_id={}, strain={}, asid={}, region={})".format(
+        return "{}(id={}, strain={}, asid={}, region={})".format(
             self.__class__.__name__,
-            self.bgc_id,
+            self.id,
             self.strain,
             self.antismash_id,
             self.antismash_region,
         )
 
     def __eq__(self, other) -> bool:
         if isinstance(other, BGC):
-            return (
-                self.bgc_id == other.bgc_id and self.product_prediction == other.product_prediction
-            )
+            return self.id == other.id and self.product_prediction == other.product_prediction
         return NotImplemented
 
     def __hash__(self) -> int:
-        return hash((self.bgc_id, self.product_prediction))
+        return hash((self.id, self.product_prediction))
 
     def add_parent(self, gcf: GCF) -> None:
         """Add a parent GCF to the BGC.
@@ -146,7 +144,7 @@ def is_mibig(self) -> bool:
         Returns:
             True if it's MIBiG reference BGC
         """
-        return self.bgc_id.startswith("BGC")
+        return self.id.startswith("BGC")
 
     # CG: why not providing whole product but only amino acid as product monomer?
     # this property is not used in NPLinker core business.

diff --git a/src/nplinker/genomics/gcf.py b/src/nplinker/genomics/gcf.py
@@ -18,29 +18,29 @@ class GCF:
     tools such as BiG-SCAPE and BiG-SLICE.
 
     Attributes:
-        gcf_id: id of the GCF object.
+        id: id of the GCF object.
         bgc_ids: a set of BGC ids that belongs to the GCF.
         bigscape_class: BiG-SCAPE's BGC class.
             BiG-SCAPE's BGC classes are similar to those defined in MiBIG
             but have more categories (7 classes). More details see:
             https://doi.org/10.1038%2Fs41589-019-0400-9.
     """
 
-    def __init__(self, gcf_id: str, /) -> None:
+    def __init__(self, id: str, /) -> None:
         """Initialize the GCF object.
 
         Args:
-            gcf_id: id of the GCF object.
+            id: id of the GCF object.
         """
-        self.gcf_id = gcf_id
+        self.id = id
         self.bgc_ids: set[str] = set()
         self.bigscape_class: str | None = None
         self._bgcs: set[BGC] = set()
         self._strains: StrainCollection = StrainCollection()
 
     def __str__(self) -> str:
         return (
-            f"GCF(id={self.gcf_id}, #BGC_objects={len(self.bgcs)}, #bgc_ids={len(self.bgc_ids)},"
+            f"GCF(id={self.id}, #BGC_objects={len(self.bgcs)}, #bgc_ids={len(self.bgc_ids)},"
             f"#strains={len(self._strains)})."
         )
 
@@ -49,7 +49,7 @@ def __repr__(self) -> str:
 
     def __eq__(self, other) -> bool:
         if isinstance(other, GCF):
-            return self.gcf_id == other.gcf_id and self.bgcs == other.bgcs
+            return self.id == other.id and self.bgcs == other.bgcs
         return NotImplemented
 
     def __hash__(self) -> int:
@@ -58,7 +58,7 @@ def __hash__(self) -> int:
         Note that GCF class is a mutable container. We only hash the GCF id to
         avoid the hash value changes when `self._bgcs` is updated.
         """
-        return hash(self.gcf_id)
+        return hash(self.id)
 
     @property
     def bgcs(self) -> set[BGC]:
@@ -74,17 +74,17 @@ def add_bgc(self, bgc: BGC) -> None:
         """Add a BGC object to the GCF."""
         bgc.parents.add(self)
         self._bgcs.add(bgc)
-        self.bgc_ids.add(bgc.bgc_id)
+        self.bgc_ids.add(bgc.id)
         if bgc.strain is not None:
             self._strains.add(bgc.strain)
         else:
-            logger.warning("No strain specified for the BGC %s", bgc.bgc_id)
+            logger.warning("No strain specified for the BGC %s", bgc.id)
 
     def detach_bgc(self, bgc: BGC) -> None:
         """Remove a child BGC object."""
         bgc.parents.remove(self)
         self._bgcs.remove(bgc)
-        self.bgc_ids.remove(bgc.bgc_id)
+        self.bgc_ids.remove(bgc.id)
         if bgc.strain is not None:
             for other_bgc in self._bgcs:
                 if other_bgc.strain == bgc.strain:

diff --git a/src/nplinker/genomics/utils.py b/src/nplinker/genomics/utils.py
@@ -91,13 +91,13 @@ def add_strain_to_bgc(strains: StrainCollection, bgcs: list[BGC]) -> tuple[list[
     bgc_without_strain = []
     for bgc in bgcs:
         try:
-            strain_list = strains.lookup(bgc.bgc_id)
+            strain_list = strains.lookup(bgc.id)
         except ValueError:
             bgc_without_strain.append(bgc)
             continue
         if len(strain_list) > 1:
             raise ValueError(
-                f"Multiple strain objects found for BGC id '{bgc.bgc_id}'."
+                f"Multiple strain objects found for BGC id '{bgc.id}'."
                 f"BGC object accept only one strain."
             )
         bgc.strain = strain_list[0]
@@ -136,7 +136,7 @@ def add_bgc_to_gcf(
             - The dictionary contains GCF objects as keys and a set of ids of missing
                 BGC objects as values.
     """
-    bgc_dict = {bgc.bgc_id: bgc for bgc in bgcs}
+    bgc_dict = {bgc.id: bgc for bgc in bgcs}
     gcf_with_bgc = []
     gcf_without_bgc = []
     gcf_missing_bgc: dict[GCF, set[str]] = {}

diff --git a/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py b/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py
@@ -98,22 +98,22 @@ def _load(self) -> None:
             for row in reader:
                 spec1_id = row["CLUSTERID1"]
                 spec2_id = row["CLUSTERID2"]
-                family_id = row["ComponentIndex"]
-                if family_id not in family_dict:
-                    family_dict[family_id] = set([spec1_id, spec2_id])
+                mf_id = row["ComponentIndex"]
+                if mf_id not in family_dict:
+                    family_dict[mf_id] = set([spec1_id, spec2_id])
                 else:
-                    family_dict[family_id].add(spec1_id)
-                    family_dict[family_id].add(spec2_id)
+                    family_dict[mf_id].add(spec1_id)
+                    family_dict[mf_id].add(spec2_id)
         # convert dict to list of MolecularFamily objects
-        for family_id, spectra_ids in family_dict.items():
-            if family_id == "-1":  # "-1" is from GNPS, it means the singleton molecular family
+        for mf_id, spectra_ids in family_dict.items():
+            if mf_id == "-1":  # "-1" is from GNPS, it means the singleton molecular family
                 for spectrum_id in spectra_ids:
                     # family id must be unique, so using "singleton-" + spectrum id as family id
                     family = MolecularFamily("singleton-" + str(spectrum_id))
                     family.spectra_ids = set([spectrum_id])
                     self._mfs.append(family)
             else:
                 # for regular molecular families, use the value of "ComponentIndex" as family id
-                family = MolecularFamily(family_id)
+                family = MolecularFamily(mf_id)
                 family.spectra_ids = spectra_ids
                 self._mfs.append(family)
diff --git a/src/nplinker/metabolomics/gnps/gnps_spectrum_loader.py b/src/nplinker/metabolomics/gnps/gnps_spectrum_loader.py
@@ -89,7 +89,7 @@ def _load(self):
             rt = spec["params"].get("rtinseconds", 0)
 
             spectrum = Spectrum(
-                spectrum_id=spectrum_id,
+                id=spectrum_id,
                 mz=list(spec["m/z array"]),
                 intensity=list(spec["intensity array"]),
                 precursor_mz=precursor_mz,

diff --git a/src/nplinker/metabolomics/molecular_family.py b/src/nplinker/metabolomics/molecular_family.py
@@ -12,24 +12,24 @@ class MolecularFamily:
     """Class to model molecular family.
 
     Attributes:
-        family_id: Unique id for the molecular family.
+        id: Unique id for the molecular family.
         spectra_ids: Set of spectrum ids in the molecular family.
     """
 
-    def __init__(self, family_id: str):
+    def __init__(self, id: str):
         """Initialize the MolecularFamily.
 
         Args:
-            family_id: Unique id for the molecular family.
+            id: Unique id for the molecular family.
         """
-        self.family_id: str = family_id
+        self.id: str = id
         self.spectra_ids: set[str] = set()
         self._spectra: set[Spectrum] = set()
         self._strains: StrainCollection = StrainCollection()
 
     def __str__(self) -> str:
         return (
-            f"MolecularFamily(family_id={self.family_id}, #Spectrum_objects={len(self._spectra)}, "
+            f"MolecularFamily(id={self.id}, #Spectrum_objects={len(self._spectra)}, "
             f"#spectrum_ids={len(self.spectra_ids)}, #strains={len(self._strains)})"
         )
 
@@ -38,11 +38,11 @@ def __repr__(self) -> str:
 
     def __eq__(self, other) -> bool:
         if isinstance(other, MolecularFamily):
-            return self.family_id == other.family_id
+            return self.id == other.id
         return NotImplemented
 
     def __hash__(self) -> int:
-        return hash(self.family_id)
+        return hash(self.id)
 
     @property
     def spectra(self) -> set[Spectrum]:
@@ -61,7 +61,7 @@ def add_spectrum(self, spectrum: Spectrum) -> None:
             spectrum: `Spectrum` object to add to the molecular family.
         """
         self._spectra.add(spectrum)
-        self.spectra_ids.add(spectrum.spectrum_id)
+        self.spectra_ids.add(spectrum.id)
         self._strains = self._strains + spectrum.strains
         # add the molecular family to the spectrum
         spectrum.family = self
@@ -73,7 +73,7 @@ def detach_spectrum(self, spectrum: Spectrum) -> None:
             spectrum: `Spectrum` object to remove from the molecular family.
         """
         self._spectra.remove(spectrum)
-        self.spectra_ids.remove(spectrum.spectrum_id)
+        self.spectra_ids.remove(spectrum.id)
         self._strains = self._update_strains()
         # remove the molecular family from the spectrum
         spectrum.family = None

diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py
@@ -14,7 +14,7 @@ class Spectrum:
     """Class to model MS/MS Spectrum.
 
     Attributes:
-        spectrum_id: the spectrum ID.
+        id: the spectrum ID.
         mz: the list of m/z values.
         intensity: the list of intensity values.
         precursor_mz: the m/z value of the precursor.
@@ -30,7 +30,7 @@ class Spectrum:
 
     def __init__(
         self,
-        spectrum_id: str,
+        id: str,
         mz: list[float],
         intensity: list[float],
         precursor_mz: float,
@@ -40,15 +40,15 @@ def __init__(
         """Initialize the Spectrum.
 
         Args:
-            spectrum_id: the spectrum ID.
+            id: the spectrum ID.
             mz: the list of m/z values.
             intensity: the list of intensity values.
             precursor_mz: the precursor m/z.
             rt: the retention time in seconds. Defaults to 0.
-            metadata: the metadata of the spectrum, i.e. the header infomation
+            metadata: the metadata of the spectrum, i.e. the header information
                 in the MGF file.
         """
-        self.spectrum_id = spectrum_id
+        self.id = id
         self.mz = mz
         self.intensity = intensity
         self.precursor_mz = precursor_mz
@@ -61,18 +61,18 @@ def __init__(
         self.family: MolecularFamily | None = None
 
     def __str__(self) -> str:
-        return f"Spectrum(spectrum_id={self.spectrum_id}, #strains={len(self.strains)})"
+        return f"Spectrum(id={self.id}, #strains={len(self.strains)})"
 
     def __repr__(self) -> str:
         return str(self)
 
     def __eq__(self, other) -> bool:
         if isinstance(other, Spectrum):
-            return self.spectrum_id == other.spectrum_id and self.precursor_mz == other.precursor_mz
+            return self.id == other.id and self.precursor_mz == other.precursor_mz
         return NotImplemented
 
     def __hash__(self) -> int:
-        return hash((self.spectrum_id, self.precursor_mz))
+        return hash((self.id, self.precursor_mz))
 
     @cached_property
     def peaks(self) -> np.ndarray: