From 50fd7ef06b7c0bb4f0d7b42e17d3945c18f8a135 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 4 Dec 2023 16:31:21 +0100 Subject: [PATCH 1/6] add utilities for loading metabolomics data --- src/nplinker/metabolomics/__init__.py | 14 ++- src/nplinker/metabolomics/utils.py | 130 ++++++++++++++++++++++++++ 2 files changed, 143 insertions(+), 1 deletion(-) create mode 100644 src/nplinker/metabolomics/utils.py diff --git a/src/nplinker/metabolomics/__init__.py b/src/nplinker/metabolomics/__init__.py index cf34b44c..b73e1e5c 100644 --- a/src/nplinker/metabolomics/__init__.py +++ b/src/nplinker/metabolomics/__init__.py @@ -1,8 +1,20 @@ import logging from .molecular_family import MolecularFamily from .spectrum import Spectrum +from .utils import add_annotation_to_spectrum +from .utils import add_spectrum_to_mf +from .utils import add_strains_to_spectrum +from .utils import get_spectra_from_mfs logging.getLogger(__name__).addHandler(logging.NullHandler()) -__all__ = ["MolecularFamily", "Spectrum"] + +__all__ = [ + "MolecularFamily", + "Spectrum", + "get_spectra_from_mfs", + "add_annotation_to_spectrum", + "add_spectrum_to_mf", + "add_strains_to_spectrum", +] diff --git a/src/nplinker/metabolomics/utils.py b/src/nplinker/metabolomics/utils.py new file mode 100644 index 00000000..df95245d --- /dev/null +++ b/src/nplinker/metabolomics/utils.py @@ -0,0 +1,130 @@ +from nplinker.logconfig import LogConfig +from nplinker.strain_collection import StrainCollection +from .molecular_family import MolecularFamily +from .spectrum import Spectrum + + +logger = LogConfig.getLogger(__name__) + + +def add_annotation_to_spectrum(annotations: dict[str, dict], spectra: list[Spectrum]) -> None: + """Add GNPS annotations to the `Spectrum.gnps_annotations` attribute for input spectra. + + It is possible that some spectra don't have annotations. + Note that the input `spectra` list is changed in place. 
+ + Args: + annotations(dict[str, dict]): A dictionary of GNPS annotations, where the keys are + spectrum ids and the values are GNPS annotations. + spectra(list[Spectrum]): A list of Spectrum objects. + """ + for spec in spectra: + if spec.spectrum_id in annotations: + spec.gnps_annotations = annotations[spec.spectrum_id] + + +def add_strains_to_spectrum( + strains: StrainCollection, spectra: list[Spectrum] +) -> tuple[list[Spectrum], list[Spectrum]]: + """Add `Strain` objects to the `Spectrum.strains` attribute for input spectra. + + Note that the input `spectra` list is changed in place. + + Args: + strains(StrainCollection): A collection of strain objects. + spectra(list[Spectrum]): A list of Spectrum objects. + + Returns: + tuple(list[Spectrum], list[Spectrum]): A tuple of two lists of Spectrum + objects. The first list contains Spectrum objects that are updated + with Strain objects; the second list contains Spectrum objects that + are not updated with Strain objects because no Strain objects are found. + """ + spectra_with_strains = [] + spectra_without_strains = [] + for spec in spectra: + try: + strain_list = strains.lookup(spec.spectrum_id) + except ValueError: + spectra_without_strains.append(spec) + continue + + for strain in strain_list: + spec.strains.add(strain) + spectra_with_strains.append(spec) + + logger.info( + f"{len(spectra_with_strains)} Spectrum objects updated with Strain objects.\n" + f"{len(spectra_without_strains)} Spectrum objects not updated with Strain objects." + ) + + return spectra_with_strains, spectra_without_strains + + +def add_spectrum_to_mf( + spectra: list[Spectrum], mfs: list[MolecularFamily] +) -> tuple[list[MolecularFamily], list[MolecularFamily], dict[MolecularFamily, set[str]]]: + """Add Spectrum objects to MolecularFamily objects. + + The attribute of `spectra_ids` of MolecularFamily object contains the ids of Spectrum objects. + These ids are used to find Spectrum objects from the input `spectra` list. 
The found Spectrum + objects are added to the `spectra` attribute of MolecularFamily object. It is possible that + some spectrum ids are not found in the input `spectra` list, and so their Spectrum objects are + missing in the MolecularFamily object. + + Note that the input `mfs` list is changed in place. + + Args: + spectra(list[Spectrum]): A list of Spectrum objects. + mfs(list[MolecularFamily]): A list of MolecularFamily objects. + + Returns: + tuple(list[MolecularFamily], list[MolecularFamily], dict[MolecularFamily, set[str]]): + The first list contains MolecularFamily objects that are updated with Spectrum objects. + The second list contains MolecularFamily objects that are not updated with Spectrum + objects (all Spectrum objects are missing). + The dictionary contains MolecularFamily objects as keys and a set of ids of missing + Spectrum objects as values. + """ + spec_dict = {spec.spectrum_id: spec for spec in spectra} + mf_with_spec = [] + mf_without_spec = [] + mf_missing_spec: dict[MolecularFamily, set[str]] = {} + for mf in mfs: + for spec_id in mf.spectra_ids: + try: + spec = spec_dict[spec_id] + except KeyError: + if mf not in mf_missing_spec: + mf_missing_spec[mf] = {spec_id} + else: + mf_missing_spec[mf].add(spec_id) + continue + mf.add_spectrum(spec) + + if mf.spectra: + mf_with_spec.append(mf) + else: + mf_without_spec.append(mf) + + logger.info( + f"{len(mf_with_spec)} MolecularFamily objects updated with Spectrum objects.\n" + f"{len(mf_without_spec)} MolecularFamily objects not updated with Spectrum objects.\n" + f"{len(mf_missing_spec)} MolecularFamily objects have missing Spectrum objects." + ) + return mf_with_spec, mf_without_spec, mf_missing_spec + + +def get_spectra_from_mfs(mfs: list[MolecularFamily]) -> list[Spectrum]: + """Get all Spectrum objects from given MolecularFamily objects. + + Args: + mfs(list[MolecularFamily]): A list of MolecularFamily objects. + + Returns: + list[Spectrum]: A list of Spectrum objects. 
+ """ + s = set() + for mf in mfs: + s |= set(mf.spectra) + return list(s) From 5949c90f40bbf2a9da91fd65f6c13e6a7f3cc995 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 5 Dec 2023 16:25:48 +0100 Subject: [PATCH 2/6] Create test_utils.py --- tests/metabolomics/test_utils.py | 100 +++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 tests/metabolomics/test_utils.py diff --git a/tests/metabolomics/test_utils.py b/tests/metabolomics/test_utils.py new file mode 100644 index 00000000..81dffa9b --- /dev/null +++ b/tests/metabolomics/test_utils.py @@ -0,0 +1,100 @@ +import pytest +from nplinker.metabolomics import MolecularFamily +from nplinker.metabolomics import Spectrum +from nplinker.metabolomics import add_annotation_to_spectrum +from nplinker.metabolomics import add_spectrum_to_mf +from nplinker.metabolomics import add_strains_to_spectrum +from nplinker.metabolomics import get_spectra_from_mfs +from nplinker.strain import Strain +from nplinker.strain_collection import StrainCollection + + +@pytest.fixture +def spectra(): + """Fixture for a list of Spectrum objects.""" + # The order of the spectra is important for the tests. 
+ return [ + Spectrum("spec0", [100, 200], [0.1, 0.2], 150), + Spectrum("spec1", [100, 200], [0.1, 0.2], 150), + Spectrum("spec2", [100, 200], [0.1, 0.2], 150), + ] + + +def test_add_annotation_to_spectrum(spectra): + """Test the add_annotation_to_spectrum function.""" + annotations = { + "spec0": {"annotation": "annotation_0"}, + "spec1": {"annotation": "annotation_1"}, + "spec3": {"annotation": "annotation_3"}, + } + + add_annotation_to_spectrum(annotations, spectra) + + for i, spec in enumerate(spectra): + if i < 2: + assert spec.gnps_annotations == {"annotation": f"annotation_{i}"} + else: + assert spec.gnps_annotations == {} + + +def test_add_strains_to_spectrum(spectra): + """Test the add_strains_to_spectrum function.""" + strains = StrainCollection() + strain0 = Strain("spec0") # spectrum id as strain id + strain1 = Strain("strain1") + strain1.add_alias("spec1") # spectrum id as strain alias + strains.add(strain0) + strains.add(strain1) + + spectra_with_strains, spectra_without_strains = add_strains_to_spectrum(strains, spectra) + + assert len(spectra_with_strains) == 2 + assert len(spectra_without_strains) == 1 + assert strain0 in spectra_with_strains[0].strains + assert strain1 in spectra_with_strains[1].strains + assert spectra_without_strains[0].strains == StrainCollection() + + +def test_add_spectrum_to_mf(spectra): + """Test the add_spectrum_to_mf function.""" + # Prepare the molecular families + mf0 = MolecularFamily("mf0") + mf0.spectra_ids = {"spec0", "spec1"} + mf1 = MolecularFamily("mf1") + mf1.spectra_ids = { + "spec2", + "spec-missing-1", + } + mf2 = MolecularFamily("mf2") + mf2.spectra_ids = {"spec-missing-2", "spec-missing-3"} + mfs = [mf0, mf1, mf2] + + mf_with_spec, mf_without_spec, mf_missing_spec = add_spectrum_to_mf(spectra, mfs) + + assert len(mf_with_spec) == 2 + assert len(mf_without_spec) == 1 + assert len(mf_missing_spec) == 2 + assert mf_with_spec == [mf0, mf1] + assert mf_without_spec == [mf2] + assert mf_missing_spec == {mf1: 
{"spec-missing-1"}, mf2: {"spec-missing-2", "spec-missing-3"}} + + +def test_get_spectra_from_mfs(spectra): + """Test the get_spectra_from_mfs function.""" + mf0 = MolecularFamily("mf0") + mf0.spectra_ids = {"spec0", "spec1"} + mf0.add_spectrum(spectra[0]) + mf0.add_spectrum(spectra[1]) + mf1 = MolecularFamily("mf1") + mf1.spectra_ids = { + "spec2", + "spec-missing-1", + } + mf1.add_spectrum(spectra[2]) + mf2 = MolecularFamily("mf2") + mf2.spectra_ids = {"spec-missing-2", "spec-missing-3"} + mfs = [mf0, mf1, mf2] + + spec_from_mfs = get_spectra_from_mfs(mfs) + + assert len(spec_from_mfs) == 3 From 2b2643f289751ab5f406ecc37b1cc419293f1720 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 5 Dec 2023 14:49:17 +0100 Subject: [PATCH 3/6] update logics of loading metabolomics data --- src/nplinker/loader.py | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py index 7c0ab98c..4642d788 100644 --- a/src/nplinker/loader.py +++ b/src/nplinker/loader.py @@ -17,6 +17,13 @@ from nplinker.globals import PFAM_PATH from nplinker.globals import STRAIN_MAPPINGS_FILENAME from nplinker.logconfig import LogConfig +from nplinker.metabolomics import add_annotation_to_spectrum +from nplinker.metabolomics import add_spectrum_to_mf +from nplinker.metabolomics import add_strains_to_spectrum +from nplinker.metabolomics import get_spectra_from_mfs +from nplinker.metabolomics.gnps import GNPSAnnotationLoader +from nplinker.metabolomics.gnps import GNPSMolecularFamilyLoader +from nplinker.metabolomics.gnps import GNPSSpectrumLoader from nplinker.pairedomics.downloader import PODPDownloader from nplinker.pairedomics.runbigscape import run_bigscape from nplinker.pairedomics.strain_mappings_generator import podp_generate_strain_mappings @@ -399,8 +406,30 @@ def _load_strain_mappings(self): return True - # TODO CG: rewrite the loading process using GPNSLoader def _load_metabolomics(self): + """Loads 
metabolomics data to Spectrum and MolecularFamily objects.""" + logger.debug("\nLoading metabolomics data starts...") + + # Step 1: load all Spectrum objects + raw_spectra = GNPSSpectrumLoader(self.mgf_file).spectra + # Step 2: load all GNPS annotations + raw_annotations = GNPSAnnotationLoader(self.annotations_config_file).annotations + # Step 3: load all MolecularFamily objects + raw_molfams = GNPSMolecularFamilyLoader(self.edges_file).get_mfs(keep_singleton=False) + + # Step 4: add GNPS annotations to Spectrum.gnps_annotations + add_annotation_to_spectrum(raw_annotations, raw_spectra) + # Step 5: add strains to Spectrum.strains + spectra_with_strains, _ = add_strains_to_spectrum(self.strains, raw_spectra) + + # Step 6: add Spectrum objects to MolecularFamily + mf_with_spec, _, _ = add_spectrum_to_mf(spectra_with_strains, raw_molfams) + + # Step 7: get MolecularFamily objects and their Spectrum members + self.molfams = mf_with_spec + self.spectra = get_spectra_from_mfs(self.molfams) + + logger.debug("Loading metabolomics data completed\n") return True def _load_genomics(self): From 14b2f873c08e92b95486468bb30bc1372714a6d7 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 11 Dec 2023 16:29:05 +0100 Subject: [PATCH 4/6] remove function `test_get_spectra_from_mfs` --- src/nplinker/metabolomics/__init__.py | 2 -- src/nplinker/metabolomics/utils.py | 15 --------------- tests/metabolomics/test_utils.py | 22 ---------------------- 3 files changed, 39 deletions(-) diff --git a/src/nplinker/metabolomics/__init__.py b/src/nplinker/metabolomics/__init__.py index b73e1e5c..8c723f5e 100644 --- a/src/nplinker/metabolomics/__init__.py +++ b/src/nplinker/metabolomics/__init__.py @@ -4,7 +4,6 @@ from .utils import add_annotation_to_spectrum from .utils import add_spectrum_to_mf from .utils import add_strains_to_spectrum -from .utils import get_spectra_from_mfs logging.getLogger(__name__).addHandler(logging.NullHandler()) @@ -13,7 +12,6 @@ __all__ = [ "MolecularFamily", 
"Spectrum", - "get_spectra_from_mfs", "add_annotation_to_spectrum", "add_spectrum_to_mf", "add_strains_to_spectrum", diff --git a/src/nplinker/metabolomics/utils.py b/src/nplinker/metabolomics/utils.py index df95245d..fdd887d3 100644 --- a/src/nplinker/metabolomics/utils.py +++ b/src/nplinker/metabolomics/utils.py @@ -113,18 +113,3 @@ def add_spectrum_to_mf( f"{len(mf_missing_spec)} MolecularFamily objects have missing Spectrum objects." ) return mf_with_spec, mf_without_spec, mf_missing_spec - - -def get_spectra_from_mfs(mfs: list[MolecularFamily]) -> list[Spectrum]: - """Get all Spectrum objects from given MolecularFamily objects. - - Args: - mfs(list[MolecularFamily]): A list of MolecularFamily objects. - - Returns: - list[Spectrum]: A list of Spectrum objects. - """ - s = set() - for mf in mfs: - s |= set(mf.spectra) - return list(s) diff --git a/tests/metabolomics/test_utils.py b/tests/metabolomics/test_utils.py index 81dffa9b..e9fbf7a1 100644 --- a/tests/metabolomics/test_utils.py +++ b/tests/metabolomics/test_utils.py @@ -4,7 +4,6 @@ from nplinker.metabolomics import add_annotation_to_spectrum from nplinker.metabolomics import add_spectrum_to_mf from nplinker.metabolomics import add_strains_to_spectrum -from nplinker.metabolomics import get_spectra_from_mfs from nplinker.strain import Strain from nplinker.strain_collection import StrainCollection @@ -77,24 +76,3 @@ def test_add_spectrum_to_mf(spectra): assert mf_with_spec == [mf0, mf1] assert mf_without_spec == [mf2] assert mf_missing_spec == {mf1: {"spec-missing-1"}, mf2: {"spec-missing-2", "spec-missing-3"}} - - -def test_get_spectra_from_mfs(spectra): - """Test the get_spectra_from_mfs function.""" - mf0 = MolecularFamily("mf0") - mf0.spectra_ids = {"spec0", "spec1"} - mf0.add_spectrum(spectra[0]) - mf0.add_spectrum(spectra[1]) - mf1 = MolecularFamily("mf1") - mf1.spectra_ids = { - "spec2", - "spec-missing-1", - } - mf1.add_spectrum(spectra[2]) - mf2 = MolecularFamily("mf2") - mf2.spectra_ids = 
{"spec-missing-2", "spec-missing-3"} - mfs = [mf0, mf1, mf2] - - spec_from_mfs = get_spectra_from_mfs(mfs) - - assert len(spec_from_mfs) == 3 From 433a056419df36ac3fb514a5a1cc051e69c1f05a Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 11 Dec 2023 16:30:21 +0100 Subject: [PATCH 5/6] update the assignment of attributes `self.spectra` and `self.molfams` --- src/nplinker/loader.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py index 4642d788..dc96e48d 100644 --- a/src/nplinker/loader.py +++ b/src/nplinker/loader.py @@ -20,7 +20,6 @@ from nplinker.metabolomics import add_annotation_to_spectrum from nplinker.metabolomics import add_spectrum_to_mf from nplinker.metabolomics import add_strains_to_spectrum -from nplinker.metabolomics import get_spectra_from_mfs from nplinker.metabolomics.gnps import GNPSAnnotationLoader from nplinker.metabolomics.gnps import GNPSMolecularFamilyLoader from nplinker.metabolomics.gnps import GNPSSpectrumLoader @@ -407,7 +406,16 @@ def _load_strain_mappings(self): return True def _load_metabolomics(self): - """Loads metabolomics data to Spectrum and MolecularFamily objects.""" + """Loads metabolomics data to Spectrum and MolecularFamily objects. + + The attribute of `self.spectra` is set to the loaded Spectrum objects that have Strain + objects added (i.e. `Spectrum.strains` updated). If a Spectrum object does not have Strain + objects, it is not added to `self.spectra`. + + The attribute of `self.molfams` is set to the loaded MolecularFamily objects that have + Strain objects added (i.e. `MolecularFamily._strains` updated). This means only Spectra + objects with updated strains (i.e. `self.spectra`) can be added to MolecularFamily objects. 
+ """ logger.debug("\nLoading metabolomics data starts...") # Step 1: load all Spectrum objects @@ -425,9 +433,9 @@ def _load_metabolomics(self): # Step 6: add Spectrum objects to MolecularFamily mf_with_spec, _, _ = add_spectrum_to_mf(spectra_with_strains, raw_molfams) - # Step 7: get MolecularFamily objects and their Spectrum members + # Step 7: set attributes of self.spectra and self.molfams with valid objects + self.spectra = spectra_with_strains self.molfams = mf_with_spec - self.spectra = get_spectra_from_mfs(self.molfams) logger.debug("Loading metabolomics data completed\n") return True From 5354122412755166f1b94ac8359ce326d30a57c7 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 12 Dec 2023 12:01:49 +0100 Subject: [PATCH 6/6] Update test_utils.py --- tests/metabolomics/test_utils.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/metabolomics/test_utils.py b/tests/metabolomics/test_utils.py index e9fbf7a1..84a11aa9 100644 --- a/tests/metabolomics/test_utils.py +++ b/tests/metabolomics/test_utils.py @@ -49,6 +49,8 @@ def test_add_strains_to_spectrum(spectra): assert len(spectra_with_strains) == 2 assert len(spectra_without_strains) == 1 + assert spectra_with_strains == [spectra[0], spectra[1]] + assert spectra_without_strains == [spectra[2]] assert strain0 in spectra_with_strains[0].strains assert strain1 in spectra_with_strains[1].strains assert spectra_without_strains[0].strains == StrainCollection() @@ -76,3 +78,6 @@ def test_add_spectrum_to_mf(spectra): assert mf_with_spec == [mf0, mf1] assert mf_without_spec == [mf2] assert mf_missing_spec == {mf1: {"spec-missing-1"}, mf2: {"spec-missing-2", "spec-missing-3"}} + assert mf0.spectra == {spectra[0], spectra[1]} + assert mf1.spectra == {spectra[2]} + assert mf2.spectra == set()