From 5cb129723d66c6a583b4113f534caeef90ab9e22 Mon Sep 17 00:00:00 2001
From: Wout Bittremieux <wout@bittremieux.be>
Date: Sun, 21 Apr 2024 17:24:38 +0200
Subject: [PATCH] Generate theoretical isotopic fragments

---
 docs/src/index.md                     |   2 +-
 spectrum_utils/fragment_annotation.py | 108 ++++++++++++++++----------
 spectrum_utils/spectrum.py            |  73 ++++++++++-------
 tests/fragment_annotation_test.py     |  40 ++++++++++
 4 files changed, 153 insertions(+), 70 deletions(-)

diff --git a/docs/src/index.md b/docs/src/index.md
index 39bcbd9..9ebd956 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -3,7 +3,7 @@
 [![conda](https://img.shields.io/conda/vn/bioconda/spectrum_utils?color=green)](http://bioconda.github.io/recipes/spectrum_utils/README.html)
 [![PyPI](https://img.shields.io/pypi/v/spectrum_utils?color=green)](https://pypi.org/project/spectrum_utils/)
 [![Build status](https://github.com/bittremieux/spectrum_utils/workflows/tests/badge.svg)](https://github.com/bittremieux/spectrum_utils/actions?query=workflow:tests)
-[![docs](https://readthedocs.org/projects/spectrum_utils/badge/?version=latest)](https://spectrum-utils.readthedocs.io/en/latest/?badge=latest)
+[![docs](https://readthedocs.org/projects/spectrum-utils/badge/?version=latest)](https://spectrum-utils.readthedocs.io/en/latest/?badge=latest)
 
 ## About spectrum_utils
 
diff --git a/spectrum_utils/fragment_annotation.py b/spectrum_utils/fragment_annotation.py
index 0de40dc..caef019 100644
--- a/spectrum_utils/fragment_annotation.py
+++ b/spectrum_utils/fragment_annotation.py
@@ -11,14 +11,14 @@
 
 
 # Amino acid and special amino acid masses.
-_aa_mass = {
+AA_MASS = {
     **pmass.std_aa_mass,
     # Aspartic acid / asparagine (ambiguous mass).
     # "B": 0,
     # Glutamic acid / glutamine (ambiguous mass).
     # "Z": 0,
     # Leucine / isoleucine.
-    "J": 113.08406,
+    "J": 113.084_064,
     # Selenocysteine (in Pyteomics).
     # "U": 150.95363,
     # Pyrrolysine (in Pyteomics).
@@ -27,39 +27,42 @@
     "X": 0,
 }
 
+# Offset for isotopic peaks.
+C13_MASS_DIFF = 1.003_354
+
 # Common neutral losses.
-_neutral_loss = {
+NEUTRAL_LOSS = {
     # No neutral loss.
     None: 0,
     # Hydrogen.
-    "H": -1.007825,
+    "H": -1.007_825,
     # Ammonia.
-    "NH3": -17.026549,
+    "NH3": -17.026_549,
     # Water.
-    "H2O": -18.010565,
+    "H2O": -18.010_565,
     # Carbon monoxide.
-    "CO": -27.994915,
+    "CO": -27.994_915,
     # Carbon dioxide.
-    "CO2": -43.989829,
+    "CO2": -43.989_829,
     # Formamide.
-    "HCONH2": -45.021464,
+    "HCONH2": -45.021_464,
     # Formic acid.
-    "HCOOH": -46.005479,
+    "HCOOH": -46.005_479,
     # Methanesulfenic acid.
-    "CH4OS": -63.998301,
+    "CH4OS": -63.998_301,
     # Sulfur trioxide.
-    "SO3": -79.956818,
+    "SO3": -79.956_818,
     # Metaphosphoric acid.
-    "HPO3": -79.966331,
+    "HPO3": -79.966_331,
     # Mercaptoacetamide.
-    "C2H5NOS": -91.009195,
+    "C2H5NOS": -91.009_195,
     # Mercaptoacetic acid.
-    "C2H4O2S": -91.993211,
+    "C2H4O2S": -91.993_211,
     # Phosphoric acid.
-    "H3PO4": -97.976896,
+    "H3PO4": -97.976_896,
 }
 
-_supported_ions = "?abcxyzIm_prf"
+SUPPORTED_IONS = "?abcxyzIm_prf"
 
 
 class FragmentAnnotation:
@@ -125,7 +128,7 @@ def __init__(
             raise NotImplementedError(
                 "Advanced ion types are not yet supported"
             )
-        elif ion_type[0] not in _supported_ions:
+        elif ion_type[0] not in SUPPORTED_IONS:
             raise ValueError("Unknown ion type")
         if ion_type == "?" and (
             neutral_loss is not None
@@ -242,6 +245,8 @@ def __getitem__(self, key) -> FragmentAnnotation:
 def get_theoretical_fragments(
     proteoform: proforma.Proteoform,
     ion_types: str = "by",
+    *,
+    max_isotope: int = 0,
     max_charge: int = 1,
     neutral_losses: Optional[Dict[Optional[str], float]] = None,
 ) -> List[Tuple[FragmentAnnotation, float]]:
@@ -252,19 +257,24 @@ def get_theoretical_fragments(
     Parameters
     ----------
     proteoform : proforma.Proteoform
-        The proteoform for which the fragment annotations will be generated.
+        The proteoform for which the fragment annotations will be
+        generated.
     ion_types : str
-        The ion types to generate. Can be any combination of 'a', 'b', 'c',
-        'x', 'y', and 'z' for peptide fragments, 'I' for immonium ions, 'm' for
-        internal fragment ions, 'p' for the precursor ion, and 'r' for reporter
-        ions. The default is 'by', which means that b and y peptide ions will
-        be generated.
+        The ion types to generate. Can be any combination of 'a', 'b',
+        'c', 'x', 'y', and 'z' for peptide fragments, 'I' for immonium
+        ions, 'm' for internal fragment ions, 'p' for the precursor ion,
+        and 'r' for reporter ions. The default is 'by', which means that
+        b and y peptide ions will be generated.
+    max_isotope : int
+        The maximum isotope to consider (the default is 0 to only
+        generate the monoisotopic peaks).
     max_charge : int
-        All fragments up to and including the given charge will be generated
-        (the default is 1 to only generate singly-charged fragments).
+        All fragments up to and including the given charge will be
+        generated (the default is 1 to only generate singly-charged
+        fragments).
     neutral_losses : Optional[Dict[Optional[str], float]]
-        A dictionary with neutral loss names and (negative) mass differences to
-        be considered.
+        A dictionary with neutral loss names and (negative) mass
+        differences to be considered.
 
     Returns
     -------
@@ -273,9 +283,9 @@ def get_theoretical_fragments(
         ascending m/z order.
     """
     for ion_type in ion_types:
-        if ion_type not in _supported_ions:
+        if ion_type not in SUPPORTED_IONS:
             raise ValueError(
-                f"{ion_type} is not a supported ion type ({_supported_ions})"
+                f"{ion_type} is not a supported ion type ({SUPPORTED_IONS})"
             )
     if "B" in proteoform.sequence:
         raise ValueError(
@@ -363,8 +373,8 @@ def get_theoretical_fragments(
 
     # Generate all internal fragment ions.
     if "m" in ion_types:
-        # Skip internal fragments with start position 1, which are actually
-        # b ions.
+        # Skip internal fragments with start position 1, which are
+        # actually b ions.
         for start_i in range(1, len(proteoform.sequence)):
             mod_i_start, mod_mass = 0, 0
             # Skip unlocalized and prefix modifications.
@@ -380,8 +390,8 @@ def get_theoretical_fragments(
             ):
                 mod_i_start += 1
             mod_i_stop = mod_i_start
-            # Internal fragments of only one residue are encoded as immonium
-            # ions.
+            # Internal fragments of only one residue are encoded as
+            # immonium ions.
             for stop_i in range(start_i + 2, len(proteoform.sequence)):
                 fragment_sequence = proteoform.sequence[start_i:stop_i]
                 # Include internal modifications.
@@ -392,8 +402,8 @@ def get_theoretical_fragments(
                 ):
                     mod_mass += proteoform.modifications[mod_i_stop].mass
                     mod_i_stop += 1
-                # Internal fragment mass calculation is equivalent to b ion
-                # mass calculation.
+                # Internal fragment mass calculation is equivalent to b
+                # ion mass calculation.
                 base_fragments.append(
                     (
                         fragment_sequence,
@@ -430,20 +440,20 @@ def get_theoretical_fragments(
                         sequence=fragment_sequence,
                         ion_type=ion_type,
                         charge=charge,
-                        aa_mass=_aa_mass,
+                        aa_mass=AA_MASS,
                     )
                     + mod_mass / charge,
                 )
             )
 
     # Generate all immonium ions (internal single amino acid from the
-    # combination of a type and y type cleavage.
+    # combination of a type and y type cleavage).
     if "I" in ion_types:
         # Amino acid mass minus CO plus charge 1.
         mass_diff = pmass.calculate_mass(formula="CO") - pmass.calculate_mass(
             formula="H"
         )
-        for aa, mass in _aa_mass.items():
+        for aa, mass in AA_MASS.items():
             if aa != "X":
                 fragments_masses.append(
                     (
@@ -452,6 +462,22 @@ def get_theoretical_fragments(
                     )
                 )
 
+    # Generate isotopic peaks for all fragments.
+    isotope_fragments = []
+    for isotope in range(1, max_isotope + 1):
+        for fragment, mass in fragments_masses:
+            isotope_fragments.append(
+                (
+                    FragmentAnnotation(
+                        ion_type=fragment.ion_type,
+                        isotope=isotope,
+                        charge=fragment.charge,
+                    ),
+                    mass + isotope * C13_MASS_DIFF / fragment.charge,
+                )
+            )
+    fragments_masses.extend(isotope_fragments)
+
     # Generate all fragments that differ by a neutral loss from the base
     # fragments.
     neutral_loss_fragments = []
@@ -460,13 +486,13 @@ def get_theoretical_fragments(
             continue
         neutral_loss = f"{'-' if mass_diff < 0 else '+'}{neutral_loss}"
         for fragment, mass in fragments_masses:
-            fragment_mass = mass + mass_diff / fragment.charge
-            if fragment_mass > 0:
+            if (fragment_mass := mass + mass_diff / fragment.charge) > 0:
                 neutral_loss_fragments.append(
                     (
                         FragmentAnnotation(
                             ion_type=fragment.ion_type,
                             neutral_loss=neutral_loss,
+                            isotope=fragment.isotope,
                             charge=fragment.charge,
                         ),
                         fragment_mass,
diff --git a/spectrum_utils/spectrum.py b/spectrum_utils/spectrum.py
index f9c7059..260a478 100755
--- a/spectrum_utils/spectrum.py
+++ b/spectrum_utils/spectrum.py
@@ -631,28 +631,32 @@ def _annotate_proteoforms(
         fragment_tol_mass: float,
         fragment_tol_mode: str,
         ion_types: str = "by",
+        *,
+        max_isotope: int = 0,
         max_ion_charge: Optional[int] = None,
         neutral_losses: Union[bool, Dict[Optional[str], float]] = False,
     ) -> MsmsSpectrum:
         """
-        Assign fragment ion labels to the peaks from a ProForma annotation.
+        Assign fragment ion labels to the peaks from a ProForma
+        annotation.
 
-        This is meant to be an internal function that uses a pre-parsed proforma string
-        instead of parsing it internally. This can be useful when the same parsed
-        sequence is used multiple times (since parsing the sequence is a lot slower
-        than annotating the peaks)
+        This is meant to be an internal function that uses a pre-parsed
+        ProForma string instead of parsing it internally. This can be
+        useful when the same parsed sequence is used multiple times
+        (since parsing the sequence is a lot slower than annotating the
+        peaks).
 
         >>> proforma_sequence = "MYPEPTIDEK/2"
-        >>> parsed_proforma = proforma.parse(proforma_sequence)
         >>> spectrum.annotate_proforma(proforma_sequence, ...)
 
         or
 
+        >>> parsed_proforma = proforma.parse(proforma_sequence)
         >>> spectrum._annotate_proteoforms(parsed_proforma, proforma_sequence, ...)
 
         WARN:
-            This function does not check that the passed sequence corresponds to the
-            passed proteoforms.
+            This function does not check that the passed sequence
+            corresponds to the passed proteoforms.
 
         For additional information on the arguments, see the
         `MsmsSpectrum.annotate_proforma` documentation.
@@ -668,24 +672,28 @@ def _annotate_proteoforms(
         )
 
         self._annotation = np.full_like(self.mz, None, object)
-        # By default, peak charges are assumed to be smaller than the precursor
-        # charge.
+        # By default, peak charges are assumed to be smaller than the
+        # precursor charge.
         if max_ion_charge is None:
             max_ion_charge = max(1, self.precursor_charge - 1)
-        # Make sure the standard peaks (without a neutral loss) are always
-        # considered.
+        # Make sure the standard peaks (without a neutral loss) are
+        # always considered.
         if isinstance(neutral_losses, bool):
             if not neutral_losses:
                 neutral_losses = {None: 0}
             else:
-                neutral_losses = fa._neutral_loss
+                neutral_losses = fa.NEUTRAL_LOSS
         if neutral_losses is not None and None not in neutral_losses:
             neutral_losses[None] = 0
 
         analyte_number = 1 if len(proteoforms) > 1 else None
         for proteoform in proteoforms:
             fragments = fa.get_theoretical_fragments(
-                proteoform, ion_types, max_ion_charge, neutral_losses
+                proteoform,
+                ion_types,
+                max_isotope=max_isotope,
+                max_charge=max_ion_charge,
+                neutral_losses=neutral_losses,
             )
             fragment_i = 0
             for peak_i, peak_mz in enumerate(self.mz):
@@ -727,38 +735,46 @@ def annotate_proforma(
         fragment_tol_mass: float,
         fragment_tol_mode: str,
         ion_types: str = "by",
+        *,
+        max_isotope: int = 0,
         max_ion_charge: Optional[int] = None,
         neutral_losses: Union[bool, Dict[Optional[str], float]] = False,
     ) -> MsmsSpectrum:
         """
-        Assign fragment ion labels to the peaks from a ProForma annotation.
+        Assign fragment ion labels to the peaks from a ProForma
+        annotation.
 
         Parameters
         ----------
         proforma_str : str
             The ProForma spectrum annotation.
         fragment_tol_mass : float
-            Fragment mass tolerance to match spectrum peaks against theoretical
-            peaks.
+            Fragment mass tolerance to match spectrum peaks against
+            theoretical peaks.
         fragment_tol_mode : {'Da', 'ppm'}
             Fragment mass tolerance unit. Either 'Da' or 'ppm'.
         ion_types : str, optional
-            The ion types to generate. Can be any combination of 'a', 'b', 'c',
-            'x', 'y', and 'z' for peptide fragments, 'I' for immonium ions, 'm'
-            for internal fragment ions, 'p' for the precursor ion, and 'r' for
-            reporter ions. The default is 'by', which means that b and y
-            peptide ions will be generated.
+            The ion types to generate. Can be any combination of 'a',
+            'b', 'c', 'x', 'y', and 'z' for peptide fragments, 'I' for
+            immonium ions, 'm' for internal fragment ions, 'p' for the
+            precursor ion, and 'r' for reporter ions. The default is
+            'by', which means that b and y peptide ions will be
+            generated.
+        max_isotope : int
+            The maximum isotope to consider for the fragment ions (the
+            default is 0 to consider only monoisotopic peaks).
         max_ion_charge : Optional[int], optional
             All fragments up to and including the given charge will be
             annotated (by default all fragments with a charge up to the
             precursor minus one (minimum charge one) will be annotated).
         neutral_losses : Union[bool, Dict[Optional[str], float]], optional
-            Neutral losses to consider for each peak. If `None` or `False`, no
-            neutral losses are considered. If specified as a dictionary, keys
-            should be the molecular formulas of the neutral losses and values
-            the corresponding mass differences. Note that mass differences
-            should typically be negative. If `True`, all of the following
-            neutral losses are considered:
+            Neutral losses to consider for each peak. If `None` or
+            `False`, no neutral losses are considered. If specified as a
+            dictionary, keys should be the molecular formulas of the
+            neutral losses and values the corresponding mass
+            differences. Note that mass differences should typically be
+            negative. If `True`, all of the following neutral losses are
+            considered:
 
             - Loss of hydrogen (H): -1.007825.
             - Loss of ammonia (NH3): -17.026549.
@@ -786,6 +802,7 @@ def annotate_proforma(
             fragment_tol_mass=fragment_tol_mass,
             fragment_tol_mode=fragment_tol_mode,
             ion_types=ion_types,
+            max_isotope=max_isotope,
             max_ion_charge=max_ion_charge,
             neutral_losses=neutral_losses,
         )
diff --git a/tests/fragment_annotation_test.py b/tests/fragment_annotation_test.py
index 7c8a57e..77b0ace 100644
--- a/tests/fragment_annotation_test.py
+++ b/tests/fragment_annotation_test.py
@@ -228,6 +228,46 @@ def test_get_theoretical_fragments_mod_multiple():
         )
 
 
+def test_get_theoretical_fragments_isotope():
+    peptide = proforma.parse("HPYLEDR")[0]
+    fragments = {
+        "b1^1": 138.066147,
+        "b2^1": 235.118912,
+        "b3^1": 398.182220,
+        "b4^1": 511.266266,
+        "b5^1": 640.308899,
+        "b6^1": 755.335815,
+        "y1^1": 175.118912,
+        "y2^1": 290.145844,
+        "y3^1": 419.188446,
+        "y4^1": 532.272522,
+        "y5^1": 695.335815,
+        "y6^1": 792.388550,
+        "b1^2": 69.536731,
+        "b2^2": 118.063111,
+        "b3^2": 199.594776,
+        "b4^2": 256.136806,
+        "b5^2": 320.658101,
+        "b6^2": 378.171571,
+        "y1^2": 88.063114,
+        "y2^2": 145.576584,
+        "y3^2": 210.097879,
+        "y4^2": 266.639909,
+        "y5^2": 348.171574,
+        "y6^2": 396.697954,
+    }
+    for num_isotopes in range(0, 3):
+        annotations = fragment_annotation.get_theoretical_fragments(
+            peptide, max_charge=2, max_isotope=num_isotopes
+        )
+        assert len(annotations) == len(fragments) * (num_isotopes + 1)
+        for annotation, fragment_mz in annotations:
+            assert fragment_mz == pytest.approx(
+                fragments[f"{annotation.ion_type}^{annotation.charge}"]
+                + 1.003_354 * annotation.isotope / annotation.charge
+            )
+
+
 def test_get_theoretical_fragments_neutral_loss():
     peptide = proforma.parse("HPYLEDR")[0]
     fragments = {