From 5cb129723d66c6a583b4113f534caeef90ab9e22 Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Sun, 21 Apr 2024 17:24:38 +0200 Subject: [PATCH] Generate theoretical isotopic fragments --- docs/src/index.md | 2 +- spectrum_utils/fragment_annotation.py | 108 ++++++++++++++++---------- spectrum_utils/spectrum.py | 73 ++++++++++------- tests/fragment_annotation_test.py | 40 ++++++++++ 4 files changed, 153 insertions(+), 70 deletions(-) diff --git a/docs/src/index.md b/docs/src/index.md index 39bcbd9..9ebd956 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -3,7 +3,7 @@ [![conda](https://img.shields.io/conda/vn/bioconda/spectrum_utils?color=green)](http://bioconda.github.io/recipes/spectrum_utils/README.html) [![PyPI](https://img.shields.io/pypi/v/spectrum_utils?color=green)](https://pypi.org/project/spectrum_utils/) [![Build status](https://github.com/bittremieux/spectrum_utils/workflows/tests/badge.svg)](https://github.com/bittremieux/spectrum_utils/actions?query=workflow:tests) -[![docs](https://readthedocs.org/projects/spectrum_utils/badge/?version=latest)](https://spectrum-utils.readthedocs.io/en/latest/?badge=latest) +[![docs](https://readthedocs.org/projects/spectrum-utils/badge/?version=latest)](https://spectrum-utils.readthedocs.io/en/latest/?badge=latest) ## About spectrum_utils diff --git a/spectrum_utils/fragment_annotation.py b/spectrum_utils/fragment_annotation.py index 0de40dc..caef019 100644 --- a/spectrum_utils/fragment_annotation.py +++ b/spectrum_utils/fragment_annotation.py @@ -11,14 +11,14 @@ # Amino acid and special amino acid masses. -_aa_mass = { +AA_MASS = { **pmass.std_aa_mass, # Aspartic acid / asparagine (ambiguous mass). # "B": 0, # Glutamic acid / glutamine (ambiguous mass). # "Z": 0, # Leucine / isoleucine. - "J": 113.08406, + "J": 113.084_064, # Selenocysteine (in Pyteomics). # "U": 150.95363, # Pyrrolysine (in Pyteomics). @@ -27,39 +27,42 @@ "X": 0, } +# Offset for isotopic peaks. +C13_MASS_DIFF = 1.003_354 + # Common neutral losses. -_neutral_loss = { +NEUTRAL_LOSS = { # No neutral loss. None: 0, # Hydrogen. - "H": -1.007825, + "H": -1.007_825, # Ammonia. - "NH3": -17.026549, + "NH3": -17.026_549, # Water. - "H2O": -18.010565, + "H2O": -18.010_565, # Carbon monoxide. - "CO": -27.994915, + "CO": -27.994_915, # Carbon dioxide. - "CO2": -43.989829, + "CO2": -43.989_829, # Formamide. - "HCONH2": -45.021464, + "HCONH2": -45.021_464, # Formic acid. - "HCOOH": -46.005479, + "HCOOH": -46.005_479, # Methanesulfenic acid. - "CH4OS": -63.998301, + "CH4OS": -63.998_301, # Sulfur trioxide. - "SO3": -79.956818, + "SO3": -79.956_818, # Metaphosphoric acid. - "HPO3": -79.966331, + "HPO3": -79.966_331, # Mercaptoacetamide. - "C2H5NOS": -91.009195, + "C2H5NOS": -91.009_195, # Mercaptoacetic acid. - "C2H4O2S": -91.993211, + "C2H4O2S": -91.993_211, # Phosphoric acid. - "H3PO4": -97.976896, + "H3PO4": -97.976_896, } -_supported_ions = "?abcxyzIm_prf" +SUPPORTED_IONS = "?abcxyzIm_prf" class FragmentAnnotation: @@ -125,7 +128,7 @@ def __init__( raise NotImplementedError( "Advanced ion types are not yet supported" ) - elif ion_type[0] not in _supported_ions: + elif ion_type[0] not in SUPPORTED_IONS: raise ValueError("Unknown ion type") if ion_type == "?" and ( neutral_loss is not None @@ -242,6 +245,8 @@ def __getitem__(self, key) -> FragmentAnnotation: def get_theoretical_fragments( proteoform: proforma.Proteoform, ion_types: str = "by", + *, + max_isotope: int = 0, max_charge: int = 1, neutral_losses: Optional[Dict[Optional[str], float]] = None, ) -> List[Tuple[FragmentAnnotation, float]]: @@ -252,19 +257,24 @@ def get_theoretical_fragments( Parameters ---------- proteoform : proforma.Proteoform - The proteoform for which the fragment annotations will be generated. + The proteoform for which the fragment annotations will be + generated. ion_types : str - The ion types to generate. Can be any combination of 'a', 'b', 'c', - 'x', 'y', and 'z' for peptide fragments, 'I' for immonium ions, 'm' for - internal fragment ions, 'p' for the precursor ion, and 'r' for reporter - ions. The default is 'by', which means that b and y peptide ions will - be generated. + The ion types to generate. Can be any combination of 'a', 'b', + 'c', 'x', 'y', and 'z' for peptide fragments, 'I' for immonium + ions, 'm' for internal fragment ions, 'p' for the precursor ion, + and 'r' for reporter ions. The default is 'by', which means that + b and y peptide ions will be generated. + max_isotope : int + The maximum isotope to consider (the default is 0 to only + generate the monoisotopic peaks). max_charge : int - All fragments up to and including the given charge will be generated - (the default is 1 to only generate singly-charged fragments). + All fragments up to and including the given charge will be + generated (the default is 1 to only generate singly-charged + fragments). neutral_losses : Optional[Dict[Optional[str], float]] - A dictionary with neutral loss names and (negative) mass differences to - be considered. + A dictionary with neutral loss names and (negative) mass + differences to be considered. Returns ------- @@ -273,9 +283,9 @@ def get_theoretical_fragments( ascending m/z order. """ for ion_type in ion_types: - if ion_type not in _supported_ions: + if ion_type not in SUPPORTED_IONS: raise ValueError( - f"{ion_type} is not a supported ion type ({_supported_ions})" + f"{ion_type} is not a supported ion type ({SUPPORTED_IONS})" ) if "B" in proteoform.sequence: raise ValueError( @@ -363,8 +373,8 @@ def get_theoretical_fragments( # Generate all internal fragment ions. if "m" in ion_types: - # Skip internal fragments with start position 1, which are actually - # b ions. + # Skip internal fragments with start position 1, which are + # actually b ions. for start_i in range(1, len(proteoform.sequence)): mod_i_start, mod_mass = 0, 0 # Skip unlocalized and prefix modifications. @@ -380,8 +390,8 @@ def get_theoretical_fragments( ): mod_i_start += 1 mod_i_stop = mod_i_start - # Internal fragments of only one residue are encoded as immonium - # ions. + # Internal fragments of only one residue are encoded as + # immonium ions. for stop_i in range(start_i + 2, len(proteoform.sequence)): fragment_sequence = proteoform.sequence[start_i:stop_i] # Include internal modifications. @@ -392,8 +402,8 @@ def get_theoretical_fragments( ): mod_mass += proteoform.modifications[mod_i_stop].mass mod_i_stop += 1 - # Internal fragment mass calculation is equivalent to b ion - # mass calculation. + # Internal fragment mass calculation is equivalent to b + # ion mass calculation. base_fragments.append( ( fragment_sequence, @@ -430,20 +440,20 @@ def get_theoretical_fragments( sequence=fragment_sequence, ion_type=ion_type, charge=charge, - aa_mass=_aa_mass, + aa_mass=AA_MASS, ) + mod_mass / charge, ) ) # Generate all immonium ions (internal single amino acid from the - # combination of a type and y type cleavage. + # combination of a type and y type cleavage). if "I" in ion_types: # Amino acid mass minus CO plus charge 1. mass_diff = pmass.calculate_mass(formula="CO") - pmass.calculate_mass( formula="H" ) - for aa, mass in _aa_mass.items(): + for aa, mass in AA_MASS.items(): if aa != "X": fragments_masses.append( ( @@ -452,6 +462,22 @@ def get_theoretical_fragments( ) ) + # Generate isotopic peaks for all fragments. + isotope_fragments = [] + for isotope in range(1, max_isotope + 1): + for fragment, mass in fragments_masses: + isotope_fragments.append( + ( + FragmentAnnotation( + ion_type=fragment.ion_type, + isotope=isotope, + charge=fragment.charge, + ), + mass + isotope * C13_MASS_DIFF / fragment.charge, + ) + ) + fragments_masses.extend(isotope_fragments) + # Generate all fragments that differ by a neutral loss from the base # fragments. neutral_loss_fragments = [] @@ -460,13 +486,13 @@ def get_theoretical_fragments( continue neutral_loss = f"{'-' if mass_diff < 0 else '+'}{neutral_loss}" for fragment, mass in fragments_masses: - fragment_mass = mass + mass_diff / fragment.charge - if fragment_mass > 0: + if (fragment_mass := mass + mass_diff / fragment.charge) > 0: neutral_loss_fragments.append( ( FragmentAnnotation( ion_type=fragment.ion_type, neutral_loss=neutral_loss, + isotope=fragment.isotope, charge=fragment.charge, ), fragment_mass, diff --git a/spectrum_utils/spectrum.py b/spectrum_utils/spectrum.py index f9c7059..260a478 100755 --- a/spectrum_utils/spectrum.py +++ b/spectrum_utils/spectrum.py @@ -631,28 +631,32 @@ def _annotate_proteoforms( fragment_tol_mass: float, fragment_tol_mode: str, ion_types: str = "by", + *, + max_isotope: int = 0, max_ion_charge: Optional[int] = None, neutral_losses: Union[bool, Dict[Optional[str], float]] = False, ) -> MsmsSpectrum: """ - Assign fragment ion labels to the peaks from a ProForma annotation. + Assign fragment ion labels to the peaks from a ProForma + annotation. - This is meant to be an internal function that uses a pre-parsed proforma string - instead of parsing it internally. This can be useful when the same parsed - sequence is used multiple times (since parsing the sequence is a lot slower - than annotating the peaks) + This is meant to be an internal function that uses a pre-parsed + ProForma string instead of parsing it internally. This can be + useful when the same parsed sequence is used multiple times + (since parsing the sequence is a lot slower than annotating the + peaks). >>> proforma_sequence = "MYPEPTIDEK/2" - >>> parsed_proforma = proforma.parse(proforma_sequence) >>> spectrum.annotate_proforma(proforma_sequence, ...) or + >>> parsed_proforma = proforma.parse(proforma_sequence) >>> spectrum._annotate_proteoforms(parsed_proforma, proforma_sequence, ...) WARN: - This function does not check that the passed sequence corresponds to the - passed proteoforms. + This function does not check that the passed sequence + corresponds to the passed proteoforms. For additional information on the arguments, see the `MsmsSpectrum.annotate_proforma` documentation. @@ -668,24 +672,28 @@ def _annotate_proteoforms( ) self._annotation = np.full_like(self.mz, None, object) - # By default, peak charges are assumed to be smaller than the precursor - # charge. + # By default, peak charges are assumed to be smaller than the + # precursor charge. if max_ion_charge is None: max_ion_charge = max(1, self.precursor_charge - 1) - # Make sure the standard peaks (without a neutral loss) are always - # considered. + # Make sure the standard peaks (without a neutral loss) are + # always considered. if isinstance(neutral_losses, bool): if not neutral_losses: neutral_losses = {None: 0} else: - neutral_losses = fa._neutral_loss + neutral_losses = fa.NEUTRAL_LOSS if neutral_losses is not None and None not in neutral_losses: neutral_losses[None] = 0 analyte_number = 1 if len(proteoforms) > 1 else None for proteoform in proteoforms: fragments = fa.get_theoretical_fragments( - proteoform, ion_types, max_ion_charge, neutral_losses + proteoform, + ion_types, + max_isotope=max_isotope, + max_charge=max_ion_charge, + neutral_losses=neutral_losses, ) fragment_i = 0 for peak_i, peak_mz in enumerate(self.mz): @@ -727,38 +735,46 @@ def annotate_proforma( fragment_tol_mass: float, fragment_tol_mode: str, ion_types: str = "by", + *, + max_isotope: int = 0, max_ion_charge: Optional[int] = None, neutral_losses: Union[bool, Dict[Optional[str], float]] = False, ) -> MsmsSpectrum: """ - Assign fragment ion labels to the peaks from a ProForma annotation. + Assign fragment ion labels to the peaks from a ProForma + annotation. Parameters ---------- proforma_str : str The ProForma spectrum annotation. fragment_tol_mass : float - Fragment mass tolerance to match spectrum peaks against theoretical - peaks. + Fragment mass tolerance to match spectrum peaks against + theoretical peaks. fragment_tol_mode : {'Da', 'ppm'} Fragment mass tolerance unit. Either 'Da' or 'ppm'. ion_types : str, optional - The ion types to generate. Can be any combination of 'a', 'b', 'c', - 'x', 'y', and 'z' for peptide fragments, 'I' for immonium ions, 'm' - for internal fragment ions, 'p' for the precursor ion, and 'r' for - reporter ions. The default is 'by', which means that b and y - peptide ions will be generated. + The ion types to generate. Can be any combination of 'a', + 'b', 'c', 'x', 'y', and 'z' for peptide fragments, 'I' for + immonium ions, 'm' for internal fragment ions, 'p' for the + precursor ion, and 'r' for reporter ions. The default is + 'by', which means that b and y peptide ions will be + generated. + max_isotope : int + The maximum isotope to consider for the fragment ions (the + default is 0 to consider only monoisotopic peaks). max_ion_charge : Optional[int], optional All fragments up to and including the given charge will be annotated (by default all fragments with a charge up to the precursor minus one (minimum charge one) will be annotated). neutral_losses : Union[bool, Dict[Optional[str], float]], optional - Neutral losses to consider for each peak. If `None` or `False`, no - neutral losses are considered. If specified as a dictionary, keys - should be the molecular formulas of the neutral losses and values - the corresponding mass differences. Note that mass differences - should typically be negative. If `True`, all of the following - neutral losses are considered: + Neutral losses to consider for each peak. If `None` or + `False`, no neutral losses are considered. If specified as a + dictionary, keys should be the molecular formulas of the + neutral losses and values the corresponding mass + differences. Note that mass differences should typically be + negative. If `True`, all of the following neutral losses are + considered: - Loss of hydrogen (H): -1.007825. - Loss of ammonia (NH3): -17.026549. @@ -786,6 +802,7 @@ def annotate_proforma( fragment_tol_mass=fragment_tol_mass, fragment_tol_mode=fragment_tol_mode, ion_types=ion_types, + max_isotope=max_isotope, max_ion_charge=max_ion_charge, neutral_losses=neutral_losses, ) diff --git a/tests/fragment_annotation_test.py b/tests/fragment_annotation_test.py index 7c8a57e..77b0ace 100644 --- a/tests/fragment_annotation_test.py +++ b/tests/fragment_annotation_test.py @@ -228,6 +228,46 @@ def test_get_theoretical_fragments_mod_multiple(): ) +def test_get_theoretical_fragments_isotope(): + peptide = proforma.parse("HPYLEDR")[0] + fragments = { + "b1^1": 138.066147, + "b2^1": 235.118912, + "b3^1": 398.182220, + "b4^1": 511.266266, + "b5^1": 640.308899, + "b6^1": 755.335815, + "y1^1": 175.118912, + "y2^1": 290.145844, + "y3^1": 419.188446, + "y4^1": 532.272522, + "y5^1": 695.335815, + "y6^1": 792.388550, + "b1^2": 69.536731, + "b2^2": 118.063111, + "b3^2": 199.594776, + "b4^2": 256.136806, + "b5^2": 320.658101, + "b6^2": 378.171571, + "y1^2": 88.063114, + "y2^2": 145.576584, + "y3^2": 210.097879, + "y4^2": 266.639909, + "y5^2": 348.171574, + "y6^2": 396.697954, + } + for num_isotopes in range(0, 3): + annotations = fragment_annotation.get_theoretical_fragments( + peptide, max_charge=2, max_isotope=num_isotopes + ) + assert len(annotations) == len(fragments) * (num_isotopes + 1) + for annotation, fragment_mz in annotations: + assert fragment_mz == pytest.approx( + fragments[f"{annotation.ion_type}^{annotation.charge}"] + + 1.003_354 * annotation.isotope / annotation.charge + ) + + def test_get_theoretical_fragments_neutral_loss(): peptide = proforma.parse("HPYLEDR")[0] fragments = {