From e7d8611b84308c7ce0242553e78c13ba5a3177e7 Mon Sep 17 00:00:00 2001 From: MrtinoRG Date: Tue, 7 Jan 2025 16:23:01 +0100 Subject: [PATCH] feat: add rdkit functions + add pubchem spectra and GHS functions --- src/chemenv/modal_app.py | 174 ++++++++-- src/chemenv/tools/cheminformatics.py | 453 ++++++++++++++++++++++++++- src/chemenv/tools/pubchem.py | 335 +++++++++++++++----- 3 files changed, 843 insertions(+), 119 deletions(-) diff --git a/src/chemenv/modal_app.py b/src/chemenv/modal_app.py index dff20ab..ad2bce9 100644 --- a/src/chemenv/modal_app.py +++ b/src/chemenv/modal_app.py @@ -1,25 +1,32 @@ -from modal import App, Image +from modal import App from chemenv.tools.cheminformatics import ( + rdkit_image, + mendeleev_image, get_tanimoto_similarity as _get_tanimoto_similarity, get_number_of_topologically_distinct_atoms as _get_topologically_distinct_atoms, get_element_info as _get_element_info, + _get_number_atoms, + _get_number_heavy_atoms, + _get_canonical_smiles, + _get_compound_charge, + _get_number_rings, + _get_number_aromatic_rings, + _get_aromatic_rings, + _get_ring_sizes, + _get_chiral_centers, + _get_number_chiral_centers, + _get_number_cis_bonds, + _get_number_trans_bonds, + _get_molecular_properties, + _has_substructure, + _get_substructure_count, ) from chemenv.tools.pubchem import ( PubChem, pubchem_image as _pubchem_image, ) -from chemenv.tools.pubchem import ( - Smiles2Name as _Smiles2Name, - converters_image as _converters_image, -) import os -# Define the images -rdkit_image = ( - Image.debian_slim(python_version="3.12").pip_install("rdkit").pip_install("numpy") -) -mendeleev_image = Image.debian_slim().pip_install("mendeleev") - chemenv_name = os.getenv("CHEMENV_NAME", "") if chemenv_name and not chemenv_name.startswith("-"): chemenv_name = f"-{chemenv_name}" @@ -38,13 +45,88 @@ def get_number_of_topologically_distinct_atoms(*args, **kwargs): return _get_topologically_distinct_atoms(*args, **kwargs) +@app.function(image=rdkit_image) +def 
get_number_atoms(*args, **kwargs): + return _get_number_atoms(*args, **kwargs) + + +@app.function(image=rdkit_image) +def get_number_heavy_atoms(*args, **kwargs): + return _get_number_heavy_atoms(*args, **kwargs) + + +@app.function(image=rdkit_image) +def get_canonical_smiles(*args, **kwargs): + return _get_canonical_smiles(*args, **kwargs) + + +@app.function(image=rdkit_image) +def get_compound_charge(*args, **kwargs): + return _get_compound_charge(*args, **kwargs) + + +@app.function(image=rdkit_image) +def get_number_rings(*args, **kwargs): + return _get_number_rings(*args, **kwargs) + + +@app.function(image=rdkit_image) +def get_ring_sizes(*args, **kwargs): + return _get_ring_sizes(*args, **kwargs) + + +@app.function(image=rdkit_image) +def get_number_aromatic_rings(*args, **kwargs): + return _get_number_aromatic_rings(*args, **kwargs) + + +@app.function(image=rdkit_image) +def get_aromatic_rings(*args, **kwargs): + return _get_aromatic_rings(*args, **kwargs) + + +@app.function(image=rdkit_image) +def get_chiral_centers(*args, **kwargs): + return _get_chiral_centers(*args, **kwargs) + + +@app.function(image=rdkit_image) +def get_number_chiral_centers(*args, **kwargs): + return _get_number_chiral_centers(*args, **kwargs) + + +@app.function(image=rdkit_image) +def get_number_cis_bonds(*args, **kwargs): + return _get_number_cis_bonds(*args, **kwargs) + + +@app.function(image=rdkit_image) +def get_number_trans_bonds(*args, **kwargs): + return _get_number_trans_bonds(*args, **kwargs) + + +@app.function(image=rdkit_image) +def get_molecular_properties(*args, **kwargs): + return _get_molecular_properties(*args, **kwargs) + + +@app.function(image=rdkit_image) +def has_substructure(*args, **kwargs): + return _has_substructure(*args, **kwargs) + + +@app.function(image=rdkit_image) +def get_substructure_count(*args, **kwargs): + return _get_substructure_count(*args, **kwargs) + + @app.function(image=mendeleev_image) def get_element_info(*args, **kwargs): return 
_get_element_info(*args, **kwargs) @app.function(image=_pubchem_image) -async def get_number_atoms(compound_id: str) -> int: +async def get_number_atoms_pubchem(compound_id: str) -> int: pubchem = await PubChem.create(compound_id) number_atoms = await pubchem._get_number_atoms() if number_atoms is None: @@ -53,7 +135,7 @@ async def get_number_atoms(compound_id: str) -> int: @app.function(image=_pubchem_image) -async def get_isomeric_smiles(compound_id: str) -> str: +async def get_isomeric_smiles_pubchem(compound_id: str) -> str: pubchem = await PubChem.create(compound_id) isomeric_smiles = await pubchem._get_isomeric_smiles() if isomeric_smiles is None: @@ -62,7 +144,7 @@ async def get_isomeric_smiles(compound_id: str) -> str: @app.function(image=_pubchem_image) -async def get_canonical_smiles(compound_id: str) -> str: +async def get_canonical_smiles_pubchem(compound_id: str) -> str: pubchem = await PubChem.create(compound_id) smiles = await pubchem._get_canonical_smiles() if smiles is None: @@ -71,7 +153,7 @@ async def get_canonical_smiles(compound_id: str) -> str: @app.function(image=_pubchem_image) -async def get_compound_mass(compound_id: str) -> float: +async def get_compound_mass_pubchem(compound_id: str) -> float: pubchem = await PubChem.create(compound_id) molecular_weight = await pubchem._get_compound_mass() if molecular_weight is None: @@ -80,7 +162,7 @@ async def get_compound_mass(compound_id: str) -> float: @app.function(image=_pubchem_image) -async def get_compound_charge(compound_id: str) -> int: +async def get_compound_charge_pubchem(compound_id: str) -> int: pubchem = await PubChem.create(compound_id) charge = await pubchem._get_compound_charge() if charge is None: @@ -89,7 +171,7 @@ async def get_compound_charge(compound_id: str) -> int: @app.function(image=_pubchem_image) -async def get_compound_formula(compound_id: str) -> str: +async def get_compound_formula_pubchem(compound_id: str) -> str: pubchem = await PubChem.create(compound_id) formula = 
await pubchem._get_compound_formula() if formula is None: @@ -98,7 +180,7 @@ async def get_compound_formula(compound_id: str) -> str: @app.function(image=_pubchem_image) -async def get_number_isomers(compound_id: str) -> int: +async def get_number_isomers_pubchem(compound_id: str) -> int: pubchem = await PubChem.create(compound_id) number_isomers = await pubchem._get_number_isomers() if number_isomers is None: @@ -107,7 +189,7 @@ async def get_number_isomers(compound_id: str) -> int: @app.function(image=_pubchem_image, timeout=86399) -async def get_compound_isomers(*args, **kwargs): +async def get_compound_isomers_pubchem(*args, **kwargs): pubchem = await PubChem.create(*args, **kwargs) data = await pubchem._get_compound_isomers() if data is None: @@ -116,7 +198,7 @@ async def get_compound_isomers(*args, **kwargs): @app.function(image=_pubchem_image) -async def get_number_heavy_atoms(compound_id: str) -> int: +async def get_number_heavy_atoms_pubchem(compound_id: str) -> int: pubchem = await PubChem.create(compound_id) number_heavy_atoms = await pubchem._get_number_heavy_atoms() if number_heavy_atoms is None: @@ -125,7 +207,7 @@ async def get_number_heavy_atoms(compound_id: str) -> int: @app.function(image=_pubchem_image) -async def _get_number_chiral_atoms(compound_id: str) -> int: +async def get_number_chiral_atoms_pubchem(compound_id: str) -> int: pubchem = await PubChem.create(compound_id) number_chiral_atoms = await pubchem._get_number_chiral_atoms() if number_chiral_atoms is None: @@ -133,10 +215,46 @@ async def _get_number_chiral_atoms(compound_id: str) -> int: return number_chiral_atoms -@app.function(image=_converters_image) -async def get_iupac_name(smiles: str) -> str: - converter = _Smiles2Name(smiles) - name = await converter.get_name() - if name is None: - return "" - return name +@app.function(image=_pubchem_image) +async def get_c_nmr_spectra_pubchem(compound_id: str) -> dict: + pubchem = await PubChem.create(compound_id) + c_nmr_spectra = await 
pubchem._get_c_nmr_spectra() + if c_nmr_spectra is None: + raise ValueError("No C-NMR spectra found") + return c_nmr_spectra + + +@app.function(image=_pubchem_image) +async def get_h_nmr_spectra_pubchem(compound_id: str) -> dict: + pubchem = await PubChem.create(compound_id) + h_nmr_spectra = await pubchem._get_h_nmr_spectra() + if h_nmr_spectra is None: + raise ValueError("No H-NMR spectra found") + return h_nmr_spectra + + +@app.function(image=_pubchem_image) +async def get_uv_spectra_pubchem(compound_id: str) -> dict: + pubchem = await PubChem.create(compound_id) + uv_spectra = await pubchem._get_uv_spectra() + if uv_spectra is None: + raise ValueError("No UV spectra found") + return uv_spectra + + +@app.function(image=_pubchem_image) +async def get_ms_spectra_pubchem(compound_id: str) -> dict: + pubchem = await PubChem.create(compound_id) + ms_spectra = await pubchem._get_ms_spectra() + if ms_spectra is None: + raise ValueError("No MS spectra found") + return ms_spectra + + +@app.function(image=_pubchem_image) +async def get_ghs_classification_pubchem(compound_id: str) -> str: + pubchem = await PubChem.create(compound_id) + ghs_classification = await pubchem._get_ghs_classification() + if ghs_classification is None: + raise ValueError("No GHS classification found") + return ghs_classification diff --git a/src/chemenv/tools/cheminformatics.py b/src/chemenv/tools/cheminformatics.py index afffe19..c0001fa 100644 --- a/src/chemenv/tools/cheminformatics.py +++ b/src/chemenv/tools/cheminformatics.py @@ -1,3 +1,19 @@ +from modal import Image + +rdkit_image = ( + Image.debian_slim(python_version="3.12").pip_install("rdkit").pip_install("numpy") +) +mendeleev_image = Image.debian_slim().pip_install("mendeleev") + +with rdkit_image.imports(): + from rdkit import Chem, DataStructs + from rdkit.Chem import AllChem + import numpy as np + +with mendeleev_image.imports(): + from mendeleev import element + + def get_tanimoto_similarity(s1: str, s2: str) -> float: """ Calculate 
the Tanimoto similarity of two SMILES strings. @@ -20,9 +36,6 @@ def get_tanimoto_similarity(s1: str, s2: str) -> float: >>> get_tanimoto_similarity("CCO", "CC") 0.143 """ - from rdkit import Chem, DataStructs - from rdkit.Chem import AllChem - try: mol1 = Chem.MolFromSmiles(s1) mol2 = Chem.MolFromSmiles(s2) @@ -53,10 +66,6 @@ def get_number_of_topologically_distinct_atoms(smiles: str, atomic_number: int = >>> get_number_of_topologically_distinct_atoms("CCO", 6) 2 """ - - from rdkit import Chem - import numpy as np - try: molecule = Chem.MolFromSmiles(smiles) @@ -104,8 +113,6 @@ def get_element_info(identifier: str) -> dict: >>> get_element_info("H")["name"] 'Hydrogen' """ - from mendeleev import element - try: # Try to get the element if isinstance(identifier, int) or identifier.isdigit(): @@ -130,3 +137,431 @@ def get_element_info(identifier: str) -> dict: except ValueError: raise ValueError(f"Error: '{identifier}' is not a valid element identifier.") + + +def _get_number_atoms(smiles: str) -> int: + """ + Get the number of atoms in a molecule given its SMILES string. + + Args: + smiles (str): The SMILES string of the molecule. + + Returns: + int: The number of atoms in the molecule. + + Raises: + ValueError: If the SMILES string is invalid. + + Example: + >>> _get_number_atoms("CCO") + 3 + """ + try: + mol = Chem.MolFromSmiles(smiles) + if mol is None: + raise ValueError("Invalid SMILES string") + return mol.GetNumAtoms() + except Exception as e: + raise ValueError(f"Invalid SMILES string: {e}") + + +def _get_number_heavy_atoms(smiles: str) -> int: + """ + Get the number of heavy atoms in a molecule given its SMILES string. + + Args: + smiles (str): The SMILES string of the molecule. + + Returns: + int: The number of heavy atoms in the molecule. + + Raises: + ValueError: If the SMILES string is invalid. 
+ + Example: + >>> _get_number_heavy_atoms("CCO") + 3 + """ + try: + mol = Chem.MolFromSmiles(smiles) + if mol is None: + raise ValueError("Invalid SMILES string") + return mol.GetNumHeavyAtoms() + except Exception as e: + raise ValueError(f"Invalid SMILES string: {e}") + + +def _get_canonical_smiles(smiles: str) -> str: + """ + Get the canonical SMILES string of a molecule given its SMILES string. + + Args: + smiles (str): The SMILES string of the molecule. + + Returns: + str: The canonical SMILES string of the molecule. + + Raises: + ValueError: If the SMILES string is invalid. + + Example: + >>> _get_canonical_smiles("CCO") + 'CCO' + """ + try: + mol = Chem.MolFromSmiles(smiles) + if mol is None: + raise ValueError("Invalid SMILES string") + return Chem.MolToSmiles(mol) + except Exception as e: + raise ValueError(f"Invalid SMILES string: {e}") + + +def _get_compound_charge(smiles: str) -> int: + """ + Get the charge of a molecule given its SMILES string. + + Args: + smiles (str): The SMILES string of the molecule. + + Returns: + int: The charge of the molecule. + + Raises: + ValueError: If the SMILES string is invalid. + + Example: + >>> _get_compound_charge("CCO") + 0 + """ + try: + mol = Chem.MolFromSmiles(smiles) + if mol is None: + raise ValueError("Invalid SMILES string") + return Chem.GetFormalCharge(mol) + except Exception as e: + raise ValueError(f"Invalid SMILES string: {e}") + + +def _get_number_rings(smiles: str) -> int: + """ + Get the number of rings in a molecule given its SMILES string. + + Args: + smiles (str): The SMILES string of the molecule. + + Returns: + int: The number of rings in the molecule. + + Raises: + ValueError: If the SMILES string is invalid.
+ + Example: + >>> _get_number_rings("c1ccccc1") + 1 + """ + try: + mol = Chem.MolFromSmiles(smiles) + if mol is None: + raise ValueError("Invalid SMILES string") + rings = Chem.GetRingInfo(mol) + return rings.numRings() + except Exception as e: + raise ValueError(f"Invalid SMILES string: {e}") + + +def _get_ring_sizes(smiles: str) -> list: + """ + Get the sizes of rings in a molecule given its SMILES string. + + Args: + smiles (str): The SMILES string of the molecule. + + Returns: + list: A list of ring sizes in the molecule. + + Raises: + ValueError: If the SMILES string is invalid. + + Example: + >>> _get_ring_sizes("C1CCCCC1") + [6] + """ + try: + mol = Chem.MolFromSmiles(smiles) + if mol is None: + raise ValueError("Invalid SMILES string") + rings = Chem.GetRingInfo(mol) + return [len(ring) for ring in rings.AtomRings()] + except Exception as e: + raise ValueError(f"Invalid SMILES string: {e}") + + +def _get_number_aromatic_rings(smiles: str) -> int: + """ + Get the number of aromatic rings in a molecule given its SMILES string. + + Args: + smiles (str): The SMILES string of the molecule. + + Returns: + int: The number of aromatic rings in the molecule. + + Raises: + ValueError: If the SMILES string is invalid. + + Example: + >>> _get_number_aromatic_rings("c1ccccc1") + 1 + """ + try: + mol = Chem.MolFromSmiles(smiles) + if mol is None: + raise ValueError("Invalid SMILES string") + rings = Chem.GetRingInfo(mol) + return len( + [ + ring + for ring in rings.AtomRings() + if all(mol.GetAtomWithIdx(idx).GetIsAromatic() for idx in ring) + ] + ) + except Exception as e: + raise ValueError(f"Invalid SMILES string: {e}") + + +def _get_aromatic_rings(smiles: str) -> list: + """ + Get the aromatic rings in a molecule given its SMILES string. + + Args: + smiles (str): The SMILES string of the molecule. + + Returns: + list: A list of aromatic rings in the molecule. + + Raises: + ValueError: If the SMILES string is invalid.
+ + Example: + >>> _get_aromatic_rings("c1ccccc1") + [[0, 1, 2, 3, 4, 5]] + """ + try: + mol = Chem.MolFromSmiles(smiles) + if mol is None: + raise ValueError("Invalid SMILES string") + rings = Chem.GetRingInfo(mol) + return [ + ring + for ring in rings.AtomRings() + if all(mol.GetAtomWithIdx(idx).GetIsAromatic() for idx in ring) + ] + except Exception as e: + raise ValueError(f"Invalid SMILES string: {e}") + + +def _get_chiral_centers(smiles: str) -> int: + """ + Get the chiral centers in a molecule given its SMILES string. + + Args: + smiles (str): The SMILES string of the molecule. + + Returns: + list: The chiral centers of the molecule as (atom index, label) tuples. + + Raises: + ValueError: If the SMILES string is invalid. + + Example: + >>> _get_chiral_centers("CCO") + [] + """ + try: + mol = Chem.MolFromSmiles(smiles) + if mol is None: + raise ValueError("Invalid SMILES string") + Chem.AssignStereochemistry(mol) + return Chem.FindMolChiralCenters(mol, includeUnassigned=True) + except Exception as e: + raise ValueError(f"Invalid SMILES string: {e}") + + +def _get_number_chiral_centers(smiles: str) -> int: + """ + Get the number of chiral centers in a molecule given its SMILES string. + + Args: + smiles (str): The SMILES string of the molecule. + + Returns: + int: The number of chiral centers in the molecule. + + Raises: + ValueError: If the SMILES string is invalid. + + Example: + >>> _get_number_chiral_centers("CCO") + 0 + """ + try: + mol = Chem.MolFromSmiles(smiles) + if mol is None: + raise ValueError("Invalid SMILES string") + Chem.AssignStereochemistry(mol) + return len(Chem.FindMolChiralCenters(mol, includeUnassigned=True)) + except Exception as e: + raise ValueError(f"Invalid SMILES string: {e}") + + +def _get_number_cis_bonds(smiles: str) -> int: + """ + Get the number of cis bonds in a molecule given its SMILES string. + + Args: + smiles (str): The SMILES string of the molecule. + + Returns: + int: The number of cis bonds in the molecule.
+ + Raises: + ValueError: If the SMILES string is invalid. + + Example: + >>> _get_number_cis_bonds("C/C=C/C") + 1 + """ + try: + mol = Chem.MolFromSmiles(smiles) + if mol is None: + raise ValueError("Invalid SMILES string") + Chem.AssignStereochemistry(mol) + return len( + [ + bond + for bond in mol.GetBonds() + if bond.GetStereo() == Chem.BondStereo.STEREOCIS + ] + ) + except Exception as e: + raise ValueError(f"Invalid SMILES string: {e}") + + +def _get_number_trans_bonds(smiles: str) -> int: + """ + Get the number of trans bonds in a molecule given its SMILES string. + + Args: + smiles (str): The SMILES string of the molecule. + + Returns: + int: The number of trans bonds in the molecule. + + Raises: + ValueError: If the SMILES string is invalid. + + Example: + >>> _get_number_trans_bonds("C/C=C/C") + 1 + """ + try: + mol = Chem.MolFromSmiles(smiles) + if mol is None: + raise ValueError("Invalid SMILES string") + Chem.AssignStereochemistry(mol) + return len( + [ + bond + for bond in mol.GetBonds() + if bond.GetStereo() == Chem.BondStereo.STEREOTRANS + ] + ) + except Exception as e: + raise ValueError(f"Invalid SMILES string: {e}") + + +def _get_molecular_properties(smiles: str) -> dict: + """Get basic molecular properties. + + Args: + smiles (str): SMILES string of the molecule + + Returns: + dict: Dictionary containing properties like LogP, TPSA, etc. 
+ + Raises: + ValueError: If the SMILES string is invalid + + Example: + >>> _get_molecular_properties("CCO") + {'logp': 0.22399999999999998, 'tpsa': 20.23, 'molecular_weight': 46.069, 'rotatable_bonds': 1, 'hbd': 1, 'hba': 1} + """ + try: + mol = Chem.MolFromSmiles(smiles) + if mol is None: + raise ValueError("Invalid SMILES string") + return { + "logp": Chem.Descriptors.MolLogP(mol), + "tpsa": Chem.Descriptors.TPSA(mol), + "molecular_weight": Chem.Descriptors.ExactMolWt(mol), + "rotatable_bonds": Chem.Descriptors.NumRotatableBonds(mol), + "hbd": Chem.Descriptors.NumHDonors(mol), + "hba": Chem.Descriptors.NumHAcceptors(mol), + } + except Exception as e: + raise ValueError(f"Invalid SMILES string: {e}") + + +def _has_substructure(smiles: str, substructure_smarts: str) -> bool: + """Check if a molecule contains a specific substructure. + + Args: + smiles (str): SMILES string of the molecule + substructure_smarts (str): SMARTS pattern of the substructure + + Returns: + bool: True if substructure is present + + Raises: + ValueError: If the SMILES or SMARTS pattern is invalid + + Example: + >>> _has_substructure("CCO", "CO") + True + """ + try: + mol = Chem.MolFromSmiles(smiles) + pattern = Chem.MolFromSmarts(substructure_smarts) + if mol is None or pattern is None: + raise ValueError("Invalid SMILES or SMARTS pattern") + return mol.HasSubstructMatch(pattern) + except Exception as e: + raise ValueError(f"Error in substructure matching: {e}") + + +def _get_substructure_count(smiles: str, substructure_smarts: str) -> int: + """Get the count of a specific substructure in a molecule. 
+ + Args: + smiles (str): SMILES string of the molecule + substructure_smarts (str): SMARTS pattern of the substructure + + Returns: + int: Number of occurrences of the substructure + + Raises: + ValueError: If the SMILES string or SMARTS pattern is invalid + + Example: + >>> _get_substructure_count("CCO", "C") + 2 + """ + try: + mol = Chem.MolFromSmiles(smiles) + pattern = Chem.MolFromSmarts(substructure_smarts) + if mol is None or pattern is None: + raise ValueError("Invalid SMILES or SMARTS pattern") + return len(mol.GetSubstructMatches(pattern)) + except Exception as e: + raise ValueError(f"Error in substructure matching: {e}") diff --git a/src/chemenv/tools/pubchem.py b/src/chemenv/tools/pubchem.py index 383974c..e40958d 100644 --- a/src/chemenv/tools/pubchem.py +++ b/src/chemenv/tools/pubchem.py @@ -43,6 +43,7 @@ import pubchempy as pcp from rdkit import Chem from urllib.parse import quote + from time import sleep from loguru import logger @@ -59,6 +60,7 @@ def __init__(self): """ self.cid = None self.base_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug" + self.long_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{cid}/JSON/?response_type=display&heading=" logger.info(f"Initialized PubChem handler for CID {self.cid}") @classmethod @@ -330,7 +332,7 @@ async def _get_number_isomers(self) -> Optional[int]: logger.info(f"Number of compound isomers for CID {self.cid}: {data}") return data - async def _get_compound_isomers(self) -> List[Optional[str]]: + async def _get_compound_isomers(self) -> List[str]: """ Get the compound isomers for a compound from PubChem. This function can take some time depending on the number of isomers. 
@@ -349,9 +351,19 @@ async def _get_compound_isomers(self) -> List[Optional[str]]: isomers_cids = (await self.get_data_from_url(url))["IdentifierList"]["CID"] data = [] for i in isomers_cids: + sleep(0.5) self.cid = i - # Isomeric SMILES to capture enantiomers - data.append(await self._get_isomeric_smiles()) + try: + # Isomeric SMILES to capture enantiomers + smiles = await self._get_isomeric_smiles() + if smiles: + data.append(smiles) + except ValueError as ve: + logger.warning(f"Could not get SMILES for CID {i}: {ve}") + continue + except Exception as e: + logger.error(f"Unexpected error getting SMILES for CID {i}: {e}") + continue except Exception as e: logger.error(f"No compound isomers found. {e}") raise ValueError(f"No compound isomers found. {e}") @@ -402,106 +414,265 @@ async def _get_number_chiral_atoms(self) -> Optional[int]: logger.info(f"Number of chiral atoms for CID {self.cid}: {data}") return data + async def _format_long_url(self, heading): + """ + Format the long URL to get specific information from PubChem and return the data. -class Smiles2Name: - def __init__(self, smiles): - """Initialize Name2Smiles converter with a chemical compound name. - Takes a chemical compound name and prepares it for API queries by URL-encoding. - Sets default timeout for API requests to 10 seconds. Args: - name (str): Chemical compound name to convert to SMILES notation. - Should be a valid IUPAC or common chemical name. + heading (str): Heading of the information to retrieve from PubChem + + Returns: + dict: Data retrieved from PubChem + Raises: - ValueError: If the name cannot be URL-encoded or contains invalid characters. + ValueError: If the data could not be retrieved + Example: - >>> converter = Name2Smiles("ethanol") - >>> await converter.get_smiles() - 'CCO' + >>> await self._format_long_url("Mass Spectrometry") + {'Information': [{'Name': 'Mass bank ID', 'ReferenceNumber': 1, 'Value': {'StringWithMarkup': [{'String... 
""" - mol = Chem.MolFromSmiles(smiles) - if mol is None: - raise ValueError(f"Invalid SMILES: {smiles}") + url = self.long_url.format(cid=self.cid) + quote(heading) + logger.info(f"Getting spectral information for CID {self.cid}") + try: + return await self.get_data_from_url(url) + except Exception as e: + logger.error(f"Failed to get spectral information: {str(e)}") + raise ValueError(f"Failed to get spectral information: {str(e)}") - self.smiles = smiles - self.timeout = 10 # seconds + async def _format_ms_spectra(self, data): + """ + Format the MS spectra data retrieved from PubChem. - @backoff.on_exception( - backoff.expo, - (aiohttp.ClientError, asyncio.TimeoutError), - max_time=10, - logger=logger, - ) - async def pubchem(self) -> Optional[str]: + Args: + data (dict): Data retrieved from PubChem + + Returns: + dict: Formatted MS spectra data + + Raises: + ValueError: If the data could not be retrieved + """ + try: + information = data["Information"] + except KeyError: + raise ValueError("No MS spectra found") + + field_mapping = { + "Mass bank ID": "MoNA ID", + "Spectra type": "MS Category", + "MS Type": "MS Type", + "MS Level": "MS Level", + "Instrument": "Instrument", + "Instrument Type": "Instrument Type", + "Ionization Mode": "Ionization Mode", + "Top Peaks": "StringWithMarkup", + } + + results = {} + + counter = 0 + for info in information: + if counter > 5: + return results + ref_num = info.get("ReferenceNumber") + if not ref_num: + continue + + value = info.get("Value", {}) + + for orig_key, mapped_key in field_mapping.items(): + if mapped_key == "StringWithMarkup": + string_with_markup = value.get("StringWithMarkup", []) + peaks = [item["String"] for item in string_with_markup] + results[orig_key] = peaks + else: + string_with_markup = value.get("StringWithMarkup", []) + if string_with_markup: + results[orig_key] = string_with_markup[0]["String"] + else: + results[orig_key] = None + + counter += 1 + + return results + + async def 
_format_nmr_spectra(self, data): """ - Query PubChem API to get IUPAC name from SMILES. + Format the NMR spectra data retrieved from PubChem. + + Args: + data (dict): Data retrieved from PubChem + Returns: - Optional[str]: IUPAC name if found, None if the query failed. + dict: Formatted NMR spectra data + Raises: - aiohttp.ClientError: If the API request fails. - asyncio.TimeoutError: If the request times out. + ValueError: If the data could not be retrieved """ - smiles = quote(self.smiles) - url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/{smiles}/property/IUPACName/TXT" - async with aiohttp.ClientSession() as session: - try: - async with session.get(url, timeout=self.timeout) as response: - if response.status == 200: - return await response.text() - raise ValueError( - f"PubChem API failed with status {response.status}" - ) - except Exception as e: - raise e + try: + information = data["Information"] + except KeyError: + raise ValueError("No 1H NMR spectra found") + + # Define field mapping + field_mapping = { + "Instrument Type": "instrument", + "Frequency": "frequency", + "Solvent": "solvent", + "pH": "ph", + "Shifts [ppm]:Intensity": "shifts", + } + + results = {} + + for info in information: + ref_num = info.get("ReferenceNumber") + if not ref_num: + continue - @backoff.on_exception( - backoff.expo, - (aiohttp.ClientError, asyncio.TimeoutError), - max_time=10, - logger=logger, - ) - async def cactus(self) -> Optional[str]: + name = info.get("Name") + string_value = ( + info.get("Value", {}).get("StringWithMarkup", [{}])[0].get("String") + ) + + if not (name and string_value): + continue + + if ref_num not in results: + results[ref_num] = {} + + if name in field_mapping: + results[ref_num][field_mapping[name]] = string_value + + return results + + async def _get_c_nmr_spectra(self): """ - Query CACTUS API to get IUPAC name from SMILES. + Get the C-NMR spectra for a compound from PubChem. 
+ Returns: - Optional[str]: IUPAC name if found, None if the query failed. + dict: C-NMR spectra data + Raises: - aiohttp.ClientError: If the API request fails. - asyncio.TimeoutError: If the request times out. + ValueError: If the data could not be retrieved """ - inchi = Chem.MolToInchi(Chem.MolFromSmiles(self.smiles)) - url = f"https://cactus.nci.nih.gov/chemical/structure/{inchi}/iupac_name" + try: + data = await self._format_long_url("13C NMR Spectra") + return self._format_nmr_spectra(data) + except Exception: + raise ValueError("No C-NMR spectra found. {e}") - async with aiohttp.ClientSession() as session: - try: - async with session.get(url, timeout=self.timeout) as response: - if response.status == 200: - return await response.text() - raise ValueError(f"CACTUS API failed with status {response.status}") - except Exception as e: - raise e + async def _get_h_nmr_spectra(self): + """ + Get the 1H NMR spectra for a compound from PubChem. + + Returns: + dict: 1H NMR spectra data + + Raises: + ValueError: If the data could not be retrieved + """ + try: + data = await self._format_long_url("1H NMR Spectra") + return self._format_nmr_spectra(data) + except Exception: + raise ValueError("No 1H NMR spectra found. {e}") - async def get_name(self) -> Optional[str]: + async def _get_uv_spectra(self): """ - Query multiple chemical APIs in parallel to get IUPAC name. - Attempts to retrieve the IUPAC name by querying multiple chemical databases - concurrently (CACTUS and PubChem). Returns the first successful result. + Get the UV spectra for a compound from PubChem. + Returns: - str: The IUPAC name of the chemical compound. + str: UV spectra data + Raises: - ValueError: If no name could be found in any of the chemical databases. 
+ ValueError: If the data could not be retrieved """ - tasks = [ - self.cactus(), - self.pubchem(), - ] + data = await self._format_long_url("UV Spectra") + results = {} + try: + for info in data["Information"]: + ref_num = info["ReferenceNumber"] + string_value = info["Value"]["StringWithMarkup"][0]["String"] - for result in asyncio.as_completed(tasks): - try: - name = await result - if name: - return name.strip() - except Exception: - continue + if ref_num not in results: + results[ref_num] = "" + + if ( + "MAX ABSORPTION" in string_value.upper() + or "UV MAX" in string_value.upper() + ): + if results[ref_num]: + results[ref_num] += "\n" + results[ref_num] += string_value + + if not results: + raise ValueError("No UV spectra found") + + output = [] + for ref_num, value in sorted(results.items()): + output.append(f"Reference {ref_num}:\n{value}") + + return "\n\n".join(output) + + except Exception: + raise ValueError("No UV spectra found") + + async def _get_ms_spectra(self): + """ + Get the MS spectra for a compound from PubChem. + + Returns: + dict: MS spectra data + + Raises: + ValueError: If the data could not be retrieved + """ + try: + data = await self._format_long_url("Mass Spectrometry") + results = {} + for section in data["Record"]["Section"]: + title = section["TOCHeading"] + spectra = self._format_ms_spectra(section) + if spectra: + results[title] = spectra + + if not results: + raise ValueError("No MS spectra found") + + return results + + except Exception: + raise ValueError("No MS spectra found") + + async def _get_ghs_classification(self): + """ + Get the GHS classification for a compound from PubChem. 
+ + Returns: + dict: GHS classification data + + Raises: + ValueError: If the data could not be retrieved + """ + data = await self._format_long_url("GHS%20Classification") + logger.info(f"Getting GHS classification for CID {self.cid}") + try: + information_list = data["Record"]["Section"][0]["Section"][0]["Section"][0][ + "Information" + ] + + hazard_statements = {} + + for info in information_list: + if info.get("Name") == "GHS Hazard Statements": + ref_number = info.get("ReferenceNumber") + string_values = [ + markup["String"] for markup in info["Value"]["StringWithMarkup"] + ] + hazard_statements[ref_number] = string_values - logger.error(f"Could not find name for {self.smiles}") - raise ValueError(f"Could not find name for {self.smiles}") + return hazard_statements + except Exception: + logger.error("Failed to get GHS classification") + raise ValueError("Failed to get GHS classification")