diff --git a/.gitignore b/.gitignore index 7e7bfc0..940741e 100644 --- a/.gitignore +++ b/.gitignore @@ -160,5 +160,6 @@ cython_debug/ # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ +.vscode .DS_Store _version.py diff --git a/pyproject.toml b/pyproject.toml index c67e044..c5abc04 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ license = {file = "LICENSE"} readme = "README.md" requires-python = ">=3.9" dynamic = ["version"] -dependencies = ["modal", "rdkit", "mendeleev", "fire"] +dependencies = ["modal", "fire"] [project.optional-dependencies] dev = ["pytest", "pre-commit", "ruff", "pytest-asyncio"] diff --git a/src/chemenv/modal_app.py b/src/chemenv/modal_app.py index 45051dc..edef805 100644 --- a/src/chemenv/modal_app.py +++ b/src/chemenv/modal_app.py @@ -4,6 +4,20 @@ get_number_of_topologically_distinct_atoms as _get_topologically_distinct_atoms, get_element_info as _get_element_info, ) +from chemenv.tools.converters import ( + _converters_image, + _safe_image, + _Smiles2Name, + _Name2Smiles, + _smiles_to_selfies, + _smiles_to_deepsmiles, + _smiles_to_inchi, + _smiles_to_inchikey, + _smiles_to_safe, + _selfies_to_smiles, + _inchi_to_smiles, + _deepsmiles_to_smiles, +) import os # Define the images @@ -17,6 +31,7 @@ if chemenv_name and not chemenv_name.startswith("-"): chemenv_name = f"-{chemenv_name}" + # Create the app app = App(f"chemenv{chemenv_name}") @@ -34,3 +49,173 @@ def get_number_of_topologically_distinct_atoms(*args, **kwargs): @app.function(image=mendeleev_image) def get_element_info(*args, **kwargs): return _get_element_info(*args, **kwargs) + + +@app.function(image=_converters_image) +async def get_iupac_name(smiles: str, timeout: int = 10) -> str: + """ + Get the IUPAC name of a molecule from its SMILES string. + + Args: + smiles (str): The SMILES string of the molecule. 
+ timeout (int): The timeout in seconds for the request. + + Returns: + str: The IUPAC name of the molecule. + + Raises: + ValueError: If the conversion fails. + """ + converter = _Smiles2Name(smiles, timeout) + try: + name = await converter.get_name() + return name + except Exception as e: + raise ValueError(f"Error: {e}") from e + + +@app.function(image=_converters_image) +async def get_smiles_from_name(name: str, timeout: int = 10) -> str: + """ + Get the SMILES string of a molecule from its IUPAC name. + + Args: + name (str): The IUPAC name of the molecule. + timeout (int): The timeout in seconds for the request. + + Returns: + str: The SMILES string of the molecule. + + Raises: + ValueError: If the conversion fails. + """ + converter = _Name2Smiles(name, timeout) + try: + smiles = await converter.get_smiles() + return smiles + except Exception as e: + raise ValueError(f"Error converting name to SMILES: {e}") from e + + +@app.function(image=_converters_image) +def convert_to_selfies(smiles: str) -> str: + """ + Convert SMILES to SELFIES encoding. + + Args: + smiles (str): The SMILES string to convert. + + Returns: + str: The SELFIES encoding of the molecule. + """ + return _smiles_to_selfies(smiles) + + +@app.function(image=_converters_image) +def convert_to_deepsmiles(smiles: str) -> str: + """ + Convert SMILES to DeepSMILES encoding. + + Args: + smiles (str): The SMILES string to convert. + + Returns: + str: The DeepSMILES encoding of the molecule. + """ + return _smiles_to_deepsmiles(smiles) + + +@app.function(image=_converters_image) +def convert_to_inchi(smiles: str) -> str: + """ + Convert SMILES to InChI. + + Args: + smiles (str): The SMILES string to convert. + + Returns: + str: The InChI encoding of the molecule. 
+ """ + try: + return _smiles_to_inchi(smiles) + except Exception as e: + raise ValueError(f"Error converting SMILES to InChI: {e}") from e + + +@app.function(image=_converters_image) +def convert_to_inchikey(smiles: str) -> str: + """ + Convert SMILES to InChIKey. + + Args: + smiles (str): The SMILES string to convert. + + Returns: + str: The InChIKey encoding of the molecule. + """ + try: + return _smiles_to_inchikey(smiles) + except Exception as e: + raise ValueError(f"Error converting SMILES to InChIKey: {e}") from e + + +@app.function(image=_safe_image) +def convert_to_safe(smiles: str) -> str: + """ + Convert SMILES to SAFE encoding. + + Args: + smiles (str): The SMILES string to convert. + + Returns: + str: The SAFE encoding of the molecule. + """ + return _smiles_to_safe(smiles) + + +@app.function(image=_converters_image) +def selfies_to_smiles(selfies: str) -> str: + """ + Convert SELFIES to SMILES. + + Args: + selfies (str): The SELFIES encoding to convert. + + Returns: + str: The SMILES string of the molecule. + """ + return _selfies_to_smiles(selfies) + + +@app.function(image=_converters_image) +def inchi_to_smiles(inchi: str) -> str: + """ + Convert InChI to SMILES. + + Args: + inchi (str): The InChI encoding to convert. + + Returns: + str: The SMILES string of the molecule. + """ + try: + return _inchi_to_smiles(inchi) + except Exception as e: + raise ValueError(f"Error converting InChI to SMILES: {e}") from e + + +@app.function(image=_converters_image) +def deepsmiles_to_smiles(deepsmiles: str) -> str: + """ + Convert DeepSMILES to SMILES. + + Args: + deepsmiles (str): The DeepSMILES encoding to convert. + + Returns: + str: The SMILES string of the molecule. 
+ """ + try: + return _deepsmiles_to_smiles(deepsmiles) + except Exception as e: + raise ValueError(f"Error converting DeepSMILES to SMILES: {e}") from e diff --git a/src/chemenv/tools/converters.py b/src/chemenv/tools/converters.py new file mode 100644 index 0000000..af0b6e1 --- /dev/null +++ b/src/chemenv/tools/converters.py @@ -0,0 +1,426 @@ +from modal import Image + + +_converters_image = Image.debian_slim(python_version="3.12").pip_install( + [ + "rdkit", + "selfies", + "deepsmiles", + "aiohttp", + "backoff", + "loguru", + ] +) + +with _converters_image.imports(): + from loguru import logger + import aiohttp + import backoff + import asyncio + from typing import Optional + from urllib.parse import quote, unquote + from rdkit import Chem + import deepsmiles + import selfies + + +_safe_image = Image.debian_slim().pip_install("safe-mol") + +with _safe_image.imports(): + import safe + + +class _Name2Smiles: + """Convert chemical names to SMILES notation using multiple chemical APIs. + + This class attempts to convert chemical compound names to SMILES notation + by querying multiple chemical databases APIs in parallel until a valid + result is found. + + Args: + name (str): The chemical compound name to convert to SMILES notation. + timeout (int): The timeout for API requests in seconds. + + Raises: + ValueError: If the name cannot be URL-encoded or contains invalid characters. 
+ + Example: + # Basic usage with IUPAC name + >>> converter = _Name2Smiles("2-propanone", timeout=10) + >>> await converter.get_smiles() + """ + + def __init__(self, name: str, timeout: int): + try: + self.name = quote(name) + except Exception as e: + logger.error(f"Error encoding name: {e}") + raise ValueError(f"Invalid chemical name: {name}") + + self.timeout = timeout # seconds + + @backoff.on_exception( + backoff.expo, + (aiohttp.ClientError, asyncio.TimeoutError), + max_time=10, + logger=logger, + ) + async def opsin_api(self) -> Optional[str]: + """ + Queries the OPSIN (Open Parser for Systematic IUPAC Nomenclature) API + to convert chemical name to SMILES. + + Returns: + str: SMILES notation if successful, None otherwise + + Raises: + aiohttp.ClientError: On API connection errors + asyncio.TimeoutError: If request times out + """ + url = f"https://opsin.ch.cam.ac.uk/opsin/{self.name}" + + async with aiohttp.ClientSession() as session: + try: + async with session.get(url, timeout=self.timeout) as response: + if response.status == 200: + data = await response.json() + return data["smiles"] + raise ValueError(f"OPSIN API failed with status {response.status}") + except Exception as e: + raise e + + @backoff.on_exception( + backoff.expo, + (aiohttp.ClientError, asyncio.TimeoutError), + max_time=10, + logger=logger, + ) + async def cactus(self) -> Optional[str]: + """ + Queries the NCI CACTUS Chemical Identifier Resolver API + to convert chemical name to SMILES. 
+ + Returns: + str: SMILES notation if successful, None otherwise + + Raises: + aiohttp.ClientError: On API connection errors + asyncio.TimeoutError: If request times out + """ + url = f"https://cactus.nci.nih.gov/chemical/structure/{self.name}/smiles" + + async with aiohttp.ClientSession() as session: + try: + async with session.get(url, timeout=self.timeout) as response: + if response.status == 200: + return await response.text() + raise ValueError(f"CACTUS API failed with status {response.status}") + except Exception as e: + raise e + + @backoff.on_exception( + backoff.expo, + (aiohttp.ClientError, asyncio.TimeoutError), + max_time=10, + logger=logger, + ) + async def pubchem(self) -> Optional[str]: + """ + Queries the PubChem REST API to convert chemical name to + isomeric SMILES notation. + + Returns: + str: SMILES notation if successful, None otherwise + + Raises: + aiohttp.ClientError: On API connection errors + asyncio.TimeoutError: If request times out + """ + url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{self.name}/property/IsomericSMILES/JSON" + + async with aiohttp.ClientSession() as session: + try: + async with session.get(url, timeout=self.timeout) as response: + if response.status == 200: + data = await response.json() + return data["PropertyTable"]["Properties"][0]["IsomericSMILES"] + raise ValueError( + f"PubChem API failed with status {response.status}" + ) + except Exception as e: + raise e + + async def get_smiles(self) -> str: + """Query all APIs in parallel until a valid SMILES is found. + + Attempts to convert name to SMILES using multiple APIs concurrently, + returning the first successful result. 
+ + Returns: + str: First valid SMILES notation found + + Raises: + ValueError: If no API returns a valid SMILES notation + """ + tasks = [ + self.opsin_api(), + self.cactus(), + self.pubchem(), + ] + + for result in asyncio.as_completed(tasks): + try: + smiles = await result + if smiles: + return smiles.strip() + except Exception: + continue + + logger.error(f"Could not find SMILES for {unquote(self.name)}") + raise ValueError(f"Could not find SMILES for {unquote(self.name)}") + + +class _Smiles2Name: + """ + Convert SMILES chemical notation to IUPAC chemical names. + + This class provides methods to query different chemical databases (PubChem, CACTUS) + to obtain IUPAC names for chemical compounds using their SMILES representation. + + Args: + smiles (str): The SMILES string representing the chemical compound. + timeout (int): The timeout for API requests in seconds. + + Raises: + ValueError: If the SMILES string is invalid or cannot be encoded. + + Example: + >>> converter = _Smiles2Name("CCO", timeout=10) + >>> await converter.get_name() + 'ethanol' + """ + + def __init__(self, smiles: str, timeout: int): + mol = Chem.MolFromSmiles(smiles.strip()) + if mol is None: + raise ValueError(f"Invalid SMILES: {smiles}") + + self.smiles = smiles + self.timeout = timeout # seconds + + @backoff.on_exception( + backoff.expo, + (aiohttp.ClientError, asyncio.TimeoutError), + max_time=10, + logger=logger, + ) + async def pubchem(self) -> Optional[str]: + """ + Query PubChem API to get IUPAC name from SMILES. + + Returns: + Optional[str]: IUPAC name if found, None if the query failed. + + Raises: + aiohttp.ClientError: If the API request fails. + asyncio.TimeoutError: If the request times out. 
+ """ + smiles = quote(self.smiles) + url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/{smiles}/property/IUPACName/TXT" + async with aiohttp.ClientSession() as session: + try: + async with session.get(url, timeout=self.timeout) as response: + if response.status == 200: + return await response.text() + raise ValueError( + f"PubChem API failed with status {response.status}" + ) + except Exception as e: + raise e + + @backoff.on_exception( + backoff.expo, + (aiohttp.ClientError, asyncio.TimeoutError), + max_time=10, + logger=logger, + ) + async def cactus(self) -> Optional[str]: + """ + Query CACTUS API to get IUPAC name from SMILES. + + Returns: + Optional[str]: IUPAC name if found, None if the query failed. + + Raises: + aiohttp.ClientError: If the API request fails. + asyncio.TimeoutError: If the request times out. + """ + inchi = Chem.MolToInchi(Chem.MolFromSmiles(self.smiles)) + url = f"https://cactus.nci.nih.gov/chemical/structure/{inchi}/iupac_name" + + async with aiohttp.ClientSession() as session: + try: + async with session.get(url, timeout=self.timeout) as response: + if response.status == 200: + return await response.text() + raise ValueError(f"CACTUS API failed with status {response.status}") + except Exception as e: + raise e + + async def get_name(self) -> str: + """ + Query multiple chemical APIs in parallel to get IUPAC name. + + Attempts to retrieve the IUPAC name by querying multiple chemical databases + concurrently (CACTUS and PubChem). Returns the first successful result. + + Returns: + str: The IUPAC name of the chemical compound. + + Raises: + ValueError: If no name could be found in any of the chemical databases. 
+ """ + tasks = [ + self.cactus(), + self.pubchem(), + ] + + for result in asyncio.as_completed(tasks): + try: + name = await result + if name: + return name.strip() + except Exception: + continue + + logger.error(f"Could not find name for {self.smiles}") + raise ValueError(f"Could not find name for {self.smiles}") + + +def _smiles_to_selfies(smiles: str) -> str: + """ + Takes a SMILES and return the SELFIES encoding. + + Args: + smiles (str): SMILES string + + Returns: + str: SELFIES of the input SMILES + """ + + return selfies.encoder(smiles.strip()) + + +def _smiles_to_deepsmiles(smiles: str) -> str: + """ + Takes a SMILES and return the DeepSMILES encoding. + + Args: + smiles (str): SMILES string + + Returns: + str: DeepSMILES of the input SMILES + """ + converter = deepsmiles.Converter(rings=True, branches=True) + return converter.encode(smiles.strip()) + + +def _smiles_to_inchi(smiles: str) -> str: + """ + Takes a SMILES and return the InChI. + + Args: + smiles (str): SMILES string + + Returns: + str: InChI of the input SMILES + + Raises: + ValueError: If the input SMILES is invalid + """ + mol = Chem.MolFromSmiles(smiles.strip()) + if mol is None: + raise ValueError(f"Invalid SMILES: {smiles}") + return Chem.MolToInchi(mol) + + +def _smiles_to_inchikey(smiles: str) -> str: + """ + Takes a SMILES and return the InChIKey. + + Args: + smiles (str): SMILES string + + Returns: + str: InChIKey of the input SMILES + + Raises: + ValueError: If the input SMILES is invalid + """ + mol = Chem.MolFromSmiles(smiles.strip()) + if mol is None: + raise ValueError(f"Invalid SMILES: {smiles}") + return Chem.MolToInchiKey(mol) + + +def _smiles_to_safe(smiles: str) -> str: + """ + Takes a SMILES and return the SAFE (https://github.com/datamol-io/safe). 
+ + Args: + smiles (str): SMILES string + + Returns: + str: SAFE of the input SMILES + """ + return safe.encode(smiles.strip(), seed=42, canonical=True, randomize=False) + + +def _selfies_to_smiles(_selfies: str) -> str: + """ + Takes a SELFIES and returns the SMILES. + + Args: + _selfies (str): SELFIES string + + Returns: + str: SMILES of the input SELFIES + """ + return selfies.decoder(_selfies.strip()) + + +def _inchi_to_smiles(inchi: str) -> str: + """ + Takes an InChI and returns the SMILES. + + Args: + inchi (str): InChI string + + Returns: + str: SMILES of the input InChI + + Raises: + ValueError: If the input InChI is invalid + """ + mol = Chem.MolFromInchi(inchi.strip()) + if mol is None: + raise ValueError(f"Invalid InChI: {inchi}") + return Chem.MolToSmiles(mol) + + +def _deepsmiles_to_smiles(_deepsmiles: str) -> str: + """ + Takes a DeepSMILES and returns the SMILES. + + Args: + _deepsmiles (str): DeepSMILES string + + Returns: + str: SMILES of the input DeepSMILES + """ + converter = deepsmiles.Converter(rings=True, branches=True) + try: + decoded = converter.decode(_deepsmiles.strip()) + return decoded + except deepsmiles.DecodeError: + raise ValueError(f"Invalid DeepSMILES: {_deepsmiles}") diff --git a/tests/tools/test_converters.py b/tests/tools/test_converters.py new file mode 100644 index 0000000..81628ec --- /dev/null +++ b/tests/tools/test_converters.py @@ -0,0 +1,221 @@ +import pytest +import asyncio +import aiohttp +from unittest.mock import patch, AsyncMock + +from chemenv.tools.converters import ( + Name2Smiles, + Smiles2Name, + smiles_to_selfies, + smiles_to_deepsmiles, + smiles_to_canoncial, + smiles_to_inchi, + smiles_to_safe, +) + +def test_smiles_to_selfies(): + assert smiles_to_selfies("CCO") == "[C][C][O]" + assert smiles_to_selfies("CC") == "[C][C]" + +def test_smiles_to_deepsmiles(): + assert smiles_to_deepsmiles("CCO") == "CCO" + assert smiles_to_deepsmiles("CC") == "CC" + +def test_smiles_to_canoncial(): + assert 
smiles_to_canoncial("CCO") == "CCO" + assert smiles_to_canoncial("CC") == "CC" + +def test_smiles_to_inchi(): + assert smiles_to_inchi("CCO") == "InChI=1S/C2H6O/c1-2-3/h3H,2H2,1H3" + assert smiles_to_inchi("CC") == "InChI=1S/C2H6/c1-2/h1-2H3" + +@pytest.mark.asyncio +async def test_name2smiles_failures(): + # Test failed API calls return None + converter = Name2Smiles("ethanol") + + # Test non-200 response + mock_response = AsyncMock() + mock_response.status = 404 + + with patch("aiohttp.ClientSession.get", return_value=AsyncMock(__aenter__=AsyncMock(return_value=mock_response))): + result = await converter.opsin() + assert result is None + + result = await converter.cactus() + assert result is None + + result = await converter.pubchem() + assert result is None + + result = await converter.unknown() + assert result is None + + # Test invalid JSON response + mock_response.status = 200 + mock_response.json.side_effect = ValueError + + with patch("aiohttp.ClientSession.get", return_value=AsyncMock(__aenter__=AsyncMock(return_value=mock_response))): + result = await converter.opsin() + assert result is None + + result = await converter.pubchem() + assert result is None + +@pytest.mark.asyncio +async def test_name2smiles_retries(): + converter = Name2Smiles("ethanol") + + # Mock that fails twice then succeeds + mock_response = AsyncMock() + mock_response.status = 200 + mock_response.json.return_value = {"smiles": "CCO"} + + with patch("aiohttp.ClientSession.get", + side_effect=[aiohttp.ClientError, aiohttp.ClientError, + AsyncMock(__aenter__=AsyncMock(return_value=mock_response))]): + result = await converter.opsin() + assert result == "CCO" + +async def test_cactus_api(name2smiles): + mock_response = AsyncMock() + mock_response.status = 200 + mock_response.text.return_value = "CCO" + + with patch("aiohttp.ClientSession.get", return_value=AsyncMock(__aenter__=AsyncMock(return_value=mock_response))): + result = await name2smiles.cactus() + assert result == "CCO" + 
+@pytest.mark.asyncio +async def test_pubchem_api(name2smiles): + mock_response = AsyncMock() + mock_response.status = 200 + mock_response.json.return_value = { + "PropertyTable": { + "Properties": [{"IsomericSMILES": "CCO"}] + } + } + + with patch("aiohttp.ClientSession.get", return_value=AsyncMock(__aenter__=AsyncMock(return_value=mock_response))): + result = await name2smiles.pubchem() + assert result == "CCO" + +@pytest.mark.asyncio +async def test_unknown_api(name2smiles): + mock_response = AsyncMock() + mock_response.status = 200 + mock_response.text.return_value = "Message:CCO" + + with patch("aiohttp.ClientSession.get", return_value=AsyncMock(__aenter__=AsyncMock(return_value=mock_response))): + result = await name2smiles.unknown() + assert result == "CCO" + +@pytest.mark.asyncio +async def test_api_errors(name2smiles): + # Test timeout error + with patch("aiohttp.ClientSession.get", side_effect=asyncio.TimeoutError): + result = await name2smiles.opsin() + assert result is None + + # Test client error + with patch("aiohttp.ClientSession.get", side_effect=aiohttp.ClientError): + result = await name2smiles.cactus() + assert result is None + +@pytest.mark.asyncio +async def test_get_smiles_parallel(name2smiles): + mock_opsin = AsyncMock(return_value=None) + mock_cactus = AsyncMock(return_value="CCO") + mock_pubchem = AsyncMock(return_value=None) + mock_unknown = AsyncMock(return_value=None) + + with patch.multiple(name2smiles, + opsin=mock_opsin, + cactus=mock_cactus, + pubchem=mock_pubchem, + unknown=mock_unknown): + result = await name2smiles.get_smiles() + assert result == "CCO" + assert isinstance(result, str) + +@pytest.fixture +def smiles2name(): + return Smiles2Name("CCO") + +@pytest.mark.asyncio +async def test_smiles2name_init(): + # Valid SMILES + converter = Smiles2Name("CCO") + assert converter.smiles == "CCO" + + # Invalid SMILES + with pytest.raises(ValueError, match="Invalid SMILES"): + Smiles2Name("invalid_smiles") + +@pytest.mark.asyncio 
+async def test_smiles2name_pubchem(smiles2name): + # Test successful response + mock_response = AsyncMock() + mock_response.status = 200 + mock_response.text.return_value = "ethanol" + + with patch("aiohttp.ClientSession.get", return_value=AsyncMock(__aenter__=AsyncMock(return_value=mock_response))): + result = await smiles2name.pubchem() + assert result == "ethanol" + + # Test failed response + mock_response.status = 404 + with patch("aiohttp.ClientSession.get", return_value=AsyncMock(__aenter__=AsyncMock(return_value=mock_response))): + result = await smiles2name.pubchem() + assert result is None + +@pytest.mark.asyncio +async def test_smiles2name_cactus(smiles2name): + # Test successful response + mock_response = AsyncMock() + mock_response.status = 200 + mock_response.text.return_value = "ethanol" + + with patch("aiohttp.ClientSession.get", return_value=AsyncMock(__aenter__=AsyncMock(return_value=mock_response))): + result = await smiles2name.cactus() + assert result == "ethanol" + + # Test failed response + mock_response.status = 404 + with patch("aiohttp.ClientSession.get", return_value=AsyncMock(__aenter__=AsyncMock(return_value=mock_response))): + result = await smiles2name.cactus() + assert result is None + +@pytest.mark.asyncio +async def test_smiles2name_api_errors(smiles2name): + # Test timeout error + with patch("aiohttp.ClientSession.get", side_effect=asyncio.TimeoutError): + result = await smiles2name.pubchem() + assert result is None + + # Test client error + with patch("aiohttp.ClientSession.get", side_effect=aiohttp.ClientError): + result = await smiles2name.cactus() + assert result is None + +@pytest.mark.asyncio +async def test_smiles2name_get_name_parallel(smiles2name): + # Test when first API succeeds + mock_cactus = AsyncMock(return_value="ethanol") + mock_pubchem = AsyncMock(return_value=None) + + with patch.multiple(smiles2name, + cactus=mock_cactus, + pubchem=mock_pubchem): + result = await smiles2name.get_name() + assert result == 
"ethanol" + + # Test when all APIs fail + mock_cactus = AsyncMock(return_value=None) + mock_pubchem = AsyncMock(return_value=None) + + with patch.multiple(smiles2name, + cactus=mock_cactus, + pubchem=mock_pubchem): + with pytest.raises(ValueError, match="Could not find name"): + await smiles2name.get_name() \ No newline at end of file