From ee95a95365b13d7ee361505d340639e592d28cdc Mon Sep 17 00:00:00 2001 From: Alex Carlin Date: Mon, 24 Jul 2023 23:58:28 -0700 Subject: [PATCH 1/9] Implement fetching from AlphaFold DB - add module for fetching from AlphaFold DB - add tests for module - more to come --- src/biotite/database/alphafold/__init__.py | 12 +++ src/biotite/database/alphafold/download.py | 120 +++++++++++++++++++++ tests/database/test_alphafold.py | 46 ++++++++ 3 files changed, 178 insertions(+) create mode 100644 src/biotite/database/alphafold/__init__.py create mode 100644 src/biotite/database/alphafold/download.py create mode 100644 tests/database/test_alphafold.py diff --git a/src/biotite/database/alphafold/__init__.py b/src/biotite/database/alphafold/__init__.py new file mode 100644 index 000000000..498422158 --- /dev/null +++ b/src/biotite/database/alphafold/__init__.py @@ -0,0 +1,12 @@ +# This source code is part of the Biotite package and is distributed +# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further +# information. + +""" +A subpackage for downloading protein structures from the AlphaFold DB +""" + +__name__ = "biotite.database.alphafold" +__author__ = "Alex Carlin" + +from .download import * diff --git a/src/biotite/database/alphafold/download.py b/src/biotite/database/alphafold/download.py new file mode 100644 index 000000000..5b4b69c4a --- /dev/null +++ b/src/biotite/database/alphafold/download.py @@ -0,0 +1,120 @@ +# This source code is part of the Biotite package and is distributed +# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further +# information. 
+ +__name__ = "biotite.database.alphafold" +__author__ = "Alex Carlin" +__all__ = ["fetch"] + +from os.path import isdir, isfile, join, getsize +import os +import io +import requests +from .check import assert_valid_response + + +_fetch_url = "https://alphafold.com/api/prediction/" + + +def fetch(ids, target_path=None, format="pdb", overwrite=False, verbose=False): + """ + Download predicted protein structures from the AlphaFold DB. + + This function requires an internet connection. + + Parameters + ---------- + ids : str or iterable object of str + A single ID or a list of IDs of the file(s) + to be downloaded. + target_path : str, optional + The target directory of the downloaded files. + By default, the file content is stored in a file-like object + (`StringIO` or `BytesIO`, respectively). + format : {"pdb", "cif", "bcif"} + The format of the files to be downloaded. + overwrite : bool, optional + If true, existing files will be overwritten. Otherwise the + respective file will only be downloaded if the file does not + exist yet in the specified target directory or if the file is + empty. (Default: False) + verbose: bool, optional + If true, the function will output the download progress. + (Default: False) + + Returns + ------- + files : str or StringIO or BytesIO or list of (str or StringIO or BytesIO) + The file path(s) to the downloaded files. + If a single string (a single ID) was given in `ids`, + a single string is returned. If a list (or other iterable + object) was given, a list of strings is returned. + If no `target_path` was given, the file contents are stored in + either `StringIO` or `BytesIO` objects. 
+ + Examples + -------- + + >>> import os.path + >>> file = fetch("P12345", path_to_directory) + >>> print(os.path.basename(file)) + P12345.pdb + >>> files = fetch(["P12345", "Q8K9I1"], path_to_directory) + >>> print([os.path.basename(file) for file in files]) + ['P12345.pdb', 'Q8K9I1.pdb'] + """ + + # If only a single ID is present, + # put it into a single element list + if isinstance(ids, str): + ids = [ids] + single_element = True + else: + single_element = False + # Create the target folder, if not existing + if target_path is not None and not isdir(target_path): + os.makedirs(target_path) + files = [] + for i, id in enumerate(ids): + # Verbose output + if verbose: + print(f"Fetching file {i + 1:d} / {len(ids):d} ({id})...", + end="\r") + # Fetch file from database + if target_path is not None: + file = join(target_path, id + "." + format) + else: + # 'file = None' -> store content in a file-like object + file = None + if file is None \ + or not isfile(file) \ + or getsize(file) == 0 \ + or overwrite: + if format in ["pdb", "cif", "bcif"]: + metadata_response = requests.get(f"{_fetch_url}/{id}") + metadata_json = metadata_response.json()[0] + # a list of length 1 is always returned + file_url = metadata_json[f"{format}Url"] + file_response = requests.get(file_url) + content = file_response.text + assert_valid_response(r.status_code) + else: + raise ValueError(f"Format '{format}' is not supported") + if file is None: + file = io.StringIO(content) + else: + with open(file, "w+") as f: + f.write(content) + files.append(file) + if verbose: + print("\nDone") + # If input was a single ID, return only a single path + if single_element: + return files[0] + else: + return files + + + + + diff --git a/tests/database/test_alphafold.py b/tests/database/test_alphafold.py new file mode 100644 index 000000000..9bf1d76f7 --- /dev/null +++ b/tests/database/test_alphafold.py @@ -0,0 +1,46 @@ +# This source code is part of the Biotite package and is distributed +# under the 
3-Clause BSD License. Please see 'LICENSE.rst' for further +# information. + +import itertools +import tempfile +import pytest +import biotite.database.alphafold as alphafold +import biotite.structure.io.pdb as pdb +from biotite.database import RequestError +from ..util import cannot_connect_to + + +ALPHAFOLD_URL = "https://alphafold.com/" + + +@pytest.mark.skipif( + cannot_connect_to(ALPHAFOLD_URL), + reason="AlphaFold is not available" +) +@pytest.mark.parametrize( + "as_file_like", + itertools.product([False, True]) +) +def test_fetch(as_file_like): + path = None if as_file_like else tempfile.gettempdir() + file = alphafold.fetch( + "P12345", path, overwrite=True + ) + pdb_file = pdb.PDBFile.read(file) + array_stack = pdb_file.get_structure() + assert len(array_stack) > 0 + + +@pytest.mark.skipif( + cannot_connect_to(ALPHAFOLD_URL), + reason="AlphaFold is not available" +) +@pytest.mark.parametrize("format", ["pdb", "cif", "bcif"]) +def test_fetch_invalid(format): + with pytest.raises(RequestError): + file = alphafold.fetch( + "XYZ", format, tempfile.gettempdir(), overwrite=True + ) + + From 2344d5549e63d152b11037aa4182049d6b7c7f6e Mon Sep 17 00:00:00 2001 From: Alex Carlin Date: Thu, 17 Aug 2023 15:24:13 -0700 Subject: [PATCH 2/9] Create basic tests --- src/biotite/database/alphafold/download.py | 10 ++++++---- tests/database/test_alphafold.py | 4 ++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/biotite/database/alphafold/download.py b/src/biotite/database/alphafold/download.py index 5b4b69c4a..0bff83814 100644 --- a/src/biotite/database/alphafold/download.py +++ b/src/biotite/database/alphafold/download.py @@ -13,7 +13,7 @@ from .check import assert_valid_response -_fetch_url = "https://alphafold.com/api/prediction/" +_fetch_url = "https://alphafold.com/api/prediction" def fetch(ids, target_path=None, format="pdb", overwrite=False, verbose=False): @@ -90,14 +90,16 @@ def fetch(ids, target_path=None, format="pdb", overwrite=False, 
verbose=False): or not isfile(file) \ or getsize(file) == 0 \ or overwrite: - if format in ["pdb", "cif", "bcif"]: + if format in ["pdb", "cif"]: metadata_response = requests.get(f"{_fetch_url}/{id}") - metadata_json = metadata_response.json()[0] + assert_valid_response(metadata_response.status_code) + metadata_json = metadata_response.json()[0] + print(metadata_json) # a list of length 1 is always returned file_url = metadata_json[f"{format}Url"] file_response = requests.get(file_url) + assert_valid_response(file_response.status_code) content = file_response.text - assert_valid_response(r.status_code) else: raise ValueError(f"Format '{format}' is not supported") if file is None: diff --git a/tests/database/test_alphafold.py b/tests/database/test_alphafold.py index 9bf1d76f7..4736f3cbc 100644 --- a/tests/database/test_alphafold.py +++ b/tests/database/test_alphafold.py @@ -36,11 +36,11 @@ def test_fetch(as_file_like): cannot_connect_to(ALPHAFOLD_URL), reason="AlphaFold is not available" ) -@pytest.mark.parametrize("format", ["pdb", "cif", "bcif"]) +@pytest.mark.parametrize("format", ["pdb", "cif"]) def test_fetch_invalid(format): with pytest.raises(RequestError): file = alphafold.fetch( - "XYZ", format, tempfile.gettempdir(), overwrite=True + "XYZ", target_path=tempfile.gettempdir(), format=format, overwrite=True ) From bf74ac9ef0460efed5ee08902c66fd62892ee2d7 Mon Sep 17 00:00:00 2001 From: Alex Carlin Date: Mon, 21 Aug 2023 13:29:18 -0700 Subject: [PATCH 3/9] Update tests to work with CIF files --- src/biotite/database/alphafold/download.py | 1 - tests/database/test_alphafold.py | 26 +++++++++++++++++++++- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/src/biotite/database/alphafold/download.py b/src/biotite/database/alphafold/download.py index 0bff83814..85f58ab33 100644 --- a/src/biotite/database/alphafold/download.py +++ b/src/biotite/database/alphafold/download.py @@ -94,7 +94,6 @@ def fetch(ids, target_path=None, format="pdb", overwrite=False, 
verbose=False): metadata_response = requests.get(f"{_fetch_url}/{id}") assert_valid_response(metadata_response.status_code) metadata_json = metadata_response.json()[0] - print(metadata_json) # a list of length 1 is always returned file_url = metadata_json[f"{format}Url"] file_response = requests.get(file_url) diff --git a/tests/database/test_alphafold.py b/tests/database/test_alphafold.py index 4736f3cbc..cce08a1d2 100644 --- a/tests/database/test_alphafold.py +++ b/tests/database/test_alphafold.py @@ -6,7 +6,8 @@ import tempfile import pytest import biotite.database.alphafold as alphafold -import biotite.structure.io.pdb as pdb +import biotite.structure.io.pdb as pdb +import biotite.structure.io.pdbx as pdbx from biotite.database import RequestError from ..util import cannot_connect_to @@ -44,3 +45,26 @@ def test_fetch_invalid(format): ) +@pytest.mark.skipif( + cannot_connect_to(ALPHAFOLD_URL), + reason="AlphaFold is not available" +) +@pytest.mark.parametrize("format", ["pdb", "cif"]) +def test_fetch_multiple(format): + acc = ["P12345", "P12345"] + files = alphafold.fetch( + acc, target_path=tempfile.gettempdir(), format=format, overwrite=True + ) + print(files) + for file in files: + if format == "pdb": + pdb_file = pdb.PDBFile.read(file) + structure = pdb_file.get_structure() + assert len(structure) > 0 + elif format == "cif": + cif_file = pdbx.PDBxFile.read(file) + assert "citation_author" in cif_file.keys() + + + + From bf239b742660d22d5d097555eeafd47068bc2559 Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Sat, 25 Jan 2025 12:57:34 +0100 Subject: [PATCH 4/9] Reformat code --- src/biotite/database/alphafold/download.py | 24 +++------- tests/database/test_alphafold.py | 55 +++++++++------------- 2 files changed, 28 insertions(+), 51 deletions(-) diff --git a/src/biotite/database/alphafold/download.py b/src/biotite/database/alphafold/download.py index 85f58ab33..498d905cf 100644 --- a/src/biotite/database/alphafold/download.py +++ 
b/src/biotite/database/alphafold/download.py @@ -6,19 +6,18 @@ __author__ = "Alex Carlin" __all__ = ["fetch"] -from os.path import isdir, isfile, join, getsize -import os import io +import os +from os.path import getsize, isdir, isfile, join import requests -from .check import assert_valid_response - +from biotite.database.alphafold.check import assert_valid_response _fetch_url = "https://alphafold.com/api/prediction" def fetch(ids, target_path=None, format="pdb", overwrite=False, verbose=False): """ - Download predicted protein structures from the AlphaFold DB. + Download predicted protein structures from the AlphaFold DB. This function requires an internet connection. @@ -78,23 +77,19 @@ def fetch(ids, target_path=None, format="pdb", overwrite=False, verbose=False): for i, id in enumerate(ids): # Verbose output if verbose: - print(f"Fetching file {i + 1:d} / {len(ids):d} ({id})...", - end="\r") + print(f"Fetching file {i + 1:d} / {len(ids):d} ({id})...", end="\r") # Fetch file from database if target_path is not None: file = join(target_path, id + "." 
+ format) else: # 'file = None' -> store content in a file-like object file = None - if file is None \ - or not isfile(file) \ - or getsize(file) == 0 \ - or overwrite: + if file is None or not isfile(file) or getsize(file) == 0 or overwrite: if format in ["pdb", "cif"]: metadata_response = requests.get(f"{_fetch_url}/{id}") assert_valid_response(metadata_response.status_code) metadata_json = metadata_response.json()[0] - # a list of length 1 is always returned + # a list of length 1 is always returned file_url = metadata_json[f"{format}Url"] file_response = requests.get(file_url) assert_valid_response(file_response.status_code) @@ -114,8 +109,3 @@ def fetch(ids, target_path=None, format="pdb", overwrite=False, verbose=False): return files[0] else: return files - - - - - diff --git a/tests/database/test_alphafold.py b/tests/database/test_alphafold.py index cce08a1d2..9b785e97d 100644 --- a/tests/database/test_alphafold.py +++ b/tests/database/test_alphafold.py @@ -9,62 +9,49 @@ import biotite.structure.io.pdb as pdb import biotite.structure.io.pdbx as pdbx from biotite.database import RequestError -from ..util import cannot_connect_to - +from tests.util import cannot_connect_to ALPHAFOLD_URL = "https://alphafold.com/" @pytest.mark.skipif( - cannot_connect_to(ALPHAFOLD_URL), - reason="AlphaFold is not available" -) -@pytest.mark.parametrize( - "as_file_like", - itertools.product([False, True]) + cannot_connect_to(ALPHAFOLD_URL), reason="AlphaFold is not available" ) +@pytest.mark.parametrize("as_file_like", itertools.product([False, True])) def test_fetch(as_file_like): path = None if as_file_like else tempfile.gettempdir() - file = alphafold.fetch( - "P12345", path, overwrite=True - ) + file = alphafold.fetch("P12345", path, overwrite=True) pdb_file = pdb.PDBFile.read(file) array_stack = pdb_file.get_structure() assert len(array_stack) > 0 - + @pytest.mark.skipif( - cannot_connect_to(ALPHAFOLD_URL), - reason="AlphaFold is not available" + 
cannot_connect_to(ALPHAFOLD_URL), reason="AlphaFold is not available" ) @pytest.mark.parametrize("format", ["pdb", "cif"]) def test_fetch_invalid(format): with pytest.raises(RequestError): - file = alphafold.fetch( + alphafold.fetch( "XYZ", target_path=tempfile.gettempdir(), format=format, overwrite=True ) @pytest.mark.skipif( - cannot_connect_to(ALPHAFOLD_URL), - reason="AlphaFold is not available" + cannot_connect_to(ALPHAFOLD_URL), reason="AlphaFold is not available" ) @pytest.mark.parametrize("format", ["pdb", "cif"]) def test_fetch_multiple(format): - acc = ["P12345", "P12345"] - files = alphafold.fetch( - acc, target_path=tempfile.gettempdir(), format=format, overwrite=True - ) - print(files) - for file in files: - if format == "pdb": - pdb_file = pdb.PDBFile.read(file) - structure = pdb_file.get_structure() - assert len(structure) > 0 - elif format == "cif": - cif_file = pdbx.PDBxFile.read(file) - assert "citation_author" in cif_file.keys() - - - - + acc = ["P12345", "P12345"] + files = alphafold.fetch( + acc, target_path=tempfile.gettempdir(), format=format, overwrite=True + ) + print(files) + for file in files: + if format == "pdb": + pdb_file = pdb.PDBFile.read(file) + structure = pdb_file.get_structure() + assert len(structure) > 0 + elif format == "cif": + cif_file = pdbx.PDBxFile.read(file) + assert "citation_author" in cif_file.keys() From 73bfa992606c679773d5dbc3111eb9682d32f664 Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Sat, 25 Jan 2025 15:38:21 +0100 Subject: [PATCH 5/9] Finalize AFDB interface --- src/biotite/database/alphafold/download.py | 156 +++++++++++++++------ tests/database/test_alphafold.py | 80 ++++++----- 2 files changed, 165 insertions(+), 71 deletions(-) diff --git a/src/biotite/database/alphafold/download.py b/src/biotite/database/alphafold/download.py index 498d905cf..bf39e1eee 100644 --- a/src/biotite/database/alphafold/download.py +++ b/src/biotite/database/alphafold/download.py @@ -3,19 +3,25 @@ # information. 
__name__ = "biotite.database.alphafold" -__author__ = "Alex Carlin" +__author__ = "Patrick Kunzmann, Alex Carlin" __all__ = ["fetch"] import io -import os -from os.path import getsize, isdir, isfile, join +import re +from pathlib import Path +from xml.etree import ElementTree import requests -from biotite.database.alphafold.check import assert_valid_response +from biotite.database.error import RequestError -_fetch_url = "https://alphafold.com/api/prediction" +_METADATA_URL = "https://alphafold.com/api/prediction" +_BINARY_FORMATS = ["bcif"] +# Adopted from https://www.uniprot.org/help/accession_numbers +_UNIPROT_PATTERN = ( + "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}" +) -def fetch(ids, target_path=None, format="pdb", overwrite=False, verbose=False): +def fetch(ids, format="pdb", target_path=None, overwrite=False, verbose=False): """ Download predicted protein structures from the AlphaFold DB. @@ -24,22 +30,19 @@ def fetch(ids, target_path=None, format="pdb", overwrite=False, verbose=False): Parameters ---------- ids : str or iterable object of str - A single ID or a list of IDs of the file(s) - to be downloaded. + A single ID or a list of IDs of the file(s) to be downloaded. + format : {'pdb', 'pdbx', 'cif', 'mmcif', 'bcif', 'fasta'} + The format of the files to be downloaded. target_path : str, optional The target directory of the downloaded files. By default, the file content is stored in a file-like object (`StringIO` or `BytesIO`, respectively). - format : {"pdb", "cif", "bcif"} - The format of the files to be downloaded. overwrite : bool, optional - If true, existing files will be overwritten. Otherwise the - respective file will only be downloaded if the file does not - exist yet in the specified target directory or if the file is - empty. (Default: False) + If true, existing files will be overwritten. 
+ Otherwise the respective file will only be downloaded if the file does not + exist yet in the specified target directory or if the file is empty. verbose: bool, optional If true, the function will output the download progress. - (Default: False) Returns ------- @@ -55,13 +58,17 @@ def fetch(ids, target_path=None, format="pdb", overwrite=False, verbose=False): -------- >>> import os.path - >>> file = fetch("P12345", path_to_directory) - >>> print(os.path.basename(file)) - P12345.pdb - >>> files = fetch(["P12345", "Q8K9I1"], path_to_directory) - >>> print([os.path.basename(file) for file in files]) - ['P12345.pdb', 'Q8K9I1.pdb'] + >>> file = fetch("P12345", "cif", path_to_directory) + >>> print(Path(file).name) + P12345.cif + >>> files = fetch(["P12345", "Q8K9I1"], "cif", path_to_directory) + >>> print([Path(file).name for file in files]) + ['P12345.cif', 'Q8K9I1.cif'] """ + if format not in ["pdb", "pdbx", "cif", "mmcif", "bcif", "fasta"]: + raise ValueError(f"Format '{format}' is not supported") + if format in ["pdbx", "mmcif"]: + format = "cif" # If only a single ID is present, # put it into a single element list @@ -70,9 +77,10 @@ def fetch(ids, target_path=None, format="pdb", overwrite=False, verbose=False): single_element = True else: single_element = False - # Create the target folder, if not existing - if target_path is not None and not isdir(target_path): - os.makedirs(target_path) + if target_path is not None: + target_path = Path(target_path) + target_path.mkdir(parents=True, exist_ok=True) + files = [] for i, id in enumerate(ids): # Verbose output @@ -80,32 +88,102 @@ def fetch(ids, target_path=None, format="pdb", overwrite=False, verbose=False): print(f"Fetching file {i + 1:d} / {len(ids):d} ({id})...", end="\r") # Fetch file from database if target_path is not None: - file = join(target_path, id + "." 
+ format) + file = target_path / f"{id}.{format}" else: # 'file = None' -> store content in a file-like object file = None - if file is None or not isfile(file) or getsize(file) == 0 or overwrite: - if format in ["pdb", "cif"]: - metadata_response = requests.get(f"{_fetch_url}/{id}") - assert_valid_response(metadata_response.status_code) - metadata_json = metadata_response.json()[0] - # a list of length 1 is always returned - file_url = metadata_json[f"{format}Url"] - file_response = requests.get(file_url) - assert_valid_response(file_response.status_code) - content = file_response.text + if file is None or not file.is_file() or file.stat().st_size == 0 or overwrite: + file_response = requests.get(_get_file_url(id, format)) + _assert_valid_file(file_response, id) + if format in _BINARY_FORMATS: + content = file_response.content else: - raise ValueError(f"Format '{format}' is not supported") + content = file_response.text + if file is None: - file = io.StringIO(content) + if format in _BINARY_FORMATS: + file = io.BytesIO(content) + else: + file = io.StringIO(content) else: - with open(file, "w+") as f: + mode = "wb+" if format in _BINARY_FORMATS else "w+" + with open(file, mode) as f: f.write(content) + files.append(file) if verbose: print("\nDone") - # If input was a single ID, return only a single path + + # Return paths as strings + files = [file.as_posix() if isinstance(file, Path) else file for file in files] + # If input was a single ID, return only a single element if single_element: return files[0] else: return files + + +def _get_file_url(id, format): + """ + Get the actual file URL for the given ID from the ``prediction`` API endpoint. + + Parameters + ---------- + id : str + The ID of the file to be downloaded. + format : str + The format of the file to be downloaded. + + Returns + ------- + file_url : str + The URL of the file to be downloaded. 
+ """ + uniprot_id = _extract_id(id) + metadata = requests.get(f"{_METADATA_URL}/{uniprot_id}").json() + if len(metadata) == 0: + raise RequestError(f"ID {id} is invalid") + # A list of length 1 is always returned, if the response is valid + return metadata[0][f"{format}Url"] + + +def _extract_id(id): + """ + Extract a AFDB compatible UniProt ID from the given qualifier. + This may comprise + + - Directly the UniProt ID (e.g. ``P12345``) (trivial case) + - Entry ID, as also returned by the RCSB search API (e.g. ``AF-P12345-F1``) + + Parameters + ---------- + id : str + The qualifier to extract the UniProt ID from. + + Returns + ------- + uniprot_id : str + The UniProt ID. + """ + match = re.search(_UNIPROT_PATTERN, id) + if match is None: + raise ValueError(f"Cannot extract AFDB identifier from '{id}'") + return match.group() + + +def _assert_valid_file(response, id): + """ + Checks whether the response is an actual structure file + or the response a *404* error due to invalid UniProt ID. + """ + if len(response.text) == 0: + raise RequestError(f"Received no repsone for '{id}'") + try: + root = ElementTree.fromstring(response.text) + if root.tag == "Error": + raise RequestError( + f"Error while fetching '{id}': {root.find('Message').text}" + ) + except ElementTree.ParseError: + # This is not XML -> the response is probably a valid file + pass diff --git a/tests/database/test_alphafold.py b/tests/database/test_alphafold.py index 9b785e97d..a811217c5 100644 --- a/tests/database/test_alphafold.py +++ b/tests/database/test_alphafold.py @@ -2,7 +2,6 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. 
-import itertools import tempfile import pytest import biotite.database.alphafold as alphafold @@ -11,47 +10,64 @@ from biotite.database import RequestError from tests.util import cannot_connect_to -ALPHAFOLD_URL = "https://alphafold.com/" +ALPHAFOLD_URL = "https://alphafold.ebi.ac.uk/" @pytest.mark.skipif( - cannot_connect_to(ALPHAFOLD_URL), reason="AlphaFold is not available" + cannot_connect_to(ALPHAFOLD_URL), reason="AlphaFold DB is not available" ) -@pytest.mark.parametrize("as_file_like", itertools.product([False, True])) -def test_fetch(as_file_like): +@pytest.mark.parametrize("as_file_like", [False, True]) +@pytest.mark.parametrize("entry_id", ["P12345", "AF-P12345-F1"]) +@pytest.mark.parametrize("format", ["pdb", "cif", "bcif"]) +def test_fetch(as_file_like, entry_id, format): + """ + Check if files in different formats can be downloaded by being able to parse them. + """ path = None if as_file_like else tempfile.gettempdir() - file = alphafold.fetch("P12345", path, overwrite=True) - pdb_file = pdb.PDBFile.read(file) - array_stack = pdb_file.get_structure() - assert len(array_stack) > 0 + file_path_or_obj = alphafold.fetch(entry_id, format, path, overwrite=True) + if format == "pdb": + file = pdb.PDBFile.read(file_path_or_obj) + pdb.get_structure(file) + elif format == "cif": + file = pdbx.CIFFile.read(file_path_or_obj) + pdbx.get_structure(file) + elif format == "bcif": + file = pdbx.BinaryCIFFile.read(file_path_or_obj) + pdbx.get_structure(file) @pytest.mark.skipif( - cannot_connect_to(ALPHAFOLD_URL), reason="AlphaFold is not available" + cannot_connect_to(ALPHAFOLD_URL), reason="AlphaFold DB is not available" ) -@pytest.mark.parametrize("format", ["pdb", "cif"]) -def test_fetch_invalid(format): - with pytest.raises(RequestError): - alphafold.fetch( - "XYZ", target_path=tempfile.gettempdir(), format=format, overwrite=True - ) +def test_fetch_multiple(): + """ + Check if multiple files can be downloaded by being able to parse them. 
+ """ + ids = ["P12345", "Q8K9I1"] + files = alphafold.fetch(ids, "cif", tempfile.gettempdir(), overwrite=True) + for file in files: + assert "citation_author" in pdbx.CIFFile.read(file).block @pytest.mark.skipif( - cannot_connect_to(ALPHAFOLD_URL), reason="AlphaFold is not available" + cannot_connect_to(ALPHAFOLD_URL), reason="AlphaFold DB is not available" ) -@pytest.mark.parametrize("format", ["pdb", "cif"]) -def test_fetch_multiple(format): - acc = ["P12345", "P12345"] - files = alphafold.fetch( - acc, target_path=tempfile.gettempdir(), format=format, overwrite=True - ) - print(files) - for file in files: - if format == "pdb": - pdb_file = pdb.PDBFile.read(file) - structure = pdb_file.get_structure() - assert len(structure) > 0 - elif format == "cif": - cif_file = pdbx.PDBxFile.read(file) - assert "citation_author" in cif_file.keys() +@pytest.mark.parametrize("format", ["pdb", "cif", "bcif"]) +@pytest.mark.parametrize("invalid_id", ["", "XYZ", "A0A12345"]) +@pytest.mark.parametrize("bypass_metadata", [False, True]) +def test_fetch_invalid(monkeypatch, format, invalid_id, bypass_metadata): + """ + Check if proper exceptions are raised if a given ID is invalid. + Also check whether the check works on the file retrieval level via + :func:`_get_file_url()`, by bypassing the metadata check. 
+ """ + import biotite.database.alphafold.download as module + + if bypass_metadata: + monkeypatch.setattr( + module, + "_get_file_url", + lambda id, f: f"https://alphafold.ebi.ac.uk/files/AF-{id}-F1-model_v4.{f}", + ) + with pytest.raises((RequestError, ValueError)): + alphafold.fetch(invalid_id, format) From ad7a08bb0797493dd90cb49814c221bfce6bbbda Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Sat, 25 Jan 2025 15:41:26 +0100 Subject: [PATCH 6/9] Rename 'alphafold' subpackage to 'afdb' --- .../database/{alphafold => afdb}/__init__.py | 4 ++-- .../database/{alphafold => afdb}/download.py | 2 +- .../{test_alphafold.py => test_afdb.py} | 24 +++++++------------ 3 files changed, 12 insertions(+), 18 deletions(-) rename src/biotite/database/{alphafold => afdb}/__init__.py (65%) rename src/biotite/database/{alphafold => afdb}/download.py (99%) rename tests/database/{test_alphafold.py => test_afdb.py} (75%) diff --git a/src/biotite/database/alphafold/__init__.py b/src/biotite/database/afdb/__init__.py similarity index 65% rename from src/biotite/database/alphafold/__init__.py rename to src/biotite/database/afdb/__init__.py index 498422158..d1357138a 100644 --- a/src/biotite/database/alphafold/__init__.py +++ b/src/biotite/database/afdb/__init__.py @@ -3,10 +3,10 @@ # information. """ -A subpackage for downloading protein structures from the AlphaFold DB +A subpackage for downloading predicted protein structures from the AlphaFold DB. """ -__name__ = "biotite.database.alphafold" +__name__ = "biotite.database.afdb" __author__ = "Alex Carlin" from .download import * diff --git a/src/biotite/database/alphafold/download.py b/src/biotite/database/afdb/download.py similarity index 99% rename from src/biotite/database/alphafold/download.py rename to src/biotite/database/afdb/download.py index bf39e1eee..0504df67b 100644 --- a/src/biotite/database/alphafold/download.py +++ b/src/biotite/database/afdb/download.py @@ -2,7 +2,7 @@ # under the 3-Clause BSD License. 
Please see 'LICENSE.rst' for further # information. -__name__ = "biotite.database.alphafold" +__name__ = "biotite.database.afdb" __author__ = "Patrick Kunzmann, Alex Carlin" __all__ = ["fetch"] diff --git a/tests/database/test_alphafold.py b/tests/database/test_afdb.py similarity index 75% rename from tests/database/test_alphafold.py rename to tests/database/test_afdb.py index a811217c5..e01aa661e 100644 --- a/tests/database/test_alphafold.py +++ b/tests/database/test_afdb.py @@ -4,18 +4,16 @@ import tempfile import pytest -import biotite.database.alphafold as alphafold +import biotite.database.afdb as afdb import biotite.structure.io.pdb as pdb import biotite.structure.io.pdbx as pdbx from biotite.database import RequestError from tests.util import cannot_connect_to -ALPHAFOLD_URL = "https://alphafold.ebi.ac.uk/" +AFDB_URL = "https://alphafold.ebi.ac.uk/" -@pytest.mark.skipif( - cannot_connect_to(ALPHAFOLD_URL), reason="AlphaFold DB is not available" -) +@pytest.mark.skipif(cannot_connect_to(AFDB_URL), reason="AlphaFold DB is not available") @pytest.mark.parametrize("as_file_like", [False, True]) @pytest.mark.parametrize("entry_id", ["P12345", "AF-P12345-F1"]) @pytest.mark.parametrize("format", ["pdb", "cif", "bcif"]) @@ -24,7 +22,7 @@ def test_fetch(as_file_like, entry_id, format): Check if files in different formats can be downloaded by being able to parse them. 
""" path = None if as_file_like else tempfile.gettempdir() - file_path_or_obj = alphafold.fetch(entry_id, format, path, overwrite=True) + file_path_or_obj = afdb.fetch(entry_id, format, path, overwrite=True) if format == "pdb": file = pdb.PDBFile.read(file_path_or_obj) pdb.get_structure(file) @@ -36,22 +34,18 @@ def test_fetch(as_file_like, entry_id, format): pdbx.get_structure(file) -@pytest.mark.skipif( - cannot_connect_to(ALPHAFOLD_URL), reason="AlphaFold DB is not available" -) +@pytest.mark.skipif(cannot_connect_to(AFDB_URL), reason="AlphaFold DB is not available") def test_fetch_multiple(): """ Check if multiple files can be downloaded by being able to parse them. """ ids = ["P12345", "Q8K9I1"] - files = alphafold.fetch(ids, "cif", tempfile.gettempdir(), overwrite=True) + files = afdb.fetch(ids, "cif", tempfile.gettempdir(), overwrite=True) for file in files: assert "citation_author" in pdbx.CIFFile.read(file).block -@pytest.mark.skipif( - cannot_connect_to(ALPHAFOLD_URL), reason="AlphaFold DB is not available" -) +@pytest.mark.skipif(cannot_connect_to(AFDB_URL), reason="AlphaFold DB is not available") @pytest.mark.parametrize("format", ["pdb", "cif", "bcif"]) @pytest.mark.parametrize("invalid_id", ["", "XYZ", "A0A12345"]) @pytest.mark.parametrize("bypass_metadata", [False, True]) @@ -61,7 +55,7 @@ def test_fetch_invalid(monkeypatch, format, invalid_id, bypass_metadata): Also check whether the check works on the file retrieval level via :func:`_get_file_url()`, by bypassing the metadata check. 
""" - import biotite.database.alphafold.download as module + import biotite.database.afdb.download as module if bypass_metadata: monkeypatch.setattr( @@ -70,4 +64,4 @@ def test_fetch_invalid(monkeypatch, format, invalid_id, bypass_metadata): lambda id, f: f"https://alphafold.ebi.ac.uk/files/AF-{id}-F1-model_v4.{f}", ) with pytest.raises((RequestError, ValueError)): - alphafold.fetch(invalid_id, format) + afdb.fetch(invalid_id, format) From c5b1bc26e7ce48276d66e7fc820af3035795efcc Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Sun, 26 Jan 2025 11:27:44 +0100 Subject: [PATCH 7/9] Fix docstring --- src/biotite/database/afdb/download.py | 16 +++++++++------- tests/test_doctest.py | 8 ++++++++ 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/src/biotite/database/afdb/download.py b/src/biotite/database/afdb/download.py index 0504df67b..0225ab1b2 100644 --- a/src/biotite/database/afdb/download.py +++ b/src/biotite/database/afdb/download.py @@ -21,7 +21,7 @@ ) -def fetch(ids, format="pdb", target_path=None, overwrite=False, verbose=False): +def fetch(ids, format, target_path=None, overwrite=False, verbose=False): """ Download predicted protein structures from the AlphaFold DB. @@ -31,6 +31,8 @@ def fetch(ids, format="pdb", target_path=None, overwrite=False, verbose=False): ---------- ids : str or iterable object of str A single ID or a list of IDs of the file(s) to be downloaded. + They can be either UniProt IDs (e.g. ``P12345``) or AlphaFold DB IDs + (e.g. ``AF-P12345F1``). format : {'pdb', 'pdbx', 'cif', 'mmcif', 'bcif', 'fasta'} The format of the files to be downloaded. target_path : str, optional @@ -48,16 +50,16 @@ def fetch(ids, format="pdb", target_path=None, overwrite=False, verbose=False): ------- files : str or StringIO or BytesIO or list of (str or StringIO or BytesIO) The file path(s) to the downloaded files. - If a single string (a single ID) was given in `ids`, - a single string is returned. 
If a list (or other iterable - object) was given, a list of strings is returned. - If no `target_path` was given, the file contents are stored in - either `StringIO` or `BytesIO` objects. + If a single string (a single ID) was given in `ids`, a single string is + returned. + If a list (or other iterable object) was given, a list of strings is returned. + If no `target_path` was given, the file contents are stored in either + ``StringIO`` or ``BytesIO`` objects. Examples -------- - >>> import os.path + >>> from pathlib import Path >>> file = fetch("P12345", "cif", path_to_directory) >>> print(Path(file).name) P12345.cif diff --git a/tests/test_doctest.py b/tests/test_doctest.py index 6da470bf8..10eb4c803 100644 --- a/tests/test_doctest.py +++ b/tests/test_doctest.py @@ -15,6 +15,7 @@ NCBI_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/" RCSB_URL = "https://www.rcsb.org/" +AFDB_URL = "https://alphafold.ebi.ac.uk/" UNIPROT_URL = "https://www.uniprot.org/" PUBCHEM_URL = "https://pubchem.ncbi.nlm.nih.gov/" @@ -82,6 +83,13 @@ cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available" ), ), + pytest.param( + "biotite.database.afdb", + [], + marks=pytest.mark.skipif( + cannot_connect_to(AFDB_URL), reason="AlphaFold DB is not available" + ), + ), pytest.param( "biotite.database.uniprot", [], From 5b0259f29170619c7e13ba8e22fde20946f718b3 Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Sun, 26 Jan 2025 11:28:02 +0100 Subject: [PATCH 8/9] Add tutorial for AFDB --- doc/tutorial/database/rcsb.rst | 46 ++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/doc/tutorial/database/rcsb.rst b/doc/tutorial/database/rcsb.rst index 8057346aa..e48bab1c7 100644 --- a/doc/tutorial/database/rcsb.rst +++ b/doc/tutorial/database/rcsb.rst @@ -128,3 +128,49 @@ Note that grouping may omit PDB IDs in search results, if such PDB IDs cannot be grouped. 
For example in the case shown above only a few PDB entries were uploaded
as collection and hence are part of the search results.
+
+Getting computational models
+----------------------------
+By default :func:`search()` only returns experimental structures.
+In addition to that the RCSB lists an order of magnitude more computational models.
+They can be included in search results by adding ``"computational"`` to the
+``content_types`` parameter.
+
+.. jupyter-execute::
+
+    query = (
+        rcsb.FieldQuery("rcsb_polymer_entity.pdbx_description", contains_phrase="Hexokinase")
+        & rcsb.FieldQuery(
+            "rcsb_entity_source_organism.scientific_name", exact_match="Homo sapiens"
+        )
+    )
+    ids = rcsb.search(query, content_types=("experimental", "computational"))
+    print(ids)
+
+The returned four-character IDs are the RCSB PDB IDs of experimental structures
+like we already saw above.
+The IDs with the ``AF_`` prefix on the other hand are computational models from
+*AlphaFold DB*.
+
+.. currentmodule:: biotite.database.afdb
+
+To download those we require another subpackage: :mod:`biotite.database.afdb`.
+Its :func:`fetch()` function works very similarly.
+
+.. 
jupyter-execute:: + + import biotite.database.afdb as afdb + + files = [] + # For the sake of run time, only download the first 5 entries + for id in ids[:5]: + if id.startswith("AF_"): + # Entry is in AlphaFold DB + files.append(afdb.fetch(id, "cif", gettempdir())) + elif id.startswith("MA_"): + # Entry is in ModelArchive, which is not yet supported + raise NotImplementedError + else: + # Entry is in RCSB PDB + files.append(rcsb.fetch(id, "cif", gettempdir())) + print([basename(file) for file in files]) \ No newline at end of file From 6de13651ed4584bcb7738e6ce3a9894771ac0170 Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Sat, 1 Feb 2025 19:10:12 +0100 Subject: [PATCH 9/9] Add example --- .../scripts/structure/modeling/model_lddt.py | 129 ++++++++++++++++++ 1 file changed, 129 insertions(+) create mode 100644 doc/examples/scripts/structure/modeling/model_lddt.py diff --git a/doc/examples/scripts/structure/modeling/model_lddt.py b/doc/examples/scripts/structure/modeling/model_lddt.py new file mode 100644 index 000000000..05b6825fd --- /dev/null +++ b/doc/examples/scripts/structure/modeling/model_lddt.py @@ -0,0 +1,129 @@ +r""" +LDDT for predicted structure evaluation +======================================= + +This example evaluates the quality of a predicted structure from *AlphaFold DB* compared +to the experimental structure of a protein of interest by the means of the lDDT score. +Furthermore, the measured lDDT score is compared to the pLDDT score predicted by the +model. 
+""" + +# Code source: Patrick Kunzmann +# License: BSD 3 clause + +import matplotlib.pyplot as plt +import numpy as np +import biotite +import biotite.database.afdb as afdb +import biotite.database.rcsb as rcsb +import biotite.sequence as seq +import biotite.sequence.align as align +import biotite.structure as struc +import biotite.structure.io.pdbx as pdbx + +# Uniprot ID of the protein of interest (in this case human beta-galactosidase) +UNIPROT_ID = "P16278" + + +## Get the reference experimental structure from the PDB +query = rcsb.FieldQuery( + "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_accession", + exact_match=UNIPROT_ID, +) +# The UniProt ID is defined for a single chain +ids = rcsb.search(query, return_type="polymer_instance") +# Simply use the first matching chain as reference +pdb_id, chain_id = ids[0].split(".") +pdbx_file = pdbx.BinaryCIFFile.read(rcsb.fetch(pdb_id, "bcif")) +reference = pdbx.get_structure(pdbx_file, model=1, use_author_fields=False) +reference = reference[reference.chain_id == chain_id] +# The experimental structure may contain additional small molecules +# (e.g. water, ions etc.) 
that are not part of the predicted structure
+reference = reference[struc.filter_amino_acids(reference)]
+
+
+## Get the predicted structure from AlphaFold DB
+pdbx_file = pdbx.BinaryCIFFile.read(afdb.fetch(UNIPROT_ID, "bcif"))
+# Use 'label_' fields to make sure the residue ID is the same as given in the
+# `ma_qa_metric_local` category, where the pLDDT is obtained from
+model = pdbx.get_structure(pdbx_file, model=1, use_author_fields=False)
+
+
+## Filter the structures to common atoms that are present in both structures
+reference_sequence = struc.to_sequence(reference)[0][0]
+model_sequence = struc.to_sequence(model)[0][0]
+# This script does not rely on consistent residue numbering,
+# so a sequence alignment is done instead
+identity_matrix = align.SubstitutionMatrix(
+    seq.ProteinSequence.alphabet,
+    seq.ProteinSequence.alphabet,
+    np.eye(len(seq.ProteinSequence.alphabet), dtype=int),
+)
+alignment = align.align_optimal(
+    reference_sequence,
+    model_sequence,
+    # Residues might be missing due to experimental reasons but not due to homology
+    # -> use a simple identity matrix
+    identity_matrix,
+    gap_penalty=-1,
+    terminal_penalty=False,
+    max_number=1,
+)[0]
+# Remove residues from alignment
+# that have no correspondence in the respective other structure
+# -> Remove gaps (-1 entries in trace)
+alignment = alignment[(alignment.trace != -1).all(axis=1)]
+# Map the remaining alignment columns to atom indices
+reference = reference[
+    # Each mask is True for all atoms in one residue
+    struc.get_residue_masks(reference, struc.get_residue_starts(reference)) \
+    # Only keep masks for residues that correspond to remaining alignment columns
+    [alignment.trace[:,0]] \
+    # And aggregate them to get a single mask
+    .any(axis=0)
+]  # fmt: skip
+model = model[
+    struc.get_residue_masks(model, struc.get_residue_starts(model))[
+        alignment.trace[:, 1]
+    ].any(axis=0)
+]
+
+
+## Get predicted lDDT from the model file
+plddt_category = 
pdbx_file.block["ma_qa_metric_local"] +plddt_res_ids = plddt_category["label_seq_id"].as_array(int) +plddt = plddt_category["metric_value"].as_array(float) / 100 +# Remove values for residues that were removed in the alignment process +mask = np.isin(plddt_res_ids, model.res_id) +plddt_res_ids = plddt_res_ids[mask] +plddt = plddt[mask] + + +## Compute actual lDDT by comparing the model to the reference +lddt_res_ids = np.unique(model.res_id) +# The pLDDT predicts the lDDT of CA atoms, so for consistency we do the same +ca_mask = model.atom_name == "CA" +lddt = struc.lddt(reference[ca_mask], model[ca_mask], aggregation="residue") + + +## Compare predicted to measured lDDT +fig, ax = plt.subplots(figsize=(8.0, 4.0)) +ax.plot( + plddt_res_ids, + plddt, + color=biotite.colors["dimgreen"], + linestyle="-", + label="predicted", +) +ax.plot( + lddt_res_ids, + lddt, + color=biotite.colors["lightorange"], + linestyle="-", + label="measured", +) +ax.legend() +ax.set_xlabel("Residue ID") +ax.set_ylabel("lDDT") +ax.autoscale(axis="x", tight=True) +plt.show()