Merge pull request #492 from dacarlin/add-alphafold-db

Add fetching from the AlphaFold structure database
biotite-dev · Feb 2, 2025 · 4d02bb8 · 4d02bb8
2 parents ecc3eb7 + 6de1365
commit 4d02bb8
Show file tree

Hide file tree

Showing 6 changed files with 453 additions and 0 deletions.
diff --git a/doc/examples/scripts/structure/modeling/model_lddt.py b/doc/examples/scripts/structure/modeling/model_lddt.py
@@ -0,0 +1,129 @@
+r"""
+LDDT for predicted structure evaluation
+=======================================
+
+This example evaluates the quality of a predicted structure from *AlphaFold DB* compared
+to the experimental structure of a protein of interest by the means of the lDDT score.
+Furthermore, the measured lDDT score is compared to the pLDDT score predicted by the
+model.
+"""
+
+# Code source: Patrick Kunzmann
+# License: BSD 3 clause
+
+import matplotlib.pyplot as plt
+import numpy as np
+import biotite
+import biotite.database.afdb as afdb
+import biotite.database.rcsb as rcsb
+import biotite.sequence as seq
+import biotite.sequence.align as align
+import biotite.structure as struc
+import biotite.structure.io.pdbx as pdbx
+
+# Uniprot ID of the protein of interest (in this case human beta-galactosidase)
+UNIPROT_ID = "P16278"
+
+
+## Get the reference experimental structure from the PDB
+query = rcsb.FieldQuery(
+    "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_accession",
+    exact_match=UNIPROT_ID,
+)
+# The UniProt ID is defined for a single chain
+ids = rcsb.search(query, return_type="polymer_instance")
+# Simply use the first matching chain as reference
+pdb_id, chain_id = ids[0].split(".")
+pdbx_file = pdbx.BinaryCIFFile.read(rcsb.fetch(pdb_id, "bcif"))
+reference = pdbx.get_structure(pdbx_file, model=1, use_author_fields=False)
+reference = reference[reference.chain_id == chain_id]
+# The experimental structure may contain additional small molecules
+# (e.g. water, ions etc.) that are not part of the predicted structure
+reference = reference[struc.filter_amino_acids(reference)]
+
+
+## Get the predicted structure from AlphaFold DB
+pdbx_file = pdbx.BinaryCIFFile.read(afdb.fetch(UNIPROT_ID, "bcif"))
+# Use 'label_<x>' fields to make sure the residue ID is the the same as given in the
+# `ma_qa_metric_local` category, where the pLDDT is obtained from
+model = pdbx.get_structure(pdbx_file, model=1, use_author_fields=False)
+
+
+## Filter the structures to common atoms that are present in both structures
+reference_sequence = struc.to_sequence(reference)[0][0]
+model_sequence = struc.to_sequence(model)[0][0]
+# This script does not rely on consistent residue numbering,
+# so a sequence alignment is done instead
+identity_matrix = align.SubstitutionMatrix(
+    seq.ProteinSequence.alphabet,
+    seq.ProteinSequence.alphabet,
+    np.eye(len(seq.ProteinSequence.alphabet), dtype=int),
+)
+alignment = align.align_optimal(
+    reference_sequence,
+    model_sequence,
+    # Residues might be missing due to experimental reasons but not due to homology
+    # -> use a simple identity matrix
+    identity_matrix,
+    gap_penalty=-1,
+    terminal_penalty=False,
+    max_number=1,
+)[0]
+# Remove residues from alignment
+# that have no correspondence in the respective other structure
+# -> Remove gaps (-1 entries in trace)
+alignment = alignment[(alignment.trace != -1).all(axis=1)]
+# Map the remaining alignment columns to atom indices
+reference = reference[
+    # Each mask is True for all atoms in one residue
+    struc.get_residue_masks(reference, struc.get_residue_starts(reference)) \
+    # Only keep masks for residues that correspond to remaining alignment columns
+    [alignment.trace[:,0]] \
+    # And aggregate them to get a single mask
+    .any(axis=0)
+]  # fmt: skip
+model = model[
+    struc.get_residue_masks(model, struc.get_residue_starts(model))[
+        alignment.trace[:, 1]
+    ].any(axis=0)
+]
+
+
+## Get predicted lDDT from the model file
+plddt_category = pdbx_file.block["ma_qa_metric_local"]
+plddt_res_ids = plddt_category["label_seq_id"].as_array(int)
+plddt = plddt_category["metric_value"].as_array(float) / 100
+# Remove values for residues that were removed in the alignment process
+mask = np.isin(plddt_res_ids, model.res_id)
+plddt_res_ids = plddt_res_ids[mask]
+plddt = plddt[mask]
+
+
+## Compute actual lDDT by comparing the model to the reference
+lddt_res_ids = np.unique(model.res_id)
+# The pLDDT predicts the lDDT of CA atoms, so for consistency we do the same
+ca_mask = model.atom_name == "CA"
+lddt = struc.lddt(reference[ca_mask], model[ca_mask], aggregation="residue")
+
+
+## Compare predicted to measured lDDT
+fig, ax = plt.subplots(figsize=(8.0, 4.0))
+ax.plot(
+    plddt_res_ids,
+    plddt,
+    color=biotite.colors["dimgreen"],
+    linestyle="-",
+    label="predicted",
+)
+ax.plot(
+    lddt_res_ids,
+    lddt,
+    color=biotite.colors["lightorange"],
+    linestyle="-",
+    label="measured",
+)
+ax.legend()
+ax.set_xlabel("Residue ID")
+ax.set_ylabel("lDDT")
+ax.autoscale(axis="x", tight=True)
+plt.show()
diff --git a/doc/tutorial/database/rcsb.rst b/doc/tutorial/database/rcsb.rst
@@ -128,3 +128,49 @@ Note that grouping may omit PDB IDs in search results, if such PDB IDs
 cannot be grouped.
 For example in the case shown above only a few PDB entries were
 uploaded as collection and hence are part of the search results.
+
+Getting computational models
+----------------------------
+By default :func:`search()` only returns experimental structures.
+In addition to that the RCSB lists an order of magnitude more computational models.
+They can be included in search results by adding ``"computational"`` to the
+``content_types`` parameter.
+
+.. jupyter-execute::
+
+    query = (
+        rcsb.FieldQuery("rcsb_polymer_entity.pdbx_description", contains_phrase="Hexokinase")
+        & rcsb.FieldQuery(
+            "rcsb_entity_source_organism.scientific_name", exact_match="Homo sapiens"
+        )
+    )
+    ids = rcsb.search(query, content_types=("experimental", "computational"))
+    print(ids)
+
+The returned four-character IDs are the RCSB PDB IDs of experimental structures
+like we already saw above.
+The IDs with the ``AF_`` on the other hand are computational models from
+*AlphaFold DB*.
+
+.. currentmodule:: biotite.database.afdb
+
+To download those we require another subpackage: :mod:`biotite.database.afdb`.
+Its :func:`fetch()` function works very similar.
+
+.. jupyter-execute::
+
+    import biotite.database.afdb as afdb
+
+    files = []
+    # For the sake of run time, only download the first 5 entries
+    for id in ids[:5]:
+        if id.startswith("AF_"):
+            # Entry is in AlphaFold DB
+            files.append(afdb.fetch(id, "cif", gettempdir()))
+        elif id.startswith("MA_"):
+            # Entry is in ModelArchive, which is not yet supported
+            raise NotImplementedError
+        else:
+            # Entry is in RCSB PDB
+            files.append(rcsb.fetch(id, "cif", gettempdir()))
+    print([basename(file) for file in files])
diff --git a/src/biotite/database/afdb/__init__.py b/src/biotite/database/afdb/__init__.py
@@ -0,0 +1,12 @@
+# This source code is part of the Biotite package and is distributed
+# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
+# information.
+
+"""
+A subpackage for downloading predicted protein structures from the AlphaFold DB.
+"""
+
+__name__ = "biotite.database.afdb"
+__author__ = "Alex Carlin"
+
+from .download import *
diff --git a/src/biotite/database/afdb/download.py b/src/biotite/database/afdb/download.py
@@ -0,0 +1,191 @@
+# This source code is part of the Biotite package and is distributed
+# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
+# information.
+
+__name__ = "biotite.database.afdb"
+__author__ = "Patrick Kunzmann, Alex Carlin"
+__all__ = ["fetch"]
+
+import io
+import re
+from pathlib import Path
+from xml.etree import ElementTree
+import requests
+from biotite.database.error import RequestError
+
+_METADATA_URL = "https://alphafold.com/api/prediction"
+_BINARY_FORMATS = ["bcif"]
+# Adopted from https://www.uniprot.org/help/accession_numbers
+_UNIPROT_PATTERN = (
+    "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"
+)
+
+
+def fetch(ids, format, target_path=None, overwrite=False, verbose=False):
+    """
+    Download predicted protein structures from the AlphaFold DB.
+
+    This function requires an internet connection.
+
+    Parameters
+    ----------
+    ids : str or iterable object of str
+        A single ID or a list of IDs of the file(s) to be downloaded.
+        They can be either UniProt IDs (e.g. ``P12345``) or AlphaFold DB IDs
+        (e.g. ``AF-P12345F1``).
+    format : {'pdb', 'pdbx', 'cif', 'mmcif', 'bcif', 'fasta'}
+        The format of the files to be downloaded.
+    target_path : str, optional
+        The target directory of the downloaded files.
+        By default, the file content is stored in a file-like object
+        (`StringIO` or `BytesIO`, respectively).
+    overwrite : bool, optional
+        If true, existing files will be overwritten.
+        Otherwise the respective file will only be downloaded if the file does not
+        exist yet in the specified target directory or if the file is empty.
+    verbose: bool, optional
+        If true, the function will output the download progress.
+
+    Returns
+    -------
+    files : str or StringIO or BytesIO or list of (str or StringIO or BytesIO)
+        The file path(s) to the downloaded files.
+        If a single string (a single ID) was given in `ids`, a single string is
+        returned.
+        If a list (or other iterable object) was given, a list of strings is returned.
+        If no `target_path` was given, the file contents are stored in either
+        ``StringIO`` or ``BytesIO`` objects.
+
+    Examples
+    --------
+
+    >>> from pathlib import Path
+    >>> file = fetch("P12345", "cif", path_to_directory)
+    >>> print(Path(file).name)
+    P12345.cif
+    >>> files = fetch(["P12345", "Q8K9I1"], "cif", path_to_directory)
+    >>> print([Path(file).name for file in files])
+    ['P12345.cif', 'Q8K9I1.cif']
+    """
+    if format not in ["pdb", "pdbx", "cif", "mmcif", "bcif", "fasta"]:
+        raise ValueError(f"Format '{format}' is not supported")
+    if format in ["pdbx", "mmcif"]:
+        format = "cif"
+
+    # If only a single ID is present,
+    # put it into a single element list
+    if isinstance(ids, str):
+        ids = [ids]
+        single_element = True
+    else:
+        single_element = False
+    if target_path is not None:
+        target_path = Path(target_path)
+        target_path.mkdir(parents=True, exist_ok=True)
+
+    files = []
+    for i, id in enumerate(ids):
+        # Verbose output
+        if verbose:
+            print(f"Fetching file {i + 1:d} / {len(ids):d} ({id})...", end="\r")
+        # Fetch file from database
+        if target_path is not None:
+            file = target_path / f"{id}.{format}"
+        else:
+            # 'file = None' -> store content in a file-like object
+            file = None
+        if file is None or not file.is_file() or file.stat().st_size == 0 or overwrite:
+            file_response = requests.get(_get_file_url(id, format))
+            _assert_valid_file(file_response, id)
+            if format in _BINARY_FORMATS:
+                content = file_response.content
+            else:
+                content = file_response.text
+
+            if file is None:
+                if format in _BINARY_FORMATS:
+                    file = io.BytesIO(content)
+                else:
+                    file = io.StringIO(content)
+            else:
+                mode = "wb+" if format in _BINARY_FORMATS else "w+"
+                with open(file, mode) as f:
+                    f.write(content)
+
+        files.append(file)
+    if verbose:
+        print("\nDone")
+
+    # Return paths as strings
+    files = [file.as_posix() if isinstance(file, Path) else file for file in files]
+    # If input was a single ID, return only a single element
+    if single_element:
+        return files[0]
+    else:
+        return files
+
+
+def _get_file_url(id, format):
+    """
+    Get the actual file URL for the given ID from the ``prediction`` API endpoint.
+
+    Parameters
+    ----------
+    id : str
+        The ID of the file to be downloaded.
+    format : str
+        The format of the file to be downloaded.
+
+    Returns
+    -------
+    file_url : str
+        The URL of the file to be downloaded.
+    """
+    uniprot_id = _extract_id(id)
+    metadata = requests.get(f"{_METADATA_URL}/{uniprot_id}").json()
+    if len(metadata) == 0:
+        raise RequestError(f"ID {id} is invalid")
+    # A list of length 1 is always returned, if the response is valid
+    return metadata[0][f"{format}Url"]
+
+
+def _extract_id(id):
+    """
+    Extract a AFDB compatible UniProt ID from the given qualifier.
+    This may comprise
+
+    - Directly the UniProt ID (e.g. ``P12345``) (trivial case)
+    - Entry ID, as also returned by the RCSB search API (e.g. ``AF-P12345-F1``)
+
+    Parameters
+    ----------
+    id : str
+        The qualifier to extract the UniProt ID from.
+
+    Returns
+    -------
+    uniprot_id : str
+        The UniProt ID.
+    """
+    match = re.search(_UNIPROT_PATTERN, id)
+    if match is None:
+        raise ValueError(f"Cannot extract AFDB identifier from '{id}'")
+    return match.group()
+
+
+def _assert_valid_file(response, id):
+    """
+    Checks whether the response is an actual structure file
+    or the response a *404* error due to invalid UniProt ID.
+    """
+    if len(response.text) == 0:
+        raise RequestError(f"Received no repsone for '{id}'")
+    try:
+        root = ElementTree.fromstring(response.text)
+        if root.tag == "Error":
+            raise RequestError(
+                f"Error while fetching '{id}': {root.find('Message').text}"
+            )
+    except ElementTree.ParseError:
+        # This is not XML -> the response is probably a valid file
+        pass