Skip to content

Commit

Permalink
Merge pull request #492 from dacarlin/add-alphafold-db
Browse files Browse the repository at this point in the history
Add fetching from the AlphaFold structure database
  • Loading branch information
padix-key authored Feb 2, 2025
2 parents ecc3eb7 + 6de1365 commit 4d02bb8
Show file tree
Hide file tree
Showing 6 changed files with 453 additions and 0 deletions.
129 changes: 129 additions & 0 deletions doc/examples/scripts/structure/modeling/model_lddt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
r"""
LDDT for predicted structure evaluation
=======================================
This example evaluates the quality of a predicted structure from *AlphaFold DB* compared
to the experimental structure of a protein of interest by the means of the lDDT score.
Furthermore, the measured lDDT score is compared to the pLDDT score predicted by the
model.
"""

# Code source: Patrick Kunzmann
# License: BSD 3 clause

import matplotlib.pyplot as plt
import numpy as np
import biotite
import biotite.database.afdb as afdb
import biotite.database.rcsb as rcsb
import biotite.sequence as seq
import biotite.sequence.align as align
import biotite.structure as struc
import biotite.structure.io.pdbx as pdbx

# Uniprot ID of the protein of interest (in this case human beta-galactosidase)
UNIPROT_ID = "P16278"


## Get the reference experimental structure from the PDB
query = rcsb.FieldQuery(
"rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_accession",
exact_match=UNIPROT_ID,
)
# The UniProt ID is defined for a single chain
ids = rcsb.search(query, return_type="polymer_instance")
# Simply use the first matching chain as reference
pdb_id, chain_id = ids[0].split(".")
pdbx_file = pdbx.BinaryCIFFile.read(rcsb.fetch(pdb_id, "bcif"))
reference = pdbx.get_structure(pdbx_file, model=1, use_author_fields=False)
reference = reference[reference.chain_id == chain_id]
# The experimental structure may contain additional small molecules
# (e.g. water, ions etc.) that are not part of the predicted structure
reference = reference[struc.filter_amino_acids(reference)]


## Get the predicted structure from AlphaFold DB
pdbx_file = pdbx.BinaryCIFFile.read(afdb.fetch(UNIPROT_ID, "bcif"))
# Use 'label_<x>' fields to make sure the residue ID is the the same as given in the
# `ma_qa_metric_local` category, where the pLDDT is obtained from
model = pdbx.get_structure(pdbx_file, model=1, use_author_fields=False)


## Filter the structures to common atoms that are present in both structures
reference_sequence = struc.to_sequence(reference)[0][0]
model_sequence = struc.to_sequence(model)[0][0]
# This script does not rely on consistent residue numbering,
# so a sequence alignment is done instead
identity_matrix = align.SubstitutionMatrix(
seq.ProteinSequence.alphabet,
seq.ProteinSequence.alphabet,
np.eye(len(seq.ProteinSequence.alphabet), dtype=int),
)
alignment = align.align_optimal(
reference_sequence,
model_sequence,
# Residues might be missing due to experimental reasons but not due to homology
# -> use a simple identity matrix
identity_matrix,
gap_penalty=-1,
terminal_penalty=False,
max_number=1,
)[0]
# Remove residues from alignment
# that have no correspondence in the respective other structure
# -> Remove gaps (-1 entries in trace)
alignment = alignment[(alignment.trace != -1).all(axis=1)]
# Map the remaining alignment columns to atom indices
reference = reference[
# Each mask is True for all atoms in one residue
struc.get_residue_masks(reference, struc.get_residue_starts(reference)) \
# Only keep masks for residues that correspond to remaining alignment columns
[alignment.trace[:,0]] \
# And aggregate them to get a single mask
.any(axis=0)
] # fmt: skip
model = model[
struc.get_residue_masks(model, struc.get_residue_starts(model))[
alignment.trace[:, 1]
].any(axis=0)
]


## Get predicted lDDT from the model file
plddt_category = pdbx_file.block["ma_qa_metric_local"]
plddt_res_ids = plddt_category["label_seq_id"].as_array(int)
plddt = plddt_category["metric_value"].as_array(float) / 100
# Remove values for residues that were removed in the alignment process
mask = np.isin(plddt_res_ids, model.res_id)
plddt_res_ids = plddt_res_ids[mask]
plddt = plddt[mask]


## Compute actual lDDT by comparing the model to the reference
lddt_res_ids = np.unique(model.res_id)
# The pLDDT predicts the lDDT of CA atoms, so for consistency we do the same
ca_mask = model.atom_name == "CA"
lddt = struc.lddt(reference[ca_mask], model[ca_mask], aggregation="residue")


## Compare predicted to measured lDDT
fig, ax = plt.subplots(figsize=(8.0, 4.0))
ax.plot(
plddt_res_ids,
plddt,
color=biotite.colors["dimgreen"],
linestyle="-",
label="predicted",
)
ax.plot(
lddt_res_ids,
lddt,
color=biotite.colors["lightorange"],
linestyle="-",
label="measured",
)
ax.legend()
ax.set_xlabel("Residue ID")
ax.set_ylabel("lDDT")
ax.autoscale(axis="x", tight=True)
plt.show()
46 changes: 46 additions & 0 deletions doc/tutorial/database/rcsb.rst
Original file line number Diff line number Diff line change
Expand Up @@ -128,3 +128,49 @@ Note that grouping may omit PDB IDs in search results, if such PDB IDs
cannot be grouped.
For example in the case shown above only a few PDB entries were
uploaded as collection and hence are part of the search results.

Getting computational models
----------------------------
By default :func:`search()` only returns experimental structures.
In addition to that the RCSB lists an order of magnitude more computational models.
They can be included in search results by adding ``"computational"`` to the
``content_types`` parameter.

.. jupyter-execute::

query = (
rcsb.FieldQuery("rcsb_polymer_entity.pdbx_description", contains_phrase="Hexokinase")
& rcsb.FieldQuery(
"rcsb_entity_source_organism.scientific_name", exact_match="Homo sapiens"
)
)
ids = rcsb.search(query, content_types=("experimental", "computational"))
print(ids)

The returned four-character IDs are the RCSB PDB IDs of experimental structures
like we already saw above.
The IDs with the ``AF_`` on the other hand are computational models from
*AlphaFold DB*.

.. currentmodule:: biotite.database.afdb

To download those we require another subpackage: :mod:`biotite.database.afdb`.
Its :func:`fetch()` function works very similar.

.. jupyter-execute::

import biotite.database.afdb as afdb

files = []
# For the sake of run time, only download the first 5 entries
for id in ids[:5]:
if id.startswith("AF_"):
# Entry is in AlphaFold DB
files.append(afdb.fetch(id, "cif", gettempdir()))
elif id.startswith("MA_"):
# Entry is in ModelArchive, which is not yet supported
raise NotImplementedError
else:
# Entry is in RCSB PDB
files.append(rcsb.fetch(id, "cif", gettempdir()))
print([basename(file) for file in files])
12 changes: 12 additions & 0 deletions src/biotite/database/afdb/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# This source code is part of the Biotite package and is distributed
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.

"""
A subpackage for downloading predicted protein structures from the AlphaFold DB.
"""

__name__ = "biotite.database.afdb"
__author__ = "Alex Carlin"

from .download import *
191 changes: 191 additions & 0 deletions src/biotite/database/afdb/download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
# This source code is part of the Biotite package and is distributed
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.

__name__ = "biotite.database.afdb"
__author__ = "Patrick Kunzmann, Alex Carlin"
__all__ = ["fetch"]

import io
import re
from pathlib import Path
from xml.etree import ElementTree
import requests
from biotite.database.error import RequestError

_METADATA_URL = "https://alphafold.com/api/prediction"
_BINARY_FORMATS = ["bcif"]
# Adopted from https://www.uniprot.org/help/accession_numbers
_UNIPROT_PATTERN = (
"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"
)


def fetch(ids, format, target_path=None, overwrite=False, verbose=False):
"""
Download predicted protein structures from the AlphaFold DB.
This function requires an internet connection.
Parameters
----------
ids : str or iterable object of str
A single ID or a list of IDs of the file(s) to be downloaded.
They can be either UniProt IDs (e.g. ``P12345``) or AlphaFold DB IDs
(e.g. ``AF-P12345F1``).
format : {'pdb', 'pdbx', 'cif', 'mmcif', 'bcif', 'fasta'}
The format of the files to be downloaded.
target_path : str, optional
The target directory of the downloaded files.
By default, the file content is stored in a file-like object
(`StringIO` or `BytesIO`, respectively).
overwrite : bool, optional
If true, existing files will be overwritten.
Otherwise the respective file will only be downloaded if the file does not
exist yet in the specified target directory or if the file is empty.
verbose: bool, optional
If true, the function will output the download progress.
Returns
-------
files : str or StringIO or BytesIO or list of (str or StringIO or BytesIO)
The file path(s) to the downloaded files.
If a single string (a single ID) was given in `ids`, a single string is
returned.
If a list (or other iterable object) was given, a list of strings is returned.
If no `target_path` was given, the file contents are stored in either
``StringIO`` or ``BytesIO`` objects.
Examples
--------
>>> from pathlib import Path
>>> file = fetch("P12345", "cif", path_to_directory)
>>> print(Path(file).name)
P12345.cif
>>> files = fetch(["P12345", "Q8K9I1"], "cif", path_to_directory)
>>> print([Path(file).name for file in files])
['P12345.cif', 'Q8K9I1.cif']
"""
if format not in ["pdb", "pdbx", "cif", "mmcif", "bcif", "fasta"]:
raise ValueError(f"Format '{format}' is not supported")
if format in ["pdbx", "mmcif"]:
format = "cif"

# If only a single ID is present,
# put it into a single element list
if isinstance(ids, str):
ids = [ids]
single_element = True
else:
single_element = False
if target_path is not None:
target_path = Path(target_path)
target_path.mkdir(parents=True, exist_ok=True)

files = []
for i, id in enumerate(ids):
# Verbose output
if verbose:
print(f"Fetching file {i + 1:d} / {len(ids):d} ({id})...", end="\r")
# Fetch file from database
if target_path is not None:
file = target_path / f"{id}.{format}"
else:
# 'file = None' -> store content in a file-like object
file = None
if file is None or not file.is_file() or file.stat().st_size == 0 or overwrite:
file_response = requests.get(_get_file_url(id, format))
_assert_valid_file(file_response, id)
if format in _BINARY_FORMATS:
content = file_response.content
else:
content = file_response.text

if file is None:
if format in _BINARY_FORMATS:
file = io.BytesIO(content)
else:
file = io.StringIO(content)
else:
mode = "wb+" if format in _BINARY_FORMATS else "w+"
with open(file, mode) as f:
f.write(content)

files.append(file)
if verbose:
print("\nDone")

# Return paths as strings
files = [file.as_posix() if isinstance(file, Path) else file for file in files]
# If input was a single ID, return only a single element
if single_element:
return files[0]
else:
return files


def _get_file_url(id, format):
"""
Get the actual file URL for the given ID from the ``prediction`` API endpoint.
Parameters
----------
id : str
The ID of the file to be downloaded.
format : str
The format of the file to be downloaded.
Returns
-------
file_url : str
The URL of the file to be downloaded.
"""
uniprot_id = _extract_id(id)
metadata = requests.get(f"{_METADATA_URL}/{uniprot_id}").json()
if len(metadata) == 0:
raise RequestError(f"ID {id} is invalid")
# A list of length 1 is always returned, if the response is valid
return metadata[0][f"{format}Url"]


def _extract_id(id):
"""
Extract a AFDB compatible UniProt ID from the given qualifier.
This may comprise
- Directly the UniProt ID (e.g. ``P12345``) (trivial case)
- Entry ID, as also returned by the RCSB search API (e.g. ``AF-P12345-F1``)
Parameters
----------
id : str
The qualifier to extract the UniProt ID from.
Returns
-------
uniprot_id : str
The UniProt ID.
"""
match = re.search(_UNIPROT_PATTERN, id)
if match is None:
raise ValueError(f"Cannot extract AFDB identifier from '{id}'")
return match.group()


def _assert_valid_file(response, id):
"""
Checks whether the response is an actual structure file
or the response a *404* error due to invalid UniProt ID.
"""
if len(response.text) == 0:
raise RequestError(f"Received no repsone for '{id}'")
try:
root = ElementTree.fromstring(response.text)
if root.tag == "Error":
raise RequestError(
f"Error while fetching '{id}': {root.find('Message').text}"
)
except ElementTree.ParseError:
# This is not XML -> the response is probably a valid file
pass
Loading

0 comments on commit 4d02bb8

Please sign in to comment.