Skip to content

Commit

Permalink
WIP: small changes in sequence-alignment functions API
Browse files Browse the repository at this point in the history
and new options for gemmi-align
  • Loading branch information
wojdyr committed Oct 4, 2023
1 parent 937bdb0 commit a2a37f5
Show file tree
Hide file tree
Showing 7 changed files with 81 additions and 44 deletions.
3 changes: 3 additions & 0 deletions docs/align-help.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ Options:
--text-align Align characters in two strings (for testing).

Scoring (absolute values):
--blosum62 Use BLOSUM62 score matrix.
--full=y|n Use scoring meant to align partially-modelled polymer to
its full sequence (default in 1st mode).
--match=INT Match score (default: 1).
--mism=INT Mismatch penalty (default: -1).
--gapo=INT Gap opening penalty (default: -1).
Expand Down
3 changes: 2 additions & 1 deletion docs/mol.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2126,7 +2126,8 @@ is not to be imposed.
>>> st = gemmi.read_pdb('../tests/pdb1gdr.ent', max_line_length=72)
>>> result = gemmi.align_sequence_to_polymer(st.entities[0].full_sequence,
... st[0][0].get_polymer(),
... gemmi.PolymerType.PeptideL)
... gemmi.PolymerType.PeptideL,
... gemmi.AlignmentScoring())

The arguments of this functions are: sequence (a list of residue names),
:ref:`ResidueSpan <residuespan>` (a span of residues in a chain),
Expand Down
25 changes: 14 additions & 11 deletions include/gemmi/align.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,17 @@ namespace gemmi {
// helper function for sequence alignment
inline std::vector<int> prepare_target_gapo(const ConstResidueSpan& polymer,
PolymerType polymer_type,
int default_gapo) {
const AlignmentScoring& scoring) {
std::vector<int> gaps;
gaps.reserve(polymer.size());
gaps.push_back(0); // free gap opening at the beginning of sequence
if (is_polypeptide(polymer_type) || is_polynucleotide(polymer_type)) {
auto first_conformer = polymer.first_conformer();
auto res = first_conformer.begin();
for (auto next_res = res; ++next_res != first_conformer.end(); res = next_res)
gaps.push_back(are_connected3(*res, *next_res, polymer_type) ? default_gapo : 0);
for (auto next_res = res; ++next_res != first_conformer.end(); res = next_res) {
bool connected = are_connected3(*res, *next_res, polymer_type);
gaps.push_back(connected ? scoring.bad_gapo : scoring.good_gapo);
}
gaps.push_back(0); // free gap after the end of chain
}
return gaps;
Expand All @@ -36,9 +38,11 @@ inline AlignmentResult align_sequence_to_polymer(
const std::vector<std::string>& full_seq,
const ConstResidueSpan& polymer,
PolymerType polymer_type,
const AlignmentScoring& scoring) {
const AlignmentScoring* scoring=nullptr) {
std::map<std::string, std::uint8_t> encoding;
for (const std::string& res_name : scoring.matrix_encoding)
if (!scoring)
scoring = AlignmentScoring::full_sequence_scoring();
for (const std::string& res_name : scoring->matrix_encoding)
encoding.emplace(res_name, (std::uint8_t)encoding.size());
for (const Residue& res : polymer)
encoding.emplace(res.name, (std::uint8_t)encoding.size());
Expand All @@ -57,8 +61,8 @@ inline AlignmentResult align_sequence_to_polymer(
encoded_model_seq.push_back(encoding.at(res.name));

return align_sequences(encoded_full_seq, encoded_model_seq,
prepare_target_gapo(polymer, polymer_type, scoring.gapo),
(std::uint8_t)encoding.size(), scoring);
prepare_target_gapo(polymer, polymer_type, *scoring),
(std::uint8_t)encoding.size(), *scoring);
}

// check for exact match between model sequence and full sequence (SEQRES)
Expand Down Expand Up @@ -101,8 +105,7 @@ inline void assign_label_seq_to_polymer(ResidueSpan& polymer,
// sequence alignment
} else {
PolymerType ptype = get_or_check_polymer_type(ent, polymer);
AlignmentScoring scoring;
result = align_sequence_to_polymer(ent->full_sequence, polymer, ptype, scoring);
result = align_sequence_to_polymer(ent->full_sequence, polymer, ptype);
}

auto res_group = polymer.first_conformer().begin();
Expand Down Expand Up @@ -159,9 +162,9 @@ inline void prepare_positions_for_superposition(std::vector<Position>& pos1,
SupSelect sel,
char altloc='\0',
std::vector<int>* ca_offsets=nullptr) {
AlignmentScoring scoring;
AlignmentScoring scoring = prepare_blosum62_scoring();
AlignmentResult result = align_sequence_to_polymer(fixed.extract_sequence(),
movable, ptype, scoring);
movable, ptype, &scoring);
auto it1 = fixed.first_conformer().begin();
auto it2 = movable.first_conformer().begin();
std::vector<AtomNameElement> used_atoms;
Expand Down
26 changes: 19 additions & 7 deletions include/gemmi/seqalign.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,19 @@ namespace gemmi {
struct AlignmentScoring {
int match = 1;
int mismatch = -1;
int gapo = -1;
int gape = -1;
int gapo = -1; // gap opening penalty
int gape = -1; // gap extension penalty
// In a polymer in model, coordinates are used to determine expected gaps.
int good_gapo = 0; // gap opening in expected place in a polymer
int bad_gapo = -2; // gap opening that was not predicted
std::vector<std::int8_t> score_matrix;
std::vector<std::string> matrix_encoding;

/// Scoring for alignment of partially-modelled polymer to its full sequence
static AlignmentScoring* full_sequence_scoring() {
static AlignmentScoring scoring = { 100, -10000, -10000, -1, 0, -200, {}, {} };
return &scoring;
}
};

struct AlignmentResult {
Expand Down Expand Up @@ -155,7 +164,7 @@ AlignmentResult align_sequences(const std::vector<std::uint8_t>& query,
std::uint8_t m,
const AlignmentScoring& scoring) {
// generate the query profile
std::int8_t *query_profile = new std::int8_t[query.size() * m];
std::int16_t *query_profile = new std::int16_t[query.size() * m];
{
std::uint32_t mat_size = (std::uint32_t) scoring.matrix_encoding.size();
std::int32_t i = 0;
Expand Down Expand Up @@ -188,7 +197,7 @@ AlignmentResult align_sequences(const std::vector<std::uint8_t>& query,
// DP loop
for (std::int32_t i = 0; i < (std::int32_t)target.size(); ++i) {
std::uint8_t target_item = target[i];
std::int8_t *scores = &query_profile[target_item * query.size()];
std::int16_t *scores = &query_profile[target_item * query.size()];
std::uint8_t *zi = &z[i * query.size()];
std::int32_t h1 = gapoe + gape * i;
std::int32_t f = gapoe + gapoe + gape * i;
Expand Down Expand Up @@ -253,9 +262,12 @@ inline
AlignmentResult align_string_sequences(const std::vector<std::string>& query,
const std::vector<std::string>& target,
const std::vector<int>& target_gapo,
const AlignmentScoring& scoring) {
const AlignmentScoring* scoring) {
AlignmentScoring default_scoring;
if (scoring == nullptr)
scoring = &default_scoring;
std::map<std::string, std::uint8_t> encoding;
for (const std::string& res_name : scoring.matrix_encoding)
for (const std::string& res_name : scoring->matrix_encoding)
encoding.emplace(res_name, (std::uint8_t)encoding.size());
for (const std::string& s : query)
encoding.emplace(s, (std::uint8_t)encoding.size());
Expand All @@ -270,7 +282,7 @@ AlignmentResult align_string_sequences(const std::vector<std::string>& query,
for (size_t i = 0; i != target.size(); ++i)
encoded_target[i] = encoding.at(target[i]);
return align_sequences(encoded_query, encoded_target,
target_gapo, (std::uint8_t)encoding.size(), scoring);
target_gapo, (std::uint8_t)encoding.size(), *scoring);
}

inline AlignmentScoring prepare_blosum62_scoring() {
Expand Down
45 changes: 28 additions & 17 deletions prog/align.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
// This program compares sequence from SEQRES and from the model.

#include <gemmi/model.hpp>
#include <gemmi/polyheur.hpp> // for setup_entities, align_sequence_to_polymer
#include <gemmi/polyheur.hpp> // for setup_entities
#include <gemmi/align.hpp> // for align_sequence_to_polymer
#include <gemmi/seqalign.hpp> // for align_string_sequences
#include <gemmi/mmread_gz.hpp>
Expand All @@ -17,7 +17,7 @@ namespace {

using std::printf;

enum OptionIndex { Match=4, Mismatch, GapOpen, GapExt,
enum OptionIndex { Match=4, Mismatch, GapOpen, GapExt, Blosum62, FullScores,
CheckMmcif, PrintOneLetter, Query, Target, TextAlign, Rmsd };

const option::Descriptor Usage[] = {
Expand Down Expand Up @@ -49,6 +49,11 @@ const option::Descriptor Usage[] = {
" --text-align \tAlign characters in two strings (for testing)." },

{ NoOp, 0, "", "", Arg::None, "\nScoring (absolute values):" },
{ Blosum62, 0, "", "blosum62", Arg::None,
" --blosum62 \tUse BLOSUM62 score matrix." },
{ FullScores, 0, "", "full", Arg::YesNo,
" --full=y|n \tUse scoring meant to align partially-modelled polymer"
" to its full sequence (default in 1st mode)." },
{ Match, 0, "", "match", Arg::Int,
" --match=INT \tMatch score (default: 1)." },
{ Mismatch, 0, "", "mism", Arg::Int,
Expand All @@ -72,7 +77,8 @@ void print_alignment_details(const gemmi::AlignmentResult& result,
const std::string& chain_name,
const gemmi::ConstResidueSpan& polymer,
const gemmi::Entity& ent) {
std::vector<int> gaps = prepare_target_gapo(polymer, ent.polymer_type, -1);
std::vector<int> gaps = prepare_target_gapo(polymer, ent.polymer_type,
gemmi::AlignmentScoring());
auto gap = gaps.begin();
int seq_pos = 0;
auto model_residues = polymer.first_conformer();
Expand Down Expand Up @@ -206,7 +212,18 @@ std::vector<std::string> string_to_vector(const std::string& s) {
int GEMMI_MAIN(int argc, char **argv) {
OptParser p(EXE_NAME);
p.simple_parse(argc, argv, Usage);
p.check_exclusive_pair(Blosum62, FullScores);
if ((bool)p.options[Query] != (bool)p.options[Target]) {
std::fputs("Options --query and --target must be used together.\n", stderr);
return 1;
}
p.check_exclusive_pair(TextAlign, Query);
gemmi::AlignmentScoring scoring;
if (p.options[Blosum62])
scoring = *gemmi::AlignmentScoring::full_sequence_scoring();
bool first_mode = !(p.options[TextAlign] || p.options[Query]);
if (p.is_yes(FullScores, first_mode))
scoring = gemmi::prepare_blosum62_scoring();
if (p.options[Match])
scoring.match = std::atoi(p.options[Match].arg);
if (p.options[Mismatch])
Expand All @@ -216,11 +233,6 @@ int GEMMI_MAIN(int argc, char **argv) {
if (p.options[GapExt])
scoring.gape = std::atoi(p.options[GapExt].arg);
bool verbose = p.options[Verbose];
if ((bool)p.options[Query] != (bool)p.options[Target]) {
std::fputs("Options --query and --target must be used together.\n", stderr);
return 1;
}
p.check_exclusive_pair(TextAlign, Query);

if (p.options[TextAlign]) {
p.require_positional_args(2);
Expand All @@ -230,11 +242,10 @@ int GEMMI_MAIN(int argc, char **argv) {
}
std::string text1 = p.nonOption(0);
std::string text2 = p.nonOption(1);
std::vector<int> target_gapo(1, 0);
gemmi::AlignmentResult result
= gemmi::align_string_sequences(string_to_vector(text1),
string_to_vector(text2),
target_gapo, scoring);
std::vector<int> target_gapo(1, scoring.good_gapo);
auto result = gemmi::align_string_sequences(string_to_vector(text1),
string_to_vector(text2),
target_gapo, &scoring);
printf("Score: %d CIGAR: %s\n", result.score, result.cigar_str().c_str());
if (p.options[PrintOneLetter])
print_one_letter_alignment(result, text1, text2);
Expand Down Expand Up @@ -262,9 +273,9 @@ int GEMMI_MAIN(int argc, char **argv) {
st2_ = gemmi::read_structure_gz(p.coordinate_input_file(1));
gemmi::Structure& st2 = n_files == 2 ? st2_ : st1;
if (const gemmi::Entity* ent = take_entity(st2, p.options[Target].arg)) {
std::vector<int> target_gapo(1, 0);
std::vector<int> target_gapo(1, scoring.good_gapo);
result = gemmi::align_string_sequences(query, ent->full_sequence,
target_gapo, scoring);
target_gapo, &scoring);
print_result_summary(result);
if (p.options[PrintOneLetter])
print_one_letter_alignment(result, gemmi::one_letter_code(query),
Expand All @@ -276,7 +287,7 @@ int GEMMI_MAIN(int argc, char **argv) {
if (ptype == gemmi::PolymerType::Unknown)
if (const gemmi::Entity* entity = st2.get_entity_of(polymer))
ptype = entity->polymer_type;
result = gemmi::align_sequence_to_polymer(query, polymer, ptype, scoring);
result = gemmi::align_sequence_to_polymer(query, polymer, ptype, &scoring);
print_result_summary(result);
if (p.options[PrintOneLetter])
print_one_letter_alignment(result, gemmi::one_letter_code(query),
Expand Down Expand Up @@ -327,7 +338,7 @@ int GEMMI_MAIN(int argc, char **argv) {
printf("Sequence numbers are wrt the full sequence (SEQRES).\n");
gemmi::AlignmentResult result =
gemmi::align_sequence_to_polymer(ent->full_sequence, polymer,
ent->polymer_type, scoring);
ent->polymer_type, &scoring);
printf("%s chain %s ", st.name.c_str(), chain.name.c_str());
print_result_summary(result);
if (p.options[CheckMmcif]) {
Expand Down
17 changes: 9 additions & 8 deletions python/align.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,19 +30,20 @@ void add_alignment(py::module& m) {
.def_readwrite("mismatch", &AlignmentScoring::mismatch)
.def_readwrite("gapo", &AlignmentScoring::gapo)
.def_readwrite("gape", &AlignmentScoring::gape)
.def_readwrite("good_gapo", &AlignmentScoring::good_gapo)
.def_readwrite("bad_gapo", &AlignmentScoring::bad_gapo)
;

m.def("prepare_blosum62_scoring", &prepare_blosum62_scoring);
m.def("align_string_sequences", &align_string_sequences,
py::arg("query"), py::arg("target"), py::arg("free_gapo"),
py::arg_v("scoring", AlignmentScoring(), "gemmi.AlignmentScoring()"));
py::arg("query"), py::arg("target"), py::arg("target_gapo"),
py::arg("scoring")=nullptr);
m.def("align_sequence_to_polymer",
[](const std::vector<std::string>& full_seq,
const ResidueSpan& polymer, PolymerType polymer_type,
AlignmentScoring& sco) {
return align_sequence_to_polymer(full_seq, polymer, polymer_type, sco);
}, py::arg("full_seq"), py::arg("polymer"), py::arg("polymer_type"),
py::arg_v("scoring", AlignmentScoring(), "gemmi.AlignmentScoring()"));
[](const std::vector<std::string>& full_seq, const ResidueSpan& polymer,
PolymerType polymer_type, AlignmentScoring* scoring) {
return align_sequence_to_polymer(full_seq, polymer, polymer_type, scoring);
}, py::arg("full_seq"), py::arg("polymer"),
py::arg("polymer_type"), py::arg("scoring")=nullptr);

// structure superposition
py::enum_<SupSelect>(m, "SupSelect")
Expand Down
6 changes: 6 additions & 0 deletions tools/pdb_ids.txt
Original file line number Diff line number Diff line change
Expand Up @@ -156,3 +156,9 @@

# numeric altlocs (1,2,3,4)
1aac

# only UNK residues
3kgv

# chain C has only 1.57% of residues from SEQRES
3zms

0 comments on commit a2a37f5

Please sign in to comment.