Skip to content

Commit

Permalink
implement complexsearch and sanitizing expandcomplex
Browse files Browse the repository at this point in the history
  • Loading branch information
Woosub-Kim committed Dec 14, 2023
1 parent b220b5a commit b156e06
Show file tree
Hide file tree
Showing 9 changed files with 233 additions and 58 deletions.
1 change: 1 addition & 0 deletions data/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ set(COMPILED_RESOURCES
evalue_nn.kerasify
main.js
vendor.js.zst
complexsearch.sh
easycomplexsearch.sh
)

Expand Down
46 changes: 46 additions & 0 deletions data/complexsearch.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/bin/sh -e
fail() {
echo "Error: $1"
exit 1
}

notExists() {
[ ! -f "$1" ]
}

if notExists "${TMP_PATH}/result.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" search "${QUERYDB}" "${TARGETDB}" "${TMP_PATH}/result" "${TMP_PATH}/search_tmp" ${SEARCH_PAR} \
|| fail "Search died"
fi

RESULT="${TMP_PATH}/result"
if [ "$PREFMODE" != "EXHAUSTIVE" ]; then
if notExists "${TMP_PATH}/result_expand_pref.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" expandcomplex "${QUERYDB}" "${TARGETDB}" "${RESULT}" "${TMP_PATH}/result_expand_pref" ${THREADS_PAR} \
|| fail "Expandcomplex died"
fi
if notExists "${TMP_PATH}/result_expand_aligned.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" $COMPLEX_ALIGNMENT_ALGO "${QUERYDB}" "${TARGETDB}" "${TMP_PATH}/result_expand_pref" "${TMP_PATH}/result_expand_aligned" ${COMPLEX_ALIGN_PAR} \
|| fail $COMPLEX_ALIGNMENT_ALGO "died"
fi
RESULT="${TMP_PATH}/result_expand_aligned"
fi
if notExists "${TMP_PATH}/complex_result.dbtype"; then
# shellcheck disable=SC2086
$MMSEQS scorecomplex "${QUERYDB}" "${TARGETDB}" "${RESULT}" "${OUTPUT}" ${SCORECOMPLEX_PAR} \
|| fail "ScoreComplex died"
fi

if [ -n "${REMOVE_TMP}" ]; then
# shellcheck disable=SC2086
"$MMSEQS" rmdb "${TMP_PATH}/result" ${VERBOSITY}
if [ "$PREFMODE" != "EXHAUSTIVE" ]; then
# shellcheck disable=SC2086
"$MMSEQS" rmdb "${TMP_PATH}/result_expand_aligned" ${VERBOSITY}
fi
rm -rf "${TMP_PATH}/search_tmp"
rm -f "${TMP_PATH}/complexsearch.sh"
fi
21 changes: 20 additions & 1 deletion src/FoldseekBase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,26 @@ std::vector<Command> foldseekCommands = {
{"complexDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb}
}
},
{"easy-complexsearch", easycomplexsearch, &localPar.easyscorecomplexworkflow, COMMAND_EASY,
{"complexsearch", complexsearch, &localPar.complexsearchworkflow, COMMAND_MAIN,
"Complex level search",
"# Search a single/multiple PDB file against a set of PDB files and get complex level alignments\n"
"foldseek complexsearch queryDB targetDB result tmp\n"
"# Format output differently\n"
"foldseek easy-complexsearch queryDB targetDB result tmp --format-output query,target,qstart,tstart,cigar\n"
"# Align with TMalign (global)\n"
"foldseek complexsearch queryDB targetDB result tmp --alignment-type 1\n"
"# Skip prefilter and perform an exhaustive alignment (slower but more sensitive)\n"
"foldseek complexsearch queryDB targetDB result tmp --exhaustive-search 1\n\n",
"Woosub Kim <[email protected]>",
"<i:queryDB> <i:targetDB> <o:outputFileName> <tmpDir>",
CITATION_FOLDSEEK, {
{"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::NEED_HEADER, &DbValidator::sequenceDb},
{"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::NEED_HEADER, &DbValidator::sequenceDb},
{"complexDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb},
{"tempDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory}
}
},
{"easy-complexsearch", easycomplexsearch, &localPar.easyscomplexsearchworkflow, COMMAND_EASY,
"Complex level search",
"# Search a single/multiple PDB file against a set of PDB files and get complex level alignments\n"
"foldseek easy-complexsearch example/1tim.pdb.gz example/8tim.pdb.gz result tmp\n"
Expand Down
1 change: 1 addition & 0 deletions src/LocalCommandDeclarations.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,5 @@ extern int scorecomplex(int argc, const char **argv, const Command& command);
extern int easycomplexsearch(int argc, const char **argv, const Command &command);
extern int createcomplexreport(int argc, const char **argv, const Command &command);
extern int expandcomplex(int argc, const char **argv, const Command &command);
extern int complexsearch(int argc, const char **argv, const Command &command);
#endif
16 changes: 9 additions & 7 deletions src/commons/LocalParameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,14 +176,16 @@ LocalParameters::LocalParameters() :
createcomplexreport.push_back(&PARAM_THREADS);
createcomplexreport.push_back(&PARAM_V);

// easyscorecomplexworkflow
easyscorecomplexworkflow = combineList(structurecreatedb, structuresearchworkflow);
easyscorecomplexworkflow = combineList(easyscorecomplexworkflow, scorecomplex);
easyscorecomplexworkflow = combineList(easyscorecomplexworkflow, convertalignments);
easyscorecomplexworkflow = combineList(easyscorecomplexworkflow, createcomplexreport);
easyscorecomplexworkflow.push_back(&PARAM_COMPLEX_REPORT_MODE);
// complexsearchworkflow
complexsearchworkflow = combineList(structuresearchworkflow, scorecomplex);

//
// easycomplexsearchworkflow
easyscomplexsearchworkflow = combineList(structurecreatedb, complexsearchworkflow);
easyscomplexsearchworkflow = combineList(easyscomplexsearchworkflow, convertalignments);
easyscomplexsearchworkflow = combineList(easyscomplexsearchworkflow, createcomplexreport);
easyscomplexsearchworkflow.push_back(&PARAM_COMPLEX_REPORT_MODE);

// expandcomplex
expandcomplex.push_back(&PARAM_THREADS);
expandcomplex.push_back(&PARAM_V);

Expand Down
3 changes: 2 additions & 1 deletion src/commons/LocalParameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,8 @@ class LocalParameters : public Parameters {
std::vector<MMseqsParameter *> structurecreatedb;
std::vector<MMseqsParameter *> compressca;
std::vector<MMseqsParameter *> scorecomplex;
std::vector<MMseqsParameter *> easyscorecomplexworkflow;
std::vector<MMseqsParameter *> complexsearchworkflow;
std::vector<MMseqsParameter *> easyscomplexsearchworkflow;
std::vector<MMseqsParameter *> createcomplexreport;
std::vector<MMseqsParameter *> expandcomplex;

Expand Down
87 changes: 38 additions & 49 deletions src/strucclustutils/expandcomplex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,65 +25,21 @@ bool compareChainKeyPair_t(const ChainKeyPair_t &first, const ChainKeyPair_t &se
return false;
}

class ComplexExpander {
public:
ComplexExpander(DBReader<unsigned int> &alnDbr, chainKeyToComplexId_t &dbChainKeyToComplexIdMap, complexIdToChainKeys_t &dbComplexIdToChainKeysMap, unsigned int thread_idx)
: alnDbr(alnDbr), thread_idx(thread_idx), dbChainKeyToComplexIdMap(dbChainKeyToComplexIdMap), dbComplexIdToChainKeysMap(dbComplexIdToChainKeysMap) {}

void getQueryDbKeyPairs(std::vector<unsigned int> &qChainKeys, std::vector<ChainKeyPair_t> & queryDbKeyPairs) {
for (auto qChainKey: qChainKeys) {
unsigned int qKey = alnDbr.getId(qChainKey);
if (qKey == NOT_AVAILABLE_CHAIN_KEY)
continue;
char *data = alnDbr.getData(qKey, thread_idx);
while (*data != '\0') {
char dbKeyBuffer[255 + 1];
Util::parseKey(data, dbKeyBuffer);
const auto dbChainKey = (unsigned int) strtoul(dbKeyBuffer, NULL, 10);
const unsigned int dbComplexId = dbChainKeyToComplexIdMap.at(dbChainKey);
if (std::find(dbFoundComplexIndexes.begin(), dbFoundComplexIndexes.end(), dbComplexId) == dbFoundComplexIndexes.end())
dbFoundComplexIndexes.emplace_back(dbComplexId);
data = Util::skipLine(data);
}
}
if (dbFoundComplexIndexes.empty())
return;
for (auto dbComplexId: dbFoundComplexIndexes) {
auto &dbChainKeys = dbComplexIdToChainKeysMap.at(dbComplexId);
for (auto qChainKey: qChainKeys) {
for (auto dbChainKey: dbChainKeys) {
queryDbKeyPairs.emplace_back(qChainKey, dbChainKey);
}
}
}
dbFoundComplexIndexes.clear();
}

private:
DBReader<unsigned int> &alnDbr;
unsigned int thread_idx;
std::vector<unsigned int> dbFoundComplexIndexes;
chainKeyToComplexId_t &dbChainKeyToComplexIdMap;
complexIdToChainKeys_t &dbComplexIdToChainKeysMap;
};

int expandcomplex(int argc, const char **argv, const Command &command) {
LocalParameters &par = LocalParameters::getLocalInstance();
par.parseParameters(argc, argv, command, true, 0, MMseqsParameter::COMMAND_ALIGN);
std::string qLookupFile = par.db1 + ".lookup";
std::string dbLookupFile = par.db2 + ".lookup";
DBReader<unsigned int> alnDbr(par.db3.c_str(), par.db3Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
alnDbr.open(DBReader<unsigned int>::LINEAR_ACCCESS);
// TEMP
DBWriter resultWriter(par.db4.c_str(), par.db4Index.c_str(), static_cast<unsigned int>(par.threads), par.compressed, Parameters::DBTYPE_PREFILTER_RES);
resultWriter.open();
std::vector<unsigned int> qComplexIndices;
std::vector<unsigned int> dbComplexIndices;
std::vector<ChainKeyPair_t> chainKeyPairs;
chainKeyToComplexId_t qChainKeyToComplexIdMap;
chainKeyToComplexId_t dbChainKeyToComplexIdMap;
complexIdToChainKeys_t dbComplexIdToChainKeysMap;
complexIdToChainKeys_t qComplexIdToChainKeysMap;
complexIdToChainKeys_t dbComplexIdToChainKeysMap;
getKeyToIdMapIdToKeysMapIdVec(qLookupFile, qChainKeyToComplexIdMap, qComplexIdToChainKeysMap, qComplexIndices);
getKeyToIdMapIdToKeysMapIdVec(dbLookupFile, dbChainKeyToComplexIdMap, dbComplexIdToChainKeysMap, dbComplexIndices);
dbComplexIndices.clear();
Expand All @@ -97,18 +53,49 @@ int expandcomplex(int argc, const char **argv, const Command &command) {
thread_idx = static_cast<unsigned int>(omp_get_thread_num());
#endif
resultToWrite_t result;
ComplexExpander complexExpander(alnDbr, dbChainKeyToComplexIdMap, dbComplexIdToChainKeysMap, thread_idx);
std::vector<unsigned int> dbFoundIndices;
std::vector<ChainKeyPair_t> chainKeyPairs;
#pragma omp for schedule(dynamic, 1)
// for each q complex
for (size_t qCompIdx = 0; qCompIdx < qComplexIndices.size(); qCompIdx++) {
unsigned int qComplexId = qComplexIndices[qCompIdx];
std::vector<unsigned int> &qChainKeys = qComplexIdToChainKeysMap.at(qComplexId);
complexExpander.getQueryDbKeyPairs(qChainKeys, chainKeyPairs);
// For the current query complex
for (size_t qChainIdx=0; qChainIdx<qChainKeys.size(); qChainIdx++) {
unsigned int qKey = alnDbr.getId(qChainKeys[qChainIdx]);
if (qKey == NOT_AVAILABLE_CHAIN_KEY)
continue;
char *data = alnDbr.getData(qKey, thread_idx);
while (*data != '\0') {
char dbKeyBuffer[255 + 1];
Util::parseKey(data, dbKeyBuffer);
const auto dbChainKey = (unsigned int) strtoul(dbKeyBuffer, NULL, 10);
const unsigned int dbComplexId = dbChainKeyToComplexIdMap.at(dbChainKey);
// find all db complex aligned to the query complex.
if (std::find(dbFoundIndices.begin(), dbFoundIndices.end(), dbComplexId) == dbFoundIndices.end())
dbFoundIndices.emplace_back(dbComplexId);
data = Util::skipLine(data);
}
}
if (dbFoundIndices.empty())
continue;
// Among all db complexes aligned to query complex
for (size_t dbIdx=0; dbIdx<dbFoundIndices.size(); dbIdx++) {
std::vector<unsigned int> &dbChainKeys = dbComplexIdToChainKeysMap.at(dbFoundIndices[dbIdx]);
// for all query chains
for (size_t qChainIdx=0; qChainIdx<qChainKeys.size(); qChainIdx++) {
// and target chains
for (size_t dbChainIdx=0; dbChainIdx<dbChainKeys.size(); dbChainIdx++) {
// get all possible alignments
chainKeyPairs.emplace_back(qChainKeys[qChainIdx], dbChainKeys[dbChainIdx]);
}
}
}
SORT_SERIAL(chainKeyPairs.begin(), chainKeyPairs.end(), compareChainKeyPair_t);
unsigned int qPrevChainKey = chainKeyPairs[0].first;
// and write.
for (size_t chainKeyPairIdx=0; chainKeyPairIdx<chainKeyPairs.size(); chainKeyPairIdx++) {
unsigned int qCurrChainKey = chainKeyPairs[chainKeyPairIdx].first;
if (qCurrChainKey != qPrevChainKey) {
if (chainKeyPairs[chainKeyPairIdx].first != qPrevChainKey) {
resultWriter.writeData(result.c_str(),result.length(),qPrevChainKey,thread_idx);
result.clear();
qPrevChainKey = chainKeyPairs[chainKeyPairIdx].first;
Expand All @@ -118,6 +105,8 @@ int expandcomplex(int argc, const char **argv, const Command &command) {
}
resultWriter.writeData(result.c_str(),result.length(),qPrevChainKey,thread_idx);
result.clear();
dbFoundIndices.clear();
chainKeyPairs.clear();
progress.updateProgress();
}
}
Expand Down
1 change: 1 addition & 0 deletions src/workflow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,6 @@ set(workflow_source_files
workflow/EasyStructureSearch.cpp
workflow/EasyStructureCluster.cpp
workflow/EasyComplexSearch.cpp
workflow/ComplexSearch.cpp
PARENT_SCOPE
)
115 changes: 115 additions & 0 deletions src/workflow/ComplexSearch.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
#include <cassert>

#include "LocalParameters.h"
#include "FileUtil.h"
#include "CommandCaller.h"
#include "Util.h"
#include "Debug.h"

#include "complexsearch.sh.h"

int complexsearch(int argc, const char **argv, const Command &command) {
LocalParameters &par = LocalParameters::getLocalInstance();
par.PARAM_ADD_BACKTRACE.addCategory(MMseqsParameter::COMMAND_EXPERT);
par.PARAM_MAX_REJECTED.addCategory(MMseqsParameter::COMMAND_EXPERT);
par.PARAM_ZDROP.addCategory(MMseqsParameter::COMMAND_EXPERT);
par.PARAM_DB_OUTPUT.addCategory(MMseqsParameter::COMMAND_EXPERT);
par.PARAM_OVERLAP.addCategory(MMseqsParameter::COMMAND_EXPERT);
par.PARAM_RESCORE_MODE.addCategory(MMseqsParameter::COMMAND_EXPERT);
for (size_t i = 0; i < par.createdb.size(); i++){
par.createdb[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
}

par.PARAM_COMPRESSED.removeCategory(MMseqsParameter::COMMAND_EXPERT);
par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT);
par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT);

par.parseParameters(argc, argv, command, false, Parameters::PARSE_VARIADIC, 0);
if(par.PARAM_FORMAT_OUTPUT.wasSet == false){
par.outfmt = "query,target,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,complexassignid";
}
par.addBacktrace = true;
par.PARAM_ADD_BACKTRACE.wasSet = true;
par.printParameters(command.cmd, argc, argv, *command.params);

bool needBacktrace = false;
bool needTaxonomy = false;
bool needTaxonomyMapping = false;
bool needLookup = false;
{
bool needSequenceDB = false;
bool needFullHeaders = false;
bool needSource = false;
bool needQCA = false;
bool needTCA = false;
bool needTMalign = false;
bool needLDDT = false;
LocalParameters::getOutputFormat(par.formatAlignmentMode, par.outfmt, needSequenceDB, needBacktrace, needFullHeaders,
needLookup, needSource, needTaxonomyMapping, needTaxonomy, needQCA, needTCA, needTMalign, needLDDT);
}

if (par.formatAlignmentMode == Parameters::FORMAT_ALIGNMENT_SAM ||
par.formatAlignmentMode == LocalParameters::FORMAT_ALIGNMENT_PDB_SUPERPOSED ||
par.greedyBestHits) {
needBacktrace = true;
}
if (needBacktrace) {
Debug(Debug::INFO) << "Alignment backtraces will be computed, since they were requested by output format.\n";
par.addBacktrace = true;
par.PARAM_ADD_BACKTRACE.wasSet = true;
}
if (needLookup) {
par.writeLookup = true;
}

std::string tmpDir = par.filenames.back();
std::string hash = SSTR(par.hashParameter(command.databases, par.filenames, *command.params));
if (par.reuseLatest) {
hash = FileUtil::getHashFromSymLink(tmpDir + "/latest");
}
tmpDir = FileUtil::createTemporaryDirectory(tmpDir, hash);
par.filenames.pop_back();
CommandCaller cmd;
if(par.alignmentType == LocalParameters::ALIGNMENT_TYPE_TMALIGN){
cmd.addVariable("COMPLEX_ALIGNMENT_ALGO", "tmalign");
cmd.addVariable("COMPLEX_ALIGN_PAR", par.createParameterString(par.tmalign).c_str());
}else if(par.alignmentType == LocalParameters::ALIGNMENT_TYPE_3DI_AA || par.alignmentType == LocalParameters::ALIGNMENT_TYPE_3DI){
cmd.addVariable("COMPLEX_ALIGNMENT_ALGO", "structurealign");
cmd.addVariable("COMPLEX_ALIGN_PAR", par.createParameterString(par.structurealign).c_str());
}

switch(par.prefMode){
case LocalParameters::PREF_MODE_KMER:
cmd.addVariable("PREFMODE", "KMER");
break;
case LocalParameters::PREF_MODE_UNGAPPED:
cmd.addVariable("PREFMODE", "UNGAPPED");
break;
case LocalParameters::PREF_MODE_EXHAUSTIVE:
cmd.addVariable("PREFMODE", "EXHAUSTIVE");
break;
}
if(par.exhaustiveSearch){
cmd.addVariable("PREFMODE", "EXHAUSTIVE");
}
cmd.addVariable("NO_REPORT", par.complexReportMode == 0 ? "TRUE" : NULL);
cmd.addVariable("TMP_PATH", tmpDir.c_str());
cmd.addVariable("OUTPUT", par.filenames.back().c_str());
par.filenames.pop_back();
cmd.addVariable("TARGETDB", par.filenames.back().c_str());
par.filenames.pop_back();
cmd.addVariable("QUERYDB", par.filenames.back().c_str());
cmd.addVariable("LEAVE_INPUT", par.dbOut ? "TRUE" : NULL);
par.filenames.pop_back();
cmd.addVariable("SEARCH_PAR", par.createParameterString(par.structuresearchworkflow, true).c_str());
cmd.addVariable("SCORECOMPLEX_PAR", par.createParameterString(par.scorecomplex).c_str());
cmd.addVariable("THREADS_PAR", par.createParameterString(par.onlythreads).c_str());
cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL);
cmd.addVariable("VERBOSITY", par.createParameterString(par.onlyverbosity).c_str());
std::string program = tmpDir + "/complexsearch.sh";
FileUtil::writeFile(program, complexsearch_sh, complexsearch_sh_len);
cmd.execProgram(program.c_str(), par.filenames);
// Should never get here
assert(false);
return EXIT_FAILURE;
}

0 comments on commit b156e06

Please sign in to comment.