From 1f4cd0381464e9fddddcef7691610ab50ea11893 Mon Sep 17 00:00:00 2001 From: Junsu Date: Wed, 14 Aug 2024 13:28:26 +0900 Subject: [PATCH 01/13] new linclust --- src/commons/Parameters.cpp | 1 + src/commons/Parameters.h | 1 + src/linclust/kmermatcher.cpp | 334 ++++++++++++++++++++++++----------- src/linclust/kmermatcher.h | 1 + 4 files changed, 232 insertions(+), 105 deletions(-) diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp index 34e429811..dbc895551 100644 --- a/src/commons/Parameters.cpp +++ b/src/commons/Parameters.cpp @@ -2513,6 +2513,7 @@ void Parameters::setDefaults() { resultDirection = Parameters::PARAM_RESULT_DIRECTION_TARGET; weightThr = 0.9; weightFile = ""; + hashSeqBuffer = 1.05; // result2stats stat = ""; diff --git a/src/commons/Parameters.h b/src/commons/Parameters.h index 745d2f809..d03aed208 100644 --- a/src/commons/Parameters.h +++ b/src/commons/Parameters.h @@ -552,6 +552,7 @@ class Parameters { int resultDirection; float weightThr; std::string weightFile; + float hashSeqBuffer; // indexdb int checkCompatible; diff --git a/src/linclust/kmermatcher.cpp b/src/linclust/kmermatcher.cpp index 3e0740c59..93eaa3cc6 100644 --- a/src/linclust/kmermatcher.cpp +++ b/src/linclust/kmermatcher.cpp @@ -244,6 +244,13 @@ std::pair fillKmerPositionArray(KmerPosition * kmerArray, siz threadKmerBuffer[bufferPos].id = seqId; threadKmerBuffer[bufferPos].pos = 0; threadKmerBuffer[bufferPos].seqLen = seq.L; + for (size_t i = 0; i < 6; i++) { + if (TYPE == Parameters::DBTYPE_NUCLEOTIDES) { + threadKmerBuffer[bufferPos].adjacentSeq[i] = static_cast(4); + }else { + threadKmerBuffer[bufferPos].adjacentSeq[i] = static_cast(12); + } + } if(hashDistribution != NULL){ __sync_fetch_and_add(&hashDistribution[static_cast(seqHash)], 1); } @@ -321,6 +328,36 @@ std::pair fillKmerPositionArray(KmerPosition * kmerArray, siz threadKmerBuffer[bufferPos].id = seqId; threadKmerBuffer[bufferPos].pos = (kmers + kmerIdx)->pos; threadKmerBuffer[bufferPos].seqLen = seq.L; + // store adjacent seq information + unsigned int startPos = (kmers + kmerIdx)->pos; + unsigned int endPos = (kmers + kmerIdx)->pos + adjustedKmerSize - 1; + for (size_t i = 0; i < 6; i++) { + if (TYPE == Parameters::DBTYPE_NUCLEOTIDES) { + threadKmerBuffer[bufferPos].adjacentSeq[i] = static_cast(4); + }else { + threadKmerBuffer[bufferPos].adjacentSeq[i] = static_cast(12); + } + } + if (startPos >= 3) { + threadKmerBuffer[bufferPos].adjacentSeq[0] = seq.numSequence[startPos - 3]; + threadKmerBuffer[bufferPos].adjacentSeq[1] = seq.numSequence[startPos - 2]; + threadKmerBuffer[bufferPos].adjacentSeq[2] = seq.numSequence[startPos - 1]; + }else if (startPos == 2) { + threadKmerBuffer[bufferPos].adjacentSeq[1] = seq.numSequence[startPos - 2]; + threadKmerBuffer[bufferPos].adjacentSeq[2] = seq.numSequence[startPos - 1]; + }else if (startPos == 1) { + threadKmerBuffer[bufferPos].adjacentSeq[2] = seq.numSequence[startPos - 1]; + } + if (endPos + 3 <= static_cast(seq.L) - 1) { + threadKmerBuffer[bufferPos].adjacentSeq[3] = seq.numSequence[endPos + 1]; + threadKmerBuffer[bufferPos].adjacentSeq[4] = seq.numSequence[endPos + 2]; + threadKmerBuffer[bufferPos].adjacentSeq[5] = seq.numSequence[endPos + 3]; + }else if (endPos + 2 == static_cast(seq.L) - 1) { + threadKmerBuffer[bufferPos].adjacentSeq[3] = seq.numSequence[endPos + 1]; + threadKmerBuffer[bufferPos].adjacentSeq[4] = seq.numSequence[endPos + 2]; + }else if (endPos + 1 == static_cast(seq.L) - 1) { + threadKmerBuffer[bufferPos].adjacentSeq[3] = seq.numSequence[endPos + 1]; + } bufferPos++; if(hashDistribution != NULL){ __sync_fetch_and_add(&hashDistribution[(kmers + kmerIdx)->score], 1); @@ -439,7 +476,7 @@ template void swapCenterSequence<1, short>(KmerPosition *kmers, size_t sp template void swapCenterSequence<1, int>(KmerPosition *kmers, size_t splitKmerCount, SequenceWeights &seqWeights); template -KmerPosition * doComputation(size_t totalKmers, size_t hashStartRange, size_t hashEndRange, std::string splitFile, +KmerPosition * doComputation(size_t &totalKmers, size_t hashStartRange, size_t hashEndRange, std::string splitFile, DBReader & seqDbr, Parameters & par, BaseMatrix * subMat) { KmerPosition * hashSeqPair = initKmerPositionMemory(totalKmers); @@ -483,12 +520,17 @@ KmerPosition * doComputation(size_t totalKmers, size_t hashStartRange, size_t // The longest sequence is the first since we sorted by kmer, seq.Len and id size_t writePos; if(Parameters::isEqualDbtype(seqDbr.getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)){ - writePos = assignGroup(hashSeqPair, totalKmers, par.includeOnlyExtendable, par.covMode, par.covThr, sequenceWeights, par.weightThr); + writePos = assignGroup(hashSeqPair, totalKmers, par.includeOnlyExtendable, par.covMode, par.covThr, sequenceWeights, par.weightThr, subMat, par.hashSeqBuffer); }else{ - writePos = assignGroup(hashSeqPair, totalKmers, par.includeOnlyExtendable, par.covMode, par.covThr, sequenceWeights, par.weightThr); + writePos = assignGroup(hashSeqPair, totalKmers, par.includeOnlyExtendable, par.covMode, par.covThr, sequenceWeights, par.weightThr, subMat, par.hashSeqBuffer); } delete sequenceWeights; + if (writePos == SIZE_T_MAX) { + delete [] hashSeqPair; + totalKmers = 0; + return NULL; + } // sort by rep. sequence (stored in kmer) and sequence id Debug(Debug::INFO) << "Sort by rep. sequence "; @@ -520,11 +562,16 @@ KmerPosition * doComputation(size_t totalKmers, size_t hashStartRange, size_t template size_t assignGroup(KmerPosition *hashSeqPair, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, - SequenceWeights * sequenceWeights, float weightThr) { - size_t writePos=0; + SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer) { + + // change splitKmerCount to exclude additional memory + splitKmerCount = static_cast(splitKmerCount / hashSeqBuffer); + // declare variables + size_t writePos = 0; + size_t writeTmp = 0; size_t prevHash = hashSeqPair[0].kmer; size_t repSeqId = hashSeqPair[0].id; - if(TYPE == Parameters::DBTYPE_NUCLEOTIDES){ + if (TYPE == Parameters::DBTYPE_NUCLEOTIDES) { bool isReverse = (BIT_CHECK(hashSeqPair[0].kmer, 63) == false); repSeqId = (isReverse) ? BIT_CLEAR(repSeqId, 63) : BIT_SET(repSeqId, 63); prevHash = BIT_SET(prevHash, 63); @@ -532,117 +579,178 @@ size_t assignGroup(KmerPosition *hashSeqPair, size_t splitKmerCount, bool inc size_t prevHashStart = 0; size_t prevSetSize = 0; size_t skipByWeightCount = 0; - T queryLen=hashSeqPair[0].seqLen; - bool repIsReverse = false; - T repSeq_i_pos = hashSeqPair[0].pos; + bool repIsReverse; + T queryLen; + T repSeq_i_pos; + unsigned char repAdjacent[6]; + // modulate # of rep seq selected in same kmer groups + size_t repSeqNum = 20; + for (size_t elementIdx = 0; elementIdx < splitKmerCount+1; elementIdx++) { size_t currKmer = hashSeqPair[elementIdx].kmer; - if(TYPE == Parameters::DBTYPE_NUCLEOTIDES){ + if (TYPE == Parameters::DBTYPE_NUCLEOTIDES) { currKmer = BIT_SET(currKmer, 63); } if (prevHash != currKmer) { - for (size_t i = prevHashStart; i < elementIdx; i++) { - // skip target sequences if weight > weightThr - if(i > prevHashStart && sequenceWeights != NULL - && sequenceWeights->getWeightById(hashSeqPair[i].id) > weightThr) - continue; - size_t kmer = hashSeqPair[i].kmer; - if(TYPE == Parameters::DBTYPE_NUCLEOTIDES) { - kmer = BIT_SET(hashSeqPair[i].kmer, 63); + size_t repIdx = prevHashStart; + for (size_t k = 0; k < repSeqNum; k++) { + // initialize variables + repSeqId = hashSeqPair[repIdx].id; + if (TYPE == Parameters::DBTYPE_NUCLEOTIDES) { + repIsReverse = (BIT_CHECK(hashSeqPair[repIdx].kmer, 63) == 0); + repSeqId = (repIsReverse) ? repSeqId : BIT_SET(repSeqId, 63); } - size_t rId = (kmer != SIZE_T_MAX) ? ((prevSetSize-skipByWeightCount == 1) ? SIZE_T_MAX : repSeqId) : SIZE_T_MAX; - // remove singletones from set - if(rId != SIZE_T_MAX){ - int diagonal = repSeq_i_pos - hashSeqPair[i].pos; - if(TYPE == Parameters::DBTYPE_NUCLEOTIDES){ - // 00 No problem here both are forward - // 01 We can revert the query of target, lets invert the query. - // 10 Same here, we can revert query to match the not inverted target - // 11 Both are reverted so no problem! - // So we need just 1 bit of information to encode all four states - bool targetIsReverse = (BIT_CHECK(hashSeqPair[i].kmer, 63) == false); - bool queryNeedsToBeRev = false; - // we now need 2 byte of information (00),(01),(10),(11) - // we need to flip the coordinates of the query - T queryPos=0; - T targetPos=0; - // revert kmer in query hits normal kmer in target - // we need revert the query - if (repIsReverse == true && targetIsReverse == false){ - queryPos = repSeq_i_pos; - targetPos = hashSeqPair[i].pos; - queryNeedsToBeRev = true; + queryLen = hashSeqPair[repIdx].seqLen; + repSeq_i_pos = hashSeqPair[repIdx].pos; + for (size_t i = 0; i < 6; i++) { + repAdjacent[i] = hashSeqPair[repIdx].adjacentSeq[i]; + } + int matchIdx = 0; + for (size_t n = 0; n < 6; n++) { + matchIdx += subMat->subMatrix[repAdjacent[n]][repAdjacent[n]]; + } + int matchThreshold = matchIdx; + + for (size_t i = prevHashStart; i < elementIdx; i++) { + // skip target sequences if weight > weightThr + if (i > prevHashStart && sequenceWeights != NULL + && sequenceWeights->getWeightById(hashSeqPair[i].id) > weightThr) + continue; + size_t rId = (prevSetSize-skipByWeightCount == 1) ? SIZE_T_MAX : repSeqId; + // remove singletones from set + if (rId != SIZE_T_MAX) { + int diagonal = repSeq_i_pos - hashSeqPair[i].pos; + if (TYPE == Parameters::DBTYPE_NUCLEOTIDES) { + // 00 No problem here both are forward + // 01 We can revert the query of target, lets invert the query. + // 10 Same here, we can revert query to match the not inverted target + // 11 Both are reverted so no problem! + // So we need just 1 bit of information to encode all four states + bool targetIsReverse = (BIT_CHECK(hashSeqPair[i].kmer, 63) == false); + bool queryNeedsToBeRev = false; + // we now need 2 byte of information (00),(01),(10),(11) + // we need to flip the coordinates of the query + T queryPos = 0; + T targetPos = 0; + // revert kmer in query hits normal kmer in target + // we need revert the query + if (repIsReverse == true && targetIsReverse == false) { + queryPos = repSeq_i_pos; + targetPos = hashSeqPair[i].pos; + queryNeedsToBeRev = true; // both k-mers were extracted on the reverse strand // this is equal to both are extract on the forward strand // we just need to offset the position to the forward strand - }else if (repIsReverse == true && targetIsReverse == true){ - queryPos = (queryLen - 1) - repSeq_i_pos; - targetPos = (hashSeqPair[i].seqLen - 1) - hashSeqPair[i].pos; - queryNeedsToBeRev = false; + }else if (repIsReverse == true && targetIsReverse == true) { + queryPos = (queryLen - 1) - repSeq_i_pos; + targetPos = (hashSeqPair[i].seqLen - 1) - hashSeqPair[i].pos; + queryNeedsToBeRev = false; // query is not revers but target k-mer is reverse // instead of reverting the target, we revert the query and offset the the query/target position - }else if (repIsReverse == false && targetIsReverse == true){ - queryPos = (queryLen - 1) - repSeq_i_pos; - targetPos = (hashSeqPair[i].seqLen - 1) - hashSeqPair[i].pos; - queryNeedsToBeRev = true; - // both are forward, everything is good here - }else{ - queryPos = repSeq_i_pos; - targetPos = hashSeqPair[i].pos; - queryNeedsToBeRev = false; + }else if (repIsReverse == false && targetIsReverse == true) { + queryPos = (queryLen - 1) - repSeq_i_pos; + targetPos = (hashSeqPair[i].seqLen - 1) - hashSeqPair[i].pos; + queryNeedsToBeRev = true; + // both are forward, everything is good here + }else { + queryPos = repSeq_i_pos; + targetPos = hashSeqPair[i].pos; + queryNeedsToBeRev = false; + } + diagonal = queryPos - targetPos; + rId = (queryNeedsToBeRev) ? BIT_CLEAR(rId, 63) : BIT_SET(rId, 63); + } + + bool canBeExtended = diagonal < 0 || (diagonal > (queryLen - hashSeqPair[i].seqLen)); + bool canBecovered = Util::canBeCovered(covThr, covMode, + static_cast(queryLen), + static_cast(hashSeqPair[i].seqLen)); + int matchCount = 0; + for (size_t n = 0; n < 6; n++) { + matchCount += subMat->subMatrix[repAdjacent[n]][hashSeqPair[i].adjacentSeq[n]]; + } + if ((matchCount <= matchIdx) && (repIdx < i)) { + matchIdx = matchCount; + repIdx = i; + } + // store new connection information in hashSeqPair + if ((includeOnlyExtendable == false && canBecovered) || (canBeExtended && includeOnlyExtendable == true)) { + // if possible, store information in an in-place manner + if (writePos < prevHashStart) { + hashSeqPair[writePos].kmer = rId; + hashSeqPair[writePos].pos = diagonal; + hashSeqPair[writePos].seqLen = hashSeqPair[i].seqLen; + hashSeqPair[writePos].id = hashSeqPair[i].id; + writePos++; + // otherwise, store information sequentially starting from the splitKmerCount + }else if (splitKmerCount + writeTmp < hashSeqBuffer * splitKmerCount) { + hashSeqPair[splitKmerCount + writeTmp].kmer = rId; + hashSeqPair[splitKmerCount + writeTmp].pos = diagonal; + hashSeqPair[splitKmerCount + writeTmp].seqLen = hashSeqPair[i].seqLen; + hashSeqPair[splitKmerCount + writeTmp].id = hashSeqPair[i].id; + writeTmp++; + // if both are impossible, increase hashSeqPair's memory and split again + }else { + // calculate elaborate buffer size based on the progress rate + hashSeqBuffer = 1 + (static_cast(splitKmerCount) / static_cast(writePos)) * (hashSeqBuffer - 1) * 1.2; + Debug(Debug::INFO) << "\n" << "Buffer size is unsufficient, splitting again" << "\n\n"; + return SIZE_T_MAX; + } } - diagonal = queryPos - targetPos; - rId = (queryNeedsToBeRev) ? BIT_CLEAR(rId, 63) : BIT_SET(rId, 63); - } -// std::cout << diagonal << "\t" << repSeq_i_pos << "\t" << hashSeqPair[i].pos << std::endl; - - - bool canBeExtended = diagonal < 0 || (diagonal > (queryLen - hashSeqPair[i].seqLen)); - bool canBecovered = Util::canBeCovered(covThr, covMode, - static_cast(queryLen), - static_cast(hashSeqPair[i].seqLen)); - if((includeOnlyExtendable == false && canBecovered) || (canBeExtended && includeOnlyExtendable ==true )){ - hashSeqPair[writePos].kmer = rId; - hashSeqPair[writePos].pos = diagonal; - hashSeqPair[writePos].seqLen = hashSeqPair[i].seqLen; - hashSeqPair[writePos].id = hashSeqPair[i].id; - writePos++; } } -// hashSeqPair[i].kmer = SIZE_T_MAX; - hashSeqPair[i].kmer = (i != writePos - 1) ? SIZE_T_MAX : hashSeqPair[i].kmer; + // terminate iteration of the search within the same kmer group + if (matchIdx == matchThreshold) { + break; + } } + // re-initialize variables + prevHashStart = elementIdx; prevSetSize = 0; skipByWeightCount = 0; - prevHashStart = elementIdx; - repSeqId = hashSeqPair[elementIdx].id; - if(TYPE == Parameters::DBTYPE_NUCLEOTIDES){ - repIsReverse = (BIT_CHECK(hashSeqPair[elementIdx].kmer, 63) == 0); - repSeqId = (repIsReverse) ? repSeqId : BIT_SET(repSeqId, 63); - } - queryLen = hashSeqPair[elementIdx].seqLen; - repSeq_i_pos = hashSeqPair[elementIdx].pos; } if (hashSeqPair[elementIdx].kmer == SIZE_T_MAX) { break; } prevSetSize++; - if(prevSetSize > 1 && sequenceWeights != NULL + if (prevSetSize > 1 && sequenceWeights != NULL && sequenceWeights->getWeightById(hashSeqPair[elementIdx].id) > weightThr) skipByWeightCount++; prevHash = hashSeqPair[elementIdx].kmer; - if(TYPE == Parameters::DBTYPE_NUCLEOTIDES){ + if (TYPE == Parameters::DBTYPE_NUCLEOTIDES) { prevHash = BIT_SET(prevHash, 63); } } - + // re-order hashSeqPair + for (size_t i = 0; i < writeTmp; i++) { + hashSeqPair[writePos + i] = hashSeqPair[splitKmerCount + i]; + } + writePos = writePos + writeTmp; + // mark the end index of the valid information as SIZE_T_MAX + hashSeqPair[writePos].kmer = SIZE_T_MAX; + + Debug(Debug::INFO) << "\n" << "Total kmers per split: " << splitKmerCount << "\n"; + Debug(Debug::INFO) << "Number of connections: " << writePos; + if (splitKmerCount != 0) { + double pos = std::round((static_cast(writePos - writeTmp) / splitKmerCount) * 100.0 * 100.0) / 100.0; + double tmp = std::round((static_cast(writeTmp) / (splitKmerCount * (hashSeqBuffer - 1))) * 100.0 * 100.0) / 100.0; + + std::ostringstream posStream; + std::ostringstream tmpStream; + + posStream << std::defaultfloat << pos; + tmpStream << std::defaultfloat << tmp; + + Debug(Debug::INFO) << " (default:" << posStream.str() << "%/buffer:" << tmpStream.str() << "%)"; + } + Debug(Debug::INFO) << "\n\n"; return writePos; } -template size_t assignGroup<0, short>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr); -template size_t assignGroup<0, int>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr); -template size_t assignGroup<1, short>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr); -template size_t assignGroup<1, int>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr); +template size_t assignGroup<0, short>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer); +template size_t assignGroup<0, int>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer); +template size_t assignGroup<1, short>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer); +template size_t assignGroup<1, int>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer); void setLinearFilterDefault(Parameters *p) { @@ -696,7 +804,7 @@ int kmermatcherInner(Parameters& par, DBReader& seqDbr) { Debug(Debug::INFO) << "\n"; float kmersPerSequenceScale = (Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_NUCLEOTIDES)) ? par.kmersPerSequenceScale.values.nucleotide() : par.kmersPerSequenceScale.values.aminoacid(); - size_t totalKmers = computeKmerCount(seqDbr, par.kmerSize, par.kmersPerSequence, kmersPerSequenceScale); + size_t totalKmers = static_cast(computeKmerCount(seqDbr, par.kmerSize, par.kmersPerSequence, kmersPerSequenceScale) * par.hashSeqBuffer); size_t totalSizeNeeded = computeMemoryNeededLinearfilter(totalKmers); // compute splits size_t splits = static_cast(std::ceil(static_cast(totalSizeNeeded) / memoryLimit)); @@ -733,6 +841,11 @@ int kmermatcherInner(Parameters& par, DBReader& seqDbr) { std::string splitFileName = par.db2 + "_split_" +SSTR(split); hashSeqPair = doComputation(totalKmers, hashRanges[split].first, hashRanges[split].second, splitFileName, seqDbr, par, subMat); } + // detect insufficient buffer size + if (totalKmers == 0) { + delete subMat; + return EXIT_SUCCESS; + } MPI_Barrier(MPI_COMM_WORLD); if(mpiRank == 0){ for(size_t split = 0; split < splits; split++) { @@ -749,6 +862,11 @@ int kmermatcherInner(Parameters& par, DBReader& seqDbr) { if(FileUtil::fileExists(splitFileNameDone.c_str()) == false){ hashSeqPair = doComputation(totalKmersPerSplit, hashRanges[split].first, hashRanges[split].second, splitFileName, seqDbr, par, subMat); } + // detect insufficient buffer size + if (totalKmersPerSplit == 0) { + delete subMat; + return EXIT_SUCCESS; + } splitFiles.push_back(splitFileName); } @@ -843,6 +961,9 @@ std::vector> setupKmerSplits(Parameters &par, BaseMatr // define splits size_t currBucketSize = 0; size_t currBucketStart = 0; + // reflect increment of buffer size + totalKmers = totalKmers / par.hashSeqBuffer; + for(size_t i = 0; i < (USHRT_MAX+1); i++){ if(currBucketSize+hashDist[i] >= totalKmers){ hashRanges.emplace_back(currBucketStart, i - 1); @@ -866,24 +987,27 @@ int kmermatcher(int argc, const char **argv, const Command &command) { setLinearFilterDefault(&par); par.parseParameters(argc, argv, command, true, 0, MMseqsParameter::COMMAND_CLUSTLINEAR); - DBReader seqDbr(par.db1.c_str(), par.db1Index.c_str(), par.threads, - DBReader::USE_INDEX | DBReader::USE_DATA); - seqDbr.open(DBReader::NOSORT); - int querySeqType = seqDbr.getDbtype(); - - setKmerLengthAndAlphabet(par, seqDbr.getAminoAcidDBSize(), querySeqType); - std::vector *params = command.params; - par.printParameters(command.cmd, argc, argv, *params); - Debug(Debug::INFO) << "Database size: " << seqDbr.getSize() << " type: " << seqDbr.getDbTypeName() << "\n"; - - if (seqDbr.getMaxSeqLen() < SHRT_MAX) { - kmermatcherInner(par, seqDbr); - } - else { - kmermatcherInner(par, seqDbr); - } - - seqDbr.close(); + float hashSeqBuffer; + do { + DBReader seqDbr(par.db1.c_str(), par.db1Index.c_str(), par.threads, + DBReader::USE_INDEX | DBReader::USE_DATA); + seqDbr.open(DBReader::NOSORT); + int querySeqType = seqDbr.getDbtype(); + + setKmerLengthAndAlphabet(par, seqDbr.getAminoAcidDBSize(), querySeqType); + std::vector *params = command.params; + par.printParameters(command.cmd, argc, argv, *params); + Debug(Debug::INFO) << "Database size: " << seqDbr.getSize() << " type: " << seqDbr.getDbTypeName() << "\n"; + + hashSeqBuffer = par.hashSeqBuffer; + if (seqDbr.getMaxSeqLen() < SHRT_MAX) { + kmermatcherInner(par, seqDbr); + } + else { + kmermatcherInner(par, seqDbr); + } + seqDbr.close(); + } while (hashSeqBuffer != par.hashSeqBuffer); return EXIT_SUCCESS; } diff --git a/src/linclust/kmermatcher.h b/src/linclust/kmermatcher.h index 43a7413a2..c45d40906 100644 --- a/src/linclust/kmermatcher.h +++ b/src/linclust/kmermatcher.h @@ -52,6 +52,7 @@ struct __attribute__((__packed__))KmerPosition { unsigned int id; T seqLen; T pos; + unsigned char adjacentSeq[6]; static bool compareRepSequenceAndIdAndPos(const KmerPosition &first, const KmerPosition &second){ if(first.kmer < second.kmer ) From f78b6bb69c58072aea8dbe89dc30450e303f735d Mon Sep 17 00:00:00 2001 From: Junsu Date: Wed, 14 Aug 2024 13:34:02 +0900 Subject: [PATCH 02/13] fix compile error --- src/linclust/kmermatcher.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/linclust/kmermatcher.cpp b/src/linclust/kmermatcher.cpp index 93eaa3cc6..9cd84684d 100644 --- a/src/linclust/kmermatcher.cpp +++ b/src/linclust/kmermatcher.cpp @@ -729,21 +729,21 @@ size_t assignGroup(KmerPosition *hashSeqPair, size_t splitKmerCount, bool inc // mark the end index of the valid information as SIZE_T_MAX hashSeqPair[writePos].kmer = SIZE_T_MAX; - Debug(Debug::INFO) << "\n" << "Total kmers per split: " << splitKmerCount << "\n"; - Debug(Debug::INFO) << "Number of connections: " << writePos; - if (splitKmerCount != 0) { - double pos = std::round((static_cast(writePos - writeTmp) / splitKmerCount) * 100.0 * 100.0) / 100.0; - double tmp = std::round((static_cast(writeTmp) / (splitKmerCount * (hashSeqBuffer - 1))) * 100.0 * 100.0) / 100.0; + // Debug(Debug::INFO) << "\n" << "Total kmers per split: " << splitKmerCount << "\n"; + // Debug(Debug::INFO) << "Number of connections: " << writePos; + // if (splitKmerCount != 0) { + // double pos = std::round((static_cast(writePos - writeTmp) / splitKmerCount) * 100.0 * 100.0) / 100.0; + // double tmp = std::round((static_cast(writeTmp) / (splitKmerCount * (hashSeqBuffer - 1))) * 100.0 * 100.0) / 100.0; - std::ostringstream posStream; - std::ostringstream tmpStream; + // std::ostringstream posStream; + // std::ostringstream tmpStream; - posStream << std::defaultfloat << pos; - tmpStream << std::defaultfloat << tmp; + // posStream << std::defaultfloat << pos; + // tmpStream << std::defaultfloat << tmp; - Debug(Debug::INFO) << " (default:" << posStream.str() << "%/buffer:" << tmpStream.str() << "%)"; - } - Debug(Debug::INFO) << "\n\n"; + // Debug(Debug::INFO) << " (default:" << posStream.str() << "%/buffer:" << tmpStream.str() << "%)"; + // } + // Debug(Debug::INFO) << "\n\n"; return writePos; } From e9da997540c1d62f2bf51c6c11641e75ea41ec05 Mon Sep 17 00:00:00 2001 From: Junsu Date: Wed, 14 Aug 2024 14:42:27 +0900 Subject: [PATCH 03/13] resolve problems --- src/linclust/kmermatcher.cpp | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/src/linclust/kmermatcher.cpp b/src/linclust/kmermatcher.cpp index 9cd84684d..27baeefde 100644 --- a/src/linclust/kmermatcher.cpp +++ b/src/linclust/kmermatcher.cpp @@ -244,11 +244,13 @@ std::pair fillKmerPositionArray(KmerPosition * kmerArray, siz threadKmerBuffer[bufferPos].id = seqId; threadKmerBuffer[bufferPos].pos = 0; threadKmerBuffer[bufferPos].seqLen = seq.L; + const unsigned char nIndex = subMat->aa2num[static_cast('N')]; + const unsigned char xIndex = subMat->aa2num[static_cast('X')]; for (size_t i = 0; i < 6; i++) { if (TYPE == Parameters::DBTYPE_NUCLEOTIDES) { - threadKmerBuffer[bufferPos].adjacentSeq[i] = static_cast(4); + threadKmerBuffer[bufferPos].adjacentSeq[i] = nIndex; }else { - threadKmerBuffer[bufferPos].adjacentSeq[i] = static_cast(12); + threadKmerBuffer[bufferPos].adjacentSeq[i] = xIndex; } } if(hashDistribution != NULL){ @@ -331,11 +333,13 @@ std::pair fillKmerPositionArray(KmerPosition * kmerArray, siz // store adjacent seq information unsigned int startPos = (kmers + kmerIdx)->pos; unsigned int endPos = (kmers + kmerIdx)->pos + adjustedKmerSize - 1; + const unsigned char nIndex = subMat->aa2num[static_cast('N')]; + const unsigned char xIndex = subMat->aa2num[static_cast('X')]; for (size_t i = 0; i < 6; i++) { if (TYPE == Parameters::DBTYPE_NUCLEOTIDES) { - threadKmerBuffer[bufferPos].adjacentSeq[i] = static_cast(4); + threadKmerBuffer[bufferPos].adjacentSeq[i] = nIndex; }else { - threadKmerBuffer[bufferPos].adjacentSeq[i] = static_cast(12); + threadKmerBuffer[bufferPos].adjacentSeq[i] = xIndex; } } if (startPos >= 3) { @@ -988,17 +992,28 @@ int kmermatcher(int argc, const char **argv, const Command &command) { par.parseParameters(argc, argv, command, true, 0, MMseqsParameter::COMMAND_CLUSTLINEAR); float hashSeqBuffer; + bool firstIt = true; do { - DBReader seqDbr(par.db1.c_str(), par.db1Index.c_str(), par.threads, - DBReader::USE_INDEX | DBReader::USE_DATA); + // FIXME: currently have to reopen DB every iteration due to DBReader::unmapData() + DBReader seqDbr( + par.db1.c_str(), + par.db1Index.c_str(), + par.threads, + DBReader::USE_INDEX | DBReader::USE_DATA + ); seqDbr.open(DBReader::NOSORT); int querySeqType = seqDbr.getDbtype(); - setKmerLengthAndAlphabet(par, seqDbr.getAminoAcidDBSize(), querySeqType); - std::vector *params = command.params; - par.printParameters(command.cmd, argc, argv, *params); - Debug(Debug::INFO) << "Database size: " << seqDbr.getSize() << " type: " << seqDbr.getDbTypeName() << "\n"; + // print is executed only once + if (firstIt) { + firstIt = false; + setKmerLengthAndAlphabet(par, seqDbr.getAminoAcidDBSize(), querySeqType); + std::vector *params = command.params; + par.printParameters(command.cmd, argc, argv, *params); + Debug(Debug::INFO) << "Database size: " << seqDbr.getSize() << " type: " << seqDbr.getDbTypeName() << "\n"; + } + // if the buffer size is insufficient, par.hashSeqBuffer is changed and repeat split again hashSeqBuffer = par.hashSeqBuffer; if (seqDbr.getMaxSeqLen() < SHRT_MAX) { kmermatcherInner(par, seqDbr); From e12665be88b4910592e1918a98ddacca6eb1ed10 Mon Sep 17 00:00:00 2001 From: Junsu Date: Wed, 14 Aug 2024 15:05:12 +0900 Subject: [PATCH 04/13] resolve problems --- src/linclust/kmermatcher.cpp | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/src/linclust/kmermatcher.cpp b/src/linclust/kmermatcher.cpp index 27baeefde..9cc7e67fa 100644 --- a/src/linclust/kmermatcher.cpp +++ b/src/linclust/kmermatcher.cpp @@ -98,6 +98,7 @@ std::pair fillKmerPositionArray(KmerPosition * kmerArray, siz #pragma omp parallel { unsigned int thread_idx = 0; + const unsigned char xIndex = subMat->aa2num[static_cast('X')]; #ifdef OPENMP thread_idx = static_cast(omp_get_thread_num()); #endif @@ -244,14 +245,8 @@ std::pair fillKmerPositionArray(KmerPosition * kmerArray, siz threadKmerBuffer[bufferPos].id = seqId; threadKmerBuffer[bufferPos].pos = 0; threadKmerBuffer[bufferPos].seqLen = seq.L; - const unsigned char nIndex = subMat->aa2num[static_cast('N')]; - const unsigned char xIndex = subMat->aa2num[static_cast('X')]; for (size_t i = 0; i < 6; i++) { - if (TYPE == Parameters::DBTYPE_NUCLEOTIDES) { - threadKmerBuffer[bufferPos].adjacentSeq[i] = nIndex; - }else { - threadKmerBuffer[bufferPos].adjacentSeq[i] = xIndex; - } + threadKmerBuffer[bufferPos].adjacentSeq[i] = xIndex; } if(hashDistribution != NULL){ __sync_fetch_and_add(&hashDistribution[static_cast(seqHash)], 1); @@ -333,14 +328,8 @@ std::pair fillKmerPositionArray(KmerPosition * kmerArray, siz // store adjacent seq information unsigned int startPos = (kmers + kmerIdx)->pos; unsigned int endPos = (kmers + kmerIdx)->pos + adjustedKmerSize - 1; - const unsigned char nIndex = subMat->aa2num[static_cast('N')]; - const unsigned char xIndex = subMat->aa2num[static_cast('X')]; for (size_t i = 0; i < 6; i++) { - if (TYPE == Parameters::DBTYPE_NUCLEOTIDES) { - threadKmerBuffer[bufferPos].adjacentSeq[i] = nIndex; - }else { - threadKmerBuffer[bufferPos].adjacentSeq[i] = xIndex; - } + threadKmerBuffer[bufferPos].adjacentSeq[i] = xIndex; } if (startPos >= 3) { threadKmerBuffer[bufferPos].adjacentSeq[0] = seq.numSequence[startPos - 3]; From 2ef1e2eb91ee3ca1ce5ab43b074af3398ba9c990 Mon Sep 17 00:00:00 2001 From: Joey Lee Date: Wed, 28 Aug 2024 16:54:19 +0900 Subject: [PATCH 05/13] feat: add param to enable adj_seq --- src/commons/Parameters.cpp | 3 +++ src/commons/Parameters.h | 2 ++ 2 files changed, 5 insertions(+) diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp index dbc895551..072a16a57 100644 --- a/src/commons/Parameters.cpp +++ b/src/commons/Parameters.cpp @@ -152,6 +152,8 @@ Parameters::Parameters(): PARAM_IGNORE_MULTI_KMER(PARAM_IGNORE_MULTI_KMER_ID, "--ignore-multi-kmer", "Skip repeating k-mers", "Skip k-mers occurring multiple times (>=2)", typeid(bool), (void *) &ignoreMultiKmer, "", MMseqsParameter::COMMAND_CLUSTLINEAR | MMseqsParameter::COMMAND_EXPERT), PARAM_HASH_SHIFT(PARAM_HASH_SHIFT_ID, "--hash-shift", "Shift hash", "Shift k-mer hash initialization", typeid(int), (void *) &hashShift, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_CLUSTLINEAR | MMseqsParameter::COMMAND_EXPERT), PARAM_PICK_N_SIMILAR(PARAM_PICK_N_SIMILAR_ID, "--pick-n-sim-kmer", "Add N similar to search", "Add N similar k-mers to search", typeid(int), (void *) &pickNbest, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_CLUSTLINEAR | MMseqsParameter::COMMAND_EXPERT), + // TODO: qualify as COMMAND_EXPERT? + PARAM_MATCH_ADJACENT_SEQ(PARAM_MATCH_ADJACENT_SEQ_ID, "--match-adjacent-seq", "Compare adjacent sequences to k-mers", "Compare sequence information adjacent to k-mers and elect multiple representative sequences per cluster", typeid(bool), (void *) &matchAdjacentSeq, "", MMseqsParameter::COMMAND_CLUSTLINEAR), PARAM_ADJUST_KMER_LEN(PARAM_ADJUST_KMER_LEN_ID, "--adjust-kmer-len", "Adjust k-mer length", "Adjust k-mer length based on specificity (only for nucleotides)", typeid(bool), (void *) &adjustKmerLength, "", MMseqsParameter::COMMAND_CLUSTLINEAR | MMseqsParameter::COMMAND_EXPERT), PARAM_RESULT_DIRECTION(PARAM_RESULT_DIRECTION_ID, "--result-direction", "Result direction", "result is 0: query, 1: target centric", typeid(int), (void *) &resultDirection, "^[0-1]{1}$", MMseqsParameter::COMMAND_CLUSTLINEAR | MMseqsParameter::COMMAND_EXPERT), PARAM_WEIGHT_FILE(PARAM_WEIGHT_FILE_ID, "--weights", "Weight file name", "Weights used for cluster priorization", typeid(std::string), (void*) &weightFile, "", MMseqsParameter::COMMAND_CLUSTLINEAR | MMseqsParameter::COMMAND_EXPERT ), @@ -2513,6 +2515,7 @@ void Parameters::setDefaults() { resultDirection = Parameters::PARAM_RESULT_DIRECTION_TARGET; weightThr = 0.9; weightFile = ""; + matchAdjacentSeq = true; hashSeqBuffer = 1.05; // result2stats diff --git a/src/commons/Parameters.h b/src/commons/Parameters.h index d03aed208..0fa75e597 100644 --- a/src/commons/Parameters.h +++ b/src/commons/Parameters.h @@ -552,6 +552,7 @@ class Parameters { int resultDirection; float weightThr; std::string weightFile; + bool matchAdjacentSeq; float hashSeqBuffer; // indexdb @@ -866,6 +867,7 @@ class Parameters { PARAMETER(PARAM_IGNORE_MULTI_KMER) PARAMETER(PARAM_HASH_SHIFT) PARAMETER(PARAM_PICK_N_SIMILAR) + PARAMETER(PARAM_MATCH_ADJACENT_SEQ) PARAMETER(PARAM_ADJUST_KMER_LEN) PARAMETER(PARAM_RESULT_DIRECTION) PARAMETER(PARAM_WEIGHT_FILE) From 1533a9196ef7111dff67a27af108e009fb7e527f Mon Sep 17 00:00:00 2001 From: Joey Lee Date: Thu, 29 Aug 2024 15:57:37 +0900 Subject: [PATCH 06/13] fix: apply changes to header --- src/linclust/kmermatcher.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/linclust/kmermatcher.h b/src/linclust/kmermatcher.h index c45d40906..eaa7b2078 100644 --- a/src/linclust/kmermatcher.h +++ b/src/linclust/kmermatcher.h @@ -4,6 +4,7 @@ #include "DBWriter.h" #include "Parameters.h" #include "BaseMatrix.h" +#include "SequenceWeights.h" #include @@ -194,7 +195,8 @@ class CompareResultBySeqId { template -size_t assignGroup(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr); +size_t assignGroup(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, + SequenceWeights * sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer); template void mergeKmerFilesAndOutput(DBWriter & dbw, std::vector tmpFiles, std::vector &repSequence); @@ -215,7 +217,7 @@ void writeKmerMatcherResult(DBWriter & dbw, KmerPosition *hashSeqPair, size_t template -KmerPosition * doComputation(size_t totalKmers, size_t split, size_t splits, std::string splitFile, +KmerPosition * doComputation(size_t &totalKmers, size_t split, size_t splits, std::string splitFile, DBReader & seqDbr, Parameters & par, BaseMatrix * subMat, size_t KMER_SIZE, size_t chooseTopKmer, float chooseTopKmerScale = 0.0); template From 36bca10273b1b6153ef64e718f0d3ecb34fc586c Mon Sep 17 00:00:00 2001 From: Joey Lee Date: Thu, 29 Aug 2024 19:46:07 +0900 Subject: [PATCH 07/13] feat: extract adjSeq to be configurable --- src/linclust/kmermatcher.h | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/src/linclust/kmermatcher.h b/src/linclust/kmermatcher.h index eaa7b2078..428e112e4 100644 --- a/src/linclust/kmermatcher.h +++ b/src/linclust/kmermatcher.h @@ -47,13 +47,36 @@ struct SequencePosition{ } }; -template -struct __attribute__((__packed__))KmerPosition { +template +struct AdjacentSeqArray { + void setAdjacentSeq(int index, const unsigned char val) { + _adjacentSeq[index] = val; + } + unsigned char getAdjacentSeq(int index) { + return _adjacentSeq[index]; + } + + unsigned char _adjacentSeq[6]; +}; + +// save memory when adjacent sequence is unused +template <> +struct AdjacentSeqArray { + void setAdjacentSeq(const int index, const unsigned char val) { + Debug(Debug::ERROR) << "Invalid write attempt at adjacent sequence array"; + }; + unsigned char getAdjacentSeq(int index) { + Debug(Debug::ERROR) << "Invalid read attempt at adjacent sequence array"; + return '\0'; + } +}; + +template +struct __attribute__((__packed__))KmerPosition : public AdjacentSeqArray { size_t kmer; unsigned int id; T seqLen; T pos; - unsigned char adjacentSeq[6]; static bool compareRepSequenceAndIdAndPos(const KmerPosition &first, const KmerPosition &second){ if(first.kmer < second.kmer ) From b11018552c85d6cfa1bd3db705f3918677a79e65 Mon Sep 17 00:00:00 2001 From: Joey Lee Date: Thu, 29 Aug 2024 19:51:07 +0900 Subject: [PATCH 08/13] feat: integrate with old linclust --- src/linclust/kmermatcher.cpp | 398 ++++++++++++++++++++++++----------- src/linclust/kmermatcher.h | 37 ++-- 2 files changed, 300 insertions(+), 135 deletions(-) diff --git a/src/linclust/kmermatcher.cpp b/src/linclust/kmermatcher.cpp index 9cc7e67fa..5ebe728a2 100644 --- a/src/linclust/kmermatcher.cpp +++ b/src/linclust/kmermatcher.cpp @@ -38,17 +38,17 @@ uint64_t hashUInt64(uint64_t in, uint64_t seed) { return XXH64(&in, sizeof(uint64_t), seed); } -template -KmerPosition *initKmerPositionMemory(size_t size) { - KmerPosition * hashSeqPair = new(std::nothrow) KmerPosition[size + 1]; +template +KmerPosition *initKmerPositionMemory(size_t size) { + KmerPosition * hashSeqPair = new(std::nothrow) KmerPosition[size + 1]; Util::checkAllocation(hashSeqPair, "Can not allocate memory"); - size_t pageSize = Util::getPageSize()/sizeof(KmerPosition); + size_t pageSize = Util::getPageSize()/sizeof(KmerPosition); #pragma omp parallel { #pragma omp for schedule(static) for (size_t page = 0; page < size+1; page += pageSize) { size_t readUntil = std::min(size+1, page + pageSize) - page; - memset(hashSeqPair+page, 0xFF, sizeof(KmerPosition)* readUntil); + memset(hashSeqPair+page, 0xFF, sizeof(KmerPosition)* readUntil); } } return hashSeqPair; @@ -67,7 +67,7 @@ void maskSequence(int maskMode, int maskLowerCase, float maskProb, Sequence &seq maskProb /*options.minMaskProb*/, probMatrix->hardMaskTable); } if(maskLowerCase == 1 && (Parameters::isEqualDbtype(seq.getSequenceType(), Parameters::DBTYPE_AMINO_ACIDS) || - Parameters::isEqualDbtype(seq.getSequenceType(), Parameters::DBTYPE_NUCLEOTIDES))) { + Parameters::isEqualDbtype(seq.getSequenceType(), Parameters::DBTYPE_NUCLEOTIDES))) { const char * charSeq = seq.getSeqData(); for (int i = 0; i < seq.L; i++) { seq.numSequence[i] = (islower(charSeq[i])) ? maskLetter : seq.numSequence[i]; @@ -75,10 +75,8 @@ void maskSequence(int maskMode, int maskLowerCase, float maskProb, Sequence &seq } } -template -std::pair fillKmerPositionArray(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, - Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, - size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution){ +template +std::pair fillKmerPositionArray(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution){ size_t offset = 0; int querySeqType = seqDbr.getDbtype(); size_t longestKmer = par.kmerSize; @@ -115,7 +113,7 @@ std::pair fillKmerPositionArray(KmerPosition * kmerArray, siz Indexer idxer(subMat->alphabetSize - 1, par.kmerSize); const unsigned int BUFFER_SIZE = 1048576; size_t bufferPos = 0; - KmerPosition * threadKmerBuffer = new KmerPosition[BUFFER_SIZE]; + KmerPosition * threadKmerBuffer = new KmerPosition[BUFFER_SIZE]; SequencePosition * kmers = (SequencePosition *) malloc((par.pickNbest * (par.maxSeqLen + 1) + 1) * sizeof(SequencePosition)); size_t kmersArraySize = par.maxSeqLen; const size_t flushSize = 100000000; @@ -246,7 +244,7 @@ std::pair fillKmerPositionArray(KmerPosition * kmerArray, siz threadKmerBuffer[bufferPos].pos = 0; threadKmerBuffer[bufferPos].seqLen = seq.L; for (size_t i = 0; i < 6; i++) { - threadKmerBuffer[bufferPos].adjacentSeq[i] = xIndex; + threadKmerBuffer[bufferPos].setAdjacentSeq(i, xIndex); } if(hashDistribution != NULL){ __sync_fetch_and_add(&hashDistribution[static_cast(seqHash)], 1); @@ -256,7 +254,7 @@ std::pair fillKmerPositionArray(KmerPosition * kmerArray, siz size_t writeOffset = __sync_fetch_and_add(&offset, bufferPos); if(writeOffset + bufferPos < kmerArraySize){ if(kmerArray!=NULL){ - memcpy(kmerArray + writeOffset, threadKmerBuffer, sizeof(KmerPosition) * bufferPos); + memcpy(kmerArray + writeOffset, threadKmerBuffer, sizeof(KmerPosition) * bufferPos); } } else{ Debug(Debug::ERROR) << "Kmer array overflow. currKmerArrayOffset="<< writeOffset @@ -329,27 +327,27 @@ std::pair fillKmerPositionArray(KmerPosition * kmerArray, siz unsigned int startPos = (kmers + kmerIdx)->pos; unsigned int endPos = (kmers + kmerIdx)->pos + adjustedKmerSize - 1; for (size_t i = 0; i < 6; i++) { - threadKmerBuffer[bufferPos].adjacentSeq[i] = xIndex; + threadKmerBuffer[bufferPos].setAdjacentSeq(i, xIndex); } if (startPos >= 3) { - threadKmerBuffer[bufferPos].adjacentSeq[0] = seq.numSequence[startPos - 3]; - threadKmerBuffer[bufferPos].adjacentSeq[1] = seq.numSequence[startPos - 2]; - threadKmerBuffer[bufferPos].adjacentSeq[2] = seq.numSequence[startPos - 1]; + threadKmerBuffer[bufferPos].setAdjacentSeq(0, seq.numSequence[startPos - 3]); + threadKmerBuffer[bufferPos].setAdjacentSeq(1, seq.numSequence[startPos - 2]); + threadKmerBuffer[bufferPos].setAdjacentSeq(2, seq.numSequence[startPos - 1]); }else if (startPos == 2) { - threadKmerBuffer[bufferPos].adjacentSeq[1] = seq.numSequence[startPos - 2]; - threadKmerBuffer[bufferPos].adjacentSeq[2] = seq.numSequence[startPos - 1]; + threadKmerBuffer[bufferPos].setAdjacentSeq(1, seq.numSequence[startPos - 2]); + threadKmerBuffer[bufferPos].setAdjacentSeq(2, seq.numSequence[startPos - 1]); }else if (startPos == 1) { - threadKmerBuffer[bufferPos].adjacentSeq[2] = seq.numSequence[startPos - 1]; + threadKmerBuffer[bufferPos].setAdjacentSeq(2, seq.numSequence[startPos - 1]); } if (endPos + 3 <= static_cast(seq.L) - 1) { - threadKmerBuffer[bufferPos].adjacentSeq[3] = seq.numSequence[endPos + 1]; - threadKmerBuffer[bufferPos].adjacentSeq[4] = seq.numSequence[endPos + 2]; - threadKmerBuffer[bufferPos].adjacentSeq[5] = seq.numSequence[endPos + 3]; + threadKmerBuffer[bufferPos].setAdjacentSeq(3, seq.numSequence[endPos + 1]); + threadKmerBuffer[bufferPos].setAdjacentSeq(4, seq.numSequence[endPos + 2]); + threadKmerBuffer[bufferPos].setAdjacentSeq(5, seq.numSequence[endPos + 3]); }else if (endPos + 2 == static_cast(seq.L) - 1) { - threadKmerBuffer[bufferPos].adjacentSeq[3] = seq.numSequence[endPos + 1]; - threadKmerBuffer[bufferPos].adjacentSeq[4] = seq.numSequence[endPos + 2]; + threadKmerBuffer[bufferPos].setAdjacentSeq(3, seq.numSequence[endPos + 1]); + threadKmerBuffer[bufferPos].setAdjacentSeq(4, seq.numSequence[endPos + 2]); }else if (endPos + 1 == static_cast(seq.L) - 1) { - threadKmerBuffer[bufferPos].adjacentSeq[3] = seq.numSequence[endPos + 1]; + threadKmerBuffer[bufferPos].setAdjacentSeq(3, seq.numSequence[endPos + 1]); } bufferPos++; if(hashDistribution != NULL){ @@ -361,7 +359,7 @@ std::pair fillKmerPositionArray(KmerPosition * kmerArray, siz if(writeOffset + bufferPos < kmerArraySize){ if(kmerArray!=NULL) { memcpy(kmerArray + writeOffset, threadKmerBuffer, - sizeof(KmerPosition) * bufferPos); + sizeof(KmerPosition) * bufferPos); } } else{ Debug(Debug::ERROR) << "Kmer array overflow. currKmerArrayOffset="<< writeOffset @@ -392,7 +390,7 @@ std::pair fillKmerPositionArray(KmerPosition * kmerArray, siz if(bufferPos > 0){ size_t writeOffset = __sync_fetch_and_add(&offset, bufferPos); if(kmerArray != NULL){ - memcpy(kmerArray+writeOffset, threadKmerBuffer, sizeof(KmerPosition) * bufferPos); + memcpy(kmerArray+writeOffset, threadKmerBuffer, sizeof(KmerPosition) * bufferPos); } } free(kmers); @@ -416,8 +414,8 @@ std::pair fillKmerPositionArray(KmerPosition * kmerArray, siz } -template -void swapCenterSequence(KmerPosition *hashSeqPair, size_t splitKmerCount, SequenceWeights &seqWeights) { +template +void swapCenterSequence(KmerPosition *hashSeqPair, size_t splitKmerCount, SequenceWeights &seqWeights) { size_t prevHash = hashSeqPair[0].kmer; @@ -463,19 +461,23 @@ void swapCenterSequence(KmerPosition *hashSeqPair, size_t splitKmerCount, Seq } } -template void swapCenterSequence<0, short>(KmerPosition *kmers, size_t splitKmerCount, SequenceWeights &seqWeights); -template void swapCenterSequence<0, int>(KmerPosition *kmers, size_t splitKmerCount, SequenceWeights &seqWeights); -template void swapCenterSequence<1, short>(KmerPosition *kmers, size_t splitKmerCount, SequenceWeights &seqWeights); -template void swapCenterSequence<1, int>(KmerPosition *kmers, size_t splitKmerCount, SequenceWeights &seqWeights); +template void swapCenterSequence<0, short, false>(KmerPosition *kmers, size_t splitKmerCount, SequenceWeights &seqWeights); +template void swapCenterSequence<0, int, false>(KmerPosition *kmers, size_t splitKmerCount, SequenceWeights &seqWeights); +template void swapCenterSequence<1, short, false>(KmerPosition *kmers, size_t splitKmerCount, SequenceWeights &seqWeights); +template void swapCenterSequence<1, int, false>(KmerPosition *kmers, size_t splitKmerCount, SequenceWeights &seqWeights); +template void swapCenterSequence<0, short, true>(KmerPosition *kmers, size_t splitKmerCount, SequenceWeights &seqWeights); +template void swapCenterSequence<0, int, true>(KmerPosition *kmers, size_t splitKmerCount, SequenceWeights &seqWeights); +template void swapCenterSequence<1, short, true>(KmerPosition *kmers, size_t splitKmerCount, SequenceWeights &seqWeights); +template void swapCenterSequence<1, int, true>(KmerPosition *kmers, size_t splitKmerCount, SequenceWeights &seqWeights); -template -KmerPosition * doComputation(size_t &totalKmers, size_t hashStartRange, size_t hashEndRange, std::string splitFile, - DBReader & seqDbr, Parameters & par, BaseMatrix * subMat) { +template +KmerPosition * doComputation(size_t &totalKmers, size_t hashStartRange, size_t hashEndRange, std::string splitFile, + DBReader & seqDbr, Parameters & par, BaseMatrix * subMat) { - KmerPosition * hashSeqPair = initKmerPositionMemory(totalKmers); + KmerPosition * hashSeqPair = initKmerPositionMemory(totalKmers); size_t elementsToSort; if(Parameters::isEqualDbtype(seqDbr.getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)){ - std::pair ret = fillKmerPositionArray(hashSeqPair, totalKmers, seqDbr, par, subMat, true, hashStartRange, hashEndRange, NULL); + std::pair ret = fillKmerPositionArray(hashSeqPair, totalKmers, seqDbr, par, subMat, true, hashStartRange, hashEndRange, NULL); elementsToSort = ret.first; par.kmerSize = ret.second; Debug(Debug::INFO) << "\nAdjusted k-mer length " << par.kmerSize << "\n"; @@ -490,9 +492,9 @@ KmerPosition * doComputation(size_t &totalKmers, size_t hashStartRange, size_ Debug(Debug::INFO) << "Sort kmer "; Timer timer; if(Parameters::isEqualDbtype(seqDbr.getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)) { - SORT_PARALLEL(hashSeqPair, hashSeqPair + elementsToSort, KmerPosition::compareRepSequenceAndIdAndPosReverse); + SORT_PARALLEL(hashSeqPair, hashSeqPair + elementsToSort, KmerPosition::compareRepSequenceAndIdAndPosReverse); }else{ - SORT_PARALLEL(hashSeqPair, hashSeqPair + elementsToSort, KmerPosition::compareRepSequenceAndIdAndPos); + SORT_PARALLEL(hashSeqPair, hashSeqPair + elementsToSort, KmerPosition::compareRepSequenceAndIdAndPos); } Debug(Debug::INFO) << timer.lap() << "\n"; @@ -502,9 +504,9 @@ KmerPosition * doComputation(size_t &totalKmers, size_t hashStartRange, size_ sequenceWeights = new SequenceWeights(par.weightFile.c_str()); if (sequenceWeights != NULL) { if (Parameters::isEqualDbtype(seqDbr.getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)) { - swapCenterSequence(hashSeqPair, totalKmers, *sequenceWeights); + swapCenterSequence(hashSeqPair, totalKmers, *sequenceWeights); } else { - swapCenterSequence(hashSeqPair, totalKmers, *sequenceWeights); + swapCenterSequence(hashSeqPair, totalKmers, *sequenceWeights); } } } @@ -529,9 +531,9 @@ KmerPosition * doComputation(size_t &totalKmers, size_t hashStartRange, size_ Debug(Debug::INFO) << "Sort by rep. sequence "; timer.reset(); if(Parameters::isEqualDbtype(seqDbr.getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)){ - SORT_PARALLEL(hashSeqPair, hashSeqPair + writePos, KmerPosition::compareRepSequenceAndIdAndDiagReverse); + SORT_PARALLEL(hashSeqPair, hashSeqPair + writePos, KmerPosition::compareRepSequenceAndIdAndDiagReverse); }else{ - SORT_PARALLEL(hashSeqPair, hashSeqPair + writePos, KmerPosition::compareRepSequenceAndIdAndDiag); + SORT_PARALLEL(hashSeqPair, hashSeqPair + writePos, KmerPosition::compareRepSequenceAndIdAndDiag); } //kx::radix_sort(hashSeqPair, hashSeqPair + elementsToSort, SequenceComparision()); // for(size_t i = 0; i < writePos; i++){ @@ -552,11 +554,129 @@ KmerPosition * doComputation(size_t &totalKmers, size_t hashStartRange, size_ } +template +size_t assignGroup(KmerPosition *hashSeqPair, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, + SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer) { + + size_t writePos=0; + size_t prevHash = hashSeqPair[0].kmer; + size_t repSeqId = hashSeqPair[0].id; + if(TYPE == Parameters::DBTYPE_NUCLEOTIDES){ + bool isReverse = (BIT_CHECK(hashSeqPair[0].kmer, 63) == false); + repSeqId = (isReverse) ? BIT_CLEAR(repSeqId, 63) : BIT_SET(repSeqId, 63); + prevHash = BIT_SET(prevHash, 63); + } + size_t prevHashStart = 0; + size_t prevSetSize = 0; + size_t skipByWeightCount = 0; + T queryLen=hashSeqPair[0].seqLen; + bool repIsReverse = false; + T repSeq_i_pos = hashSeqPair[0].pos; + for (size_t elementIdx = 0; elementIdx < splitKmerCount+1; elementIdx++) { + size_t currKmer = hashSeqPair[elementIdx].kmer; + if(TYPE == Parameters::DBTYPE_NUCLEOTIDES){ + currKmer = BIT_SET(currKmer, 63); + } + if (prevHash != currKmer) { + for (size_t i = prevHashStart; i < elementIdx; i++) { + // skip target sequences if weight > weightThr + if(i > prevHashStart && sequenceWeights != NULL + && sequenceWeights->getWeightById(hashSeqPair[i].id) > weightThr) + continue; + size_t kmer = hashSeqPair[i].kmer; + if(TYPE == Parameters::DBTYPE_NUCLEOTIDES) { + kmer = BIT_SET(hashSeqPair[i].kmer, 63); + } + size_t rId = (kmer != SIZE_T_MAX) ? ((prevSetSize-skipByWeightCount == 1) ? SIZE_T_MAX : repSeqId) : SIZE_T_MAX; + // remove singletones from set + if(rId != SIZE_T_MAX){ + int diagonal = repSeq_i_pos - hashSeqPair[i].pos; + if(TYPE == Parameters::DBTYPE_NUCLEOTIDES){ + // 00 No problem here both are forward + // 01 We can revert the query of target, lets invert the query. + // 10 Same here, we can revert query to match the not inverted target + // 11 Both are reverted so no problem! + // So we need just 1 bit of information to encode all four states + bool targetIsReverse = (BIT_CHECK(hashSeqPair[i].kmer, 63) == false); + bool queryNeedsToBeRev = false; + // we now need 2 byte of information (00),(01),(10),(11) + // we need to flip the coordinates of the query + T queryPos=0; + T targetPos=0; + // revert kmer in query hits normal kmer in target + // we need revert the query + if (repIsReverse == true && targetIsReverse == false){ + queryPos = repSeq_i_pos; + targetPos = hashSeqPair[i].pos; + queryNeedsToBeRev = true; + // both k-mers were extracted on the reverse strand + // this is equal to both are extract on the forward strand + // we just need to offset the position to the forward strand + }else if (repIsReverse == true && targetIsReverse == true){ + queryPos = (queryLen - 1) - repSeq_i_pos; + targetPos = (hashSeqPair[i].seqLen - 1) - hashSeqPair[i].pos; + queryNeedsToBeRev = false; + // query is not revers but target k-mer is reverse + // instead of reverting the target, we revert the query and offset the the query/target position + }else if (repIsReverse == false && targetIsReverse == true){ + queryPos = (queryLen - 1) - repSeq_i_pos; + targetPos = (hashSeqPair[i].seqLen - 1) - hashSeqPair[i].pos; + queryNeedsToBeRev = true; + // both are forward, everything is good here + }else{ + queryPos = repSeq_i_pos; + targetPos = hashSeqPair[i].pos; + queryNeedsToBeRev = false; + } + diagonal = queryPos - targetPos; + rId = (queryNeedsToBeRev) ? BIT_CLEAR(rId, 63) : BIT_SET(rId, 63); + } + + bool canBeExtended = diagonal < 0 || (diagonal > (queryLen - hashSeqPair[i].seqLen)); + bool canBecovered = Util::canBeCovered(covThr, covMode, + static_cast(queryLen), + static_cast(hashSeqPair[i].seqLen)); + if((includeOnlyExtendable == false && canBecovered) || (canBeExtended && includeOnlyExtendable ==true )){ + hashSeqPair[writePos].kmer = rId; + hashSeqPair[writePos].pos = diagonal; + hashSeqPair[writePos].seqLen = hashSeqPair[i].seqLen; + hashSeqPair[writePos].id = hashSeqPair[i].id; + writePos++; + } + } + hashSeqPair[i].kmer = (i != writePos - 1) ? SIZE_T_MAX : hashSeqPair[i].kmer; + } + prevSetSize = 0; + skipByWeightCount = 0; + prevHashStart = elementIdx; + repSeqId = hashSeqPair[elementIdx].id; + if(TYPE == Parameters::DBTYPE_NUCLEOTIDES){ + repIsReverse = (BIT_CHECK(hashSeqPair[elementIdx].kmer, 63) == 0); + repSeqId = (repIsReverse) ? repSeqId : BIT_SET(repSeqId, 63); + } + queryLen = hashSeqPair[elementIdx].seqLen; + repSeq_i_pos = hashSeqPair[elementIdx].pos; + } + if (hashSeqPair[elementIdx].kmer == SIZE_T_MAX) { + break; + } + prevSetSize++; + if(prevSetSize > 1 && sequenceWeights != NULL + && sequenceWeights->getWeightById(hashSeqPair[elementIdx].id) > weightThr) + skipByWeightCount++; + prevHash = hashSeqPair[elementIdx].kmer; + if(TYPE == Parameters::DBTYPE_NUCLEOTIDES){ + prevHash = BIT_SET(prevHash, 63); + } + } + + return writePos; +} template -size_t assignGroup(KmerPosition *hashSeqPair, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, - SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer) { - +size_t assignGroup(KmerPosition *hashSeqPair, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, + SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer) { + // change splitKmerCount to exclude additional memory splitKmerCount = static_cast(splitKmerCount / hashSeqBuffer); // declare variables @@ -596,7 +716,7 @@ size_t assignGroup(KmerPosition *hashSeqPair, size_t splitKmerCount, bool inc queryLen = hashSeqPair[repIdx].seqLen; repSeq_i_pos = hashSeqPair[repIdx].pos; for (size_t i = 0; i < 6; i++) { - repAdjacent[i] = hashSeqPair[repIdx].adjacentSeq[i]; + repAdjacent[i] = hashSeqPair[repIdx].getAdjacentSeq(i); } int matchIdx = 0; for (size_t n = 0; n < 6; n++) { @@ -607,7 +727,7 @@ size_t assignGroup(KmerPosition *hashSeqPair, size_t splitKmerCount, bool inc for (size_t i = prevHashStart; i < elementIdx; i++) { // skip target sequences if weight > weightThr if (i > prevHashStart && sequenceWeights != NULL - && sequenceWeights->getWeightById(hashSeqPair[i].id) > weightThr) + && sequenceWeights->getWeightById(hashSeqPair[i].id) > weightThr) continue; size_t rId = (prevSetSize-skipByWeightCount == 1) ? SIZE_T_MAX : repSeqId; // remove singletones from set @@ -660,8 +780,8 @@ size_t assignGroup(KmerPosition *hashSeqPair, size_t splitKmerCount, bool inc static_cast(hashSeqPair[i].seqLen)); int matchCount = 0; for (size_t n = 0; n < 6; n++) { - matchCount += subMat->subMatrix[repAdjacent[n]][hashSeqPair[i].adjacentSeq[n]]; - } + matchCount += subMat->subMatrix[repAdjacent[n]][hashSeqPair[i].getAdjacentSeq(n)]; + } if ((matchCount <= matchIdx) && (repIdx < i)) { matchIdx = matchCount; repIdx = i; @@ -722,28 +842,18 @@ size_t assignGroup(KmerPosition *hashSeqPair, size_t splitKmerCount, bool inc // mark the end index of the valid information as SIZE_T_MAX hashSeqPair[writePos].kmer = SIZE_T_MAX; - // Debug(Debug::INFO) << "\n" << "Total kmers per split: " << splitKmerCount << "\n"; - // Debug(Debug::INFO) << "Number of connections: " << writePos; - // if (splitKmerCount != 0) { - // double pos = std::round((static_cast(writePos - writeTmp) / splitKmerCount) * 100.0 * 100.0) / 100.0; - // double tmp = std::round((static_cast(writeTmp) / (splitKmerCount * (hashSeqBuffer - 1))) * 100.0 * 100.0) / 100.0; - - // std::ostringstream posStream; - // std::ostringstream tmpStream; - - // posStream << std::defaultfloat << pos; - // tmpStream << std::defaultfloat << tmp; - - // Debug(Debug::INFO) << " (default:" << posStream.str() << "%/buffer:" << tmpStream.str() << "%)"; - // } - // Debug(Debug::INFO) << "\n\n"; return writePos; } -template size_t assignGroup<0, short>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer); -template size_t assignGroup<0, int>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer); -template size_t assignGroup<1, short>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer); -template size_t assignGroup<1, int>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer); +template size_t assignGroup<0, short>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer); +template size_t assignGroup<0, int>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer); +template size_t assignGroup<1, short>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer); +template size_t assignGroup<1, int>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer); + +template size_t assignGroup<0, short>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer); +template size_t assignGroup<0, int>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer); +template size_t assignGroup<1, short>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer); +template size_t assignGroup<1, int>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer); void setLinearFilterDefault(Parameters *p) { @@ -767,13 +877,13 @@ size_t computeKmerCount(DBReader &reader, size_t KMER_SIZE, size_t return totalKmers; } -template +template size_t computeMemoryNeededLinearfilter(size_t totalKmer) { - return sizeof(KmerPosition) * totalKmer; + return sizeof(KmerPosition) * totalKmer; } -template +template int kmermatcherInner(Parameters& par, DBReader& seqDbr) { int querySeqType = seqDbr.getDbtype(); @@ -798,18 +908,18 @@ int kmermatcherInner(Parameters& par, DBReader& seqDbr) { float kmersPerSequenceScale = (Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_NUCLEOTIDES)) ? par.kmersPerSequenceScale.values.nucleotide() : par.kmersPerSequenceScale.values.aminoacid(); size_t totalKmers = static_cast(computeKmerCount(seqDbr, par.kmerSize, par.kmersPerSequence, kmersPerSequenceScale) * par.hashSeqBuffer); - size_t totalSizeNeeded = computeMemoryNeededLinearfilter(totalKmers); + size_t totalSizeNeeded = computeMemoryNeededLinearfilter(totalKmers); // compute splits size_t splits = static_cast(std::ceil(static_cast(totalSizeNeeded) / memoryLimit)); size_t totalKmersPerSplit = std::max(static_cast(1024+1), - static_cast(std::min(totalSizeNeeded, memoryLimit)/sizeof(KmerPosition))+1); + static_cast(std::min(totalSizeNeeded, memoryLimit)/sizeof(KmerPosition))+1); - std::vector> hashRanges = setupKmerSplits(par, subMat, seqDbr, totalKmersPerSplit, splits); + std::vector> hashRanges = setupKmerSplits(par, subMat, seqDbr, totalKmersPerSplit, splits); if(splits > 1){ Debug(Debug::INFO) << "Process file into " << hashRanges.size() << " parts\n"; } std::vector splitFiles; - KmerPosition *hashSeqPair = NULL; + KmerPosition *hashSeqPair = NULL; size_t mpiRank = 0; #ifdef HAVE_MPI @@ -832,7 +942,7 @@ int kmermatcherInner(Parameters& par, DBReader& seqDbr) { for(size_t split = fromSplit; split < fromSplit+splitCount; split++) { std::string splitFileName = par.db2 + "_split_" +SSTR(split); - hashSeqPair = doComputation(totalKmers, hashRanges[split].first, hashRanges[split].second, splitFileName, seqDbr, par, subMat); + hashSeqPair = doComputation(totalKmers, hashRanges[split].first, hashRanges[split].second, splitFileName, seqDbr, par, subMat); } // detect insufficient buffer size if (totalKmers == 0) { @@ -853,7 +963,7 @@ int kmermatcherInner(Parameters& par, DBReader& seqDbr) { std::string splitFileNameDone = splitFileName + ".done"; if(FileUtil::fileExists(splitFileNameDone.c_str()) == false){ - hashSeqPair = doComputation(totalKmersPerSplit, hashRanges[split].first, hashRanges[split].second, splitFileName, seqDbr, par, subMat); + hashSeqPair = doComputation(totalKmersPerSplit, hashRanges[split].first, hashRanges[split].second, splitFileName, seqDbr, par, subMat); } // detect insufficient buffer size if (totalKmersPerSplit == 0) { @@ -926,7 +1036,7 @@ int kmermatcherInner(Parameters& par, DBReader& seqDbr) { return EXIT_SUCCESS; } -template +template std::vector> setupKmerSplits(Parameters &par, BaseMatrix * subMat, DBReader &seqDbr, size_t totalKmers, size_t splits){ std::vector> hashRanges; if (splits > 1) { @@ -935,9 +1045,9 @@ std::vector> setupKmerSplits(Parameters &par, BaseMatr size_t * hashDist = new size_t[USHRT_MAX+1]; memset(hashDist, 0 , sizeof(size_t) * (USHRT_MAX+1)); if(Parameters::isEqualDbtype(seqDbr.getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)){ - fillKmerPositionArray(NULL, SIZE_T_MAX, seqDbr, par, subMat, true, 0, SIZE_T_MAX, hashDist); + fillKmerPositionArray(NULL, SIZE_T_MAX, seqDbr, par, subMat, true, 0, SIZE_T_MAX, hashDist); }else{ - fillKmerPositionArray(NULL, SIZE_T_MAX, seqDbr, par, subMat, true, 0, SIZE_T_MAX, hashDist); + fillKmerPositionArray(NULL, SIZE_T_MAX, seqDbr, par, subMat, true, 0, SIZE_T_MAX, hashDist); } seqDbr.remapData(); // figure out if machine has enough memory to run this job @@ -948,7 +1058,7 @@ std::vector> setupKmerSplits(Parameters &par, BaseMatr } } if(maxBucketSize > totalKmers){ - Debug(Debug::INFO) << "Not enough memory to run the kmermatcher. Minimum is at least " << maxBucketSize* sizeof(KmerPosition) << " bytes\n"; + Debug(Debug::INFO) << "Not enough memory to run the kmermatcher. Minimum is at least " << maxBucketSize* sizeof(KmerPosition) << " bytes\n"; EXIT(EXIT_FAILURE); } // define splits @@ -973,13 +1083,8 @@ std::vector> setupKmerSplits(Parameters &par, BaseMatr return hashRanges; } -int kmermatcher(int argc, const char **argv, const Command &command) { - MMseqsMPI::init(argc, argv); - - Parameters &par = Parameters::getInstance(); - setLinearFilterDefault(&par); - par.parseParameters(argc, argv, command, true, 0, MMseqsParameter::COMMAND_CLUSTLINEAR); - +// https://github.com/soedinglab/MMseqs2/pull/873#issue-2464876011 +void matchWithAdjacentSeq(Parameters &par,int argc, const char **argv, const Command &command) { float hashSeqBuffer; bool firstIt = true; do { @@ -1005,20 +1110,59 @@ int kmermatcher(int argc, const char **argv, const Command &command) { // if the buffer size is insufficient, par.hashSeqBuffer is changed and repeat split again hashSeqBuffer = par.hashSeqBuffer; if (seqDbr.getMaxSeqLen() < SHRT_MAX) { - kmermatcherInner(par, seqDbr); + kmermatcherInner(par, seqDbr); } else { - kmermatcherInner(par, seqDbr); + kmermatcherInner(par, seqDbr); } seqDbr.close(); } while (hashSeqBuffer != par.hashSeqBuffer); +} + +void matchWithoutAdjacentSeq(Parameters &par,int argc, const char **argv, const Command &command) { + DBReader seqDbr(par.db1.c_str(), par.db1Index.c_str(), par.threads, + DBReader::USE_INDEX | DBReader::USE_DATA); + seqDbr.open(DBReader::NOSORT); + int querySeqType = seqDbr.getDbtype(); + + setKmerLengthAndAlphabet(par, seqDbr.getAminoAcidDBSize(), querySeqType); + std::vector *params = command.params; + par.printParameters(command.cmd, argc, argv, *params); + Debug(Debug::INFO) << "Database size: " << seqDbr.getSize() << " type: " << seqDbr.getDbTypeName() << "\n"; + + if (seqDbr.getMaxSeqLen() < SHRT_MAX) { + kmermatcherInner(par, seqDbr); + } + else { + kmermatcherInner(par, seqDbr); + } + + seqDbr.close(); +} + +int kmermatcher(int argc, const char **argv, const Command &command) { + MMseqsMPI::init(argc, argv); + + Parameters &par = Parameters::getInstance(); + setLinearFilterDefault(&par); + par.parseParameters(argc, argv, command, true, 0, MMseqsParameter::COMMAND_CLUSTLINEAR); + + bool matchAdjacentSeq = par.matchAdjacentSeq; + + if (matchAdjacentSeq) { + matchWithAdjacentSeq(par, argc, argv, command); + } else { + // overwrite value (no need for buffer) + par.hashSeqBuffer = 1.0; + matchWithoutAdjacentSeq(par, argc, argv, command); + } return EXIT_SUCCESS; } -template +template void writeKmerMatcherResult(DBWriter & dbw, - KmerPosition *hashSeqPair, size_t totalKmers, + KmerPosition *hashSeqPair, size_t totalKmers, std::vector &repSequence, size_t threads) { std::vector threadOffsets; size_t splitSize = totalKmers/threads; @@ -1314,8 +1458,8 @@ void mergeKmerFilesAndOutput(DBWriter & dbw, } -template -void writeKmersToDisk(std::string tmpFile, KmerPosition *hashSeqPair, size_t totalKmers) { +template +void writeKmersToDisk(std::string tmpFile, KmerPosition *hashSeqPair, size_t totalKmers) { size_t repSeqId = SIZE_T_MAX; size_t lastTargetId = SIZE_T_MAX; seqLenType lastDiagonal=0; @@ -1437,26 +1581,44 @@ void setKmerLengthAndAlphabet(Parameters ¶meters, size_t aaDbSize, int seqTy } } -template std::pair fillKmerPositionArray<0, short>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, - Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); -template std::pair fillKmerPositionArray<1, short>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, - Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); -template std::pair fillKmerPositionArray<2, short>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, - Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); -template std::pair fillKmerPositionArray<0, int>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, - Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); -template std::pair fillKmerPositionArray<1, int>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, - Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); -template std::pair fillKmerPositionArray<2, int>(KmerPosition< int> * kmerArray, size_t kmerArraySize, DBReader &seqDbr, - Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); - -template KmerPosition *initKmerPositionMemory(size_t size); -template KmerPosition *initKmerPositionMemory(size_t size); - -template size_t computeMemoryNeededLinearfilter(size_t totalKmer); -template size_t computeMemoryNeededLinearfilter(size_t totalKmer); - -template std::vector> setupKmerSplits(Parameters &par, BaseMatrix * subMat, DBReader &seqDbr, size_t totalKmers, size_t splits); -template std::vector> setupKmerSplits(Parameters &par, BaseMatrix * subMat, DBReader &seqDbr, size_t totalKmers, size_t splits); +template std::pair fillKmerPositionArray<0, short, false>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, + Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); +template std::pair fillKmerPositionArray<1, short, false>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, + Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); +template std::pair fillKmerPositionArray<2, short, false>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, + Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); +template std::pair fillKmerPositionArray<0, int, false>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, + Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); +template std::pair fillKmerPositionArray<1, int, false>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, + Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); +template std::pair fillKmerPositionArray<2, int, false>(KmerPosition< int, false> * kmerArray, size_t kmerArraySize, DBReader &seqDbr, + Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); +template std::pair fillKmerPositionArray<0, short, true>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, + Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); +template std::pair fillKmerPositionArray<1, short, true>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, + Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); +template std::pair fillKmerPositionArray<2, short, true>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, + Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); +template std::pair fillKmerPositionArray<0, int, true>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, + Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); +template std::pair fillKmerPositionArray<1, int, true>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, + Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); +template std::pair fillKmerPositionArray<2, int, true>(KmerPosition< int, true> * kmerArray, size_t kmerArraySize, DBReader &seqDbr, + Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); + +template KmerPosition *initKmerPositionMemory(size_t size); +template KmerPosition *initKmerPositionMemory(size_t size); +template KmerPosition *initKmerPositionMemory(size_t size); +template KmerPosition *initKmerPositionMemory(size_t size); + +template size_t computeMemoryNeededLinearfilter(size_t totalKmer); +template size_t computeMemoryNeededLinearfilter(size_t totalKmer); +template size_t computeMemoryNeededLinearfilter(size_t totalKmer); +template size_t computeMemoryNeededLinearfilter(size_t totalKmer); + +template std::vector> setupKmerSplits(Parameters &par, BaseMatrix * subMat, DBReader &seqDbr, size_t totalKmers, size_t splits); +template std::vector> setupKmerSplits(Parameters &par, BaseMatrix * subMat, DBReader &seqDbr, size_t totalKmers, size_t splits); +template std::vector> setupKmerSplits(Parameters &par, BaseMatrix * subMat, DBReader &seqDbr, size_t totalKmers, size_t splits); +template std::vector> setupKmerSplits(Parameters &par, BaseMatrix * subMat, DBReader &seqDbr, size_t totalKmers, size_t splits); #undef SIZE_T_MAX diff --git a/src/linclust/kmermatcher.h b/src/linclust/kmermatcher.h index 428e112e4..292993593 100644 --- a/src/linclust/kmermatcher.h +++ b/src/linclust/kmermatcher.h @@ -78,7 +78,7 @@ struct __attribute__((__packed__))KmerPosition : public AdjacentSeqArray &first, const KmerPosition &second){ + static bool compareRepSequenceAndIdAndPos(const KmerPosition &first, const KmerPosition &second){ if(first.kmer < second.kmer ) return true; if(second.kmer < first.kmer ) @@ -98,7 +98,7 @@ struct __attribute__((__packed__))KmerPosition : public AdjacentSeqArray &first, const KmerPosition &second){ + static bool compareRepSequenceAndIdAndPosReverse(const KmerPosition &first, const KmerPosition &second){ size_t firstKmer = BIT_SET(first.kmer, 63); size_t secondKmer = BIT_SET(second.kmer, 63); if(firstKmer < secondKmer ) @@ -120,7 +120,7 @@ struct __attribute__((__packed__))KmerPosition : public AdjacentSeqArray &first, const KmerPosition &second){ + static bool compareRepSequenceAndIdAndDiagReverse(const KmerPosition &first, const KmerPosition &second){ size_t firstKmer = BIT_SET(first.kmer, 63); size_t secondKmer = BIT_SET(second.kmer, 63); if(firstKmer < secondKmer) @@ -138,7 +138,7 @@ struct __attribute__((__packed__))KmerPosition : public AdjacentSeqArray &first, const KmerPosition &second){ + static bool compareRepSequenceAndIdAndDiag(const KmerPosition &first, const KmerPosition &second){ if(first.kmer < second.kmer) return true; if(second.kmer < first.kmer) @@ -218,7 +218,10 @@ class CompareResultBySeqId { template -size_t assignGroup(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, +size_t assignGroup(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, + SequenceWeights * sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer); +template +size_t assignGroup(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights * sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer); template @@ -231,23 +234,23 @@ size_t queueNextEntry(KmerPositionQueue &queue, int file, size_t offsetPos, T *e void setKmerLengthAndAlphabet(Parameters ¶meters, size_t aaDbSize, int seqType); -template -void writeKmersToDisk(std::string tmpFile, KmerPosition *kmers, size_t totalKmers); +template +void writeKmersToDisk(std::string tmpFile, KmerPosition *kmers, size_t totalKmers); -template -void writeKmerMatcherResult(DBWriter & dbw, KmerPosition *hashSeqPair, size_t totalKmers, +template +void writeKmerMatcherResult(DBWriter & dbw, KmerPosition *hashSeqPair, size_t totalKmers, std::vector &repSequence, size_t threads); -template -KmerPosition * doComputation(size_t &totalKmers, size_t split, size_t splits, std::string splitFile, +template +KmerPosition * doComputation(size_t &totalKmers, size_t split, size_t splits, std::string splitFile, DBReader & seqDbr, Parameters & par, BaseMatrix * subMat, size_t KMER_SIZE, size_t chooseTopKmer, float chooseTopKmerScale = 0.0); -template -KmerPosition *initKmerPositionMemory(size_t size); +template +KmerPosition *initKmerPositionMemory(size_t size); -template -std::pair fillKmerPositionArray(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, +template +std::pair fillKmerPositionArray(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); @@ -255,10 +258,10 @@ std::pair fillKmerPositionArray(KmerPosition * kmerArray, si void maskSequence(int maskMode, int maskLowerCase, Sequence &seq, int maskLetter, ProbabilityMatrix * probMatrix); -template +template size_t computeMemoryNeededLinearfilter(size_t totalKmer); -template +template std::vector> setupKmerSplits(Parameters &par, BaseMatrix * subMat, DBReader &seqDbr, size_t totalKmers, size_t splits); size_t computeKmerCount(DBReader &reader, size_t KMER_SIZE, size_t chooseTopKmer, From e199a6904aab6fe61c24a87b532f85b521618bab Mon Sep 17 00:00:00 2001 From: Joey Lee Date: Thu, 29 Aug 2024 19:58:37 +0900 Subject: [PATCH 09/13] feat: default IncludeAdjacentSeq false --- src/linclust/kmermatcher.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/linclust/kmermatcher.h b/src/linclust/kmermatcher.h index 292993593..a583edcf0 100644 --- a/src/linclust/kmermatcher.h +++ b/src/linclust/kmermatcher.h @@ -71,7 +71,7 @@ struct AdjacentSeqArray { } }; -template +template struct __attribute__((__packed__))KmerPosition : public AdjacentSeqArray { size_t kmer; unsigned int id; @@ -246,7 +246,7 @@ template KmerPosition * doComputation(size_t &totalKmers, size_t split, size_t splits, std::string splitFile, DBReader & seqDbr, Parameters & par, BaseMatrix * subMat, size_t KMER_SIZE, size_t chooseTopKmer, float chooseTopKmerScale = 0.0); -template +template KmerPosition *initKmerPositionMemory(size_t size); template @@ -258,10 +258,10 @@ std::pair fillKmerPositionArray(KmerPosition +template size_t computeMemoryNeededLinearfilter(size_t totalKmer); -template +template std::vector> setupKmerSplits(Parameters &par, BaseMatrix * subMat, DBReader &seqDbr, size_t totalKmers, size_t splits); size_t computeKmerCount(DBReader &reader, size_t KMER_SIZE, size_t chooseTopKmer, From 4c88efcce0a9d5ed65c839852560a27062c7751c Mon Sep 17 00:00:00 2001 From: Joey Lee Date: Thu, 29 Aug 2024 20:05:01 +0900 Subject: [PATCH 10/13] fix: evade unused parameter check --- src/linclust/kmermatcher.cpp | 2 +- src/linclust/kmermatcher.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/linclust/kmermatcher.cpp b/src/linclust/kmermatcher.cpp index 5ebe728a2..536cdff3d 100644 --- a/src/linclust/kmermatcher.cpp +++ b/src/linclust/kmermatcher.cpp @@ -556,7 +556,7 @@ KmerPosition * doComputation(size_t &totalKmers, size_t h template size_t assignGroup(KmerPosition *hashSeqPair, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, - SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer) { + SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *, float &) { size_t writePos=0; size_t prevHash = hashSeqPair[0].kmer; diff --git a/src/linclust/kmermatcher.h b/src/linclust/kmermatcher.h index a583edcf0..64ce325db 100644 --- a/src/linclust/kmermatcher.h +++ b/src/linclust/kmermatcher.h @@ -62,10 +62,10 @@ struct AdjacentSeqArray { // save memory when adjacent sequence is unused template <> struct AdjacentSeqArray { - void setAdjacentSeq(const int index, const unsigned char val) { + void setAdjacentSeq(const int, const unsigned char) { Debug(Debug::ERROR) << "Invalid write attempt at adjacent sequence array"; }; - unsigned char getAdjacentSeq(int index) { + unsigned char getAdjacentSeq(int) { Debug(Debug::ERROR) << "Invalid read attempt at adjacent sequence array"; return '\0'; } From 0cc8af8800124be966ae42ebc37f29ccf884b941 Mon Sep 17 00:00:00 2001 From: Joey Lee Date: Fri, 30 Aug 2024 01:46:27 +0900 Subject: [PATCH 11/13] fix: compiler error --- src/linclust/kmermatcher.cpp | 26 +++++++++++++------------- src/linclust/kmermatcher.h | 4 +--- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/src/linclust/kmermatcher.cpp b/src/linclust/kmermatcher.cpp index 536cdff3d..7bb3f076e 100644 --- a/src/linclust/kmermatcher.cpp +++ b/src/linclust/kmermatcher.cpp @@ -482,7 +482,7 @@ KmerPosition * doComputation(size_t &totalKmers, size_t h par.kmerSize = ret.second; Debug(Debug::INFO) << "\nAdjusted k-mer length " << par.kmerSize << "\n"; }else{ - std::pair ret = fillKmerPositionArray(hashSeqPair, totalKmers, seqDbr, par, subMat, true, hashStartRange, hashEndRange, NULL); + std::pair ret = fillKmerPositionArray(hashSeqPair, totalKmers, seqDbr, par, subMat, true, hashStartRange, hashEndRange, NULL); elementsToSort = ret.first; } if(hashEndRange == SIZE_T_MAX){ @@ -1581,29 +1581,29 @@ void setKmerLengthAndAlphabet(Parameters ¶meters, size_t aaDbSize, int seqTy } } -template std::pair fillKmerPositionArray<0, short, false>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, +template std::pair fillKmerPositionArray<0, short, false>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); -template std::pair fillKmerPositionArray<1, short, false>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, +template std::pair fillKmerPositionArray<1, short, false>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); -template std::pair fillKmerPositionArray<2, short, false>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, +template std::pair fillKmerPositionArray<2, short, false>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); -template std::pair fillKmerPositionArray<0, int, false>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, +template std::pair fillKmerPositionArray<0, int, false>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); -template std::pair fillKmerPositionArray<1, int, false>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, +template std::pair fillKmerPositionArray<1, int, false>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); -template std::pair fillKmerPositionArray<2, int, false>(KmerPosition< int, false> * kmerArray, size_t kmerArraySize, DBReader &seqDbr, +template std::pair fillKmerPositionArray<2, int, false>(KmerPosition< int, false> * kmerArray, size_t kmerArraySize, DBReader &seqDbr, Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); -template std::pair fillKmerPositionArray<0, short, true>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, +template std::pair fillKmerPositionArray<0, short, true>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); -template std::pair fillKmerPositionArray<1, short, true>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, +template std::pair fillKmerPositionArray<1, short, true>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); -template std::pair fillKmerPositionArray<2, short, true>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, +template std::pair fillKmerPositionArray<2, short, true>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); -template std::pair fillKmerPositionArray<0, int, true>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, +template std::pair fillKmerPositionArray<0, int, true>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); -template std::pair fillKmerPositionArray<1, int, true>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, +template std::pair fillKmerPositionArray<1, int, true>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); -template std::pair fillKmerPositionArray<2, int, true>(KmerPosition< int, true> * kmerArray, size_t kmerArraySize, DBReader &seqDbr, +template std::pair fillKmerPositionArray<2, int, true>(KmerPosition< int, true> * kmerArray, size_t kmerArraySize, DBReader &seqDbr, Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); template KmerPosition *initKmerPositionMemory(size_t size); diff --git a/src/linclust/kmermatcher.h b/src/linclust/kmermatcher.h index 64ce325db..1b7d158b4 100644 --- a/src/linclust/kmermatcher.h +++ b/src/linclust/kmermatcher.h @@ -62,9 +62,7 @@ struct AdjacentSeqArray { // save memory when adjacent sequence is unused template <> struct AdjacentSeqArray { - void setAdjacentSeq(const int, const unsigned char) { - Debug(Debug::ERROR) << "Invalid write attempt at adjacent sequence array"; - }; + void setAdjacentSeq(const int, const unsigned char) {}; unsigned char getAdjacentSeq(int) { Debug(Debug::ERROR) << "Invalid read attempt at adjacent sequence array"; return '\0'; From ecea89f31d741bdac25b30965decd952cbbd9226 Mon Sep 17 00:00:00 2001 From: Joey Lee Date: Fri, 30 Aug 2024 02:30:29 +0900 Subject: [PATCH 12/13] fix: disable feature by default --- src/commons/Parameters.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp index 072a16a57..0cb81e9b1 100644 --- a/src/commons/Parameters.cpp +++ b/src/commons/Parameters.cpp @@ -152,7 +152,6 @@ Parameters::Parameters(): PARAM_IGNORE_MULTI_KMER(PARAM_IGNORE_MULTI_KMER_ID, "--ignore-multi-kmer", "Skip repeating k-mers", "Skip k-mers occurring multiple times (>=2)", typeid(bool), (void *) &ignoreMultiKmer, "", MMseqsParameter::COMMAND_CLUSTLINEAR | MMseqsParameter::COMMAND_EXPERT), PARAM_HASH_SHIFT(PARAM_HASH_SHIFT_ID, "--hash-shift", "Shift hash", "Shift k-mer hash initialization", typeid(int), (void *) &hashShift, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_CLUSTLINEAR | MMseqsParameter::COMMAND_EXPERT), PARAM_PICK_N_SIMILAR(PARAM_PICK_N_SIMILAR_ID, "--pick-n-sim-kmer", "Add N similar to search", "Add N similar k-mers to search", typeid(int), (void *) &pickNbest, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_CLUSTLINEAR | MMseqsParameter::COMMAND_EXPERT), - // TODO: qualify as COMMAND_EXPERT? PARAM_MATCH_ADJACENT_SEQ(PARAM_MATCH_ADJACENT_SEQ_ID, "--match-adjacent-seq", "Compare adjacent sequences to k-mers", "Compare sequence information adjacent to k-mers and elect multiple representative sequences per cluster", typeid(bool), (void *) &matchAdjacentSeq, "", MMseqsParameter::COMMAND_CLUSTLINEAR), PARAM_ADJUST_KMER_LEN(PARAM_ADJUST_KMER_LEN_ID, "--adjust-kmer-len", "Adjust k-mer length", "Adjust k-mer length based on specificity (only for nucleotides)", typeid(bool), (void *) &adjustKmerLength, "", MMseqsParameter::COMMAND_CLUSTLINEAR | MMseqsParameter::COMMAND_EXPERT), PARAM_RESULT_DIRECTION(PARAM_RESULT_DIRECTION_ID, "--result-direction", "Result direction", "result is 0: query, 1: target centric", typeid(int), (void *) &resultDirection, "^[0-1]{1}$", MMseqsParameter::COMMAND_CLUSTLINEAR | MMseqsParameter::COMMAND_EXPERT), @@ -2515,7 +2514,8 @@ void Parameters::setDefaults() { resultDirection = Parameters::PARAM_RESULT_DIRECTION_TARGET; weightThr = 0.9; weightFile = ""; - matchAdjacentSeq = true; + // TODO: change to true after fixing regression tests + matchAdjacentSeq = false; hashSeqBuffer = 1.05; // result2stats From 62a2ad46b4d42bc6ff4e74a3f4848fec3128b880 Mon Sep 17 00:00:00 2001 From: Joey Lee Date: Fri, 30 Aug 2024 16:18:45 +0900 Subject: [PATCH 13/13] fix: make param visible --- src/commons/Parameters.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp index 0cb81e9b1..302919d4c 100644 --- a/src/commons/Parameters.cpp +++ b/src/commons/Parameters.cpp @@ -985,6 +985,7 @@ Parameters::Parameters(): kmermatcher.push_back(&PARAM_V); kmermatcher.push_back(&PARAM_WEIGHT_FILE); kmermatcher.push_back(&PARAM_WEIGHT_THR); + kmermatcher.push_back(&PARAM_MATCH_ADJACENT_SEQ); // kmermatcher kmersearch.push_back(&PARAM_SEED_SUB_MAT);