diff --git a/monitoring/StuckDetectionAgent.cpp b/monitoring/StuckDetectionAgent.cpp index 85a97698..616bb188 100644 --- a/monitoring/StuckDetectionAgent.cpp +++ b/monitoring/StuckDetectionAgent.cpp @@ -69,56 +69,51 @@ void StuckDetectionAgent::StuckDetectionLoop( StuckDetectionAgent* _agent ) { LOG( info, "StuckDetection agent: started monitoring." ); + // determine if this is the first restart, or there were restarts + // before + auto numberOfPreviousRestarts = _agent->getNumberOfPreviousRestarts(); - uint64_t restartIteration = 1; - - while ( true ) { - if ( _agent->getSchain()->getNode()->isExitRequested() ) - return; - auto restartFileName = _agent->createStuckFileName( restartIteration ); - - if ( !boost::filesystem::exists( restartFileName ) ) { - break; - } - restartIteration++; + if ( numberOfPreviousRestarts > 0 ) { + LOG( info, "Stuck detection engine: previous restarts detected:" << numberOfPreviousRestarts ); } - if ( restartIteration > 1 ) { - LOG( info, "Stuck detection engine: previous restarts detected:" << to_string( - restartIteration - 1 ) ); - } - - - if ( _agent->getSchain()->getNode()->isExitRequested() ) - return; + uint64_t restartIteration = numberOfPreviousRestarts + 1; + uint64_t whenToRestart = 0; - uint64_t restartTime = 0; - uint64_t sleepTime = _agent->getSchain()->getNode()->getStuckMonitoringIntervalMs() * 1000; - - while ( restartTime == 0 ) { - if ( _agent->getSchain()->getNode()->isExitRequested() ) - return; + // loop until stuck is detected + do { try { - usleep( sleepTime ); _agent->getSchain()->getNode()->exitCheck(); - restartTime = _agent->checkForRestart( restartIteration ); + usleep(_agent->getSchain()->getNode()->getStuckMonitoringIntervalMs() * 1000); + // this will return non-zero if skaled needs to be restarted + whenToRestart = _agent->doStuckCheck(restartIteration); } catch ( ExitRequestedException& ) { return; } catch ( exception& e ) { SkaleException::logNested( e ); } - } - + } while (whenToRestart == 0 ); - 
CHECK_STATE( restartTime > 0 ); + // Stuck detection loop detected stuck. Restart. try { LOG( info, "Stuck detection engine: restarting skaled because of stuck detected." ); - _agent->restart( restartTime, restartIteration ); + _agent->restart(whenToRestart, restartIteration ); } catch ( ExitRequestedException& ) { return; } } +uint64_t StuckDetectionAgent::getNumberOfPreviousRestarts() { + // each time a restart happens, a file with a corresponding name + // is created. To find out how many restarts already happened we + // count these files + uint64_t restartCounter = 0; + while (boost::filesystem::exists(restartFileName(restartCounter + 1))) { + restartCounter++; + } + return restartCounter; +} + void StuckDetectionAgent::join() { CHECK_STATE( stuckDetectionThreadPool ); stuckDetectionThreadPool->joinAll(); @@ -174,7 +169,7 @@ bool StuckDetectionAgent::stuckCheck( uint64_t _restartIntervalMs, uint64_t _tim return result; } -uint64_t StuckDetectionAgent::checkForRestart( uint64_t _restartIteration ) { +uint64_t StuckDetectionAgent::doStuckCheck(uint64_t _restartIteration ) { CHECK_STATE( _restartIteration >= 1 ); auto baseRestartIntervalMs = getSchain()->getNode()->getStuckRestartIntervalMs(); @@ -229,7 +224,7 @@ void StuckDetectionAgent::restart( uint64_t _restartTimeMs, uint64_t _iteration exit( 13 ); } -string StuckDetectionAgent::createStuckFileName( uint64_t _iteration ) { +string StuckDetectionAgent::restartFileName(uint64_t _iteration ) { CHECK_STATE( _iteration >= 1 ); auto engine = getNode()->getConsensusEngine(); CHECK_STATE( engine ); @@ -242,7 +237,7 @@ string StuckDetectionAgent::createStuckFileName( uint64_t _iteration ) { void StuckDetectionAgent::createStuckRestartFile( uint64_t _iteration ) { CHECK_STATE( _iteration >= 1 ); - auto fileName = createStuckFileName( _iteration ); + auto fileName = restartFileName(_iteration); ofstream f; f.open( fileName, ios::trunc ); diff --git a/monitoring/StuckDetectionAgent.h b/monitoring/StuckDetectionAgent.h 
index e6e78172..f12c6121 100644 --- a/monitoring/StuckDetectionAgent.h +++ b/monitoring/StuckDetectionAgent.h @@ -39,7 +39,7 @@ class StuckDetectionAgent : public Agent { void join(); - uint64_t checkForRestart( uint64_t _restartIteration ); + uint64_t doStuckCheck( uint64_t _restartIteration ); void restart( uint64_t _baseRestartTimeMs, uint64_t _iteration ); @@ -47,9 +47,11 @@ class StuckDetectionAgent : public Agent { void cleanupState(); - string createStuckFileName( uint64_t _iteration ); + string restartFileName(uint64_t _iteration ); bool checkNodesAreOnline(); + uint64_t getNumberOfPreviousRestarts(); + bool stuckCheck( uint64_t _restartIntervalMs, uint64_t _timeStamp ); };