diff --git a/tests/pytest/crash_gen.sh b/tests/pytest/crash_gen.sh
index 9cca23ac79e8..0af09634df5a 100755
--- a/tests/pytest/crash_gen.sh
+++ b/tests/pytest/crash_gen.sh
@@ -70,10 +70,12 @@ if [[ $1 == '--valgrind' ]]; then
         $CRASH_GEN_EXEC $@ > $VALGRIND_OUT 2> $VALGRIND_ERR
 elif [[ $1 == '--helgrind' ]]; then
     shift
+    HELGRIND_OUT=helgrind.out
+    HELGRIND_ERR=helgrind.err
     valgrind \
         --tool=helgrind \
         $PYTHON_EXEC \
-        $CRASH_GEN_EXEC $@
+        $CRASH_GEN_EXEC $@ > $HELGRIND_OUT 2> $HELGRIND_ERR
 else
     $PYTHON_EXEC $CRASH_GEN_EXEC $@
 fi
diff --git a/tests/pytest/crash_gen/crash_gen.py b/tests/pytest/crash_gen/crash_gen.py
index 739fb699d643..8d2b0080bc69 100755
--- a/tests/pytest/crash_gen/crash_gen.py
+++ b/tests/pytest/crash_gen/crash_gen.py
@@ -1226,6 +1226,11 @@ def _executeInternal(self, te: TaskExecutor, wt: WorkerThread):
             "To be implemeted by child classes, class name: {}".format(
                 self.__class__.__name__))
 
+    def _isServiceStable(self):
+        if not gSvcMgr:
+            return True  # we don't run service, so let's assume it's stable
+        return gSvcMgr.isStable()  # otherwise let's examine the service
+
     def _isErrAcceptable(self, errno, msg):
         if errno in [
                 0x05,  # TSDB_CODE_RPC_NOT_READY
@@ -1263,7 +1268,7 @@ def _isErrAcceptable(self, errno, msg):
             return True
         elif msg.find("duplicated column names") != -1:  # also alter table tag issues
             return True
-        elif gSvcMgr and (not gSvcMgr.isStable()):  # We are managing service, and ...
+        elif not self._isServiceStable():  # We are managing service, and ...
             Logging.info("Ignoring error when service starting/stopping: errno = {}, msg = {}".format(errno, msg))
             return True
 
@@ -1641,15 +1646,39 @@ def getEndState(cls):
     def canBeginFrom(cls, state: AnyState):
         return state.canReadData()
 
+    # def _canRestartService(self):
+    #     if not gSvcMgr:
+    #         return True  # always
+    #     return gSvcMgr.isActive()  # only if it's running TODO: race condition here
+
     def _executeInternal(self, te: TaskExecutor, wt: WorkerThread):
         sTable = self._db.getFixedSuperTable()
 
-        # 1 in 5 chance, simulate a broken connection.
-        if random.randrange(5) == 0:  # TODO: break connection in all situations
-            wt.getDbConn().close()
-            wt.getDbConn().open()
-            print("_r", end="", flush=True)
-
+        # 1 in 20 chance, simulate a broken connection, only if service stable (not restarting)
+        if random.randrange(20) == 0:  # and self._canRestartService():  # TODO: break connection in all situations
+            # Logging.info("Attempting to reconnect to server")  # TODO: change to DEBUG
+            Progress.emit(Progress.SERVICE_RECONNECT_START)
+            try:
+                wt.getDbConn().close()
+                wt.getDbConn().open()
+            except ConnectionError as err:  # may fail
+                if not gSvcMgr:
+                    Logging.error("Failed to reconnect in client-only mode")
+                    raise  # Not OK if we are running in client-only mode
+                if gSvcMgr.isRunning():  # may have a race condition here, but low probability
+                    Logging.error("Failed to reconnect when managed server is running")
+                    raise  # Not OK if we are running normally
+
+                Progress.emit(Progress.SERVICE_RECONNECT_FAILURE)
+                # Logging.info("Ignoring DB reconnect error")
+
+            # print("_r", end="", flush=True)
+            Progress.emit(Progress.SERVICE_RECONNECT_SUCCESS)
+            # The above might have taken a lot of time, service might be running
+            # by now, causing error below to be incorrectly handled due to timing issue
+            return  # TODO: fix server restart status race condition
+
+
         dbc = wt.getDbConn()
         dbName = self._db.getName()
         for rTbName in sTable.getRegTables(dbc, dbName):  # regular tables
diff --git a/tests/pytest/crash_gen/misc.py b/tests/pytest/crash_gen/misc.py
index 8a2817b3898a..34a33c6af60d 100644
--- a/tests/pytest/crash_gen/misc.py
+++ b/tests/pytest/crash_gen/misc.py
@@ -163,11 +163,17 @@ class Progress:
     BEGIN_THREAD_STEP = 1
     END_THREAD_STEP = 2
     SERVICE_HEART_BEAT= 3
+    SERVICE_RECONNECT_START = 4
+    SERVICE_RECONNECT_SUCCESS = 5
+    SERVICE_RECONNECT_FAILURE = 6
     tokens = {
         STEP_BOUNDARY: '.',
         BEGIN_THREAD_STEP: '[',
         END_THREAD_STEP: '] ',
-        SERVICE_HEART_BEAT: '.Y.'
+        SERVICE_HEART_BEAT: '.Y.',
+        SERVICE_RECONNECT_START: '<r.',
+        SERVICE_RECONNECT_SUCCESS: '.r>',
+        SERVICE_RECONNECT_FAILURE: '.xr>',
     }
 
     @classmethod
diff --git a/tests/pytest/crash_gen/service_manager.py b/tests/pytest/crash_gen/service_manager.py
index bb2becb55bd3..196e9d944ab0 100644
--- a/tests/pytest/crash_gen/service_manager.py
+++ b/tests/pytest/crash_gen/service_manager.py
@@ -280,16 +280,18 @@ def stop(self):
         # process still alive, let's interrupt it
         print("Terminate running process, send SIG_INT and wait...")
         # sub process should end, then IPC queue should end, causing IO thread to end
-        self.subProcess.send_signal(signal.SIGINT)
+        # sig = signal.SIGINT
+        sig = signal.SIGKILL
+        self.subProcess.send_signal(sig)  # SIGINT or SIGKILL
         self.subProcess.wait(20)
         retCode = self.subProcess.returncode  # should always be there
         # May throw subprocess.TimeoutExpired exception above, therefore
         # The process is guranteed to have ended by now
         self.subProcess = None
         if retCode != 0:  # != (- signal.SIGINT):
-            Logging.error("TSP.stop(): Failed to stop sub proc properly w/ SIG_INT, retCode={}".format(retCode))
+            Logging.error("TSP.stop(): Failed to stop sub proc properly w/ SIG {}, retCode={}".format(sig, retCode))
         else:
-            Logging.info("TSP.stop(): sub proc successfully terminated with SIG_INT")
+            Logging.info("TSP.stop(): sub proc successfully terminated with SIG {}".format(sig))
         return - retCode
 
 class ServiceManager:
@@ -395,6 +397,13 @@ def isActive(self):
                 return True
         return False
 
+    def isRunning(self):
+        for ti in self._tInsts:
+            if not ti.getStatus().isRunning():
+                return False
+        return True
+
+
     # def isRestarting(self):
     #     """
     #     Determine if the service/cluster is being "restarted", i.e., at least
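
Note (illustrative, not part of the patch): every Progress constant the reconnect
path emits must have an entry in Progress.tokens, or the token lookup raises
KeyError -- hence SERVICE_RECONNECT_SUCCESS gets a token alongside START and
FAILURE. A minimal standalone sketch of the expected console output, assuming
emit() simply prints the mapped token inline (the helper's body is outside this
diff, so that behavior is an assumption):

    class ProgressSketch:
        SERVICE_RECONNECT_START = 4
        SERVICE_RECONNECT_SUCCESS = 5
        SERVICE_RECONNECT_FAILURE = 6

        tokens = {
            SERVICE_RECONNECT_START: '<r.',
            SERVICE_RECONNECT_SUCCESS: '.r>',
            SERVICE_RECONNECT_FAILURE: '.xr>',
        }

        @classmethod
        def emit(cls, token):
            # assumed behavior, mirroring inline markers like the heart-beat '.Y.'
            print(cls.tokens[token], end='', flush=True)

    # A clean simulated reconnect renders as "<r..r>"; a tolerated failure as
    # "<r..xr>.r>", because the task emits SUCCESS unconditionally after the
    # try/except block above.
    ProgressSketch.emit(ProgressSketch.SERVICE_RECONNECT_START)
    ProgressSketch.emit(ProgressSketch.SERVICE_RECONNECT_SUCCESS)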
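
Also illustrative: switching stop() from SIGINT to SIGKILL changes which return
code to expect. On POSIX, Popen.returncode for a signal-killed child is the
negative signal number, so a SIGKILL'ed server reports -9 rather than 0, and the
retCode != 0 branch above now logs an error even when the kill worked. A quick
standalone check (assumes a POSIX system with a sleep binary on PATH):

    import signal
    import subprocess

    proc = subprocess.Popen(["sleep", "60"])
    proc.send_signal(signal.SIGKILL)  # same call the patched stop() makes
    proc.wait(20)                     # reaps the child, like stop() does
    print(proc.returncode)            # -9: negative of the delivered signal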