From a0b83d475456bc8cd90c19405b5049509d485f3a Mon Sep 17 00:00:00 2001 From: Steven Li Date: Wed, 28 Oct 2020 06:38:41 +0000 Subject: [PATCH] Enhanced crash_gen to use SIG_KILL instead of SIG_INT when restarting services --- tests/pytest/crash_gen/crash_gen.py | 37 +++++++++++++++++++---- tests/pytest/crash_gen/service_manager.py | 15 +++++++-- 2 files changed, 43 insertions(+), 9 deletions(-) diff --git a/tests/pytest/crash_gen/crash_gen.py b/tests/pytest/crash_gen/crash_gen.py index 739fb699d6..3759f2b00f 100755 --- a/tests/pytest/crash_gen/crash_gen.py +++ b/tests/pytest/crash_gen/crash_gen.py @@ -1226,6 +1226,11 @@ class Task(): "To be implemeted by child classes, class name: {}".format( self.__class__.__name__)) + def _isServiceStable(self): + if not gSvcMgr: + return True # we don't run service, so let's assume it's stable + return gSvcMgr.isStable() # otherwise let's examine the service + def _isErrAcceptable(self, errno, msg): if errno in [ 0x05, # TSDB_CODE_RPC_NOT_READY @@ -1263,7 +1268,7 @@ class Task(): return True elif msg.find("duplicated column names") != -1: # also alter table tag issues return True - elif gSvcMgr and (not gSvcMgr.isStable()): # We are managing service, and ... + elif not self._isServiceStable(): # We are managing service, and ... Logging.info("Ignoring error when service starting/stopping: errno = {}, msg = {}".format(errno, msg)) return True @@ -1641,15 +1646,35 @@ class TaskReadData(StateTransitionTask): def canBeginFrom(cls, state: AnyState): return state.canReadData() + # def _canRestartService(self): + # if not gSvcMgr: + # return True # always + # return gSvcMgr.isActive() # only if it's running TODO: race condition here + def _executeInternal(self, te: TaskExecutor, wt: WorkerThread): sTable = self._db.getFixedSuperTable() - # 1 in 5 chance, simulate a broken connection. 
- if random.randrange(5) == 0: # TODO: break connection in all situations - wt.getDbConn().close() - wt.getDbConn().open() + # 1 in 20 chance, simulate a broken connection, only if service stable (not restarting) + if random.randrange(20)==0: # and self._canRestartService(): # TODO: break connection in all situations + Logging.info("Attempting to reconnect to server") # TODO: change to DEBUG + try: + wt.getDbConn().close() + wt.getDbConn().open() + except ConnectionError as err: # may fail + if not gSvcMgr: + Logging.error("Failed to reconnect in client-only mode") + raise # Not OK if we are running in client-only mode + if gSvcMgr.isRunning(): # may have race condition, but low prob, due to + Logging.error("Failed to reconnect when managed server is running") + raise # Not OK if we are running normally + Logging.info("Ignoring DB reconnect error") + print("_r", end="", flush=True) - + # The above might have taken a lot of time, service might be running + # by now, causing error below to be incorrectly handled due to timing issue + return # TODO: fix server restart status race condition + + dbc = wt.getDbConn() dbName = self._db.getName() for rTbName in sTable.getRegTables(dbc, dbName): # regular tables diff --git a/tests/pytest/crash_gen/service_manager.py b/tests/pytest/crash_gen/service_manager.py index bb2becb55b..196e9d944a 100644 --- a/tests/pytest/crash_gen/service_manager.py +++ b/tests/pytest/crash_gen/service_manager.py @@ -280,16 +280,18 @@ class TdeSubProcess: # process still alive, let's interrupt it print("Terminate running process, send SIG_INT and wait...") # sub process should end, then IPC queue should end, causing IO thread to end - self.subProcess.send_signal(signal.SIGINT) + # sig = signal.SIGINT + sig = signal.SIGKILL + self.subProcess.send_signal(sig) # SIGINT or SIGKILL self.subProcess.wait(20) retCode = self.subProcess.returncode # should always be there # May throw subprocess.TimeoutExpired exception above, therefore # The process is 
guranteed to have ended by now self.subProcess = None if retCode != 0: # != (- signal.SIGINT): - Logging.error("TSP.stop(): Failed to stop sub proc properly w/ SIG_INT, retCode={}".format(retCode)) + Logging.error("TSP.stop(): Failed to stop sub proc properly w/ SIG {}, retCode={}".format(sig, retCode)) else: - Logging.info("TSP.stop(): sub proc successfully terminated with SIG_INT") + Logging.info("TSP.stop(): sub proc successfully terminated with SIG {}".format(sig)) return - retCode class ServiceManager: @@ -395,6 +397,13 @@ class ServiceManager: return True return False + def isRunning(self): + for ti in self._tInsts: + if not ti.getStatus().isRunning(): + return False + return True + + # def isRestarting(self): # """ # Determine if the service/cluster is being "restarted", i.e., at least -- GitLab