From a79c5b1278481da104f97c8e667948ebf3896542 Mon Sep 17 00:00:00 2001
From: Steven Li <stevenli@taosdata.com>
Date: Tue, 28 Jul 2020 08:34:35 +0000
Subject: [PATCH] Minor tweaks for crash_gen tool, re-demonstrating TD-997

---
 tests/pytest/crash_gen.py | 82 ++++++++++++++++++++-------------------
 1 file changed, 42 insertions(+), 40 deletions(-)

diff --git a/tests/pytest/crash_gen.py b/tests/pytest/crash_gen.py
index fead65596f..98181180e2 100755
--- a/tests/pytest/crash_gen.py
+++ b/tests/pytest/crash_gen.py
@@ -136,7 +136,10 @@ class WorkerThread:
 
         # clean up
         if (gConfig.per_thread_db_connection):  # type: ignore
-            self._dbConn.close()
+            if self._dbConn.isOpen: #sometimes it is not open
+                self._dbConn.close()
+            else:
+                logger.warning("Cleaning up worker thread, dbConn already closed")
 
     def _doTaskLoop(self):
         # while self._curStep < self._pool.maxSteps:
@@ -146,6 +149,7 @@ class WorkerThread:
             try:
                 tc.crossStepBarrier()  # shared barrier first, INCLUDING the last one
             except threading.BrokenBarrierError as err: # main thread timed out
+                print("_bto", end="")
                 logger.debug("[TRD] Worker thread exiting due to main thread barrier time-out")
                 break
 
@@ -153,6 +157,7 @@ class WorkerThread:
             self.crossStepGate()   # then per-thread gate, after being tapped
             logger.debug("[TRD] Worker thread [{}] exited step gate...".format(self._tid))
             if not self._tc.isRunning():
+                print("_wts", end="")
                 logger.debug("[TRD] Thread Coordinator not running any more, worker thread now stopping...")
                 break
 
@@ -169,6 +174,7 @@ class WorkerThread:
             logger.debug("[TRD] Worker thread [{}] finished executing task".format(self._tid))
 
             self._dbInUse = False  # there may be changes between steps
+        # print("_wtd", end=None) # worker thread died
 
     def verifyThreadSelf(self):  # ensure we are called by this own thread
         if (threading.get_ident() != self._thread.ident):
@@ -197,12 +203,15 @@ class WorkerThread:
         # self._curStep += 1  # off to a new step...
 
     def tapStepGate(self):  # give it a tap, release the thread waiting there
-        self.verifyThreadAlive()
+        # self.verifyThreadAlive()
         self.verifyThreadMain()  # only allowed for main thread
 
-        logger.debug("[TRD] Tapping worker thread {}".format(self._tid))
-        self._stepGate.set()  # wake up!
-        time.sleep(0)  # let the released thread run a bit
+        if self._thread.is_alive():
+            logger.debug("[TRD] Tapping worker thread {}".format(self._tid))
+            self._stepGate.set()  # wake up!
+            time.sleep(0)  # let the released thread run a bit
+        else:
+            print("_tad", end="") # Thread already dead
 
     def execSql(self, sql):  # TODO: expose DbConn directly
         return self.getDbConn().execute(sql)
@@ -332,6 +341,7 @@ class ThreadCoordinator:
                 # end, and maybe signal them to stop
             else:
                 raise
+        return transitionFailed
 
         self.resetExecutedTasks()  # clear the tasks after we are done
         # Get ready for next step
@@ -373,7 +383,13 @@ class ThreadCoordinator:
                 break # do transition only if tasks are error free
 
             # Ending previous step
-            transitionFailed = self._doTransition() # To start, we end step -1 first
+            try:
+                transitionFailed = self._doTransition() # To start, we end step -1 first
+            except taos.error.ProgrammingError as err:
+                transitionFailed = True
+                errno2 = err.errno if (err.errno > 0) else 0x80000000 + err.errno  # correct error scheme
+                logger.info("Transition failed: errno=0x{:X}, msg: {}".format(errno2, err))
+
             # Then we move on to the next step
             self._releaseAllWorkerThreads(transitionFailed)                    
 
@@ -788,12 +804,15 @@ class DbConnNative(DbConn):
         else:
             projPath = selfPath[:selfPath.find("tests")]
 
+        buildPath = None
         for root, dirs, files in os.walk(projPath):
             if ("taosd" in files):
                 rootRealPath = os.path.dirname(os.path.realpath(root))
                 if ("packaging" not in rootRealPath):
                     buildPath = root[:len(root) - len("/build/bin")]
                     break
+        if buildPath == None:
+            raise RuntimeError("Failed to determine buildPath, selfPath={}".format(self_path))
         return buildPath
 
     
@@ -1015,29 +1034,11 @@ class StateDbOnly(AnyState):
         if (not self.hasTask(tasks, TaskCreateDb)):
             # only if we don't create any more
             self.assertAtMostOneSuccess(tasks, TaskDropDb)
-        self.assertIfExistThenSuccess(tasks, TaskDropDb)
-        # self.assertAtMostOneSuccess(tasks, CreateFixedTableTask) # not true in massively parrallel cases
-        # Nothing to be said about adding data task
-        # if ( self.hasSuccess(tasks, DropDbTask) ): # dropped the DB
-        # self.assertHasTask(tasks, DropDbTask) # implied by hasSuccess
-        # self.assertAtMostOneSuccess(tasks, DropDbTask)
-        # self._state = self.STATE_EMPTY
-        # if ( self.hasSuccess(tasks, TaskCreateSuperTable) ): # did not drop db, create table success
-        #     # self.assertHasTask(tasks, CreateFixedTableTask) # tried to create table
-        #     if ( not self.hasTask(tasks, TaskDropSuperTable) ):
-        #         self.assertAtMostOneSuccess(tasks, TaskCreateSuperTable) # at most 1 attempt is successful, if we don't drop anything
-        # self.assertNoTask(tasks, DropDbTask) # should have have tried
-        # if ( not self.hasSuccess(tasks, AddFixedDataTask) ): # just created table, no data yet
-        #     # can't say there's add-data attempts, since they may all fail
-        #     self._state = self.STATE_TABLE_ONLY
-        # else:
-        #     self._state = self.STATE_HAS_DATA
-        # What about AddFixedData?
-        # elif ( self.hasSuccess(tasks, AddFixedDataTask) ):
-        #     self._state = self.STATE_HAS_DATA
-        # else: # no success in dropping db tasks, no success in create fixed table? read data should also fail
-        #     # raise RuntimeError("Unexpected no-success scenario")   # We might just landed all failure tasks,
-        #     self._state = self.STATE_DB_ONLY  # no change
+
+        # TODO: restore the below, the problem exists, although unlikely in real-world
+        # if (gSvcMgr!=None) and gSvcMgr.isRestarting():     
+        # if (gSvcMgr == None) or (not gSvcMgr.isRestarting()) : 
+        #     self.assertIfExistThenSuccess(tasks, TaskDropDb)       
 
 
 class StateSuperTableOnly(AnyState):
@@ -1192,7 +1193,7 @@ class StateMechine:
             # case of multiple creation and drops
 
         if self._curState.canDropDb():
-            if gSvcMgr == None: # only if we are not restarting service
+            if gSvcMgr == None: # only if we are running as client-only
                 self._curState.assertIfExistThenSuccess(tasks, TaskDropDb)
             # self.assertAtMostOneSuccess(tasks, DropDbTask) # not really in
             # case of drop-create-drop
@@ -1522,8 +1523,7 @@ class Task():
         try:
             self._executeInternal(te, wt)  # TODO: no return value?
         except taos.error.ProgrammingError as err:
-            errno2 = err.errno if (
-                err.errno > 0) else 0x80000000 + err.errno  # correct error scheme
+            errno2 = err.errno if (err.errno > 0) else 0x80000000 + err.errno  # correct error scheme
             if (gConfig.continue_on_exception):  # user choose to continue
                 self.logDebug("[=] Continue after TAOS exception: errno=0x{:X}, msg: {}, SQL: {}".format(
                         errno2, err, wt.getDbConn().getLastSql()))
@@ -1961,6 +1961,7 @@ class TaskRestartService(StateTransitionTask):
             return state.canDropFixedSuperTable()  # Basicallly when we have the super table
         return False # don't run this otherwise
 
+    CHANCE_TO_RESTART_SERVICE = 100
     def _executeInternal(self, te: TaskExecutor, wt: WorkerThread):
         if not gConfig.auto_start_service: # only execute when we are in -a mode
             print("_a", end="", flush=True)
@@ -1972,7 +1973,7 @@ class TaskRestartService(StateTransitionTask):
                 return
             self._isRunning = True
 
-        if Dice.throw(50) == 0: # 1 in N chance
+        if Dice.throw(self.CHANCE_TO_RESTART_SERVICE) == 0: # 1 in N chance
             dbc = wt.getDbConn()
             dbc.execute("show databases") # simple delay, align timing with other workers
             gSvcMgr.restart()
@@ -2459,11 +2460,12 @@ class TdeSubProcess:
         # for f in filelist:
         #     filePath = os.path.join(logPath, f)
         #     print("Removing log file: {}".format(filePath))
-        #     os.remove(filePath)
-        logPathSaved = logPath + "_" + time.strftime('%Y-%m-%d-%H-%M-%S')
-        logger.info("Saving old log files to: {}".format(logPathSaved))
-        os.rename(logPath, logPathSaved)
-        os.mkdir(logPath) # recreate
+        #     os.remove(filePath)        
+        if os.path.exists(logPath):
+            logPathSaved = logPath + "_" + time.strftime('%Y-%m-%d-%H-%M-%S')
+            logger.info("Saving old log files to: {}".format(logPathSaved))
+            os.rename(logPath, logPathSaved)
+        # os.mkdir(logPath) # recreate, no need actually, TDengine will auto-create with proper perms
             
 
         svcCmd = [taosdPath, '-c', cfgPath]
@@ -2622,13 +2624,13 @@ class ClientManager:
     def prepare(self):
         self._printLastNumbers()
 
-    def run(self, svcMgr):
+    def run(self, svcMgr):    
         self._printLastNumbers()
 
         dbManager = DbManager()  # Regular function
         thPool = ThreadPool(gConfig.num_threads, gConfig.max_steps)
         self.tc = ThreadCoordinator(thPool, dbManager)
-
+        
         self.tc.run()
         # print("exec stats: {}".format(self.tc.getExecStats()))
         # print("TC failed = {}".format(self.tc.isFailed()))
-- 
GitLab