Unverified commit 968285e4 authored by Shengliang Guan, committed by GitHub

Merge pull request #4129 from taosdata/feature/crash_gen

Refactoring crash_gen tool, helping to reproduce TD-1955
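
For context, the core of this refactor is a per-table locking scheme (the new `Task._lock` / `Task._tableLocks` members and the `lockTable`/`unlockTable` methods in the diff below), which lets concurrent worker threads serialize "create table" and "insert then verify" operations on the same table. The following is a minimal standalone sketch of that pattern, not the actual crash_gen code; the class and method names (`TableLockRegistry`, `lock_table`, `unlock_table`) are hypothetical:

```python
import threading
from typing import Dict


class TableLockRegistry:
    """Sketch of the per-table lock pattern: a class-level lock guards the
    registry dict, and each full table name gets its own lazily created Lock."""
    _lock = threading.Lock()                      # protects _tableLocks itself
    _tableLocks: Dict[str, threading.Lock] = {}   # one Lock per "db.table" name

    @classmethod
    def lock_table(cls, full_table_name: str) -> None:
        with cls._lock:  # create the per-table lock exactly once
            if full_table_name not in cls._tableLocks:
                cls._tableLocks[full_table_name] = threading.Lock()
        cls._tableLocks[full_table_name].acquire()  # block until the table is free

    @classmethod
    def unlock_table(cls, full_table_name: str) -> None:
        with cls._lock:
            if full_table_name not in cls._tableLocks:
                raise RuntimeError("Corrupt state, no such lock")
            lock = cls._tableLocks[full_table_name]
            if not lock.locked():
                raise RuntimeError("Corrupt state, already unlocked")
        lock.release()


# Usage: bracket a create/insert/verify sequence on one table, releasing the
# lock even if the SQL raises (mirrors the try/finally usage in the diff).
if __name__ == "__main__":
    TableLockRegistry.lock_table("db.reg_table_0")
    try:
        pass  # e.g. CREATE TABLE ... / INSERT ... / SELECT ... to verify
    finally:
        TableLockRegistry.unlock_table("db.reg_table_0")
```

Note: this sketch acquires the per-table lock outside the registry lock, so a waiting thread does not stall lock creation for other tables; the actual implementation in the diff may order these steps differently.
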
......@@ -38,9 +38,9 @@ import resource
from guppy import hpy
import gc
from .service_manager import ServiceManager, TdeInstance
from .misc import Logging, Status, CrashGenError, Dice, Helper, Progress
from .db import DbConn, MyTDSql, DbConnNative, DbManager
from crash_gen.service_manager import ServiceManager, TdeInstance
from crash_gen.misc import Logging, Status, CrashGenError, Dice, Helper, Progress
from crash_gen.db import DbConn, MyTDSql, DbConnNative, DbManager
import taos
import requests
......@@ -243,7 +243,7 @@ class WorkerThread:
class ThreadCoordinator:
WORKER_THREAD_TIMEOUT = 180 # one minute
WORKER_THREAD_TIMEOUT = 120 # Normal: 120
def __init__(self, pool: ThreadPool, dbManager: DbManager):
self._curStep = -1 # first step is 0
......@@ -388,9 +388,9 @@ class ThreadCoordinator:
self._syncAtBarrier() # For now just cross the barrier
Progress.emit(Progress.END_THREAD_STEP)
except threading.BrokenBarrierError as err:
Logging.info("Main loop aborted, caused by worker thread time-out")
Logging.info("Main loop aborted, caused by worker thread(s) time-out")
self._execStats.registerFailure("Aborted due to worker thread timeout")
print("\n\nWorker Thread time-out detected, important thread info:")
print("\n\nWorker Thread time-out detected, TAOS related threads are:")
ts = ThreadStacks()
ts.print(filterInternal=True)
workerTimeout = True
......@@ -435,7 +435,7 @@ class ThreadCoordinator:
Logging.debug("\r\n\n--> Main thread ready to finish up...")
Logging.debug("Main thread joining all threads")
self._pool.joinAll() # Get all threads to finish
Logging.info("\nAll worker threads finished")
Logging.info(". . . All worker threads finished") # No CR/LF before
self._execStats.endExec()
def cleanup(self): # free resources
......@@ -1072,17 +1072,18 @@ class Database:
t3 = datetime.datetime(2012, 1, 1) # default "keep" is 10 years
t4 = datetime.datetime.fromtimestamp(
t3.timestamp() + elSec2) # see explanation above
Logging.info("Setting up TICKS to start from: {}".format(t4))
Logging.debug("Setting up TICKS to start from: {}".format(t4))
return t4
@classmethod
def getNextTick(cls):
with cls._clsLock: # prevent duplicate tick
if cls._lastLaggingTick==0:
if cls._lastLaggingTick==0 or cls._lastTick==0 : # not initialized
# 10k at 1/20 chance, should be enough to avoid overlaps
cls._lastLaggingTick = cls.setupLastTick() + datetime.timedelta(0, -10000)
if cls._lastTick==0: # should be quite a bit into the future
cls._lastTick = cls.setupLastTick()
tick = cls.setupLastTick()
cls._lastTick = tick
cls._lastLaggingTick = tick + datetime.timedelta(0, -10000)
# if : # should be quite a bit into the future
if Dice.throw(20) == 0: # 1 in 20 chance, return lagging tick
cls._lastLaggingTick += datetime.timedelta(0, 1) # advance the lagging tick by 1 second
......@@ -1177,6 +1178,8 @@ class Task():
instead. But a task is always associated with a DB
'''
taskSn = 100
_lock = threading.Lock()
_tableLocks: Dict[str, threading.Lock] = {}
@classmethod
def allocTaskNum(cls):
......@@ -1198,6 +1201,8 @@ class Task():
self._execStats = execStats
self._db = db # A task is always associated/for a specific DB
def isSuccess(self):
return self._err is None
......@@ -1237,6 +1242,7 @@ class Task():
0x0B, # Unable to establish connection, more details in TD-1648
0x200, # invalid SQL, TODO: re-examine with TD-934
0x20F, # query terminated, possibly due to vnoding being dropped, see TD-1776
0x213, # "Disconnected from service", result of "kill connection ???"
0x217, # "db not selected", client side defined error code
# 0x218, # "Table does not exist" client side defined error code
0x360, # Table already exists
......@@ -1318,7 +1324,7 @@ class Task():
self._err = err
self._aborted = True
except Exception as e:
self.logInfo("Non-TAOS exception encountered")
Logging.info("Non-TAOS exception encountered with: {}".format(self.__class__.__name__))
self._err = e
self._aborted = True
traceback.print_exc()
......@@ -1351,6 +1357,24 @@ class Task():
def getQueryResult(self, wt: WorkerThread): # execute an SQL on the worker thread
return wt.getQueryResult()
def lockTable(self, ftName): # full table name
# print(" <<" + ftName + '_', end="", flush=True)
with Task._lock:
if not ftName in Task._tableLocks:
Task._tableLocks[ftName] = threading.Lock()
Task._tableLocks[ftName].acquire()
def unlockTable(self, ftName):
# print('_' + ftName + ">> ", end="", flush=True)
with Task._lock:
if not ftName in self._tableLocks:
raise RuntimeError("Corrupt state, no such lock")
lock = Task._tableLocks[ftName]
if not lock.locked():
raise RuntimeError("Corrupte state, already unlocked")
lock.release()
class ExecutionStats:
def __init__(self):
......@@ -1461,7 +1485,7 @@ class StateTransitionTask(Task):
_baseTableNumber = None
_endState = None
_endState = None # TODO: no longer used?
@classmethod
def getInfo(cls): # each sub class should supply their own information
......@@ -1486,7 +1510,7 @@ class StateTransitionTask(Task):
@classmethod
def getRegTableName(cls, i):
if ( StateTransitionTask._baseTableNumber is None):
if ( StateTransitionTask._baseTableNumber is None): # Set it one time
StateTransitionTask._baseTableNumber = Dice.throw(
999) if gConfig.dynamic_db_table_names else 0
return "reg_table_{}".format(StateTransitionTask._baseTableNumber + i)
......@@ -1544,8 +1568,11 @@ class TaskCreateSuperTable(StateTransitionTask):
sTable = self._db.getFixedSuperTable() # type: TdSuperTable
# wt.execSql("use db") # should always be in place
sTable.create(wt.getDbConn(), self._db.getName(),
{'ts':'timestamp', 'speed':'int'}, {'b':'binary(200)', 'f':'float'})
{'ts':'timestamp', 'speed':'int'}, {'b':'binary(200)', 'f':'float'},
dropIfExists = True
)
# self.execWtSql(wt,"create table db.{} (ts timestamp, speed int) tags (b binary(200), f float) ".format(tblName))
# No need to create the regular tables, INSERT will do that
# automatically
......@@ -1558,13 +1585,40 @@ class TdSuperTable:
def getName(self):
return self._stName
def drop(self, dbc, dbName, skipCheck = False):
if self.exists(dbc, dbName) : # if myself exists
fullTableName = dbName + '.' + self._stName
dbc.execute("DROP TABLE {}".format(fullTableName))
else:
if not skipCheck:
raise CrashGenError("Cannot drop non-existant super table: {}".format(self._stName))
def exists(self, dbc, dbName):
dbc.execute("USE " + dbName)
return dbc.existsSuperTable(self._stName)
# TODO: odd semantic, create() method is usually static?
def create(self, dbc, dbName, cols: dict, tags: dict):
def create(self, dbc, dbName, cols: dict, tags: dict,
dropIfExists = False
):
'''Creating a super table'''
sql = "CREATE TABLE {}.{} ({}) TAGS ({})".format(
dbName,
self._stName,
",".join(['%s %s'%(k,v) for (k,v) in cols.items()]),
dbc.execute("USE " + dbName)
fullTableName = dbName + '.' + self._stName
if dbc.existsSuperTable(self._stName):
if dropIfExists:
dbc.execute("DROP TABLE {}".format(fullTableName))
else: # error
raise CrashGenError("Cannot create super table, already exists: {}".format(self._stName))
# Now let's create
sql = "CREATE TABLE {} ({})".format(
fullTableName,
",".join(['%s %s'%(k,v) for (k,v) in cols.items()]))
if tags is None :
sql += " TAGS (dummy int) "
else:
sql += " TAGS ({})".format(
",".join(['%s %s'%(k,v) for (k,v) in tags.items()])
)
dbc.execute(sql)
......@@ -1583,14 +1637,25 @@ class TdSuperTable:
def hasRegTables(self, dbc: DbConn, dbName: str):
return dbc.query("SELECT * FROM {}.{}".format(dbName, self._stName)) > 0
def ensureTable(self, dbc: DbConn, dbName: str, regTableName: str):
def ensureTable(self, task: Task, dbc: DbConn, dbName: str, regTableName: str):
sql = "select tbname from {}.{} where tbname in ('{}')".format(dbName, self._stName, regTableName)
if dbc.query(sql) >= 1 : # reg table exists already
return
sql = "CREATE TABLE {}.{} USING {}.{} tags ({})".format(
dbName, regTableName, dbName, self._stName, self._getTagStrForSql(dbc, dbName)
# acquire a lock first, so as to be able to *verify*. More details in TD-1471
fullTableName = dbName + '.' + regTableName
if task is not None: # optional lock
task.lockTable(fullTableName)
Progress.emit(Progress.CREATE_TABLE_ATTEMPT) # ATTEMPT to create a new table
# print("(" + fullTableName[-3:] + ")", end="", flush=True)
try:
sql = "CREATE TABLE {} USING {}.{} tags ({})".format(
fullTableName, dbName, self._stName, self._getTagStrForSql(dbc, dbName)
)
dbc.execute(sql)
finally:
if task is not None:
task.unlockTable(fullTableName) # no matter what
def _getTagStrForSql(self, dbc, dbName: str) :
tags = self._getTags(dbc, dbName)
......@@ -1809,7 +1874,7 @@ class TaskRestartService(StateTransitionTask):
with self._classLock:
if self._isRunning:
print("Skipping restart task, another running already")
Logging.info("Skipping restart task, another running already")
return
self._isRunning = True
......@@ -1847,24 +1912,21 @@ class TaskAddData(StateTransitionTask):
def canBeginFrom(cls, state: AnyState):
return state.canAddData()
def _executeInternal(self, te: TaskExecutor, wt: WorkerThread):
# ds = self._dbManager # Quite DANGEROUS here, may result in multi-thread client access
db = self._db
dbc = wt.getDbConn()
tblSeq = list(range(
self.LARGE_NUMBER_OF_TABLES if gConfig.larger_data else self.SMALL_NUMBER_OF_TABLES))
random.shuffle(tblSeq)
for i in tblSeq:
if (i in self.activeTable): # wow already active
print("x", end="", flush=True) # concurrent insertion
else:
self.activeTable.add(i) # marking it active
def _addDataInBatch(self, db, dbc, regTableName, te: TaskExecutor):
numRecords = self.LARGE_NUMBER_OF_RECORDS if gConfig.larger_data else self.SMALL_NUMBER_OF_RECORDS
fullTableName = db.getName() + '.' + regTableName
sTable = db.getFixedSuperTable()
regTableName = self.getRegTableName(i) # "db.reg_table_{}".format(i)
sTable.ensureTable(wt.getDbConn(), db.getName(), regTableName) # Ensure the table exists
sql = "insert into {} values ".format(fullTableName)
for j in range(numRecords): # number of records per table
nextInt = db.getNextInt()
nextTick = db.getNextTick()
sql += "('{}', {});".format(nextTick, nextInt)
dbc.execute(sql)
for j in range(self.LARGE_NUMBER_OF_RECORDS if gConfig.larger_data else self.SMALL_NUMBER_OF_RECORDS): # number of records per table
def _addData(self, db, dbc, regTableName, te: TaskExecutor): # implied: NOT in batches
numRecords = self.LARGE_NUMBER_OF_RECORDS if gConfig.larger_data else self.SMALL_NUMBER_OF_RECORDS
for j in range(numRecords): # number of records per table
nextInt = db.getNextInt()
nextTick = db.getNextTick()
if gConfig.record_ops:
......@@ -1872,27 +1934,29 @@ class TaskAddData(StateTransitionTask):
self.fAddLogReady.write("Ready to write {} to {}\n".format(nextInt, regTableName))
self.fAddLogReady.flush()
os.fsync(self.fAddLogReady)
sql = "insert into {}.{} values ('{}', {});".format( # removed: tags ('{}', {})
db.getName(),
regTableName,
# TODO: too ugly trying to lock the table reliably, refactor...
fullTableName = db.getName() + '.' + regTableName
if gConfig.verify_data:
self.lockTable(fullTableName)
# print("_w" + str(nextInt % 100), end="", flush=True) # Trace what was written
try:
sql = "insert into {} values ('{}', {});".format( # removed: tags ('{}', {})
fullTableName,
# ds.getFixedSuperTableName(),
# ds.getNextBinary(), ds.getNextFloat(),
nextTick, nextInt)
dbc.execute(sql)
# Successfully wrote the data into the DB, let's record it
# somehow
te.recordDataMark(nextInt)
if gConfig.record_ops:
self.fAddLogDone.write(
"Wrote {} to {}\n".format(
nextInt, regTableName))
self.fAddLogDone.flush()
os.fsync(self.fAddLogDone)
except: # Any exception at all
if gConfig.verify_data:
self.unlockTable(fullTableName)
raise
# Now read it back and verify, we might encounter an error if table is dropped
if gConfig.verify_data: # only if command line asks for it
try:
readBack = dbc.queryScalar("SELECT speed from {}.{} WHERE ts= '{}'".
readBack = dbc.queryScalar("SELECT speed from {}.{} WHERE ts='{}'".
format(db.getName(), regTableName, nextTick))
if readBack != nextInt :
raise taos.error.ProgrammingError(
......@@ -1905,14 +1969,50 @@ class TaskAddData(StateTransitionTask):
"Failed to read back same data for tick: {}, wrote: {}, read: {}"
.format(nextTick, nextInt, "Empty Result" if errno==0x991 else "Multiple Result"),
errno)
# Re-throw no matter what
elif errno in [0x218, 0x362]: # table doesn't exist
# do nothing
dummy = 0
else:
# Re-throw otherwise
raise
finally:
self.unlockTable(fullTableName) # Unlock the table no matter what
# Successfully wrote the data into the DB, let's record it somehow
te.recordDataMark(nextInt)
self.activeTable.discard(i) # not raising an error, unlike remove
if gConfig.record_ops:
self.fAddLogDone.write("Wrote {} to {}\n".format(nextInt, regTableName))
self.fAddLogDone.flush()
os.fsync(self.fAddLogDone)
def _executeInternal(self, te: TaskExecutor, wt: WorkerThread):
# ds = self._dbManager # Quite DANGEROUS here, may result in multi-thread client access
db = self._db
dbc = wt.getDbConn()
numTables = self.LARGE_NUMBER_OF_TABLES if gConfig.larger_data else self.SMALL_NUMBER_OF_TABLES
numRecords = self.LARGE_NUMBER_OF_RECORDS if gConfig.larger_data else self.SMALL_NUMBER_OF_RECORDS
tblSeq = list(range(numTables ))
random.shuffle(tblSeq) # now we have random sequence
for i in tblSeq:
if (i in self.activeTable): # wow already active
print("x", end="", flush=True) # concurrent insertion
else:
self.activeTable.add(i) # marking it active
sTable = db.getFixedSuperTable()
regTableName = self.getRegTableName(i) # "db.reg_table_{}".format(i)
fullTableName = db.getName() + '.' + regTableName
# self._lockTable(fullTableName) # "create table" below. Stop it if the table is "locked"
sTable.ensureTable(self, wt.getDbConn(), db.getName(), regTableName) # Ensure the table exists
# self._unlockTable(fullTableName)
if Dice.throw(1) == 0: # 1 in 2 chance
self._addData(db, dbc, regTableName, te)
else:
self._addDataInBatch(db, dbc, regTableName, te)
self.activeTable.discard(i) # not raising an error, unlike remove
class ThreadStacks: # stack info for all threads
......@@ -1936,17 +2036,18 @@ class ThreadStacks: # stack info for all threads
'__init__']: # the thread that extracted the stack
continue # ignore
# Now print
print("\n<----- Thread Info for ID: {}".format(thNid))
print("\n<----- Thread Info for LWP/ID: {} (Execution stopped at Bottom Frame) <-----".format(thNid))
stackFrame = 0
for frame in stack:
# print(frame)
print("File {filename}, line {lineno}, in {name}".format(
filename=frame.filename, lineno=frame.lineno, name=frame.name))
print("[{sf}] File {filename}, line {lineno}, in {name}".format(
sf=stackFrame, filename=frame.filename, lineno=frame.lineno, name=frame.name))
print(" {}".format(frame.line))
print("-----> End of Thread Info\n")
print("-----> End of Thread Info ----->\n")
class ClientManager:
def __init__(self):
print("Starting service manager")
Logging.info("Starting service manager")
# signal.signal(signal.SIGTERM, self.sigIntHandler)
# signal.signal(signal.SIGINT, self.sigIntHandler)
......@@ -2048,7 +2149,7 @@ class ClientManager:
thPool = ThreadPool(gConfig.num_threads, gConfig.max_steps)
self.tc = ThreadCoordinator(thPool, dbManager)
print("Starting client instance to: {}".format(tInst))
Logging.info("Starting client instance: {}".format(tInst))
self.tc.run()
# print("exec stats: {}".format(self.tc.getExecStats()))
# print("TC failed = {}".format(self.tc.isFailed()))
......
......@@ -95,6 +95,11 @@ class DbConn:
# print("dbs = {}, str = {}, ret2={}, type2={}".format(dbs, dbName,ret2, type(dbName)))
return dbName in dbs # TODO: super weird type mangling seen, once here
def existsSuperTable(self, stName):
self.query("show stables")
sts = [v[0] for v in self.getQueryResult()]
return stName in sts
def hasTables(self):
return self.query("show tables") > 0
......@@ -240,6 +245,7 @@ class MyTDSql:
def _execInternal(self, sql):
startTime = time.time()
# Logging.debug("Executing SQL: " + sql)
ret = self._cursor.execute(sql)
# print("\nSQL success: {}".format(sql))
queryTime = time.time() - startTime
......
......@@ -27,7 +27,7 @@ class LoggingFilter(logging.Filter):
class MyLoggingAdapter(logging.LoggerAdapter):
def process(self, msg, kwargs):
return "[{}] {}".format(threading.get_ident() % 10000, msg), kwargs
return "[{:04d}] {}".format(threading.get_ident() % 10000, msg), kwargs
# return '[%s] %s' % (self.extra['connid'], msg), kwargs
......@@ -51,7 +51,7 @@ class Logging:
_logger.addHandler(ch)
# Logging adapter, to be used as a logger
print("setting logger variable")
# print("setting logger variable")
# global logger
cls.logger = MyLoggingAdapter(_logger, [])
......@@ -166,6 +166,9 @@ class Progress:
SERVICE_RECONNECT_START = 4
SERVICE_RECONNECT_SUCCESS = 5
SERVICE_RECONNECT_FAILURE = 6
SERVICE_START_NAP = 7
CREATE_TABLE_ATTEMPT = 8
tokens = {
STEP_BOUNDARY: '.',
BEGIN_THREAD_STEP: '[',
......@@ -174,6 +177,8 @@ class Progress:
SERVICE_RECONNECT_START: '<r.',
SERVICE_RECONNECT_SUCCESS: '.r>',
SERVICE_RECONNECT_FAILURE: '.xr>',
SERVICE_START_NAP: '_zz',
CREATE_TABLE_ATTEMPT: '_c',
}
@classmethod
......
......@@ -47,6 +47,17 @@ class TdeInstance():
.format(selfPath, projPath))
return buildPath
@classmethod
def prepareGcovEnv(cls, env):
# Ref: https://gcc.gnu.org/onlinedocs/gcc/Cross-profiling.html
bPath = cls._getBuildPath() # build PATH
numSegments = len(bPath.split('/')) - 1 # "/x/TDengine/build" should yield 3
numSegments = numSegments - 1 # DEBUG only
env['GCOV_PREFIX'] = bPath + '/svc_gcov'
env['GCOV_PREFIX_STRIP'] = str(numSegments) # Strip every element, plus, ENV needs strings
Logging.info("Preparing GCOV environement to strip {} elements and use path: {}".format(
numSegments, env['GCOV_PREFIX'] ))
def __init__(self, subdir='test', tInstNum=0, port=6030, fepPort=6030):
self._buildDir = self._getBuildPath()
self._subdir = '/' + subdir # TODO: tolerate "/"
......@@ -217,6 +228,11 @@ class TdeSubProcess:
# raise CrashGenError("Empty instance not allowed in TdeSubProcess")
# self._tInst = tInst # Default create at ServiceManagerThread
def __repr__(self):
if self.subProcess is None:
return '[TdeSubProc: Empty]'
return '[TdeSubProc: pid = {}]'.format(self.getPid())
def getStdOut(self):
return self.subProcess.stdout
......@@ -236,16 +252,29 @@ class TdeSubProcess:
if self.subProcess: # already there
raise RuntimeError("Corrupt process state")
# Prepare environment variables for coverage information
# Ref: https://stackoverflow.com/questions/2231227/python-subprocess-popen-with-a-modified-environment
myEnv = os.environ.copy()
TdeInstance.prepareGcovEnv(myEnv)
# print(myEnv)
# print(myEnv.items())
# print("Starting TDengine via Shell: {}".format(cmdLineStr))
useShell = True
self.subProcess = subprocess.Popen(
cmdLine,
shell=False,
' '.join(cmdLine) if useShell else cmdLine,
shell=useShell,
# svcCmdSingle, shell=True, # capture core dump?
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
# bufsize=1, # not supported in binary mode
close_fds=ON_POSIX
close_fds=ON_POSIX,
env=myEnv
) # had text=True, which interfered with reading EOF
STOP_SIGNAL = signal.SIGKILL # What signal to use (in kill) to stop a taosd process?
def stop(self):
"""
Stop a sub process, and try to return a meaningful return code.
......@@ -267,7 +296,7 @@ class TdeSubProcess:
SIGUSR2 12
"""
if not self.subProcess:
print("Sub process already stopped")
Logging.error("Sub process already stopped")
return # -1
retCode = self.subProcess.poll() # ret -N means killed with signal N, otherwise it's from exit(N)
......@@ -278,20 +307,25 @@ class TdeSubProcess:
return retCode
# process still alive, let's interrupt it
print("Terminate running process, send SIG_INT and wait...")
Logging.info("Terminate running process, send SIG_{} and wait...".format(self.STOP_SIGNAL))
# sub process should end, then IPC queue should end, causing IO thread to end
# sig = signal.SIGINT
sig = signal.SIGKILL
self.subProcess.send_signal(sig) # SIGINT or SIGKILL
topSubProc = psutil.Process(self.subProcess.pid)
for child in topSubProc.children(recursive=True): # or parent.children() for recursive=False
child.send_signal(self.STOP_SIGNAL)
time.sleep(0.2) # 200 ms
# topSubProc.send_signal(sig) # now kill the main sub process (likely the Shell)
self.subProcess.send_signal(self.STOP_SIGNAL) # main sub process (likely the Shell)
self.subProcess.wait(20)
retCode = self.subProcess.returncode # should always be there
# May throw subprocess.TimeoutExpired exception above, therefore
# The process is guaranteed to have ended by now
self.subProcess = None
if retCode != 0: # != (- signal.SIGINT):
Logging.error("TSP.stop(): Failed to stop sub proc properly w/ SIG {}, retCode={}".format(sig, retCode))
Logging.error("TSP.stop(): Failed to stop sub proc properly w/ SIG {}, retCode={}".format(
self.STOP_SIGNAL, retCode))
else:
Logging.info("TSP.stop(): sub proc successfully terminated with SIG {}".format(sig))
Logging.info("TSP.stop(): sub proc successfully terminated with SIG {}".format(self.STOP_SIGNAL))
return - retCode
class ServiceManager:
......@@ -439,7 +473,7 @@ class ServiceManager:
time.sleep(self.PAUSE_BETWEEN_IPC_CHECK) # pause, before next round
# raise CrashGenError("dummy")
print("Service Manager Thread (with subprocess) ended, main thread exiting...")
Logging.info("Service Manager Thread (with subprocess) ended, main thread exiting...")
def _getFirstInstance(self):
return self._tInsts[0]
......@@ -452,7 +486,7 @@ class ServiceManager:
# Find if there's already a taosd service, and then kill it
for proc in psutil.process_iter():
if proc.name() == 'taosd':
print("Killing an existing TAOSD process in 2 seconds... press CTRL-C to interrupt")
Logging.info("Killing an existing TAOSD process in 2 seconds... press CTRL-C to interrupt")
time.sleep(2.0)
proc.kill()
# print("Process: {}".format(proc.name()))
......@@ -559,7 +593,8 @@ class ServiceManagerThread:
for i in range(0, 100):
time.sleep(1.0)
# self.procIpcBatch() # don't pump message during start up
print("_zz_", end="", flush=True)
Progress.emit(Progress.SERVICE_START_NAP)
# print("_zz_", end="", flush=True)
if self._status.isRunning():
Logging.info("[] TDengine service READY to process requests")
Logging.info("[] TAOS service started: {}".format(self))
......@@ -595,12 +630,12 @@ class ServiceManagerThread:
def stop(self):
# can be called from both main thread or signal handler
print("Terminating TDengine service running as the sub process...")
Logging.info("Terminating TDengine service running as the sub process...")
if self.getStatus().isStopped():
print("Service already stopped")
Logging.info("Service already stopped")
return
if self.getStatus().isStopping():
print("Service is already being stopped")
Logging.info("Service is already being stopped")
return
# Linux will send Control-C generated SIGINT to the TDengine process
# already, ref:
......@@ -616,10 +651,10 @@ class ServiceManagerThread:
if retCode == signal.SIGSEGV : # SGV
Logging.error("[[--ERROR--]]: TDengine service SEGV fault (check core file!)")
except subprocess.TimeoutExpired as err:
print("Time out waiting for TDengine service process to exit")
Logging.info("Time out waiting for TDengine service process to exit")
else:
if self._tdeSubProcess.isRunning(): # still running, should now never happen
print("FAILED to stop sub process, it is still running... pid = {}".format(
Logging.error("FAILED to stop sub process, it is still running... pid = {}".format(
self._tdeSubProcess.getPid()))
else:
self._tdeSubProcess = None # not running any more
......@@ -683,9 +718,9 @@ class ServiceManagerThread:
return # we are done with THIS BATCH
else: # got line, printing out
if forceOutput:
Logging.info(line)
Logging.info('[TAOSD] ' + line)
else:
Logging.debug(line)
Logging.debug('[TAOSD] ' + line)
print(">", end="", flush=True)
_ProgressBars = ["--", "//", "||", "\\\\"]
......@@ -728,11 +763,11 @@ class ServiceManagerThread:
# queue.put(line)
# meaning sub process must have died
Logging.info("\nEnd of stream detected for TDengine STDOUT: {}".format(self))
Logging.info("EOF for TDengine STDOUT: {}".format(self))
out.close()
def svcErrorReader(self, err: IO, queue):
for line in iter(err.readline, b''):
print("\nTDengine Service (taosd) ERROR (from stderr): {}".format(line))
Logging.info("\nEnd of stream detected for TDengine STDERR: {}".format(self))
Logging.info("TDengine STDERR: {}".format(line))
Logging.info("EOF for TDengine STDERR: {}".format(self))
err.close()
\ No newline at end of file
......@@ -11,7 +11,7 @@
###################################################################
import sys
from crash_gen.crash_gen import MainExec
from crash_gen.crash_gen_main import MainExec
if __name__ == "__main__":
......