diff --git a/tests/pytest/crash_gen/README.md b/tests/pytest/crash_gen/README.md
index 6788ab1a63d0a7c515558695605d1ec8ac5fb7f9..8cdddd5b57c31d563b38d3e8769ab8dc19642347 100644
--- a/tests/pytest/crash_gen/README.md
+++ b/tests/pytest/crash_gen/README.md
@@ -6,6 +6,25 @@ To effectively test and debug our TDengine product, we have developed a simple t
 exercise various functions of the system in a randomized fashion, hoping to expose 
 maximum number of problems, hopefully without a pre-determined scenario.
 
+# Features
+
+This tool can run as a test client with the following features:
+
+1. Any number of concurrent threads
+1. Any number of test steps/loops
+1. Auto-create and writing to multiple databases
+1. Ignore specific error codes
+1. Write small or large data blocks
+1. Auto-generate out-of-sequence data, if needed
+1. Verify the result of write operations
+1. Concurrent writing to a shadow database for later data verification
+1. User specified number of replicas to use, against clusters
+
+This tool can also use to start a TDengine service, either in stand-alone mode or 
+cluster mode. The features include:
+
+1. User specified number of D-Nodes to create/use.
+
 # Preparation
 
 To run this tool, please ensure the followed preparation work is done first.
@@ -16,7 +35,7 @@ To run this tool, please ensure the followed preparation work is done first.
     Ubuntu 20.04LTS as our own development environment, and suggest you also use such
     an environment if possible.
 
-# Simple Execution
+# Simple Execution as Client Test Tool
 
 To run the tool with the simplest method, follow the steps below:
 
@@ -28,19 +47,21 @@ To run the tool with the simplest method, follow the steps below:
 
 That's it!
 
-# Running Clusters
+# Running Server-side Clusters
 
 This tool also makes it easy to test/verify the clustering capabilities of TDengine. You
 can start a cluster quite easily with the following command:
 
 ```
 $ cd tests/pytest/
-$ ./crash_gen.sh -e -o 3
+$ rm -rf ../../build/cluster_dnode_?; ./crash_gen.sh -e -o 3   # first part optional
 ```
 
 The `-e` option above tells the tool to start the service, and do not run any tests, while 
 the `-o 3` option tells the tool to start 3 DNodes and join them together in a cluster. 
-Obviously you can adjust the the number here.
+Obviously you can adjust the the number here. The `rm -rf` command line is optional 
+to clean up previous cluster data, so that we can start from a clean state with no data
+at all.
 
 ## Behind the Scenes
 
@@ -89,8 +110,9 @@ The exhaustive features of the tool is available through the `-h` option:
 
 ```
 $ ./crash_gen.sh -h
-usage: crash_gen_bootstrap.py [-h] [-a] [-b MAX_DBS] [-c CONNECTOR_TYPE] [-d] [-e] [-g IGNORE_ERRORS] [-i MAX_REPLICAS] [-l] [-n] [-o NUM_DNODES] [-p] [-r]
-                              [-s MAX_STEPS] [-t NUM_THREADS] [-v] [-x]
+usage: crash_gen_bootstrap.py [-h] [-a] [-b MAX_DBS] [-c CONNECTOR_TYPE] [-d] [-e] [-g IGNORE_ERRORS] 
+    [-i NUM_REPLICAS] [-k] [-l] [-m] [-n]
+    [-o NUM_DNODES] [-p] [-r] [-s MAX_STEPS] [-t NUM_THREADS] [-v] [-w] [-x]
 
 TDengine Auto Crash Generator (PLEASE NOTICE the Prerequisites Below)
 ---------------------------------------------------------------------
@@ -109,11 +131,14 @@ optional arguments:
   -e, --run-tdengine    Run TDengine service in foreground (default: false)
   -g IGNORE_ERRORS, --ignore-errors IGNORE_ERRORS
                         Ignore error codes, comma separated, 0x supported (default: None)
-  -i MAX_REPLICAS, --max-replicas MAX_REPLICAS
-                        Maximum number of replicas to use, when testing against clusters. (default: 1)
+  -i NUM_REPLICAS, --num-replicas NUM_REPLICAS
+                        Number (fixed) of replicas to use, when testing against clusters. (default: 1)
+  -k, --track-memory-leaks
+                        Use Valgrind tool to track memory leaks (default: false)
   -l, --larger-data     Write larger amount of data during write operations (default: false)
+  -m, --mix-oos-data    Mix out-of-sequence data into the test data stream (default: true)
   -n, --dynamic-db-table-names
-                        Use non-fixed names for dbs/tables, useful for multi-instance executions (default: false)
+                        Use non-fixed names for dbs/tables, for -b, useful for multi-instance executions (default: false)
   -o NUM_DNODES, --num-dnodes NUM_DNODES
                         Number of Dnodes to initialize, used with -e option. (default: 1)
   -p, --per-thread-db-connection
@@ -124,6 +149,7 @@ optional arguments:
   -t NUM_THREADS, --num-threads NUM_THREADS
                         Number of threads to run (default: 10)
   -v, --verify-data     Verify data written in a number of places by reading back (default: false)
+  -w, --use-shadow-db   Use a shaddow database to verify data integrity (default: false)
   -x, --continue-on-exception
                         Continue execution after encountering unexpected/disallowed errors/exceptions (default: false)
 ```
diff --git a/tests/pytest/crash_gen/crash_gen_main.py b/tests/pytest/crash_gen/crash_gen_main.py
index 0fe94bad1d6b36aa1be5d47428dae19e918ce6e4..44295e8bee72bd4af398569887da69ab06234be5 100755
--- a/tests/pytest/crash_gen/crash_gen_main.py
+++ b/tests/pytest/crash_gen/crash_gen_main.py
@@ -1574,9 +1574,9 @@ class TaskCreateDb(StateTransitionTask):
     def _executeInternal(self, te: TaskExecutor, wt: WorkerThread):
         # was: self.execWtSql(wt, "create database db")
         repStr = ""
-        if gConfig.max_replicas != 1:
+        if gConfig.num_replicas != 1:
             # numReplica = Dice.throw(gConfig.max_replicas) + 1 # 1,2 ... N
-            numReplica = gConfig.max_replicas # fixed, always
+            numReplica = gConfig.num_replicas # fixed, always
             repStr = "replica {}".format(numReplica)
         updatePostfix = "update 1" if gConfig.verify_data else "" # allow update only when "verify data" is active
         dbName = self._db.getName()
@@ -2394,11 +2394,16 @@ class MainExec:
             help='Ignore error codes, comma separated, 0x supported (default: None)')
         parser.add_argument(
             '-i',
-            '--max-replicas',
+            '--num-replicas',
             action='store',
             default=1,
             type=int,
-            help='Maximum number of replicas to use, when testing against clusters. (default: 1)')
+            help='Number (fixed) of replicas to use, when testing against clusters. (default: 1)')
+        parser.add_argument(
+            '-k',
+            '--track-memory-leaks',
+            action='store_true',
+            help='Use Valgrind tool to track memory leaks (default: false)')
         parser.add_argument(
             '-l',
             '--larger-data',
diff --git a/tests/pytest/crash_gen/service_manager.py b/tests/pytest/crash_gen/service_manager.py
index ae6f8d5d3a5a4d6a8bf745a48ec2368aa5e832ad..cdbf2db4dab81ab07b51597eb0a2805cc931e34e 100644
--- a/tests/pytest/crash_gen/service_manager.py
+++ b/tests/pytest/crash_gen/service_manager.py
@@ -19,6 +19,7 @@ from queue import Queue, Empty
 
 from .misc import Logging, Status, CrashGenError, Dice, Helper, Progress
 from .db import DbConn, DbTarget
+import crash_gen.settings 
 
 class TdeInstance():
     """
@@ -132,6 +133,7 @@ keep 36500
 walLevel 1
 #
 # maxConnections 100
+quorum 2
 """
         cfgContent = cfgTemplate.format_map(cfgValues)
         f = open(cfgFile, "w")
@@ -164,7 +166,12 @@ walLevel 1
         return "127.0.0.1"
 
     def getServiceCmdLine(self): # to start the instance
-        return [self.getExecFile(), '-c', self.getCfgDir()] # used in subproce.Popen()
+        cmdLine = []
+        if crash_gen.settings.gConfig.track_memory_leaks:
+            Logging.info("Invoking VALGRIND on service...")
+            cmdLine = ['valgrind', '--leak-check=yes']
+        cmdLine += ["exec " + self.getExecFile(), '-c', self.getCfgDir()] # used in subproce.Popen()        
+        return cmdLine
     
     def _getDnodes(self, dbc):
         dbc.query("show dnodes")
@@ -202,7 +209,7 @@ walLevel 1
         self.generateCfgFile() # service side generates config file, client does not
         self.rotateLogs()
 
-        self._smThread.start(self.getServiceCmdLine())
+        self._smThread.start(self.getServiceCmdLine(), self.getLogDir()) # May raise exceptions
 
     def stop(self):
         self._smThread.stop()
@@ -225,7 +232,7 @@ class TdeSubProcess:
     # RET_SUCCESS = -4
 
     def __init__(self):
-        self.subProcess = None
+        self.subProcess = None # type: subprocess.Popen
         # if tInst is None:
         #     raise CrashGenError("Empty instance not allowed in TdeSubProcess")
         # self._tInst = tInst # Default create at ServiceManagerThread
@@ -263,7 +270,7 @@ class TdeSubProcess:
         # print("Starting TDengine with env: ", myEnv.items())
         # print("Starting TDengine via Shell: {}".format(cmdLineStr))
 
-        useShell = True    
+        useShell = True # Needed to pass environments into it
         self.subProcess = subprocess.Popen(
             # ' '.join(cmdLine) if useShell else cmdLine,
             # shell=useShell,
@@ -276,12 +283,12 @@ class TdeSubProcess:
             env=myEnv
             )  # had text=True, which interferred with reading EOF
 
-    STOP_SIGNAL = signal.SIGKILL # signal.SIGKILL/SIGINT # What signal to use (in kill) to stop a taosd process?
+    STOP_SIGNAL = signal.SIGINT # signal.SIGKILL/SIGINT # What signal to use (in kill) to stop a taosd process?
     SIG_KILL_RETCODE = 137 # ref: https://stackoverflow.com/questions/43268156/process-finished-with-exit-code-137-in-pycharm
 
     def stop(self):
         """
-        Stop a sub process, and try to return a meaningful return code.
+        Stop a sub process, DO NOT return anything, process all conditions INSIDE
 
         Common POSIX signal values (from man -7 signal):
         SIGHUP           1
@@ -301,40 +308,99 @@ class TdeSubProcess:
         """
         if not self.subProcess:
             Logging.error("Sub process already stopped")
-            return  # -1
+            return
 
         retCode = self.subProcess.poll() # ret -N means killed with signal N, otherwise it's from exit(N)
         if retCode:  # valid return code, process ended
-            retCode = -retCode # only if valid
+            # retCode = -retCode # only if valid
             Logging.warning("TSP.stop(): process ended itself")
             self.subProcess = None
-            return retCode
+            return
 
         # process still alive, let's interrupt it
-        Logging.info("Terminate running process, send SIG_{} and wait...".format(self.STOP_SIGNAL))
+        self._stopForSure(self.subProcess, self.STOP_SIGNAL) # success if no exception
+        self.subProcess = None
+
         # sub process should end, then IPC queue should end, causing IO thread to end        
-        topSubProc = psutil.Process(self.subProcess.pid)
-        for child in topSubProc.children(recursive=True):  # or parent.children() for recursive=False
-            child.send_signal(self.STOP_SIGNAL)
-            time.sleep(0.2) # 200 ms
-        # topSubProc.send_signal(sig) # now kill the main sub process (likely the Shell)
-
-        self.subProcess.send_signal(self.STOP_SIGNAL) # main sub process (likely the Shell)
-        self.subProcess.wait(20)
-        retCode = self.subProcess.returncode # should always be there
-        # May throw subprocess.TimeoutExpired exception above, therefore
-        # The process is guranteed to have ended by now
-        self.subProcess = None       
-        if retCode == self.SIG_KILL_RETCODE:
-            Logging.info("TSP.stop(): sub proc KILLED, as expected")
-        elif retCode == (- self.STOP_SIGNAL):
-            Logging.info("TSP.stop(), sub process STOPPED, as expected")
-        elif retCode != 0: # != (- signal.SIGINT):
-            Logging.error("TSP.stop(): Failed to stop sub proc properly w/ SIG {}, retCode={}".format(
-                self.STOP_SIGNAL, retCode))
-        else:
-            Logging.info("TSP.stop(): sub proc successfully terminated with SIG {}".format(self.STOP_SIGNAL))
-        return - retCode
+
+    @classmethod
+    def _stopForSure(cls, proc: subprocess.Popen, sig: int):
+        ''' 
+        Stop a process and all sub processes with a singal, and SIGKILL if necessary
+        '''
+        def doKillTdService(proc: subprocess.Popen, sig: int):
+            Logging.info("Killing sub-sub process {} with signal {}".format(proc.pid, sig))
+            proc.send_signal(sig)
+            try:            
+                retCode = proc.wait(20)
+                if (- retCode) == signal.SIGSEGV: # Crashed
+                    Logging.warning("Process {} CRASHED, please check CORE file!".format(proc.pid))
+                elif (- retCode) == sig : 
+                    Logging.info("TD service terminated with expected return code {}".format(sig))
+                else:
+                    Logging.warning("TD service terminated, EXPECTING ret code {}, got {}".format(sig, -retCode))
+                return True # terminated successfully
+            except subprocess.TimeoutExpired as err:
+                Logging.warning("Failed to kill sub-sub process {} with signal {}".format(proc.pid, sig))
+            return False # failed to terminate
+
+
+        def doKillChild(child: psutil.Process, sig: int):
+            Logging.info("Killing sub-sub process {} with signal {}".format(child.pid, sig))
+            child.send_signal(sig)
+            try:            
+                retCode = child.wait(20)
+                if (- retCode) == signal.SIGSEGV: # Crashed
+                    Logging.warning("Process {} CRASHED, please check CORE file!".format(child.pid))
+                elif (- retCode) == sig : 
+                    Logging.info("Sub-sub process terminated with expected return code {}".format(sig))
+                else:
+                    Logging.warning("Process terminated, EXPECTING ret code {}, got {}".format(sig, -retCode))
+                return True # terminated successfully
+            except psutil.TimeoutExpired as err:
+                Logging.warning("Failed to kill sub-sub process {} with signal {}".format(child.pid, sig))
+            return False # did not terminate
+
+        def doKill(proc: subprocess.Popen, sig: int):
+            pid = proc.pid
+            try:
+                topSubProc = psutil.Process(pid)
+                for child in topSubProc.children(recursive=True):  # or parent.children() for recursive=False
+                    Logging.warning("Unexpected child to be killed")
+                    doKillChild(child, sig)
+            except psutil.NoSuchProcess as err:
+                Logging.info("Process not found, can't kill, pid = {}".format(pid))
+            
+            return doKillTdService(proc, sig)
+            # TODO: re-examine if we need to kill the top process, which is always the SHELL for now
+            # try:
+            #     proc.wait(1) # SHELL process here, may throw subprocess.TimeoutExpired exception
+            #     # expRetCode = self.SIG_KILL_RETCODE if sig==signal.SIGKILL else (-sig)
+            #     # if retCode == expRetCode:
+            #     #     Logging.info("Process terminated with expected return code {}".format(retCode))
+            #     # else:
+            #     #     Logging.warning("Process terminated, EXPECTING ret code {}, got {}".format(expRetCode, retCode))
+            #     # return True # success
+            # except subprocess.TimeoutExpired as err:
+            #     Logging.warning("Failed to kill process {} with signal {}".format(pid, sig))
+            # return False # failed to kill
+
+        def softKill(proc, sig):
+            return doKill(proc, sig)
+
+        def hardKill(proc):
+            return doKill(proc, signal.SIGKILL)
+
+        
+
+        pid = proc.pid
+        Logging.info("Terminate running processes under {}, with SIG #{} and wait...".format(pid, sig))
+        if softKill(proc, sig):            
+            return# success
+        if sig != signal.SIGKILL: # really was soft above            
+            if hardKill(proc):
+                return
+        raise CrashGenError("Failed to stop process, pid={}".format(pid))
 
 class ServiceManager:
     PAUSE_BETWEEN_IPC_CHECK = 1.2  # seconds between checks on STDOUT of sub process
@@ -560,7 +626,8 @@ class ServiceManagerThread:
         # self._tInstNum = tInstNum # instance serial number in cluster, ZERO based
         # self._tInst    = tInst or TdeInstance() # Need an instance
 
-        self._thread = None # The actual thread, # type: threading.Thread
+        self._thread  = None # The actual thread, # type: threading.Thread
+        self._thread2 = None # watching stderr
         self._status = Status(Status.STATUS_STOPPED) # The status of the underlying service, actually.
 
     def __repr__(self):
@@ -568,11 +635,20 @@ class ServiceManagerThread:
             self.getStatus(), self._tdeSubProcess)
 
     def getStatus(self):
+        '''
+        Get the status of the process being managed. (misnomer alert!)
+        '''
         return self._status
 
     # Start the thread (with sub process), and wait for the sub service
     # to become fully operational
-    def start(self, cmdLine):
+    def start(self, cmdLine : str, logDir: str):
+        '''
+        Request the manager thread to start a new sub process, and manage it.
+
+        :param cmdLine: the command line to invoke
+        :param logDir: the logging directory, to hold stdout/stderr files
+        '''
         if self._thread:
             raise RuntimeError("Unexpected _thread")
         if self._tdeSubProcess:
@@ -582,20 +658,30 @@ class ServiceManagerThread:
 
         self._status.set(Status.STATUS_STARTING)
         self._tdeSubProcess = TdeSubProcess()
-        self._tdeSubProcess.start(cmdLine)
+        self._tdeSubProcess.start(cmdLine) # TODO: verify process is running
 
         self._ipcQueue = Queue()
         self._thread = threading.Thread( # First thread captures server OUTPUT
             target=self.svcOutputReader,
-            args=(self._tdeSubProcess.getStdOut(), self._ipcQueue))
+            args=(self._tdeSubProcess.getStdOut(), self._ipcQueue, logDir))
         self._thread.daemon = True  # thread dies with the program
         self._thread.start()
+        time.sleep(0.01)
+        if not self._thread.is_alive(): # What happened?
+            Logging.info("Failed to started process to monitor STDOUT")
+            self.stop()
+            raise CrashGenError("Failed to start thread to monitor STDOUT")
+        Logging.info("Successfully started process to monitor STDOUT")
 
         self._thread2 = threading.Thread( # 2nd thread captures server ERRORs
             target=self.svcErrorReader,
-            args=(self._tdeSubProcess.getStdErr(), self._ipcQueue))
+            args=(self._tdeSubProcess.getStdErr(), self._ipcQueue, logDir))
         self._thread2.daemon = True  # thread dies with the program
         self._thread2.start()
+        time.sleep(0.01)
+        if not self._thread2.is_alive():
+            self.stop()
+            raise CrashGenError("Failed to start thread to monitor STDERR")
 
         # wait for service to start
         for i in range(0, 100):
@@ -643,7 +729,7 @@ class ServiceManagerThread:
             Logging.info("Service already stopped")
             return
         if self.getStatus().isStopping():
-            Logging.info("Service is already being stopped")
+            Logging.info("Service is already being stopped, pid: {}".format(self._tdeSubProcess.getPid()))
             return
         # Linux will send Control-C generated SIGINT to the TDengine process
         # already, ref:
@@ -653,14 +739,14 @@ class ServiceManagerThread:
 
         self._status.set(Status.STATUS_STOPPING)
         # retCode = self._tdeSubProcess.stop()
-        try:
-            retCode = self._tdeSubProcess.stop()
-            # print("Attempted to stop sub process, got return code: {}".format(retCode))
-            if retCode == signal.SIGSEGV : # SGV
-                Logging.error("[[--ERROR--]]: TDengine service SEGV fault (check core file!)")
-        except subprocess.TimeoutExpired as err:
-            Logging.info("Time out waiting for TDengine service process to exit")
-        else:    
+        # try:
+        #     retCode = self._tdeSubProcess.stop()
+        #     # print("Attempted to stop sub process, got return code: {}".format(retCode))
+        #     if retCode == signal.SIGSEGV : # SGV
+        #         Logging.error("[[--ERROR--]]: TDengine service SEGV fault (check core file!)")
+        # except subprocess.TimeoutExpired as err:
+        #     Logging.info("Time out waiting for TDengine service process to exit")
+        if not self._tdeSubProcess.stop(): # everything withing
             if self._tdeSubProcess.isRunning():  # still running, should now never happen
                 Logging.error("FAILED to stop sub process, it is still running... pid = {}".format(
                     self._tdeSubProcess.getPid()))
@@ -683,16 +769,18 @@ class ServiceManagerThread:
             raise RuntimeError(
                 "SMT.Join(): Unexpected status: {}".format(self._status))
 
-        if self._thread:
-            self._thread.join()
-            self._thread = None
-            self._status.set(Status.STATUS_STOPPED)
-            # STD ERR thread
-            self._thread2.join()
-            self._thread2 = None
+        if self._thread or self._thread2 :
+            if self._thread:
+                self._thread.join()
+                self._thread = None
+            if self._thread2: # STD ERR thread            
+                self._thread2.join()
+                self._thread2 = None
         else:
             print("Joining empty thread, doing nothing")
 
+        self._status.set(Status.STATUS_STOPPED)
+
     def _trimQueue(self, targetSize):
         if targetSize <= 0:
             return  # do nothing
@@ -739,11 +827,22 @@ class ServiceManagerThread:
         print(pBar, end="", flush=True)
         print('\b\b\b\b', end="", flush=True)
 
-    def svcOutputReader(self, out: IO, queue):
+    def svcOutputReader(self, out: IO, queue, logDir: str):
+        '''
+        The infinite routine that processes the STDOUT stream for the sub process being managed.
+
+        :param out: the IO stream object used to fetch the data from
+        :param queue: the queue where we dump the roughly parsed line-by-line data
+        :param logDir: where we should dump a verbatim output file
+        '''
+        os.makedirs(logDir, exist_ok=True)
+        logFile = os.path.join(logDir,'stdout.log')
+        fOut = open(logFile, 'wb')
         # Important Reference: https://stackoverflow.com/questions/375427/non-blocking-read-on-a-subprocess-pipe-in-python
         # print("This is the svcOutput Reader...")
         # for line in out :
         for line in iter(out.readline, b''):
+            fOut.write(line)
             # print("Finished reading a line: {}".format(line))
             # print("Adding item to queue...")
             try:
@@ -772,10 +871,16 @@ class ServiceManagerThread:
             # queue.put(line)
         # meaning sub process must have died
         Logging.info("EOF for TDengine STDOUT: {}".format(self))
-        out.close()
+        out.close() # Close the stream
+        fOut.close() # Close the output file
 
-    def svcErrorReader(self, err: IO, queue):
+    def svcErrorReader(self, err: IO, queue, logDir: str):
+        os.makedirs(logDir, exist_ok=True)
+        logFile = os.path.join(logDir,'stderr.log')
+        fErr = open(logFile, 'wb')
         for line in iter(err.readline, b''):
+            fErr.write(line)
             Logging.info("TDengine STDERR: {}".format(line))
         Logging.info("EOF for TDengine STDERR: {}".format(self))
-        err.close()
\ No newline at end of file
+        err.close()
+        fErr.close()
\ No newline at end of file