Commit 370fadb0 authored by: H Haojun Liao

Merge branch 'develop' into feature/query

from .cinterface import CTaosInterface
from .error import *
from .constants import FieldType
import threading
# querySeqNum = 0
......@@ -37,6 +38,7 @@ class TDengineCursor(object):
self._block_iter = 0
self._affected_rows = 0
self._logfile = ""
self._threadId = threading.get_ident()
if connection is not None:
self._connection = connection
......@@ -103,6 +105,12 @@ class TDengineCursor(object):
def execute(self, operation, params=None):
"""Prepare and execute a database operation (query or command).
"""
# if threading.get_ident() != self._threadId:
# info ="Cursor execute:Thread ID not match,creater:"+str(self._threadId)+" caller:"+str(threading.get_ident())
# raise OperationalError(info)
# print(info)
# return None
if not operation:
return None
......@@ -188,6 +196,11 @@ class TDengineCursor(object):
def fetchall(self):
"""Fetch all (remaining) rows of a query result, returning them as a sequence of sequences (e.g. a list of tuples). Note that the cursor's arraysize attribute can affect the performance of this operation.
"""
# if threading.get_ident() != self._threadId:
# info ="[WARNING] Cursor fetchall:Thread ID not match,creater:"+str(self._threadId)+" caller:"+str(threading.get_ident())
# raise OperationalError(info)
# print(info)
# return None
if self._result is None or self._fields is None:
raise OperationalError("Invalid use of fetchall")
......@@ -232,6 +245,12 @@ class TDengineCursor(object):
def _handle_result(self):
"""Handle the return result from query.
"""
# if threading.get_ident() != self._threadId:
# info = "Cursor handleresult:Thread ID not match,creater:"+str(self._threadId)+" caller:"+str(threading.get_ident())
# raise OperationalError(info)
# print(info)
# return None
self._description = []
for ele in self._fields:
self._description.append(
......
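
The connector change above records the creating thread's identifier on the cursor; the guards that would reject cross-thread use are kept, but commented out. A minimal sketch of that guard pattern (class and method names are illustrative, not the connector's):

    import threading

    class ThreadAffineCursor:
        """Sketch only: remembers which thread created the cursor."""
        def __init__(self):
            self._threadId = threading.get_ident()

        def _assert_same_thread(self, op):
            caller = threading.get_ident()
            if caller != self._threadId:
                # The commit keeps this check disabled; enabling it would surface
                # accidental cross-thread cursor sharing early.
                raise RuntimeError("%s: cursor created in thread %d, used from thread %d"
                                   % (op, self._threadId, caller))

        def execute(self, operation):
            self._assert_same_thread("execute")
            # ... actual statement execution would go here ...
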
......@@ -131,8 +131,8 @@ static void dnodeFreeMnodeWriteMsg(SMnodeMsg *pWrite) {
taosFreeQitem(pWrite);
}
void dnodeSendRpcMnodeWriteRsp(void *pRaw, int32_t code) {
SMnodeMsg *pWrite = pRaw;
void dnodeSendRpcMnodeWriteRsp(void *pMsg, int32_t code) {
SMnodeMsg *pWrite = pMsg;
if (pWrite == NULL) return;
if (code == TSDB_CODE_MND_ACTION_IN_PROGRESS) return;
if (code == TSDB_CODE_MND_ACTION_NEED_REPROCESSED) {
......
......@@ -206,9 +206,10 @@ static void shellSourceFile(TAOS *con, char *fptr) {
if (code != 0) {
fprintf(stderr, "DB error: %s: %s (%d)\n", taos_errstr(con), fname, lineNo);
/* free local resource: allocated memory/metric-meta refcnt */
taos_free_result(pSql);
}
/* free local resource: allocated memory/metric-meta refcnt */
taos_free_result(pSql);
memset(cmd, 0, MAX_COMMAND_SIZE);
cmd_len = 0;
......
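
The shell fix above frees the result handle on both paths instead of only inside the error branch, closing a leak on successful statements. The same always-release shape with the Python connector's DB-API cursor is a try/finally (function name is illustrative, for statements that return a result set):

    def run_query(cursor, sql):
        try:
            cursor.execute(sql)        # may raise on a bad statement
            return cursor.fetchall()   # consume the result on success
        finally:
            cursor.close()             # released on success and on error,
                                       # like the unconditional taos_free_result(pSql)
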
......@@ -520,9 +520,8 @@ int main(int argc, char *argv[]) {
snprintf(command, BUFFER_SIZE, "create table if not exists %s.meters (ts timestamp%s tags (areaid int, loc binary(10))", db_name, cols);
queryDB(taos, command);
printf("meters created!\n");
taos_close(taos);
}
taos_close(taos);
/* Wait for table to create */
multiThreadCreateTable(cols, use_metric, threads, ntables, db_name, tb_prefix, ip_addr, port, user, pass);
......@@ -792,9 +791,6 @@ void * createTable(void *sarg)
snprintf(command, BUFFER_SIZE, "create table if not exists %s.%s%d (ts timestamp%s;", winfo->db_name, winfo->tb_prefix, i, winfo->cols);
queryDB(winfo->taos, command);
}
taos_close(winfo->taos);
} else {
/* Create all the tables; */
printf("Creating table from %d to %d\n", winfo->start_table_id, winfo->end_table_id);
......@@ -812,7 +808,6 @@ void * createTable(void *sarg)
}
queryDB(winfo->taos, command);
}
taos_close(winfo->taos);
}
return NULL;
......
......@@ -53,6 +53,7 @@ typedef struct {
void * rowData;
int32_t rowSize;
int32_t retCode; // for callback in sdb queue
int32_t processedCount; // for sync fwd callback
int32_t (*cb)(struct SMnodeMsg *pMsg, int32_t code);
struct SMnodeMsg *pMsg;
} SSdbOper;
......
......@@ -88,13 +88,13 @@ static int32_t mnodeDnodeActionDelete(SSdbOper *pOper) {
}
static int32_t mnodeDnodeActionUpdate(SSdbOper *pOper) {
SDnodeObj *pDnode = pOper->pObj;
SDnodeObj *pSaved = mnodeGetDnode(pDnode->dnodeId);
if (pSaved != NULL && pDnode != pSaved) {
memcpy(pSaved, pDnode, pOper->rowSize);
free(pDnode);
mnodeDecDnodeRef(pSaved);
SDnodeObj *pNew = pOper->pObj;
SDnodeObj *pDnode = mnodeGetDnode(pNew->dnodeId);
if (pDnode != NULL && pNew != pDnode) {
memcpy(pDnode, pNew, pOper->rowSize);
free(pNew);
}
mnodeDecDnodeRef(pDnode);
return TSDB_CODE_SUCCESS;
}
......
......@@ -72,8 +72,6 @@ typedef struct {
void * sync;
void * wal;
SSyncCfg cfg;
sem_t sem;
int32_t code;
int32_t numOfTables;
SSdbTable *tableList[SDB_TABLE_MAX];
pthread_mutex_t mutex;
......@@ -244,27 +242,36 @@ static void sdbNotifyRole(void *ahandle, int8_t role) {
sdbUpdateMnodeRoles();
}
FORCE_INLINE
static void sdbConfirmForward(void *ahandle, void *param, int32_t code) {
tsSdbObj.code = code;
sem_post(&tsSdbObj.sem);
sdbDebug("forward request confirmed, version:%" PRIu64 ", result:%s", (int64_t)param, tstrerror(code));
}
assert(param);
SSdbOper * pOper = param;
SMnodeMsg *pMsg = pOper->pMsg;
if (code <= 0) pOper->retCode = code;
int32_t processedCount = atomic_add_fetch_32(&pOper->processedCount, 1);
if (processedCount <= 1) {
if (pMsg != NULL) {
sdbDebug("app:%p:%p, waiting for confirm this operation, count:%d", pMsg->rpcMsg.ahandle, pMsg, processedCount);
}
return;
}
static int32_t sdbForwardToPeer(SWalHead *pHead) {
if (tsSdbObj.sync == NULL) return TSDB_CODE_SUCCESS;
if (pMsg != NULL) {
sdbDebug("app:%p:%p, is confirmed and will do callback func", pMsg->rpcMsg.ahandle, pMsg);
}
int32_t code = syncForwardToPeer(tsSdbObj.sync, pHead, (void*)pHead->version, TAOS_QTYPE_RPC);
if (code > 0) {
sdbDebug("forward request is sent, version:%" PRIu64 ", code:%d", pHead->version, code);
sem_wait(&tsSdbObj.sem);
return tsSdbObj.code;
}
return code;
if (pOper->cb != NULL) {
pOper->retCode = (*pOper->cb)(pMsg, pOper->retCode);
}
dnodeSendRpcMnodeWriteRsp(pMsg, pOper->retCode);
taosFreeQitem(pOper);
}
void sdbUpdateSync() {
SSyncCfg syncCfg = {0};
int32_t index = 0;
int32_t index = 0;
SDMMnodeInfos *mnodes = dnodeGetMnodeInfos();
for (int32_t i = 0; i < mnodes->nodeNum; ++i) {
......@@ -298,7 +305,7 @@ void sdbUpdateSync() {
}
syncCfg.replica = index;
syncCfg.quorum = (syncCfg.replica == 1) ? 1:2;
syncCfg.quorum = (syncCfg.replica == 1) ? 1 : 2;
bool hasThisDnode = false;
for (int32_t i = 0; i < syncCfg.replica; ++i) {
......@@ -325,10 +332,10 @@ void sdbUpdateSync() {
syncInfo.getWalInfo = sdbGetWalInfo;
syncInfo.getFileInfo = sdbGetFileInfo;
syncInfo.writeToCache = sdbWriteToQueue;
syncInfo.confirmForward = sdbConfirmForward;
syncInfo.confirmForward = sdbConfirmForward;
syncInfo.notifyRole = sdbNotifyRole;
tsSdbObj.cfg = syncCfg;
if (tsSdbObj.sync) {
syncReconfig(tsSdbObj.sync, &syncCfg);
} else {
......@@ -339,7 +346,6 @@ void sdbUpdateSync() {
int32_t sdbInit() {
pthread_mutex_init(&tsSdbObj.mutex, NULL);
sem_init(&tsSdbObj.sem, 0, 0);
if (sdbInitWriteWorker() != 0) {
return -1;
......@@ -379,7 +385,6 @@ void sdbCleanUp() {
tsSdbObj.wal = NULL;
}
sem_destroy(&tsSdbObj.sem);
pthread_mutex_destroy(&tsSdbObj.mutex);
}
......@@ -513,24 +518,22 @@ static int sdbWrite(void *param, void *data, int type) {
assert(pTable != NULL);
pthread_mutex_lock(&tsSdbObj.mutex);
if (pHead->version == 0) {
// assign version
// assign version
tsSdbObj.version++;
pHead->version = tsSdbObj.version;
} else {
// for data from WAL or forward, version may be smaller
if (pHead->version <= tsSdbObj.version) {
pthread_mutex_unlock(&tsSdbObj.mutex);
if (type == TAOS_QTYPE_FWD && tsSdbObj.sync != NULL) {
sdbDebug("forward request is received, version:%" PRIu64 " confirm it", pHead->version);
syncConfirmForward(tsSdbObj.sync, pHead->version, TSDB_CODE_SUCCESS);
}
sdbDebug("table:%s, failed to restore %s record:%s from source(%d), version:%" PRId64 " too large, sdb version:%" PRId64,
pTable->tableName, sdbGetActionStr(action), sdbGetKeyStr(pTable, pHead->cont), type, pHead->version, tsSdbObj.version);
return TSDB_CODE_SUCCESS;
} else if (pHead->version != tsSdbObj.version + 1) {
pthread_mutex_unlock(&tsSdbObj.mutex);
sdbError("table:%s, failed to restore %s record:%s from wal, version:%" PRId64 " too large, sdb version:%" PRId64,
pTable->tableName, sdbGetActionStr(action), sdbGetKeyStr(pTable, pHead->cont), pHead->version,
tsSdbObj.version);
sdbError("table:%s, failed to restore %s record:%s from source(%d), version:%" PRId64 " too large, sdb version:%" PRId64,
pTable->tableName, sdbGetActionStr(action), sdbGetKeyStr(pTable, pHead->cont), type, pHead->version, tsSdbObj.version);
return TSDB_CODE_MND_APP_ERROR;
} else {
tsSdbObj.version = pHead->version;
......@@ -542,28 +545,36 @@ static int sdbWrite(void *param, void *data, int type) {
pthread_mutex_unlock(&tsSdbObj.mutex);
return code;
}
code = sdbForwardToPeer(pHead);
pthread_mutex_unlock(&tsSdbObj.mutex);
// from app, oper is created
if (pOper != NULL) {
sdbTrace("record from app is disposed, table:%s action:%s record:%s version:%" PRIu64 " result:%s",
pTable->tableName, sdbGetActionStr(action), sdbGetKeyStr(pTable, pHead->cont), pHead->version,
tstrerror(code));
return code;
// forward to peers
pOper->processedCount = 0;
int32_t syncCode = syncForwardToPeer(tsSdbObj.sync, pHead, pOper, TAOS_QTYPE_RPC);
if (syncCode <= 0) pOper->processedCount = 1;
if (syncCode < 0) {
sdbError("table:%s, failed to forward request, result:%s action:%s record:%s version:%" PRId64, pTable->tableName,
tstrerror(syncCode), sdbGetActionStr(action), sdbGetKeyStr(pTable, pHead->cont), pHead->version);
} else if (syncCode > 0) {
sdbDebug("table:%s, forward request is sent, action:%s record:%s version:%" PRId64, pTable->tableName,
sdbGetActionStr(action), sdbGetKeyStr(pTable, pHead->cont), pHead->version);
} else {
sdbTrace("table:%s, no need to send fwd request, action:%s record:%s version:%" PRId64, pTable->tableName,
sdbGetActionStr(action), sdbGetKeyStr(pTable, pHead->cont), pHead->version);
}
return syncCode;
}
// from wal or forward msg, oper not created, should add into hash
if (tsSdbObj.sync != NULL) {
sdbTrace("record from wal forward is disposed, table:%s action:%s record:%s version:%" PRIu64 " confirm it",
pTable->tableName, sdbGetActionStr(action), sdbGetKeyStr(pTable, pHead->cont), pHead->version);
syncConfirmForward(tsSdbObj.sync, pHead->version, code);
} else {
sdbTrace("record from wal restore is disposed, table:%s action:%s record:%s version:%" PRIu64, pTable->tableName,
sdbGetActionStr(action), sdbGetKeyStr(pTable, pHead->cont), pHead->version);
}
sdbDebug("table:%s, record from wal/fwd is disposed, action:%s record:%s version:%" PRId64, pTable->tableName,
sdbGetActionStr(action), sdbGetKeyStr(pTable, pHead->cont), pHead->version);
// even it is WAL/FWD, it shall be called to update version in sync
syncForwardToPeer(tsSdbObj.sync, pHead, pOper, TAOS_QTYPE_RPC);
// from wal or forward msg, oper not created, should add into hash
if (action == SDB_ACTION_INSERT) {
SSdbOper oper = {.rowSize = pHead->len, .rowData = pHead->cont, .table = pTable};
code = (*pTable->decodeFp)(&oper);
......@@ -627,7 +638,7 @@ int32_t sdbInsertRow(SSdbOper *pOper) {
memcpy(pNewOper, pOper, sizeof(SSdbOper));
if (pNewOper->pMsg != NULL) {
sdbDebug("app:%p:%p, table:%s record:%p:%s, insert action is add to sdb queue, ", pNewOper->pMsg->rpcMsg.ahandle,
sdbDebug("app:%p:%p, table:%s record:%p:%s, insert action is add to sdb queue", pNewOper->pMsg->rpcMsg.ahandle,
pNewOper->pMsg, pTable->tableName, pOper->pObj, sdbGetKeyStrFromObj(pTable, pOper->pObj));
}
......@@ -677,7 +688,7 @@ int32_t sdbDeleteRow(SSdbOper *pOper) {
memcpy(pNewOper, pOper, sizeof(SSdbOper));
if (pNewOper->pMsg != NULL) {
sdbDebug("app:%p:%p, table:%s record:%p:%s, delete action is add to sdb queue, ", pNewOper->pMsg->rpcMsg.ahandle,
sdbDebug("app:%p:%p, table:%s record:%p:%s, delete action is add to sdb queue", pNewOper->pMsg->rpcMsg.ahandle,
pNewOper->pMsg, pTable->tableName, pOper->pObj, sdbGetKeyStrFromObj(pTable, pOper->pObj));
}
......@@ -727,7 +738,7 @@ int32_t sdbUpdateRow(SSdbOper *pOper) {
memcpy(pNewOper, pOper, sizeof(SSdbOper));
if (pNewOper->pMsg != NULL) {
sdbDebug("app:%p:%p, table:%s record:%p:%s, update action is add to sdb queue, ", pNewOper->pMsg->rpcMsg.ahandle,
sdbDebug("app:%p:%p, table:%s record:%p:%s, update action is add to sdb queue", pNewOper->pMsg->rpcMsg.ahandle,
pNewOper->pMsg, pTable->tableName, pOper->pObj, sdbGetKeyStrFromObj(pTable, pOper->pObj));
}
......@@ -943,20 +954,20 @@ static void *sdbWorkerFp(void *param) {
taosGetQitem(tsSdbWriteQall, &type, &item);
if (type == TAOS_QTYPE_RPC) {
pOper = (SSdbOper *)item;
pOper->processedCount = 1;
pHead = (void *)pOper + sizeof(SSdbOper) + SDB_SYNC_HACK;
if (pOper->pMsg != NULL) {
sdbDebug("app:%p:%p, table:%s record:%p:%s version:%" PRIu64 ", will be processed in sdb queue",
pOper->pMsg->rpcMsg.ahandle, pOper->pMsg, ((SSdbTable *)pOper->table)->tableName, pOper->pObj,
sdbGetKeyStr(pOper->table, pHead->cont), pHead->version);
}
} else {
pHead = (SWalHead *)item;
pOper = NULL;
}
if (pOper != NULL && pOper->pMsg != NULL) {
sdbDebug("app:%p:%p, table:%s record:%p:%s version:%" PRIu64 ", will be processed in sdb queue",
pOper->pMsg->rpcMsg.ahandle, pOper->pMsg, ((SSdbTable *)pOper->table)->tableName, pOper->pObj,
sdbGetKeyStr(pOper->table, pHead->cont), pHead->version);
}
int32_t code = sdbWrite(pOper, pHead, type);
if (pOper) pOper->retCode = code;
if (pOper && code <= 0) pOper->retCode = code;
}
walFsync(tsSdbObj.wal);
......@@ -965,25 +976,17 @@ static void *sdbWorkerFp(void *param) {
taosResetQitems(tsSdbWriteQall);
for (int32_t i = 0; i < numOfMsgs; ++i) {
taosGetQitem(tsSdbWriteQall, &type, &item);
if (type == TAOS_QTYPE_RPC) {
pOper = (SSdbOper *)item;
if (pOper != NULL && pOper->cb != NULL) {
sdbTrace("app:%p:%p, will do callback func, index:%d", pOper->pMsg->rpcMsg.ahandle, pOper->pMsg, i);
pOper->retCode = (*pOper->cb)(pOper->pMsg, pOper->retCode);
}
if (pOper != NULL && pOper->pMsg != NULL) {
sdbTrace("app:%p:%p, msg is processed, result:%s", pOper->pMsg->rpcMsg.ahandle, pOper->pMsg,
tstrerror(pOper->retCode));
}
if (pOper != NULL) {
sdbDecRef(pOper->table, pOper->pObj);
}
dnodeSendRpcMnodeWriteRsp(pOper->pMsg, pOper->retCode);
sdbDecRef(pOper->table, pOper->pObj);
sdbConfirmForward(NULL, pOper, pOper->retCode);
} else if (type == TAOS_QTYPE_FWD) {
syncConfirmForward(tsSdbObj.sync, pHead->version, TSDB_CODE_SUCCESS);
taosFreeQitem(item);
} else {
taosFreeQitem(item);
}
taosFreeQitem(item);
}
}
......
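
The sdb changes above replace the old blocking scheme (sdbForwardToPeer posting on a semaphore and waiting for the peer) with an asynchronous one: each forwarded write carries a processedCount in SSdbOper, both the local write path and the peer confirmation call sdbConfirmForward, and only the second confirmation runs the callback and sends the mnode write response. A rough Python sketch of that second-confirmation-completes idea (names are illustrative; the C code uses atomic_add_fetch_32 instead of a lock):

    import threading

    class PendingWrite:
        """Sketch: completes only after two confirmations (local apply + peer ack),
        mirroring the role of SSdbOper.processedCount."""
        def __init__(self, on_complete):
            self._count = 0
            self._code = 0
            self._lock = threading.Lock()      # stands in for atomic_add_fetch_32
            self._on_complete = on_complete

        def confirm(self, code):
            with self._lock:
                if code < 0:
                    self._code = code          # keep the first error seen
                self._count += 1
                is_second = (self._count >= 2)
            if is_second:
                self._on_complete(self._code)  # callback + RPC response happen here

    # Both the local worker and the peer-ack path call confirm();
    # whichever arrives second triggers on_complete.
    pw = PendingWrite(lambda code: print("reply to client, code =", code))
    pw.confirm(0)   # local write applied, still waiting for the peer
    pw.confirm(0)   # peer confirmed -> on_complete fires exactly once
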
......@@ -783,9 +783,15 @@ static int32_t mnodeProcessTableMetaMsg(SMnodeMsg *pMsg) {
static int32_t mnodeCreateSuperTableCb(SMnodeMsg *pMsg, int32_t code) {
SSuperTableObj *pTable = (SSuperTableObj *)pMsg->pTable;
if (pTable != NULL) {
mLInfo("app:%p:%p, stable:%s, is created in sdb, result:%s", pMsg->rpcMsg.ahandle, pMsg, pTable->info.tableId,
tstrerror(code));
assert(pTable);
if (code == TSDB_CODE_SUCCESS) {
mLInfo("stable:%s, is created in sdb", pTable->info.tableId);
} else {
mError("app:%p:%p, stable:%s, failed to create in sdb, reason:%s", pMsg->rpcMsg.ahandle, pMsg, pTable->info.tableId,
tstrerror(code));
SSdbOper desc = {.type = SDB_OPER_GLOBAL, .pObj = pTable, .table = tsSuperTableSdb};
sdbDeleteRow(&desc);
}
return code;
......@@ -1561,10 +1567,16 @@ static int32_t mnodeDoCreateChildTableCb(SMnodeMsg *pMsg, int32_t code) {
SChildTableObj *pTable = (SChildTableObj *)pMsg->pTable;
assert(pTable);
mDebug("app:%p:%p, table:%s, create table in id:%d, uid:%" PRIu64 ", result:%s", pMsg->rpcMsg.ahandle, pMsg,
pTable->info.tableId, pTable->sid, pTable->uid, tstrerror(code));
if (code != TSDB_CODE_SUCCESS) return code;
if (code == TSDB_CODE_SUCCESS) {
mDebug("app:%p:%p, table:%s, create table in sid:%d, uid:%" PRIu64, pMsg->rpcMsg.ahandle, pMsg, pTable->info.tableId,
pTable->sid, pTable->uid);
} else {
mError("app:%p:%p, table:%s, failed to create table sid:%d, uid:%" PRIu64 ", reason:%s", pMsg->rpcMsg.ahandle, pMsg,
pTable->info.tableId, pTable->sid, pTable->uid, tstrerror(code));
SSdbOper desc = {.type = SDB_OPER_GLOBAL, .pObj = pTable, .table = tsChildTableSdb};
sdbDeleteRow(&desc);
return code;
}
SCMCreateTableMsg *pCreate = pMsg->rpcMsg.pCont;
SMDCreateTableMsg *pMDCreate = mnodeBuildCreateChildTableMsg(pCreate, pTable);
......
......@@ -348,17 +348,23 @@ void *mnodeGetNextVgroup(void *pIter, SVgObj **pVgroup) {
}
static int32_t mnodeCreateVgroupCb(SMnodeMsg *pMsg, int32_t code) {
SVgObj *pVgroup = pMsg->pVgroup;
SDbObj *pDb = pMsg->pDb;
assert(pVgroup);
if (code != TSDB_CODE_SUCCESS) {
pMsg->pVgroup = NULL;
mError("app:%p:%p, vgId:%d, failed to create in sdb, reason:%s", pMsg->rpcMsg.ahandle, pMsg, pVgroup->vgId,
tstrerror(code));
SSdbOper desc = {.type = SDB_OPER_GLOBAL, .pObj = pVgroup, .table = tsVgroupSdb};
sdbDeleteRow(&desc);
return code;
}
SVgObj *pVgroup = pMsg->pVgroup;
SDbObj *pDb = pMsg->pDb;
mInfo("vgId:%d, is created in mnode, db:%s replica:%d", pVgroup->vgId, pDb->name, pVgroup->numOfVnodes);
mInfo("app:%p:%p, vgId:%d, is created in mnode, db:%s replica:%d", pMsg->rpcMsg.ahandle, pMsg, pVgroup->vgId,
pDb->name, pVgroup->numOfVnodes);
for (int32_t i = 0; i < pVgroup->numOfVnodes; ++i) {
mInfo("vgId:%d, index:%d, dnode:%d", pVgroup->vgId, i, pVgroup->vnodeGid[i].dnodeId);
mInfo("app:%p:%p, vgId:%d, index:%d, dnode:%d", pMsg->rpcMsg.ahandle, pMsg, pVgroup->vgId, i,
pVgroup->vnodeGid[i].dnodeId);
}
mnodeIncVgroupRef(pVgroup);
......
......@@ -156,6 +156,7 @@ int main(int argc, char *argv[]) {
}
tInfo("client is initialized");
tInfo("threads:%d msgSize:%d requests:%d", appThreads, msgSize, numOfReqs);
gettimeofday(&systemTime, NULL);
startTime = systemTime.tv_sec*1000000 + systemTime.tv_usec;
......
......@@ -24,23 +24,21 @@ int msgSize = 128;
int commit = 0;
int dataFd = -1;
void *qhandle = NULL;
void *qset = NULL;
void processShellMsg() {
static int num = 0;
taos_qall qall;
SRpcMsg *pRpcMsg, rpcMsg;
int type;
void *pvnode;
qall = taosAllocateQall();
while (1) {
int numOfMsgs = taosReadAllQitems(qhandle, qall);
if (numOfMsgs <= 0) {
usleep(100);
continue;
}
int numOfMsgs = taosReadAllQitemsFromQset(qset, qall, &pvnode);
tDebug("%d shell msgs are received", numOfMsgs);
if (numOfMsgs <= 0) break;
for (int i=0; i<numOfMsgs; ++i) {
taosGetQitem(qall, &type, (void **)&pRpcMsg);
......@@ -82,15 +80,6 @@ void processShellMsg() {
}
taosFreeQall(qall);
/*
SRpcIpSet ipSet;
ipSet.numOfIps = 1;
ipSet.index = 0;
ipSet.port = 7000;
ipSet.ip[0] = inet_addr("192.168.0.2");
rpcSendRedirectRsp(ahandle, &ipSet);
*/
}
......@@ -189,6 +178,8 @@ int main(int argc, char *argv[]) {
}
qhandle = taosOpenQueue(sizeof(SRpcMsg));
qset = taosOpenQset();
taosAddIntoQset(qset, qhandle, NULL);
processShellMsg();
......
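
The test above is switched from busy-polling a single queue (taosReadAllQitems plus usleep) to blocking on a queue set that multiplexes several queues (taosOpenQset, taosAddIntoQset, taosReadAllQitemsFromQset). A rough Python analogue of that consumer, using one shared blocking queue to stand in for the qset (simplified; the real qset also returns the owning queue's ahandle, approximated here by a name tag):

    import queue
    import threading

    qset = queue.Queue()                     # stands in for taosOpenQset()

    def add_into_qset(name):
        """Return a writer for a logical queue that feeds the shared set."""
        def put(item):
            qset.put((name, item))           # like writing into a queue that belongs to the set
        return put

    def consumer():
        while True:
            name, msg = qset.get()           # blocks, no usleep() polling loop
            if msg is None:                  # sentinel: shut down
                break
            print("got message from", name, ":", msg)

    shell_queue = add_into_qset("shell")     # like taosAddIntoQset(qset, qhandle, NULL)
    t = threading.Thread(target=consumer)
    t.start()
    shell_queue("hello")
    qset.put(("shell", None))
    t.join()
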
#!/usr/bin/python3.7
#-----!/usr/bin/python3.7
###################################################################
# Copyright (c) 2016 by TAOS Technologies, Inc.
# All rights reserved.
......@@ -15,6 +15,8 @@ from __future__ import annotations # For type hinting before definition, ref: h
import sys
import os
import io
import signal
import traceback
# Require Python 3
if sys.version_info[0] < 3:
......@@ -23,6 +25,7 @@ if sys.version_info[0] < 3:
import getopt
import argparse
import copy
import requests
import threading
import random
......@@ -30,10 +33,14 @@ import time
import logging
import datetime
import textwrap
import requests
from requests.auth import HTTPBasicAuth
from typing import List
from typing import Dict
from typing import Set
from typing import IO
from queue import Queue, Empty
from util.log import *
from util.dnodes import *
......@@ -76,7 +83,10 @@ class WorkerThread:
# Let us have a DB connection of our own
if ( gConfig.per_thread_db_connection ): # type: ignore
self._dbConn = DbConn()
# print("connector_type = {}".format(gConfig.connector_type))
self._dbConn = DbConn.createNative() if (gConfig.connector_type == 'native') else DbConn.createRest()
self._dbInUse = False # if "use db" was executed already
def logDebug(self, msg):
logger.debug(" TRD[{}] {}".format(self._tid, msg))
......@@ -84,7 +94,14 @@ class WorkerThread:
def logInfo(self, msg):
logger.info(" TRD[{}] {}".format(self._tid, msg))
def dbInUse(self):
return self._dbInUse
def useDb(self):
if ( not self._dbInUse ):
self.execSql("use db")
self._dbInUse = True
def getTaskExecutor(self):
return self._tc.getTaskExecutor()
......@@ -97,6 +114,7 @@ class WorkerThread:
logger.info("Starting to run thread: {}".format(self._tid))
if ( gConfig.per_thread_db_connection ): # type: ignore
logger.debug("Worker thread openning database connection")
self._dbConn.open()
self._doTaskLoop()
......@@ -118,12 +136,17 @@ class WorkerThread:
logger.debug("[TRD] Thread Coordinator not running any more, worker thread now stopping...")
break
# Fetch a task from the Thread Coordinator
logger.debug("[TRD] Worker thread [{}] about to fetch task".format(self._tid))
task = tc.fetchTask()
# Execute such a task
logger.debug("[TRD] Worker thread [{}] about to execute task: {}".format(self._tid, task.__class__.__name__))
task.execute(self)
tc.saveExecutedTask(task)
logger.debug("[TRD] Worker thread [{}] finished executing task".format(self._tid))
self._dbInUse = False # there may be changes between steps
def verifyThreadSelf(self): # ensure we are called by this own thread
if ( threading.get_ident() != self._thread.ident ):
......@@ -163,6 +186,18 @@ class WorkerThread:
else:
return self._tc.getDbManager().getDbConn().execute(sql)
def querySql(self, sql): # TODO: expose DbConn directly
if ( gConfig.per_thread_db_connection ):
return self._dbConn.query(sql)
else:
return self._tc.getDbManager().getDbConn().query(sql)
def getQueryResult(self):
if ( gConfig.per_thread_db_connection ):
return self._dbConn.getQueryResult()
else:
return self._tc.getDbManager().getDbConn().getQueryResult()
def getDbConn(self):
if ( gConfig.per_thread_db_connection ):
return self._dbConn
......@@ -175,8 +210,9 @@ class WorkerThread:
# else:
# return self._tc.getDbState().getDbConn().query(sql)
# The coordinator of all worker threads, mostly running in main thread
class ThreadCoordinator:
def __init__(self, pool, dbManager):
def __init__(self, pool: ThreadPool, dbManager):
self._curStep = -1 # first step is 0
self._pool = pool
# self._wd = wd
......@@ -187,6 +223,7 @@ class ThreadCoordinator:
self._stepBarrier = threading.Barrier(self._pool.numThreads + 1) # one barrier for all threads
self._execStats = ExecutionStats()
self._runStatus = MainExec.STATUS_RUNNING
def getTaskExecutor(self):
return self._te
......@@ -197,6 +234,10 @@ class ThreadCoordinator:
def crossStepBarrier(self):
self._stepBarrier.wait()
def requestToStop(self):
self._runStatus = MainExec.STATUS_STOPPING
self._execStats.registerFailure("User Interruption")
def run(self):
self._pool.createAndStartThreads(self)
......@@ -204,48 +245,73 @@ class ThreadCoordinator:
self._curStep = -1 # not started yet
maxSteps = gConfig.max_steps # type: ignore
self._execStats.startExec() # start the stop watch
failed = False
while(self._curStep < maxSteps-1 and not failed): # maxStep==10, last curStep should be 9
transitionFailed = False
hasAbortedTask = False
while(self._curStep < maxSteps-1 and
(not transitionFailed) and
(self._runStatus==MainExec.STATUS_RUNNING) and
(not hasAbortedTask)): # maxStep==10, last curStep should be 9
if not gConfig.debug:
print(".", end="", flush=True) # print this only if we are not in debug mode
logger.debug("[TRD] Main thread going to sleep")
# Now ready to enter a step
# Now main thread (that's us) is ready to enter a step
self.crossStepBarrier() # let other threads go past the pool barrier, but wait at the thread gate
self._stepBarrier.reset() # Other worker threads should now be at the "gate"
# At this point, all threads should be pass the overall "barrier" and before the per-thread "gate"
try:
self._dbManager.getStateMachine().transition(self._executedTasks) # at end of step, transition the DB state
except taos.error.ProgrammingError as err:
if ( err.msg == 'network unavailable' ): # broken DB connection
logger.info("DB connection broken, execution failed")
traceback.print_stack()
failed = True
self._te = None # Not running any more
self._execStats.registerFailure("Broken DB Connection")
# continue # don't do that, need to tap all threads at end, and maybe signal them to stop
else:
raise
finally:
pass
# We use this period to do house keeping work, when all worker threads are QUIET.
hasAbortedTask = False
for task in self._executedTasks :
if task.isAborted() :
print("Task aborted: {}".format(task))
hasAbortedTask = True
break
if hasAbortedTask : # do transition only if tasks are error free
self._execStats.registerFailure("Aborted Task Encountered")
else:
try:
sm = self._dbManager.getStateMachine()
logger.debug("[STT] starting transitions")
sm.transition(self._executedTasks) # at end of step, transition the DB state
logger.debug("[STT] transition ended")
# Due to limitation (or maybe not) of the Python library, we cannot share connections across threads
if sm.hasDatabase() :
for t in self._pool.threadList:
logger.debug("[DB] use db for all worker threads")
t.useDb()
# t.execSql("use db") # main thread executing "use db" on behalf of every worker thread
except taos.error.ProgrammingError as err:
if ( err.msg == 'network unavailable' ): # broken DB connection
logger.info("DB connection broken, execution failed")
traceback.print_stack()
transitionFailed = True
self._te = None # Not running any more
self._execStats.registerFailure("Broken DB Connection")
# continue # don't do that, need to tap all threads at end, and maybe signal them to stop
else:
raise
# finally:
# pass
self.resetExecutedTasks() # clear the tasks after we are done
# Get ready for next step
logger.debug("<-- Step {} finished".format(self._curStep))
self._curStep += 1 # we are about to get into next step. TODO: race condition here!
logger.debug("\r\n--> Step {} starts with main thread waking up".format(self._curStep)) # Now not all threads had time to go to sleep
logger.debug("\r\n\n--> Step {} starts with main thread waking up".format(self._curStep)) # Now not all threads had time to go to sleep
# A new TE for the new step
if not failed: # only if not failed
if not transitionFailed: # only if not failed
self._te = TaskExecutor(self._curStep)
logger.debug("[TRD] Main thread waking up at step {}, tapping worker threads".format(self._curStep)) # Now not all threads had time to go to sleep
self.tapAllThreads()
self.tapAllThreads() # Worker threads will wake up at this point, and each executes its own task
logger.debug("Main thread ready to finish up...")
if not failed: # only in regular situations
if not transitionFailed: # only in regular situations
self.crossStepBarrier() # Cross it one last time, after all threads finish
self._stepBarrier.reset()
logger.debug("Main thread in exclusive zone...")
......@@ -255,11 +321,17 @@ class ThreadCoordinator:
logger.debug("Main thread joining all threads")
self._pool.joinAll() # Get all threads to finish
logger.info("All worker thread finished")
logger.info("\nAll worker threads finished")
self._execStats.endExec()
def logStats(self):
self._execStats.logStats()
def printStats(self):
self._execStats.printStats()
def isFailed(self):
return self._execStats.isFailed()
def getExecStats(self):
return self._execStats
def tapAllThreads(self): # in a deterministic manner
wakeSeq = []
......@@ -268,7 +340,7 @@ class ThreadCoordinator:
wakeSeq.append(i)
else:
wakeSeq.insert(0, i)
logger.debug("[TRD] Main thread waking up worker thread: {}".format(str(wakeSeq)))
logger.debug("[TRD] Main thread waking up worker threads: {}".format(str(wakeSeq)))
# TODO: set dice seed to a deterministic value
for i in wakeSeq:
self._pool.threadList[i].tapStepGate() # TODO: maybe a bit too deep?!
......@@ -306,7 +378,7 @@ class ThreadPool:
self.maxSteps = maxSteps
# Internal class variables
self.curStep = 0
self.threadList = []
self.threadList = [] # type: List[WorkerThread]
# starting to run all the threads, in locking steps
def createAndStartThreads(self, tc: ThreadCoordinator):
......@@ -397,26 +469,39 @@ class LinearQueue():
return ret
class DbConn:
TYPE_NATIVE = "native-c"
TYPE_REST = "rest-api"
TYPE_INVALID = "invalid"
@classmethod
def create(cls, connType):
if connType == cls.TYPE_NATIVE:
return DbConnNative()
elif connType == cls.TYPE_REST:
return DbConnRest()
else:
raise RuntimeError("Unexpected connection type: {}".format(connType))
@classmethod
def createNative(cls):
return cls.create(cls.TYPE_NATIVE)
@classmethod
def createRest(cls):
return cls.create(cls.TYPE_REST)
def __init__(self):
self._conn = None
self._cursor = None
self.isOpen = False
def open(self): # Open connection
self._type = self.TYPE_INVALID
def open(self):
if ( self.isOpen ):
raise RuntimeError("Cannot re-open an existing DB connection")
cfgPath = "../../build/test/cfg"
self._conn = taos.connect(host="127.0.0.1", config=cfgPath) # TODO: make configurable
self._cursor = self._conn.cursor()
# below implemented by child classes
self.openByType()
# Get the connection/cursor ready
self._cursor.execute('reset query cache')
# self._cursor.execute('use db') # note we do this in _findCurrenState
# Open connection
self._tdSql = TDSql()
self._tdSql.init(self._cursor)
logger.debug("[DB] data connection opened, type = {}".format(self._type))
self.isOpen = True
def resetDb(self): # reset the whole database, etc.
......@@ -424,17 +509,128 @@ class DbConn:
raise RuntimeError("Cannot reset database until connection is open")
# self._tdSql.prepare() # Recreate database, etc.
self._cursor.execute('drop database if exists db')
self.execute('drop database if exists db')
logger.debug("Resetting DB, dropped database")
# self._cursor.execute('create database db')
# self._cursor.execute('use db')
# tdSql.execute('show databases')
def queryScalar(self, sql) -> int :
return self._queryAny(sql)
def queryString(self, sql) -> str :
return self._queryAny(sql)
def _queryAny(self, sql) : # actual query result as an int
if ( not self.isOpen ):
raise RuntimeError("Cannot query database until connection is open")
nRows = self.query(sql)
if nRows != 1 :
raise RuntimeError("Unexpected result for query: {}, rows = {}".format(sql, nRows))
if self.getResultRows() != 1 or self.getResultCols() != 1:
raise RuntimeError("Unexpected result set for query: {}".format(sql))
return self.getQueryResult()[0][0]
def execute(self, sql):
raise RuntimeError("Unexpected execution, should be overriden")
def openByType(self):
raise RuntimeError("Unexpected execution, should be overriden")
def getQueryResult(self):
raise RuntimeError("Unexpected execution, should be overriden")
def getResultRows(self):
raise RuntimeError("Unexpected execution, should be overriden")
def getResultCols(self):
raise RuntimeError("Unexpected execution, should be overriden")
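
DbConn above is now an abstract base with a small factory (create / createNative / createRest) and a template-method open() that delegates the transport-specific setup to openByType(), so the REST and native connections expose the same interface; WorkerThread and DbManager pick between them from gConfig.connector_type. A short sketch of the intended call pattern (the use_rest flag is assumed; the classes are the ones defined in this file):

    use_rest = False                                    # assumed flag for this sketch
    conn = DbConn.createRest() if use_rest else DbConn.createNative()
    conn.open()                                         # base open() calls the subclass's openByType()
    nRows = conn.query("show databases")                # same call regardless of transport
    rows = conn.getQueryResult()
    conn.close()
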
# Sample: curl -u root:taosdata -d "show databases" localhost:6020/rest/sql
class DbConnRest(DbConn):
def __init__(self):
super().__init__()
self._type = self.TYPE_REST
self._url = "http://localhost:6020/rest/sql" # fixed for now
self._result = None
def openByType(self): # Open connection
pass # do nothing, always open
def close(self):
if ( not self.isOpen ):
raise RuntimeError("Cannot clean up database until connection is open")
# Do nothing for REST
logger.debug("[DB] REST Database connection closed")
self.isOpen = False
def _doSql(self, sql):
r = requests.post(self._url,
data = sql,
auth = HTTPBasicAuth('root', 'taosdata'))
rj = r.json()
# Sanity check for the "Json Result"
if (not 'status' in rj):
raise RuntimeError("No status in REST response")
if rj['status'] == 'error': # clearly reported error
if (not 'code' in rj): # error without code
raise RuntimeError("REST error return without code")
errno = rj['code'] # May need to massage this in the future
# print("Raising programming error with REST return: {}".format(rj))
raise taos.error.ProgrammingError(rj['desc'], errno) # todo: check existence of 'desc'
if rj['status'] != 'succ': # better be this
raise RuntimeError("Unexpected REST return status: {}".format(rj['status']))
nRows = rj['rows'] if ('rows' in rj) else 0
self._result = rj
return nRows
def execute(self, sql):
if ( not self.isOpen ):
raise RuntimeError("Cannot execute database commands until connection is open")
logger.debug("[SQL-REST] Executing SQL: {}".format(sql))
nRows = self._doSql(sql)
logger.debug("[SQL-REST] Execution Result, nRows = {}, SQL = {}".format(nRows, sql))
return nRows
def query(self, sql) : # return rows affected
return self.execute(sql)
def getQueryResult(self):
return self._result['data']
def getResultRows(self):
print(self._result)
raise RuntimeError("TBD")
# return self._tdSql.queryRows
def getResultCols(self):
print(self._result)
raise RuntimeError("TBD")
class DbConnNative(DbConn):
def __init__(self):
super().__init__()
self._type = self.TYPE_REST
self._conn = None
self._cursor = None
def openByType(self): # Open connection
cfgPath = "../../build/test/cfg"
self._conn = taos.connect(host="127.0.0.1", config=cfgPath) # TODO: make configurable
self._cursor = self._conn.cursor()
# Get the connection/cursor ready
self._cursor.execute('reset query cache')
# self._cursor.execute('use db') # do this at the beginning of every step
# Open connection
self._tdSql = TDSql()
self._tdSql.init(self._cursor)
def close(self):
if ( not self.isOpen ):
raise RuntimeError("Cannot clean up database until connection is open")
self._tdSql.close()
logger.debug("[DB] Database connection closed")
self.isOpen = False
def execute(self, sql):
......@@ -450,29 +646,19 @@ class DbConn:
raise RuntimeError("Cannot query database until connection is open")
logger.debug("[SQL] Executing SQL: {}".format(sql))
nRows = self._tdSql.query(sql)
logger.debug("[SQL] Execution Result, nRows = {}, SQL = {}".format(nRows, sql))
logger.debug("[SQL] Query Result, nRows = {}, SQL = {}".format(nRows, sql))
return nRows
# results are in: return self._tdSql.queryResult
def getQueryResult(self):
return self._tdSql.queryResult
def _queryAny(self, sql) : # actual query result as an int
if ( not self.isOpen ):
raise RuntimeError("Cannot query database until connection is open")
tSql = self._tdSql
nRows = tSql.query(sql)
if nRows != 1 :
raise RuntimeError("Unexpected result for query: {}, rows = {}".format(sql, nRows))
if tSql.queryRows != 1 or tSql.queryCols != 1:
raise RuntimeError("Unexpected result set for query: {}".format(sql))
return tSql.queryResult[0][0]
def getResultRows(self):
return self._tdSql.queryRows
def queryScalar(self, sql) -> int :
return self._queryAny(sql)
def getResultCols(self):
return self._tdSql.queryCols
def queryString(self, sql) -> str :
return self._queryAny(sql)
class AnyState:
STATE_INVALID = -1
......@@ -620,10 +806,10 @@ class StateDbOnly(AnyState):
# self.assertHasTask(tasks, DropDbTask) # implied by hasSuccess
# self.assertAtMostOneSuccess(tasks, DropDbTask)
# self._state = self.STATE_EMPTY
if ( self.hasSuccess(tasks, TaskCreateSuperTable) ): # did not drop db, create table success
# self.assertHasTask(tasks, CreateFixedTableTask) # tried to create table
if ( not self.hasTask(tasks, TaskDropSuperTable) ):
self.assertAtMostOneSuccess(tasks, TaskCreateSuperTable) # at most 1 attempt is successful, if we don't drop anything
# if ( self.hasSuccess(tasks, TaskCreateSuperTable) ): # did not drop db, create table success
# # self.assertHasTask(tasks, CreateFixedTableTask) # tried to create table
# if ( not self.hasTask(tasks, TaskDropSuperTable) ):
# self.assertAtMostOneSuccess(tasks, TaskCreateSuperTable) # at most 1 attempt is successful, if we don't drop anything
# self.assertNoTask(tasks, DropDbTask) # should have have tried
# if ( not self.hasSuccess(tasks, AddFixedDataTask) ): # just created table, no data yet
# # can't say there's add-data attempts, since they may all fail
......@@ -648,7 +834,9 @@ class StateSuperTableOnly(AnyState):
def verifyTasksToState(self, tasks, newState):
if ( self.hasSuccess(tasks, TaskDropSuperTable) ): # we are able to drop the table
self.assertAtMostOneSuccess(tasks, TaskDropSuperTable)
#self.assertAtMostOneSuccess(tasks, TaskDropSuperTable)
self.hasSuccess(tasks, TaskCreateSuperTable) # we must have had recreated it
# self._state = self.STATE_DB_ONLY
# elif ( self.hasSuccess(tasks, AddFixedDataTask) ): # no success dropping the table, but added data
# self.assertNoTask(tasks, DropFixedTableTask) # not true in massively parrallel cases
......@@ -680,18 +868,19 @@ class StateHasData(AnyState):
self.assertNoTask(tasks, TaskDropDb) # we must have drop_db task
self.hasSuccess(tasks, TaskDropSuperTable)
# self.assertAtMostOneSuccess(tasks, DropFixedSuperTableTask) # TODO: dicy
elif ( newState.equals(AnyState.STATE_TABLE_ONLY) ): # data deleted
self.assertNoTask(tasks, TaskDropDb)
self.assertNoTask(tasks, TaskDropSuperTable)
self.assertNoTask(tasks, TaskAddData)
# elif ( newState.equals(AnyState.STATE_TABLE_ONLY) ): # data deleted
# self.assertNoTask(tasks, TaskDropDb)
# self.assertNoTask(tasks, TaskDropSuperTable)
# self.assertNoTask(tasks, TaskAddData)
# self.hasSuccess(tasks, DeleteDataTasks)
else: # should be STATE_HAS_DATA
self.assertNoTask(tasks, TaskDropDb)
if (not self.hasTask(tasks, TaskCreateDb) ): # only if we didn't create one
self.assertNoTask(tasks, TaskDropDb) # we shouldn't have dropped it
if (not self.hasTask(tasks, TaskCreateSuperTable)) : # if we didn't create the table
self.assertNoTask(tasks, TaskDropSuperTable) # we should not have a task that drops it
# self.assertIfExistThenSuccess(tasks, ReadFixedDataTask)
class StateMechine :
class StateMechine:
def __init__(self, dbConn):
self._dbConn = dbConn
self._curState = self._findCurrentState() # starting state
......@@ -700,8 +889,17 @@ class StateMechine :
def getCurrentState(self):
return self._curState
def hasDatabase(self):
return self._curState.canDropDb() # ha, can drop DB means it has one
# May be slow, use cautiously...
def getTaskTypes(self): # those that can run (directly/indirectly) from the current state
def typesToStrings(types):
ss = []
for t in types:
ss.append(t.__name__)
return ss
allTaskClasses = StateTransitionTask.__subclasses__() # all state transition tasks
firstTaskTypes = []
for tc in allTaskClasses:
......@@ -720,7 +918,7 @@ class StateMechine :
if len(taskTypes) <= 0:
raise RuntimeError("No suitable task types found for state: {}".format(self._curState))
logger.debug("[OPS] Tasks found for state {}: {}".format(self._curState, taskTypes))
logger.debug("[OPS] Tasks found for state {}: {}".format(self._curState, typesToStrings(taskTypes)))
return taskTypes
def _findCurrentState(self):
......@@ -730,7 +928,7 @@ class StateMechine :
# logger.debug("Found EMPTY state")
logger.debug("[STT] empty database found, between {} and {}".format(ts, time.time()))
return StateEmpty()
dbc.execute("use db") # did not do this when openning connection
dbc.execute("use db") # did not do this when openning connection, and this is NOT the worker thread, which does this on their own
if dbc.query("show tables") == 0 : # no tables
# logger.debug("Found DB ONLY state")
logger.debug("[STT] DB_ONLY found, between {} and {}".format(ts, time.time()))
......@@ -746,6 +944,7 @@ class StateMechine :
def transition(self, tasks):
if ( len(tasks) == 0 ): # before 1st step, or otherwise empty
logger.debug("[STT] Starting State: {}".format(self._curState))
return # do nothing
self._dbConn.execute("show dnodes") # this should show up in the server log, separating steps
......@@ -807,14 +1006,14 @@ class DbManager():
self._lock = threading.RLock()
# self.openDbServerConnection()
self._dbConn = DbConn()
self._dbConn = DbConn.createNative() if (gConfig.connector_type=='native') else DbConn.createRest()
try:
self._dbConn.open() # may throw taos.error.ProgrammingError: disconnected
except taos.error.ProgrammingError as err:
# print("Error type: {}, msg: {}, value: {}".format(type(err), err.msg, err))
if ( err.msg == 'client disconnected' ): # cannot open DB connection
print("Cannot establish DB connection, please re-run script without parameter, and follow the instructions.")
sys.exit()
sys.exit(2)
else:
raise
except:
......@@ -829,7 +1028,7 @@ class DbManager():
def getDbConn(self):
return self._dbConn
def getStateMachine(self):
def getStateMachine(self) -> StateMechine :
return self._stateMachine
# def getState(self):
......@@ -869,8 +1068,11 @@ class DbManager():
def getNextTick(self):
with self._lock: # prevent duplicate tick
self._lastTick += datetime.timedelta(0, 1) # add one second to it
return self._lastTick
if Dice.throw(10) == 0 : # 1 in 10 chance
return self._lastTick + datetime.timedelta(0, -100)
else: # regular
self._lastTick += datetime.timedelta(0, 1) # add one second to it
return self._lastTick
def getNextInt(self):
with self._lock:
......@@ -894,15 +1096,60 @@ class DbManager():
self._dbConn.close()
class TaskExecutor():
class BoundedList:
def __init__(self, size = 10):
self._size = size
self._list = []
def add(self, n: int) :
if not self._list: # empty
self._list.append(n)
return
# now we should insert
nItems = len(self._list)
insPos = 0
for i in range(nItems):
insPos = i
if n <= self._list[i] : # smaller than this item, time to insert
break # found the insertion point
insPos += 1 # insert to the right
if insPos == 0 : # except for the 1st item, # TODO: eliminate first item as gating item
return # do nothing
# print("Inserting at postion {}, value: {}".format(insPos, n))
self._list.insert(insPos, n) # insert
newLen = len(self._list)
if newLen <= self._size :
return # do nothing
elif newLen == (self._size + 1) :
del self._list[0] # remove the first item
else :
raise RuntimeError("Corrupt Bounded List")
def __str__(self):
return repr(self._list)
_boundedList = BoundedList()
def __init__(self, curStep):
self._curStep = curStep
@classmethod
def getBoundedList(cls):
return cls._boundedList
def getCurStep(self):
return self._curStep
def execute(self, task: Task, wt: WorkerThread): # execute a task on a thread
task.execute(wt)
def recordDataMark(self, n: int):
# print("[{}]".format(n), end="", flush=True)
self._boundedList.add(n)
# def logInfo(self, msg):
# logger.info(" T[{}.x]: ".format(self._curStep) + msg)
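
BoundedList above keeps an ascending list of at most size values: add() insertion-sorts a new value, drops the smallest element once the list overflows, and silently ignores anything no larger than the current head (even before the list is full). A small worked example using the class as defined above:

    bl = TaskExecutor.BoundedList(3)
    for n in [5, 1, 9, 7, 12, 3]:
        bl.add(n)
    print(bl)    # [7, 9, 12] -- 1 and 3 were ignored, 5 was pushed out when 12 arrived
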
......@@ -922,6 +1169,7 @@ class Task():
self._dbManager = dbManager
self._workerThread = None
self._err = None
self._aborted = False
self._curStep = None
self._numRows = None # Number of rows affected
......@@ -930,10 +1178,14 @@ class Task():
# logger.debug("Creating new task {}...".format(self._taskNum))
self._execStats = execStats
self._lastSql = "" # last SQL executed/attempted
def isSuccess(self):
return self._err == None
def isAborted(self):
return self._aborted
def clone(self): # TODO: why do we need this again?
newTask = self.__class__(self._dbManager, self._execStats)
return newTask
......@@ -960,10 +1212,31 @@ class Task():
try:
self._executeInternal(te, wt) # TODO: no return value?
except taos.error.ProgrammingError as err:
self.logDebug("[=] Taos library exception: errno={:X}, msg: {}".format(err.errno, err))
self._err = err
except:
self.logDebug("[=] Unexpected exception")
errno2 = err.errno if (err.errno > 0) else 0x80000000 + err.errno # correct error scheme
if ( errno2 in [0x200, 0x360, 0x362, 0x36A, 0x36B, 0x36D, 0x381, 0x380, 0x383, 0x503, 0x600,
1000 # REST catch-all error
]) : # allowed errors
self.logDebug("[=] Acceptable Taos library exception: errno=0x{:X}, msg: {}, SQL: {}".format(errno2, err, self._lastSql))
print("_", end="", flush=True)
self._err = err
else:
errMsg = "[=] Unexpected Taos library exception: errno=0x{:X}, msg: {}, SQL: {}".format(errno2, err, self._lastSql)
self.logDebug(errMsg)
if gConfig.debug :
raise # so that we see full stack
else: # non-debug
print("\n\n----------------------------\nProgram ABORTED Due to Unexpected TAOS Error: \n\n{}\n".format(errMsg) +
"----------------------------\n")
# sys.exit(-1)
self._err = err
self._aborted = True
except Exception as e :
self.logInfo("Non-TAOS exception encountered")
self._err = e
self._aborted = True
traceback.print_exc()
except :
self.logDebug("[=] Unexpected exception, SQL: {}".format(self._lastSql))
raise
self._execStats.endTaskType(self.__class__.__name__, self.isSuccess())
......@@ -971,8 +1244,21 @@ class Task():
self._execStats.incExecCount(self.__class__.__name__, self.isSuccess()) # TODO: merge with above.
def execSql(self, sql):
self._lastSql = sql
return self._dbManager.execute(sql)
def execWtSql(self, wt: WorkerThread, sql): # execute an SQL on the worker thread
self._lastSql = sql
return wt.execSql(sql)
def queryWtSql(self, wt: WorkerThread, sql): # execute an SQL on the worker thread
self._lastSql = sql
return wt.querySql(sql)
def getQueryResult(self, wt: WorkerThread): # execute an SQL on the worker thread
return wt.getQueryResult()
class ExecutionStats:
def __init__(self):
......@@ -987,6 +1273,12 @@ class ExecutionStats:
self._failed = False
self._failureReason = None
def __str__(self):
return "[ExecStats: _failed={}, _failureReason={}".format(self._failed, self._failureReason)
def isFailed(self):
return self._failed == True
def startExec(self):
self._execStartTime = time.time()
......@@ -1018,7 +1310,7 @@ class ExecutionStats:
self._failed = True
self._failureReason = reason
def logStats(self):
def printStats(self):
logger.info("----------------------------------------------------------------------")
logger.info("| Crash_Gen test {}, with the following stats:".
format("FAILED (reason: {})".format(self._failureReason) if self._failed else "SUCCEEDED"))
......@@ -1033,11 +1325,17 @@ class ExecutionStats:
logger.info("| Total Task Busy Time (elapsed time when any task is in progress): {:.3f} seconds".format(self._accRunTime))
logger.info("| Average Per-Task Execution Time: {:.3f} seconds".format(self._accRunTime/execTimesAny))
logger.info("| Total Elapsed Time (from wall clock): {:.3f} seconds".format(self._elapsedTime))
logger.info("| Top numbers written: {}".format(TaskExecutor.getBoundedList()))
logger.info("----------------------------------------------------------------------")
class StateTransitionTask(Task):
LARGE_NUMBER_OF_TABLES = 35
SMALL_NUMBER_OF_TABLES = 3
LARGE_NUMBER_OF_RECORDS = 50
SMALL_NUMBER_OF_RECORDS = 3
@classmethod
def getInfo(cls): # each sub class should supply their own information
raise RuntimeError("Overriding method expected")
......@@ -1060,6 +1358,10 @@ class StateTransitionTask(Task):
# return state.getValue() in cls.getBeginStates()
raise RuntimeError("must be overriden")
@classmethod
def getRegTableName(cls, i):
return "db.reg_table_{}".format(i)
def execute(self, wt: WorkerThread):
super().execute(wt)
......@@ -1073,7 +1375,7 @@ class TaskCreateDb(StateTransitionTask):
return state.canCreateDb()
def _executeInternal(self, te: TaskExecutor, wt: WorkerThread):
wt.execSql("create database db")
self.execWtSql(wt, "create database db")
class TaskDropDb(StateTransitionTask):
@classmethod
......@@ -1085,7 +1387,7 @@ class TaskDropDb(StateTransitionTask):
return state.canDropDb()
def _executeInternal(self, te: TaskExecutor, wt: WorkerThread):
wt.execSql("drop database db")
self.execWtSql(wt, "drop database db")
logger.debug("[OPS] database dropped at {}".format(time.time()))
class TaskCreateSuperTable(StateTransitionTask):
......@@ -1098,8 +1400,13 @@ class TaskCreateSuperTable(StateTransitionTask):
return state.canCreateFixedSuperTable()
def _executeInternal(self, te: TaskExecutor, wt: WorkerThread):
tblName = self._dbManager.getFixedSuperTableName()
wt.execSql("create table db.{} (ts timestamp, speed int) tags (b binary(200), f float) ".format(tblName))
if not wt.dbInUse(): # no DB yet, to the best of our knowledge
logger.debug("Skipping task, no DB yet")
return
tblName = self._dbManager.getFixedSuperTableName()
# wt.execSql("use db") # should always be in place
self.execWtSql(wt, "create table db.{} (ts timestamp, speed int) tags (b binary(200), f float) ".format(tblName))
# No need to create the regular tables, INSERT will do that automatically
......@@ -1114,16 +1421,16 @@ class TaskReadData(StateTransitionTask):
def _executeInternal(self, te: TaskExecutor, wt: WorkerThread):
sTbName = self._dbManager.getFixedSuperTableName()
dbc = wt.getDbConn()
dbc.query("select TBNAME from db.{}".format(sTbName)) # TODO: analyze result set later
self.queryWtSql(wt, "select TBNAME from db.{}".format(sTbName)) # TODO: analyze result set later
if random.randrange(5) == 0 : # 1 in 5 chance, simulate a broken connection. TODO: break connection in all situations
dbc.close()
dbc.open()
wt.getDbConn().close()
wt.getDbConn().open()
else:
rTables = dbc.getQueryResult()
rTables = self.getQueryResult(wt) # wt.getDbConn().getQueryResult()
# print("rTables[0] = {}, type = {}".format(rTables[0], type(rTables[0])))
for rTbName in rTables : # regular tables
dbc.query("select * from db.{}".format(rTbName[0])) # TODO: check success failure
self.execWtSql(wt, "select * from db.{}".format(rTbName[0]))
# tdSql.query(" cars where tbname in ('carzero', 'carone')")
......@@ -1137,8 +1444,33 @@ class TaskDropSuperTable(StateTransitionTask):
return state.canDropFixedSuperTable()
def _executeInternal(self, te: TaskExecutor, wt: WorkerThread):
tblName = self._dbManager.getFixedSuperTableName()
wt.execSql("drop table db.{}".format(tblName))
# 1/2 chance, we'll drop the regular tables one by one, in a randomized sequence
if Dice.throw(2) == 0 :
tblSeq = list(range(2 + (self.LARGE_NUMBER_OF_TABLES if gConfig.larger_data else self.SMALL_NUMBER_OF_TABLES)))
random.shuffle(tblSeq)
tickOutput = False # if we have spitted out a "d" character for "drop regular table"
isSuccess = True
for i in tblSeq:
regTableName = self.getRegTableName(i); # "db.reg_table_{}".format(i)
try:
self.execWtSql(wt, "drop table {}".format(regTableName)) # nRows always 0, like MySQL
except taos.error.ProgrammingError as err:
errno2 = err.errno if (err.errno > 0) else 0x80000000 + err.errno # correcting for strange error number scheme
if ( errno2 in [0x362]) : # mnode invalid table name
isSuccess = False
logger.debug("[DB] Acceptable error when dropping a table")
continue # try to delete next regular table
if (not tickOutput):
tickOutput = True # Print only one time
if isSuccess :
print("d", end="", flush=True)
else:
print("f", end="", flush=True)
# Drop the super table itself
tblName = self._dbManager.getFixedSuperTableName()
self.execWtSql(wt, "drop table db.{}".format(tblName))
class TaskAlterTags(StateTransitionTask):
@classmethod
......@@ -1153,20 +1485,18 @@ class TaskAlterTags(StateTransitionTask):
tblName = self._dbManager.getFixedSuperTableName()
dice = Dice.throw(4)
if dice == 0 :
wt.execSql("alter table db.{} add tag extraTag int".format(tblName))
sql = "alter table db.{} add tag extraTag int".format(tblName)
elif dice == 1 :
wt.execSql("alter table db.{} drop tag extraTag".format(tblName))
sql = "alter table db.{} drop tag extraTag".format(tblName)
elif dice == 2 :
wt.execSql("alter table db.{} drop tag newTag".format(tblName))
sql = "alter table db.{} drop tag newTag".format(tblName)
else: # dice == 3
wt.execSql("alter table db.{} change tag extraTag newTag".format(tblName))
sql = "alter table db.{} change tag extraTag newTag".format(tblName)
self.execWtSql(wt, sql)
class TaskAddData(StateTransitionTask):
activeTable : Set[int] = set() # Track which table is being actively worked on
LARGE_NUMBER_OF_TABLES = 35
SMALL_NUMBER_OF_TABLES = 3
LARGE_NUMBER_OF_RECORDS = 50
SMALL_NUMBER_OF_RECORDS = 3
# We use these two files to record operations to DB, useful for power-off tests
fAddLogReady = None
......@@ -1192,7 +1522,7 @@ class TaskAddData(StateTransitionTask):
def _executeInternal(self, te: TaskExecutor, wt: WorkerThread):
ds = self._dbManager
wt.execSql("use db") # TODO: seems to be an INSERT bug to require this
# wt.execSql("use db") # TODO: seems to be an INSERT bug to require this
tblSeq = list(range(self.LARGE_NUMBER_OF_TABLES if gConfig.larger_data else self.SMALL_NUMBER_OF_TABLES))
random.shuffle(tblSeq)
for i in tblSeq:
......@@ -1202,10 +1532,10 @@ class TaskAddData(StateTransitionTask):
print("x", end="", flush=True)
else:
self.activeTable.add(i) # marking it active
# No need to shuffle data sequence, unless later we decide to do non-increment insertion
# No need to shuffle data sequence, unless later we decide to do non-increment insertion
regTableName = self.getRegTableName(i); # "db.reg_table_{}".format(i)
for j in range(self.LARGE_NUMBER_OF_RECORDS if gConfig.larger_data else self.SMALL_NUMBER_OF_RECORDS) : # number of records per table
nextInt = ds.getNextInt()
regTableName = "db.reg_table_{}".format(i)
nextInt = ds.getNextInt()
if gConfig.record_ops:
self.prepToRecordOps()
self.fAddLogReady.write("Ready to write {} to {}\n".format(nextInt, regTableName))
......@@ -1216,7 +1546,9 @@ class TaskAddData(StateTransitionTask):
ds.getFixedSuperTableName(),
ds.getNextBinary(), ds.getNextFloat(),
ds.getNextTick(), nextInt)
wt.execSql(sql)
self.execWtSql(wt, sql)
# Successfully wrote the data into the DB, let's record it somehow
te.recordDataMark(nextInt)
if gConfig.record_ops:
self.fAddLogDone.write("Wrote {} to {}\n".format(nextInt, regTableName))
self.fAddLogDone.flush()
......@@ -1285,17 +1617,213 @@ class LoggingFilter(logging.Filter):
if ( record.levelno >= logging.INFO ) :
return True # info or above always log
msg = record.msg
# print("type = {}, value={}".format(type(msg), msg))
# sys.exit()
# Commenting out below to adjust...
# if msg.startswith("[TRD]"):
# return False
return True
class MyLoggingAdapter(logging.LoggerAdapter):
def process(self, msg, kwargs):
return "[{}]{}".format(threading.get_ident() % 10000, msg), kwargs
# return '[%s] %s' % (self.extra['connid'], msg), kwargs
class SvcManager:
def __init__(self):
print("Starting service manager")
signal.signal(signal.SIGTERM, self.sigIntHandler)
signal.signal(signal.SIGINT, self.sigIntHandler)
self.ioThread = None
self.subProcess = None
self.shouldStop = False
self.status = MainExec.STATUS_RUNNING
def svcOutputReader(self, out: IO, queue):
# print("This is the svcOutput Reader...")
for line in out : # iter(out.readline, b''):
# print("Finished reading a line: {}".format(line))
queue.put(line.rstrip()) # get rid of new lines
print("No more output from incoming IO") # meaning sub process must have died
out.close()
def sigIntHandler(self, signalNumber, frame):
if self.status != MainExec.STATUS_RUNNING :
print("Ignoring repeated SIGINT...")
return # do nothing if it's already not running
self.status = MainExec.STATUS_STOPPING # immediately set our status
print("Terminating program...")
self.subProcess.send_signal(signal.SIGINT)
self.shouldStop = True
self.joinIoThread()
def joinIoThread(self):
if self.ioThread :
self.ioThread.join()
self.ioThread = None
def run(self):
ON_POSIX = 'posix' in sys.builtin_module_names
svcCmd = ['../../build/build/bin/taosd', '-c', '../../build/test/cfg']
# svcCmd = ['vmstat', '1']
self.subProcess = subprocess.Popen(svcCmd, stdout=subprocess.PIPE, bufsize=1, close_fds=ON_POSIX, text=True)
q = Queue()
self.ioThread = threading.Thread(target=self.svcOutputReader, args=(self.subProcess.stdout, q))
self.ioThread.daemon = True # thread dies with the program
self.ioThread.start()
# proc = subprocess.Popen(['echo', '"to stdout"'],
# stdout=subprocess.PIPE,
# )
# stdout_value = proc.communicate()[0]
# print('\tstdout: {}'.format(repr(stdout_value)))
while True :
try:
line = q.get_nowait() # getting output at fast speed
except Empty:
# print('no output yet')
time.sleep(2.3) # wait only if there's no output
else: # got line
print(line)
# print("----end of iteration----")
if self.shouldStop:
print("Ending main Svc thread")
break
print("end of loop")
self.joinIoThread()
print("Finished")
class ClientManager:
def __init__(self):
print("Starting service manager")
signal.signal(signal.SIGTERM, self.sigIntHandler)
signal.signal(signal.SIGINT, self.sigIntHandler)
self.status = MainExec.STATUS_RUNNING
self.tc = None
def sigIntHandler(self, signalNumber, frame):
if self.status != MainExec.STATUS_RUNNING :
print("Ignoring repeated SIGINT...")
return # do nothing if it's already not running
self.status = MainExec.STATUS_STOPPING # immediately set our status
print("Terminating program...")
self.tc.requestToStop()
def _printLastNumbers(self): # to verify data durability
dbManager = DbManager(resetDb=False)
dbc = dbManager.getDbConn()
if dbc.query("show databases") == 0 : # no databae
return
if dbc.query("show tables") == 0 : # no tables
return
dbc.execute("use db")
sTbName = dbManager.getFixedSuperTableName()
# get all regular tables
dbc.query("select TBNAME from db.{}".format(sTbName)) # TODO: analyze result set later
rTables = dbc.getQueryResult()
bList = TaskExecutor.BoundedList()
for rTbName in rTables : # regular tables
dbc.query("select speed from db.{}".format(rTbName[0]))
numbers = dbc.getQueryResult()
for row in numbers :
# print("<{}>".format(n), end="", flush=True)
bList.add(row[0])
print("Top numbers in DB right now: {}".format(bList))
print("TDengine client execution is about to start in 2 seconds...")
time.sleep(2.0)
dbManager = None # release?
def prepare(self):
self._printLastNumbers()
def run(self):
self._printLastNumbers()
dbManager = DbManager() # Regular function
Dice.seed(0) # initial seeding of dice
thPool = ThreadPool(gConfig.num_threads, gConfig.max_steps)
self.tc = ThreadCoordinator(thPool, dbManager)
self.tc.run()
# print("exec stats: {}".format(self.tc.getExecStats()))
# print("TC failed = {}".format(self.tc.isFailed()))
self.conclude()
# print("TC failed (2) = {}".format(self.tc.isFailed()))
return 1 if self.tc.isFailed() else 0 # Linux return code: ref https://shapeshed.com/unix-exit-codes/
def conclude(self):
self.tc.printStats()
self.tc.getDbManager().cleanUp()
class MainExec:
STATUS_RUNNING = 1
STATUS_STOPPING = 2
# STATUS_STOPPED = 3 # Not used yet
@classmethod
def runClient(cls):
clientManager = ClientManager()
return clientManager.run()
@classmethod
def runService(cls):
svcManager = SvcManager()
svcManager.run()
@classmethod
def runTemp(cls): # for debugging purposes
# # Hack to exercise reading from disk, increasing coverage. TODO: fix
# dbc = dbState.getDbConn()
# sTbName = dbState.getFixedSuperTableName()
# dbc.execute("create database if not exists db")
# if not dbState.getState().equals(StateEmpty()):
# dbc.execute("use db")
# rTables = None
# try: # the super table may not exist
# sql = "select TBNAME from db.{}".format(sTbName)
# logger.info("Finding out tables in super table: {}".format(sql))
# dbc.query(sql) # TODO: analyze result set later
# logger.info("Fetching result")
# rTables = dbc.getQueryResult()
# logger.info("Result: {}".format(rTables))
# except taos.error.ProgrammingError as err:
# logger.info("Initial Super table OPS error: {}".format(err))
# # sys.exit()
# if ( not rTables == None):
# # print("rTables[0] = {}, type = {}".format(rTables[0], type(rTables[0])))
# try:
# for rTbName in rTables : # regular tables
# ds = dbState
# logger.info("Inserting into table: {}".format(rTbName[0]))
# sql = "insert into db.{} values ('{}', {});".format(
# rTbName[0],
# ds.getNextTick(), ds.getNextInt())
# dbc.execute(sql)
# for rTbName in rTables : # regular tables
# dbc.query("select * from db.{}".format(rTbName[0])) # TODO: check success failure
# logger.info("Initial READING operation is successful")
# except taos.error.ProgrammingError as err:
# logger.info("Initial WRITE/READ error: {}".format(err))
# Sandbox testing code
# dbc = dbState.getDbConn()
# while True:
# rows = dbc.query("show databases")
# print("Rows: {}, time={}".format(rows, time.time()))
return
def main():
# Super cool Python argument library: https://docs.python.org/3/library/argparse.html
......@@ -1308,93 +1836,55 @@ def main():
2. You run the server there before this script: ./build/bin/taosd -c test/cfg
'''))
parser.add_argument('-c', '--connector-type', action='store', default='native', type=str,
help='Connector type to use: native, rest, or mixed (default: native)')
parser.add_argument('-d', '--debug', action='store_true',
help='Turn on DEBUG mode for more logging (default: false)')
parser.add_argument('-e', '--run-tdengine', action='store_true',
help='Run TDengine service in foreground (default: false)')
parser.add_argument('-l', '--larger-data', action='store_true',
help='Write larger amount of data during write operations (default: false)')
parser.add_argument('-p', '--per-thread-db-connection', action='store_true',
help='Use a separate db connection per thread (default: false)')
parser.add_argument('-r', '--record-ops', action='store_true',
help='Use a pair of always-fsynced files to record operations in progress and completed, for power-off tests (default: false)')
parser.add_argument('-s', '--max-steps', action='store', default=100, type=int,
parser.add_argument('-s', '--max-steps', action='store', default=1000, type=int,
help='Maximum number of steps to run (default: 1000)')
parser.add_argument('-t', '--num-threads', action='store', default=10, type=int,
parser.add_argument('-t', '--num-threads', action='store', default=5, type=int,
help='Number of threads to run (default: 5)')
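# Example invocation (flags as defined above, launched via python3 as in the wrapper shell script below): python3 ./crash_gen.py -t 5 -s 1000 -d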
global gConfig
gConfig = parser.parse_args()
if len(sys.argv) == 1:
parser.print_help()
sys.exit()
# if len(sys.argv) == 1:
# parser.print_help()
# sys.exit()
# Logging Stuff
global logger
logger = logging.getLogger('CrashGen')
logger.addFilter(LoggingFilter())
_logger = logging.getLogger('CrashGen') # real logger
_logger.addFilter(LoggingFilter())
ch = logging.StreamHandler()
_logger.addHandler(ch)
logger = MyLoggingAdapter(_logger, []) # Logging adapter, to be used as a logger
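# From here on every log call goes through the adapter, so messages carry the calling thread id prefix (see MyLoggingAdapter.process).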
if ( gConfig.debug ):
logger.setLevel(logging.DEBUG) # default seems to be INFO
else:
logger.setLevel(logging.INFO)
ch = logging.StreamHandler()
logger.addHandler(ch)
# resetDb = False # DEBUG only
# dbState = DbState(resetDb) # DBEUG only!
dbManager = DbManager() # Regular function
Dice.seed(0) # initial seeding of dice
tc = ThreadCoordinator(
ThreadPool(gConfig.num_threads, gConfig.max_steps),
# WorkDispatcher(dbState), # Obsolete?
dbManager
)
# # Hack to exercise reading from disk, increasing coverage. TODO: fix
# dbc = dbState.getDbConn()
# sTbName = dbState.getFixedSuperTableName()
# dbc.execute("create database if not exists db")
# if not dbState.getState().equals(StateEmpty()):
# dbc.execute("use db")
# rTables = None
# try: # the super table may not exist
# sql = "select TBNAME from db.{}".format(sTbName)
# logger.info("Finding out tables in super table: {}".format(sql))
# dbc.query(sql) # TODO: analyze result set later
# logger.info("Fetching result")
# rTables = dbc.getQueryResult()
# logger.info("Result: {}".format(rTables))
# except taos.error.ProgrammingError as err:
# logger.info("Initial Super table OPS error: {}".format(err))
# # sys.exit()
# if ( not rTables == None):
# # print("rTables[0] = {}, type = {}".format(rTables[0], type(rTables[0])))
# try:
# for rTbName in rTables : # regular tables
# ds = dbState
# logger.info("Inserting into table: {}".format(rTbName[0]))
# sql = "insert into db.{} values ('{}', {});".format(
# rTbName[0],
# ds.getNextTick(), ds.getNextInt())
# dbc.execute(sql)
# for rTbName in rTables : # regular tables
# dbc.query("select * from db.{}".format(rTbName[0])) # TODO: check success failure
# logger.info("Initial READING operation is successful")
# except taos.error.ProgrammingError as err:
# logger.info("Initial WRITE/READ error: {}".format(err))
# Run server or client
if gConfig.run_tdengine : # run server
MainExec.runService()
else :
return MainExec.runClient()
# Sandbox testing code
# dbc = dbState.getDbConn()
# while True:
# rows = dbc.query("show databases")
# print("Rows: {}, time={}".format(rows, time.time()))
tc.run()
tc.logStats()
dbManager.cleanUp()
# logger.info("Crash_Gen execution finished")
if __name__ == "__main__":
main()
exitCode = main()
# print("Exiting with code: {}".format(exitCode))
sys.exit(exitCode)
......@@ -38,4 +38,4 @@ export PYTHONPATH=$(pwd)/../../src/connector/python/linux/python3
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$(pwd)/../../build/build/lib
# Now we are all set, and let's see if we can find a crash. Note we pass all params
./crash_gen.py $@
python3 ./crash_gen.py $@
sql connect
$db = db1
$stb = stb1
print =============== client1_0:
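# background load generator: keeps creating child tables tb1 ... tb999 under $stb while the main cluster case runs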
sql use $db
$tblNum = 1000
$i = 1
while $i < $tblNum
$tb = tb . $i
sql create table $tb using $stb tags ($i, 'abcd')
$i = $i + 1
endw
system sh/stop_dnodes.sh
system sh/deploy.sh -n dnode1 -i 1
system sh/deploy.sh -n dnode2 -i 2
system sh/deploy.sh -n dnode3 -i 3
system sh/deploy.sh -n dnode4 -i 4
system sh/cfg.sh -n dnode1 -c numOfMnodes -v 3
system sh/cfg.sh -n dnode2 -c numOfMnodes -v 3
system sh/cfg.sh -n dnode3 -c numOfMnodes -v 3
system sh/cfg.sh -n dnode4 -c numOfMnodes -v 3
system sh/cfg.sh -n dnode1 -c walLevel -v 1
system sh/cfg.sh -n dnode2 -c walLevel -v 1
system sh/cfg.sh -n dnode3 -c walLevel -v 1
system sh/cfg.sh -n dnode4 -c walLevel -v 1
system sh/cfg.sh -n dnode1 -c balanceInterval -v 10
system sh/cfg.sh -n dnode2 -c balanceInterval -v 10
system sh/cfg.sh -n dnode3 -c balanceInterval -v 10
system sh/cfg.sh -n dnode4 -c balanceInterval -v 10
system sh/cfg.sh -n dnode1 -c numOfTotalVnodes -v 4
system sh/cfg.sh -n dnode2 -c numOfTotalVnodes -v 4
system sh/cfg.sh -n dnode3 -c numOfTotalVnodes -v 4
system sh/cfg.sh -n dnode4 -c numOfTotalVnodes -v 4
system sh/cfg.sh -n dnode1 -c alternativeRole -v 0
system sh/cfg.sh -n dnode2 -c alternativeRole -v 0
system sh/cfg.sh -n dnode3 -c alternativeRole -v 0
system sh/cfg.sh -n dnode4 -c alternativeRole -v 0
system sh/cfg.sh -n dnode1 -c maxtablesPerVnode -v 1000
system sh/cfg.sh -n dnode2 -c maxtablesPerVnode -v 1000
system sh/cfg.sh -n dnode3 -c maxtablesPerVnode -v 1000
system sh/cfg.sh -n dnode4 -c maxtablesPerVnode -v 1000
system sh/cfg.sh -n dnode1 -c arbitrator -v $arbitrator
system sh/cfg.sh -n dnode2 -c arbitrator -v $arbitrator
system sh/cfg.sh -n dnode3 -c arbitrator -v $arbitrator
system sh/cfg.sh -n dnode4 -c arbitrator -v $arbitrator
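# cluster layout: 4 dnodes, 3 mnodes, external arbitrator, walLevel 1, 4 vnodes per dnode, at most 1000 tables per vnode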
print ============== step0: start tarbitrator
system sh/exec_tarbitrator.sh -s start
print ============== step1: start dnode1/dnode2/dnode3
system sh/exec.sh -n dnode1 -s start
system sh/exec.sh -n dnode2 -s start
system sh/exec.sh -n dnode3 -s start
sleep 3000
sql connect
sql create dnode $hostname2
sql create dnode $hostname3
sleep 3000
print ============== step2: create db1 with replica 3
$db = db1
print create database $db replica 3
#sql create database $db replica 3 maxTables $totalTableNum
sql create database $db replica 3
sql use $db
print ============== step3: create stable stb1
$stb = stb1
sql create table $stb (ts timestamp, c1 int, c2 int) tags(t1 int, t2 binary(8))
print ============== step4: start 10 client1/ 10 client2/ 10 client3/ 10 client4/ 1 client5
run_back unique/cluster/client1_0.sim
#run_back unique/cluster/client1_1.sim
#run_back unique/big_cluster/client1_2.sim
#run_back unique/big_cluster/client1_3.sim
#run_back unique/big_cluster/client1_4.sim
#run_back unique/big_cluster/client1_5.sim
#run_back unique/big_cluster/client1_6.sim
#run_back unique/big_cluster/client1_7.sim
#run_back unique/big_cluster/client1_8.sim
#run_back unique/big_cluster/client1_9.sim
print wait for a while to let clients start insert data
sleep 5000
$loop_cnt = 0
loop_cluster_do:
print **** **** **** START loop cluster do **** **** **** ****
print ============== step5: start dnode4 and add into cluster, then wait dnode4 ready
system sh/exec.sh -n dnode4 -s start
sql create dnode $hostname4
$cnt = 0
wait_dnode4_ready_0:
$cnt = $cnt + 1
if $cnt == 10 then
return -1
endi
sql show dnodes
if $rows != 4 then
sleep 2000
goto wait_dnode4_ready_0
endi
print $data0_1 $data1_1 $data2_1 $data3_1 $data4_1
print $data0_2 $data1_2 $data2_2 $data3_2 $data4_2
print $data0_3 $data1_3 $data2_3 $data3_3 $data4_3
print $data0_4 $data1_4 $data2_4 $data3_4 $data4_4
$dnode1Status = $data4_1
$dnode2Status = $data4_2
$dnode3Status = $data4_3
#$dnode4Status = $data4_4
if $loop_cnt == 0 then
$dnode4Status = $data4_4
elif $loop_cnt == 1 then
$dnode4Status = $data4_6
elif $loop_cnt == 2 then
$dnode4Status = $data4_8
else then
print **** **** **** END loop cluster do 2**** **** **** ****
return
endi
if $dnode4Status != ready then
sleep 2000
goto wait_dnode4_ready_0
endi
print ============== step6: stop and drop dnode1, then remove data dir of dnode1
system sh/exec.sh -n dnode1 -s stop -x SIGINT
$cnt = 0
wait_dnode1_offline_0:
$cnt = $cnt + 1
if $cnt == 10 then
return -1
endi
sql show dnodes
if $rows != 4 then
sleep 2000
goto wait_dnode1_offline_0
endi
print $data0_1 $data1_1 $data2_1 $data3_1 $data4_1
print $data0_2 $data1_2 $data2_2 $data3_2 $data4_2
print $data0_3 $data1_3 $data2_3 $data3_3 $data4_3
print $data0_4 $data1_4 $data2_4 $data3_4 $data4_4
$dnode2Status = $data4_2
$dnode3Status = $data4_3
$dnode4Status = $data4_4
if $loop_cnt == 0 then
$dnode1Status = $data4_1
elif $loop_cnt == 1 then
$dnode1Status = $data4_5
elif $loop_cnt == 2 then
$dnode1Status = $data4_7
elif $loop_cnt == 3 then
$dnode1Status = $data4_9
else then
print **** **** **** END loop cluster do 1**** **** **** ****
return
endi
if $dnode1Status != offline then
sleep 2000
goto wait_dnode1_offline_0
endi
sql drop dnode $hostname1
system rm -rf ../../../sim/dnode1
print ============== step7: stop dnode2; because available mnodes < 50%, the cluster does not provide services
system sh/exec.sh -n dnode2 -s stop -x SIGINT
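# with dnode1 dropped and dnode2 stopped only one mnode is left (< 50%), so show dnodes may fail; -x jumps to the label below on error instead of aborting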
sql show dnodes -x wait_dnode2_offline_0
if $rows != 3 then
sleep 2000
goto wait_dnode2_offline_0
endi
wait_dnode2_offline_0:
#$cnt = 0
#wait_dnode2_offline_0:
#$cnt = $cnt + 1
#if $cnt == 10 then
# return -1
#endi
#sql show dnodes -x wait_dnode2_offline_0
#if $rows != 3 then
# sleep 2000
# goto wait_dnode2_offline_0
#endi
#print $data0_1 $data1_1 $data2_1 $data3_1 $data4_1
#print $data0_2 $data1_2 $data2_2 $data3_2 $data4_2
#print $data0_3 $data1_3 $data2_3 $data3_3 $data4_3
#print $data0_4 $data1_4 $data2_4 $data3_4 $data4_4
#$dnode1Status = $data4_1
#$dnode2Status = $data4_2
#$dnode3Status = $data4_3
#$dnode4Status = $data4_4
#
#if $dnode2Status != offline then
# sleep 2000
# goto wait_dnode1_offline_0
#endi
print ============== step8: restart dnode2, then wait sync end
system sh/exec.sh -n dnode2 -s start
$cnt = 0
wait_dnode2_ready_0:
$cnt = $cnt + 1
if $cnt == 10 then
return -1
endi
sql show dnodes
if $rows != 3 then
sleep 2000
goto wait_dnode2_ready_0
endi
print $data0_1 $data1_1 $data2_1 $data3_1 $data4_1
print $data0_2 $data1_2 $data2_2 $data3_2 $data4_2
print $data0_3 $data1_3 $data2_3 $data3_3 $data4_3
print $data0_4 $data1_4 $data2_4 $data3_4 $data4_4
$dnode1Status = $data4_1
$dnode2Status = $data4_2
$dnode3Status = $data4_3
$dnode4Status = $data4_4
if $dnode2Status != ready then
sleep 2000
goto wait_dnode2_ready_0
endi
print ============== step9: stop dnode3, then wait sync end
system sh/exec.sh -n dnode3 -s stop -x SIGINT
$cnt = 0
wait_dnode3_offline_0:
$cnt = $cnt + 1
if $cnt == 10 then
return -1
endi
sql show dnodes
if $rows != 3 then
sleep 2000
goto wait_dnode3_offline_0
endi
print $data0_1 $data1_1 $data2_1 $data3_1 $data4_1
print $data0_2 $data1_2 $data2_2 $data3_2 $data4_2
print $data0_3 $data1_3 $data2_3 $data3_3 $data4_3
print $data0_4 $data1_4 $data2_4 $data3_4 $data4_4
$dnode1Status = $data4_1
$dnode2Status = $data4_2
$dnode3Status = $data4_3
$dnode4Status = $data4_4
if $dnode3Status != offline then
sleep 2000
goto wait_dnode3_offline_0
endi
print ============== step10: restart dnode3, then wait sync end
system sh/exec.sh -n dnode3 -s start
$cnt = 0
wait_dnode3_ready_0:
$cnt = $cnt + 1
if $cnt == 10 then
return -1
endi
sql show dnodes
if $rows != 3 then
sleep 2000
goto wait_dnode3_ready_0
endi
print $data0_1 $data1_1 $data2_1 $data3_1 $data4_1
print $data0_2 $data1_2 $data2_2 $data3_2 $data4_2
print $data0_3 $data1_3 $data2_3 $data3_3 $data4_3
print $data0_4 $data1_4 $data2_4 $data3_4 $data4_4
$dnode1Status = $data4_1
$dnode2Status = $data4_2
$dnode3Status = $data4_3
$dnode4Status = $data4_4
if $dnode3Status != ready then
sleep 2000
goto wait_dnode3_ready_0
endi
print ============== step11: stop dnode4, then wait sync end
system sh/exec.sh -n dnode4 -s stop -x SIGINT
$cnt = 0
wait_dnode4_offline_0:
$cnt = $cnt + 1
if $cnt == 10 then
return -1
endi
sql show dnodes
if $rows != 3 then
sleep 2000
goto wait_dnode4_offline_0
endi
print $data0_1 $data1_1 $data2_1 $data3_1 $data4_1
print $data0_2 $data1_2 $data2_2 $data3_2 $data4_2
print $data0_3 $data1_3 $data2_3 $data3_3 $data4_3
print $data0_4 $data1_4 $data2_4 $data3_4 $data4_4
$dnode1Status = $data4_1
$dnode2Status = $data4_2
$dnode3Status = $data4_3
#$dnode4Status = $data4_4
if $loop_cnt == 0 then
$dnode4Status = $data4_4
elif $loop_cnt == 1 then
$dnode4Status = $data4_6
elif $loop_cnt == 2 then
$dnode4Status = $data4_8
else then
print **** **** **** END loop cluster do 2**** **** **** ****
return
endi
if $dnode4Status != offline then
sleep 2000
goto wait_dnode4_offline_0
endi
print ============== step12: restart dnode4, then wait sync end
system sh/exec.sh -n dnode4 -s start
$cnt = 0
wait_dnode4_ready_0:
$cnt = $cnt + 1
if $cnt == 10 then
return -1
endi
sql show dnodes
if $rows != 3 then
sleep 2000
goto wait_dnode4_ready_0
endi
print $data0_1 $data1_1 $data2_1 $data3_1 $data4_1
print $data0_2 $data1_2 $data2_2 $data3_2 $data4_2
print $data0_3 $data1_3 $data2_3 $data3_3 $data4_3
print $data0_4 $data1_4 $data2_4 $data3_4 $data4_4
$dnode1Status = $data4_1
$dnode2Status = $data4_2
$dnode3Status = $data4_3
#$dnode4Status = $data4_4
if $loop_cnt == 0 then
$dnode4Status = $data4_4
elif $loop_cnt == 1 then
$dnode4Status = $data4_6
elif $loop_cnt == 2 then
$dnode4Status = $data4_8
else then
print **** **** **** END loop cluster do 2**** **** **** ****
return
endi
if $dnode4Status != ready then
sleep 2000
goto wait_dnode4_ready_0
endi
print ============== step13: alter replica 2
sql alter database $db replica 2
sql show databases
print $data0_1 $data1_1 $data2_1 $data3_1 $data4_1
if $data0_5 != 2 then
print replica is not modified to 2, error!!!!!!
return
endi
print ============== step14: stop and drop dnode4, then remove data dir of dnode4
system sh/exec.sh -n dnode4 -s stop -x SIGINT
$cnt = 0
wait_dnode4_offline_1:
$cnt = $cnt + 1
if $cnt == 10 then
return -1
endi
sql show dnodes
if $rows != 3 then
sleep 2000
goto wait_dnode4_offline_1
endi
print $data0_1 $data1_1 $data2_1 $data3_1 $data4_1
print $data0_2 $data1_2 $data2_2 $data3_2 $data4_2
print $data0_3 $data1_3 $data2_3 $data3_3 $data4_3
print $data0_4 $data1_4 $data2_4 $data3_4 $data4_4
$dnode2Status = $data4_2
$dnode3Status = $data4_3
#$dnode4Status = $data4_4
if $loop_cnt == 0 then
$dnode4Status = $data4_4
elif $loop_cnt == 1 then
$dnode4Status = $data4_6
elif $loop_cnt == 2 then
$dnode4Status = $data4_8
else then
print **** **** **** END loop cluster do 2**** **** **** ****
return
endi
if $dnode4Status != offline then
sleep 2000
goto wait_dnode4_offline_1
endi
sql drop dnode $hostname4
system rm -rf ../../../sim/dnode4
print ============== step15: alter replica 1
sql alter database $db replica 1
sql show databases
print $data0_1 $data1_1 $data2_1 $data3_1 $data4_1
if $data0_5 != 1 then
print replica is not modified to 1, error!!!!!!
return
endi
print ============== step16: alter replica 2
sql alter database $db replica 2
sql show databases
print $data0_1 $data1_1 $data2_1 $data3_1 $data4_1
if $data0_5 != 2 then
print replica is not modified to 2, error!!!!!!
return
endi
print ============== step17: start dnode1 and add into cluster, then wait dnode1 ready
system sh/exec.sh -n dnode1 -s start
sql create dnode $hostname1
$cnt = 0
wait_dnode1_ready_0:
$cnt = $cnt + 1
if $cnt == 10 then
return -1
endi
sql show dnodes
if $rows != 3 then
sleep 2000
goto wait_dnode1_ready_0
endi
print $data0_1 $data1_1 $data2_1 $data3_1 $data4_1
print $data0_2 $data1_2 $data2_2 $data3_2 $data4_2
print $data0_3 $data1_3 $data2_3 $data3_3 $data4_3
print $data0_4 $data1_4 $data2_4 $data3_4 $data4_4
#$dnode1Status = $data4_1
$dnode2Status = $data4_2
$dnode3Status = $data4_3
$dnode4Status = $data4_4
if $loop_cnt == 0 then
$dnode1Status = $data4_1
elif $loop_cnt == 1 then
$dnode1Status = $data4_5
elif $loop_cnt == 2 then
$dnode1Status = $data4_7
elif $loop_cnt == 3 then
$dnode1Status = $data4_9
else then
print **** **** **** END loop cluster do 3**** **** **** ****
return
endi
if $dnode1Status != ready then
sleep 2000
goto wait_dnode1_ready_0
endi
print ============== step18: alter replica 3
sql alter database $db replica 3
sql show databases
print $data0_1 $data1_1 $data2_1 $data3_1 $data4_1
if $data0_5 != 3 then
print replica is not modified to 3, error!!!!!!
return
endi
$loop_cnt = $loop_cnt + 1
goto loop_cluster_do