enh: rsma batch process

441ce216 · Cary Xu · f0cdafd6 · 441ce216 · 441ce216 · 441ce216
12 changed files
--- a/include/common/tmsg.h
+++ b/include/common/tmsg.h
@@ -2667,31 +2667,6 @@ typedef struct {
  int32_t padding;
 } SRSmaExecMsg;

-typedef struct {
-  int64_t suid;
-  int8_t  level;
-} SRSmaFetchMsg;
-
-static FORCE_INLINE int32_t tEncodeSRSmaFetchMsg(SEncoder* pCoder, const SRSmaFetchMsg* pReq) {
-  if (tStartEncode(pCoder) < 0) return -1;
-
-  if (tEncodeI64(pCoder, pReq->suid) < 0) return -1;
-  if (tEncodeI8(pCoder, pReq->level) < 0) return -1;
-
-  tEndEncode(pCoder);
-  return 0;
-}
-
-static FORCE_INLINE int32_t tDecodeSRSmaFetchMsg(SDecoder* pCoder, SRSmaFetchMsg* pReq) {
-  if (tStartDecode(pCoder) < 0) return -1;
-
-  if (tDecodeI64(pCoder, &pReq->suid) < 0) return -1;
-  if (tDecodeI8(pCoder, &pReq->level) < 0) return -1;
-
-  tEndDecode(pCoder);
-  return 0;
-}
-
 typedef struct {
  int8_t         version;       // for compatibility(default 0)
  int8_t         intervalUnit;  // MACRO: TIME_UNIT_XXX

--- a/include/common/tmsgdef.h
+++ b/include/common/tmsgdef.h
@@ -201,7 +201,7 @@ enum {
  TD_DEF_MSG_TYPE(TDMT_VND_CANCEL_SMA, "vnode-cancel-sma", NULL, NULL)
  TD_DEF_MSG_TYPE(TDMT_VND_DROP_SMA, "vnode-drop-sma", NULL, NULL)
  TD_DEF_MSG_TYPE(TDMT_VND_SUBMIT_RSMA, "vnode-submit-rsma", SSubmitReq, SSubmitRsp)
-  TD_DEF_MSG_TYPE(TDMT_VND_FETCH_RSMA, "vnode-fetch-rsma", SRSmaFetchMsg, NULL)
+  TD_DEF_MSG_TYPE(TDMT_VND_FETCH_RSMA, "vnode-fetch-rsma", NULL, NULL)
  TD_DEF_MSG_TYPE(TDMT_VND_EXEC_RSMA, "vnode-exec-rsma", NULL, NULL)
  TD_DEF_MSG_TYPE(TDMT_VND_DELETE, "delete-data", SVDeleteReq, SVDeleteRsp)
  TD_DEF_MSG_TYPE(TDMT_VND_BATCH_DEL, "batch-delete", SBatchDeleteReq, NULL)

--- a/include/util/tqueue.h
+++ b/include/util/tqueue.h
@@ -76,6 +76,7 @@ void       taosFreeQall(STaosQall *qall);
 int32_t    taosReadAllQitems(STaosQueue *queue, STaosQall *qall);
 int32_t    taosGetQitem(STaosQall *qall, void **ppItem);
 void       taosResetQitems(STaosQall *qall);
+int32_t    taosQallItemSize(STaosQall *qall);

 STaosQset *taosOpenQset();
 void       taosCloseQset(STaosQset *qset);

--- a/source/dnode/mnode/impl/src/mndStb.c
+++ b/source/dnode/mnode/impl/src/mndStb.c
@@ -442,6 +442,8 @@ static void *mndBuildVCreateStbReq(SMnode *pMnode, SVgObj *pVgroup, SStbObj *pSt
  if (req.rollup) {
    req.rsmaParam.maxdelay[0] = pStb->maxdelay[0];
    req.rsmaParam.maxdelay[1] = pStb->maxdelay[1];
+    req.rsmaParam.watermark[0] = pStb->watermark[0];
+    req.rsmaParam.watermark[1] = pStb->watermark[1];
    if (pStb->ast1Len > 0) {
      if (mndConvertRsmaTask(&req.rsmaParam.qmsg[0], &req.rsmaParam.qmsgLen[0], pStb->pAst1, pStb->uid,
                             STREAM_TRIGGER_WINDOW_CLOSE, req.rsmaParam.watermark[0]) < 0) {

--- a/source/dnode/vnode/src/inc/sma.h
+++ b/source/dnode/vnode/src/inc/sma.h
@@ -90,14 +90,14 @@ struct SRSmaStat {
  SSma            *pSma;
  int64_t          commitAppliedVer;  // vnode applied version for async commit
  int64_t          refId;             // shared by fetch tasks
-  volatile int64_t qBufSize;          // queue buffer size
+  volatile int64_t nBufItems;         // number of items in queue buffer
  SRWLatch         lock;              // r/w lock for rsma fs(e.g. qtaskinfo)
+  volatile int8_t  nExecutor;         // [1, max(half of query threads, 4)]
  int8_t           triggerStat;       // shared by fetch tasks
  int8_t           commitStat;        // 0 not in committing, 1 in committing
-  int8_t           execStat;          // 0 not in exec , 1 in exec
  SArray          *aTaskFile;         // qTaskFiles committed recently(for recovery/snapshot r/w)
  SHashObj        *infoHash;          // key: suid, value: SRSmaInfo
-  SHashObj        *fetchHash;         // key: suid, value: L1 or L2 or L1|L2
+  tsem_t           notEmpty;          // has items in queue buffer
 };

 struct SSmaStat {
@@ -111,7 +111,6 @@ struct SSmaStat {
 #define SMA_STAT_TSMA(s)     (&(s)->tsmaStat)
 #define SMA_STAT_RSMA(s)     (&(s)->rsmaStat)
 #define RSMA_INFO_HASH(r)    ((r)->infoHash)
-#define RSMA_FETCH_HASH(r)   ((r)->fetchHash)
 #define RSMA_TRIGGER_STAT(r) (&(r)->triggerStat)
 #define RSMA_COMMIT_STAT(r)  (&(r)->commitStat)
 #define RSMA_REF_ID(r)       ((r)->refId)
@@ -120,8 +119,10 @@ struct SSmaStat {
 struct SRSmaInfoItem {
  int8_t  level;
  int8_t  triggerStat;
-  uint16_t interval;  // second
-  int32_t  maxDelay;
+  uint8_t nSkipped;  // number of times fetching data from all active windows was skipped
+  int8_t  fetchLevel;
+  int32_t maxDelay;  // ms
+  int64_t lastFetch; // ms
  tmr_h   tmrId;
 };

@@ -129,8 +130,10 @@ struct SRSmaInfo {
  STSchema *pTSchema;
  int64_t   suid;
  int64_t   refId;     // refId of SRSmaStat
-  uint64_t  delFlag : 1;
-  uint64_t  lastReceived : 63;  // second
+  int64_t   lastRecv;  // ms
+  int8_t    delFlag;
+  int8_t    assigned;  // 0 idle, 1 assigned for exec
+  int16_t   padding;
  T_REF_DECLARE()
  SRSmaInfoItem items[TSDB_RETENTION_L2];
  void         *taskInfo[TSDB_RETENTION_L2];   // qTaskInfo_t

--- a/source/dnode/vnode/src/inc/vnodeInt.h
+++ b/source/dnode/vnode/src/inc/vnodeInt.h
@@ -198,7 +198,6 @@ int32_t smaAsyncPreCommit(SSma* pSma);
 int32_t smaAsyncCommit(SSma* pSma);
 int32_t smaAsyncPostCommit(SSma* pSma);
 int32_t smaDoRetention(SSma* pSma, int64_t now);
-int32_t smaProcessFetch(SSma* pSma, void* pMsg);
 int32_t smaProcessExec(SSma* pSma, void* pMsg);

 int32_t tdProcessTSmaCreate(SSma* pSma, int64_t version, const char* msg);

--- a/source/dnode/vnode/src/sma/smaCommit.c
+++ b/source/dnode/vnode/src/sma/smaCommit.c
@@ -321,10 +321,10 @@ static int32_t tdProcessRSmaAsyncPreCommitImpl(SSma *pSma) {
  int32_t nLoops = 0;
  while (1) {
    if (T_REF_VAL_GET(pStat) == 0) {
-      smaDebug("vgId:%d, rsma fetch tasks all finished", SMA_VID(pSma));
+      smaDebug("vgId:%d, rsma commit, fetch tasks all finished", SMA_VID(pSma));
      break;
    } else {
-      smaDebug("vgId:%d, rsma fetch tasks not all finished yet", SMA_VID(pSma));
+      smaDebug("vgId:%d, rsma commit, fetch tasks not all finished yet", SMA_VID(pSma));
    }
    ++nLoops;
    if (nLoops > 1000) {
@@ -338,30 +338,25 @@ static int32_t tdProcessRSmaAsyncPreCommitImpl(SSma *pSma) {
   *  1) This is high cost task and should not put in asyncPreCommit originally.
   *  2) But, if put in asyncCommit, would trigger taskInfo cloning frequently.
   */
-  nLoops = 0;
-  smaInfo("vgId:%d, start to wait for rsma qtask free, TID:%p", SMA_VID(pSma), (void *)taosGetSelfPthreadId());
+  if (tdRSmaProcessExecImpl(pSma, RSMA_EXEC_COMMIT) < 0) {
+    return TSDB_CODE_FAILED;
+  }

-  int8_t old;
-  while (1) {
-    old = atomic_val_compare_exchange_8(&pRSmaStat->execStat, 0, 1);
-    if (old == 0) break;
-    if (++nLoops > 1000) {
+  smaInfo("vgId:%d, rsma commit, wait for all items to be consumed, TID:%p", SMA_VID(pSma), (void*)taosGetSelfPthreadId());
+  nLoops = 0;
+  while (atomic_load_64(&pRSmaStat->nBufItems) > 0) {
+    ++nLoops;
+    if (nLoops > 1000) {
      sched_yield();
      nLoops = 0;
-      smaDebug("vgId:%d, wait for rsma qtask free, TID:%p", SMA_VID(pSma), (void *)taosGetSelfPthreadId());
-    }
    }
-
-  smaInfo("vgId:%d, end to wait for rsma qtask free, TID:%p", SMA_VID(pSma), (void *)taosGetSelfPthreadId());
-
-  if (tdRSmaProcessExecImpl(pSma, RSMA_EXEC_COMMIT) < 0) {
-    atomic_store_8(&pRSmaStat->execStat, 0);
-    return TSDB_CODE_FAILED;
  }
+  smaInfo("vgId:%d, rsma commit, all items are consumed, TID:%p", SMA_VID(pSma), (void*)taosGetSelfPthreadId());

+#if 0 // consuming task of qTaskInfo clone 
  // step 4:  swap queue/qall and iQueue/iQall
  // lock
-  taosWLockLatch(SMA_ENV_LOCK(pEnv));
+  // taosWLockLatch(SMA_ENV_LOCK(pEnv));

  ASSERT(RSMA_INFO_HASH(pRSmaStat));

@@ -376,11 +371,9 @@ static int32_t tdProcessRSmaAsyncPreCommitImpl(SSma *pSma) {
    pIter = taosHashIterate(RSMA_INFO_HASH(pRSmaStat), pIter);
  }

-  atomic_store_64(&pRSmaStat->qBufSize, 0);
-  atomic_store_8(&pRSmaStat->execStat, 0);
  // unlock
-  taosWUnLockLatch(SMA_ENV_LOCK(pEnv));
-
+  // taosWUnLockLatch(SMA_ENV_LOCK(pEnv));
+#endif
  // step 5: others
  pRSmaStat->commitAppliedVer = pSma->pVnode->state.applied;

@@ -426,7 +419,7 @@ static int32_t tdProcessRSmaAsyncPostCommitImpl(SSma *pSma) {

  // step 1: merge qTaskInfo and iQTaskInfo
  // lock
-  taosWLockLatch(SMA_ENV_LOCK(pEnv));
+  // taosWLockLatch(SMA_ENV_LOCK(pEnv));

  void *pIter = taosHashIterate(RSMA_INFO_HASH(pRSmaStat), NULL);
  while (pIter) {
@@ -480,10 +473,9 @@ static int32_t tdProcessRSmaAsyncPostCommitImpl(SSma *pSma) {
    taosHashRemove(RSMA_INFO_HASH(pRSmaStat), pSuid, sizeof(tb_uid_t));
  }
  taosArrayDestroy(rsmaDeleted);
-  // TODO: remove suid in files?

  // unlock
-  taosWUnLockLatch(SMA_ENV_LOCK(pEnv));
+  // taosWUnLockLatch(SMA_ENV_LOCK(pEnv));

  // step 2: cleanup outdated qtaskinfo files
  tdCleanupQTaskInfoFiles(pSma, pRSmaStat);

--- a/source/dnode/vnode/src/sma/smaEnv.c
+++ b/source/dnode/vnode/src/sma/smaEnv.c
@@ -209,6 +209,7 @@ static int32_t tdInitSmaStat(SSmaStat **pSmaStat, int8_t smaType, const SSma *pS
      SRSmaStat *pRSmaStat = (SRSmaStat *)(*pSmaStat);
      pRSmaStat->pSma = (SSma *)pSma;
      atomic_store_8(RSMA_TRIGGER_STAT(pRSmaStat), TASK_TRIGGER_STAT_INIT);
+      tsem_init(&pRSmaStat->notEmpty, 0, 0);

      // init smaMgmt
      smaInit();
@@ -230,12 +231,6 @@ static int32_t tdInitSmaStat(SSmaStat **pSmaStat, int8_t smaType, const SSma *pS
      if (!RSMA_INFO_HASH(pRSmaStat)) {
        return TSDB_CODE_FAILED;
      }
-
-      RSMA_FETCH_HASH(pRSmaStat) = taosHashInit(
-          RSMA_TASK_INFO_HASH_SLOT, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BIGINT), true, HASH_ENTRY_LOCK);
-      if (!RSMA_FETCH_HASH(pRSmaStat)) {
-        return TSDB_CODE_FAILED;
-      }
    } else if (smaType == TSDB_SMA_TYPE_TIME_RANGE) {
      // TODO
    } else {
@@ -267,6 +262,7 @@ static void tdDestroyRSmaStat(void *pRSmaStat) {
    smaDebug("vgId:%d, destroy rsma stat %p", SMA_VID(pSma), pRSmaStat);
    // step 1: set rsma trigger stat cancelled
    atomic_store_8(RSMA_TRIGGER_STAT(pStat), TASK_TRIGGER_STAT_CANCELLED);
+    tsem_destroy(&(pStat->notEmpty));

    // step 2: destroy the rsma info and associated fetch tasks
    if (taosHashGetSize(RSMA_INFO_HASH(pStat)) > 0) {
@@ -279,10 +275,7 @@ static void tdDestroyRSmaStat(void *pRSmaStat) {
    }
    taosHashCleanup(RSMA_INFO_HASH(pStat));

-    // step 3: destroy the rsma fetch hash
-    taosHashCleanup(RSMA_FETCH_HASH(pStat));
-
-    // step 4: wait all triggered fetch tasks finished
+    // step 3: wait all triggered fetch tasks finished
    int32_t nLoops = 0;
    while (1) {
      if (T_REF_VAL_GET((SSmaStat *)pStat) == 0) {

--- a/source/dnode/vnode/src/sma/smaRollup.c
+++ b/source/dnode/vnode/src/sma/smaRollup.c
--- a/source/dnode/vnode/src/vnd/vnodeSvr.c
+++ b/source/dnode/vnode/src/vnd/vnodeSvr.c
@@ -301,8 +301,6 @@ int32_t vnodeProcessQueryMsg(SVnode *pVnode, SRpcMsg *pMsg) {
      return qWorkerProcessQueryMsg(&handle, pVnode->pQuery, pMsg, 0);
    case TDMT_SCH_QUERY_CONTINUE:
      return qWorkerProcessCQueryMsg(&handle, pVnode->pQuery, pMsg, 0);
-    case TDMT_VND_FETCH_RSMA:
-      return smaProcessFetch(pVnode->pSma, pMsg);
    case TDMT_VND_EXEC_RSMA:
      return smaProcessExec(pVnode->pSma, pMsg);
    default:

--- a/source/libs/executor/src/executorimpl.c
+++ b/source/libs/executor/src/executorimpl.c
@@ -3131,6 +3131,7 @@ int32_t aggDecodeResultRow(SOperatorInfo* pOperator, char* result) {

    initResultRow(resultRow);
    pInfo->resultRowInfo.cur = (SResultRowPosition){.pageId = resultRow->pageId, .offset = resultRow->offset};
+    // releaseBufPage(pSup->pResultBuf, getBufPage(pSup->pResultBuf, pageId));
  }

  if (offset != length) {

--- a/source/util/src/tqueue.c
+++ b/source/util/src/tqueue.c
@@ -299,6 +299,7 @@ int32_t taosGetQitem(STaosQall *qall, void **ppItem) {
 }

 void    taosResetQitems(STaosQall *qall) { qall->current = qall->start; }
+int32_t taosQallItemSize(STaosQall *qall) { return qall->numOfItems; }

 STaosQset *taosOpenQset() {
  STaosQset *qset = taosMemoryCalloc(sizeof(STaosQset), 1);