From ed34f490ca883533e76f5f02d488a8f4dcdd5122 Mon Sep 17 00:00:00 2001 From: Cary Xu Date: Fri, 15 Jul 2022 16:06:41 +0800 Subject: [PATCH] enh: async commit api for rsma --- source/dnode/vnode/src/inc/sma.h | 35 +++++-- source/dnode/vnode/src/inc/vnodeInt.h | 9 +- source/dnode/vnode/src/sma/smaCommit.c | 126 ++++++++++++++++++++--- source/dnode/vnode/src/sma/smaEnv.c | 4 +- source/dnode/vnode/src/sma/smaRollup.c | 52 ++++------ source/dnode/vnode/src/vnd/vnodeCommit.c | 4 +- 6 files changed, 168 insertions(+), 62 deletions(-) diff --git a/source/dnode/vnode/src/inc/sma.h b/source/dnode/vnode/src/inc/sma.h index e767d94ebd..cfeb4ebeb2 100644 --- a/source/dnode/vnode/src/inc/sma.h +++ b/source/dnode/vnode/src/inc/sma.h @@ -67,7 +67,9 @@ struct SRSmaStat { int64_t submitVer; int64_t refId; // shared by fetch tasks int8_t triggerStat; // shared by fetch tasks + int8_t commitStat; // 0 not in committing, 1 in committing SHashObj *rsmaInfoHash; // key: stbUid, value: SRSmaInfo; + SHashObj *iRsmaInfoHash; // key: stbUid, value: SRSmaInfo; immutable rsmaInfoHash }; struct SSmaStat { @@ -78,12 +80,29 @@ struct SSmaStat { T_REF_DECLARE() }; -#define SMA_TSMA_STAT(s) (&(s)->tsmaStat) -#define SMA_RSMA_STAT(s) (&(s)->rsmaStat) -#define RSMA_INFO_HASH(r) ((r)->rsmaInfoHash) -#define RSMA_TRIGGER_STAT(r) (&(r)->triggerStat) -#define RSMA_REF_ID(r) ((r)->refId) -#define RSMA_SUBMIT_VER(r) ((r)->submitVer) +#define SMA_TSMA_STAT(s) (&(s)->tsmaStat) +#define SMA_RSMA_STAT(s) (&(s)->rsmaStat) +#define RSMA_INFO_HASH(r) ((r)->rsmaInfoHash) +#define RSMA_IMU_INFO_HASH(r) ((r)->iRsmaInfoHash) +#define RSMA_TRIGGER_STAT(r) (&(r)->triggerStat) +#define RSMA_COMMIT_STAT(r) (&(r)->commitStat) +#define RSMA_REF_ID(r) ((r)->refId) +#define RSMA_SUBMIT_VER(r) ((r)->submitVer) + +struct SRSmaInfoItem { + void *taskInfo; // qTaskInfo_t + int64_t refId; + tmr_h tmrId; + int32_t maxDelay; + int8_t level; + int8_t triggerStat; +}; + +struct SRSmaInfo { + STSchema *pTSchema; + int64_t suid; + SRSmaInfoItem items[TSDB_RETENTION_L2]; +}; enum { TASK_TRIGGER_STAT_INIT = 0, @@ -94,6 +113,8 @@ enum { TASK_TRIGGER_STAT_DROPPED = 5, }; +#define RSMA_TASK_INFO_HASH_SLOT 8 + void tdDestroySmaEnv(SSmaEnv *pSmaEnv); void *tdFreeSmaEnv(SSmaEnv *pSmaEnv); @@ -183,7 +204,7 @@ static FORCE_INLINE void tdSmaStatSetDropped(STSmaStat *pTStat) { atomic_or_fetch_8(&pTStat->state, TSDB_SMA_STAT_DROPPED); } } - +void tdFreeQTaskInfo(qTaskInfo_t *taskHandle, int32_t vgId, int32_t level); static int32_t tdDestroySmaState(SSmaStat *pSmaStat, int8_t smaType); void *tdFreeSmaState(SSmaStat *pSmaStat, int8_t smaType); void *tdFreeRSmaInfo(SSma *pSma, SRSmaInfo *pInfo); diff --git a/source/dnode/vnode/src/inc/vnodeInt.h b/source/dnode/vnode/src/inc/vnodeInt.h index 394c314819..3b32ca9c60 100644 --- a/source/dnode/vnode/src/inc/vnodeInt.h +++ b/source/dnode/vnode/src/inc/vnodeInt.h @@ -164,10 +164,11 @@ void smaCleanUp(); int32_t smaOpen(SVnode* pVnode); int32_t smaClose(SSma* pSma); int32_t smaBegin(SSma* pSma); -int32_t smaPreCommit(SSma* pSma); -int32_t smaCommit(SSma* pSma); -int32_t smaPostCommit(SSma* pSma); -int32_t smaAsyncCommit(SSma* pSma); +int32_t smaSyncPreCommit(SSma* pSma); +int32_t smaSyncCommit(SSma* pSma); +int32_t smaSyncPostCommit(SSma* pSma); +int32_t smaAsyncPreCommit(SSma* pSma); +int32_t smaAsyncPostCommit(SSma* pSma); int32_t tdProcessTSmaCreate(SSma* pSma, int64_t version, const char* msg); int32_t tdProcessTSmaInsert(SSma* pSma, int64_t indexUid, const char* msg); diff --git a/source/dnode/vnode/src/sma/smaCommit.c b/source/dnode/vnode/src/sma/smaCommit.c index 889c4adf24..4729187e8c 100644 --- a/source/dnode/vnode/src/sma/smaCommit.c +++ b/source/dnode/vnode/src/sma/smaCommit.c @@ -18,7 +18,8 @@ static int32_t tdProcessRSmaPreCommitImpl(SSma *pSma); static int32_t tdProcessRSmaCommitImpl(SSma *pSma); static int32_t tdProcessRSmaPostCommitImpl(SSma *pSma); -static int32_t tdProcessRSmaAsyncCommitImpl(SSma *pSma); +static int32_t tdProcessRSmaAsyncPreCommitImpl(SSma *pSma); +static int32_t tdProcessRSmaAsyncPostCommitImpl(SSma *pSma); /** * @brief Only applicable to Rollup SMA @@ -26,7 +27,7 @@ static int32_t tdProcessRSmaAsyncCommitImpl(SSma *pSma); * @param pSma * @return int32_t */ -int32_t smaPreCommit(SSma *pSma) { return tdProcessRSmaPreCommitImpl(pSma); } +int32_t smaSyncPreCommit(SSma *pSma) { return tdProcessRSmaPreCommitImpl(pSma); } /** * @brief Only applicable to Rollup SMA @@ -34,7 +35,7 @@ int32_t smaPreCommit(SSma *pSma) { return tdProcessRSmaPreCommitImpl(pSma); } * @param pSma * @return int32_t */ -int32_t smaCommit(SSma *pSma) { return tdProcessRSmaCommitImpl(pSma); } +int32_t smaSyncCommit(SSma *pSma) { return tdProcessRSmaCommitImpl(pSma); } /** * @brief Only applicable to Rollup SMA @@ -42,7 +43,7 @@ int32_t smaCommit(SSma *pSma) { return tdProcessRSmaCommitImpl(pSma); } * @param pSma * @return int32_t */ -int32_t smaPostCommit(SSma *pSma) { return tdProcessRSmaPostCommitImpl(pSma); } +int32_t smaSyncPostCommit(SSma *pSma) { return tdProcessRSmaPostCommitImpl(pSma); } /** * @brief Only applicable to Rollup SMA @@ -50,7 +51,15 @@ int32_t smaPostCommit(SSma *pSma) { return tdProcessRSmaPostCommitImpl(pSma); } * @param pSma * @return int32_t */ -int32_t smaAsyncCommit(SSma *pSma) { return tdProcessRSmaAsyncCommitImpl(pSma); } +int32_t smaAsyncPreCommit(SSma *pSma) { return tdProcessRSmaAsyncPreCommitImpl(pSma); } + +/** + * @brief Only applicable to Rollup SMA + * + * @param pSma + * @return int32_t + */ +int32_t smaAsyncPostCommit(SSma *pSma) { return tdProcessRSmaAsyncPostCommitImpl(pSma); } /** * @brief set rsma trigger stat active @@ -71,18 +80,17 @@ int32_t smaBegin(SSma *pSma) { atomic_val_compare_exchange_8(RSMA_TRIGGER_STAT(pRSmaStat), TASK_TRIGGER_STAT_PAUSED, TASK_TRIGGER_STAT_ACTIVE); switch (rsmaTriggerStat) { case TASK_TRIGGER_STAT_PAUSED: { - smaDebug("vgId:%d rsma trigger stat from paused to active", SMA_VID(pSma)); + smaDebug("vgId:%d, rsma trigger stat from paused to active", SMA_VID(pSma)); break; } case TASK_TRIGGER_STAT_INIT: { atomic_store_8(RSMA_TRIGGER_STAT(pRSmaStat), TASK_TRIGGER_STAT_ACTIVE); - smaDebug("vgId:%d rsma trigger stat from init to active", SMA_VID(pSma)); + smaDebug("vgId:%d, rsma trigger stat from init to active", SMA_VID(pSma)); break; } default: { atomic_store_8(RSMA_TRIGGER_STAT(pRSmaStat), TASK_TRIGGER_STAT_ACTIVE); - smaWarn("vgId:%d rsma trigger stat %" PRIi8 " is unexpected", SMA_VID(pSma), rsmaTriggerStat); - ASSERT(0); + smaError("vgId:%d, rsma trigger stat %" PRIi8 " is unexpected", SMA_VID(pSma), rsmaTriggerStat); break; } } @@ -90,7 +98,7 @@ int32_t smaBegin(SSma *pSma) { } /** - * @brief pre-commit for rollup sma. + * @brief pre-commit for rollup sma(sync commit). * 1) set trigger stat of rsma timer TASK_TRIGGER_STAT_PAUSED. * 2) wait all triggered fetch tasks finished * 3) perform persist task for qTaskInfo @@ -107,8 +115,7 @@ static int32_t tdProcessRSmaPreCommitImpl(SSma *pSma) { SSmaStat *pStat = SMA_ENV_STAT(pSmaEnv); SRSmaStat *pRSmaStat = SMA_RSMA_STAT(pStat); - - // step 1: set persistence task paused + // step 1: set rsma stat paused atomic_store_8(RSMA_TRIGGER_STAT(pRSmaStat), TASK_TRIGGER_STAT_PAUSED); // step 2: wait all triggered fetch tasks finished @@ -236,12 +243,101 @@ static int32_t tdProcessRSmaPostCommitImpl(SSma *pSma) { /** * @brief Rsma async commit implementation - * 1) Wait all running fetch task finish to fetch and put submitMsg into level 2/3 wQueue(blocking level 1 write) + * 1) set rsma stat TASK_TRIGGER_STAT_PAUSED + * 2) Wait all running fetch task finish to fetch and put submitMsg into level 2/3 wQueue(blocking level 1 write) + * 3) * * @param pSma * @return int32_t */ -static int32_t tdProcessRSmaAsyncCommitImpl(SSma *pSma) { - // TODO +static int32_t tdProcessRSmaAsyncPreCommitImpl(SSma *pSma) { + SSmaEnv *pSmaEnv = SMA_RSMA_ENV(pSma); + if (!pSmaEnv) { + return TSDB_CODE_SUCCESS; + } + + SSmaStat *pStat = SMA_ENV_STAT(pSmaEnv); + SRSmaStat *pRSmaStat = SMA_RSMA_STAT(pStat); + + // step 1: set rsma stat paused + atomic_store_8(RSMA_TRIGGER_STAT(pRSmaStat), TASK_TRIGGER_STAT_PAUSED); + + // step 2: wait all triggered fetch tasks finished + int32_t nLoops = 0; + while (1) { + if (T_REF_VAL_GET(pStat) == 0) { + smaDebug("vgId:%d, rsma fetch tasks all finished", SMA_VID(pSma)); + break; + } else { + smaDebug("vgId:%d, rsma fetch tasks not all finished yet", SMA_VID(pSma)); + } + ++nLoops; + if (nLoops > 1000) { + sched_yield(); + nLoops = 0; + } + } + + // step 3: swap rsmaInfoHash and iRsmaInfoHash + ASSERT(!RSMA_IMU_INFO_HASH(pRSmaStat)); + ASSERT(RSMA_INFO_HASH(pRSmaStat)); + + RSMA_IMU_INFO_HASH(pRSmaStat) = RSMA_INFO_HASH(pRSmaStat); + RSMA_INFO_HASH(pRSmaStat) = + taosHashInit(RSMA_TASK_INFO_HASH_SLOT, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BIGINT), true, HASH_ENTRY_LOCK); + if (!RSMA_INFO_HASH(pRSmaStat)) { + smaError("vgId:%d, rsma async commit failed since %s", SMA_VID(pSma), terrstr()); + return TSDB_CODE_FAILED; + } + + return TSDB_CODE_SUCCESS; +} + +/** + * @brief Migrate rsmaInfo from iRsmaInfo to rsmaInfo if rsmaInfoHash not empty. + * + * @param pSma + * @return int32_t + */ +static int32_t tdProcessRSmaAsyncPostCommitImpl(SSma *pSma) { + SSmaEnv *pSmaEnv = SMA_RSMA_ENV(pSma); + if (!pSmaEnv) { + return TSDB_CODE_SUCCESS; + } + + SSmaStat *pStat = SMA_ENV_STAT(pSmaEnv); + SRSmaStat *pRSmaStat = SMA_RSMA_STAT(pStat); + + // step 1: merge rsmaInfoHash and iRsmaInfoHash + + if (taosHashGetSize(RSMA_INFO_HASH(pRSmaStat)) <= 0) { + // TODO: + } + + void *pIter = taosHashIterate(RSMA_IMU_INFO_HASH(pRSmaStat), NULL); + while (pIter) { + tb_uid_t *pSuid = (tb_uid_t *)taosHashGetKey(pIter, NULL); + + if (!taosHashGet(RSMA_INFO_HASH(pRSmaStat), pSuid, sizeof(tb_uid_t))) { + taosHashPut(RSMA_INFO_HASH(pRSmaStat), pSuid, sizeof(tb_uid_t), pIter, sizeof(pIter)); + smaDebug("vgId:%d, rsma async post commit, migrated from iRsmaInfoHash for table:%" PRIi64, SMA_VID(pSma), *pSuid); + } else { + SRSmaInfo *pRSmaInfo = *(SRSmaInfo **)pIter; + for (int32_t i = 0; i < TSDB_RETENTION_L2; ++i) { + SRSmaInfoItem *pItem = &pRSmaInfo->items[i]; + if (pItem->taskInfo) { + tdFreeQTaskInfo(pItem->taskInfo, SMA_VID(pSma), i + 1); + } + } + smaDebug("vgId:%d, rsma async post commit, free rsma info since already COW for table:%" PRIi64, SMA_VID(pSma), + *pSuid); + } + + pIter = taosHashIterate(RSMA_IMU_INFO_HASH(pRSmaStat), pIter); + } + + taosHashCleanup(RSMA_IMU_INFO_HASH(pRSmaStat)); + RSMA_IMU_INFO_HASH(pRSmaStat) = NULL; + return TSDB_CODE_SUCCESS; } diff --git a/source/dnode/vnode/src/sma/smaEnv.c b/source/dnode/vnode/src/sma/smaEnv.c index 4a7d4db874..77377cf089 100644 --- a/source/dnode/vnode/src/sma/smaEnv.c +++ b/source/dnode/vnode/src/sma/smaEnv.c @@ -17,7 +17,6 @@ typedef struct SSmaStat SSmaStat; -#define RSMA_TASK_INFO_HASH_SLOT 8 #define SMA_MGMT_REF_NUM 10240 extern SSmaMgmt smaMgmt; @@ -311,7 +310,6 @@ int32_t tdDestroySmaState(SSmaStat *pSmaStat, int8_t smaType) { if (taosRemoveRef(smaMgmt.rsetId, RSMA_REF_ID(pRSmaStat)) < 0) { smaError("vgId:%d, remove refId:%" PRIi64 " from rsmaRef:%" PRIi32 " failed since %s", SMA_VID(pRSmaStat->pSma), RSMA_REF_ID(pRSmaStat), smaMgmt.rsetId, terrstr()); - ASSERT(0); } else { smaDebug("vgId:%d, remove refId:%" PRIi64 " from rsmaRef:%" PRIi32 " succeed", SMA_VID(pRSmaStat->pSma), RSMA_REF_ID(pRSmaStat), smaMgmt.rsetId); @@ -361,7 +359,7 @@ int32_t tdCheckAndInitSmaEnv(SSma *pSma, int8_t smaType) { } break; default: - smaError("vgId:%d undefined smaType:%", SMA_VID(pSma), smaType); + smaError("vgId:%d, undefined smaType:%", SMA_VID(pSma), smaType); return TSDB_CODE_FAILED; } diff --git a/source/dnode/vnode/src/sma/smaRollup.c b/source/dnode/vnode/src/sma/smaRollup.c index a6d56acbfb..300a0ec4be 100644 --- a/source/dnode/vnode/src/sma/smaRollup.c +++ b/source/dnode/vnode/src/sma/smaRollup.c @@ -48,20 +48,7 @@ static int32_t tdRSmaRestoreQTaskInfoInit(SSma *pSma, int64_t *nTables); static int32_t tdRSmaRestoreQTaskInfoReload(SSma *pSma, int64_t *committed); static int32_t tdRSmaRestoreTSDataReload(SSma *pSma, int64_t committed); -struct SRSmaInfoItem { - void *taskInfo; // qTaskInfo_t - int64_t refId; - tmr_h tmrId; - int32_t maxDelay; - int8_t level; - int8_t triggerStat; -}; -struct SRSmaInfo { - STSchema *pTSchema; - int64_t suid; - SRSmaInfoItem items[TSDB_RETENTION_L2]; -}; static SRSmaInfo *tdGetRSmaInfoByItem(SRSmaInfoItem *pItem) { // adapt accordingly if definition of SRSmaInfo update @@ -102,7 +89,7 @@ static FORCE_INLINE int32_t tdRSmaQTaskInfoContLen(int32_t lenWithHead) { static FORCE_INLINE void tdRSmaQTaskInfoIterDestroy(SRSmaQTaskInfoIter *pIter) { taosMemoryFreeClear(pIter->pBuf); } -static FORCE_INLINE void tdFreeTaskHandle(qTaskInfo_t *taskHandle, int32_t vgId, int32_t level) { +void tdFreeQTaskInfo(qTaskInfo_t *taskHandle, int32_t vgId, int32_t level) { // Note: free/kill may in RC qTaskInfo_t otaskHandle = atomic_load_ptr(taskHandle); if (otaskHandle && atomic_val_compare_exchange_ptr(taskHandle, otaskHandle, NULL)) { @@ -123,7 +110,7 @@ void *tdFreeRSmaInfo(SSma *pSma, SRSmaInfo *pInfo) { i + 1); taosTmrStopA(&pItem->tmrId); } - tdFreeTaskHandle(&pItem->taskInfo, SMA_VID(pSma), i + 1); + tdFreeQTaskInfo(&pItem->taskInfo, SMA_VID(pSma), i + 1); } else { smaDebug("vgId:%d, table %" PRIi64 " no need to destroy rsma info level %d since empty taskInfo", SMA_VID(pSma), pInfo->suid, i + 1); @@ -257,22 +244,21 @@ int32_t tdFetchTbUidList(SSma *pSma, STbUidStore **ppStore, tb_uid_t suid, tb_ui static int32_t tdSetRSmaInfoItemParams(SSma *pSma, SRSmaParam *param, SRSmaStat *pStat, SRSmaInfo *pRSmaInfo, int8_t idx) { - SRetention *pRetention = SMA_RETENTION(pSma); - STsdbCfg *pTsdbCfg = SMA_TSDB_CFG(pSma); + if ((param->qmsgLen > 0) && param->qmsg[idx]) { + SRetention *pRetention = SMA_RETENTION(pSma); + STsdbCfg *pTsdbCfg = SMA_TSDB_CFG(pSma); + SReadHandle handle = { + .meta = pSma->pVnode->pMeta, + .vnode = pSma->pVnode, + .initTqReader = 1, + }; - SReadHandle handle = { - .meta = pSma->pVnode->pMeta, - .vnode = pSma->pVnode, - .initTqReader = 1, - }; - - if (param->qmsg[idx]) { SRSmaInfoItem *pItem = &(pRSmaInfo->items[idx]); pItem->refId = RSMA_REF_ID(pStat); pItem->taskInfo = qCreateStreamExecTaskInfo(param->qmsg[idx], &handle); if (!pItem->taskInfo) { terrno = TSDB_CODE_RSMA_QTASKINFO_CREATE; - goto _err; + return TSDB_CODE_FAILED; } pItem->triggerStat = TASK_TRIGGER_STAT_INACTIVE; if (param->maxdelay[idx] < TSDB_MIN_ROLLUP_MAX_DELAY) { @@ -286,13 +272,11 @@ static int32_t tdSetRSmaInfoItemParams(SSma *pSma, SRSmaParam *param, SRSmaStat pItem->maxDelay = TSDB_MAX_ROLLUP_MAX_DELAY; } pItem->level = idx == 0 ? TSDB_RETENTION_L1 : TSDB_RETENTION_L2; - smaInfo("vgId:%d table:%" PRIi64 " level:%" PRIi8 " maxdelay:%" PRIi64 " watermark:%" PRIi64 + smaInfo("vgId:%d, table:%" PRIi64 " level:%" PRIi8 " maxdelay:%" PRIi64 " watermark:%" PRIi64 ", finally maxdelay:%" PRIi32, SMA_VID(pSma), pRSmaInfo->suid, idx + 1, param->maxdelay[idx], param->watermark[idx], pItem->maxDelay); } return TSDB_CODE_SUCCESS; -_err: - return TSDB_CODE_FAILED; } /** @@ -562,7 +546,9 @@ static int32_t tdRSmaFetchAndSubmitResult(SRSmaInfoItem *pItem, STSchema *pTSche SSDataBlock *output = NULL; uint64_t ts; if (qExecTask(pItem->taskInfo, &output, &ts) < 0) { - ASSERT(false); + smaError("vgId:%d, qExecTask for rsma table %" PRIi64 "l evel %" PRIi8 " failed since %s", SMA_VID(pSma), suid, + pItem->level, terrstr()); + goto _err; } if (!output) { break; @@ -572,7 +558,7 @@ static int32_t tdRSmaFetchAndSubmitResult(SRSmaInfoItem *pItem, STSchema *pTSche pResult = taosArrayInit(1, sizeof(SSDataBlock)); if (!pResult) { terrno = TSDB_CODE_OUT_OF_MEMORY; - return TSDB_CODE_FAILED; + goto _err; } } @@ -891,7 +877,7 @@ int32_t tdProcessRSmaRestoreImpl(SSma *pSma) { return TSDB_CODE_SUCCESS; _err: - smaError("vgId:%d failed to restore rsma task since %s", SMA_VID(pSma), terrstr()); + smaError("vgId:%d, failed to restore rsma task since %s", SMA_VID(pSma), terrstr()); return TSDB_CODE_FAILED; } @@ -1217,6 +1203,10 @@ static void tdRSmaFetchTrigger(void *param, void *tmrId) { smaDebug("vgId:%d, not fetch rsma level %" PRIi8 " data since stat is %" PRIi8 ", rsetId rsetId:%" PRIi64 " refId:%d", SMA_VID(pSma), pItem->level, rsmaTriggerStat, smaMgmt.rsetId, pItem->refId); + if (rsmaTriggerStat == TASK_TRIGGER_STAT_PAUSED) { + taosTmrReset(tdRSmaFetchTrigger, pItem->maxDelay > 5000 ? 5000 : pItem->maxDelay, pItem, smaMgmt.tmrHandle, + &pItem->tmrId); + } return; } default: diff --git a/source/dnode/vnode/src/vnd/vnodeCommit.c b/source/dnode/vnode/src/vnd/vnodeCommit.c index ebbb691e28..4e55c05a47 100644 --- a/source/dnode/vnode/src/vnd/vnodeCommit.c +++ b/source/dnode/vnode/src/vnd/vnodeCommit.c @@ -233,7 +233,7 @@ int vnodeCommit(SVnode *pVnode) { walBeginSnapshot(pVnode->pWal, pVnode->state.applied); // preCommit - smaPreCommit(pVnode->pSma); + smaSyncPreCommit(pVnode->pSma); // commit each sub-system if (metaCommit(pVnode->pMeta) < 0) { @@ -276,7 +276,7 @@ int vnodeCommit(SVnode *pVnode) { pVnode->state.committed = info.state.committed; // postCommit - smaPostCommit(pVnode->pSma); + smaSyncPostCommit(pVnode->pSma); // apply the commit (TODO) walEndSnapshot(pVnode->pWal); -- GitLab