/* * Copyright (c) 2019 TAOS Data, Inc. * * This program is free software: you can use, redistribute, and/or modify * it under the terms of the GNU Affero General Public License, version 3 * or later ("AGPL"), as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . */ #include "sma.h" extern SSmaMgmt smaMgmt; #if 0 static int32_t tdProcessRSmaSyncPreCommitImpl(SSma *pSma); static int32_t tdProcessRSmaSyncCommitImpl(SSma *pSma); static int32_t tdProcessRSmaSyncPostCommitImpl(SSma *pSma); #endif static int32_t tdProcessRSmaAsyncPreCommitImpl(SSma *pSma); static int32_t tdProcessRSmaAsyncCommitImpl(SSma *pSma, SCommitInfo *pInfo); static int32_t tdProcessRSmaAsyncPostCommitImpl(SSma *pSma); static int32_t tdUpdateQTaskInfoFiles(SSma *pSma, SRSmaStat *pRSmaStat); #if 0 /** * @brief Only applicable to Rollup SMA * * @param pSma * @return int32_t */ int32_t smaSyncPreCommit(SSma *pSma) { return tdProcessRSmaSyncPreCommitImpl(pSma); } /** * @brief Only applicable to Rollup SMA * * @param pSma * @return int32_t */ int32_t smaSyncCommit(SSma *pSma) { return tdProcessRSmaSyncCommitImpl(pSma); } /** * @brief Only applicable to Rollup SMA * * @param pSma * @return int32_t */ int32_t smaSyncPostCommit(SSma *pSma) { return tdProcessRSmaSyncPostCommitImpl(pSma); } #endif /** * @brief async commit, only applicable to Rollup SMA * * @param pSma * @return int32_t */ int32_t smaPrepareAsyncCommit(SSma *pSma) { return tdProcessRSmaAsyncPreCommitImpl(pSma); } /** * @brief async commit, only applicable to Rollup SMA * * @param pSma * @return int32_t */ int32_t smaCommit(SSma *pSma, SCommitInfo *pInfo) { return tdProcessRSmaAsyncCommitImpl(pSma, pInfo); } /** * @brief async commit, only applicable to Rollup SMA * * @param pSma * @return int32_t */ int32_t smaPostCommit(SSma *pSma) { return tdProcessRSmaAsyncPostCommitImpl(pSma); } /** * @brief prepare rsma1/2, and set rsma trigger stat active * * @param pSma * @return int32_t */ int32_t smaBegin(SSma *pSma) { int32_t code = 0; SVnode *pVnode = pSma->pVnode; if ((code = tsdbBegin(VND_RSMA1(pVnode))) < 0) { smaError("vgId:%d, failed to begin rsma1 since %s", TD_VID(pVnode), tstrerror(code)); goto _exit; } if ((code = tsdbBegin(VND_RSMA2(pVnode))) < 0) { smaError("vgId:%d, failed to begin rsma2 since %s", TD_VID(pVnode), tstrerror(code)); goto _exit; } // set trigger stat SSmaEnv *pSmaEnv = SMA_RSMA_ENV(pSma); if (!pSmaEnv) { goto _exit; } SRSmaStat *pRSmaStat = (SRSmaStat *)SMA_ENV_STAT(pSmaEnv); int8_t rsmaTriggerStat = atomic_val_compare_exchange_8(RSMA_TRIGGER_STAT(pRSmaStat), TASK_TRIGGER_STAT_PAUSED, TASK_TRIGGER_STAT_ACTIVE); switch (rsmaTriggerStat) { case TASK_TRIGGER_STAT_PAUSED: { smaDebug("vgId:%d, rsma trigger stat from paused to active", TD_VID(pVnode)); break; } case TASK_TRIGGER_STAT_INIT: { atomic_store_8(RSMA_TRIGGER_STAT(pRSmaStat), TASK_TRIGGER_STAT_ACTIVE); smaDebug("vgId:%d, rsma trigger stat from init to active", TD_VID(pVnode)); break; } default: { atomic_store_8(RSMA_TRIGGER_STAT(pRSmaStat), TASK_TRIGGER_STAT_ACTIVE); smaWarn("vgId:%d, rsma trigger stat %" PRIi8 " is unexpected", TD_VID(pVnode), rsmaTriggerStat); break; } } _exit: terrno = code; return code; } int32_t smaFinishCommit(SSma *pSma) { int32_t code = 0; SVnode *pVnode = pSma->pVnode; if (VND_RSMA1(pVnode) && (code = tsdbFinishCommit(VND_RSMA1(pVnode))) < 0) { smaError("vgId:%d, failed to finish commit tsdb rsma1 since %s", TD_VID(pVnode), tstrerror(code)); goto _exit; } if (VND_RSMA2(pVnode) && (code = tsdbFinishCommit(VND_RSMA2(pVnode))) < 0) { smaError("vgId:%d, failed to finish commit tsdb rsma2 since %s", TD_VID(pVnode), tstrerror(code)); goto _exit; } _exit: terrno = code; return code; } #if 0 /** * @brief pre-commit for rollup sma(sync commit). * 1) set trigger stat of rsma timer TASK_TRIGGER_STAT_PAUSED. * 2) wait for all triggered fetch tasks to finish * 3) perform persist task for qTaskInfo * * @param pSma * @return int32_t */ static int32_t tdProcessRSmaSyncPreCommitImpl(SSma *pSma) { SSmaEnv *pSmaEnv = SMA_RSMA_ENV(pSma); if (!pSmaEnv) { return TSDB_CODE_SUCCESS; } SSmaStat *pStat = SMA_ENV_STAT(pSmaEnv); SRSmaStat *pRSmaStat = SMA_STAT_RSMA(pStat); // step 1: set rsma stat paused atomic_store_8(RSMA_TRIGGER_STAT(pRSmaStat), TASK_TRIGGER_STAT_PAUSED); // step 2: wait for all triggered fetch tasks to finish int32_t nLoops = 0; while (1) { if (T_REF_VAL_GET(pStat) == 0) { smaDebug("vgId:%d, rsma fetch tasks are all finished", SMA_VID(pSma)); break; } else { smaDebug("vgId:%d, rsma fetch tasks are not all finished yet", SMA_VID(pSma)); } ++nLoops; if (nLoops > 1000) { sched_yield(); nLoops = 0; } } // step 3: perform persist task for qTaskInfo pRSmaStat->commitAppliedVer = pSma->pVnode->state.applied; tdRSmaPersistExecImpl(pRSmaStat, RSMA_INFO_HASH(pRSmaStat)); smaDebug("vgId:%d, rsma pre commit success", SMA_VID(pSma)); return TSDB_CODE_SUCCESS; } /** * @brief commit for rollup sma * * @param pSma * @return int32_t */ static int32_t tdProcessRSmaSyncCommitImpl(SSma *pSma) { #if 0 SSmaEnv *pSmaEnv = SMA_RSMA_ENV(pSma); if (!pSmaEnv) { return TSDB_CODE_SUCCESS; } #endif return TSDB_CODE_SUCCESS; } #endif // SQTaskFile ====================================================== /** * @brief At most time, there is only one qtaskinfo file committed latest in aTaskFile. Sometimes, there would be * multiple qtaskinfo files supporting snapshot replication. * * @param pSma * @param pStat * @return int32_t */ static int32_t tdUpdateQTaskInfoFiles(SSma *pSma, SRSmaStat *pStat) { SVnode *pVnode = pSma->pVnode; SRSmaFS *pFS = RSMA_FS(pStat); int64_t committed = pStat->commitAppliedVer; int64_t fsMaxVer = -1; char qTaskInfoFullName[TSDB_FILENAME_LEN]; taosWLockLatch(RSMA_FS_LOCK(pStat)); for (int32_t i = 0; i < taosArrayGetSize(pFS->aQTaskInf);) { SQTaskFile *pTaskF = taosArrayGet(pFS->aQTaskInf, i); int32_t oldVal = atomic_fetch_sub_32(&pTaskF->nRef, 1); if ((oldVal <= 1) && (pTaskF->version < committed)) { tdRSmaQTaskInfoGetFullName(TD_VID(pVnode), pTaskF->version, tfsGetPrimaryPath(pVnode->pTfs), qTaskInfoFullName); if (taosRemoveFile(qTaskInfoFullName) < 0) { smaWarn("vgId:%d, cleanup qinf, committed %" PRIi64 ", failed to remove %s since %s", TD_VID(pVnode), committed, qTaskInfoFullName, tstrerror(TAOS_SYSTEM_ERROR(errno))); } else { smaDebug("vgId:%d, cleanup qinf, committed %" PRIi64 ", success to remove %s", TD_VID(pVnode), committed, qTaskInfoFullName); } taosArrayRemove(pFS->aQTaskInf, i); continue; } ++i; } if (taosArrayGetSize(pFS->aQTaskInf) > 0) { fsMaxVer = ((SQTaskFile *)taosArrayGetLast(pFS->aQTaskInf))->version; } if (fsMaxVer < committed) { tdRSmaQTaskInfoGetFullName(TD_VID(pVnode), committed, tfsGetPrimaryPath(pVnode->pTfs), qTaskInfoFullName); if (taosCheckExistFile(qTaskInfoFullName)) { SQTaskFile qFile = {.nRef = 1, .padding = 0, .version = committed, .size = 0}; if (!taosArrayPush(pFS->aQTaskInf, &qFile)) { taosWUnLockLatch(RSMA_FS_LOCK(pStat)); terrno = TSDB_CODE_OUT_OF_MEMORY; return TSDB_CODE_FAILED; } } } else { smaDebug("vgId:%d, update qinf, no need as committed %" PRIi64 " not larger than fsMaxVer %" PRIi64, TD_VID(pVnode), committed, fsMaxVer); } taosWUnLockLatch(RSMA_FS_LOCK(pStat)); return TSDB_CODE_SUCCESS; } #if 0 /** * @brief post-commit for rollup sma * 1) clean up the outdated qtaskinfo files * * @param pSma * @return int32_t */ static int32_t tdProcessRSmaSyncPostCommitImpl(SSma *pSma) { SVnode *pVnode = pSma->pVnode; if (!VND_IS_RSMA(pVnode)) { return TSDB_CODE_SUCCESS; } SRSmaStat *pRSmaStat = SMA_RSMA_STAT(pSma); tdUpdateQTaskInfoFiles(pSma, pRSmaStat); return TSDB_CODE_SUCCESS; } #endif /** * @brief Rsma async commit implementation(only do some necessary light weighted task) * 1) set rsma stat TASK_TRIGGER_STAT_PAUSED * 2) Wait all running fetch task finish to fetch and put submitMsg into level 2/3 wQueue(blocking level 1 write) * * @param pSma * @return int32_t */ static int32_t tdProcessRSmaAsyncPreCommitImpl(SSma *pSma) { SSmaEnv *pEnv = SMA_RSMA_ENV(pSma); if (!pEnv) { return TSDB_CODE_SUCCESS; } SSmaStat *pStat = SMA_ENV_STAT(pEnv); SRSmaStat *pRSmaStat = SMA_STAT_RSMA(pStat); int32_t nLoops = 0; // step 1: set rsma stat atomic_store_8(RSMA_TRIGGER_STAT(pRSmaStat), TASK_TRIGGER_STAT_PAUSED); while (atomic_val_compare_exchange_8(RSMA_COMMIT_STAT(pRSmaStat), 0, 1) != 0) { ++nLoops; if (nLoops > 1000) { sched_yield(); nLoops = 0; } } pRSmaStat->commitAppliedVer = pSma->pVnode->state.applied; ASSERT(pRSmaStat->commitAppliedVer > 0); // step 2: wait for all triggered fetch tasks to finish nLoops = 0; while (1) { if (T_REF_VAL_GET(pStat) == 0) { smaDebug("vgId:%d, rsma commit, fetch tasks are all finished", SMA_VID(pSma)); break; } else { smaDebug("vgId:%d, rsma commit, fetch tasks are not all finished yet", SMA_VID(pSma)); } ++nLoops; if (nLoops > 1000) { sched_yield(); nLoops = 0; } } /** * @brief step 3: commit should wait for all SubmitReq in buffer be consumed * 1) This is high cost task and should not put in asyncPreCommit originally. * 2) But, if put in asyncCommit, would trigger taskInfo cloning frequently. */ smaInfo("vgId:%d, rsma commit, wait for all items to be consumed, TID:%p", SMA_VID(pSma), (void *)taosGetSelfPthreadId()); nLoops = 0; while (atomic_load_64(&pRSmaStat->nBufItems) > 0) { ++nLoops; if (nLoops > 1000) { sched_yield(); nLoops = 0; } } smaInfo("vgId:%d, rsma commit, all items are consumed, TID:%p", SMA_VID(pSma), (void *)taosGetSelfPthreadId()); if (tdRSmaPersistExecImpl(pRSmaStat, RSMA_INFO_HASH(pRSmaStat)) < 0) { return TSDB_CODE_FAILED; } smaInfo("vgId:%d, rsma commit, operator state committed, TID:%p", SMA_VID(pSma), (void *)taosGetSelfPthreadId()); #if 0 // consuming task of qTaskInfo clone // step 4: swap queue/qall and iQueue/iQall // lock taosWLockLatch(SMA_ENV_LOCK(pEnv)); ASSERT(RSMA_INFO_HASH(pRSmaStat)); void *pIter = taosHashIterate(RSMA_INFO_HASH(pRSmaStat), NULL); while (pIter) { SRSmaInfo *pInfo = *(SRSmaInfo **)pIter; TSWAP(pInfo->iQall, pInfo->qall); TSWAP(pInfo->iQueue, pInfo->queue); TSWAP(pInfo->iTaskInfo[0], pInfo->taskInfo[0]); TSWAP(pInfo->iTaskInfo[1], pInfo->taskInfo[1]); pIter = taosHashIterate(RSMA_INFO_HASH(pRSmaStat), pIter); } // unlock taosWUnLockLatch(SMA_ENV_LOCK(pEnv)); #endif // all rsma results are written completely STsdb *pTsdb = NULL; if ((pTsdb = VND_RSMA1(pSma->pVnode))) tsdbPrepareCommit(pTsdb); if ((pTsdb = VND_RSMA2(pSma->pVnode))) tsdbPrepareCommit(pTsdb); return TSDB_CODE_SUCCESS; } /** * @brief commit for rollup sma * * @param pSma * @return int32_t */ static int32_t tdProcessRSmaAsyncCommitImpl(SSma *pSma, SCommitInfo *pInfo) { int32_t code = 0; SVnode *pVnode = pSma->pVnode; SSmaEnv *pSmaEnv = SMA_RSMA_ENV(pSma); if (!pSmaEnv) { goto _exit; } #if 0 SRSmaStat *pRSmaStat = (SRSmaStat *)SMA_ENV_STAT(pSmaEnv); // perform persist task for qTaskInfo operator if (tdRSmaPersistExecImpl(pRSmaStat, RSMA_INFO_HASH(pRSmaStat)) < 0) { return TSDB_CODE_FAILED; } #endif if ((code = tsdbCommit(VND_RSMA1(pVnode), pInfo)) < 0) { smaError("vgId:%d, failed to commit tsdb rsma1 since %s", TD_VID(pVnode), tstrerror(code)); goto _exit; } if ((code = tsdbCommit(VND_RSMA2(pVnode), pInfo)) < 0) { smaError("vgId:%d, failed to commit tsdb rsma2 since %s", TD_VID(pVnode), tstrerror(code)); goto _exit; } _exit: terrno = code; return code; } /** * @brief Migrate rsmaInfo from iRsmaInfo to rsmaInfo if rsma infoHash not empty. * * @param pSma * @return int32_t */ static int32_t tdProcessRSmaAsyncPostCommitImpl(SSma *pSma) { SSmaEnv *pEnv = SMA_RSMA_ENV(pSma); if (!pEnv) { return TSDB_CODE_SUCCESS; } SRSmaStat *pRSmaStat = (SRSmaStat *)SMA_ENV_STAT(pEnv); // step 1: merge qTaskInfo and iQTaskInfo // lock if (1 == atomic_val_compare_exchange_8(&pRSmaStat->delFlag, 1, 0)) { taosWLockLatch(SMA_ENV_LOCK(pEnv)); void *pIter = NULL; while ((pIter = taosHashIterate(RSMA_INFO_HASH(pRSmaStat), pIter))) { tb_uid_t *pSuid = (tb_uid_t *)taosHashGetKey(pIter, NULL); SRSmaInfo *pRSmaInfo = *(SRSmaInfo **)pIter; if (RSMA_INFO_IS_DEL(pRSmaInfo)) { int32_t refVal = T_REF_VAL_GET(pRSmaInfo); if (refVal == 0) { taosHashRemove(RSMA_INFO_HASH(pRSmaStat), pSuid, sizeof(*pSuid)); } else { smaDebug( "vgId:%d, rsma async post commit, not free rsma info since ref is %d although already deleted for " "table:%" PRIi64, SMA_VID(pSma), refVal, *pSuid); } continue; } #if 0 if (pRSmaInfo->taskInfo[0]) { if (pRSmaInfo->iTaskInfo[0]) { SRSmaInfo *pRSmaInfo = *(SRSmaInfo **)pRSmaInfo->iTaskInfo[0]; tdFreeRSmaInfo(pSma, pRSmaInfo, false); pRSmaInfo->iTaskInfo[0] = NULL; } } else { TSWAP(pRSmaInfo->taskInfo[0], pRSmaInfo->iTaskInfo[0]); } taosHashPut(RSMA_INFO_HASH(pRSmaStat), pSuid, sizeof(tb_uid_t), pIter, sizeof(pIter)); smaDebug("vgId:%d, rsma async post commit, migrated from iRsmaInfoHash for table:%" PRIi64, SMA_VID(pSma), *pSuid); #endif } // unlock taosWUnLockLatch(SMA_ENV_LOCK(pEnv)); } tdUpdateQTaskInfoFiles(pSma, pRSmaStat); atomic_store_8(RSMA_COMMIT_STAT(pRSmaStat), 0); return TSDB_CODE_SUCCESS; }