diff --git a/source/dnode/mnode/impl/inc/mndInt.h b/source/dnode/mnode/impl/inc/mndInt.h index a0f3c98f8340f44e1ca940008a69835eb0f4cbcf..526fa142eb07b88c94f35cf5cd71c6d30769bddd 100644 --- a/source/dnode/mnode/impl/inc/mndInt.h +++ b/source/dnode/mnode/impl/inc/mndInt.h @@ -84,14 +84,15 @@ typedef struct { } STelemMgmt; typedef struct { - tsem_t syncSem; - int64_t sync; - int32_t errCode; - int32_t transId; - SRWLatch lock; - int8_t selfIndex; - int8_t numOfReplicas; - SReplica replicas[TSDB_MAX_REPLICA]; + tsem_t syncSem; + int64_t sync; + int32_t errCode; + int32_t transId; + int32_t transSec; + TdThreadMutex lock; + int8_t selfIndex; + int8_t numOfReplicas; + SReplica replicas[TSDB_MAX_REPLICA]; } SSyncMgmt; typedef struct { diff --git a/source/dnode/mnode/impl/inc/mndSync.h b/source/dnode/mnode/impl/inc/mndSync.h index 993efbcd084443fccd8ef9776b942e7af671bf60..1f9a698a8ad8b5a0f67ad18168938d3e1de0f31d 100644 --- a/source/dnode/mnode/impl/inc/mndSync.h +++ b/source/dnode/mnode/impl/inc/mndSync.h @@ -26,6 +26,7 @@ int32_t mndInitSync(SMnode *pMnode); void mndCleanupSync(SMnode *pMnode); bool mndIsLeader(SMnode *pMnode); int32_t mndSyncPropose(SMnode *pMnode, SSdbRaw *pRaw, int32_t transId); +void mndSyncCheckTimeout(SMnode *pMnode); void mndSyncStart(SMnode *pMnode); void mndSyncStop(SMnode *pMnode); diff --git a/source/dnode/mnode/impl/src/mndMain.c b/source/dnode/mnode/impl/src/mndMain.c index 237d45bc385f02580ea898bcdd3b2637ebed8e1b..e12a908d881951e32ae72afdde0d78f082c45caf 100644 --- a/source/dnode/mnode/impl/src/mndMain.c +++ b/source/dnode/mnode/impl/src/mndMain.c @@ -280,6 +280,7 @@ static void *mndThreadFp(void *param) { if (sec % (tsStatusInterval * 5) == 0) { mndCheckDnodeOffline(pMnode); + mndSyncCheckTimeout(pMnode); } } diff --git a/source/dnode/mnode/impl/src/mndSync.c b/source/dnode/mnode/impl/src/mndSync.c index 10b1e3649680c9eed8bec2607203b08bbe241b2b..ba14f526a9e19346b247285ae0724c6d534249ad 100644 --- a/source/dnode/mnode/impl/src/mndSync.c +++ b/source/dnode/mnode/impl/src/mndSync.c @@ -17,6 +17,8 @@ #include "mndSync.h" #include "mndTrans.h" +#define TRANS_TIMEOUT_SEC 10 + static int32_t mndSyncEqCtrlMsg(const SMsgCb *msgcb, SRpcMsg *pMsg) { if (pMsg == NULL || pMsg->pCont == NULL) { return -1; @@ -89,9 +91,9 @@ int32_t mndProcessWriteMsg(const SSyncFSM *pFsm, SRpcMsg *pMsg, const SFsmCbMeta sdbSetApplyInfo(pMnode->pSdb, pMeta->index, pMeta->term, pMeta->lastConfigIndex); } - taosWLockLatch(&pMgmt->lock); + taosThreadMutexLock(&pMgmt->lock); if (transId <= 0) { - taosWUnLockLatch(&pMgmt->lock); + taosThreadMutexUnlock(&pMgmt->lock); mError("trans:%d, invalid commit msg", transId); } else if (transId == pMgmt->transId) { if (pMgmt->errCode != 0) { @@ -100,10 +102,11 @@ int32_t mndProcessWriteMsg(const SSyncFSM *pFsm, SRpcMsg *pMsg, const SFsmCbMeta mInfo("trans:%d, is proposed and post sem", transId); } pMgmt->transId = 0; + pMgmt->transSec = 0; tsem_post(&pMgmt->syncSem); - taosWUnLockLatch(&pMgmt->lock); + taosThreadMutexUnlock(&pMgmt->lock); } else { - taosWUnLockLatch(&pMgmt->lock); + taosThreadMutexUnlock(&pMgmt->lock); STrans *pTrans = mndAcquireTrans(pMnode, transId); if (pTrans != NULL) { mInfo("trans:%d, execute in mnode which not leader", transId); @@ -198,18 +201,19 @@ int32_t mndSnapshotDoWrite(const SSyncFSM *pFsm, void *pWriter, void *pBuf, int3 } static void mndBecomeFollower(const SSyncFSM *pFsm) { - SMnode *pMnode = pFsm->data; + SMnode *pMnode = pFsm->data; + SSyncMgmt *pMgmt = &pMnode->syncMgmt; mInfo("vgId:1, become follower"); - taosWLockLatch(&pMnode->syncMgmt.lock); - if (pMnode->syncMgmt.transId != 0) { - mInfo("vgId:1, become follower and post sem, trans:%d, failed to propose since not leader", - pMnode->syncMgmt.transId); - pMnode->syncMgmt.transId = 0; - pMnode->syncMgmt.errCode = TSDB_CODE_SYN_NOT_LEADER; - tsem_post(&pMnode->syncMgmt.syncSem); + taosThreadMutexLock(&pMgmt->lock); + if (pMgmt->transId != 0) { + mInfo("vgId:1, become follower and post sem, trans:%d, failed to propose since not leader", pMgmt->transId); + pMgmt->transId = 0; + pMgmt->transSec = 0; + pMgmt->errCode = TSDB_CODE_SYN_NOT_LEADER; + tsem_post(&pMgmt->syncSem); } - taosWUnLockLatch(&pMnode->syncMgmt.lock); + taosThreadMutexUnlock(&pMgmt->lock); } static void mndBecomeLeader(const SSyncFSM *pFsm) { @@ -265,8 +269,9 @@ SSyncFSM *mndSyncMakeFsm(SMnode *pMnode) { int32_t mndInitSync(SMnode *pMnode) { SSyncMgmt *pMgmt = &pMnode->syncMgmt; - taosInitRWLatch(&pMgmt->lock); + taosThreadMutexInit(&pMgmt->lock, NULL); pMgmt->transId = 0; + pMgmt->transSec = 0; SSyncInfo syncInfo = { .snapshotStrategy = SYNC_STRATEGY_STANDARD_SNAPSHOT, @@ -313,9 +318,34 @@ void mndCleanupSync(SMnode *pMnode) { mInfo("mnode-sync is stopped, id:%" PRId64, pMgmt->sync); tsem_destroy(&pMgmt->syncSem); + taosThreadMutexDestroy(&pMgmt->lock); memset(pMgmt, 0, sizeof(SSyncMgmt)); } +void mndSyncCheckTimeout(SMnode *pMnode) { + SSyncMgmt *pMgmt = &pMnode->syncMgmt; + taosThreadMutexLock(&pMgmt->lock); + if (pMgmt->transId != 0) { + int32_t curSec = taosGetTimestampSec(); + int32_t delta = curSec - pMgmt->transSec; + if (delta > TRANS_TIMEOUT_SEC) { + mError("trans:%d, failed to propose since timeout, start:%d cur:%d delta:%d", pMgmt->transId, pMgmt->transSec, + curSec, delta); + pMgmt->transId = 0; + pMgmt->transSec = 0; + terrno = TSDB_CODE_SYN_TIMEOUT; + pMgmt->errCode = TSDB_CODE_SYN_TIMEOUT; + tsem_post(&pMgmt->syncSem); + } else { + mDebug("trans:%d, waiting for sync confirm, start:%d cur:%d delta:%d", pMgmt->transId, pMgmt->transSec, curSec, + curSec - pMgmt->transSec); + } + } else { + // mTrace("check sync timeout msg, no trans waiting for confirm"); + } + taosThreadMutexUnlock(&pMgmt->lock); +} + int32_t mndSyncPropose(SMnode *pMnode, SSdbRaw *pRaw, int32_t transId) { SSyncMgmt *pMgmt = &pMnode->syncMgmt; pMgmt->errCode = 0; @@ -327,35 +357,38 @@ int32_t mndSyncPropose(SMnode *pMnode, SSdbRaw *pRaw, int32_t transId) { if (req.pCont == NULL) return -1; memcpy(req.pCont, pRaw, req.contLen); - taosWLockLatch(&pMgmt->lock); + taosThreadMutexLock(&pMgmt->lock); if (pMgmt->transId != 0) { mError("trans:%d, can't be proposed since trans:%d already waiting for confirm", transId, pMgmt->transId); - taosWUnLockLatch(&pMgmt->lock); + taosThreadMutexUnlock(&pMgmt->lock); terrno = TSDB_CODE_MND_LAST_TRANS_NOT_FINISHED; - return -1; + return terrno; } mInfo("trans:%d, will be proposed", transId); pMgmt->transId = transId; - taosWUnLockLatch(&pMgmt->lock); + pMgmt->transSec = taosGetTimestampSec(); + taosThreadMutexUnlock(&pMgmt->lock); int32_t code = syncPropose(pMgmt->sync, &req, false); if (code == 0) { - mInfo("trans:%d, is proposing and wait sem", pMgmt->transId); + mInfo("trans:%d, is proposing and wait sem", transId); tsem_wait(&pMgmt->syncSem); } else if (code > 0) { mInfo("trans:%d, confirm at once since replica is 1, continue execute", transId); - taosWLockLatch(&pMgmt->lock); + taosThreadMutexLock(&pMgmt->lock); pMgmt->transId = 0; - taosWUnLockLatch(&pMgmt->lock); + pMgmt->transSec = 0; + taosThreadMutexUnlock(&pMgmt->lock); sdbWriteWithoutFree(pMnode->pSdb, pRaw); sdbSetApplyInfo(pMnode->pSdb, req.info.conn.applyIndex, req.info.conn.applyTerm, SYNC_INDEX_INVALID); code = 0; } else { mError("trans:%d, failed to proposed since %s", transId, terrstr()); - taosWLockLatch(&pMgmt->lock); + taosThreadMutexLock(&pMgmt->lock); pMgmt->transId = 0; - taosWUnLockLatch(&pMgmt->lock); + pMgmt->transSec = 0; + taosThreadMutexUnlock(&pMgmt->lock); if (terrno == 0) { terrno = TSDB_CODE_APP_ERROR; } @@ -382,13 +415,17 @@ void mndSyncStart(SMnode *pMnode) { } void mndSyncStop(SMnode *pMnode) { - taosWLockLatch(&pMnode->syncMgmt.lock); - if (pMnode->syncMgmt.transId != 0) { - mInfo("vgId:1, is stopped and post sem, trans:%d", pMnode->syncMgmt.transId); - pMnode->syncMgmt.transId = 0; - tsem_post(&pMnode->syncMgmt.syncSem); + SSyncMgmt *pMgmt = &pMnode->syncMgmt; + + taosThreadMutexLock(&pMgmt->lock); + if (pMgmt->transId != 0) { + mInfo("vgId:1, is stopped and post sem, trans:%d", pMgmt->transId); + pMgmt->transId = 0; + pMgmt->transSec = 0; + pMgmt->errCode = TSDB_CODE_APP_IS_STOPPING; + tsem_post(&pMgmt->syncSem); } - taosWUnLockLatch(&pMnode->syncMgmt.lock); + taosThreadMutexUnlock(&pMgmt->lock); } bool mndIsLeader(SMnode *pMnode) {