From 490caa967ba71b7af0a67675adadeceeb2cfeaa5 Mon Sep 17 00:00:00 2001 From: Minglei Jin Date: Wed, 9 Jun 2021 20:31:11 +0800 Subject: [PATCH] [TD-4599]: fix false dnode offline --- src/balance/src/bnMain.c | 8 ++++---- src/balance/src/bnThread.c | 10 +++++----- src/mnode/inc/mnodeDef.h | 3 +-- src/mnode/inc/mnodeDnode.h | 2 +- src/mnode/src/mnodeDnode.c | 12 ++++++------ 5 files changed, 17 insertions(+), 18 deletions(-) diff --git a/src/balance/src/bnMain.c b/src/balance/src/bnMain.c index 3055f77e81..25f316cb5e 100644 --- a/src/balance/src/bnMain.c +++ b/src/balance/src/bnMain.c @@ -405,7 +405,7 @@ void bnReset() { if (pDnode == NULL) break; // while master change, should reset dnode to offline - mInfo("dnode:%d set access:%d to 0", pDnode->dnodeId, pDnode->lastAccess); + mInfo("dnode:%d set access:%" PRId64 " to 0", pDnode->dnodeId, pDnode->lastAccess); pDnode->lastAccess = 0; if (pDnode->status != TAOS_DN_STATUS_DROPPING) { pDnode->status = TAOS_DN_STATUS_OFFLINE; @@ -499,7 +499,7 @@ static bool bnMontiorDropping() { if (dnodeIsMasterEp(pDnode->dnodeEp)) continue; if (mnodeGetDnodesNum() <= 1) continue; - mLInfo("dnode:%d, set to removing state for it offline:%d seconds", pDnode->dnodeId, + mLInfo("dnode:%d, set to removing state for it offline:%" PRId64 " seconds", pDnode->dnodeId, tsAccessSquence - pDnode->lastAccess); pDnode->status = TAOS_DN_STATUS_DROPPING; @@ -574,8 +574,8 @@ void bnCheckStatus() { if (pDnode->status != TAOS_DN_STATUS_DROPPING && pDnode->status != TAOS_DN_STATUS_OFFLINE) { pDnode->status = TAOS_DN_STATUS_OFFLINE; pDnode->offlineReason = TAOS_DN_OFF_STATUS_MSG_TIMEOUT; - mInfo("dnode:%d, set to offline state, access seq:%d last seq:%d laststat:%d", pDnode->dnodeId, tsAccessSquence, - pDnode->lastAccess, pDnode->status); + mInfo("dnode:%d, set to offline state, access seq:%" PRId64 " last seq:%" PRId64 " laststat:%d", pDnode->dnodeId, + tsAccessSquence, pDnode->lastAccess, pDnode->status); bnSetVgroupOffline(pDnode); bnStartTimer(3000); } diff --git a/src/balance/src/bnThread.c b/src/balance/src/bnThread.c index d07591ecd5..f39b82f2c6 100644 --- a/src/balance/src/bnThread.c +++ b/src/balance/src/bnThread.c @@ -101,13 +101,13 @@ static void bnProcessTimer(void *handle, void *tmrId) { if (!sdbIsMaster()) return; if (tsBnThread.stop) return; - tsBnThread.timer = NULL; - tsAccessSquence++; + if (handle == NULL) { + tsBnThread.timer = NULL; + ++tsAccessSquence; - bnStartTimer(-1); - bnCheckStatus(); + bnStartTimer(-1); + bnCheckStatus(); - if (handle == NULL) { if (tsAccessSquence % tsBalanceInterval == 0) { mDebug("balance function is scheduled by timer"); bnPostSignal(); diff --git a/src/mnode/inc/mnodeDef.h b/src/mnode/inc/mnodeDef.h index e052f34a33..c1f2ea7fd7 100644 --- a/src/mnode/inc/mnodeDef.h +++ b/src/mnode/inc/mnodeDef.h @@ -48,9 +48,8 @@ typedef struct SDnodeObj { int32_t dnodeId; int32_t openVnodes; int64_t createdTime; - int32_t resever0; // from dnode status msg, config information + int64_t lastAccess; int32_t customScore; // config by user - uint32_t lastAccess; uint16_t numOfCores; // from dnode status msg uint16_t dnodePort; char dnodeFqdn[TSDB_FQDN_LEN]; diff --git a/src/mnode/inc/mnodeDnode.h b/src/mnode/inc/mnodeDnode.h index fa1995254e..d357cd65b8 100644 --- a/src/mnode/inc/mnodeDnode.h +++ b/src/mnode/inc/mnodeDnode.h @@ -77,7 +77,7 @@ void * mnodeGetDnodeByEp(char *ep); void mnodeUpdateDnode(SDnodeObj *pDnode); int32_t mnodeDropDnode(SDnodeObj *pDnode, void *pMsg); -extern int32_t tsAccessSquence; +extern int64_t tsAccessSquence; #ifdef __cplusplus } diff --git a/src/mnode/src/mnodeDnode.c b/src/mnode/src/mnodeDnode.c index 8a5d24c474..fb775d92d8 100644 --- a/src/mnode/src/mnodeDnode.c +++ b/src/mnode/src/mnodeDnode.c @@ -39,8 +39,8 @@ #include "mnodePeer.h" #include "mnodeCluster.h" -int32_t tsAccessSquence = 0; -int64_t tsDnodeRid = -1; +int64_t tsAccessSquence = 0; +int64_t tsDnodeRid = -1; static void * tsDnodeSdb = NULL; static int32_t tsDnodeUpdateSize = 0; extern void * tsMnodeSdb; @@ -567,7 +567,7 @@ static int32_t mnodeProcessDnodeStatusMsg(SMnodeMsg *pMsg) { mnodeGetClusterId()); return TSDB_CODE_MND_INVALID_CLUSTER_ID; } else { - mTrace("dnode:%d, status received, access times %d openVnodes:%d:%d", pDnode->dnodeId, pDnode->lastAccess, + mTrace("dnode:%d, status received, access times %" PRId64 " openVnodes:%d:%d", pDnode->dnodeId, pDnode->lastAccess, htons(pStatus->openVnodes), pDnode->openVnodes); } } @@ -629,9 +629,9 @@ static int32_t mnodeProcessDnodeStatusMsg(SMnodeMsg *pMsg) { bnNotify(); } - if (!tsEnableBalance) { - int32_t numOfMnodes = mnodeGetMnodesNum(); - if (numOfMnodes < tsNumOfMnodes) bnNotify(); + int32_t numOfMnodes = mnodeGetMnodesNum(); + if (numOfMnodes < tsNumOfMnodes && numOfMnodes < mnodeGetOnlineDnodesNum()) { + bnNotify(); } if (openVnodes != pDnode->openVnodes) { -- GitLab