From f6a3c2432ce66db82b51b633f53f2f885a803f9a Mon Sep 17 00:00:00 2001 From: Hui Li Date: Mon, 15 Jun 2020 18:26:52 +0800 Subject: [PATCH] [TD-638] --- src/dnode/src/dnodeMgmt.c | 10 + src/inc/taoserror.h | 1 + src/inc/taosmsg.h | 36 ++-- src/mnode/src/mnodeDnode.c | 25 ++- .../arbitrator/check_cluster_cfg_para.sim | 190 ++++++++++++++++++ ...3_mn1_nw_disable_timeout_autoDropDnode.sim | 10 +- .../arbitrator/dn3_mn1_vnode_change.sim | 30 +++ 7 files changed, 284 insertions(+), 18 deletions(-) create mode 100644 tests/script/unique/arbitrator/check_cluster_cfg_para.sim diff --git a/src/dnode/src/dnodeMgmt.c b/src/dnode/src/dnodeMgmt.c index d35e82fa47..b03b2cc244 100644 --- a/src/dnode/src/dnodeMgmt.c +++ b/src/dnode/src/dnodeMgmt.c @@ -616,6 +616,16 @@ static void dnodeSendStatusMsg(void *handle, void *tmrId) { pStatus->numOfCores = htons((uint16_t) tsNumOfCores); pStatus->diskAvailable = tsAvailDataDirGB; pStatus->alternativeRole = (uint8_t) tsAlternativeRole; + + // fill cluster cfg parameters + pStatus->ClusterCfgPara.numOfMnodes = tsNumOfMnodes; + pStatus->ClusterCfgPara.mnodeEqualVnodeNum = tsMnodeEqualVnodeNum; + pStatus->ClusterCfgPara.offlineThreshold = tsOfflineThreshold; + pStatus->ClusterCfgPara.statusInterval = tsStatusInterval; + strcpy(pStatus->ClusterCfgPara.arbitrator, tsArbitrator); + strcpy(pStatus->ClusterCfgPara.timezone, tsTimezone); + strcpy(pStatus->ClusterCfgPara.locale, tsLocale); + strcpy(pStatus->ClusterCfgPara.charset, tsCharset); vnodeBuildStatusMsg(pStatus); contLen = sizeof(SDMStatusMsg) + pStatus->openVnodes * sizeof(SVnodeLoad); diff --git a/src/inc/taoserror.h b/src/inc/taoserror.h index ac2af75742..df9992d53a 100644 --- a/src/inc/taoserror.h +++ b/src/inc/taoserror.h @@ -121,6 +121,7 @@ TAOS_DEFINE_ERROR(TSDB_CODE_MND_DNODE_NOT_EXIST, 0, 0x0331, "mnode dnod TAOS_DEFINE_ERROR(TSDB_CODE_MND_VGROUP_NOT_EXIST, 0, 0x0332, "mnode vgroup not exist") TAOS_DEFINE_ERROR(TSDB_CODE_MND_NO_REMOVE_MASTER, 0, 0x0333, "mnode cant not remove master") TAOS_DEFINE_ERROR(TSDB_CODE_MND_NO_ENOUGH_DNODES, 0, 0x0334, "mnode no enough dnodes") +TAOS_DEFINE_ERROR(TSDB_CODE_MND_CLUSTER_CFG_INCONSISTENT, 0, 0x0335, "mnode cluster cfg inconsistent") TAOS_DEFINE_ERROR(TSDB_CODE_MND_ACCT_ALREADY_EXIST, 0, 0x0340, "mnode accounts already exist") TAOS_DEFINE_ERROR(TSDB_CODE_MND_INVALID_ACCT, 0, 0x0341, "mnode invalid account") diff --git a/src/inc/taosmsg.h b/src/inc/taosmsg.h index 1198097895..611bd6a713 100644 --- a/src/inc/taosmsg.h +++ b/src/inc/taosmsg.h @@ -557,18 +557,30 @@ typedef struct { } SDMMnodeInfos; typedef struct { - uint32_t version; - int32_t dnodeId; - char dnodeEp[TSDB_EP_LEN]; - uint32_t moduleStatus; - uint32_t lastReboot; // time stamp for last reboot - uint16_t numOfTotalVnodes; // from config file - uint16_t openVnodes; - uint16_t numOfCores; - float diskAvailable; // GB - uint8_t alternativeRole; - uint8_t reserve[15]; - SVnodeLoad load[]; + int32_t numOfMnodes; // tsNumOfMnodes + int32_t mnodeEqualVnodeNum; // tsMnodeEqualVnodeNum + int32_t offlineThreshold; // tsOfflineThreshold + int32_t statusInterval; // tsStatusInterval + char arbitrator[TSDB_EP_LEN]; // tsArbitrator + char timezone[64]; // tsTimezone + char locale[TSDB_LOCALE_LEN]; // tsLocale + char charset[TSDB_LOCALE_LEN]; // tsCharset +} SClusterCfg; + +typedef struct { + uint32_t version; + int32_t dnodeId; + char dnodeEp[TSDB_EP_LEN]; + uint32_t moduleStatus; + uint32_t lastReboot; // time stamp for last reboot + uint16_t numOfTotalVnodes; // from config file + uint16_t openVnodes; + uint16_t 
numOfCores; + float diskAvailable; // GB + uint8_t alternativeRole; + uint8_t reserve[15]; + SClusterCfg ClusterCfgPara; + SVnodeLoad load[]; } SDMStatusMsg; typedef struct { diff --git a/src/mnode/src/mnodeDnode.c b/src/mnode/src/mnodeDnode.c index d2af86badb..aab53b9f7b 100644 --- a/src/mnode/src/mnodeDnode.c +++ b/src/mnode/src/mnodeDnode.c @@ -277,6 +277,20 @@ static void mnodeProcessCfgDnodeMsgRsp(SRpcMsg *rpcMsg) { mPrint("cfg dnode rsp is received"); } +static bool mnodeCheckClusterCfgPara(const SClusterCfg *clusterCfg) { + if (clusterCfg->numOfMnodes != tsNumOfMnodes) return false; + if (clusterCfg->mnodeEqualVnodeNum != tsMnodeEqualVnodeNum) return false; + if (clusterCfg->offlineThreshold != tsOfflineThreshold) return false; + if (clusterCfg->statusInterval != tsStatusInterval) return false; + + if (0 != strncasecmp(clusterCfg->arbitrator, tsArbitrator, strlen(tsArbitrator))) return false; + if (0 != strncasecmp(clusterCfg->timezone, tsTimezone, strlen(tsTimezone))) return false; + if (0 != strncasecmp(clusterCfg->locale, tsLocale, strlen(tsLocale))) return false; + if (0 != strncasecmp(clusterCfg->charset, tsCharset, strlen(tsCharset))) return false; + + return true; +} + static int32_t mnodeProcessDnodeStatusMsg(SMnodeMsg *pMsg) { SDMStatusMsg *pStatus = pMsg->rpcMsg.pCont; pStatus->dnodeId = htonl(pStatus->dnodeId); @@ -312,7 +326,6 @@ static int32_t mnodeProcessDnodeStatusMsg(SMnodeMsg *pMsg) { pDnode->alternativeRole = pStatus->alternativeRole; pDnode->totalVnodes = pStatus->numOfTotalVnodes; pDnode->moduleStatus = pStatus->moduleStatus; - pDnode->lastAccess = tsAccessSquence; if (pStatus->dnodeId == 0) { mTrace("dnode:%d %s, first access", pDnode->dnodeId, pDnode->dnodeEp); @@ -338,6 +351,14 @@ static int32_t mnodeProcessDnodeStatusMsg(SMnodeMsg *pMsg) { } if (pDnode->status == TAOS_DN_STATUS_OFFLINE) { + // Verify whether the cluster parameters are consistent when status change from offline to ready + bool ret = mnodeCheckClusterCfgPara(&(pStatus->ClusterCfgPara)); + if (false == ret) { + mnodeDecDnodeRef(pDnode); + mError("dnode %s cluster cfg parameters inconsistent", pStatus->dnodeEp); + return TSDB_CODE_MND_CLUSTER_CFG_INCONSISTENT; + } + mTrace("dnode:%d, from offline to online", pDnode->dnodeId); pDnode->status = TAOS_DN_STATUS_READY; balanceUpdateMnode(); @@ -352,6 +373,8 @@ static int32_t mnodeProcessDnodeStatusMsg(SMnodeMsg *pMsg) { return TSDB_CODE_MND_OUT_OF_MEMORY; } + pDnode->lastAccess = tsAccessSquence; + mnodeGetMnodeInfos(&pRsp->mnodes); pRsp->dnodeCfg.dnodeId = htonl(pDnode->dnodeId); diff --git a/tests/script/unique/arbitrator/check_cluster_cfg_para.sim b/tests/script/unique/arbitrator/check_cluster_cfg_para.sim new file mode 100644 index 0000000000..e74717c077 --- /dev/null +++ b/tests/script/unique/arbitrator/check_cluster_cfg_para.sim @@ -0,0 +1,190 @@ +system sh/stop_dnodes.sh +system sh/deploy.sh -n dnode1 -i 1 +system sh/deploy.sh -n dnode2 -i 2 +system sh/deploy.sh -n dnode3 -i 3 +system sh/deploy.sh -n dnode4 -i 4 +system sh/deploy.sh -n dnode5 -i 5 +system sh/deploy.sh -n dnode6 -i 6 +system sh/deploy.sh -n dnode7 -i 7 + + +system sh/cfg.sh -n dnode1 -c numOfMnodes -v 2 +system sh/cfg.sh -n dnode1 -c mnodeEqualVnodeNum -v 4 +system sh/cfg.sh -n dnode1 -c offlineThreshold -v 15 +system sh/cfg.sh -n dnode1 -c statusInterval -v 3 +system sh/cfg.sh -n dnode1 -c arbitrator -v $arbitrator +#system sh/cfg.sh -n dnode1 -c timezone -v "" +#system sh/cfg.sh -n dnode1 -c locale -v "" +#system sh/cfg.sh -n dnode1 -c charset -v "" +system sh/cfg.sh -n dnode1 
-c balanceInterval -v 10 + +######## dnode 2 the same with dnode1 +system sh/cfg.sh -n dnode2 -c numOfMnodes -v 2 +system sh/cfg.sh -n dnode2 -c mnodeEqualVnodeNum -v 4 +system sh/cfg.sh -n dnode2 -c offlineThreshold -v 15 +system sh/cfg.sh -n dnode2 -c statusInterval -v 3 +system sh/cfg.sh -n dnode2 -c arbitrator -v $arbitrator +#system sh/cfg.sh -n dnode2 -c timezone -v "" +#system sh/cfg.sh -n dnode2 -c locale -v "" +#system sh/cfg.sh -n dnode2 -c charset -v "" +system sh/cfg.sh -n dnode2 -c balanceInterval -v 10 + +######## dnode 3 one para no same with dnode1 +system sh/cfg.sh -n dnode3 -c numOfMnodes -v 3 +system sh/cfg.sh -n dnode3 -c mnodeEqualVnodeNum -v 4 +system sh/cfg.sh -n dnode3 -c offlineThreshold -v 15 +system sh/cfg.sh -n dnode3 -c statusInterval -v 3 +system sh/cfg.sh -n dnode3 -c arbitrator -v $arbitrator +#system sh/cfg.sh -n dnode3 -c timezone -v "" +#system sh/cfg.sh -n dnode3 -c locale -v "" +#system sh/cfg.sh -n dnode3 -c charset -v "" +system sh/cfg.sh -n dnode3 -c balanceInterval -v 10 + +######## dnode 4 one para no same with dnode1 +system sh/cfg.sh -n dnode4 -c numOfMnodes -v 2 +system sh/cfg.sh -n dnode4 -c mnodeEqualVnodeNum -v 5 +system sh/cfg.sh -n dnode4 -c offlineThreshold -v 15 +system sh/cfg.sh -n dnode4 -c statusInterval -v 3 +system sh/cfg.sh -n dnode4 -c arbitrator -v $arbitrator +#system sh/cfg.sh -n dnode4 -c timezone -v "" +#system sh/cfg.sh -n dnode4 -c locale -v "" +#system sh/cfg.sh -n dnode4 -c charset -v "" +system sh/cfg.sh -n dnode4 -c balanceInterval -v 10 + +######## dnode 5 one para no same with dnode1 +system sh/cfg.sh -n dnode5 -c numOfMnodes -v 2 +system sh/cfg.sh -n dnode5 -c mnodeEqualVnodeNum -v 4 +system sh/cfg.sh -n dnode5 -c offlineThreshold -v 16 +system sh/cfg.sh -n dnode5 -c statusInterval -v 3 +system sh/cfg.sh -n dnode5 -c arbitrator -v $arbitrator +#system sh/cfg.sh -n dnode5 -c timezone -v "" +#system sh/cfg.sh -n dnode5 -c locale -v "" +#system sh/cfg.sh -n dnode5 -c charset -v "" +system sh/cfg.sh -n dnode5 -c balanceInterval -v 10 + + +######## dnode 6 one para no same with dnode1 +system sh/cfg.sh -n dnode6 -c numOfMnodes -v 2 +system sh/cfg.sh -n dnode6 -c mnodeEqualVnodeNum -v 4 +system sh/cfg.sh -n dnode6 -c offlineThreshold -v 15 +system sh/cfg.sh -n dnode6 -c statusInterval -v 2 +system sh/cfg.sh -n dnode6 -c arbitrator -v $arbitrator +#system sh/cfg.sh -n dnode6 -c timezone -v "" +#system sh/cfg.sh -n dnode6 -c locale -v "" +#system sh/cfg.sh -n dnode6 -c charset -v "" +system sh/cfg.sh -n dnode6 -c balanceInterval -v 10 + + +######## dnode 7 one para no same with dnode1 +system sh/cfg.sh -n dnode7 -c numOfMnodes -v 2 +system sh/cfg.sh -n dnode7 -c mnodeEqualVnodeNum -v 4 +system sh/cfg.sh -n dnode7 -c offlineThreshold -v 15 +system sh/cfg.sh -n dnode7 -c statusInterval -v 3 +system sh/cfg.sh -n dnode7 -c arbitrator -v "plum-VirtualBox:8001" +#system sh/cfg.sh -n dnode7 -c timezone -v "" +#system sh/cfg.sh -n dnode7 -c locale -v "" +#system sh/cfg.sh -n dnode7 -c charset -v "" +system sh/cfg.sh -n dnode7 -c balanceInterval -v 10 + +print ============== step0: start tarbitrator +system sh/exec_tarbitrator.sh -s start + +print ============== step1: start dnode1 +system sh/exec.sh -n dnode1 -s start +sleep 3000 +sql connect + +print ============== step2: start dnode2~7 and add into cluster +system sh/exec.sh -n dnode2 -s start +system sh/exec.sh -n dnode3 -s start +system sh/exec.sh -n dnode4 -s start +system sh/exec.sh -n dnode5 -s start +system sh/exec.sh -n dnode6 -s start +system sh/exec.sh -n dnode7 -s start 
+sql create dnode $hostname2 +sql create dnode $hostname3 +sql create dnode $hostname4 +sql create dnode $hostname5 +sql create dnode $hostname6 +sql create dnode $hostname7 +sleep 10000 + +wait_dnode_created: +sql show dnodes +if $rows != 7 then + sleep 2000 + goto wait_dnode_created +endi +print $data0_1 $data1_1 $data2_1 $data3_1 $data4_1 +print $data0_2 $data1_2 $data2_2 $data3_2 $data4_2 +print $data0_3 $data1_3 $data2_3 $data3_3 $data4_3 +print $data0_4 $data1_4 $data2_4 $data3_4 $data4_4 +print $data0_5 $data1_5 $data2_5 $data3_5 $data4_5 +print $data0_6 $data1_6 $data2_6 $data3_6 $data4_6 +print $data0_7 $data1_7 $data2_7 $data3_7 $data4_7 +$dnode1Status = $data4_1 +$dnode2Status = $data4_2 +$dnode3Status = $data4_3 +$dnode4Status = $data4_4 +$dnode5Status = $data4_5 +$dnode6Status = $data4_6 +$dnode7Status = $data4_7 + +if $dnode1Status != ready then + return -1 +endi +if $dnode2Status != ready then + return -1 +endi +if $dnode3Status != offline then + return -1 +endi +if $dnode4Status != offline then + return -1 +endi +if $dnode5Status != offline then + return -1 +endi +if $dnode6Status != offline then + return -1 +endi +if $dnode7Status != offline then + return -1 +endi + +sleep 10000 + +wait_dnode_offline_overtime_dropped: +sql show dnodes +print $data0_1 $data1_1 $data2_1 $data3_1 $data4_1 +print $data0_2 $data1_2 $data2_2 $data3_2 $data4_2 +print $data0_3 $data1_3 $data2_3 $data3_3 $data4_3 +print $data0_4 $data1_4 $data2_4 $data3_4 $data4_4 +print $data0_5 $data1_5 $data2_5 $data3_5 $data4_5 +print $data0_6 $data1_6 $data2_6 $data3_6 $data4_6 +print $data0_7 $data1_7 $data2_7 $data3_7 $data4_7 +if $rows != 2 then + sleep 2000 + goto wait_dnode_offline_overtime_dropped +endi +print $data0_1 $data1_1 $data2_1 $data3_1 $data4_1 +print $data0_2 $data1_2 $data2_2 $data3_2 $data4_2 +print $data0_3 $data1_3 $data2_3 $data3_3 $data4_3 +print $data0_4 $data1_4 $data2_4 $data3_4 $data4_4 +print $data0_5 $data1_5 $data2_5 $data3_5 $data4_5 +print $data0_6 $data1_6 $data2_6 $data3_6 $data4_6 +print $data0_7 $data1_7 $data2_7 $data3_7 $data4_7 +$dnode1Status = $data4_1 +$dnode2Status = $data4_2 +$dnode3Status = $data4_3 +$dnode4Status = $data4_4 +$dnode5Status = $data4_5 +$dnode6Status = $data4_6 +$dnode7Status = $data4_7 + +if $dnode1Status != ready then + return -1 +endi +if $dnode2Status != ready then + return -1 +endi diff --git a/tests/script/unique/arbitrator/dn3_mn1_nw_disable_timeout_autoDropDnode.sim b/tests/script/unique/arbitrator/dn3_mn1_nw_disable_timeout_autoDropDnode.sim index 71e606e529..9b1e320946 100644 --- a/tests/script/unique/arbitrator/dn3_mn1_nw_disable_timeout_autoDropDnode.sim +++ b/tests/script/unique/arbitrator/dn3_mn1_nw_disable_timeout_autoDropDnode.sim @@ -5,11 +5,11 @@ system sh/deploy.sh -n dnode3 -i 3 system sh/deploy.sh -n dnode4 -i 4 system sh/deploy.sh -n dnode5 -i 5 -system sh/cfg.sh -n dnode1 -c numOfMPeers -v 1 -system sh/cfg.sh -n dnode2 -c numOfMPeers -v 1 -system sh/cfg.sh -n dnode3 -c numOfMPeers -v 1 -system sh/cfg.sh -n dnode4 -c numOfMPeers -v 1 -system sh/cfg.sh -n dnode5 -c numOfMPeers -v 1 +system sh/cfg.sh -n dnode1 -c numOfMnodes -v 1 +system sh/cfg.sh -n dnode2 -c numOfMnodes -v 1 +system sh/cfg.sh -n dnode3 -c numOfMnodes -v 1 +system sh/cfg.sh -n dnode4 -c numOfMnodes -v 1 +system sh/cfg.sh -n dnode5 -c numOfMnodes -v 1 system sh/cfg.sh -n dnode1 -c walLevel -v 1 system sh/cfg.sh -n dnode2 -c walLevel -v 1 diff --git a/tests/script/unique/arbitrator/dn3_mn1_vnode_change.sim b/tests/script/unique/arbitrator/dn3_mn1_vnode_change.sim 
index fda850d2c9..d2bd5c6b26 100644 --- a/tests/script/unique/arbitrator/dn3_mn1_vnode_change.sim +++ b/tests/script/unique/arbitrator/dn3_mn1_vnode_change.sim @@ -96,7 +96,12 @@ endi print ============== step3: stop dnode4, and remove its vnodeX subdirector system sh/exec.sh -n dnode4 -s stop -x SIGINT sleep $sleepTimer +$loopCnt = 0 wait_dnode4_offline_0: +$loopCnt = $loopCnt + 1 +if $loopCnt == 10 then + return -1 +endi sql show dnodes if $rows != 4 then sleep 2000 @@ -148,7 +153,14 @@ sleep 1000 print ============== step4: restart dnode4, waiting sync end system sh/exec.sh -n dnode4 -s start sleep $sleepTimer + +$loopCnt = 0 wait_dnode4_reready: +$loopCnt = $loopCnt + 1 +if $loopCnt == 10 then + return -1 +endi + sql show dnodes if $rows != 4 then sleep 2000 @@ -171,7 +183,13 @@ if $dnode4Status != ready then goto wait_dnode4_reready endi +$loopCnt = 0 wait_dnode4_vgroup_slave: +$loopCnt = $loopCnt + 1 +if $loopCnt == 10 then + return -1 +endi + sql show vgroups if $rows != 1 then sleep 2000 @@ -200,7 +218,13 @@ system sh/exec.sh -n dnode2 -s stop system sh/exec.sh -n dnode3 -s stop sleep $sleepTimer + +$loopCnt = 0 wait_dnode23_offline: +$loopCnt = $loopCnt + 1 +if $loopCnt == 10 then + return -1 +endi sql show dnodes if $rows != 4 then sleep 2000 @@ -231,7 +255,13 @@ if $dnode4Status != ready then goto wait_dnode23_offline endi +$loopCnt = 0 wait_dnode4_vgroup_master: +$loopCnt = $loopCnt + 1 +if $loopCnt == 10 then + return -1 +endi + sql show vgroups if $rows != 1 then sleep 2000 -- GitLab
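
Note on the change: the core of this patch is the new mnodeCheckClusterCfgPara() check, which rejects a dnode's status message (TSDB_CODE_MND_CLUSTER_CFG_INCONSISTENT) when the cluster-level parameters it reports differ from the mnode's own settings. Below is a minimal, self-contained C sketch of that technique for reference only. It is not the TDengine code: the buffer sizes (EP_LEN, LOCALE_LEN), the sample "local" values, and the use of full-length strcasecmp are illustrative assumptions; the patch itself compares strings with strncasecmp bounded by strlen() of the local value, which also accepts prefix matches.

/* Standalone sketch of a cluster-config consistency check, assuming
 * illustrative struct sizes and local values (not the TDengine globals). */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <strings.h>

#define EP_LEN     32   /* assumption; stands in for TSDB_EP_LEN */
#define LOCALE_LEN 64   /* assumption; stands in for TSDB_LOCALE_LEN */

typedef struct {
  int32_t numOfMnodes;
  int32_t mnodeEqualVnodeNum;
  int32_t offlineThreshold;
  int32_t statusInterval;
  char    arbitrator[EP_LEN];
  char    timezone[64];
  char    locale[LOCALE_LEN];
  char    charset[LOCALE_LEN];
} SClusterCfg;

/* Local (mnode-side) settings; in the real code these come from the ts* globals. */
static const SClusterCfg localCfg = {
  .numOfMnodes = 2, .mnodeEqualVnodeNum = 4,
  .offlineThreshold = 15, .statusInterval = 3,
  .arbitrator = "tarbitrator:6042", .timezone = "UTC",
  .locale = "en_US.UTF-8", .charset = "UTF-8",
};

/* Returns true only when every field reported by the dnode matches the local value. */
static bool checkClusterCfg(const SClusterCfg *remote) {
  if (remote->numOfMnodes        != localCfg.numOfMnodes)        return false;
  if (remote->mnodeEqualVnodeNum != localCfg.mnodeEqualVnodeNum) return false;
  if (remote->offlineThreshold   != localCfg.offlineThreshold)   return false;
  if (remote->statusInterval     != localCfg.statusInterval)     return false;

  /* Full-length, case-insensitive comparison; the patch uses strncasecmp
   * bounded by strlen(local), a looser prefix-style match. */
  if (strcasecmp(remote->arbitrator, localCfg.arbitrator) != 0) return false;
  if (strcasecmp(remote->timezone,   localCfg.timezone)   != 0) return false;
  if (strcasecmp(remote->locale,     localCfg.locale)     != 0) return false;
  if (strcasecmp(remote->charset,    localCfg.charset)    != 0) return false;

  return true;
}

int main(void) {
  SClusterCfg good = localCfg;
  SClusterCfg bad  = localCfg;
  bad.offlineThreshold = 16;  /* mirrors the dnode5 mismatch in the new check_cluster_cfg_para.sim test */

  printf("matching cfg -> %s\n", checkClusterCfg(&good) ? "accepted" : "rejected");
  printf("mismatch cfg -> %s\n", checkClusterCfg(&bad)  ? "accepted" : "rejected");
  return 0;
}

This mirrors the behavior exercised by check_cluster_cfg_para.sim: dnodes 1 and 2 share the mnode's parameters and become ready, while each of dnodes 3 through 7 differs in exactly one parameter, stays offline, and is eventually auto-dropped once offlineThreshold is exceeded.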