提交 f2dafd9c 编写于 作者: S Shengliang Guan

TD-1473

上级 7a840d7f
......@@ -389,6 +389,7 @@ void balanceReset() {
pDnode->lastAccess = 0;
if (pDnode->status != TAOS_DN_STATUS_DROPPING) {
pDnode->status = TAOS_DN_STATUS_OFFLINE;
pDnode->offlineReason = TAOS_DN_OFF_STATUS_NOT_RECEIVED;
}
mnodeDecDnodeRef(pDnode);
......@@ -551,6 +552,7 @@ static void balanceCheckDnodeAccess() {
if (tsAccessSquence - pDnode->lastAccess > 3) {
if (pDnode->status != TAOS_DN_STATUS_DROPPING && pDnode->status != TAOS_DN_STATUS_OFFLINE) {
pDnode->status = TAOS_DN_STATUS_OFFLINE;
pDnode->offlineReason = TAOS_DN_OFF_STATUS_MSG_TIMEOUT;
mInfo("dnode:%d, set to offline state", pDnode->dnodeId);
balanceSetVgroupOffline(pDnode);
}
......
......@@ -69,7 +69,8 @@ typedef struct SDnodeObj {
int16_t cpuAvgUsage; // calc from sys.cpu
int16_t memoryAvgUsage; // calc from sys.mem
int16_t bandwidthUsage; // calc from sys.band
int8_t reserved2[2];
int8_t offlineReason;
int8_t reserved2[1];
} SDnodeObj;
typedef struct SMnodeObj {
......
......@@ -33,6 +33,28 @@ typedef enum {
TAOS_DN_ALTERNATIVE_ROLE_VNODE
} EDnodeAlternativeRole;
/*
 * Reason code recorded in SDnodeObj.offlineReason.
 *
 * The numeric value is used directly as an index into the offlineReason[]
 * string table when the reason is logged or shown by "show dnodes", so the
 * order and values below must stay in step with that table.
 */
typedef enum EDnodeOfflineReason {
  TAOS_DN_OFF_ONLINE                      = 0,   // dnode is online, no reason to report
  TAOS_DN_OFF_STATUS_MSG_TIMEOUT          = 1,   // status messages stopped arriving
  TAOS_DN_OFF_STATUS_NOT_RECEIVED         = 2,   // no status message received yet
  TAOS_DN_OFF_RESET_BY_MNODE              = 3,
  TAOS_DN_OFF_VERSION_NOT_MATCH           = 4,
  TAOS_DN_OFF_DNODE_ID_NOT_MATCH          = 5,
  TAOS_DN_OFF_CLUSTER_ID_NOT_MATCH        = 6,
  // Cluster configuration mismatches, one code per checked parameter.
  TAOS_DN_OFF_NUM_OF_MNODES_NOT_MATCH     = 7,
  TAOS_DN_OFF_ENABLE_BALANCE_NOT_MATCH    = 8,
  TAOS_DN_OFF_MN_EQUAL_VN_NOT_MATCH       = 9,
  TAOS_DN_OFF_OFFLINE_THRESHOLD_NOT_MATCH = 10,
  TAOS_DN_OFF_STATUS_INTERVAL_NOT_MATCH   = 11,
  TAOS_DN_OFF_MAX_TAB_PER_VN_NOT_MATCH    = 12,
  TAOS_DN_OFF_MAX_VG_PER_DB_NOT_MATCH     = 13,
  TAOS_DN_OFF_ARBITRATOR_NOT_MATCH        = 14,
  TAOS_DN_OFF_TIME_ZONE_NOT_MATCH         = 15,
  TAOS_DN_OFF_LOCALE_NOT_MATCH            = 16,
  TAOS_DN_OFF_CHARSET_NOT_MATCH           = 17,
  TAOS_DN_OFF_OTHERS                      = 18   // shown as "unknown"
} EDnodeOfflineReason;
int32_t mnodeInitDnodes();
void mnodeCleanupDnodes();
......
......@@ -60,6 +60,28 @@ static int32_t mnodeGetDnodeMeta(STableMetaMsg *pMeta, SShowObj *pShow, void *pC
static int32_t mnodeRetrieveDnodes(SShowObj *pShow, char *data, int32_t rows, void *pConn);
static char* mnodeGetDnodeAlternativeRoleStr(int32_t alternativeRole);
/*
 * Display text for each dnode offline reason.
 *
 * The table is indexed by the numeric EDnodeOfflineReason value (it is read as
 * offlineReason[pDnode->offlineReason] and offlineReason[ret] in this file),
 * so every entry is pinned to its index explicitly: reordering lines here can
 * no longer silently shift the text shown for a reason code.
 *
 * The entries are string literals and must never be written through, hence
 * the const qualification of both the strings and the table itself.
 * The longest text must fit the "offline reason" column of "show dnodes".
 */
static const char *const offlineReason[] = {
  [0]  = "",                        // TAOS_DN_OFF_ONLINE
  [1]  = "status msg timeout",      // TAOS_DN_OFF_STATUS_MSG_TIMEOUT
  [2]  = "status not received",     // TAOS_DN_OFF_STATUS_NOT_RECEIVED
  [3]  = "status reset by mnode",   // TAOS_DN_OFF_RESET_BY_MNODE
  [4]  = "version not match",       // TAOS_DN_OFF_VERSION_NOT_MATCH
  [5]  = "dnodeId not match",       // TAOS_DN_OFF_DNODE_ID_NOT_MATCH
  [6]  = "clusterId not match",     // TAOS_DN_OFF_CLUSTER_ID_NOT_MATCH
  [7]  = "numOfMnodes not match",   // TAOS_DN_OFF_NUM_OF_MNODES_NOT_MATCH
  [8]  = "balance not match",       // TAOS_DN_OFF_ENABLE_BALANCE_NOT_MATCH
  [9]  = "mnEqualVn not match",     // TAOS_DN_OFF_MN_EQUAL_VN_NOT_MATCH
  [10] = "offThreshold not match",  // TAOS_DN_OFF_OFFLINE_THRESHOLD_NOT_MATCH
  [11] = "interval not match",      // TAOS_DN_OFF_STATUS_INTERVAL_NOT_MATCH
  [12] = "maxTabPerVn not match",   // TAOS_DN_OFF_MAX_TAB_PER_VN_NOT_MATCH
  [13] = "maxVgPerDb not match",    // TAOS_DN_OFF_MAX_VG_PER_DB_NOT_MATCH
  [14] = "arbitrator not match",    // TAOS_DN_OFF_ARBITRATOR_NOT_MATCH
  [15] = "timezone not match",      // TAOS_DN_OFF_TIME_ZONE_NOT_MATCH
  [16] = "locale not match",        // TAOS_DN_OFF_LOCALE_NOT_MATCH
  [17] = "charset not match",       // TAOS_DN_OFF_CHARSET_NOT_MATCH
  [18] = "unknown",                 // TAOS_DN_OFF_OTHERS
};
static int32_t mnodeDnodeActionDestroy(SSdbOper *pOper) {
taosTFree(pOper->pObj);
return TSDB_CODE_SUCCESS;
......@@ -70,6 +92,7 @@ static int32_t mnodeDnodeActionInsert(SSdbOper *pOper) {
if (pDnode->status != TAOS_DN_STATUS_DROPPING) {
pDnode->status = TAOS_DN_STATUS_OFFLINE;
pDnode->lastAccess = tsAccessSquence;
pDnode->offlineReason = TAOS_DN_OFF_STATUS_NOT_RECEIVED;
}
mInfo("dnode:%d, fqdn:%s ep:%s port:%d, do insert action", pDnode->dnodeId, pDnode->dnodeFqdn, pDnode->dnodeEp, pDnode->dnodePort);
......@@ -334,74 +357,85 @@ static void mnodeProcessCfgDnodeMsgRsp(SRpcMsg *rpcMsg) {
mInfo("cfg dnode rsp is received");
}
static bool mnodeCheckClusterCfgPara(const SClusterCfg *clusterCfg) {
static int32_t mnodeCheckClusterCfgPara(const SClusterCfg *clusterCfg) {
if (clusterCfg->numOfMnodes != htonl(tsNumOfMnodes)) {
mError("\"numOfMnodes\"[%d - %d] cfg parameters inconsistent", clusterCfg->numOfMnodes, htonl(tsNumOfMnodes));
return false;
}
if (clusterCfg->enableBalance != htonl(tsEnableBalance)) {
return TAOS_DN_OFF_NUM_OF_MNODES_NOT_MATCH;
}
if (clusterCfg->enableBalance != htonl(tsEnableBalance)) {
mError("\"balance\"[%d - %d] cfg parameters inconsistent", clusterCfg->enableBalance, htonl(tsEnableBalance));
return false;
return TAOS_DN_OFF_ENABLE_BALANCE_NOT_MATCH;
}
if (clusterCfg->mnodeEqualVnodeNum != htonl(tsMnodeEqualVnodeNum)) {
mError("\"mnodeEqualVnodeNum\"[%d - %d] cfg parameters inconsistent", clusterCfg->mnodeEqualVnodeNum, htonl(tsMnodeEqualVnodeNum));
return false;
mError("\"mnodeEqualVnodeNum\"[%d - %d] cfg parameters inconsistent", clusterCfg->mnodeEqualVnodeNum,
htonl(tsMnodeEqualVnodeNum));
return TAOS_DN_OFF_MN_EQUAL_VN_NOT_MATCH;
}
if (clusterCfg->offlineThreshold != htonl(tsOfflineThreshold)) {
mError("\"offlineThreshold\"[%d - %d] cfg parameters inconsistent", clusterCfg->offlineThreshold, htonl(tsOfflineThreshold));
return false;
if (clusterCfg->offlineThreshold != htonl(tsOfflineThreshold)) {
mError("\"offlineThreshold\"[%d - %d] cfg parameters inconsistent", clusterCfg->offlineThreshold,
htonl(tsOfflineThreshold));
return TAOS_DN_OFF_OFFLINE_THRESHOLD_NOT_MATCH;
}
if (clusterCfg->statusInterval != htonl(tsStatusInterval)) {
mError("\"statusInterval\"[%d - %d] cfg parameters inconsistent", clusterCfg->statusInterval, htonl(tsStatusInterval));
return false;
if (clusterCfg->statusInterval != htonl(tsStatusInterval)) {
mError("\"statusInterval\"[%d - %d] cfg parameters inconsistent", clusterCfg->statusInterval,
htonl(tsStatusInterval));
return TAOS_DN_OFF_STATUS_INTERVAL_NOT_MATCH;
}
if (clusterCfg->maxtablesPerVnode != htonl(tsMaxTablePerVnode)) {
mError("\"maxTablesPerVnode\"[%d - %d] cfg parameters inconsistent", clusterCfg->maxtablesPerVnode, htonl(tsMaxTablePerVnode));
return false;
if (clusterCfg->maxtablesPerVnode != htonl(tsMaxTablePerVnode)) {
mError("\"maxTablesPerVnode\"[%d - %d] cfg parameters inconsistent", clusterCfg->maxtablesPerVnode,
htonl(tsMaxTablePerVnode));
return TAOS_DN_OFF_MAX_TAB_PER_VN_NOT_MATCH;
}
if (clusterCfg->maxVgroupsPerDb != htonl(tsMaxVgroupsPerDb)) {
mError("\"maxVgroupsPerDb\"[%d - %d] cfg parameters inconsistent", clusterCfg->maxVgroupsPerDb, htonl(tsMaxVgroupsPerDb));
return false;
if (clusterCfg->maxVgroupsPerDb != htonl(tsMaxVgroupsPerDb)) {
mError("\"maxVgroupsPerDb\"[%d - %d] cfg parameters inconsistent", clusterCfg->maxVgroupsPerDb,
htonl(tsMaxVgroupsPerDb));
return TAOS_DN_OFF_MAX_VG_PER_DB_NOT_MATCH;
}
if (0 != strncasecmp(clusterCfg->arbitrator, tsArbitrator, strlen(tsArbitrator))) {
mError("\"arbitrator\"[%s - %s] cfg parameters inconsistent", clusterCfg->arbitrator, tsArbitrator);
return false;
return TAOS_DN_OFF_ARBITRATOR_NOT_MATCH;
}
int64_t checkTime = 0;
char timestr[32] = "1970-01-01 00:00:00.00";
char timestr[32] = "1970-01-01 00:00:00.00";
(void)taosParseTime(timestr, &checkTime, strlen(timestr), TSDB_TIME_PRECISION_MILLI, 0);
if ((0 != strncasecmp(clusterCfg->timezone, tsTimezone, strlen(tsTimezone))) && (checkTime != clusterCfg->checkTime)) {
mError("\"timezone\"[%s - %s] [%" PRId64 " - %" PRId64"] cfg parameters inconsistent", clusterCfg->timezone, tsTimezone, clusterCfg->checkTime, checkTime);
return false;
if ((0 != strncasecmp(clusterCfg->timezone, tsTimezone, strlen(tsTimezone))) &&
(checkTime != clusterCfg->checkTime)) {
mError("\"timezone\"[%s - %s] [%" PRId64 " - %" PRId64 "] cfg parameters inconsistent", clusterCfg->timezone,
tsTimezone, clusterCfg->checkTime, checkTime);
return TAOS_DN_OFF_TIME_ZONE_NOT_MATCH;
}
if (0 != strncasecmp(clusterCfg->locale, tsLocale, strlen(tsLocale))) {
mError("\"locale\"[%s - %s] cfg parameters inconsistent", clusterCfg->locale, tsLocale);
return false;
return TAOS_DN_OFF_LOCALE_NOT_MATCH;
}
if (0 != strncasecmp(clusterCfg->charset, tsCharset, strlen(tsCharset))) {
mError("\"charset\"[%s - %s] cfg parameters inconsistent.", clusterCfg->charset, tsCharset);
return false;
return TAOS_DN_OFF_CHARSET_NOT_MATCH;
}
return true;
return 0;
}
static int32_t mnodeProcessDnodeStatusMsg(SMnodeMsg *pMsg) {
SDnodeObj *pDnode = NULL;
SDMStatusMsg *pStatus = pMsg->rpcMsg.pCont;
pStatus->dnodeId = htonl(pStatus->dnodeId);
pStatus->moduleStatus = htonl(pStatus->moduleStatus);
pStatus->lastReboot = htonl(pStatus->lastReboot);
pStatus->numOfCores = htons(pStatus->numOfCores);
uint32_t version = htonl(pStatus->version);
if (version != tsVersion) {
mError("status msg version:%d not equal with mnode:%d", version, tsVersion);
pDnode = mnodeGetDnodeByEp(pStatus->dnodeEp);
if (pDnode != NULL && pDnode->status != TAOS_DN_STATUS_READY) {
pDnode->offlineReason = TAOS_DN_OFF_VERSION_NOT_MATCH;
}
mError("dnode:%d, status msg version:%d not equal with cluster:%d", pStatus->dnodeId, version, tsVersion);
return TSDB_CODE_MND_INVALID_MSG_VERSION;
}
SDnodeObj *pDnode = NULL;
if (pStatus->dnodeId == 0) {
pDnode = mnodeGetDnodeByEp(pStatus->dnodeEp);
if (pDnode == NULL) {
......@@ -411,7 +445,11 @@ static int32_t mnodeProcessDnodeStatusMsg(SMnodeMsg *pMsg) {
} else {
pDnode = mnodeGetDnode(pStatus->dnodeId);
if (pDnode == NULL) {
mError("dnode id:%d, %s not exist", pStatus->dnodeId, pStatus->dnodeEp);
pDnode = mnodeGetDnodeByEp(pStatus->dnodeEp);
if (pDnode != NULL && pDnode->status != TAOS_DN_STATUS_READY) {
pDnode->offlineReason = TAOS_DN_OFF_DNODE_ID_NOT_MATCH;
}
mError("dnode:%d, %s not exist", pStatus->dnodeId, pStatus->dnodeEp);
return TSDB_CODE_MND_DNODE_NOT_EXIST;
}
}
......@@ -426,6 +464,9 @@ static int32_t mnodeProcessDnodeStatusMsg(SMnodeMsg *pMsg) {
mDebug("dnode:%d %s, first access, set clusterId %s", pDnode->dnodeId, pDnode->dnodeEp, mnodeGetClusterId());
} else {
if (strncmp(pStatus->clusterId, mnodeGetClusterId(), TSDB_CLUSTER_ID_LEN - 1) != 0) {
if (pDnode != NULL && pDnode->status != TAOS_DN_STATUS_READY) {
pDnode->offlineReason = TAOS_DN_OFF_CLUSTER_ID_NOT_MATCH;
}
mError("dnode:%d, input clusterId %s not match with exist %s", pDnode->dnodeId, pStatus->clusterId,
mnodeGetClusterId());
return TSDB_CODE_MND_INVALID_CLUSTER_ID;
......@@ -469,16 +510,19 @@ static int32_t mnodeProcessDnodeStatusMsg(SMnodeMsg *pMsg) {
if (pDnode->status == TAOS_DN_STATUS_OFFLINE) {
// Verify whether the cluster parameters are consistent when status change from offline to ready
bool ret = mnodeCheckClusterCfgPara(&(pStatus->clusterCfg));
if (false == ret) {
int32_t ret = mnodeCheckClusterCfgPara(&(pStatus->clusterCfg));
if (0 != ret) {
pDnode->offlineReason = ret;
mnodeDecDnodeRef(pDnode);
rpcFreeCont(pRsp);
mError("dnode:%d, %s cluster cfg parameters inconsistent", pDnode->dnodeId, pStatus->dnodeEp);
mError("dnode:%d, %s cluster cfg parameters inconsistent, reason:%s", pDnode->dnodeId, pStatus->dnodeEp,
offlineReason[ret]);
return TSDB_CODE_MND_CLUSTER_CFG_INCONSISTENT;
}
mDebug("dnode:%d, from offline to online", pDnode->dnodeId);
pDnode->status = TAOS_DN_STATUS_READY;
pDnode->offlineReason = TAOS_DN_OFF_ONLINE;
balanceSyncNotify();
balanceAsyncNotify();
}
......@@ -529,6 +573,7 @@ static int32_t mnodeCreateDnode(char *ep, SMnodeMsg *pMsg) {
pDnode = (SDnodeObj *) calloc(1, sizeof(SDnodeObj));
pDnode->createdTime = taosGetTimestampMs();
pDnode->status = TAOS_DN_STATUS_OFFLINE;
pDnode->offlineReason = TAOS_DN_OFF_STATUS_NOT_RECEIVED;
tstrncpy(pDnode->dnodeEp, ep, TSDB_EP_LEN);
taosGetFqdnPortFromEp(ep, pDnode->dnodeFqdn, &pDnode->dnodePort);
......@@ -654,13 +699,13 @@ static int32_t mnodeGetDnodeMeta(STableMetaMsg *pMeta, SShowObj *pShow, void *pC
pSchema[cols].bytes = htons(pShow->bytes[cols]);
cols++;
pShow->bytes[cols] = 12 + VARSTR_HEADER_SIZE;
pShow->bytes[cols] = 10 + VARSTR_HEADER_SIZE;
pSchema[cols].type = TSDB_DATA_TYPE_BINARY;
strcpy(pSchema[cols].name, "status");
pSchema[cols].bytes = htons(pShow->bytes[cols]);
cols++;
pShow->bytes[cols] = 6 + VARSTR_HEADER_SIZE;
pShow->bytes[cols] = 5 + VARSTR_HEADER_SIZE;
pSchema[cols].type = TSDB_DATA_TYPE_BINARY;
strcpy(pSchema[cols].name, "role");
pSchema[cols].bytes = htons(pShow->bytes[cols]);
......@@ -672,6 +717,12 @@ static int32_t mnodeGetDnodeMeta(STableMetaMsg *pMeta, SShowObj *pShow, void *pC
pSchema[cols].bytes = htons(pShow->bytes[cols]);
cols++;
pShow->bytes[cols] = 24 + VARSTR_HEADER_SIZE;
pSchema[cols].type = TSDB_DATA_TYPE_BINARY;
strcpy(pSchema[cols].name, "offline reason");
pSchema[cols].bytes = htons(pShow->bytes[cols]);
cols++;
pMeta->numOfColumns = htons(cols);
pShow->numOfColumns = cols;
......@@ -731,8 +782,11 @@ static int32_t mnodeRetrieveDnodes(SShowObj *pShow, char *data, int32_t rows, vo
*(int64_t *)pWrite = pDnode->createdTime;
cols++;
numOfRows++;
pWrite = data + pShow->offset[cols] * rows + pShow->bytes[cols] * numOfRows;
STR_TO_VARSTR(pWrite, offlineReason[pDnode->offlineReason]);
cols++;
numOfRows++;
mnodeDecDnodeRef(pDnode);
}
......
......@@ -278,6 +278,7 @@ cd ../../../debug; make
./test.sh -f unique/dnode/balancex.sim
./test.sh -f unique/dnode/offline1.sim
./test.sh -f unique/dnode/offline2.sim
./test.sh -f unique/dnode/reason.sim
./test.sh -f unique/dnode/remove1.sim
./test.sh -f unique/dnode/remove2.sim
./test.sh -f unique/dnode/vnode_clean.sim
......
# Checks the "offline reason" text reported by "show dnodes" for several
# situations in which a dnode cannot come online.
# NOTE(review): $data7_N is presumably the offline-reason column of the row for
# dnode id N - confirm against the sim runner's variable lookup.
system sh/stop_dnodes.sh
system sh/deploy.sh -n dnode1 -i 1
system sh/deploy.sh -n dnode2 -i 2
# step1: dnode2 is registered in the cluster but its process is not started yet
print ========== step1
system sh/exec.sh -n dnode1 -s start
sleep 3000
sql connect
sql create dnode $hostname2
sql show dnodes
print dnode1 off: $data7_1
print dnode2 off: $data7_2
if $data7_2 != @status not received@ then
return -1
endi
# step2: start dnode2; the reasons are only printed here, nothing is asserted
print ========== step2
system sh/exec.sh -n dnode2 -s start
sleep 3000
sql show dnodes
print dnode1 off: $data7_1
print dnode2 off: $data7_2
# step3: stop dnode2 and expect the timeout reason after the wait
print ========== step3
system sh/exec.sh -n dnode2 -s stop
sleep 3000
sql show dnodes
print dnode1 off: $data7_1
print dnode2 off: $data7_2
if $data7_2 != @status msg timeout@ then
return -1
endi
# step4: drop dnode2; only one dnode row must remain
print ========== step4
sql drop dnode $hostname2
sleep 5000
sql show dnodes
if $rows != 1 then
return -1
endi
# step5: restart the dropped dnode2 and register the same endpoint again;
# the check reads the row keyed by 3 (presumably the newly assigned dnode id - confirm)
print ========== step5
system sh/exec.sh -n dnode2 -s start
sql create dnode $hostname2
sleep 3000
sql show dnodes
print dnode1 off: $data7_1
print dnode2 off: $data7_3
if $data7_3 != @dnodeId not match@ then
return -1
endi
# step6 - step10: each step deploys a fresh dnode with exactly one overridden
# config option and expects the matching "... not match" reason
# step6: mnodeEqualVnodeNum override
print ========== step6
system sh/deploy.sh -n dnode4 -i 4
system sh/cfg.sh -n dnode4 -c mnodeEqualVnodeNum -v 3
system sh/exec.sh -n dnode4 -s start
sql create dnode $hostname4
sleep 3000
sql show dnodes
print dnode1 off: $data7_1
print dnode4 off: $data7_4
if $data7_4 != @mnEqualVn not match@ then
return -1
endi
# step7: statusInterval override
print ========== step7
system sh/deploy.sh -n dnode5 -i 5
system sh/cfg.sh -n dnode5 -c statusInterval -v 3
system sh/exec.sh -n dnode5 -s start
sql create dnode $hostname5
sleep 3000
sql show dnodes
print dnode1 off: $data7_1
print dnode5 off: $data7_5
if $data7_5 != @interval not match@ then
return -1
endi
# step8: balance override
print ========== step8
system sh/deploy.sh -n dnode6 -i 6
system sh/cfg.sh -n dnode6 -c balance -v 0
system sh/exec.sh -n dnode6 -s start
sql create dnode $hostname6
sleep 3000
sql show dnodes
print dnode1 off: $data7_1
print dnode6 off: $data7_6
if $data7_6 != @balance not match@ then
return -1
endi
# step9: maxTablesPerVnode override
print ========== step9
system sh/deploy.sh -n dnode7 -i 7
system sh/cfg.sh -n dnode7 -c maxTablesPerVnode -v 3000
system sh/exec.sh -n dnode7 -s start
sql create dnode $hostname7
sleep 3000
sql show dnodes
print dnode1 off: $data7_1
print dnode7 off: $data7_7
if $data7_7 != @maxTabPerVn not match@ then
return -1
endi
# step10: maxVgroupsPerDb override
print ========== step10
system sh/deploy.sh -n dnode8 -i 8
system sh/cfg.sh -n dnode8 -c maxVgroupsPerDb -v 3
system sh/exec.sh -n dnode8 -s start
sql create dnode $hostname8
sleep 3000
sql show dnodes
print dnode1 off: $data7_1
print dnode8 off: $data7_8
if $data7_8 != @maxVgPerDb not match@ then
return -1
endi
# Cleanup: stop every dnode name used by the test
# (dnode3 is stopped as well although this script never deploys it)
system sh/exec.sh -n dnode1 -s stop -x SIGINT
system sh/exec.sh -n dnode2 -s stop -x SIGINT
system sh/exec.sh -n dnode3 -s stop -x SIGINT
system sh/exec.sh -n dnode4 -s stop -x SIGINT
system sh/exec.sh -n dnode5 -s stop -x SIGINT
system sh/exec.sh -n dnode6 -s stop -x SIGINT
system sh/exec.sh -n dnode7 -s stop -x SIGINT
system sh/exec.sh -n dnode8 -s stop -x SIGINT
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册