提交 ed70febb 编写于 作者: S Shengliang Guan

[TD-2653]<fix>: cluster cannot recover after mnode creation failure

上级 65b4b8f0
......@@ -23,8 +23,7 @@ extern "C" {
int32_t dnodeInitModules();
void dnodeCleanupModules();
bool dnodeStartMnode(SMInfos *pMinfos);
void dnodeProcessModuleStatus(uint32_t moduleStatus);
int32_t dnodeStartMnode(SMInfos *pMinfos);
#ifdef __cplusplus
}
......
......@@ -127,14 +127,16 @@ int32_t dnodeInitModules() {
return dnodeStartModules();
}
void dnodeProcessModuleStatus(uint32_t moduleStatus) {
int32_t dnodeProcessModuleStatus(uint32_t moduleStatus) {
int32_t code = 0;
for (int32_t module = TSDB_MOD_MNODE; module < TSDB_MOD_HTTP; ++module) {
bool enableModule = moduleStatus & (1 << module);
if (!tsModule[module].enable && enableModule) {
dInfo("module status:%u is set, start %s module", moduleStatus, tsModule[module].name);
tsModule[module].enable = true;
dnodeSetModuleStatus(module);
(*tsModule[module].startFp)();
code = (*tsModule[module].startFp)();
}
if (tsModule[module].enable && !enableModule) {
......@@ -144,21 +146,29 @@ void dnodeProcessModuleStatus(uint32_t moduleStatus) {
(*tsModule[module].stopFp)();
}
}
}
bool dnodeStartMnode(SMInfos *pMinfos) {
SMInfos *pMnodes = pMinfos;
return code;
}
/*
 * Start the mnode module on this dnode and hand the mnode list to sdb sync.
 *
 * pMinfos: mnode list, forwarded unchanged to sdbUpdateSync().
 *
 * Returns 0 when the mnode bit is already set in tsModuleStatus or when both
 * the module start and the sync update succeed; otherwise the non-zero code
 * reported by dnodeProcessModuleStatus() or sdbUpdateSync().
 */
int32_t dnodeStartMnode(SMInfos *pMinfos) {
  if (tsModuleStatus & (1 << TSDB_MOD_MNODE)) {
    dDebug("mnode module is already started, module status:%d", tsModuleStatus);
    return 0;
  }

  uint32_t moduleStatus = tsModuleStatus | (1 << TSDB_MOD_MNODE);
  dInfo("start mnode module, module status:%d, new status:%d", tsModuleStatus, moduleStatus);

  // Start the module exactly once; only touch the sync config if the start succeeded.
  int32_t code = dnodeProcessModuleStatus(moduleStatus);
  if (code == 0) {
    code = sdbUpdateSync(pMinfos);
  }

  if (code != 0) {
    dError("failed to start mnode module since %s", tstrerror(code));
    // Roll back by requesting a status without the mnode bit, which drives the
    // stop branch of dnodeProcessModuleStatus(). NOTE(review): presumably this
    // is what lets a later create-mnode request retry cleanly - confirm.
    moduleStatus = tsModuleStatus & ~(1 << TSDB_MOD_MNODE);
    dnodeProcessModuleStatus(moduleStatus);
  }

  return code;
}
......@@ -214,7 +214,5 @@ static int32_t dnodeProcessCreateMnodeMsg(SRpcMsg *pMsg) {
dDebug("mnode index:%d, mnode:%d:%s", i, pCfg->mnodes.mnodeInfos[i].mnodeId, pCfg->mnodes.mnodeInfos[i].mnodeEp);
}
dnodeStartMnode(&pCfg->mnodes);
return TSDB_CODE_SUCCESS;
return dnodeStartMnode(&pCfg->mnodes);
}
......@@ -40,7 +40,7 @@ void dnodeGetClusterId(char *clusterId);
void dnodeUpdateEp(int32_t dnodeId, char *ep, char *fqdn, uint16_t *port);
bool dnodeCheckEpChanged(int32_t dnodeId, char *epstr);
bool dnodeStartMnode(SMInfos *pMinfos);
int32_t dnodeStartMnode(SMInfos *pMinfos);
void dnodeAddClientRspHandle(uint8_t msgType, void (*fp)(SRpcMsg *rpcMsg));
void dnodeSendMsgToDnode(SRpcEpSet *epSet, SRpcMsg *rpcMsg);
......
......@@ -65,7 +65,7 @@ int32_t mnodeStartSystem();
void mnodeCleanupSystem();
void mnodeStopSystem();
void sdbUpdateAsync();
void sdbUpdateSync(void *pMnodes);
int32_t sdbUpdateSync(void *pMnodes);
bool mnodeIsRunning();
int32_t mnodeProcessRead(SMnodeMsg *pMsg);
int32_t mnodeProcessWrite(SMnodeMsg *pMsg);
......
......@@ -126,6 +126,11 @@ TAOS_DEFINE_ERROR(TSDB_CODE_MND_INVALID_SHOWOBJ, 0, 0x030B, "Data expir
TAOS_DEFINE_ERROR(TSDB_CODE_MND_INVALID_QUERY_ID, 0, 0x030C, "Invalid query id")
TAOS_DEFINE_ERROR(TSDB_CODE_MND_INVALID_STREAM_ID, 0, 0x030D, "Invalid stream id")
TAOS_DEFINE_ERROR(TSDB_CODE_MND_INVALID_CONN_ID, 0, 0x030E, "Invalid connection id")
// Error codes returned while starting the mnode module or updating its sync configuration
TAOS_DEFINE_ERROR(TSDB_CODE_MND_MNODE_IS_RUNNING, 0, 0x0310, "mnode is already running")
TAOS_DEFINE_ERROR(TSDB_CODE_MND_FAILED_TO_CONFIG_SYNC, 0, 0x0311, "failed to config sync")
TAOS_DEFINE_ERROR(TSDB_CODE_MND_FAILED_TO_START_SYNC, 0, 0x0312, "failed to start sync")
TAOS_DEFINE_ERROR(TSDB_CODE_MND_FAILED_TO_CREATE_DIR, 0, 0x0313, "failed to create mnode dir")
TAOS_DEFINE_ERROR(TSDB_CODE_MND_FAILED_TO_INIT_STEP, 0, 0x0314, "failed to init components")
TAOS_DEFINE_ERROR(TSDB_CODE_MND_SDB_OBJ_ALREADY_THERE, 0, 0x0320, "Object already there")
TAOS_DEFINE_ERROR(TSDB_CODE_MND_SDB_ERROR, 0, 0x0321, "Unexpected generic error in sdb")
......
......@@ -74,13 +74,13 @@ static int32_t mnodeInitComponents() {
int32_t mnodeStartSystem() {
if (tsMgmtIsRunning) {
mInfo("mnode module already started...");
return 0;
return TSDB_CODE_SUCCESS;
}
mInfo("starting to initialize mnode ...");
if (mkdir(tsMnodeDir, 0755) != 0 && errno != EEXIST) {
mError("failed to init mnode dir:%s, reason:%s", tsMnodeDir, strerror(errno));
return -1;
return TSDB_CODE_MND_FAILED_TO_CREATE_DIR;
}
dnodeAllocMWritequeue();
......@@ -88,7 +88,7 @@ int32_t mnodeStartSystem() {
dnodeAllocateMPeerQueue();
if (mnodeInitComponents() != 0) {
return -1;
return TSDB_CODE_MND_FAILED_TO_INIT_STEP;
}
dnodeReportStep("mnode-grant", "start to set grant infomation", 0);
......@@ -99,7 +99,7 @@ int32_t mnodeStartSystem() {
sdbUpdateSync(NULL);
return 0;
return TSDB_CODE_SUCCESS;
}
int32_t mnodeInitSystem() {
......
......@@ -318,11 +318,11 @@ void sdbUpdateAsync() {
taosTmrReset(sdbUpdateSyncTmrFp, 200, NULL, tsMnodeTmr, &tsSdbTmr);
}
void sdbUpdateSync(void *pMnodes) {
int32_t sdbUpdateSync(void *pMnodes) {
SMInfos *pMinfos = pMnodes;
if (!mnodeIsRunning()) {
mDebug("vgId:1, mnode not start yet, update sync config later");
return;
return TSDB_CODE_MND_MNODE_IS_RUNNING;
}
mDebug("vgId:1, update sync config, pMnodes:%p", pMnodes);
......@@ -377,12 +377,12 @@ void sdbUpdateSync(void *pMnodes) {
if (!hasThisDnode) {
sdbDebug("vgId:1, update sync config, this dnode not exist");
return;
return TSDB_CODE_MND_FAILED_TO_CONFIG_SYNC;
}
if (memcmp(&syncCfg, &tsSdbMgmt.cfg, sizeof(SSyncCfg)) == 0) {
sdbDebug("vgId:1, update sync config, info not changed");
return;
return TSDB_CODE_SUCCESS;
}
sdbInfo("vgId:1, work as mnode, replica:%d", syncCfg.replica);
......@@ -407,12 +407,15 @@ void sdbUpdateSync(void *pMnodes) {
tsSdbMgmt.cfg = syncCfg;
if (tsSdbMgmt.sync) {
syncReconfig(tsSdbMgmt.sync, &syncCfg);
int32_t code = syncReconfig(tsSdbMgmt.sync, &syncCfg);
if (code != 0) return code;
} else {
tsSdbMgmt.sync = syncStart(&syncInfo);
if (tsSdbMgmt.sync <= 0) return TSDB_CODE_MND_FAILED_TO_START_SYNC;
}
sdbUpdateMnodeRoles();
return TSDB_CODE_SUCCESS;
}
int32_t sdbInitRef() {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册