未验证 提交 d86887b3 编写于 作者: S Shengliang Guan 提交者: GitHub

Merge pull request #4837 from taosdata/feature/sim

[TD-2653]<fix>: cluster cannot recover after mnode creation failure
...@@ -23,8 +23,7 @@ extern "C" { ...@@ -23,8 +23,7 @@ extern "C" {
int32_t dnodeInitModules(); int32_t dnodeInitModules();
void dnodeCleanupModules(); void dnodeCleanupModules();
bool dnodeStartMnode(SMInfos *pMinfos); int32_t dnodeStartMnode(SMInfos *pMinfos);
void dnodeProcessModuleStatus(uint32_t moduleStatus);
#ifdef __cplusplus #ifdef __cplusplus
} }
......
...@@ -127,14 +127,16 @@ int32_t dnodeInitModules() { ...@@ -127,14 +127,16 @@ int32_t dnodeInitModules() {
return dnodeStartModules(); return dnodeStartModules();
} }
void dnodeProcessModuleStatus(uint32_t moduleStatus) { int32_t dnodeProcessModuleStatus(uint32_t moduleStatus) {
int32_t code = 0;
for (int32_t module = TSDB_MOD_MNODE; module < TSDB_MOD_HTTP; ++module) { for (int32_t module = TSDB_MOD_MNODE; module < TSDB_MOD_HTTP; ++module) {
bool enableModule = moduleStatus & (1 << module); bool enableModule = moduleStatus & (1 << module);
if (!tsModule[module].enable && enableModule) { if (!tsModule[module].enable && enableModule) {
dInfo("module status:%u is set, start %s module", moduleStatus, tsModule[module].name); dInfo("module status:%u is set, start %s module", moduleStatus, tsModule[module].name);
tsModule[module].enable = true; tsModule[module].enable = true;
dnodeSetModuleStatus(module); dnodeSetModuleStatus(module);
(*tsModule[module].startFp)(); code = (*tsModule[module].startFp)();
} }
if (tsModule[module].enable && !enableModule) { if (tsModule[module].enable && !enableModule) {
...@@ -144,21 +146,29 @@ void dnodeProcessModuleStatus(uint32_t moduleStatus) { ...@@ -144,21 +146,29 @@ void dnodeProcessModuleStatus(uint32_t moduleStatus) {
(*tsModule[module].stopFp)(); (*tsModule[module].stopFp)();
} }
} }
}
bool dnodeStartMnode(SMInfos *pMinfos) { return code;
SMInfos *pMnodes = pMinfos; }
int32_t dnodeStartMnode(SMInfos *pMinfos) {
if (tsModuleStatus & (1 << TSDB_MOD_MNODE)) { if (tsModuleStatus & (1 << TSDB_MOD_MNODE)) {
dDebug("mnode module is already started, module status:%d", tsModuleStatus); dDebug("mnode module is already started, module status:%d", tsModuleStatus);
return false; return 0;
} }
uint32_t moduleStatus = tsModuleStatus | (1 << TSDB_MOD_MNODE); uint32_t moduleStatus = tsModuleStatus | (1 << TSDB_MOD_MNODE);
dInfo("start mnode module, module status:%d, new status:%d", tsModuleStatus, moduleStatus); dInfo("start mnode module, module status:%d, new status:%d", tsModuleStatus, moduleStatus);
dnodeProcessModuleStatus(moduleStatus);
sdbUpdateSync(pMnodes); int32_t code = dnodeProcessModuleStatus(moduleStatus);
if (code == 0) {
code = sdbUpdateSync(pMinfos);
}
if (code != 0) {
dError("failed to start mnode module since %s", tstrerror(code));
moduleStatus = tsModuleStatus & ~(1 << TSDB_MOD_MNODE);
dnodeProcessModuleStatus(moduleStatus);
}
return true; return code;
} }
...@@ -214,7 +214,5 @@ static int32_t dnodeProcessCreateMnodeMsg(SRpcMsg *pMsg) { ...@@ -214,7 +214,5 @@ static int32_t dnodeProcessCreateMnodeMsg(SRpcMsg *pMsg) {
dDebug("mnode index:%d, mnode:%d:%s", i, pCfg->mnodes.mnodeInfos[i].mnodeId, pCfg->mnodes.mnodeInfos[i].mnodeEp); dDebug("mnode index:%d, mnode:%d:%s", i, pCfg->mnodes.mnodeInfos[i].mnodeId, pCfg->mnodes.mnodeInfos[i].mnodeEp);
} }
dnodeStartMnode(&pCfg->mnodes); return dnodeStartMnode(&pCfg->mnodes);
return TSDB_CODE_SUCCESS;
} }
...@@ -40,7 +40,7 @@ void dnodeGetClusterId(char *clusterId); ...@@ -40,7 +40,7 @@ void dnodeGetClusterId(char *clusterId);
void dnodeUpdateEp(int32_t dnodeId, char *ep, char *fqdn, uint16_t *port); void dnodeUpdateEp(int32_t dnodeId, char *ep, char *fqdn, uint16_t *port);
bool dnodeCheckEpChanged(int32_t dnodeId, char *epstr); bool dnodeCheckEpChanged(int32_t dnodeId, char *epstr);
bool dnodeStartMnode(SMInfos *pMinfos); int32_t dnodeStartMnode(SMInfos *pMinfos);
void dnodeAddClientRspHandle(uint8_t msgType, void (*fp)(SRpcMsg *rpcMsg)); void dnodeAddClientRspHandle(uint8_t msgType, void (*fp)(SRpcMsg *rpcMsg));
void dnodeSendMsgToDnode(SRpcEpSet *epSet, SRpcMsg *rpcMsg); void dnodeSendMsgToDnode(SRpcEpSet *epSet, SRpcMsg *rpcMsg);
......
...@@ -65,7 +65,7 @@ int32_t mnodeStartSystem(); ...@@ -65,7 +65,7 @@ int32_t mnodeStartSystem();
void mnodeCleanupSystem(); void mnodeCleanupSystem();
void mnodeStopSystem(); void mnodeStopSystem();
void sdbUpdateAsync(); void sdbUpdateAsync();
void sdbUpdateSync(void *pMnodes); int32_t sdbUpdateSync(void *pMnodes);
bool mnodeIsRunning(); bool mnodeIsRunning();
int32_t mnodeProcessRead(SMnodeMsg *pMsg); int32_t mnodeProcessRead(SMnodeMsg *pMsg);
int32_t mnodeProcessWrite(SMnodeMsg *pMsg); int32_t mnodeProcessWrite(SMnodeMsg *pMsg);
......
...@@ -126,6 +126,11 @@ TAOS_DEFINE_ERROR(TSDB_CODE_MND_INVALID_SHOWOBJ, 0, 0x030B, "Data expir ...@@ -126,6 +126,11 @@ TAOS_DEFINE_ERROR(TSDB_CODE_MND_INVALID_SHOWOBJ, 0, 0x030B, "Data expir
TAOS_DEFINE_ERROR(TSDB_CODE_MND_INVALID_QUERY_ID, 0, 0x030C, "Invalid query id") TAOS_DEFINE_ERROR(TSDB_CODE_MND_INVALID_QUERY_ID, 0, 0x030C, "Invalid query id")
TAOS_DEFINE_ERROR(TSDB_CODE_MND_INVALID_STREAM_ID, 0, 0x030D, "Invalid stream id") TAOS_DEFINE_ERROR(TSDB_CODE_MND_INVALID_STREAM_ID, 0, 0x030D, "Invalid stream id")
TAOS_DEFINE_ERROR(TSDB_CODE_MND_INVALID_CONN_ID, 0, 0x030E, "Invalid connection id") TAOS_DEFINE_ERROR(TSDB_CODE_MND_INVALID_CONN_ID, 0, 0x030E, "Invalid connection id")
TAOS_DEFINE_ERROR(TSDB_CODE_MND_MNODE_IS_RUNNING, 0, 0x0310, "mnode is alreay running")
TAOS_DEFINE_ERROR(TSDB_CODE_MND_FAILED_TO_CONFIG_SYNC, 0, 0x0311, "failed to config sync")
TAOS_DEFINE_ERROR(TSDB_CODE_MND_FAILED_TO_START_SYNC, 0, 0x0312, "failed to start sync")
TAOS_DEFINE_ERROR(TSDB_CODE_MND_FAILED_TO_CREATE_DIR, 0, 0x0313, "failed to create mnode dir")
TAOS_DEFINE_ERROR(TSDB_CODE_MND_FAILED_TO_INIT_STEP, 0, 0x0314, "failed to init components")
TAOS_DEFINE_ERROR(TSDB_CODE_MND_SDB_OBJ_ALREADY_THERE, 0, 0x0320, "Object already there") TAOS_DEFINE_ERROR(TSDB_CODE_MND_SDB_OBJ_ALREADY_THERE, 0, 0x0320, "Object already there")
TAOS_DEFINE_ERROR(TSDB_CODE_MND_SDB_ERROR, 0, 0x0321, "Unexpected generic error in sdb") TAOS_DEFINE_ERROR(TSDB_CODE_MND_SDB_ERROR, 0, 0x0321, "Unexpected generic error in sdb")
......
...@@ -74,13 +74,13 @@ static int32_t mnodeInitComponents() { ...@@ -74,13 +74,13 @@ static int32_t mnodeInitComponents() {
int32_t mnodeStartSystem() { int32_t mnodeStartSystem() {
if (tsMgmtIsRunning) { if (tsMgmtIsRunning) {
mInfo("mnode module already started..."); mInfo("mnode module already started...");
return 0; return TSDB_CODE_SUCCESS;
} }
mInfo("starting to initialize mnode ..."); mInfo("starting to initialize mnode ...");
if (mkdir(tsMnodeDir, 0755) != 0 && errno != EEXIST) { if (mkdir(tsMnodeDir, 0755) != 0 && errno != EEXIST) {
mError("failed to init mnode dir:%s, reason:%s", tsMnodeDir, strerror(errno)); mError("failed to init mnode dir:%s, reason:%s", tsMnodeDir, strerror(errno));
return -1; return TSDB_CODE_MND_FAILED_TO_CREATE_DIR;
} }
dnodeAllocMWritequeue(); dnodeAllocMWritequeue();
...@@ -88,7 +88,7 @@ int32_t mnodeStartSystem() { ...@@ -88,7 +88,7 @@ int32_t mnodeStartSystem() {
dnodeAllocateMPeerQueue(); dnodeAllocateMPeerQueue();
if (mnodeInitComponents() != 0) { if (mnodeInitComponents() != 0) {
return -1; return TSDB_CODE_MND_FAILED_TO_INIT_STEP;
} }
dnodeReportStep("mnode-grant", "start to set grant infomation", 0); dnodeReportStep("mnode-grant", "start to set grant infomation", 0);
...@@ -99,7 +99,7 @@ int32_t mnodeStartSystem() { ...@@ -99,7 +99,7 @@ int32_t mnodeStartSystem() {
sdbUpdateSync(NULL); sdbUpdateSync(NULL);
return 0; return TSDB_CODE_SUCCESS;
} }
int32_t mnodeInitSystem() { int32_t mnodeInitSystem() {
......
...@@ -318,11 +318,11 @@ void sdbUpdateAsync() { ...@@ -318,11 +318,11 @@ void sdbUpdateAsync() {
taosTmrReset(sdbUpdateSyncTmrFp, 200, NULL, tsMnodeTmr, &tsSdbTmr); taosTmrReset(sdbUpdateSyncTmrFp, 200, NULL, tsMnodeTmr, &tsSdbTmr);
} }
void sdbUpdateSync(void *pMnodes) { int32_t sdbUpdateSync(void *pMnodes) {
SMInfos *pMinfos = pMnodes; SMInfos *pMinfos = pMnodes;
if (!mnodeIsRunning()) { if (!mnodeIsRunning()) {
mDebug("vgId:1, mnode not start yet, update sync config later"); mDebug("vgId:1, mnode not start yet, update sync config later");
return; return TSDB_CODE_MND_MNODE_IS_RUNNING;
} }
mDebug("vgId:1, update sync config, pMnodes:%p", pMnodes); mDebug("vgId:1, update sync config, pMnodes:%p", pMnodes);
...@@ -377,12 +377,12 @@ void sdbUpdateSync(void *pMnodes) { ...@@ -377,12 +377,12 @@ void sdbUpdateSync(void *pMnodes) {
if (!hasThisDnode) { if (!hasThisDnode) {
sdbDebug("vgId:1, update sync config, this dnode not exist"); sdbDebug("vgId:1, update sync config, this dnode not exist");
return; return TSDB_CODE_MND_FAILED_TO_CONFIG_SYNC;
} }
if (memcmp(&syncCfg, &tsSdbMgmt.cfg, sizeof(SSyncCfg)) == 0) { if (memcmp(&syncCfg, &tsSdbMgmt.cfg, sizeof(SSyncCfg)) == 0) {
sdbDebug("vgId:1, update sync config, info not changed"); sdbDebug("vgId:1, update sync config, info not changed");
return; return TSDB_CODE_SUCCESS;
} }
sdbInfo("vgId:1, work as mnode, replica:%d", syncCfg.replica); sdbInfo("vgId:1, work as mnode, replica:%d", syncCfg.replica);
...@@ -407,12 +407,15 @@ void sdbUpdateSync(void *pMnodes) { ...@@ -407,12 +407,15 @@ void sdbUpdateSync(void *pMnodes) {
tsSdbMgmt.cfg = syncCfg; tsSdbMgmt.cfg = syncCfg;
if (tsSdbMgmt.sync) { if (tsSdbMgmt.sync) {
syncReconfig(tsSdbMgmt.sync, &syncCfg); int32_t code = syncReconfig(tsSdbMgmt.sync, &syncCfg);
if (code != 0) return code;
} else { } else {
tsSdbMgmt.sync = syncStart(&syncInfo); tsSdbMgmt.sync = syncStart(&syncInfo);
if (tsSdbMgmt.sync <= 0) return TSDB_CODE_MND_FAILED_TO_START_SYNC;
} }
sdbUpdateMnodeRoles(); sdbUpdateMnodeRoles();
return TSDB_CODE_SUCCESS;
} }
int32_t sdbInitRef() { int32_t sdbInitRef() {
......
...@@ -375,6 +375,8 @@ int32_t syncReconfig(int64_t rid, const SSyncCfg *pNewCfg) { ...@@ -375,6 +375,8 @@ int32_t syncReconfig(int64_t rid, const SSyncCfg *pNewCfg) {
} }
int32_t syncForwardToPeer(int64_t rid, void *data, void *mhandle, int32_t qtype) { int32_t syncForwardToPeer(int64_t rid, void *data, void *mhandle, int32_t qtype) {
if (rid <= 0) return 0;
SSyncNode *pNode = syncAcquireNode(rid); SSyncNode *pNode = syncAcquireNode(rid);
if (pNode == NULL) return 0; if (pNode == NULL) return 0;
......
...@@ -286,27 +286,27 @@ system sh/exec.sh -n dnode2 -s stop -x SIGINT ...@@ -286,27 +286,27 @@ system sh/exec.sh -n dnode2 -s stop -x SIGINT
sql reset query cache sql reset query cache
sleep 100 sleep 100
sql insert into d1.t1 values(now, 4) -x step1 #sql insert into d1.t1 values(now, 4) -x step1
step1: #step1:
sql insert into d2.t2 values(now, 4) -x step2 #sql insert into d2.t2 values(now, 4) -x step2
step2: #step2:
sql insert into d3.t3 values(now, 4) -x step3 #sql insert into d3.t3 values(now, 4) -x step3
step3: #step3:
sql insert into d4.t4 values(now, 4) -x step4 #sql insert into d4.t4 values(now, 4) -x step4
step4: #step4:
print ========= step5 print ========= step5
system sh/exec.sh -n dnode2 -s start system sh/exec.sh -n dnode2 -s start
system sh/exec.sh -n dnode3 -s stop -x SIGINT system sh/exec.sh -n dnode3 -s stop -x SIGINT
sql insert into d1.t1 values(now, 5) -x step5 #sql insert into d1.t1 values(now, 5) -x step5
step5: #step5:
sql insert into d2.t2 values(now, 5) -x step6 #sql insert into d2.t2 values(now, 5) -x step6
step6: #step6:
sql insert into d3.t3 values(now, 5) -x step7 #sql insert into d3.t3 values(now, 5) -x step7
step7: #step7:
sql insert into d4.t4 values(now, 5) -x step8 #sql insert into d4.t4 values(now, 5) -x step8
step8: #step8:
print ========= step6 print ========= step6
system sh/exec.sh -n dnode3 -s start system sh/exec.sh -n dnode3 -s start
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册