未验证 提交 969fdd1b 编写于 作者: X Xiaoyu Wang 提交者: GitHub

Merge pull request #20971 from taosdata/feat/learner

add learner
......@@ -1286,6 +1286,9 @@ typedef struct {
int16_t hashSuffix;
int32_t tsdbPageSize;
int64_t reserved[8];
int8_t learnerReplica;
int8_t learnerSelfIndex;
SReplica learnerReplicas[TSDB_MAX_LEARNER_REPLICA];
} SCreateVnodeReq;
int32_t tSerializeSCreateVnodeReq(void* buf, int32_t bufLen, SCreateVnodeReq* pReq);
......@@ -1357,7 +1360,10 @@ typedef struct {
int8_t replica;
SReplica replicas[TSDB_MAX_REPLICA];
int64_t reserved[8];
} SAlterVnodeReplicaReq;
int8_t learnerSelfIndex;
int8_t learnerReplica;
SReplica learnerReplicas[TSDB_MAX_LEARNER_REPLICA];
} SAlterVnodeReplicaReq, SAlterVnodeTypeReq;
int32_t tSerializeSAlterVnodeReplicaReq(void* buf, int32_t bufLen, SAlterVnodeReplicaReq* pReq);
int32_t tDeserializeSAlterVnodeReplicaReq(void* buf, int32_t bufLen, SAlterVnodeReplicaReq* pReq);
......@@ -1629,7 +1635,10 @@ int32_t tDeserializeSCreateDropMQSNodeReq(void* buf, int32_t bufLen, SMCreateQno
typedef struct {
int8_t replica;
SReplica replicas[TSDB_MAX_REPLICA];
} SDCreateMnodeReq, SDAlterMnodeReq;
int8_t learnerReplica;
SReplica learnerReplicas[TSDB_MAX_LEARNER_REPLICA];
int64_t lastIndex;
} SDCreateMnodeReq, SDAlterMnodeReq, SDAlterMnodeTypeReq;
int32_t tSerializeSDCreateMnodeReq(void* buf, int32_t bufLen, SDCreateMnodeReq* pReq);
int32_t tDeserializeSDCreateMnodeReq(void* buf, int32_t bufLen, SDCreateMnodeReq* pReq);
......
......@@ -83,6 +83,8 @@ enum {
TD_DEF_MSG_TYPE(TDMT_DND_CONFIG_DNODE, "config-dnode", NULL, NULL)
TD_DEF_MSG_TYPE(TDMT_DND_SYSTABLE_RETRIEVE, "dnode-retrieve", NULL, NULL)
TD_DEF_MSG_TYPE(TDMT_DND_MAX_MSG, "dnd-max", NULL, NULL)
TD_DEF_MSG_TYPE(TDMT_DND_ALTER_MNODE_TYPE, "dnode-alter-mnode-type", NULL, NULL)
TD_DEF_MSG_TYPE(TDMT_DND_ALTER_VNODE_TYPE, "dnode-alter-vnode-type", NULL, NULL)
TD_NEW_MSG_SEG(TDMT_MND_MSG)
TD_DEF_MSG_TYPE(TDMT_MND_CONNECT, "connect", NULL, NULL)
......
......@@ -33,8 +33,11 @@ typedef struct {
bool deploy;
int8_t selfIndex;
int8_t numOfReplicas;
SReplica replicas[TSDB_MAX_REPLICA];
int8_t numOfTotalReplicas;
SReplica replicas[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA];
int32_t nodeRoles[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA];
SMsgCb msgCb;
int64_t lastIndex;
} SMnodeOpt;
/* ------------------------ SMnode ------------------------ */
......@@ -69,6 +72,8 @@ int32_t mndStart(SMnode *pMnode);
*/
void mndStop(SMnode *pMnode);
int32_t mndIsCatchUp(SMnode *pMnode);
/**
* @brief Get mnode monitor info.
*
......
......@@ -55,6 +55,8 @@ extern "C" {
#define SYNC_INDEX_INVALID -1
#define SYNC_TERM_INVALID -1
#define SYNC_LEARNER_CATCHUP 10
typedef enum {
SYNC_STRATEGY_NO_SNAPSHOT = 0,
SYNC_STRATEGY_STANDARD_SNAPSHOT = 1,
......@@ -76,19 +78,29 @@ typedef enum {
TAOS_SYNC_STATE_CANDIDATE = 101,
TAOS_SYNC_STATE_LEADER = 102,
TAOS_SYNC_STATE_ERROR = 103,
TAOS_SYNC_STATE_LEARNER = 104,
} ESyncState;
typedef enum {
TAOS_SYNC_ROLE_VOTER = 0,
TAOS_SYNC_ROLE_LEARNER = 1,
TAOS_SYNC_ROLE_ERROR = 2,
} ESyncRole;
typedef struct SNodeInfo {
int64_t clusterId;
int32_t nodeId;
uint16_t nodePort;
char nodeFqdn[TSDB_FQDN_LEN];
int64_t clusterId;
int32_t nodeId;
uint16_t nodePort;
char nodeFqdn[TSDB_FQDN_LEN];
ESyncRole nodeRole;
} SNodeInfo;
typedef struct SSyncCfg {
int32_t totalReplicaNum;
int32_t replicaNum;
int32_t myIndex;
SNodeInfo nodeInfo[TSDB_MAX_REPLICA];
SNodeInfo nodeInfo[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA];
SyncIndex lastIndex;
} SSyncCfg;
typedef struct SFsmCbMeta {
......@@ -155,6 +167,7 @@ typedef struct SSyncFSM {
void (*FpBecomeLeaderCb)(const struct SSyncFSM* pFsm);
void (*FpBecomeFollowerCb)(const struct SSyncFSM* pFsm);
void (*FpBecomeLearnerCb)(const struct SSyncFSM* pFsm);
int32_t (*FpGetSnapshot)(const struct SSyncFSM* pFsm, SSnapshot* pSnapshot, void* pReaderParam, void** ppReader);
void (*FpGetSnapshotInfo)(const struct SSyncFSM* pFsm, SSnapshot* pSnapshot);
......@@ -236,6 +249,7 @@ void syncStop(int64_t rid);
void syncPreStop(int64_t rid);
void syncPostStop(int64_t rid);
int32_t syncPropose(int64_t rid, SRpcMsg* pMsg, bool isWeak, int64_t* seq);
int32_t syncIsCatchUp(int64_t rid);
int32_t syncProcessMsg(int64_t rid, SRpcMsg* pMsg);
int32_t syncReconfig(int64_t rid, SSyncCfg* pCfg);
int32_t syncBeginSnapshot(int64_t rid, int64_t lastApplyIndex);
......
......@@ -285,7 +285,7 @@ typedef enum ELogicConditionType {
#define TSDB_DNODE_ROLE_VNODE 2
#define TSDB_MAX_REPLICA 5
#define TSDB_MAX_LEARNER_REPLICA 10
#define TSDB_SYNC_LOG_BUFFER_SIZE 4096
#define TSDB_SYNC_LOG_BUFFER_RETENTION 256
#define TSDB_SYNC_APPLYQ_SIZE_LIMIT 512
......
......@@ -4129,6 +4129,12 @@ int32_t tSerializeSCreateVnodeReq(void *buf, int32_t bufLen, SCreateVnodeReq *pR
for (int32_t i = 0; i < 8; ++i) {
if (tEncodeI64(&encoder, pReq->reserved[i]) < 0) return -1;
}
if (tEncodeI8(&encoder, pReq->learnerReplica) < 0) return -1;
if (tEncodeI8(&encoder, pReq->learnerSelfIndex) < 0) return -1;
for (int32_t i = 0; i < TSDB_MAX_LEARNER_REPLICA; ++i) {
SReplica *pReplica = &pReq->learnerReplicas[i];
if (tEncodeSReplica(&encoder, pReplica) < 0) return -1;
}
tEndEncode(&encoder);
......@@ -4207,6 +4213,14 @@ int32_t tDeserializeSCreateVnodeReq(void *buf, int32_t bufLen, SCreateVnodeReq *
for (int32_t i = 0; i < 8; ++i) {
if (tDecodeI64(&decoder, &pReq->reserved[i]) < 0) return -1;
}
if (!tDecodeIsEnd(&decoder)) {
if (tDecodeI8(&decoder, &pReq->learnerReplica) < 0) return -1;
if (tDecodeI8(&decoder, &pReq->learnerSelfIndex) < 0) return -1;
for (int32_t i = 0; i < TSDB_MAX_LEARNER_REPLICA; ++i) {
SReplica *pReplica = &pReq->learnerReplicas[i];
if (tDecodeSReplica(&decoder, pReplica) < 0) return -1;
}
}
tEndDecode(&decoder);
tDecoderClear(&decoder);
......@@ -4433,6 +4447,12 @@ int32_t tSerializeSAlterVnodeReplicaReq(void *buf, int32_t bufLen, SAlterVnodeRe
for (int32_t i = 0; i < 8; ++i) {
if (tEncodeI64(&encoder, pReq->reserved[i]) < 0) return -1;
}
if (tEncodeI8(&encoder, pReq->learnerSelfIndex) < 0) return -1;
if (tEncodeI8(&encoder, pReq->learnerReplica) < 0) return -1;
for (int32_t i = 0; i < TSDB_MAX_LEARNER_REPLICA; ++i) {
SReplica *pReplica = &pReq->learnerReplicas[i];
if (tEncodeSReplica(&encoder, pReplica) < 0) return -1;
}
tEndEncode(&encoder);
int32_t tlen = encoder.pos;
......@@ -4456,7 +4476,15 @@ int32_t tDeserializeSAlterVnodeReplicaReq(void *buf, int32_t bufLen, SAlterVnode
for (int32_t i = 0; i < 8; ++i) {
if (tDecodeI64(&decoder, &pReq->reserved[i]) < 0) return -1;
}
if (!tDecodeIsEnd(&decoder)) {
if (tDecodeI8(&decoder, &pReq->learnerSelfIndex) < 0) return -1;
if (tDecodeI8(&decoder, &pReq->learnerReplica) < 0) return -1;
for (int32_t i = 0; i < TSDB_MAX_LEARNER_REPLICA; ++i) {
SReplica *pReplica = &pReq->learnerReplicas[i];
if (tDecodeSReplica(&decoder, pReplica) < 0) return -1;
}
}
tEndDecode(&decoder);
tDecoderClear(&decoder);
return 0;
......@@ -4767,6 +4795,12 @@ int32_t tSerializeSDCreateMnodeReq(void *buf, int32_t bufLen, SDCreateMnodeReq *
SReplica *pReplica = &pReq->replicas[i];
if (tEncodeSReplica(&encoder, pReplica) < 0) return -1;
}
if (tEncodeI8(&encoder, pReq->learnerReplica) < 0) return -1;
for (int32_t i = 0; i < TSDB_MAX_LEARNER_REPLICA; ++i) {
SReplica *pReplica = &pReq->learnerReplicas[i];
if (tEncodeSReplica(&encoder, pReplica) < 0) return -1;
}
if (tEncodeI64(&encoder, pReq->lastIndex) < 0) return -1;
tEndEncode(&encoder);
int32_t tlen = encoder.pos;
......@@ -4784,6 +4818,14 @@ int32_t tDeserializeSDCreateMnodeReq(void *buf, int32_t bufLen, SDCreateMnodeReq
SReplica *pReplica = &pReq->replicas[i];
if (tDecodeSReplica(&decoder, pReplica) < 0) return -1;
}
if (!tDecodeIsEnd(&decoder)) {
if (tDecodeI8(&decoder, &pReq->learnerReplica) < 0) return -1;
for (int32_t i = 0; i < TSDB_MAX_LEARNER_REPLICA; ++i) {
SReplica *pReplica = &pReq->learnerReplicas[i];
if (tDecodeSReplica(&decoder, pReplica) < 0) return -1;
}
if (tDecodeI64(&decoder, &pReq->lastIndex) < 0) return -1;
}
tEndDecode(&decoder);
tDecoderClear(&decoder);
......
......@@ -32,6 +32,7 @@ typedef struct SDnodeMgmt {
TdThread crashReportThread;
SSingleWorker mgmtWorker;
ProcessCreateNodeFp processCreateNodeFp;
ProcessAlterNodeTypeFp processAlterNodeTypeFp;
ProcessDropNodeFp processDropNodeFp;
SendMonitorReportFp sendMonitorReportFp;
GetVnodeLoadsFp getVnodeLoadsFp;
......
......@@ -352,6 +352,7 @@ SArray *dmGetMsgHandles() {
if (dmSetMgmtHandle(pArray, TDMT_DND_CONFIG_DNODE, dmPutNodeMsgToMgmtQueue, 0) == NULL) goto _OVER;
if (dmSetMgmtHandle(pArray, TDMT_DND_SERVER_STATUS, dmPutNodeMsgToMgmtQueue, 0) == NULL) goto _OVER;
if (dmSetMgmtHandle(pArray, TDMT_DND_SYSTABLE_RETRIEVE, dmPutNodeMsgToMgmtQueue, 0) == NULL) goto _OVER;
if (dmSetMgmtHandle(pArray, TDMT_DND_ALTER_MNODE_TYPE, dmPutNodeMsgToMgmtQueue, 0) == NULL) goto _OVER;
// Requests handled by MNODE
if (dmSetMgmtHandle(pArray, TDMT_MND_GRANT, dmPutNodeMsgToMgmtQueue, 0) == NULL) goto _OVER;
......
......@@ -48,6 +48,7 @@ static int32_t dmOpenMgmt(SMgmtInputOpt *pInput, SMgmtOutputOpt *pOutput) {
pMgmt->path = pInput->path;
pMgmt->name = pInput->name;
pMgmt->processCreateNodeFp = pInput->processCreateNodeFp;
pMgmt->processAlterNodeTypeFp = pInput->processAlterNodeTypeFp;
pMgmt->processDropNodeFp = pInput->processDropNodeFp;
pMgmt->sendMonitorReportFp = pInput->sendMonitorReportFp;
pMgmt->getVnodeLoadsFp = pInput->getVnodeLoadsFp;
......
......@@ -227,6 +227,9 @@ static void dmProcessMgmtQueue(SQueueInfo *pInfo, SRpcMsg *pMsg) {
case TDMT_DND_DROP_SNODE:
code = (*pMgmt->processDropNodeFp)(SNODE, pMsg);
break;
case TDMT_DND_ALTER_MNODE_TYPE:
code = (*pMgmt->processAlterNodeTypeFp)(MNODE, pMsg);
break;
case TDMT_DND_SERVER_STATUS:
code = dmProcessServerRunStatus(pMgmt, pMsg);
break;
......
......@@ -24,12 +24,16 @@ static int32_t mmDecodeOption(SJson *pJson, SMnodeOpt *pOption) {
if (code < 0) return -1;
tjsonGetInt32ValueFromDouble(pJson, "selfIndex", pOption->selfIndex, code);
if (code < 0) return 0;
tjsonGetInt32ValueFromDouble(pJson, "lastIndex", pOption->lastIndex, code);
if (code < 0) return 0;
SJson *replicas = tjsonGetObjectItem(pJson, "replicas");
if (replicas == NULL) return 0;
pOption->numOfReplicas = tjsonGetArraySize(replicas);
pOption->numOfTotalReplicas = tjsonGetArraySize(replicas);
pOption->numOfReplicas = 0;
for (int32_t i = 0; i < pOption->numOfReplicas; ++i) {
for (int32_t i = 0; i < pOption->numOfTotalReplicas; ++i) {
SJson *replica = tjsonGetArrayItem(replicas, i);
if (replica == NULL) return -1;
......@@ -40,6 +44,14 @@ static int32_t mmDecodeOption(SJson *pJson, SMnodeOpt *pOption) {
if (code < 0) return -1;
tjsonGetUInt16ValueFromDouble(replica, "port", pReplica->port, code);
if (code < 0) return -1;
tjsonGetInt32ValueFromDouble(replica, "role", pOption->nodeRoles[i], code);
if (code < 0) return -1;
if(pOption->nodeRoles[i] == TAOS_SYNC_ROLE_VOTER){
pOption->numOfReplicas++;
}
}
for (int32_t i = 0; i < pOption->numOfTotalReplicas; ++i) {
}
return 0;
......@@ -112,14 +124,14 @@ _OVER:
}
static int32_t mmEncodeOption(SJson *pJson, const SMnodeOpt *pOption) {
if (pOption->deploy && pOption->numOfReplicas > 0) {
if (pOption->deploy && pOption->numOfTotalReplicas > 0) {
if (tjsonAddDoubleToObject(pJson, "selfIndex", pOption->selfIndex) < 0) return -1;
SJson *replicas = tjsonCreateArray();
if (replicas == NULL) return -1;
if (tjsonAddItemToObject(pJson, "replicas", replicas) < 0) return -1;
for (int32_t i = 0; i < pOption->numOfReplicas; ++i) {
for (int32_t i = 0; i < pOption->numOfTotalReplicas; ++i) {
SJson *replica = tjsonCreateObject();
if (replica == NULL) return -1;
......@@ -127,10 +139,13 @@ static int32_t mmEncodeOption(SJson *pJson, const SMnodeOpt *pOption) {
if (tjsonAddDoubleToObject(replica, "id", pReplica->id) < 0) return -1;
if (tjsonAddStringToObject(replica, "fqdn", pReplica->fqdn) < 0) return -1;
if (tjsonAddDoubleToObject(replica, "port", pReplica->port) < 0) return -1;
if (tjsonAddDoubleToObject(replica, "role", pOption->nodeRoles[i]) < 0) return -1;
if (tjsonAddItemToArray(replicas, replica) < 0) return -1;
}
}
if (tjsonAddDoubleToObject(pJson, "lastIndex", pOption->lastIndex) < 0) return -1;
if (tjsonAddDoubleToObject(pJson, "deployed", pOption->deploy) < 0) return -1;
return 0;
......
......@@ -33,12 +33,24 @@ int32_t mmProcessCreateReq(const SMgmtInputOpt *pInput, SRpcMsg *pMsg) {
return -1;
}
SMnodeOpt option = {.deploy = true, .numOfReplicas = createReq.replica, .selfIndex = -1};
SMnodeOpt option = {.deploy = true, .numOfReplicas = createReq.replica,
.numOfTotalReplicas = createReq.replica + createReq.learnerReplica,
.selfIndex = -1, .lastIndex = createReq.lastIndex};
memcpy(option.replicas, createReq.replicas, sizeof(createReq.replicas));
for (int32_t i = 0; i < option.numOfReplicas; ++i) {
for (int32_t i = 0; i < createReq.replica; ++i) {
if (createReq.replicas[i].id == pInput->pData->dnodeId) {
option.selfIndex = i;
}
option.nodeRoles[i] = TAOS_SYNC_ROLE_VOTER;
}
memcpy(&(option.replicas[createReq.replica]), createReq.learnerReplicas, sizeof(createReq.learnerReplicas));
for (int32_t i = 0; i < createReq.learnerReplica; ++i) {
if (createReq.learnerReplicas[i].id == pInput->pData->dnodeId) {
option.selfIndex = createReq.replica + i;
}
option.nodeRoles[createReq.replica + i] = TAOS_SYNC_ROLE_LEARNER;
}
if (option.selfIndex == -1) {
......@@ -92,6 +104,8 @@ SArray *mmGetMsgHandles() {
if (dmSetMgmtHandle(pArray, TDMT_DND_CREATE_VNODE_RSP, mmPutMsgToWriteQueue, 0) == NULL) goto _OVER;
if (dmSetMgmtHandle(pArray, TDMT_DND_DROP_VNODE_RSP, mmPutMsgToWriteQueue, 0) == NULL) goto _OVER;
if (dmSetMgmtHandle(pArray, TDMT_DND_CONFIG_DNODE_RSP, mmPutMsgToReadQueue, 0) == NULL) goto _OVER;
if (dmSetMgmtHandle(pArray, TDMT_DND_ALTER_MNODE_TYPE_RSP, mmPutMsgToReadQueue, 0) == NULL) goto _OVER;
if (dmSetMgmtHandle(pArray, TDMT_DND_ALTER_VNODE_TYPE_RSP, mmPutMsgToReadQueue, 0) == NULL) goto _OVER;
if (dmSetMgmtHandle(pArray, TDMT_MND_CONNECT, mmPutMsgToReadQueue, 0) == NULL) goto _OVER;
if (dmSetMgmtHandle(pArray, TDMT_MND_CREATE_ACCT, mmPutMsgToWriteQueue, 0) == NULL) goto _OVER;
......
......@@ -45,9 +45,11 @@ static void mmBuildOptionForDeploy(SMnodeMgmt *pMgmt, const SMgmtInputOpt *pInpu
pOption->dnodeId = pMgmt->pData->dnodeId;
pOption->selfIndex = 0;
pOption->numOfReplicas = 1;
pOption->numOfTotalReplicas = 1;
pOption->replicas[0].id = 1;
pOption->replicas[0].port = tsServerPort;
tstrncpy(pOption->replicas[0].fqdn, tsLocalFqdn, TSDB_FQDN_LEN);
pOption->lastIndex = SYNC_INDEX_INVALID;
}
static void mmBuildOptionForOpen(SMnodeMgmt *pMgmt, SMnodeOpt *pOption) {
......@@ -123,9 +125,10 @@ static int32_t mmOpen(SMgmtInputOpt *pInput, SMgmtOutputOpt *pOutput) {
}
tmsgReportStartup("mnode-worker", "initialized");
if (option.numOfReplicas > 0) {
if (option.numOfTotalReplicas > 0) {
option.deploy = true;
option.numOfReplicas = 0;
option.numOfTotalReplicas = 0;
if (mmWriteFile(pMgmt->path, &option) != 0) {
dError("failed to write mnode file since %s", terrstr());
return -1;
......@@ -152,6 +155,10 @@ static void mmStop(SMnodeMgmt *pMgmt) {
mndStop(pMgmt->pMnode);
}
static int32_t mmSyncIsCatchUp(SMnodeMgmt *pMgmt) {
return mndIsCatchUp(pMgmt->pMnode);
}
SMgmtFunc mmGetMgmtFunc() {
SMgmtFunc mgmtFunc = {0};
mgmtFunc.openFp = mmOpen;
......@@ -162,6 +169,7 @@ SMgmtFunc mmGetMgmtFunc() {
mgmtFunc.dropFp = (NodeDropFp)mmProcessDropReq;
mgmtFunc.requiredFp = mmRequire;
mgmtFunc.getHandlesFp = mmGetMsgHandles;
mgmtFunc.isCatchUpFp = (NodeIsCatchUpFp)mmSyncIsCatchUp;
return mgmtFunc;
}
......@@ -90,6 +90,7 @@ int32_t vmProcessDropVnodeReq(SVnodeMgmt *pMgmt, SRpcMsg *pMsg);
int32_t vmProcessAlterVnodeReplicaReq(SVnodeMgmt *pMgmt, SRpcMsg *pMsg);
int32_t vmProcessDisableVnodeWriteReq(SVnodeMgmt *pMgmt, SRpcMsg *pMsg);
int32_t vmProcessAlterHashRangeReq(SVnodeMgmt *pMgmt, SRpcMsg *pMsg);
int32_t vmProcessAlterVnodeTypeReq(SVnodeMgmt *pMgmt, SRpcMsg *pMsg);
// vmFile.c
int32_t vmGetVnodeListFromFile(SVnodeMgmt *pMgmt, SWrapperCfg **ppCfgs, int32_t *numOfVnodes);
......
......@@ -131,15 +131,34 @@ static void vmGenerateVnodeCfg(SCreateVnodeReq *pCreate, SVnodeCfg *pCfg) {
pCfg->tsdbPageSize = pCreate->tsdbPageSize * 1024;
pCfg->standby = 0;
pCfg->syncCfg.myIndex = pCreate->selfIndex;
pCfg->syncCfg.replicaNum = pCreate->replica;
pCfg->syncCfg.replicaNum = 0;
pCfg->syncCfg.totalReplicaNum = 0;
memset(&pCfg->syncCfg.nodeInfo, 0, sizeof(pCfg->syncCfg.nodeInfo));
for (int32_t i = 0; i < pCreate->replica; ++i) {
SNodeInfo *pNode = &pCfg->syncCfg.nodeInfo[i];
pNode->nodeId = pCreate->replicas[i].id;
pNode->nodePort = pCreate->replicas[i].port;
tstrncpy(pNode->nodeFqdn, pCreate->replicas[i].fqdn, TSDB_FQDN_LEN);
pNode->nodeId = pCreate->replicas[pCfg->syncCfg.replicaNum].id;
pNode->nodePort = pCreate->replicas[pCfg->syncCfg.replicaNum].port;
pNode->nodeRole = TAOS_SYNC_ROLE_VOTER;
tstrncpy(pNode->nodeFqdn, pCreate->replicas[pCfg->syncCfg.replicaNum].fqdn, TSDB_FQDN_LEN);
tmsgUpdateDnodeInfo(&pNode->nodeId, &pNode->clusterId, pNode->nodeFqdn, &pNode->nodePort);
pCfg->syncCfg.replicaNum++;
}
if(pCreate->selfIndex != -1){
pCfg->syncCfg.myIndex = pCreate->selfIndex;
}
for (int32_t i = pCfg->syncCfg.replicaNum; i < pCreate->replica + pCreate->learnerReplica; ++i) {
SNodeInfo *pNode = &pCfg->syncCfg.nodeInfo[i];
pNode->nodeId = pCreate->learnerReplicas[pCfg->syncCfg.totalReplicaNum].id;
pNode->nodePort = pCreate->learnerReplicas[pCfg->syncCfg.totalReplicaNum].port;
pNode->nodeRole = TAOS_SYNC_ROLE_LEARNER;
tstrncpy(pNode->nodeFqdn, pCreate->learnerReplicas[pCfg->syncCfg.totalReplicaNum].fqdn, TSDB_FQDN_LEN);
tmsgUpdateDnodeInfo(&pNode->nodeId, &pNode->clusterId, pNode->nodeFqdn, &pNode->nodePort);
pCfg->syncCfg.totalReplicaNum++;
}
pCfg->syncCfg.totalReplicaNum += pCfg->syncCfg.replicaNum;
if(pCreate->learnerSelfIndex != -1){
pCfg->syncCfg.myIndex = pCreate->replica + pCreate->learnerSelfIndex;
}
}
......@@ -182,23 +201,40 @@ int32_t vmProcessCreateVnodeReq(SVnodeMgmt *pMgmt, SRpcMsg *pMsg) {
return -1;
}
dInfo("vgId:%d, start to create vnode, page:%d pageSize:%d buffer:%d szPage:%d szBuf:%" PRIu64
if(req.learnerReplica == 0)
{
req.learnerSelfIndex = -1;
}
dInfo("vgId:%d, vnode management handle msgType:%s, start to create vnode, page:%d pageSize:%d buffer:%d szPage:%d szBuf:%" PRIu64
", cacheLast:%d cacheLastSize:%d sstTrigger:%d tsdbPageSize:%d %d dbname:%s dbId:%" PRId64
", days:%d keep0:%d keep1:%d keep2:%d tsma:%d precision:%d compression:%d minRows:%d maxRows:%d"
", wal fsync:%d level:%d retentionPeriod:%d retentionSize:%" PRId64 " rollPeriod:%d segSize:%" PRId64
", hash method:%d begin:%u end:%u prefix:%d surfix:%d replica:%d selfIndex:%d strict:%d",
req.vgId, req.pages, req.pageSize, req.buffer, req.pageSize * 1024, (uint64_t)req.buffer * 1024 * 1024,
", hash method:%d begin:%u end:%u prefix:%d surfix:%d replica:%d selfIndex:%d "
"learnerReplica:%d learnerSelfIndex:%d strict:%d",
req.vgId, TMSG_INFO(pMsg->msgType), req.pages, req.pageSize, req.buffer, req.pageSize * 1024,
(uint64_t)req.buffer * 1024 * 1024,
req.cacheLast, req.cacheLastSize, req.sstTrigger, req.tsdbPageSize, req.tsdbPageSize * 1024, req.db, req.dbUid,
req.daysPerFile, req.daysToKeep0, req.daysToKeep1, req.daysToKeep2, req.isTsma, req.precision, req.compression,
req.minRows, req.maxRows, req.walFsyncPeriod, req.walLevel, req.walRetentionPeriod, req.walRetentionSize,
req.walRollPeriod, req.walSegmentSize, req.hashMethod, req.hashBegin, req.hashEnd, req.hashPrefix,
req.hashSuffix, req.replica, req.selfIndex, req.strict);
req.hashSuffix, req.replica, req.selfIndex, req.learnerReplica, req.learnerSelfIndex, req.strict);
for (int32_t i = 0; i < req.replica; ++i) {
dInfo("vgId:%d, replica:%d ep:%s:%u dnode:%d", req.vgId, i, req.replicas[i].fqdn, req.replicas[i].port,
req.replicas[i].id);
}
for (int32_t i = 0; i < req.learnerReplica; ++i) {
dInfo("vgId:%d, learnerReplica:%d ep:%s:%u dnode:%d", req.vgId, i, req.learnerReplicas[i].fqdn,
req.learnerReplicas[i].port, req.replicas[i].id);
}
SReplica *pReplica = &req.replicas[req.selfIndex];
SReplica *pReplica = NULL;
if(req.selfIndex != -1){
pReplica = &req.replicas[req.selfIndex];
}
else{
pReplica = &req.learnerReplicas[req.learnerSelfIndex];
}
if (pReplica->id != pMgmt->pData->dnodeId || pReplica->port != tsServerPort ||
strcmp(pReplica->fqdn, tsLocalFqdn) != 0) {
terrno = TSDB_CODE_INVALID_MSG;
......@@ -275,7 +311,8 @@ _OVER:
vnodeClose(pImpl);
vnodeDestroy(path, pMgmt->pTfs);
} else {
dInfo("vgId:%d, vnode is created", req.vgId);
dInfo("vgId:%d, vnode management handle msgType:%s, end to create vnode, vnode is created",
req.vgId, TMSG_INFO(pMsg->msgType));
}
tFreeSCreateVnodeReq(&req);
......@@ -283,6 +320,110 @@ _OVER:
return code;
}
int32_t vmProcessAlterVnodeTypeReq(SVnodeMgmt *pMgmt, SRpcMsg *pMsg) {
SAlterVnodeTypeReq req = {0};
if (tDeserializeSAlterVnodeReplicaReq(pMsg->pCont, pMsg->contLen, &req) != 0) {
terrno = TSDB_CODE_INVALID_MSG;
return -1;
}
if(req.learnerReplicas == 0){
req.learnerSelfIndex = -1;
}
dInfo("vgId:%d, vnode management handle msgType:%s, start to process alter-node-type-request",
req.vgId, TMSG_INFO(pMsg->msgType));
SVnodeObj *pVnode = vmAcquireVnode(pMgmt, req.vgId);
if (pVnode == NULL) {
dError("vgId:%d, failed to alter hashrange since %s", req.vgId, terrstr());
terrno = TSDB_CODE_VND_NOT_EXIST;
return -1;
}
dInfo("vgId:%d, checking node catch up", req.vgId);
if(vnodeIsCatchUp(pVnode->pImpl) != 0){
return -1;
}
dInfo("node:%s, catched up leader, continue to process alter-node-type-request", pMgmt->name);
int32_t vgId = req.vgId;
dInfo("vgId:%d, start to alter vnode type replica:%d selfIndex:%d strict:%d", vgId, req.replica, req.selfIndex,
req.strict);
for (int32_t i = 0; i < req.replica; ++i) {
SReplica *pReplica = &req.replicas[i];
dInfo("vgId:%d, replica:%d ep:%s:%u dnode:%d", vgId, i, pReplica->fqdn, pReplica->port, pReplica->id);
}
for (int32_t i = 0; i < req.learnerReplica; ++i) {
SReplica *pReplica = &req.learnerReplicas[i];
dInfo("vgId:%d, learnerReplicas:%d ep:%s:%u dnode:%d", vgId, i, pReplica->fqdn, pReplica->port, pReplica->id);
}
if (req.replica <= 0 ||
(req.selfIndex < 0 && req.learnerSelfIndex <0)||
req.selfIndex >= req.replica || req.learnerSelfIndex >= req.learnerReplica) {
terrno = TSDB_CODE_INVALID_MSG;
dError("vgId:%d, failed to alter replica since invalid msg", vgId);
return -1;
}
SReplica *pReplica = NULL;
if(req.selfIndex > 0){
pReplica = &req.replicas[req.selfIndex];
}
else{
pReplica = &req.learnerReplicas[req.learnerSelfIndex];
}
if (pReplica->id != pMgmt->pData->dnodeId || pReplica->port != tsServerPort ||
strcmp(pReplica->fqdn, tsLocalFqdn) != 0) {
terrno = TSDB_CODE_INVALID_MSG;
dError("vgId:%d, dnodeId:%d ep:%s:%u not matched with local dnode", vgId, pReplica->id, pReplica->fqdn,
pReplica->port);
return -1;
}
dInfo("vgId:%d, start to close vnode", vgId);
SWrapperCfg wrapperCfg = {
.dropped = pVnode->dropped,
.vgId = pVnode->vgId,
.vgVersion = pVnode->vgVersion,
};
tstrncpy(wrapperCfg.path, pVnode->path, sizeof(wrapperCfg.path));
vmCloseVnode(pMgmt, pVnode, false);
char path[TSDB_FILENAME_LEN] = {0};
snprintf(path, TSDB_FILENAME_LEN, "vnode%svnode%d", TD_DIRSEP, vgId);
dInfo("vgId:%d, start to alter vnode replica at %s", vgId, path);
if (vnodeAlterReplica(path, &req, pMgmt->pTfs) < 0) {
dError("vgId:%d, failed to alter vnode at %s since %s", vgId, path, terrstr());
return -1;
}
dInfo("vgId:%d, begin to open vnode", vgId);
SVnode *pImpl = vnodeOpen(path, pMgmt->pTfs, pMgmt->msgCb);
if (pImpl == NULL) {
dError("vgId:%d, failed to open vnode at %s since %s", vgId, path, terrstr());
return -1;
}
if (vmOpenVnode(pMgmt, &wrapperCfg, pImpl) != 0) {
dError("vgId:%d, failed to open vnode mgmt since %s", vgId, terrstr());
return -1;
}
if (vnodeStart(pImpl) != 0) {
dError("vgId:%d, failed to start sync since %s", vgId, terrstr());
return -1;
}
dInfo("vgId:%d, vnode management handle msgType:%s, end to process alter-node-type-request, vnode config is altered",
req.vgId, TMSG_INFO(pMsg->msgType));
return 0;
}
int32_t vmProcessDisableVnodeWriteReq(SVnodeMgmt *pMgmt, SRpcMsg *pMsg) {
SDisableVnodeWriteReq req = {0};
if (tDeserializeSDisableVnodeWriteReq(pMsg->pCont, pMsg->contLen, &req) != 0) {
......@@ -377,21 +518,40 @@ int32_t vmProcessAlterVnodeReplicaReq(SVnodeMgmt *pMgmt, SRpcMsg *pMsg) {
return -1;
}
if(alterReq.learnerReplica == 0){
alterReq.learnerSelfIndex = -1;
}
int32_t vgId = alterReq.vgId;
dInfo("vgId:%d, start to alter vnode replica:%d selfIndex:%d strict:%d", vgId, alterReq.replica, alterReq.selfIndex,
alterReq.strict);
dInfo("vgId:%d,vnode management handle msgType:%s, start to alter vnode replica:%d selfIndex:%d leanerReplica:%d "
"learnerSelfIndex:%d strict:%d",
vgId, TMSG_INFO(pMsg->msgType), alterReq.replica, alterReq.selfIndex, alterReq.learnerReplica,
alterReq.learnerSelfIndex, alterReq.strict);
for (int32_t i = 0; i < alterReq.replica; ++i) {
SReplica *pReplica = &alterReq.replicas[i];
dInfo("vgId:%d, replica:%d ep:%s:%u dnode:%d", vgId, i, pReplica->fqdn, pReplica->port, pReplica->port);
}
if (alterReq.replica <= 0 || alterReq.selfIndex < 0 || alterReq.selfIndex >= alterReq.replica) {
for (int32_t i = 0; i < alterReq.learnerReplica; ++i) {
SReplica *pReplica = &alterReq.learnerReplicas[i];
dInfo("vgId:%d, learnerReplicas:%d ep:%s:%u dnode:%d", vgId, i, pReplica->fqdn, pReplica->port, pReplica->port);
}
if (alterReq.replica <= 0 ||
(alterReq.selfIndex < 0 && alterReq.learnerSelfIndex <0)||
alterReq.selfIndex >= alterReq.replica || alterReq.learnerSelfIndex >= alterReq.learnerReplica) {
terrno = TSDB_CODE_INVALID_MSG;
dError("vgId:%d, failed to alter replica since invalid msg", vgId);
return -1;
}
SReplica *pReplica = &alterReq.replicas[alterReq.selfIndex];
SReplica *pReplica = NULL;
if(alterReq.selfIndex != -1){
pReplica = &alterReq.replicas[alterReq.selfIndex];
}
else{
pReplica = &alterReq.learnerReplicas[alterReq.learnerSelfIndex];
}
if (pReplica->id != pMgmt->pData->dnodeId || pReplica->port != tsServerPort ||
strcmp(pReplica->fqdn, tsLocalFqdn) != 0) {
terrno = TSDB_CODE_INVALID_MSG;
......@@ -425,7 +585,7 @@ int32_t vmProcessAlterVnodeReplicaReq(SVnodeMgmt *pMgmt, SRpcMsg *pMsg) {
return -1;
}
dInfo("vgId:%d, close vnode", vgId);
dInfo("vgId:%d, begin to open vnode", vgId);
SVnode *pImpl = vnodeOpen(path, pMgmt->pTfs, pMgmt->msgCb);
if (pImpl == NULL) {
dError("vgId:%d, failed to open vnode at %s since %s", vgId, path, terrstr());
......@@ -442,7 +602,10 @@ int32_t vmProcessAlterVnodeReplicaReq(SVnodeMgmt *pMgmt, SRpcMsg *pMsg) {
return -1;
}
dInfo("vgId:%d, vnode config is altered", vgId);
dInfo("vgId:%d, vnode management handle msgType:%s, end to alter vnode replica:%d selfIndex:%d leanerReplica:%d "
"learnerSelfIndex:%d strict:%d",
vgId, TMSG_INFO(pMsg->msgType), alterReq.replica, alterReq.selfIndex, alterReq.learnerReplica,
alterReq.learnerSelfIndex, alterReq.strict);
return 0;
}
......@@ -548,6 +711,7 @@ SArray *vmGetMsgHandles() {
if (dmSetMgmtHandle(pArray, TDMT_VND_TRIM, vmPutMsgToWriteQueue, 0) == NULL) goto _OVER;
if (dmSetMgmtHandle(pArray, TDMT_DND_CREATE_VNODE, vmPutMsgToMgmtQueue, 0) == NULL) goto _OVER;
if (dmSetMgmtHandle(pArray, TDMT_DND_DROP_VNODE, vmPutMsgToMgmtQueue, 0) == NULL) goto _OVER;
if (dmSetMgmtHandle(pArray, TDMT_DND_ALTER_VNODE_TYPE, vmPutMsgToMgmtQueue, 0) == NULL) goto _OVER;
if (dmSetMgmtHandle(pArray, TDMT_SYNC_TIMEOUT_ELECTION, vmPutMsgToSyncQueue, 0) == NULL) goto _OVER;
if (dmSetMgmtHandle(pArray, TDMT_SYNC_CLIENT_REQUEST, vmPutMsgToSyncQueue, 0) == NULL) goto _OVER;
......
......@@ -49,6 +49,9 @@ static void vmProcessMgmtQueue(SQueueInfo *pInfo, SRpcMsg *pMsg) {
case TDMT_VND_ALTER_HASHRANGE:
code = vmProcessAlterHashRangeReq(pMgmt, pMsg);
break;
case TDMT_DND_ALTER_VNODE_TYPE:
code = vmProcessAlterVnodeTypeReq(pMgmt, pMsg);
break;
default:
terrno = TSDB_CODE_MSG_NOT_PROCESSED;
dGError("msg:%p, not processed in vnode-mgmt queue", pMsg);
......
......@@ -169,6 +169,8 @@ static int32_t dmProcessCreateNodeReq(EDndNodeType ntype, SRpcMsg *pMsg) {
return -1;
}
dInfo("start to process create-node-request");
pWrapper = &pDnode->wrappers[ntype];
if (taosMkDir(pWrapper->path) != 0) {
dmReleaseWrapper(pWrapper);
......@@ -198,6 +200,64 @@ static int32_t dmProcessCreateNodeReq(EDndNodeType ntype, SRpcMsg *pMsg) {
return code;
}
static int32_t dmProcessAlterNodeTypeReq(EDndNodeType ntype, SRpcMsg *pMsg) {
SDnode *pDnode = dmInstance();
SMgmtWrapper *pWrapper = dmAcquireWrapper(pDnode, ntype);
if (pWrapper == NULL) {
dError("fail to process alter node type since node not exist");
return -1;
}
dmReleaseWrapper(pWrapper);
dInfo("node:%s, start to process alter-node-type-request", pWrapper->name);
pWrapper = &pDnode->wrappers[ntype];
if(pWrapper->func.isCatchUpFp != NULL){
dInfo("node:%s, checking node catch up", pWrapper->name);
if(!(*pWrapper->func.isCatchUpFp)(pWrapper->pMgmt) == 0){
return -1;
}
}
dInfo("node:%s, catched up leader, continue to process alter-node-type-request", pWrapper->name);
taosThreadMutexLock(&pDnode->mutex);
dInfo("node:%s, stopping node", pWrapper->name);
dmStopNode(pWrapper);
dInfo("node:%s, closing node", pWrapper->name);
dmCloseNode(pWrapper);
pWrapper = &pDnode->wrappers[ntype];
if (taosMkDir(pWrapper->path) != 0) {
dmReleaseWrapper(pWrapper);
terrno = TAOS_SYSTEM_ERROR(errno);
dError("failed to create dir:%s since %s", pWrapper->path, terrstr());
return -1;
}
SMgmtInputOpt input = dmBuildMgmtInputOpt(pWrapper);
dInfo("node:%s, start to create", pWrapper->name);
int32_t code = (*pWrapper->func.createFp)(&input, pMsg);
if (code != 0) {
dError("node:%s, failed to create since %s", pWrapper->name, terrstr());
} else {
dInfo("node:%s, has been created", pWrapper->name);
code = dmOpenNode(pWrapper);
if (code == 0) {
code = dmStartNode(pWrapper);
}
pWrapper->deployed = true;
pWrapper->required = true;
}
taosThreadMutexUnlock(&pDnode->mutex);
return code;
}
static int32_t dmProcessDropNodeReq(EDndNodeType ntype, SRpcMsg *pMsg) {
SDnode *pDnode = dmInstance();
......@@ -251,6 +311,7 @@ SMgmtInputOpt dmBuildMgmtInputOpt(SMgmtWrapper *pWrapper) {
.name = pWrapper->name,
.pData = &pWrapper->pDnode->data,
.processCreateNodeFp = dmProcessCreateNodeReq,
.processAlterNodeTypeFp = dmProcessAlterNodeTypeReq,
.processDropNodeFp = dmProcessDropNodeReq,
.sendMonitorReportFp = dmSendMonitorReport,
.getVnodeLoadsFp = dmGetVnodeLoads,
......
......@@ -89,6 +89,7 @@ typedef void (*SendMonitorReportFp)();
typedef void (*GetVnodeLoadsFp)(SMonVloadInfo *pInfo);
typedef void (*GetMnodeLoadsFp)(SMonMloadInfo *pInfo);
typedef void (*GetQnodeLoadsFp)(SQnodeLoad *pInfo);
typedef int32_t (*ProcessAlterNodeTypeFp)(EDndNodeType ntype, SRpcMsg *pMsg);
typedef struct {
int32_t dnodeId;
......@@ -112,6 +113,7 @@ typedef struct {
SDnodeData *pData;
SMsgCb msgCb;
ProcessCreateNodeFp processCreateNodeFp;
ProcessAlterNodeTypeFp processAlterNodeTypeFp;
ProcessDropNodeFp processDropNodeFp;
SendMonitorReportFp sendMonitorReportFp;
GetVnodeLoadsFp getVnodeLoadsFp;
......@@ -132,6 +134,7 @@ typedef int32_t (*NodeCreateFp)(const SMgmtInputOpt *pInput, SRpcMsg *pMsg);
typedef int32_t (*NodeDropFp)(const SMgmtInputOpt *pInput, SRpcMsg *pMsg);
typedef int32_t (*NodeRequireFp)(const SMgmtInputOpt *pInput, bool *required);
typedef SArray *(*NodeGetHandlesFp)(); // array of SMgmtHandle
typedef bool (*NodeIsCatchUpFp)(void *pMgmt);
typedef struct {
NodeOpenFp openFp;
......@@ -142,6 +145,7 @@ typedef struct {
NodeDropFp dropFp;
NodeRequireFp requiredFp;
NodeGetHandlesFp getHandlesFp;
NodeIsCatchUpFp isCatchUpFp;
} SMgmtFunc;
typedef struct {
......
......@@ -215,6 +215,8 @@ typedef struct {
bool syncRestore;
int64_t stateStartTime;
SDnodeObj* pDnode;
int32_t role;
SyncIndex lastIndex;
} SMnodeObj;
typedef struct {
......@@ -341,6 +343,7 @@ typedef struct {
ESyncState syncState;
bool syncRestore;
bool syncCanRead;
ESyncRole nodeRole;
} SVnodeGid;
typedef struct {
......@@ -361,7 +364,7 @@ typedef struct {
int8_t compact;
int8_t isTsma;
int8_t replica;
SVnodeGid vnodeGid[TSDB_MAX_REPLICA];
SVnodeGid vnodeGid[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA];
void* pTsma;
int32_t numOfCachedTables;
} SVgObj;
......
......@@ -92,8 +92,11 @@ typedef struct {
int64_t transSeq;
TdThreadMutex lock;
int8_t selfIndex;
int8_t numOfTotalReplicas;
int8_t numOfReplicas;
SReplica replicas[TSDB_MAX_REPLICA];
SReplica replicas[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA];
ESyncRole nodeRoles[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA];
SyncIndex lastIndex;
} SSyncMgmt;
typedef struct {
......
......@@ -491,7 +491,10 @@ static void mndSetOptions(SMnode *pMnode, const SMnodeOpt *pOption) {
pMnode->selfDnodeId = pOption->dnodeId;
pMnode->syncMgmt.selfIndex = pOption->selfIndex;
pMnode->syncMgmt.numOfReplicas = pOption->numOfReplicas;
pMnode->syncMgmt.numOfTotalReplicas = pOption->numOfTotalReplicas;
pMnode->syncMgmt.lastIndex = pOption->lastIndex;
memcpy(pMnode->syncMgmt.replicas, pOption->replicas, sizeof(pOption->replicas));
memcpy(pMnode->syncMgmt.nodeRoles, pOption->nodeRoles, sizeof(pOption->nodeRoles));
}
SMnode *mndOpen(const char *path, const SMnodeOpt *pOption) {
......@@ -582,6 +585,11 @@ int32_t mndStart(SMnode *pMnode) {
return mndInitTimer(pMnode);
}
int32_t mndIsCatchUp(SMnode *pMnode) {
int64_t rid = pMnode->syncMgmt.sync;
return syncIsCatchUp(rid);
}
void mndStop(SMnode *pMnode) {
mndSetStop(pMnode);
mndSyncStop(pMnode);
......
......@@ -23,7 +23,7 @@
#include "mndTrans.h"
#include "tmisce.h"
#define MNODE_VER_NUMBER 1
#define MNODE_VER_NUMBER 2
#define MNODE_RESERVE_SIZE 64
static int32_t mndCreateDefaultMnode(SMnode *pMnode);
......@@ -53,6 +53,7 @@ int32_t mndInitMnode(SMnode *pMnode) {
mndSetMsgHandle(pMnode, TDMT_MND_CREATE_MNODE, mndProcessCreateMnodeReq);
mndSetMsgHandle(pMnode, TDMT_DND_CREATE_MNODE_RSP, mndTransProcessRsp);
mndSetMsgHandle(pMnode, TDMT_DND_ALTER_MNODE_TYPE_RSP, mndTransProcessRsp);
mndSetMsgHandle(pMnode, TDMT_MND_ALTER_MNODE, mndProcessAlterMnodeReq);
mndSetMsgHandle(pMnode, TDMT_MND_ALTER_MNODE_RSP, mndTransProcessRsp);
mndSetMsgHandle(pMnode, TDMT_MND_DROP_MNODE, mndProcessDropMnodeReq);
......@@ -126,6 +127,8 @@ static SSdbRaw *mndMnodeActionEncode(SMnodeObj *pObj) {
SDB_SET_INT32(pRaw, dataPos, pObj->id, _OVER)
SDB_SET_INT64(pRaw, dataPos, pObj->createdTime, _OVER)
SDB_SET_INT64(pRaw, dataPos, pObj->updateTime, _OVER)
SDB_SET_INT32(pRaw, dataPos, pObj->role, _OVER)
SDB_SET_INT64(pRaw, dataPos, pObj->lastIndex, _OVER)
SDB_SET_RESERVE(pRaw, dataPos, MNODE_RESERVE_SIZE, _OVER)
terrno = 0;
......@@ -149,7 +152,7 @@ static SSdbRow *mndMnodeActionDecode(SSdbRaw *pRaw) {
int8_t sver = 0;
if (sdbGetRawSoftVer(pRaw, &sver) != 0) return NULL;
if (sver != MNODE_VER_NUMBER) {
if (sver != 1 && sver != 2) {
terrno = TSDB_CODE_SDB_INVALID_DATA_VER;
goto _OVER;
}
......@@ -164,6 +167,10 @@ static SSdbRow *mndMnodeActionDecode(SSdbRaw *pRaw) {
SDB_GET_INT32(pRaw, dataPos, &pObj->id, _OVER)
SDB_GET_INT64(pRaw, dataPos, &pObj->createdTime, _OVER)
SDB_GET_INT64(pRaw, dataPos, &pObj->updateTime, _OVER)
if(sver >=2){
SDB_GET_INT32(pRaw, dataPos, &pObj->role, _OVER)
SDB_GET_INT64(pRaw, dataPos, &pObj->lastIndex, _OVER)
}
SDB_GET_RESERVE(pRaw, dataPos, MNODE_RESERVE_SIZE, _OVER)
terrno = 0;
......@@ -204,7 +211,9 @@ static int32_t mndMnodeActionDelete(SSdb *pSdb, SMnodeObj *pObj) {
static int32_t mndMnodeActionUpdate(SSdb *pSdb, SMnodeObj *pOld, SMnodeObj *pNew) {
mTrace("mnode:%d, perform update action, old row:%p new row:%p", pOld->id, pOld, pNew);
pOld->role = pNew->role;
pOld->updateTime = pNew->updateTime;
pOld->lastIndex = pNew->lastIndex;
mndReloadSyncConfig(pSdb->pMnode);
return 0;
......@@ -302,6 +311,27 @@ static int32_t mndBuildCreateMnodeRedoAction(STrans *pTrans, SDCreateMnodeReq *p
return 0;
}
static int32_t mndBuildAlterMnodeTypeRedoAction(STrans *pTrans,
SDAlterMnodeTypeReq *pAlterMnodeTypeReq, SEpSet *pAlterMnodeTypeEpSet) {
int32_t contLen = tSerializeSDCreateMnodeReq(NULL, 0, pAlterMnodeTypeReq);
void *pReq = taosMemoryMalloc(contLen);
tSerializeSDCreateMnodeReq(pReq, contLen, pAlterMnodeTypeReq);
STransAction action = {
.epSet = *pAlterMnodeTypeEpSet,
.pCont = pReq,
.contLen = contLen,
.msgType = TDMT_DND_ALTER_MNODE_TYPE,
.acceptableCode = TSDB_CODE_MNODE_ALREADY_DEPLOYED,
};
if (mndTransAppendRedoAction(pTrans, &action) != 0) {
taosMemoryFree(pReq);
return -1;
}
return 0;
}
static int32_t mndBuildAlterMnodeRedoAction(STrans *pTrans, SDCreateMnodeReq *pAlterReq, SEpSet *pAlterEpSet) {
int32_t contLen = tSerializeSDCreateMnodeReq(NULL, 0, pAlterReq);
void *pReq = taosMemoryMalloc(contLen);
......@@ -347,6 +377,7 @@ static int32_t mndSetCreateMnodeRedoActions(SMnode *pMnode, STrans *pTrans, SDno
SSdb *pSdb = pMnode->pSdb;
void *pIter = NULL;
int32_t numOfReplicas = 0;
int32_t numOfLearnerReplicas = 0;
SDCreateMnodeReq createReq = {0};
SEpSet createEpset = {0};
......@@ -355,18 +386,29 @@ static int32_t mndSetCreateMnodeRedoActions(SMnode *pMnode, STrans *pTrans, SDno
pIter = sdbFetch(pSdb, SDB_MNODE, pIter, (void **)&pMObj);
if (pIter == NULL) break;
createReq.replicas[numOfReplicas].id = pMObj->id;
createReq.replicas[numOfReplicas].port = pMObj->pDnode->port;
memcpy(createReq.replicas[numOfReplicas].fqdn, pMObj->pDnode->fqdn, TSDB_FQDN_LEN);
if(pMObj->role == TAOS_SYNC_ROLE_VOTER){
createReq.replicas[numOfReplicas].id = pMObj->id;
createReq.replicas[numOfReplicas].port = pMObj->pDnode->port;
memcpy(createReq.replicas[numOfReplicas].fqdn, pMObj->pDnode->fqdn, TSDB_FQDN_LEN);
numOfReplicas++;
}
else{
createReq.learnerReplicas[numOfLearnerReplicas].id = pMObj->id;
createReq.learnerReplicas[numOfLearnerReplicas].port = pMObj->pDnode->port;
memcpy(createReq.learnerReplicas[numOfLearnerReplicas].fqdn, pMObj->pDnode->fqdn, TSDB_FQDN_LEN);
numOfLearnerReplicas++;
}
numOfReplicas++;
sdbRelease(pSdb, pMObj);
}
createReq.replica = numOfReplicas + 1;
createReq.replicas[numOfReplicas].id = pDnode->id;
createReq.replicas[numOfReplicas].port = pDnode->port;
memcpy(createReq.replicas[numOfReplicas].fqdn, pDnode->fqdn, TSDB_FQDN_LEN);
createReq.replica = numOfReplicas;
createReq.learnerReplica = numOfLearnerReplicas + 1;
createReq.learnerReplicas[numOfLearnerReplicas].id = pDnode->id;
createReq.learnerReplicas[numOfLearnerReplicas].port = pDnode->port;
memcpy(createReq.learnerReplicas[numOfLearnerReplicas].fqdn, pDnode->fqdn, TSDB_FQDN_LEN);
createReq.lastIndex = pObj->lastIndex;
createEpset.inUse = 0;
createEpset.numOfEps = 1;
......@@ -378,23 +420,78 @@ static int32_t mndSetCreateMnodeRedoActions(SMnode *pMnode, STrans *pTrans, SDno
return 0;
}
static int32_t mndSetAlterMnodeTypeRedoActions(SMnode *pMnode, STrans *pTrans, SDnodeObj *pDnode, SMnodeObj *pObj) {
SSdb *pSdb = pMnode->pSdb;
void *pIter = NULL;
SDAlterMnodeTypeReq alterReq = {0};
SEpSet createEpset = {0};
while (1) {
SMnodeObj *pMObj = NULL;
pIter = sdbFetch(pSdb, SDB_MNODE, pIter, (void **)&pMObj);
if (pIter == NULL) break;
if(pMObj->role == TAOS_SYNC_ROLE_VOTER){
alterReq.replicas[alterReq.replica].id = pMObj->id;
alterReq.replicas[alterReq.replica].port = pMObj->pDnode->port;
memcpy(alterReq.replicas[alterReq.replica].fqdn, pMObj->pDnode->fqdn, TSDB_FQDN_LEN);
alterReq.replica++;
}
else{
alterReq.learnerReplicas[alterReq.learnerReplica].id = pMObj->id;
alterReq.learnerReplicas[alterReq.learnerReplica].port = pMObj->pDnode->port;
memcpy(alterReq.learnerReplicas[alterReq.learnerReplica].fqdn, pMObj->pDnode->fqdn, TSDB_FQDN_LEN);
alterReq.learnerReplica++;
}
sdbRelease(pSdb, pMObj);
}
alterReq.replicas[alterReq.replica].id = pDnode->id;
alterReq.replicas[alterReq.replica].port = pDnode->port;
memcpy(alterReq.replicas[alterReq.replica].fqdn, pDnode->fqdn, TSDB_FQDN_LEN);
alterReq.replica++;
alterReq.lastIndex = pObj->lastIndex;
createEpset.inUse = 0;
createEpset.numOfEps = 1;
createEpset.eps[0].port = pDnode->port;
memcpy(createEpset.eps[0].fqdn, pDnode->fqdn, TSDB_FQDN_LEN);
if (mndBuildAlterMnodeTypeRedoAction(pTrans, &alterReq, &createEpset) != 0) return -1;
return 0;
}
static int32_t mndCreateMnode(SMnode *pMnode, SRpcMsg *pReq, SDnodeObj *pDnode, SMCreateMnodeReq *pCreate) {
int32_t code = -1;
SMnodeObj mnodeObj = {0};
mnodeObj.id = pDnode->id;
mnodeObj.createdTime = taosGetTimestampMs();
mnodeObj.updateTime = mnodeObj.createdTime;
STrans *pTrans = mndTransCreate(pMnode, TRN_POLICY_RETRY, TRN_CONFLICT_GLOBAL, pReq, "create-mnode");
if (pTrans == NULL) goto _OVER;
mndTransSetSerial(pTrans);
mInfo("trans:%d, used to create mnode:%d", pTrans->id, pCreate->dnodeId);
if (mndTrancCheckConflict(pMnode, pTrans) != 0) goto _OVER;
SMnodeObj mnodeObj = {0};
mnodeObj.id = pDnode->id;
mnodeObj.createdTime = taosGetTimestampMs();
mnodeObj.updateTime = mnodeObj.createdTime;
mnodeObj.role = TAOS_SYNC_ROLE_LEARNER;
mnodeObj.lastIndex = pMnode->applied;
if (mndSetCreateMnodeRedoActions(pMnode, pTrans, pDnode, &mnodeObj) != 0) goto _OVER;
if (mndSetCreateMnodeRedoLogs(pMnode, pTrans, &mnodeObj) != 0) goto _OVER;
if (mndSetCreateMnodeCommitLogs(pMnode, pTrans, &mnodeObj) != 0) goto _OVER;
SMnodeObj mnodeLeaderObj = {0};
mnodeLeaderObj.id = pDnode->id;
mnodeLeaderObj.createdTime = taosGetTimestampMs();
mnodeLeaderObj.updateTime = mnodeLeaderObj.createdTime;
mnodeLeaderObj.role = TAOS_SYNC_ROLE_VOTER;
mnodeLeaderObj.lastIndex = pMnode->applied + 1;
if (mndSetAlterMnodeTypeRedoActions(pMnode, pTrans, pDnode, &mnodeLeaderObj) != 0) goto _OVER;
if (mndSetCreateMnodeCommitLogs(pMnode, pTrans, &mnodeLeaderObj) != 0) goto _OVER;
if (mndTransPrepare(pMnode, pTrans) != 0) goto _OVER;
code = 0;
......@@ -514,6 +611,7 @@ static int32_t mndSetDropMnodeRedoActions(SMnode *pMnode, STrans *pTrans, SDnode
int32_t mndSetDropMnodeInfoToTrans(SMnode *pMnode, STrans *pTrans, SMnodeObj *pObj, bool force) {
if (pObj == NULL) return 0;
pObj->lastIndex = pMnode->applied;
if (mndSetDropMnodeRedoActions(pMnode, pTrans, pObj->pDnode, pObj, force) != 0) return -1;
if (mndSetDropMnodeCommitLogs(pMnode, pTrans, pObj) != 0) return -1;
return 0;
......@@ -730,7 +828,8 @@ static void mndReloadSyncConfig(SMnode *pMnode) {
void *pIter = NULL;
int32_t updatingMnodes = 0;
int32_t readyMnodes = 0;
SSyncCfg cfg = {.myIndex = -1};
SSyncCfg cfg = {.myIndex = -1, .lastIndex = 0,};
SyncIndex maxIndex = 0;
while (1) {
pIter = sdbFetchAll(pSdb, SDB_MNODE, pIter, (void **)&pObj, &objStatus, false);
......@@ -745,26 +844,41 @@ static void mndReloadSyncConfig(SMnode *pMnode) {
}
if (objStatus == SDB_STATUS_READY || objStatus == SDB_STATUS_CREATING) {
SNodeInfo *pNode = &cfg.nodeInfo[cfg.replicaNum];
SNodeInfo *pNode = &cfg.nodeInfo[cfg.totalReplicaNum];
pNode->nodeId = pObj->pDnode->id;
pNode->clusterId = mndGetClusterId(pMnode);
pNode->nodePort = pObj->pDnode->port;
pNode->nodeRole = pObj->role;
tstrncpy(pNode->nodeFqdn, pObj->pDnode->fqdn, TSDB_FQDN_LEN);
(void)tmsgUpdateDnodeInfo(&pNode->nodeId, &pNode->clusterId, pNode->nodeFqdn, &pNode->nodePort);
mInfo("vgId:1, ep:%s:%u dnode:%d", pNode->nodeFqdn, pNode->nodePort, pNode->nodeId);
if (pObj->pDnode->id == pMnode->selfDnodeId) {
cfg.myIndex = cfg.replicaNum;
cfg.myIndex = cfg.totalReplicaNum;
}
if(pNode->nodeRole == TAOS_SYNC_ROLE_VOTER){
cfg.replicaNum++;
}
cfg.totalReplicaNum++;
if(pObj->lastIndex > cfg.lastIndex){
cfg.lastIndex = pObj->lastIndex;
}
}
if (objStatus == SDB_STATUS_DROPPING) {
if(pObj->lastIndex > cfg.lastIndex){
cfg.lastIndex = pObj->lastIndex;
}
cfg.replicaNum++;
}
mInfo("vgId:1, mnode:%d, role:%d, lastIndex:%" PRId64, pObj->id, pObj->role, pObj->lastIndex);
sdbReleaseLock(pSdb, pObj, false);
}
if (readyMnodes <= 0 || updatingMnodes <= 0) {
mInfo("vgId:1, mnode sync not reconfig since readyMnodes:%d updatingMnodes:%d", readyMnodes, updatingMnodes);
return;
}
//if (readyMnodes <= 0 || updatingMnodes <= 0) {
// mInfo("vgId:1, mnode sync not reconfig since readyMnodes:%d updatingMnodes:%d", readyMnodes, updatingMnodes);
// return;
//}
if (cfg.myIndex == -1) {
#if 1
......@@ -777,12 +891,14 @@ static void mndReloadSyncConfig(SMnode *pMnode) {
return;
}
if (updatingMnodes > 0) {
mInfo("vgId:1, mnode sync reconfig, replica:%d myIndex:%d", cfg.replicaNum, cfg.myIndex);
for (int32_t i = 0; i < cfg.replicaNum; ++i) {
if (pMnode->syncMgmt.sync > 0) {
mInfo("vgId:1, mnode sync reconfig, totalReplica:%d replica:%d myIndex:%d",
cfg.totalReplicaNum, cfg.replicaNum, cfg.myIndex);
for (int32_t i = 0; i < cfg.totalReplicaNum; ++i) {
SNodeInfo *pNode = &cfg.nodeInfo[i];
mInfo("vgId:1, index:%d, ep:%s:%u dnode:%d cluster:%" PRId64, i, pNode->nodeFqdn, pNode->nodePort, pNode->nodeId,
pNode->clusterId);
mInfo("vgId:1, index:%d, ep:%s:%u dnode:%d cluster:%" PRId64 " role:%d", i, pNode->nodeFqdn, pNode->nodePort,
pNode->nodeId, pNode->clusterId, pNode->nodeRole);
}
int32_t code = syncReconfig(pMnode->syncMgmt.sync, &cfg);
......
......@@ -237,6 +237,23 @@ static void mndBecomeFollower(const SSyncFSM *pFsm) {
taosThreadMutexUnlock(&pMgmt->lock);
}
static void mndBecomeLearner(const SSyncFSM *pFsm) {
SMnode *pMnode = pFsm->data;
SSyncMgmt *pMgmt = &pMnode->syncMgmt;
mInfo("vgId:1, become learner");
taosThreadMutexLock(&pMgmt->lock);
if (pMgmt->transId != 0) {
mInfo("vgId:1, become learner and post sem, trans:%d, failed to propose since not leader", pMgmt->transId);
pMgmt->transId = 0;
pMgmt->transSec = 0;
pMgmt->transSeq = 0;
pMgmt->errCode = TSDB_CODE_SYN_NOT_LEADER;
tsem_post(&pMgmt->syncSem);
}
taosThreadMutexUnlock(&pMgmt->lock);
}
static void mndBecomeLeader(const SSyncFSM *pFsm) {
mInfo("vgId:1, become leader");
SMnode *pMnode = pFsm->data;
......@@ -278,6 +295,7 @@ SSyncFSM *mndSyncMakeFsm(SMnode *pMnode) {
pFsm->FpReConfigCb = NULL;
pFsm->FpBecomeLeaderCb = mndBecomeLeader;
pFsm->FpBecomeFollowerCb = mndBecomeFollower;
pFsm->FpBecomeLearnerCb = mndBecomeLearner;
pFsm->FpGetSnapshot = mndSyncGetSnapshot;
pFsm->FpGetSnapshotInfo = mndSyncGetSnapshotInfo;
pFsm->FpSnapshotStartRead = mndSnapshotStartRead;
......@@ -317,13 +335,16 @@ int32_t mndInitSync(SMnode *pMnode) {
mInfo("vgId:1, start to open sync, replica:%d selfIndex:%d", pMgmt->numOfReplicas, pMgmt->selfIndex);
SSyncCfg *pCfg = &syncInfo.syncCfg;
pCfg->totalReplicaNum = pMgmt->numOfTotalReplicas;
pCfg->replicaNum = pMgmt->numOfReplicas;
pCfg->myIndex = pMgmt->selfIndex;
for (int32_t i = 0; i < pMgmt->numOfReplicas; ++i) {
pCfg->lastIndex = pMgmt->lastIndex;
for (int32_t i = 0; i < pMgmt->numOfTotalReplicas; ++i) {
SNodeInfo *pNode = &pCfg->nodeInfo[i];
pNode->nodeId = pMgmt->replicas[i].id;
pNode->nodePort = pMgmt->replicas[i].port;
tstrncpy(pNode->nodeFqdn, pMgmt->replicas[i].fqdn, sizeof(pNode->nodeFqdn));
pNode->nodeRole = pMgmt->nodeRoles[i];
(void)tmsgUpdateDnodeInfo(&pNode->nodeId, &pNode->clusterId, pNode->nodeFqdn, &pNode->nodePort);
mInfo("vgId:1, index:%d ep:%s:%u dnode:%d cluster:%" PRId64, i, pNode->nodeFqdn, pNode->nodePort, pNode->nodeId,
pNode->clusterId);
......
......@@ -62,6 +62,7 @@ int32_t mndInitVgroup(SMnode *pMnode) {
mndSetMsgHandle(pMnode, TDMT_VND_COMPACT_RSP, mndTransProcessRsp);
mndSetMsgHandle(pMnode, TDMT_VND_DISABLE_WRITE_RSP, mndTransProcessRsp);
mndSetMsgHandle(pMnode, TDMT_SYNC_FORCE_FOLLOWER_RSP, mndTransProcessRsp);
mndSetMsgHandle(pMnode, TDMT_DND_ALTER_VNODE_TYPE_RSP, mndTransProcessRsp);
mndSetMsgHandle(pMnode, TDMT_MND_REDISTRIBUTE_VGROUP, mndProcessRedistributeVgroupMsg);
mndSetMsgHandle(pMnode, TDMT_MND_SPLIT_VGROUP, mndProcessSplitVgroupMsg);
......@@ -203,7 +204,7 @@ static int32_t mndVgroupActionUpdate(SSdb *pSdb, SVgObj *pOld, SVgObj *pNew) {
pNew->compStorage = pOld->compStorage;
pNew->pointsWritten = pOld->pointsWritten;
pNew->compact = pOld->compact;
memcpy(pOld->vnodeGid, pNew->vnodeGid, TSDB_MAX_REPLICA * sizeof(SVnodeGid));
memcpy(pOld->vnodeGid, pNew->vnodeGid, (TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA) * sizeof(SVnodeGid));
return 0;
}
......@@ -244,8 +245,10 @@ void *mndBuildCreateVnodeReq(SMnode *pMnode, SDnodeObj *pDnode, SDbObj *pDb, SVg
createReq.compression = pDb->cfg.compression;
createReq.strict = pDb->cfg.strict;
createReq.cacheLast = pDb->cfg.cacheLast;
createReq.replica = pVgroup->replica;
createReq.replica = 0;
createReq.learnerReplica = 0;
createReq.selfIndex = -1;
createReq.learnerSelfIndex = -1;
createReq.hashBegin = pVgroup->hashBegin;
createReq.hashEnd = pVgroup->hashEnd;
createReq.hashMethod = pDb->cfg.hashMethod;
......@@ -263,7 +266,15 @@ void *mndBuildCreateVnodeReq(SMnode *pMnode, SDnodeObj *pDnode, SDbObj *pDb, SVg
createReq.tsdbPageSize = pDb->cfg.tsdbPageSize;
for (int32_t v = 0; v < pVgroup->replica; ++v) {
SReplica *pReplica = &createReq.replicas[v];
SReplica *pReplica = NULL;
if(pVgroup->vnodeGid[v].nodeRole == TAOS_SYNC_ROLE_VOTER){
pReplica = &createReq.replicas[createReq.replica];
}
else{
pReplica = &createReq.learnerReplicas[createReq.learnerReplica];
}
SVnodeGid *pVgid = &pVgroup->vnodeGid[v];
SDnodeObj *pVgidDnode = mndAcquireDnode(pMnode, pVgid->dnodeId);
if (pVgidDnode == NULL) {
......@@ -275,21 +286,40 @@ void *mndBuildCreateVnodeReq(SMnode *pMnode, SDnodeObj *pDnode, SDbObj *pDb, SVg
memcpy(pReplica->fqdn, pVgidDnode->fqdn, TSDB_FQDN_LEN);
mndReleaseDnode(pMnode, pVgidDnode);
if (pDnode->id == pVgid->dnodeId) {
createReq.selfIndex = v;
if(pVgroup->vnodeGid[v].nodeRole == TAOS_SYNC_ROLE_VOTER){
if (pDnode->id == pVgid->dnodeId) {
createReq.selfIndex = createReq.replica;
}
}
else{
if (pDnode->id == pVgid->dnodeId) {
createReq.learnerSelfIndex = createReq.learnerReplica;
}
}
if(pVgroup->vnodeGid[v].nodeRole == TAOS_SYNC_ROLE_VOTER){
createReq.replica++;
}
else{
createReq.learnerReplica++;
}
}
if (createReq.selfIndex == -1) {
if (createReq.selfIndex == -1 && createReq.learnerSelfIndex == -1) {
terrno = TSDB_CODE_APP_ERROR;
return NULL;
}
mInfo("vgId:%d, build create vnode req, replica:%d selfIndex:%d strict:%d", createReq.vgId, createReq.replica,
createReq.selfIndex, createReq.strict);
mInfo("vgId:%d, build create vnode req, replica:%d selfIndex:%d learnerReplica:%d learnerSelfIndex:%d strict:%d",
createReq.vgId, createReq.replica, createReq.selfIndex, createReq.learnerReplica,
createReq.learnerReplica, createReq.strict);
for (int32_t i = 0; i < createReq.replica; ++i) {
mInfo("vgId:%d, replica:%d ep:%s:%u", createReq.vgId, i, createReq.replicas[i].fqdn, createReq.replicas[i].port);
}
for (int32_t i = 0; i < createReq.learnerReplica; ++i) {
mInfo("vgId:%d, replica:%d ep:%s:%u", createReq.vgId, i, createReq.learnerReplicas[i].fqdn,
createReq.learnerReplicas[i].port);
}
int32_t contLen = tSerializeSCreateVnodeReq(NULL, 0, &createReq);
if (contLen < 0) {
......@@ -356,12 +386,24 @@ static void *mndBuildAlterVnodeReplicaReq(SMnode *pMnode, SDbObj *pDb, SVgObj *p
SAlterVnodeReplicaReq alterReq = {
.vgId = pVgroup->vgId,
.strict = pDb->cfg.strict,
.replica = pVgroup->replica,
.replica = 0,
.learnerReplica = 0,
.selfIndex = -1,
.learnerSelfIndex = -1,
};
for (int32_t v = 0; v < pVgroup->replica; ++v) {
SReplica *pReplica = &alterReq.replicas[v];
SReplica *pReplica = NULL;
if(pVgroup->vnodeGid[v].nodeRole == TAOS_SYNC_ROLE_VOTER){
pReplica = &alterReq.replicas[alterReq.replica];
alterReq.replica++;
}
else{
pReplica = &alterReq.learnerReplicas[alterReq.learnerReplica];
alterReq.learnerReplica++;
}
SVnodeGid *pVgid = &pVgroup->vnodeGid[v];
SDnodeObj *pVgidDnode = mndAcquireDnode(pMnode, pVgid->dnodeId);
if (pVgidDnode == NULL) return NULL;
......@@ -371,18 +413,30 @@ static void *mndBuildAlterVnodeReplicaReq(SMnode *pMnode, SDbObj *pDb, SVgObj *p
memcpy(pReplica->fqdn, pVgidDnode->fqdn, TSDB_FQDN_LEN);
mndReleaseDnode(pMnode, pVgidDnode);
if (dnodeId == pVgid->dnodeId) {
alterReq.selfIndex = v;
if(pVgroup->vnodeGid[v].nodeRole == TAOS_SYNC_ROLE_VOTER){
if (dnodeId == pVgid->dnodeId) {
alterReq.selfIndex = v;
}
}
else{
if (dnodeId == pVgid->dnodeId) {
alterReq.learnerSelfIndex = v;
}
}
}
alterReq.replica = pVgroup->replica;
mInfo("vgId:%d, build alter vnode req, replica:%d selfIndex:%d strict:%d", alterReq.vgId, alterReq.replica,
alterReq.selfIndex, alterReq.strict);
mInfo("vgId:%d, build alter vnode req, replica:%d selfIndex:%d learnerReplica:%d learnerSelfIndex:%d strict:%d",
alterReq.vgId, alterReq.replica, alterReq.selfIndex, alterReq.learnerReplica,
alterReq.learnerSelfIndex, alterReq.strict);
for (int32_t i = 0; i < alterReq.replica; ++i) {
mInfo("vgId:%d, replica:%d ep:%s:%u", alterReq.vgId, i, alterReq.replicas[i].fqdn, alterReq.replicas[i].port);
}
for (int32_t i = 0; i < alterReq.learnerReplica; ++i) {
mInfo("vgId:%d, learnerReplica:%d ep:%s:%u", alterReq.vgId, i,
alterReq.learnerReplicas[i].fqdn, alterReq.learnerReplicas[i].port);
}
if (alterReq.selfIndex == -1) {
if (alterReq.selfIndex == -1 && alterReq.learnerSelfIndex == -1) {
terrno = TSDB_CODE_APP_ERROR;
return NULL;
}
......@@ -1194,6 +1248,30 @@ int32_t mndAddAlterVnodeReplicaAction(SMnode *pMnode, STrans *pTrans, SDbObj *pD
return 0;
}
int32_t mndAddAlterVnodeTypeAction(SMnode *pMnode, STrans *pTrans, SDbObj *pDb, SVgObj *pVgroup, int32_t dnodeId) {
SDnodeObj *pDnode = mndAcquireDnode(pMnode, dnodeId);
if (pDnode == NULL) return -1;
STransAction action = {0};
action.epSet = mndGetDnodeEpset(pDnode);
mndReleaseDnode(pMnode, pDnode);
int32_t contLen = 0;
void *pReq = mndBuildAlterVnodeReplicaReq(pMnode, pDb, pVgroup, dnodeId, &contLen);
if (pReq == NULL) return -1;
action.pCont = pReq;
action.contLen = contLen;
action.msgType = TDMT_DND_ALTER_VNODE_TYPE;
if (mndTransAppendRedoAction(pTrans, &action) != 0) {
taosMemoryFree(pReq);
return -1;
}
return 0;
}
static int32_t mndAddDisableVnodeWriteAction(SMnode *pMnode, STrans *pTrans, SDbObj *pDb, SVgObj *pVgroup,
int32_t dnodeId) {
SDnodeObj *pDnode = mndAcquireDnode(pMnode, dnodeId);
......@@ -1953,18 +2031,33 @@ int32_t mndBuildAlterVgroupAction(SMnode *pMnode, STrans *pTrans, SDbObj *pOldDb
mInfo("db:%s, vgId:%d, will add 2 vnodes, vn:0 dnode:%d", pVgroup->dbName, pVgroup->vgId,
pVgroup->vnodeGid[0].dnodeId);
//add first
if (mndAddVnodeToVgroup(pMnode, pTrans, &newVgroup, pArray) != 0) return -1;
newVgroup.vnodeGid[0].nodeRole = TAOS_SYNC_ROLE_VOTER;
newVgroup.vnodeGid[1].nodeRole = TAOS_SYNC_ROLE_LEARNER;
if (mndAddAlterVnodeReplicaAction(pMnode, pTrans, pNewDb, &newVgroup, newVgroup.vnodeGid[0].dnodeId) != 0)
return -1;
if (mndAddCreateVnodeAction(pMnode, pTrans, pNewDb, &newVgroup, &newVgroup.vnodeGid[1]) != 0) return -1;
newVgroup.vnodeGid[1].nodeRole = TAOS_SYNC_ROLE_VOTER;
if (mndAddAlterVnodeTypeAction(pMnode, pTrans, pNewDb, &newVgroup, newVgroup.vnodeGid[1].dnodeId) != 0)
return -1;
if (mndAddAlterVnodeConfirmAction(pMnode, pTrans, pNewDb, &newVgroup) != 0) return -1;
//add second
if (mndAddVnodeToVgroup(pMnode, pTrans, &newVgroup, pArray) != 0) return -1;
newVgroup.vnodeGid[0].nodeRole = TAOS_SYNC_ROLE_VOTER;
newVgroup.vnodeGid[1].nodeRole = TAOS_SYNC_ROLE_VOTER;
newVgroup.vnodeGid[2].nodeRole = TAOS_SYNC_ROLE_VOTER;
if (mndAddAlterVnodeReplicaAction(pMnode, pTrans, pNewDb, &newVgroup, newVgroup.vnodeGid[0].dnodeId) != 0)
return -1;
if (mndAddAlterVnodeReplicaAction(pMnode, pTrans, pNewDb, &newVgroup, newVgroup.vnodeGid[1].dnodeId) != 0)
return -1;
if (mndAddCreateVnodeAction(pMnode, pTrans, pNewDb, &newVgroup, &newVgroup.vnodeGid[2]) != 0) return -1;
if (mndAddAlterVnodeConfirmAction(pMnode, pTrans, pNewDb, &newVgroup) != 0) return -1;
} else if (newVgroup.replica == 3 && pNewDb->cfg.replications == 1) {
mInfo("db:%s, vgId:%d, will remove 2 vnodes, vn:0 dnode:%d vn:1 dnode:%d vn:2 dnode:%d", pVgroup->dbName,
......
......@@ -68,6 +68,7 @@ void vnodeGetSnapshot(SVnode *pVnode, SSnapshot *pSnapshot);
void vnodeGetInfo(SVnode *pVnode, const char **dbname, int32_t *vgId);
int32_t vnodeProcessCreateTSma(SVnode *pVnode, void *pCont, uint32_t contLen);
int32_t vnodeGetAllTableList(SVnode *pVnode, uint64_t uid, SArray *list);
int32_t vnodeIsCatchUp(SVnode *pVnode);
int32_t vnodeGetCtbIdList(SVnode *pVnode, int64_t suid, SArray *list);
int32_t vnodeGetCtbIdListByFilter(SVnode *pVnode, int64_t suid, SArray *list, bool (*filter)(void *arg), void *arg);
......
......@@ -57,6 +57,28 @@ int vnodeCheckCfg(const SVnodeCfg *pCfg) {
return 0;
}
const char* vnodeRoleToStr(ESyncRole role) {
switch (role) {
case TAOS_SYNC_ROLE_VOTER:
return "voter";
case TAOS_SYNC_ROLE_LEARNER:
return "learner";
default:
return "unknown";
}
}
const ESyncRole vnodeStrToRole(char* str) {
if(strcmp(str, "voter") == 0){
return TAOS_SYNC_ROLE_VOTER;
}
if(strcmp(str, "learner") == 0){
return TAOS_SYNC_ROLE_LEARNER;
}
return TAOS_SYNC_ROLE_ERROR;
}
int vnodeEncodeConfig(const void *pObj, SJson *pJson) {
const SVnodeCfg *pCfg = (SVnodeCfg *)pObj;
......@@ -117,6 +139,7 @@ int vnodeEncodeConfig(const void *pObj, SJson *pJson) {
if (tjsonAddIntegerToObject(pJson, "hashSuffix", pCfg->hashSuffix) < 0) return -1;
if (tjsonAddIntegerToObject(pJson, "syncCfg.replicaNum", pCfg->syncCfg.replicaNum) < 0) return -1;
if (tjsonAddIntegerToObject(pJson, "syncCfg.totalReplicaNum", pCfg->syncCfg.totalReplicaNum) < 0) return -1;
if (tjsonAddIntegerToObject(pJson, "syncCfg.myIndex", pCfg->syncCfg.myIndex) < 0) return -1;
if (tjsonAddIntegerToObject(pJson, "vndStats.stables", pCfg->vndStats.numOfSTables) < 0) return -1;
......@@ -128,9 +151,9 @@ int vnodeEncodeConfig(const void *pObj, SJson *pJson) {
SJson *nodeInfo = tjsonCreateArray();
if (nodeInfo == NULL) return -1;
if (tjsonAddItemToObject(pJson, "syncCfg.nodeInfo", nodeInfo) < 0) return -1;
vDebug("vgId:%d, encode config, replicas:%d selfIndex:%d", pCfg->vgId, pCfg->syncCfg.replicaNum,
pCfg->syncCfg.myIndex);
for (int i = 0; i < pCfg->syncCfg.replicaNum; ++i) {
vDebug("vgId:%d, encode config, replicas:%d totalReplicas:%d selfIndex:%d", pCfg->vgId, pCfg->syncCfg.replicaNum,
pCfg->syncCfg.totalReplicaNum, pCfg->syncCfg.myIndex);
for (int i = 0; i < pCfg->syncCfg.totalReplicaNum; ++i) {
SJson *info = tjsonCreateObject();
SNodeInfo *pNode = (SNodeInfo *)&pCfg->syncCfg.nodeInfo[i];
if (info == NULL) return -1;
......@@ -138,6 +161,7 @@ int vnodeEncodeConfig(const void *pObj, SJson *pJson) {
if (tjsonAddStringToObject(info, "nodeFqdn", pNode->nodeFqdn) < 0) return -1;
if (tjsonAddIntegerToObject(info, "nodeId", pNode->nodeId) < 0) return -1;
if (tjsonAddIntegerToObject(info, "clusterId", pNode->clusterId) < 0) return -1;
if (tjsonAddStringToObject(info, "nodeRole", vnodeRoleToStr(pNode->nodeRole)) < 0) return -1;
if (tjsonAddItemToArray(nodeInfo, info) < 0) return -1;
vDebug("vgId:%d, encode config, replica:%d ep:%s:%u dnode:%d", pCfg->vgId, i, pNode->nodeFqdn, pNode->nodePort,
pNode->nodeId);
......@@ -235,6 +259,8 @@ int vnodeDecodeConfig(const SJson *pJson, void *pObj) {
tjsonGetNumberValue(pJson, "syncCfg.replicaNum", pCfg->syncCfg.replicaNum, code);
if (code < 0) return -1;
tjsonGetNumberValue(pJson, "syncCfg.totalReplicaNum", pCfg->syncCfg.totalReplicaNum, code);
if (code < 0) return -1;
tjsonGetNumberValue(pJson, "syncCfg.myIndex", pCfg->syncCfg.myIndex, code);
if (code < 0) return -1;
......@@ -251,10 +277,13 @@ int vnodeDecodeConfig(const SJson *pJson, void *pObj) {
SJson *nodeInfo = tjsonGetObjectItem(pJson, "syncCfg.nodeInfo");
int arraySize = tjsonGetArraySize(nodeInfo);
if (arraySize != pCfg->syncCfg.replicaNum) return -1;
if(pCfg->syncCfg.totalReplicaNum == 0 && pCfg->syncCfg.replicaNum > 0){
pCfg->syncCfg.totalReplicaNum = pCfg->syncCfg.replicaNum;
}
if (arraySize != pCfg->syncCfg.totalReplicaNum) return -1;
vDebug("vgId:%d, decode config, replicas:%d selfIndex:%d", pCfg->vgId, pCfg->syncCfg.replicaNum,
pCfg->syncCfg.myIndex);
vDebug("vgId:%d, decode config, replicas:%d totalReplicas:%d selfIndex:%d", pCfg->vgId, pCfg->syncCfg.replicaNum,
pCfg->syncCfg.totalReplicaNum, pCfg->syncCfg.myIndex);
for (int i = 0; i < arraySize; ++i) {
SJson *info = tjsonGetArrayItem(nodeInfo, i);
SNodeInfo *pNode = &pCfg->syncCfg.nodeInfo[i];
......@@ -266,6 +295,15 @@ int vnodeDecodeConfig(const SJson *pJson, void *pObj) {
if (code < 0) return -1;
tjsonGetNumberValue(info, "clusterId", pNode->clusterId, code);
if (code < 0) return -1;
char role[10] = {0};
code = tjsonGetStringValue(info, "nodeRole", role);
if (code < 0) return -1;
if(strlen(role) != 0){
pNode->nodeRole = vnodeStrToRole(role);
}
else{
pNode->nodeRole = TAOS_SYNC_ROLE_VOTER;
}
vDebug("vgId:%d, decode config, replica:%d ep:%s:%u dnode:%d", pCfg->vgId, i, pNode->nodeFqdn, pNode->nodePort,
pNode->nodeId);
}
......
......@@ -76,19 +76,41 @@ int32_t vnodeAlterReplica(const char *path, SAlterVnodeReplicaReq *pReq, STfs *p
}
SSyncCfg *pCfg = &info.config.syncCfg;
pCfg->myIndex = pReq->selfIndex;
pCfg->replicaNum = pReq->replica;
pCfg->replicaNum = 0;
pCfg->totalReplicaNum = 0;
memset(&pCfg->nodeInfo, 0, sizeof(pCfg->nodeInfo));
vInfo("vgId:%d, save config while alter, replicas:%d selfIndex:%d", pReq->vgId, pCfg->replicaNum, pCfg->myIndex);
for (int i = 0; i < pReq->replica; ++i) {
SNodeInfo *pNode = &pCfg->nodeInfo[i];
pNode->nodeId = pReq->replicas[i].id;
pNode->nodePort = pReq->replicas[i].port;
tstrncpy(pNode->nodeFqdn, pReq->replicas[i].fqdn, sizeof(pNode->nodeFqdn));
pNode->nodeRole = TAOS_SYNC_ROLE_VOTER;
(void)tmsgUpdateDnodeInfo(&pNode->nodeId, &pNode->clusterId, pNode->nodeFqdn, &pNode->nodePort);
vInfo("vgId:%d, replica:%d ep:%s:%u dnode:%d", pReq->vgId, i, pNode->nodeFqdn, pNode->nodePort, pNode->nodeId);
pCfg->replicaNum++;
}
if(pReq->selfIndex != -1){
pCfg->myIndex = pReq->selfIndex;
}
for (int i = pCfg->replicaNum; i < pReq->replica + pReq->learnerReplica; ++i) {
SNodeInfo *pNode = &pCfg->nodeInfo[i];
pNode->nodeId = pReq->learnerReplicas[pCfg->totalReplicaNum].id;
pNode->nodePort = pReq->learnerReplicas[pCfg->totalReplicaNum].port;
pNode->nodeRole = TAOS_SYNC_ROLE_LEARNER;
tstrncpy(pNode->nodeFqdn, pReq->learnerReplicas[pCfg->totalReplicaNum].fqdn, sizeof(pNode->nodeFqdn));
(void)tmsgUpdateDnodeInfo(&pNode->nodeId, &pNode->clusterId, pNode->nodeFqdn, &pNode->nodePort);
vInfo("vgId:%d, replica:%d ep:%s:%u dnode:%d", pReq->vgId, i, pNode->nodeFqdn, pNode->nodePort, pNode->nodeId);
pCfg->totalReplicaNum++;
}
pCfg->totalReplicaNum += pReq->replica;
if(pReq->learnerSelfIndex != -1){
pCfg->myIndex = pReq->replica + pReq->learnerSelfIndex;
}
vInfo("vgId:%d, save config while alter, replicas:%d totalReplicas:%d selfIndex:%d",
pReq->vgId, pCfg->replicaNum, pCfg->totalReplicaNum, pCfg->myIndex);
info.config.syncCfg = *pCfg;
ret = vnodeSaveInfo(dir, &info);
......@@ -181,6 +203,7 @@ int32_t vnodeAlterHashRange(const char *srcPath, const char *dstPath, SAlterVnod
SSyncCfg *pCfg = &info.config.syncCfg;
pCfg->myIndex = 0;
pCfg->replicaNum = 1;
pCfg->totalReplicaNum = 1;
memset(&pCfg->nodeInfo, 0, sizeof(pCfg->nodeInfo));
vInfo("vgId:%d, alter vnode replicas to 1", pReq->srcVgId);
......@@ -248,7 +271,7 @@ SVnode *vnodeOpen(const char *path, STfs *pTfs, SMsgCb msgCb) {
// save vnode info on dnode ep changed
bool updated = false;
SSyncCfg *pCfg = &info.config.syncCfg;
for (int32_t i = 0; i < pCfg->replicaNum; ++i) {
for (int32_t i = 0; i < pCfg->totalReplicaNum; ++i) {
SNodeInfo *pNode = &pCfg->nodeInfo[i];
if (tmsgUpdateDnodeInfo(&pNode->nodeId, &pNode->clusterId, pNode->nodeFqdn, &pNode->nodePort)) {
updated = true;
......@@ -404,6 +427,10 @@ void vnodeClose(SVnode *pVnode) {
// start the sync timer after the queue is ready
int32_t vnodeStart(SVnode *pVnode) { return vnodeSyncStart(pVnode); }
int32_t vnodeIsCatchUp(SVnode *pVnode){
return syncIsCatchUp(pVnode->sync);
}
void vnodeStop(SVnode *pVnode) {}
int64_t vnodeGetSyncHandle(SVnode *pVnode) { return pVnode->sync; }
......
......@@ -1447,7 +1447,8 @@ int32_t vnodeProcessCreateTSma(SVnode *pVnode, void *pCont, uint32_t contLen) {
}
static int32_t vnodeProcessAlterConfirmReq(SVnode *pVnode, int64_t version, void *pReq, int32_t len, SRpcMsg *pRsp) {
vInfo("vgId:%d, alter replica confim msg is processed", TD_VID(pVnode));
vInfo("vgId:%d, vnode management handle msgType:alter-confirm, alter replica confim msg is processed",
TD_VID(pVnode));
pRsp->msgType = TDMT_VND_ALTER_CONFIRM_RSP;
pRsp->code = TSDB_CODE_SUCCESS;
pRsp->pCont = NULL;
......
......@@ -566,6 +566,19 @@ static void vnodeBecomeFollower(const SSyncFSM *pFsm) {
taosThreadMutexUnlock(&pVnode->lock);
}
static void vnodeBecomeLearner(const SSyncFSM *pFsm) {
SVnode *pVnode = pFsm->data;
vInfo("vgId:%d, become learner", pVnode->config.vgId);
taosThreadMutexLock(&pVnode->lock);
if (pVnode->blocked) {
pVnode->blocked = false;
vDebug("vgId:%d, become learner and post block", pVnode->config.vgId);
tsem_post(&pVnode->syncSem);
}
taosThreadMutexUnlock(&pVnode->lock);
}
static void vnodeBecomeLeader(const SSyncFSM *pFsm) {
SVnode *pVnode = pFsm->data;
vDebug("vgId:%d, become leader", pVnode->config.vgId);
......@@ -607,6 +620,7 @@ static SSyncFSM *vnodeSyncMakeFsm(SVnode *pVnode) {
pFsm->FpApplyQueueItems = vnodeApplyQueueItems;
pFsm->FpBecomeLeaderCb = vnodeBecomeLeader;
pFsm->FpBecomeFollowerCb = vnodeBecomeFollower;
pFsm->FpBecomeLearnerCb = vnodeBecomeLearner;
pFsm->FpReConfigCb = NULL;
pFsm->FpSnapshotStartRead = vnodeSnapshotStartRead;
pFsm->FpSnapshotStopRead = vnodeSnapshotStopRead;
......@@ -639,7 +653,7 @@ int32_t vnodeSyncOpen(SVnode *pVnode, char *path) {
SSyncCfg *pCfg = &syncInfo.syncCfg;
vInfo("vgId:%d, start to open sync, replica:%d selfIndex:%d", pVnode->config.vgId, pCfg->replicaNum, pCfg->myIndex);
for (int32_t i = 0; i < pCfg->replicaNum; ++i) {
for (int32_t i = 0; i < pCfg->totalReplicaNum; ++i) {
SNodeInfo *pNode = &pCfg->nodeInfo[i];
vInfo("vgId:%d, index:%d ep:%s:%u dnode:%d cluster:%" PRId64, pVnode->config.vgId, i, pNode->nodeFqdn, pNode->nodePort,
pNode->nodeId, pNode->clusterId);
......
......@@ -24,12 +24,13 @@ extern "C" {
// SIndexMgr -----------------------------
typedef struct SSyncIndexMgr {
SRaftId (*replicas)[TSDB_MAX_REPLICA];
SyncIndex index[TSDB_MAX_REPLICA];
SyncTerm privateTerm[TSDB_MAX_REPLICA]; // for advanced function
int64_t startTimeArr[TSDB_MAX_REPLICA];
int64_t recvTimeArr[TSDB_MAX_REPLICA];
SRaftId (*replicas)[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA];
SyncIndex index[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA];
SyncTerm privateTerm[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA]; // for advanced function
int64_t startTimeArr[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA];
int64_t recvTimeArr[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA];
int32_t replicaNum;
int32_t totalReplicaNum;
SSyncNode *pNode;
} SSyncIndexMgr;
......
......@@ -54,13 +54,13 @@ typedef struct SSyncLogReplMgr SSyncLogReplMgr;
#define MAX_CONFIG_INDEX_COUNT 256
typedef struct SRaftCfg {
SSyncCfg cfg;
int32_t batchSize;
int8_t isStandBy;
int8_t snapshotStrategy;
SyncIndex lastConfigIndex;
int32_t configIndexCount;
SyncIndex configIndexArr[MAX_CONFIG_INDEX_COUNT];
SSyncCfg cfg;
int32_t batchSize;
int8_t isStandBy;
int8_t snapshotStrategy;
SyncIndex lastConfigIndex;
int32_t configIndexCount;
SyncIndex configIndexArr[MAX_CONFIG_INDEX_COUNT];
} SRaftCfg;
typedef struct SRaftId {
......@@ -127,12 +127,13 @@ typedef struct SSyncNode {
SRaftId myRaftId;
int32_t peersNum;
SNodeInfo peersNodeInfo[TSDB_MAX_REPLICA];
SEpSet peersEpset[TSDB_MAX_REPLICA];
SRaftId peersId[TSDB_MAX_REPLICA];
SNodeInfo peersNodeInfo[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA];
SEpSet peersEpset[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA];
SRaftId peersId[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA];
int32_t replicaNum;
SRaftId replicasId[TSDB_MAX_REPLICA];
int32_t totalReplicaNum;
SRaftId replicasId[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA];
// raft algorithm
SSyncFSM* pFsm;
......@@ -188,7 +189,7 @@ typedef struct SSyncNode {
uint64_t heartbeatTimerCounter;
// peer heartbeat timer
SSyncTimer peerHeartbeatTimerArr[TSDB_MAX_REPLICA];
SSyncTimer peerHeartbeatTimerArr[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA];
// tools
SSyncRespMgr* pSyncRespMgr;
......@@ -196,13 +197,13 @@ typedef struct SSyncNode {
// restore state
bool restoreFinish;
// SSnapshot* pSnapshot;
SSyncSnapshotSender* senders[TSDB_MAX_REPLICA];
SSyncSnapshotSender* senders[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA];
SSyncSnapshotReceiver* pNewNodeReceiver;
// log replication mgr
SSyncLogReplMgr* logReplMgrs[TSDB_MAX_REPLICA];
SSyncLogReplMgr* logReplMgrs[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA];
SPeerState peerStates[TSDB_MAX_REPLICA];
SPeerState peerStates[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA];
// is config changing
bool changing;
......@@ -275,6 +276,7 @@ void syncNodeUpdateTerm(SSyncNode* pSyncNode, SyncTerm term);
void syncNodeUpdateTermWithoutStepDown(SSyncNode* pSyncNode, SyncTerm term);
void syncNodeStepDown(SSyncNode* pSyncNode, SyncTerm newTerm);
void syncNodeBecomeFollower(SSyncNode* pSyncNode, const char* debugStr);
void syncNodeBecomeLearner(SSyncNode* pSyncNode, const char* debugStr);
void syncNodeBecomeLeader(SSyncNode* pSyncNode, const char* debugStr);
void syncNodeCandidate2Leader(SSyncNode* pSyncNode);
void syncNodeFollower2Candidate(SSyncNode* pSyncNode);
......
......@@ -237,6 +237,7 @@ typedef struct SyncLeaderTransfer {
typedef enum {
SYNC_LOCAL_CMD_STEP_DOWN = 100,
SYNC_LOCAL_CMD_FOLLOWER_CMT,
SYNC_LOCAL_CMD_LEARNER_CMT,
} ESyncLocalCmd;
typedef struct SyncLocalCmd {
......
......@@ -56,6 +56,8 @@ typedef struct SSyncLogBuffer {
int64_t size;
TdThreadMutex mutex;
TdThreadMutexAttr attr;
int64_t totalIndex;
bool isCatchup;
} SSyncLogBuffer;
// SSyncLogRepMgr
......
......@@ -137,8 +137,10 @@ int32_t syncNodeOnAppendEntries(SSyncNode* ths, const SRpcMsg* pRpcMsg) {
pReply->term = pMsg->term;
}
syncNodeStepDown(ths, pMsg->term);
resetElect = true;
if(ths->raftCfg.cfg.nodeInfo[ths->raftCfg.cfg.myIndex].nodeRole != TAOS_SYNC_ROLE_LEARNER){
syncNodeStepDown(ths, pMsg->term);
resetElect = true;
}
if (pMsg->dataLen < sizeof(SSyncRaftEntry)) {
sError("vgId:%d, incomplete append entries received. prev index:%" PRId64 ", term:%" PRId64 ", datalen:%d",
......
......@@ -41,6 +41,8 @@ static int32_t syncNodeRequestVotePeers(SSyncNode* pNode) {
int32_t ret = 0;
for (int i = 0; i < pNode->peersNum; ++i) {
if(pNode->peersNodeInfo[i].nodeRole == TAOS_SYNC_ROLE_LEARNER) continue;
SRpcMsg rpcMsg = {0};
ret = syncBuildRequestVote(&rpcMsg, pNode->vgId);
if (ret < 0) {
......
......@@ -26,6 +26,7 @@ SSyncIndexMgr *syncIndexMgrCreate(SSyncNode *pNode) {
pIndexMgr->replicas = &pNode->replicasId;
pIndexMgr->replicaNum = pNode->replicaNum;
pIndexMgr->totalReplicaNum = pNode->totalReplicaNum;
pIndexMgr->pNode = pNode;
syncIndexMgrClear(pIndexMgr);
......@@ -35,6 +36,7 @@ SSyncIndexMgr *syncIndexMgrCreate(SSyncNode *pNode) {
void syncIndexMgrUpdate(SSyncIndexMgr *pIndexMgr, SSyncNode *pNode) {
pIndexMgr->replicas = &pNode->replicasId;
pIndexMgr->replicaNum = pNode->replicaNum;
pIndexMgr->totalReplicaNum = pNode->totalReplicaNum;
pIndexMgr->pNode = pNode;
syncIndexMgrClear(pIndexMgr);
}
......@@ -50,14 +52,14 @@ void syncIndexMgrClear(SSyncIndexMgr *pIndexMgr) {
memset(pIndexMgr->privateTerm, 0, sizeof(pIndexMgr->privateTerm));
int64_t timeNow = taosGetTimestampMs();
for (int i = 0; i < pIndexMgr->replicaNum; ++i) {
for (int i = 0; i < pIndexMgr->totalReplicaNum; ++i) {
pIndexMgr->startTimeArr[i] = 0;
pIndexMgr->recvTimeArr[i] = timeNow;
}
}
void syncIndexMgrSetIndex(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftId, SyncIndex index) {
for (int i = 0; i < pIndexMgr->replicaNum; ++i) {
for (int i = 0; i < pIndexMgr->totalReplicaNum; ++i) {
if (syncUtilSameId(&((*(pIndexMgr->replicas))[i]), pRaftId)) {
(pIndexMgr->index)[i] = index;
return;
......@@ -69,7 +71,7 @@ void syncIndexMgrSetIndex(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftId, Sync
}
SSyncLogReplMgr *syncNodeGetLogReplMgr(SSyncNode *pNode, SRaftId *pRaftId) {
for (int i = 0; i < pNode->replicaNum; i++) {
for (int i = 0; i < pNode->totalReplicaNum; i++) {
if (syncUtilSameId(&pNode->replicasId[i], pRaftId)) {
return pNode->logReplMgrs[i];
}
......@@ -80,7 +82,7 @@ SSyncLogReplMgr *syncNodeGetLogReplMgr(SSyncNode *pNode, SRaftId *pRaftId) {
}
SyncIndex syncIndexMgrGetIndex(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftId) {
for (int i = 0; i < pIndexMgr->replicaNum; ++i) {
for (int i = 0; i < pIndexMgr->totalReplicaNum; ++i) {
if (syncUtilSameId(&((*(pIndexMgr->replicas))[i]), pRaftId)) {
SyncIndex idx = (pIndexMgr->index)[i];
return idx;
......@@ -93,7 +95,7 @@ SyncIndex syncIndexMgrGetIndex(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftId)
}
void syncIndexMgrSetStartTime(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftId, int64_t startTime) {
for (int i = 0; i < pIndexMgr->replicaNum; ++i) {
for (int i = 0; i < pIndexMgr->totalReplicaNum; ++i) {
if (syncUtilSameId(&((*(pIndexMgr->replicas))[i]), pRaftId)) {
(pIndexMgr->startTimeArr)[i] = startTime;
return;
......@@ -105,7 +107,7 @@ void syncIndexMgrSetStartTime(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftId,
}
int64_t syncIndexMgrGetStartTime(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftId) {
for (int i = 0; i < pIndexMgr->replicaNum; ++i) {
for (int i = 0; i < pIndexMgr->totalReplicaNum; ++i) {
if (syncUtilSameId(&((*(pIndexMgr->replicas))[i]), pRaftId)) {
int64_t startTime = (pIndexMgr->startTimeArr)[i];
return startTime;
......@@ -118,7 +120,7 @@ int64_t syncIndexMgrGetStartTime(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftI
}
void syncIndexMgrSetRecvTime(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftId, int64_t recvTime) {
for (int i = 0; i < pIndexMgr->replicaNum; ++i) {
for (int i = 0; i < pIndexMgr->totalReplicaNum; ++i) {
if (syncUtilSameId(&((*(pIndexMgr->replicas))[i]), pRaftId)) {
(pIndexMgr->recvTimeArr)[i] = recvTime;
return;
......@@ -130,7 +132,7 @@ void syncIndexMgrSetRecvTime(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftId, i
}
int64_t syncIndexMgrGetRecvTime(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftId) {
for (int i = 0; i < pIndexMgr->replicaNum; ++i) {
for (int i = 0; i < pIndexMgr->totalReplicaNum; ++i) {
if (syncUtilSameId(&((*(pIndexMgr->replicas))[i]), pRaftId)) {
int64_t recvTime = (pIndexMgr->recvTimeArr)[i];
return recvTime;
......@@ -143,7 +145,7 @@ int64_t syncIndexMgrGetRecvTime(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftId
}
void syncIndexMgrSetTerm(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftId, SyncTerm term) {
for (int i = 0; i < pIndexMgr->replicaNum; ++i) {
for (int i = 0; i < pIndexMgr->totalReplicaNum; ++i) {
if (syncUtilSameId(&((*(pIndexMgr->replicas))[i]), pRaftId)) {
(pIndexMgr->privateTerm)[i] = term;
return;
......@@ -155,7 +157,7 @@ void syncIndexMgrSetTerm(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftId, SyncT
}
SyncTerm syncIndexMgrGetTerm(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftId) {
for (int i = 0; i < pIndexMgr->replicaNum; ++i) {
for (int i = 0; i < pIndexMgr->totalReplicaNum; ++i) {
if (syncUtilSameId(&((*(pIndexMgr->replicas))[i]), pRaftId)) {
SyncTerm term = (pIndexMgr->privateTerm)[i];
return term;
......
......@@ -141,6 +141,13 @@ int32_t syncReconfig(int64_t rid, SSyncCfg* pNewCfg) {
SSyncNode* pSyncNode = syncNodeAcquire(rid);
if (pSyncNode == NULL) return -1;
if(pSyncNode->raftCfg.lastConfigIndex >= pNewCfg->lastIndex){
syncNodeRelease(pSyncNode);
sInfo("vgId:%d, no need Reconfig, current index:%" PRId64 ", new index:%" PRId64, pSyncNode->vgId,
pSyncNode->raftCfg.lastConfigIndex, pNewCfg->lastIndex);
return 0;
}
if (!syncNodeCheckNewConfig(pSyncNode, pNewCfg)) {
syncNodeRelease(pSyncNode);
terrno = TSDB_CODE_SYN_NEW_CONFIG_ERROR;
......@@ -149,12 +156,12 @@ int32_t syncReconfig(int64_t rid, SSyncCfg* pNewCfg) {
}
syncNodeUpdateNewConfigIndex(pSyncNode, pNewCfg);
syncNodeDoConfigChange(pSyncNode, pNewCfg, SYNC_INDEX_INVALID);
syncNodeDoConfigChange(pSyncNode, pNewCfg, pNewCfg->lastIndex);
if (pSyncNode->state == TAOS_SYNC_STATE_LEADER) {
syncNodeStopHeartbeatTimer(pSyncNode);
for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
for (int32_t i = 0; i < TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA; ++i) {
syncHbTimerInit(pSyncNode, &pSyncNode->peerHeartbeatTimerArr[i], pSyncNode->replicasId[i]);
}
......@@ -315,8 +322,9 @@ int32_t syncBeginSnapshot(int64_t rid, int64_t lastApplyIndex) {
}
}
if (pSyncNode->replicaNum > 1) {
if (pSyncNode->state != TAOS_SYNC_STATE_LEADER && pSyncNode->state != TAOS_SYNC_STATE_FOLLOWER) {
if (pSyncNode->totalReplicaNum > 1) {
if (pSyncNode->state != TAOS_SYNC_STATE_LEADER && pSyncNode->state != TAOS_SYNC_STATE_FOLLOWER
&& pSyncNode->state != TAOS_SYNC_STATE_LEARNER) {
sNTrace(pSyncNode, "new-snapshot-index:%" PRId64 " candidate or unknown state, do not delete wal",
lastApplyIndex);
syncNodeRelease(pSyncNode);
......@@ -537,7 +545,8 @@ void syncGetRetryEpSet(int64_t rid, SEpSet* pEpSet) {
SSyncNode* pSyncNode = syncNodeAcquire(rid);
if (pSyncNode == NULL) return;
for (int32_t i = 0; i < pSyncNode->raftCfg.cfg.replicaNum; ++i) {
for (int32_t i = 0; i < pSyncNode->raftCfg.cfg.totalReplicaNum; ++i) {
if(pSyncNode->raftCfg.cfg.nodeInfo[i].nodeRole == TAOS_SYNC_ROLE_LEARNER) continue;
SEp* pEp = &pEpSet->eps[i];
tstrncpy(pEp->fqdn, pSyncNode->raftCfg.cfg.nodeInfo[i].nodeFqdn, TSDB_FQDN_LEN);
pEp->port = (pSyncNode->raftCfg.cfg.nodeInfo)[i].nodePort;
......@@ -564,6 +573,34 @@ int32_t syncPropose(int64_t rid, SRpcMsg* pMsg, bool isWeak, int64_t* seq) {
return ret;
}
int32_t syncIsCatchUp(int64_t rid) {
SSyncNode* pSyncNode = syncNodeAcquire(rid);
if (pSyncNode == NULL) {
sError("sync Node Acquire error since %d", errno);
return -1;
}
while(1){
if(pSyncNode->pLogBuf->totalIndex < 0 || pSyncNode->pLogBuf->commitIndex < 0 ||
pSyncNode->pLogBuf->totalIndex < pSyncNode->pLogBuf->commitIndex ||
pSyncNode->pLogBuf->totalIndex - pSyncNode->pLogBuf->commitIndex > SYNC_LEARNER_CATCHUP){
sInfo("vgId:%d, Not catch up, wait one second, totalIndex:%" PRId64 " commitIndex:%" PRId64 " matchIndex:%" PRId64,
pSyncNode->vgId, pSyncNode->pLogBuf->totalIndex, pSyncNode->pLogBuf->commitIndex,
pSyncNode->pLogBuf->matchIndex);
taosSsleep(1);
}
else{
sInfo("vgId:%d, Catch up, totalIndex:%" PRId64 " commitIndex:%" PRId64 " matchIndex:%" PRId64,
pSyncNode->vgId, pSyncNode->pLogBuf->totalIndex, pSyncNode->pLogBuf->commitIndex,
pSyncNode->pLogBuf->matchIndex);
break;
}
}
syncNodeRelease(pSyncNode);
return 0;
}
int32_t syncNodePropose(SSyncNode* pSyncNode, SRpcMsg* pMsg, bool isWeak, int64_t* seq) {
if (pSyncNode->state != TAOS_SYNC_STATE_LEADER) {
terrno = TSDB_CODE_SYN_NOT_LEADER;
......@@ -655,6 +692,8 @@ static int32_t syncHbTimerStart(SSyncNode* pSyncNode, SSyncTimer* pSyncTimer) {
pData->logicClock = pSyncTimer->logicClock;
pData->execTime = tsNow + pSyncTimer->timerMS;
sTrace("vgId:%d, start hb timer, rid:%" PRId64 " addr:%" PRId64, pSyncNode->vgId, pData->rid, pData->destId.addr);
taosTmrReset(pSyncTimer->timerCb, pSyncTimer->timerMS / HEARTBEAT_TICK_NUM, (void*)(pData->rid),
syncEnv()->pTimerManager, &pSyncTimer->pTimer);
} else {
......@@ -719,7 +758,7 @@ SSyncNode* syncNodeOpen(SSyncInfo* pSyncInfo) {
sInfo("vgId:%d, create a new raft config file", pSyncNode->vgId);
pSyncNode->raftCfg.isStandBy = pSyncInfo->isStandBy;
pSyncNode->raftCfg.snapshotStrategy = pSyncInfo->snapshotStrategy;
pSyncNode->raftCfg.lastConfigIndex = SYNC_INDEX_INVALID;
pSyncNode->raftCfg.lastConfigIndex = pSyncInfo->syncCfg.lastIndex;
pSyncNode->raftCfg.batchSize = pSyncInfo->batchSize;
pSyncNode->raftCfg.cfg = pSyncInfo->syncCfg;
pSyncNode->raftCfg.configIndexCount = 1;
......@@ -736,7 +775,7 @@ SSyncNode* syncNodeOpen(SSyncInfo* pSyncInfo) {
goto _error;
}
if (pSyncInfo->syncCfg.replicaNum > 0 && syncIsConfigChanged(&pSyncNode->raftCfg.cfg, &pSyncInfo->syncCfg)) {
if (pSyncInfo->syncCfg.totalReplicaNum > 0 && syncIsConfigChanged(&pSyncNode->raftCfg.cfg, &pSyncInfo->syncCfg)) {
sInfo("vgId:%d, use sync config from input options and write to cfg file", pSyncNode->vgId);
pSyncNode->raftCfg.cfg = pSyncInfo->syncCfg;
if (syncWriteCfgFile(pSyncNode) != 0) {
......@@ -753,8 +792,9 @@ SSyncNode* syncNodeOpen(SSyncInfo* pSyncInfo) {
pSyncNode->vgId = pSyncInfo->vgId;
SSyncCfg* pCfg = &pSyncNode->raftCfg.cfg;
bool updated = false;
sInfo("vgId:%d, start to open sync node, replica:%d selfIndex:%d", pSyncNode->vgId, pCfg->replicaNum, pCfg->myIndex);
for (int32_t i = 0; i < pCfg->replicaNum; ++i) {
sInfo("vgId:%d, start to open sync node, totalReplicaNum:%d replicaNum:%d selfIndex:%d",
pSyncNode->vgId, pCfg->totalReplicaNum, pCfg->replicaNum, pCfg->myIndex);
for (int32_t i = 0; i < pCfg->totalReplicaNum; ++i) {
SNodeInfo* pNode = &pCfg->nodeInfo[i];
if (tmsgUpdateDnodeInfo(&pNode->nodeId, &pNode->clusterId, pNode->nodeFqdn, &pNode->nodePort)) {
updated = true;
......@@ -792,9 +832,9 @@ SSyncNode* syncNodeOpen(SSyncInfo* pSyncInfo) {
}
// init peersNum, peers, peersId
pSyncNode->peersNum = pSyncNode->raftCfg.cfg.replicaNum - 1;
pSyncNode->peersNum = pSyncNode->raftCfg.cfg.totalReplicaNum - 1;
int32_t j = 0;
for (int32_t i = 0; i < pSyncNode->raftCfg.cfg.replicaNum; ++i) {
for (int32_t i = 0; i < pSyncNode->raftCfg.cfg.totalReplicaNum; ++i) {
if (i != pSyncNode->raftCfg.cfg.myIndex) {
pSyncNode->peersNodeInfo[j] = pSyncNode->raftCfg.cfg.nodeInfo[i];
syncUtilNodeInfo2EpSet(&pSyncNode->peersNodeInfo[j], &pSyncNode->peersEpset[j]);
......@@ -810,7 +850,8 @@ SSyncNode* syncNodeOpen(SSyncInfo* pSyncInfo) {
// init replicaNum, replicasId
pSyncNode->replicaNum = pSyncNode->raftCfg.cfg.replicaNum;
for (int32_t i = 0; i < pSyncNode->raftCfg.cfg.replicaNum; ++i) {
pSyncNode->totalReplicaNum = pSyncNode->raftCfg.cfg.totalReplicaNum;
for (int32_t i = 0; i < pSyncNode->raftCfg.cfg.totalReplicaNum; ++i) {
if (!syncUtilNodeInfo2RaftId(&pSyncNode->raftCfg.cfg.nodeInfo[i], pSyncNode->vgId, &pSyncNode->replicasId[i])) {
sError("vgId:%d, failed to determine raft member id, replica:%d", pSyncNode->vgId, i);
goto _error;
......@@ -934,7 +975,7 @@ SSyncNode* syncNodeOpen(SSyncInfo* pSyncInfo) {
pSyncNode->heartbeatTimerCounter = 0;
// init peer heartbeat timer
for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
for (int32_t i = 0; i < TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA; ++i) {
syncHbTimerInit(pSyncNode, &(pSyncNode->peerHeartbeatTimerArr[i]), (pSyncNode->replicasId)[i]);
}
......@@ -949,7 +990,7 @@ SSyncNode* syncNodeOpen(SSyncInfo* pSyncInfo) {
pSyncNode->restoreFinish = false;
// snapshot senders
for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
for (int32_t i = 0; i < TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA; ++i) {
SSyncSnapshotSender* pSender = snapshotSenderCreate(pSyncNode, i);
if (pSender == NULL) return NULL;
......@@ -1059,14 +1100,19 @@ int32_t syncNodeRestore(SSyncNode* pSyncNode) {
int32_t syncNodeStart(SSyncNode* pSyncNode) {
// start raft
if (pSyncNode->replicaNum == 1) {
raftStoreNextTerm(pSyncNode);
syncNodeBecomeLeader(pSyncNode, "one replica start");
if(pSyncNode->raftCfg.cfg.nodeInfo[pSyncNode->raftCfg.cfg.myIndex].nodeRole == TAOS_SYNC_ROLE_LEARNER){
syncNodeBecomeLearner(pSyncNode, "first start");
}
else{
if (pSyncNode->replicaNum == 1) {
raftStoreNextTerm(pSyncNode);
syncNodeBecomeLeader(pSyncNode, "one replica start");
// Raft 3.6.2 Committing entries from previous terms
syncNodeAppendNoop(pSyncNode);
} else {
syncNodeBecomeFollower(pSyncNode, "first start");
// Raft 3.6.2 Committing entries from previous terms
syncNodeAppendNoop(pSyncNode);
} else {
syncNodeBecomeFollower(pSyncNode, "first start");
}
}
int32_t ret = 0;
......@@ -1157,7 +1203,7 @@ void syncNodeClose(SSyncNode* pSyncNode) {
syncLogBufferDestroy(pSyncNode->pLogBuf);
pSyncNode->pLogBuf = NULL;
for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
for (int32_t i = 0; i < TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA; ++i) {
if (pSyncNode->senders[i] != NULL) {
sDebug("vgId:%d, snapshot sender destroy while close, data:%p", pSyncNode->vgId, pSyncNode->senders[i]);
......@@ -1350,7 +1396,7 @@ inline bool syncNodeInConfig(SSyncNode* pNode, const SSyncCfg* pCfg) {
bool b1 = false;
bool b2 = false;
for (int32_t i = 0; i < pCfg->replicaNum; ++i) {
for (int32_t i = 0; i < pCfg->totalReplicaNum; ++i) {
if (strcmp(pCfg->nodeInfo[i].nodeFqdn, pNode->myNodeInfo.nodeFqdn) == 0 &&
pCfg->nodeInfo[i].nodePort == pNode->myNodeInfo.nodePort) {
b1 = true;
......@@ -1358,7 +1404,7 @@ inline bool syncNodeInConfig(SSyncNode* pNode, const SSyncCfg* pCfg) {
}
}
for (int32_t i = 0; i < pCfg->replicaNum; ++i) {
for (int32_t i = 0; i < pCfg->totalReplicaNum; ++i) {
SRaftId raftId = {
.addr = SYNC_ADDR(&pCfg->nodeInfo[i]),
.vgId = pNode->vgId,
......@@ -1375,13 +1421,14 @@ inline bool syncNodeInConfig(SSyncNode* pNode, const SSyncCfg* pCfg) {
}
static bool syncIsConfigChanged(const SSyncCfg* pOldCfg, const SSyncCfg* pNewCfg) {
if (pOldCfg->replicaNum != pNewCfg->replicaNum) return true;
if (pOldCfg->totalReplicaNum != pNewCfg->totalReplicaNum) return true;
if (pOldCfg->myIndex != pNewCfg->myIndex) return true;
for (int32_t i = 0; i < pOldCfg->replicaNum; ++i) {
for (int32_t i = 0; i < pOldCfg->totalReplicaNum; ++i) {
const SNodeInfo* pOldInfo = &pOldCfg->nodeInfo[i];
const SNodeInfo* pNewInfo = &pNewCfg->nodeInfo[i];
if (strcmp(pOldInfo->nodeFqdn, pNewInfo->nodeFqdn) != 0) return true;
if (pOldInfo->nodePort != pNewInfo->nodePort) return true;
if(pOldInfo->nodeRole != pNewInfo->nodeRole) return true;
}
return false;
......@@ -1418,8 +1465,10 @@ void syncNodeDoConfigChange(SSyncNode* pSyncNode, SSyncCfg* pNewConfig, SyncInde
}
// log begin config change
sNInfo(pSyncNode, "begin do config change, from %d to %d, replicas:%d", pSyncNode->vgId, oldConfig.replicaNum,
pNewConfig->replicaNum);
sNInfo(pSyncNode, "begin do config change, from %d to %d, from %" PRId64 " to %" PRId64 ", replicas:%d",
pSyncNode->vgId,
oldConfig.totalReplicaNum, pNewConfig->totalReplicaNum,
oldConfig.lastIndex, pNewConfig->lastIndex);
if (IamInNew) {
pSyncNode->raftCfg.isStandBy = 0; // change isStandBy to normal
......@@ -1436,11 +1485,10 @@ void syncNodeDoConfigChange(SSyncNode* pSyncNode, SSyncCfg* pNewConfig, SyncInde
int32_t ret = 0;
// save snapshot senders
int32_t oldReplicaNum = pSyncNode->replicaNum;
SRaftId oldReplicasId[TSDB_MAX_REPLICA];
SRaftId oldReplicasId[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA];
memcpy(oldReplicasId, pSyncNode->replicasId, sizeof(oldReplicasId));
SSyncSnapshotSender* oldSenders[TSDB_MAX_REPLICA];
for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
SSyncSnapshotSender* oldSenders[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA];
for (int32_t i = 0; i < TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA; ++i) {
oldSenders[i] = pSyncNode->senders[i];
sSTrace(oldSenders[i], "snapshot sender save old");
}
......@@ -1450,9 +1498,9 @@ void syncNodeDoConfigChange(SSyncNode* pSyncNode, SSyncCfg* pNewConfig, SyncInde
syncUtilNodeInfo2RaftId(&pSyncNode->myNodeInfo, pSyncNode->vgId, &pSyncNode->myRaftId);
// init peersNum, peers, peersId
pSyncNode->peersNum = pSyncNode->raftCfg.cfg.replicaNum - 1;
pSyncNode->peersNum = pSyncNode->raftCfg.cfg.totalReplicaNum - 1;
int32_t j = 0;
for (int32_t i = 0; i < pSyncNode->raftCfg.cfg.replicaNum; ++i) {
for (int32_t i = 0; i < pSyncNode->raftCfg.cfg.totalReplicaNum; ++i) {
if (i != pSyncNode->raftCfg.cfg.myIndex) {
pSyncNode->peersNodeInfo[j] = pSyncNode->raftCfg.cfg.nodeInfo[i];
syncUtilNodeInfo2EpSet(&pSyncNode->peersNodeInfo[j], &pSyncNode->peersEpset[j]);
......@@ -1465,7 +1513,8 @@ void syncNodeDoConfigChange(SSyncNode* pSyncNode, SSyncCfg* pNewConfig, SyncInde
// init replicaNum, replicasId
pSyncNode->replicaNum = pSyncNode->raftCfg.cfg.replicaNum;
for (int32_t i = 0; i < pSyncNode->raftCfg.cfg.replicaNum; ++i) {
pSyncNode->totalReplicaNum = pSyncNode->raftCfg.cfg.totalReplicaNum;
for (int32_t i = 0; i < pSyncNode->raftCfg.cfg.totalReplicaNum; ++i) {
syncUtilNodeInfo2RaftId(&pSyncNode->raftCfg.cfg.nodeInfo[i], pSyncNode->vgId, &pSyncNode->replicasId[i]);
}
......@@ -1480,15 +1529,15 @@ void syncNodeDoConfigChange(SSyncNode* pSyncNode, SSyncCfg* pNewConfig, SyncInde
// reset snapshot senders
// clear new
for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
for (int32_t i = 0; i < TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA; ++i) {
pSyncNode->senders[i] = NULL;
}
// reset new
for (int32_t i = 0; i < pSyncNode->replicaNum; ++i) {
for (int32_t i = 0; i < pSyncNode->totalReplicaNum; ++i) {
// reset sender
bool reset = false;
for (int32_t j = 0; j < TSDB_MAX_REPLICA; ++j) {
for (int32_t j = 0; j < TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA; ++j) {
if (syncUtilSameId(&(pSyncNode->replicasId)[i], &oldReplicasId[j]) && oldSenders[j] != NULL) {
sNTrace(pSyncNode, "snapshot sender reset for:%" PRId64 ", newIndex:%d, dnode:%d, %p",
(pSyncNode->replicasId)[i].addr, i, DID(&pSyncNode->replicasId[i]), oldSenders[j]);
......@@ -1510,7 +1559,7 @@ void syncNodeDoConfigChange(SSyncNode* pSyncNode, SSyncCfg* pNewConfig, SyncInde
}
// create new
for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
for (int32_t i = 0; i < TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA; ++i) {
if (pSyncNode->senders[i] == NULL) {
pSyncNode->senders[i] = snapshotSenderCreate(pSyncNode, i);
if (pSyncNode->senders[i] == NULL) {
......@@ -1525,7 +1574,7 @@ void syncNodeDoConfigChange(SSyncNode* pSyncNode, SSyncCfg* pNewConfig, SyncInde
}
// free old
for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
for (int32_t i = 0; i < TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA; ++i) {
if (oldSenders[i] != NULL) {
sSDebug(oldSenders[i], "snapshot sender destroy old, data:%p replica-index:%d", oldSenders[i], i);
snapshotSenderDestroy(oldSenders[i]);
......@@ -1550,12 +1599,12 @@ void syncNodeDoConfigChange(SSyncNode* pSyncNode, SSyncCfg* pNewConfig, SyncInde
} else {
// persist cfg
syncWriteCfgFile(pSyncNode);
sNInfo(pSyncNode, "do not config change from %d to %d", oldConfig.replicaNum, pNewConfig->replicaNum);
sNInfo(pSyncNode, "do not config change from %d to %d", oldConfig.totalReplicaNum, pNewConfig->totalReplicaNum);
}
_END:
// log end config change
sNInfo(pSyncNode, "end do config change, from %d to %d", oldConfig.replicaNum, pNewConfig->replicaNum);
sNInfo(pSyncNode, "end do config change, from %d to %d", oldConfig.totalReplicaNum, pNewConfig->totalReplicaNum);
}
// raft state change --------------
......@@ -1635,6 +1684,27 @@ void syncNodeBecomeFollower(SSyncNode* pSyncNode, const char* debugStr) {
syncNodeResetElectTimer(pSyncNode);
}
void syncNodeBecomeLearner(SSyncNode* pSyncNode, const char* debugStr) {
pSyncNode->hbSlowNum = 0;
// state change
pSyncNode->state = TAOS_SYNC_STATE_LEARNER;
// trace log
sNTrace(pSyncNode, "become learner %s", debugStr);
// call back
if (pSyncNode->pFsm != NULL && pSyncNode->pFsm->FpBecomeLearnerCb != NULL) {
pSyncNode->pFsm->FpBecomeLearnerCb(pSyncNode->pFsm);
}
// min match index
pSyncNode->minMatchIndex = SYNC_INDEX_INVALID;
// reset log buffer
syncLogBufferReset(pSyncNode->pLogBuf, pSyncNode);
}
// TLA+ Spec
// \* Candidate i transitions to leader.
// BecomeLeader(i) ==
......@@ -1752,7 +1822,7 @@ void syncNodeCandidate2Leader(SSyncNode* pSyncNode) {
bool syncNodeIsMnode(SSyncNode* pSyncNode) { return (pSyncNode->vgId == 1); }
int32_t syncNodePeerStateInit(SSyncNode* pSyncNode) {
for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
for (int32_t i = 0; i < TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA; ++i) {
pSyncNode->peerStates[i].lastSendIndex = SYNC_INDEX_INVALID;
pSyncNode->peerStates[i].lastSendTime = 0;
}
......@@ -2039,7 +2109,7 @@ static void syncNodeEqHeartbeatTimer(void* param, void* tmrId) {
if (!syncIsInit()) return;
SSyncNode* pNode = param;
if (pNode->replicaNum > 1) {
if (pNode->totalReplicaNum > 1) {
if (atomic_load_64(&pNode->heartbeatTimerLogicClockUser) <= atomic_load_64(&pNode->heartbeatTimerLogicClock)) {
SRpcMsg rpcMsg = {0};
int32_t code = syncBuildTimeout(&rpcMsg, SYNC_TIMEOUT_HEARTBEAT, atomic_load_64(&pNode->heartbeatTimerLogicClock),
......@@ -2074,7 +2144,7 @@ static void syncNodeEqPeerHeartbeatTimer(void* param, void* tmrId) {
SSyncHbTimerData* pData = syncHbTimerDataAcquire(hbDataRid);
if (pData == NULL) {
sError("hb timer get pData NULL, %" PRId64, hbDataRid);
sError("hb timer get pData NULL, rid:%" PRId64 " addr:%" PRId64, hbDataRid, pData->destId.addr);
return;
}
......@@ -2101,9 +2171,9 @@ static void syncNodeEqPeerHeartbeatTimer(void* param, void* tmrId) {
return;
}
// sTrace("vgId:%d, eq peer hb timer", pSyncNode->vgId);
sTrace("vgId:%d, eq peer hb timer, rid:%" PRId64 " addr:%" PRId64, pSyncNode->vgId, hbDataRid, pData->destId.addr);
if (pSyncNode->replicaNum > 1) {
if (pSyncNode->totalReplicaNum > 1) {
int64_t timerLogicClock = atomic_load_64(&pSyncTimer->logicClock);
int64_t msgLogicClock = atomic_load_64(&pData->logicClock);
......@@ -2130,6 +2200,7 @@ static void syncNodeEqPeerHeartbeatTimer(void* param, void* tmrId) {
pSyncTimer->timeStamp = tsNow;
// send msg
sTrace("vgId:%d, send heartbeat to dnode:%d", pSyncNode->vgId, DID(&(pSyncMsg->destId)));
syncLogSendHeartbeat(pSyncNode, pSyncMsg, false, timerElapsed, pData->execTime);
syncNodeSendHeartbeat(pSyncNode, &pSyncMsg->destId, &rpcMsg);
} else {
......@@ -2212,7 +2283,7 @@ int32_t syncNodeAppend(SSyncNode* ths, SSyncRaftEntry* pEntry) {
}
bool syncNodeHeartbeatReplyTimeout(SSyncNode* pSyncNode) {
if (pSyncNode->replicaNum == 1) {
if (pSyncNode->totalReplicaNum == 1) {
return false;
}
......@@ -2237,7 +2308,7 @@ bool syncNodeHeartbeatReplyTimeout(SSyncNode* pSyncNode) {
bool syncNodeSnapshotSending(SSyncNode* pSyncNode) {
if (pSyncNode == NULL) return false;
bool b = false;
for (int32_t i = 0; i < pSyncNode->replicaNum; ++i) {
for (int32_t i = 0; i < pSyncNode->totalReplicaNum; ++i) {
if (pSyncNode->senders[i] != NULL && pSyncNode->senders[i]->start) {
b = true;
break;
......@@ -2328,13 +2399,17 @@ int32_t syncNodeOnHeartbeat(SSyncNode* ths, const SRpcMsg* pRpcMsg) {
pMsgReply->startTime = ths->startTime;
pMsgReply->timeStamp = tsMs;
sTrace(
"vgId:%d, heartbeat msg from dnode:%d, cluster:%d, Msgterm:%" PRId64 " currentTerm:%" PRId64,
ths->vgId, DID(&(pMsg->srcId)), CID(&(pMsg->srcId)), pMsg->term, currentTerm);
if (pMsg->term == currentTerm && ths->state != TAOS_SYNC_STATE_LEADER) {
syncIndexMgrSetRecvTime(ths->pNextIndex, &(pMsg->srcId), tsMs);
resetElect = true;
ths->minMatchIndex = pMsg->minMatchIndex;
if (ths->state == TAOS_SYNC_STATE_FOLLOWER) {
if (ths->state == TAOS_SYNC_STATE_FOLLOWER || ths->state == TAOS_SYNC_STATE_LEARNER) {
SRpcMsg rpcMsgLocalCmd = {0};
(void)syncBuildLocalCmd(&rpcMsgLocalCmd, ths->vgId);
......@@ -2356,7 +2431,7 @@ int32_t syncNodeOnHeartbeat(SSyncNode* ths, const SRpcMsg* pRpcMsg) {
}
}
if (pMsg->term >= currentTerm && ths->state != TAOS_SYNC_STATE_FOLLOWER) {
if (pMsg->term >= currentTerm && ths->state == TAOS_SYNC_STATE_LEADER) {
SRpcMsg rpcMsgLocalCmd = {0};
(void)syncBuildLocalCmd(&rpcMsgLocalCmd, ths->vgId);
......@@ -2376,6 +2451,26 @@ int32_t syncNodeOnHeartbeat(SSyncNode* ths, const SRpcMsg* pRpcMsg) {
}
}
if (pMsg->term >= currentTerm && ths->state == TAOS_SYNC_STATE_LEARNER) {
SRpcMsg rpcMsgLocalCmd = {0};
(void)syncBuildLocalCmd(&rpcMsgLocalCmd, ths->vgId);
SyncLocalCmd* pSyncMsg = rpcMsgLocalCmd.pCont;
pSyncMsg->cmd = SYNC_LOCAL_CMD_LEARNER_CMT;
pSyncMsg->currentTerm = pMsg->term;
pSyncMsg->commitIndex = pMsg->commitIndex;
if (ths->syncEqMsg != NULL && ths->msgcb != NULL) {
int32_t code = ths->syncEqMsg(ths->msgcb, &rpcMsgLocalCmd);
if (code != 0) {
sError("vgId:%d, sync enqueue step-down msg error, code:%d", ths->vgId, code);
rpcFreeCont(rpcMsgLocalCmd.pCont);
} else {
sTrace("vgId:%d, sync enqueue step-down msg, new-term:%" PRId64, ths->vgId, pSyncMsg->currentTerm);
}
}
}
// reply
syncNodeSendMsgById(&pMsgReply->destId, ths, &rpcMsg);
......@@ -2439,7 +2534,20 @@ int32_t syncNodeOnLocalCmd(SSyncNode* ths, const SRpcMsg* pRpcMsg) {
sError("vgId:%d, failed to commit raft log since %s. commit index:%" PRId64 "", ths->vgId, terrstr(),
ths->commitIndex);
}
} else {
} else if (pMsg->cmd == SYNC_LOCAL_CMD_LEARNER_CMT){
if (syncLogBufferIsEmpty(ths->pLogBuf)) {
sError("vgId:%d, sync log buffer is empty.", ths->vgId);
return 0;
}
raftStoreSetTerm(ths, pMsg->currentTerm);
(void)syncNodeUpdateCommitIndex(ths, pMsg->commitIndex);
sTrace("vgId:%d, start to commit raft log in heartbeat. commit index:%" PRId64 "", ths->vgId, ths->commitIndex);
if (syncLogBufferCommit(ths->pLogBuf, ths, ths->commitIndex) < 0) {
sError("vgId:%d, failed to commit raft log since %s. commit index:%" PRId64 "", ths->vgId, terrstr(),
ths->commitIndex);
}
}
else {
sError("error local cmd");
}
......@@ -2502,13 +2610,15 @@ const char* syncStr(ESyncState state) {
return "error";
case TAOS_SYNC_STATE_OFFLINE:
return "offline";
case TAOS_SYNC_STATE_LEARNER:
return "learner";
default:
return "unknown";
}
}
int32_t syncNodeUpdateNewConfigIndex(SSyncNode* ths, SSyncCfg* pNewCfg) {
for (int32_t i = 0; i < pNewCfg->replicaNum; ++i) {
for (int32_t i = 0; i < pNewCfg->totalReplicaNum; ++i) {
SRaftId raftId = {
.addr = SYNC_ADDR(&pNewCfg->nodeInfo[i]),
.vgId = ths->vgId,
......@@ -2528,7 +2638,7 @@ bool syncNodeIsOptimizedOneReplica(SSyncNode* ths, SRpcMsg* pMsg) {
}
bool syncNodeInRaftGroup(SSyncNode* ths, SRaftId* pRaftId) {
for (int32_t i = 0; i < ths->replicaNum; ++i) {
for (int32_t i = 0; i < ths->totalReplicaNum; ++i) {
if (syncUtilSameId(&((ths->replicasId)[i]), pRaftId)) {
return true;
}
......@@ -2538,7 +2648,7 @@ bool syncNodeInRaftGroup(SSyncNode* ths, SRaftId* pRaftId) {
SSyncSnapshotSender* syncNodeGetSnapshotSender(SSyncNode* ths, SRaftId* pDestId) {
SSyncSnapshotSender* pSender = NULL;
for (int32_t i = 0; i < ths->replicaNum; ++i) {
for (int32_t i = 0; i < ths->totalReplicaNum; ++i) {
if (syncUtilSameId(pDestId, &((ths->replicasId)[i]))) {
pSender = (ths->senders)[i];
}
......@@ -2548,7 +2658,7 @@ SSyncSnapshotSender* syncNodeGetSnapshotSender(SSyncNode* ths, SRaftId* pDestId)
SSyncTimer* syncNodeGetHbTimer(SSyncNode* ths, SRaftId* pDestId) {
SSyncTimer* pTimer = NULL;
for (int32_t i = 0; i < ths->replicaNum; ++i) {
for (int32_t i = 0; i < ths->totalReplicaNum; ++i) {
if (syncUtilSameId(pDestId, &((ths->replicasId)[i]))) {
pTimer = &((ths->peerHeartbeatTimerArr)[i]);
}
......@@ -2558,7 +2668,7 @@ SSyncTimer* syncNodeGetHbTimer(SSyncNode* ths, SRaftId* pDestId) {
SPeerState* syncNodeGetPeerState(SSyncNode* ths, const SRaftId* pDestId) {
SPeerState* pState = NULL;
for (int32_t i = 0; i < ths->replicaNum; ++i) {
for (int32_t i = 0; i < ths->totalReplicaNum; ++i) {
if (syncUtilSameId(pDestId, &((ths->replicasId)[i]))) {
pState = &((ths->peerStates)[i]);
}
......
......@@ -250,6 +250,8 @@ int32_t syncLogBufferInitWithoutLock(SSyncLogBuffer* pBuf, SSyncNode* pNode) {
// update startIndex
pBuf->startIndex = takeDummy ? index : index + 1;
pBuf->isCatchup = false;
sInfo("vgId:%d, init sync log buffer. buffer: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", pNode->vgId,
pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex);
......@@ -332,6 +334,15 @@ int32_t syncLogBufferAccept(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEnt
goto _out;
}
if(pNode->raftCfg.cfg.nodeInfo[pNode->raftCfg.cfg.myIndex].nodeRole == TAOS_SYNC_ROLE_LEARNER &&
index > 0 && index > pBuf->totalIndex){
pBuf->totalIndex = index;
sTrace("vgId:%d, update learner progress. index:%" PRId64 ", term:%" PRId64 ": prevterm:%" PRId64
" != lastmatch:%" PRId64 ". log buffer: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")",
pNode->vgId, pEntry->index, pEntry->term, prevTerm, lastMatchTerm, pBuf->startIndex, pBuf->commitIndex,
pBuf->matchIndex, pBuf->endIndex);
}
if (index - pBuf->startIndex >= pBuf->size) {
sWarn("vgId:%d, out of buffer range. index:%" PRId64 ", term:%" PRId64 ". log buffer: [%" PRId64 " %" PRId64
" %" PRId64 ", %" PRId64 ")",
......@@ -491,7 +502,7 @@ _out:
}
int32_t syncFsmExecute(SSyncNode* pNode, SSyncFSM* pFsm, ESyncState role, SyncTerm term, SSyncRaftEntry* pEntry,
int32_t applyCode) {
int32_t applyCode) {
if (pNode->replicaNum == 1 && pNode->restoreFinish && pNode->vgId != 1) {
return 0;
}
......@@ -965,7 +976,7 @@ void syncLogReplDestroy(SSyncLogReplMgr* pMgr) {
}
int32_t syncNodeLogReplInit(SSyncNode* pNode) {
for (int i = 0; i < TSDB_MAX_REPLICA; i++) {
for (int i = 0; i < TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA; i++) {
ASSERT(pNode->logReplMgrs[i] == NULL);
pNode->logReplMgrs[i] = syncLogReplCreate();
if (pNode->logReplMgrs[i] == NULL) {
......@@ -978,7 +989,7 @@ int32_t syncNodeLogReplInit(SSyncNode* pNode) {
}
void syncNodeLogReplDestroy(SSyncNode* pNode) {
for (int i = 0; i < TSDB_MAX_REPLICA; i++) {
for (int i = 0; i < TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA; i++) {
syncLogReplDestroy(pNode->logReplMgrs[i]);
pNode->logReplMgrs[i] = NULL;
}
......@@ -1108,7 +1119,7 @@ int32_t syncLogBufferReset(SSyncLogBuffer* pBuf, SSyncNode* pNode) {
pBuf->endIndex = pBuf->matchIndex + 1;
// reset repl mgr
for (int i = 0; i < pNode->replicaNum; i++) {
for (int i = 0; i < pNode->totalReplicaNum; i++) {
SSyncLogReplMgr* pMgr = pNode->logReplMgrs[i];
syncLogReplReset(pMgr);
}
......
......@@ -18,21 +18,45 @@
#include "syncUtil.h"
#include "tjson.h"
const char* syncRoleToStr(ESyncRole role) {
switch (role) {
case TAOS_SYNC_ROLE_VOTER:
return "voter";
case TAOS_SYNC_ROLE_LEARNER:
return "learner";
default:
return "unknown";
}
}
const ESyncRole syncStrToRole(char* str) {
if(strcmp(str, "voter") == 0){
return TAOS_SYNC_ROLE_VOTER;
}
if(strcmp(str, "learner") == 0){
return TAOS_SYNC_ROLE_LEARNER;
}
return TAOS_SYNC_ROLE_ERROR;
}
static int32_t syncEncodeSyncCfg(const void *pObj, SJson *pJson) {
SSyncCfg *pCfg = (SSyncCfg *)pObj;
if (tjsonAddDoubleToObject(pJson, "totalReplicaNum", pCfg->totalReplicaNum) < 0) return -1;
if (tjsonAddDoubleToObject(pJson, "replicaNum", pCfg->replicaNum) < 0) return -1;
if (tjsonAddDoubleToObject(pJson, "myIndex", pCfg->myIndex) < 0) return -1;
SJson *nodeInfo = tjsonCreateArray();
if (nodeInfo == NULL) return -1;
if (tjsonAddItemToObject(pJson, "nodeInfo", nodeInfo) < 0) return -1;
for (int32_t i = 0; i < pCfg->replicaNum; ++i) {
for (int32_t i = 0; i < pCfg->totalReplicaNum; ++i) {
SJson *info = tjsonCreateObject();
if (info == NULL) return -1;
if (tjsonAddDoubleToObject(info, "nodePort", pCfg->nodeInfo[i].nodePort) < 0) return -1;
if (tjsonAddStringToObject(info, "nodeFqdn", pCfg->nodeInfo[i].nodeFqdn) < 0) return -1;
if (tjsonAddIntegerToObject(info, "nodeId", pCfg->nodeInfo[i].nodeId) < 0) return -1;
if (tjsonAddIntegerToObject(info, "clusterId", pCfg->nodeInfo[i].clusterId) < 0) return -1;
if (tjsonAddStringToObject(info, "nodeRole", syncRoleToStr(pCfg->nodeInfo[i].nodeRole)) < 0) return -1;
if (tjsonAddItemToArray(nodeInfo, info) < 0) return -1;
}
......@@ -90,7 +114,8 @@ int32_t syncWriteCfgFile(SSyncNode *pNode) {
if (taosRenameFile(file, realfile) != 0) goto _OVER;
code = 0;
sInfo("vgId:%d, succeed to write sync cfg file:%s, len:%d", pNode->vgId, realfile, len);
sInfo("vgId:%d, succeed to write sync cfg file:%s, len:%d, lastConfigIndex:%" PRId64, pNode->vgId,
realfile, len, pNode->raftCfg.lastConfigIndex);
_OVER:
if (pJson != NULL) tjsonDelete(pJson);
......@@ -108,6 +133,7 @@ static int32_t syncDecodeSyncCfg(const SJson *pJson, void *pObj) {
SSyncCfg *pCfg = (SSyncCfg *)pObj;
int32_t code = 0;
tjsonGetInt32ValueFromDouble(pJson, "totalReplicaNum", pCfg->totalReplicaNum, code);
tjsonGetInt32ValueFromDouble(pJson, "replicaNum", pCfg->replicaNum, code);
if (code < 0) return -1;
tjsonGetInt32ValueFromDouble(pJson, "myIndex", pCfg->myIndex, code);
......@@ -115,9 +141,9 @@ static int32_t syncDecodeSyncCfg(const SJson *pJson, void *pObj) {
SJson *nodeInfo = tjsonGetObjectItem(pJson, "nodeInfo");
if (nodeInfo == NULL) return -1;
pCfg->replicaNum = tjsonGetArraySize(nodeInfo);
pCfg->totalReplicaNum = tjsonGetArraySize(nodeInfo);
for (int32_t i = 0; i < pCfg->replicaNum; ++i) {
for (int32_t i = 0; i < pCfg->totalReplicaNum; ++i) {
SJson *info = tjsonGetArrayItem(nodeInfo, i);
if (info == NULL) return -1;
tjsonGetUInt16ValueFromDouble(info, "nodePort", pCfg->nodeInfo[i].nodePort, code);
......@@ -126,6 +152,15 @@ static int32_t syncDecodeSyncCfg(const SJson *pJson, void *pObj) {
if (code < 0) return -1;
tjsonGetNumberValue(info, "nodeId", pCfg->nodeInfo[i].nodeId, code);
tjsonGetNumberValue(info, "clusterId", pCfg->nodeInfo[i].clusterId, code);
char role[10] = {0};
code = tjsonGetStringValue(info, "nodeRole", role);
if(code < 0) return -1;
if(strlen(role) != 0){
pCfg->nodeInfo[i].nodeRole = syncStrToRole(role);
}
else{
pCfg->nodeInfo[i].nodeRole = TAOS_SYNC_ROLE_VOTER;
}
}
return 0;
......
......@@ -66,10 +66,10 @@ int32_t syncNodeReplicate(SSyncNode* pNode) {
}
int32_t syncNodeReplicateWithoutLock(SSyncNode* pNode) {
if (pNode->state != TAOS_SYNC_STATE_LEADER || pNode->replicaNum == 1) {
if (pNode->state != TAOS_SYNC_STATE_LEADER || pNode->raftCfg.cfg.totalReplicaNum == 1) {
return -1;
}
for (int32_t i = 0; i < pNode->replicaNum; i++) {
for (int32_t i = 0; i < pNode->totalReplicaNum; i++) {
if (syncUtilSameId(&pNode->replicasId[i], &pNode->myRaftId)) {
continue;
}
......
......@@ -795,13 +795,18 @@ int32_t syncNodeOnSnapshot(SSyncNode *pSyncNode, const SRpcMsg *pRpcMsg) {
return -1;
}
if (pMsg->term > raftStoreGetTerm(pSyncNode)) {
syncNodeStepDown(pSyncNode, pMsg->term);
if(pSyncNode->raftCfg.cfg.nodeInfo[pSyncNode->raftCfg.cfg.myIndex].nodeRole != TAOS_SYNC_ROLE_LEARNER){
if (pMsg->term > raftStoreGetTerm(pSyncNode)) {
syncNodeStepDown(pSyncNode, pMsg->term);
}
}
else{
syncNodeUpdateTermWithoutStepDown(pSyncNode, pMsg->term);
}
// state, term, seq/ack
int32_t code = 0;
if (pSyncNode->state == TAOS_SYNC_STATE_FOLLOWER) {
if (pSyncNode->state == TAOS_SYNC_STATE_FOLLOWER || pSyncNode->state == TAOS_SYNC_STATE_LEARNER) {
if (pMsg->term == raftStoreGetTerm(pSyncNode)) {
if (pMsg->seq == SYNC_SNAPSHOT_SEQ_PREP_SNAPSHOT) {
syncLogRecvSyncSnapshotSend(pSyncNode, pMsg, "process seq pre-snapshot");
......
......@@ -58,7 +58,7 @@ static void syncNodeCleanConfigIndex(SSyncNode* ths) {
static int32_t syncNodeTimerRoutine(SSyncNode* ths) {
ths->tmrRoutineNum++;
if (ths->tmrRoutineNum % 60 == 0 && ths->replicaNum > 1) {
if (ths->tmrRoutineNum % 60 == 0 && ths->totalReplicaNum > 1) {
sNInfo(ths, "timer routines");
} else {
sNTrace(ths, "timer routines");
......
......@@ -30,7 +30,7 @@ SVotesGranted *voteGrantedCreate(SSyncNode *pNode) {
return NULL;
}
pVotesGranted->replicas = &pNode->replicasId;
pVotesGranted->replicas = (void*)&pNode->replicasId;
pVotesGranted->replicaNum = pNode->replicaNum;
voteGrantedClearVotes(pVotesGranted);
......@@ -49,7 +49,7 @@ void voteGrantedDestroy(SVotesGranted *pVotesGranted) {
}
void voteGrantedUpdate(SVotesGranted *pVotesGranted, SSyncNode *pNode) {
pVotesGranted->replicas = &pNode->replicasId;
pVotesGranted->replicas = (void*)&pNode->replicasId;
pVotesGranted->replicaNum = pNode->replicaNum;
voteGrantedClearVotes(pVotesGranted);
......@@ -115,7 +115,7 @@ SVotesRespond *votesRespondCreate(SSyncNode *pNode) {
return NULL;
}
pVotesRespond->replicas = &pNode->replicasId;
pVotesRespond->replicas = (void*)&pNode->replicasId;
pVotesRespond->replicaNum = pNode->replicaNum;
pVotesRespond->term = 0;
pVotesRespond->pNode = pNode;
......@@ -130,7 +130,7 @@ void votesRespondDestory(SVotesRespond *pVotesRespond) {
}
void votesRespondUpdate(SVotesRespond *pVotesRespond, SSyncNode *pNode) {
pVotesRespond->replicas = &pNode->replicasId;
pVotesRespond->replicas = (void*)&pNode->replicasId;
pVotesRespond->replicaNum = pNode->replicaNum;
pVotesRespond->term = 0;
pVotesRespond->pNode = pNode;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册