diff --git a/include/common/tmsg.h b/include/common/tmsg.h index bb2450e8f7cc1882ef13ebabd9e0a347b5f4607c..b3b665d3233ea221b2bf9b74458a3b187de84373 100644 --- a/include/common/tmsg.h +++ b/include/common/tmsg.h @@ -1286,6 +1286,9 @@ typedef struct { int16_t hashSuffix; int32_t tsdbPageSize; int64_t reserved[8]; + int8_t learnerReplica; + int8_t learnerSelfIndex; + SReplica learnerReplicas[TSDB_MAX_LEARNER_REPLICA]; } SCreateVnodeReq; int32_t tSerializeSCreateVnodeReq(void* buf, int32_t bufLen, SCreateVnodeReq* pReq); @@ -1357,7 +1360,10 @@ typedef struct { int8_t replica; SReplica replicas[TSDB_MAX_REPLICA]; int64_t reserved[8]; -} SAlterVnodeReplicaReq; + int8_t learnerSelfIndex; + int8_t learnerReplica; + SReplica learnerReplicas[TSDB_MAX_LEARNER_REPLICA]; +} SAlterVnodeReplicaReq, SAlterVnodeTypeReq; int32_t tSerializeSAlterVnodeReplicaReq(void* buf, int32_t bufLen, SAlterVnodeReplicaReq* pReq); int32_t tDeserializeSAlterVnodeReplicaReq(void* buf, int32_t bufLen, SAlterVnodeReplicaReq* pReq); @@ -1629,7 +1635,10 @@ int32_t tDeserializeSCreateDropMQSNodeReq(void* buf, int32_t bufLen, SMCreateQno typedef struct { int8_t replica; SReplica replicas[TSDB_MAX_REPLICA]; -} SDCreateMnodeReq, SDAlterMnodeReq; + int8_t learnerReplica; + SReplica learnerReplicas[TSDB_MAX_LEARNER_REPLICA]; + int64_t lastIndex; +} SDCreateMnodeReq, SDAlterMnodeReq, SDAlterMnodeTypeReq; int32_t tSerializeSDCreateMnodeReq(void* buf, int32_t bufLen, SDCreateMnodeReq* pReq); int32_t tDeserializeSDCreateMnodeReq(void* buf, int32_t bufLen, SDCreateMnodeReq* pReq); diff --git a/include/common/tmsgdef.h b/include/common/tmsgdef.h index 6cf6140815cf1663392950b0ee119b0ae65db126..9dca9ea3221af75144788d8f79decae976acb725 100644 --- a/include/common/tmsgdef.h +++ b/include/common/tmsgdef.h @@ -83,6 +83,8 @@ enum { TD_DEF_MSG_TYPE(TDMT_DND_CONFIG_DNODE, "config-dnode", NULL, NULL) TD_DEF_MSG_TYPE(TDMT_DND_SYSTABLE_RETRIEVE, "dnode-retrieve", NULL, NULL) TD_DEF_MSG_TYPE(TDMT_DND_MAX_MSG, "dnd-max", NULL, NULL) + TD_DEF_MSG_TYPE(TDMT_DND_ALTER_MNODE_TYPE, "dnode-alter-mnode-type", NULL, NULL) + TD_DEF_MSG_TYPE(TDMT_DND_ALTER_VNODE_TYPE, "dnode-alter-vnode-type", NULL, NULL) TD_NEW_MSG_SEG(TDMT_MND_MSG) TD_DEF_MSG_TYPE(TDMT_MND_CONNECT, "connect", NULL, NULL) diff --git a/include/dnode/mnode/mnode.h b/include/dnode/mnode/mnode.h index cdb1642a5c313767fe6a6d5ad53d6202f8c677a6..07a0ca952a1dbebbe43af966c183d8ea0981fc1c 100644 --- a/include/dnode/mnode/mnode.h +++ b/include/dnode/mnode/mnode.h @@ -33,8 +33,11 @@ typedef struct { bool deploy; int8_t selfIndex; int8_t numOfReplicas; - SReplica replicas[TSDB_MAX_REPLICA]; + int8_t numOfTotalReplicas; + SReplica replicas[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA]; + int32_t nodeRoles[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA]; SMsgCb msgCb; + int64_t lastIndex; } SMnodeOpt; /* ------------------------ SMnode ------------------------ */ @@ -69,6 +72,8 @@ int32_t mndStart(SMnode *pMnode); */ void mndStop(SMnode *pMnode); +int32_t mndIsCatchUp(SMnode *pMnode); + /** * @brief Get mnode monitor info. * diff --git a/include/libs/sync/sync.h b/include/libs/sync/sync.h index fdb4506cf3c00d04b6f5981601eb6915d3cd381b..33cef538d2a7c02e19eac2143965a83e32bcdf27 100644 --- a/include/libs/sync/sync.h +++ b/include/libs/sync/sync.h @@ -55,6 +55,8 @@ extern "C" { #define SYNC_INDEX_INVALID -1 #define SYNC_TERM_INVALID -1 +#define SYNC_LEARNER_CATCHUP 10 + typedef enum { SYNC_STRATEGY_NO_SNAPSHOT = 0, SYNC_STRATEGY_STANDARD_SNAPSHOT = 1, @@ -76,19 +78,29 @@ typedef enum { TAOS_SYNC_STATE_CANDIDATE = 101, TAOS_SYNC_STATE_LEADER = 102, TAOS_SYNC_STATE_ERROR = 103, + TAOS_SYNC_STATE_LEARNER = 104, } ESyncState; +typedef enum { + TAOS_SYNC_ROLE_VOTER = 0, + TAOS_SYNC_ROLE_LEARNER = 1, + TAOS_SYNC_ROLE_ERROR = 2, +} ESyncRole; + typedef struct SNodeInfo { - int64_t clusterId; - int32_t nodeId; - uint16_t nodePort; - char nodeFqdn[TSDB_FQDN_LEN]; + int64_t clusterId; + int32_t nodeId; + uint16_t nodePort; + char nodeFqdn[TSDB_FQDN_LEN]; + ESyncRole nodeRole; } SNodeInfo; typedef struct SSyncCfg { + int32_t totalReplicaNum; int32_t replicaNum; int32_t myIndex; - SNodeInfo nodeInfo[TSDB_MAX_REPLICA]; + SNodeInfo nodeInfo[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA]; + SyncIndex lastIndex; } SSyncCfg; typedef struct SFsmCbMeta { @@ -155,6 +167,7 @@ typedef struct SSyncFSM { void (*FpBecomeLeaderCb)(const struct SSyncFSM* pFsm); void (*FpBecomeFollowerCb)(const struct SSyncFSM* pFsm); + void (*FpBecomeLearnerCb)(const struct SSyncFSM* pFsm); int32_t (*FpGetSnapshot)(const struct SSyncFSM* pFsm, SSnapshot* pSnapshot, void* pReaderParam, void** ppReader); void (*FpGetSnapshotInfo)(const struct SSyncFSM* pFsm, SSnapshot* pSnapshot); @@ -236,6 +249,7 @@ void syncStop(int64_t rid); void syncPreStop(int64_t rid); void syncPostStop(int64_t rid); int32_t syncPropose(int64_t rid, SRpcMsg* pMsg, bool isWeak, int64_t* seq); +int32_t syncIsCatchUp(int64_t rid); int32_t syncProcessMsg(int64_t rid, SRpcMsg* pMsg); int32_t syncReconfig(int64_t rid, SSyncCfg* pCfg); int32_t syncBeginSnapshot(int64_t rid, int64_t lastApplyIndex); diff --git a/include/util/tdef.h b/include/util/tdef.h index 2f86395dad7ef49b342adfaaec7a868565889ddd..1e05903407276336d53e8915a3656bdb69f94faa 100644 --- a/include/util/tdef.h +++ b/include/util/tdef.h @@ -285,6 +285,7 @@ typedef enum ELogicConditionType { #define TSDB_DNODE_ROLE_VNODE 2 #define TSDB_MAX_REPLICA 5 +#define TSDB_MAX_LEARNER_REPLICA 10 #define TSDB_SYNC_LOG_BUFFER_SIZE 4096 #define TSDB_SYNC_LOG_BUFFER_RETENTION (TSDB_SYNC_LOG_BUFFER_SIZE >> 4) diff --git a/source/common/src/tmsg.c b/source/common/src/tmsg.c index d9802244b708cb544264765132a7f1acc065c5aa..c785dff9f891af1bb9ab9d4a969996def73e2295 100644 --- a/source/common/src/tmsg.c +++ b/source/common/src/tmsg.c @@ -4129,6 +4129,12 @@ int32_t tSerializeSCreateVnodeReq(void *buf, int32_t bufLen, SCreateVnodeReq *pR for (int32_t i = 0; i < 8; ++i) { if (tEncodeI64(&encoder, pReq->reserved[i]) < 0) return -1; } + if (tEncodeI8(&encoder, pReq->learnerReplica) < 0) return -1; + if (tEncodeI8(&encoder, pReq->learnerSelfIndex) < 0) return -1; + for (int32_t i = 0; i < TSDB_MAX_LEARNER_REPLICA; ++i) { + SReplica *pReplica = &pReq->learnerReplicas[i]; + if (tEncodeSReplica(&encoder, pReplica) < 0) return -1; + } tEndEncode(&encoder); @@ -4207,6 +4213,12 @@ int32_t tDeserializeSCreateVnodeReq(void *buf, int32_t bufLen, SCreateVnodeReq * for (int32_t i = 0; i < 8; ++i) { if (tDecodeI64(&decoder, &pReq->reserved[i]) < 0) return -1; } + if (tDecodeI8(&decoder, &pReq->learnerReplica) < 0) return -1; + if (tDecodeI8(&decoder, &pReq->learnerSelfIndex) < 0) return -1; + for (int32_t i = 0; i < TSDB_MAX_LEARNER_REPLICA; ++i) { + SReplica *pReplica = &pReq->learnerReplicas[i]; + if (tDecodeSReplica(&decoder, pReplica) < 0) return -1; + } tEndDecode(&decoder); tDecoderClear(&decoder); @@ -4433,6 +4445,12 @@ int32_t tSerializeSAlterVnodeReplicaReq(void *buf, int32_t bufLen, SAlterVnodeRe for (int32_t i = 0; i < 8; ++i) { if (tEncodeI64(&encoder, pReq->reserved[i]) < 0) return -1; } + if (tEncodeI8(&encoder, pReq->learnerSelfIndex) < 0) return -1; + if (tEncodeI8(&encoder, pReq->learnerReplica) < 0) return -1; + for (int32_t i = 0; i < TSDB_MAX_LEARNER_REPLICA; ++i) { + SReplica *pReplica = &pReq->learnerReplicas[i]; + if (tEncodeSReplica(&encoder, pReplica) < 0) return -1; + } tEndEncode(&encoder); int32_t tlen = encoder.pos; @@ -4456,6 +4474,12 @@ int32_t tDeserializeSAlterVnodeReplicaReq(void *buf, int32_t bufLen, SAlterVnode for (int32_t i = 0; i < 8; ++i) { if (tDecodeI64(&decoder, &pReq->reserved[i]) < 0) return -1; } + if (tDecodeI8(&decoder, &pReq->learnerSelfIndex) < 0) return -1; + if (tDecodeI8(&decoder, &pReq->learnerReplica) < 0) return -1; + for (int32_t i = 0; i < TSDB_MAX_LEARNER_REPLICA; ++i) { + SReplica *pReplica = &pReq->learnerReplicas[i]; + if (tDecodeSReplica(&decoder, pReplica) < 0) return -1; + } tEndDecode(&decoder); tDecoderClear(&decoder); @@ -4767,6 +4791,12 @@ int32_t tSerializeSDCreateMnodeReq(void *buf, int32_t bufLen, SDCreateMnodeReq * SReplica *pReplica = &pReq->replicas[i]; if (tEncodeSReplica(&encoder, pReplica) < 0) return -1; } + if (tEncodeI8(&encoder, pReq->learnerReplica) < 0) return -1; + for (int32_t i = 0; i < TSDB_MAX_LEARNER_REPLICA; ++i) { + SReplica *pReplica = &pReq->learnerReplicas[i]; + if (tEncodeSReplica(&encoder, pReplica) < 0) return -1; + } + if (tEncodeI64(&encoder, pReq->lastIndex) < 0) return -1; tEndEncode(&encoder); int32_t tlen = encoder.pos; @@ -4784,6 +4814,12 @@ int32_t tDeserializeSDCreateMnodeReq(void *buf, int32_t bufLen, SDCreateMnodeReq SReplica *pReplica = &pReq->replicas[i]; if (tDecodeSReplica(&decoder, pReplica) < 0) return -1; } + if (tDecodeI8(&decoder, &pReq->learnerReplica) < 0) return -1; + for (int32_t i = 0; i < TSDB_MAX_LEARNER_REPLICA; ++i) { + SReplica *pReplica = &pReq->learnerReplicas[i]; + if (tDecodeSReplica(&decoder, pReplica) < 0) return -1; + } + if (tDecodeI64(&decoder, &pReq->lastIndex) < 0) return -1; tEndDecode(&decoder); tDecoderClear(&decoder); diff --git a/source/dnode/mgmt/mgmt_dnode/inc/dmInt.h b/source/dnode/mgmt/mgmt_dnode/inc/dmInt.h index 600a1f6829e11a1d38ffa262e7d360687c67b90d..6c40fa808fcaf008b3d4614594eb7bab262525f6 100644 --- a/source/dnode/mgmt/mgmt_dnode/inc/dmInt.h +++ b/source/dnode/mgmt/mgmt_dnode/inc/dmInt.h @@ -32,6 +32,7 @@ typedef struct SDnodeMgmt { TdThread crashReportThread; SSingleWorker mgmtWorker; ProcessCreateNodeFp processCreateNodeFp; + ProcessAlterNodeTypeFp processAlterNodeTypeFp; ProcessDropNodeFp processDropNodeFp; SendMonitorReportFp sendMonitorReportFp; GetVnodeLoadsFp getVnodeLoadsFp; diff --git a/source/dnode/mgmt/mgmt_dnode/src/dmHandle.c b/source/dnode/mgmt/mgmt_dnode/src/dmHandle.c index 228f301aec7712fce516ddad2d22f4b2823a36b0..19982698968c8c91bc3f53edd3868f173b070005 100644 --- a/source/dnode/mgmt/mgmt_dnode/src/dmHandle.c +++ b/source/dnode/mgmt/mgmt_dnode/src/dmHandle.c @@ -352,6 +352,7 @@ SArray *dmGetMsgHandles() { if (dmSetMgmtHandle(pArray, TDMT_DND_CONFIG_DNODE, dmPutNodeMsgToMgmtQueue, 0) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_DND_SERVER_STATUS, dmPutNodeMsgToMgmtQueue, 0) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_DND_SYSTABLE_RETRIEVE, dmPutNodeMsgToMgmtQueue, 0) == NULL) goto _OVER; + if (dmSetMgmtHandle(pArray, TDMT_DND_ALTER_MNODE_TYPE, dmPutNodeMsgToMgmtQueue, 0) == NULL) goto _OVER; // Requests handled by MNODE if (dmSetMgmtHandle(pArray, TDMT_MND_GRANT, dmPutNodeMsgToMgmtQueue, 0) == NULL) goto _OVER; diff --git a/source/dnode/mgmt/mgmt_dnode/src/dmInt.c b/source/dnode/mgmt/mgmt_dnode/src/dmInt.c index 51df293ba70bc37be14c763ecb4a5ea296077bd4..09783a5ea9495506392ec68f9ec5395fa4281d6e 100644 --- a/source/dnode/mgmt/mgmt_dnode/src/dmInt.c +++ b/source/dnode/mgmt/mgmt_dnode/src/dmInt.c @@ -48,6 +48,7 @@ static int32_t dmOpenMgmt(SMgmtInputOpt *pInput, SMgmtOutputOpt *pOutput) { pMgmt->path = pInput->path; pMgmt->name = pInput->name; pMgmt->processCreateNodeFp = pInput->processCreateNodeFp; + pMgmt->processAlterNodeTypeFp = pInput->processAlterNodeTypeFp; pMgmt->processDropNodeFp = pInput->processDropNodeFp; pMgmt->sendMonitorReportFp = pInput->sendMonitorReportFp; pMgmt->getVnodeLoadsFp = pInput->getVnodeLoadsFp; diff --git a/source/dnode/mgmt/mgmt_dnode/src/dmWorker.c b/source/dnode/mgmt/mgmt_dnode/src/dmWorker.c index 0d3b423771267cc567a4cfc95f2b550c508342d3..06b6221940a4f0991fe520a58e2918409139380b 100644 --- a/source/dnode/mgmt/mgmt_dnode/src/dmWorker.c +++ b/source/dnode/mgmt/mgmt_dnode/src/dmWorker.c @@ -227,6 +227,9 @@ static void dmProcessMgmtQueue(SQueueInfo *pInfo, SRpcMsg *pMsg) { case TDMT_DND_DROP_SNODE: code = (*pMgmt->processDropNodeFp)(SNODE, pMsg); break; + case TDMT_DND_ALTER_MNODE_TYPE: + code = (*pMgmt->processAlterNodeTypeFp)(MNODE, pMsg); + break; case TDMT_DND_SERVER_STATUS: code = dmProcessServerRunStatus(pMgmt, pMsg); break; diff --git a/source/dnode/mgmt/mgmt_mnode/src/mmFile.c b/source/dnode/mgmt/mgmt_mnode/src/mmFile.c index f06669a610142a38b6b2937c95bd150c324df568..cb0849f4b96f945a41a2e27a0abcba2244484bba 100644 --- a/source/dnode/mgmt/mgmt_mnode/src/mmFile.c +++ b/source/dnode/mgmt/mgmt_mnode/src/mmFile.c @@ -24,12 +24,16 @@ static int32_t mmDecodeOption(SJson *pJson, SMnodeOpt *pOption) { if (code < 0) return -1; tjsonGetInt32ValueFromDouble(pJson, "selfIndex", pOption->selfIndex, code); if (code < 0) return 0; + tjsonGetInt32ValueFromDouble(pJson, "lastIndex", pOption->lastIndex, code); + if (code < 0) return 0; SJson *replicas = tjsonGetObjectItem(pJson, "replicas"); if (replicas == NULL) return 0; - pOption->numOfReplicas = tjsonGetArraySize(replicas); + pOption->numOfTotalReplicas = tjsonGetArraySize(replicas); + + pOption->numOfReplicas = 0; - for (int32_t i = 0; i < pOption->numOfReplicas; ++i) { + for (int32_t i = 0; i < pOption->numOfTotalReplicas; ++i) { SJson *replica = tjsonGetArrayItem(replicas, i); if (replica == NULL) return -1; @@ -40,6 +44,14 @@ static int32_t mmDecodeOption(SJson *pJson, SMnodeOpt *pOption) { if (code < 0) return -1; tjsonGetUInt16ValueFromDouble(replica, "port", pReplica->port, code); if (code < 0) return -1; + tjsonGetInt32ValueFromDouble(replica, "role", pOption->nodeRoles[i], code); + if (code < 0) return -1; + if(pOption->nodeRoles[i] == TAOS_SYNC_ROLE_VOTER){ + pOption->numOfReplicas++; + } + } + + for (int32_t i = 0; i < pOption->numOfTotalReplicas; ++i) { } return 0; @@ -112,14 +124,14 @@ _OVER: } static int32_t mmEncodeOption(SJson *pJson, const SMnodeOpt *pOption) { - if (pOption->deploy && pOption->numOfReplicas > 0) { + if (pOption->deploy && pOption->numOfTotalReplicas > 0) { if (tjsonAddDoubleToObject(pJson, "selfIndex", pOption->selfIndex) < 0) return -1; SJson *replicas = tjsonCreateArray(); if (replicas == NULL) return -1; if (tjsonAddItemToObject(pJson, "replicas", replicas) < 0) return -1; - for (int32_t i = 0; i < pOption->numOfReplicas; ++i) { + for (int32_t i = 0; i < pOption->numOfTotalReplicas; ++i) { SJson *replica = tjsonCreateObject(); if (replica == NULL) return -1; @@ -127,10 +139,13 @@ static int32_t mmEncodeOption(SJson *pJson, const SMnodeOpt *pOption) { if (tjsonAddDoubleToObject(replica, "id", pReplica->id) < 0) return -1; if (tjsonAddStringToObject(replica, "fqdn", pReplica->fqdn) < 0) return -1; if (tjsonAddDoubleToObject(replica, "port", pReplica->port) < 0) return -1; + if (tjsonAddDoubleToObject(replica, "role", pOption->nodeRoles[i]) < 0) return -1; if (tjsonAddItemToArray(replicas, replica) < 0) return -1; } } + if (tjsonAddDoubleToObject(pJson, "lastIndex", pOption->lastIndex) < 0) return -1; + if (tjsonAddDoubleToObject(pJson, "deployed", pOption->deploy) < 0) return -1; return 0; diff --git a/source/dnode/mgmt/mgmt_mnode/src/mmHandle.c b/source/dnode/mgmt/mgmt_mnode/src/mmHandle.c index 76977dd4a8178792f3867b3a615ef552c5449c1d..76fe0ec2decb3f17f9c5c9c82bcafd1b47af8f49 100644 --- a/source/dnode/mgmt/mgmt_mnode/src/mmHandle.c +++ b/source/dnode/mgmt/mgmt_mnode/src/mmHandle.c @@ -33,12 +33,24 @@ int32_t mmProcessCreateReq(const SMgmtInputOpt *pInput, SRpcMsg *pMsg) { return -1; } - SMnodeOpt option = {.deploy = true, .numOfReplicas = createReq.replica, .selfIndex = -1}; + SMnodeOpt option = {.deploy = true, .numOfReplicas = createReq.replica, + .numOfTotalReplicas = createReq.replica + createReq.learnerReplica, + .selfIndex = -1, .lastIndex = createReq.lastIndex}; + memcpy(option.replicas, createReq.replicas, sizeof(createReq.replicas)); - for (int32_t i = 0; i < option.numOfReplicas; ++i) { + for (int32_t i = 0; i < createReq.replica; ++i) { if (createReq.replicas[i].id == pInput->pData->dnodeId) { option.selfIndex = i; } + option.nodeRoles[i] = TAOS_SYNC_ROLE_VOTER; + } + + memcpy(&(option.replicas[createReq.replica]), createReq.learnerReplicas, sizeof(createReq.learnerReplicas)); + for (int32_t i = 0; i < createReq.learnerReplica; ++i) { + if (createReq.learnerReplicas[i].id == pInput->pData->dnodeId) { + option.selfIndex = createReq.replica + i; + } + option.nodeRoles[createReq.replica + i] = TAOS_SYNC_ROLE_LEARNER; } if (option.selfIndex == -1) { @@ -92,6 +104,8 @@ SArray *mmGetMsgHandles() { if (dmSetMgmtHandle(pArray, TDMT_DND_CREATE_VNODE_RSP, mmPutMsgToWriteQueue, 0) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_DND_DROP_VNODE_RSP, mmPutMsgToWriteQueue, 0) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_DND_CONFIG_DNODE_RSP, mmPutMsgToReadQueue, 0) == NULL) goto _OVER; + if (dmSetMgmtHandle(pArray, TDMT_DND_ALTER_MNODE_TYPE_RSP, mmPutMsgToReadQueue, 0) == NULL) goto _OVER; + if (dmSetMgmtHandle(pArray, TDMT_DND_ALTER_VNODE_TYPE_RSP, mmPutMsgToReadQueue, 0) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_MND_CONNECT, mmPutMsgToReadQueue, 0) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_MND_CREATE_ACCT, mmPutMsgToWriteQueue, 0) == NULL) goto _OVER; diff --git a/source/dnode/mgmt/mgmt_mnode/src/mmInt.c b/source/dnode/mgmt/mgmt_mnode/src/mmInt.c index e7f71ad4200bd2b1c3c5fcf10e82aa9aa0ea417f..05b59b98652ec2a33335ebfc3d3be351c7075672 100644 --- a/source/dnode/mgmt/mgmt_mnode/src/mmInt.c +++ b/source/dnode/mgmt/mgmt_mnode/src/mmInt.c @@ -45,9 +45,11 @@ static void mmBuildOptionForDeploy(SMnodeMgmt *pMgmt, const SMgmtInputOpt *pInpu pOption->dnodeId = pMgmt->pData->dnodeId; pOption->selfIndex = 0; pOption->numOfReplicas = 1; + pOption->numOfTotalReplicas = 1; pOption->replicas[0].id = 1; pOption->replicas[0].port = tsServerPort; tstrncpy(pOption->replicas[0].fqdn, tsLocalFqdn, TSDB_FQDN_LEN); + pOption->lastIndex = SYNC_INDEX_INVALID; } static void mmBuildOptionForOpen(SMnodeMgmt *pMgmt, SMnodeOpt *pOption) { @@ -123,9 +125,10 @@ static int32_t mmOpen(SMgmtInputOpt *pInput, SMgmtOutputOpt *pOutput) { } tmsgReportStartup("mnode-worker", "initialized"); - if (option.numOfReplicas > 0) { + if (option.numOfTotalReplicas > 0) { option.deploy = true; option.numOfReplicas = 0; + option.numOfTotalReplicas = 0; if (mmWriteFile(pMgmt->path, &option) != 0) { dError("failed to write mnode file since %s", terrstr()); return -1; @@ -152,6 +155,10 @@ static void mmStop(SMnodeMgmt *pMgmt) { mndStop(pMgmt->pMnode); } +static int32_t mmSyncIsCatchUp(SMnodeMgmt *pMgmt) { + return mndIsCatchUp(pMgmt->pMnode); +} + SMgmtFunc mmGetMgmtFunc() { SMgmtFunc mgmtFunc = {0}; mgmtFunc.openFp = mmOpen; @@ -162,6 +169,7 @@ SMgmtFunc mmGetMgmtFunc() { mgmtFunc.dropFp = (NodeDropFp)mmProcessDropReq; mgmtFunc.requiredFp = mmRequire; mgmtFunc.getHandlesFp = mmGetMsgHandles; + mgmtFunc.isCatchUpFp = (NodeIsCatchUpFp)mmSyncIsCatchUp; return mgmtFunc; } diff --git a/source/dnode/mgmt/mgmt_vnode/inc/vmInt.h b/source/dnode/mgmt/mgmt_vnode/inc/vmInt.h index 4374ae363cce8b4138ebb224ddf3528eae2b0784..83fb331dbd5fc368f39c8ab866426200c99ab6bc 100644 --- a/source/dnode/mgmt/mgmt_vnode/inc/vmInt.h +++ b/source/dnode/mgmt/mgmt_vnode/inc/vmInt.h @@ -90,6 +90,7 @@ int32_t vmProcessDropVnodeReq(SVnodeMgmt *pMgmt, SRpcMsg *pMsg); int32_t vmProcessAlterVnodeReplicaReq(SVnodeMgmt *pMgmt, SRpcMsg *pMsg); int32_t vmProcessDisableVnodeWriteReq(SVnodeMgmt *pMgmt, SRpcMsg *pMsg); int32_t vmProcessAlterHashRangeReq(SVnodeMgmt *pMgmt, SRpcMsg *pMsg); +int32_t vmProcessAlterVnodeTypeReq(SVnodeMgmt *pMgmt, SRpcMsg *pMsg); // vmFile.c int32_t vmGetVnodeListFromFile(SVnodeMgmt *pMgmt, SWrapperCfg **ppCfgs, int32_t *numOfVnodes); diff --git a/source/dnode/mgmt/mgmt_vnode/src/vmHandle.c b/source/dnode/mgmt/mgmt_vnode/src/vmHandle.c index d61eb3ec0390af04e97c2977131d8381c26b351a..42dfb873d31f0a59518d63ea0306edccbb34d35b 100644 --- a/source/dnode/mgmt/mgmt_vnode/src/vmHandle.c +++ b/source/dnode/mgmt/mgmt_vnode/src/vmHandle.c @@ -131,15 +131,34 @@ static void vmGenerateVnodeCfg(SCreateVnodeReq *pCreate, SVnodeCfg *pCfg) { pCfg->tsdbPageSize = pCreate->tsdbPageSize * 1024; pCfg->standby = 0; - pCfg->syncCfg.myIndex = pCreate->selfIndex; - pCfg->syncCfg.replicaNum = pCreate->replica; + pCfg->syncCfg.replicaNum = 0; + pCfg->syncCfg.totalReplicaNum = 0; + memset(&pCfg->syncCfg.nodeInfo, 0, sizeof(pCfg->syncCfg.nodeInfo)); for (int32_t i = 0; i < pCreate->replica; ++i) { SNodeInfo *pNode = &pCfg->syncCfg.nodeInfo[i]; - pNode->nodeId = pCreate->replicas[i].id; - pNode->nodePort = pCreate->replicas[i].port; - tstrncpy(pNode->nodeFqdn, pCreate->replicas[i].fqdn, TSDB_FQDN_LEN); + pNode->nodeId = pCreate->replicas[pCfg->syncCfg.replicaNum].id; + pNode->nodePort = pCreate->replicas[pCfg->syncCfg.replicaNum].port; + pNode->nodeRole = TAOS_SYNC_ROLE_VOTER; + tstrncpy(pNode->nodeFqdn, pCreate->replicas[pCfg->syncCfg.replicaNum].fqdn, TSDB_FQDN_LEN); + tmsgUpdateDnodeInfo(&pNode->nodeId, &pNode->clusterId, pNode->nodeFqdn, &pNode->nodePort); + pCfg->syncCfg.replicaNum++; + } + if(pCreate->selfIndex != -1){ + pCfg->syncCfg.myIndex = pCreate->selfIndex; + } + for (int32_t i = pCfg->syncCfg.replicaNum; i < pCreate->replica + pCreate->learnerReplica; ++i) { + SNodeInfo *pNode = &pCfg->syncCfg.nodeInfo[i]; + pNode->nodeId = pCreate->learnerReplicas[pCfg->syncCfg.totalReplicaNum].id; + pNode->nodePort = pCreate->learnerReplicas[pCfg->syncCfg.totalReplicaNum].port; + pNode->nodeRole = TAOS_SYNC_ROLE_LEARNER; + tstrncpy(pNode->nodeFqdn, pCreate->learnerReplicas[pCfg->syncCfg.totalReplicaNum].fqdn, TSDB_FQDN_LEN); tmsgUpdateDnodeInfo(&pNode->nodeId, &pNode->clusterId, pNode->nodeFqdn, &pNode->nodePort); + pCfg->syncCfg.totalReplicaNum++; + } + pCfg->syncCfg.totalReplicaNum += pCfg->syncCfg.replicaNum; + if(pCreate->learnerSelfIndex != -1){ + pCfg->syncCfg.myIndex = pCreate->replica + pCreate->learnerSelfIndex; } } @@ -182,23 +201,35 @@ int32_t vmProcessCreateVnodeReq(SVnodeMgmt *pMgmt, SRpcMsg *pMsg) { return -1; } - dInfo("vgId:%d, start to create vnode, page:%d pageSize:%d buffer:%d szPage:%d szBuf:%" PRIu64 + dInfo("vgId:%d, vnode management handle msgType:%s, start to create vnode, page:%d pageSize:%d buffer:%d szPage:%d szBuf:%" PRIu64 ", cacheLast:%d cacheLastSize:%d sstTrigger:%d tsdbPageSize:%d %d dbname:%s dbId:%" PRId64 ", days:%d keep0:%d keep1:%d keep2:%d tsma:%d precision:%d compression:%d minRows:%d maxRows:%d" ", wal fsync:%d level:%d retentionPeriod:%d retentionSize:%" PRId64 " rollPeriod:%d segSize:%" PRId64 - ", hash method:%d begin:%u end:%u prefix:%d surfix:%d replica:%d selfIndex:%d strict:%d", - req.vgId, req.pages, req.pageSize, req.buffer, req.pageSize * 1024, (uint64_t)req.buffer * 1024 * 1024, + ", hash method:%d begin:%u end:%u prefix:%d surfix:%d replica:%d selfIndex:%d " + "learnerReplica:%d learnerSelfIndex:%d strict:%d", + req.vgId, TMSG_INFO(pMsg->msgType), req.pages, req.pageSize, req.buffer, req.pageSize * 1024, + (uint64_t)req.buffer * 1024 * 1024, req.cacheLast, req.cacheLastSize, req.sstTrigger, req.tsdbPageSize, req.tsdbPageSize * 1024, req.db, req.dbUid, req.daysPerFile, req.daysToKeep0, req.daysToKeep1, req.daysToKeep2, req.isTsma, req.precision, req.compression, req.minRows, req.maxRows, req.walFsyncPeriod, req.walLevel, req.walRetentionPeriod, req.walRetentionSize, req.walRollPeriod, req.walSegmentSize, req.hashMethod, req.hashBegin, req.hashEnd, req.hashPrefix, - req.hashSuffix, req.replica, req.selfIndex, req.strict); + req.hashSuffix, req.replica, req.selfIndex, req.learnerReplica, req.learnerSelfIndex, req.strict); for (int32_t i = 0; i < req.replica; ++i) { dInfo("vgId:%d, replica:%d ep:%s:%u dnode:%d", req.vgId, i, req.replicas[i].fqdn, req.replicas[i].port, req.replicas[i].id); } + for (int32_t i = 0; i < req.learnerReplica; ++i) { + dInfo("vgId:%d, learnerReplica:%d ep:%s:%u dnode:%d", req.vgId, i, req.learnerReplicas[i].fqdn, + req.learnerReplicas[i].port, req.replicas[i].id); + } - SReplica *pReplica = &req.replicas[req.selfIndex]; + SReplica *pReplica = NULL; + if(req.selfIndex != -1){ + pReplica = &req.replicas[req.selfIndex]; + } + else{ + pReplica = &req.learnerReplicas[req.learnerSelfIndex]; + } if (pReplica->id != pMgmt->pData->dnodeId || pReplica->port != tsServerPort || strcmp(pReplica->fqdn, tsLocalFqdn) != 0) { terrno = TSDB_CODE_INVALID_MSG; @@ -275,7 +306,8 @@ _OVER: vnodeClose(pImpl); vnodeDestroy(path, pMgmt->pTfs); } else { - dInfo("vgId:%d, vnode is created", req.vgId); + dInfo("vgId:%d, vnode management handle msgType:%s, end to create vnode, vnode is created", + req.vgId, TMSG_INFO(pMsg->msgType)); } tFreeSCreateVnodeReq(&req); @@ -283,6 +315,106 @@ _OVER: return code; } +int32_t vmProcessAlterVnodeTypeReq(SVnodeMgmt *pMgmt, SRpcMsg *pMsg) { + SAlterVnodeTypeReq req = {0}; + if (tDeserializeSAlterVnodeReplicaReq(pMsg->pCont, pMsg->contLen, &req) != 0) { + terrno = TSDB_CODE_INVALID_MSG; + return -1; + } + + dInfo("vgId:%d, vnode management handle msgType:%s, start to process alter-node-type-request", + req.vgId, TMSG_INFO(pMsg->msgType)); + + SVnodeObj *pVnode = vmAcquireVnode(pMgmt, req.vgId); + if (pVnode == NULL) { + dError("vgId:%d, failed to alter hashrange since %s", req.vgId, terrstr()); + terrno = TSDB_CODE_VND_NOT_EXIST; + return -1; + } + + dInfo("vgId:%d, checking node catch up", req.vgId); + if(!vnodeIsCatchUp(pVnode->pImpl) == 0){ + return -1; + } + + dInfo("node:%s, catched up leader, continue to process alter-node-type-request", pMgmt->name); + + int32_t vgId = req.vgId; + dInfo("vgId:%d, start to alter vnode type replica:%d selfIndex:%d strict:%d", vgId, req.replica, req.selfIndex, + req.strict); + for (int32_t i = 0; i < req.replica; ++i) { + SReplica *pReplica = &req.replicas[i]; + dInfo("vgId:%d, replica:%d ep:%s:%u dnode:%d", vgId, i, pReplica->fqdn, pReplica->port, pReplica->id); + } + for (int32_t i = 0; i < req.learnerReplica; ++i) { + SReplica *pReplica = &req.learnerReplicas[i]; + dInfo("vgId:%d, learnerReplicas:%d ep:%s:%u dnode:%d", vgId, i, pReplica->fqdn, pReplica->port, pReplica->id); + } + + if (req.replica <= 0 || + (req.selfIndex < 0 && req.learnerSelfIndex <0)|| + req.selfIndex >= req.replica || req.learnerSelfIndex >= req.learnerReplica) { + terrno = TSDB_CODE_INVALID_MSG; + dError("vgId:%d, failed to alter replica since invalid msg", vgId); + return -1; + } + + SReplica *pReplica = NULL; + if(req.selfIndex > 0){ + pReplica = &req.replicas[req.selfIndex]; + } + else{ + pReplica = &req.learnerReplicas[req.learnerSelfIndex]; + } + + if (pReplica->id != pMgmt->pData->dnodeId || pReplica->port != tsServerPort || + strcmp(pReplica->fqdn, tsLocalFqdn) != 0) { + terrno = TSDB_CODE_INVALID_MSG; + dError("vgId:%d, dnodeId:%d ep:%s:%u not matched with local dnode", vgId, pReplica->id, pReplica->fqdn, + pReplica->port); + return -1; + } + + dInfo("vgId:%d, start to close vnode", vgId); + SWrapperCfg wrapperCfg = { + .dropped = pVnode->dropped, + .vgId = pVnode->vgId, + .vgVersion = pVnode->vgVersion, + }; + tstrncpy(wrapperCfg.path, pVnode->path, sizeof(wrapperCfg.path)); + vmCloseVnode(pMgmt, pVnode, false); + + char path[TSDB_FILENAME_LEN] = {0}; + snprintf(path, TSDB_FILENAME_LEN, "vnode%svnode%d", TD_DIRSEP, vgId); + + dInfo("vgId:%d, start to alter vnode replica at %s", vgId, path); + if (vnodeAlterReplica(path, &req, pMgmt->pTfs) < 0) { + dError("vgId:%d, failed to alter vnode at %s since %s", vgId, path, terrstr()); + return -1; + } + + dInfo("vgId:%d, begin to open vnode", vgId); + SVnode *pImpl = vnodeOpen(path, pMgmt->pTfs, pMgmt->msgCb); + if (pImpl == NULL) { + dError("vgId:%d, failed to open vnode at %s since %s", vgId, path, terrstr()); + return -1; + } + + if (vmOpenVnode(pMgmt, &wrapperCfg, pImpl) != 0) { + dError("vgId:%d, failed to open vnode mgmt since %s", vgId, terrstr()); + return -1; + } + + if (vnodeStart(pImpl) != 0) { + dError("vgId:%d, failed to start sync since %s", vgId, terrstr()); + return -1; + } + + dInfo("vgId:%d, vnode management handle msgType:%s, end to process alter-node-type-request, vnode config is altered", + req.vgId, TMSG_INFO(pMsg->msgType)); + return 0; +} + int32_t vmProcessDisableVnodeWriteReq(SVnodeMgmt *pMgmt, SRpcMsg *pMsg) { SDisableVnodeWriteReq req = {0}; if (tDeserializeSDisableVnodeWriteReq(pMsg->pCont, pMsg->contLen, &req) != 0) { @@ -378,20 +510,35 @@ int32_t vmProcessAlterVnodeReplicaReq(SVnodeMgmt *pMgmt, SRpcMsg *pMsg) { } int32_t vgId = alterReq.vgId; - dInfo("vgId:%d, start to alter vnode replica:%d selfIndex:%d strict:%d", vgId, alterReq.replica, alterReq.selfIndex, - alterReq.strict); + dInfo("vgId:%d,vnode management handle msgType:%s, start to alter vnode replica:%d selfIndex:%d leanerReplica:%d " + "learnerSelfIndex:%d strict:%d", + vgId, TMSG_INFO(pMsg->msgType), alterReq.replica, alterReq.selfIndex, alterReq.learnerReplica, + alterReq.learnerSelfIndex, alterReq.strict); for (int32_t i = 0; i < alterReq.replica; ++i) { SReplica *pReplica = &alterReq.replicas[i]; dInfo("vgId:%d, replica:%d ep:%s:%u dnode:%d", vgId, i, pReplica->fqdn, pReplica->port, pReplica->port); } - - if (alterReq.replica <= 0 || alterReq.selfIndex < 0 || alterReq.selfIndex >= alterReq.replica) { + for (int32_t i = 0; i < alterReq.learnerReplica; ++i) { + SReplica *pReplica = &alterReq.learnerReplicas[i]; + dInfo("vgId:%d, learnerReplicas:%d ep:%s:%u dnode:%d", vgId, i, pReplica->fqdn, pReplica->port, pReplica->port); + } + + if (alterReq.replica <= 0 || + (alterReq.selfIndex < 0 && alterReq.learnerSelfIndex <0)|| + alterReq.selfIndex >= alterReq.replica || alterReq.learnerSelfIndex >= alterReq.learnerReplica) { terrno = TSDB_CODE_INVALID_MSG; dError("vgId:%d, failed to alter replica since invalid msg", vgId); return -1; } - SReplica *pReplica = &alterReq.replicas[alterReq.selfIndex]; + SReplica *pReplica = NULL; + if(alterReq.selfIndex != -1){ + pReplica = &alterReq.replicas[alterReq.selfIndex]; + } + else{ + pReplica = &alterReq.learnerReplicas[alterReq.learnerSelfIndex]; + } + if (pReplica->id != pMgmt->pData->dnodeId || pReplica->port != tsServerPort || strcmp(pReplica->fqdn, tsLocalFqdn) != 0) { terrno = TSDB_CODE_INVALID_MSG; @@ -425,7 +572,7 @@ int32_t vmProcessAlterVnodeReplicaReq(SVnodeMgmt *pMgmt, SRpcMsg *pMsg) { return -1; } - dInfo("vgId:%d, close vnode", vgId); + dInfo("vgId:%d, begin to open vnode", vgId); SVnode *pImpl = vnodeOpen(path, pMgmt->pTfs, pMgmt->msgCb); if (pImpl == NULL) { dError("vgId:%d, failed to open vnode at %s since %s", vgId, path, terrstr()); @@ -442,7 +589,10 @@ int32_t vmProcessAlterVnodeReplicaReq(SVnodeMgmt *pMgmt, SRpcMsg *pMsg) { return -1; } - dInfo("vgId:%d, vnode config is altered", vgId); + dInfo("vgId:%d, vnode management handle msgType:%s, end to alter vnode replica:%d selfIndex:%d leanerReplica:%d " + "learnerSelfIndex:%d strict:%d", + vgId, TMSG_INFO(pMsg->msgType), alterReq.replica, alterReq.selfIndex, alterReq.learnerReplica, + alterReq.learnerSelfIndex, alterReq.strict); return 0; } @@ -548,6 +698,7 @@ SArray *vmGetMsgHandles() { if (dmSetMgmtHandle(pArray, TDMT_VND_TRIM, vmPutMsgToWriteQueue, 0) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_DND_CREATE_VNODE, vmPutMsgToMgmtQueue, 0) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_DND_DROP_VNODE, vmPutMsgToMgmtQueue, 0) == NULL) goto _OVER; + if (dmSetMgmtHandle(pArray, TDMT_DND_ALTER_VNODE_TYPE, vmPutMsgToMgmtQueue, 0) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_SYNC_TIMEOUT_ELECTION, vmPutMsgToSyncQueue, 0) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_SYNC_CLIENT_REQUEST, vmPutMsgToSyncQueue, 0) == NULL) goto _OVER; diff --git a/source/dnode/mgmt/mgmt_vnode/src/vmWorker.c b/source/dnode/mgmt/mgmt_vnode/src/vmWorker.c index da08bd01aca0ef60674ec6bb14c2c1cb1ebc781f..a7c4b2e70e13155d955fb2fd8d4862fc6725b2ea 100644 --- a/source/dnode/mgmt/mgmt_vnode/src/vmWorker.c +++ b/source/dnode/mgmt/mgmt_vnode/src/vmWorker.c @@ -49,6 +49,9 @@ static void vmProcessMgmtQueue(SQueueInfo *pInfo, SRpcMsg *pMsg) { case TDMT_VND_ALTER_HASHRANGE: code = vmProcessAlterHashRangeReq(pMgmt, pMsg); break; + case TDMT_DND_ALTER_VNODE_TYPE: + code = vmProcessAlterVnodeTypeReq(pMgmt, pMsg); + break; default: terrno = TSDB_CODE_MSG_NOT_PROCESSED; dGError("msg:%p, not processed in vnode-mgmt queue", pMsg); diff --git a/source/dnode/mgmt/node_mgmt/src/dmEnv.c b/source/dnode/mgmt/node_mgmt/src/dmEnv.c index acf96ad397c405c35fe7089aa4d618dcd8eb13e5..c53c21cd3096ec061e0e88b2b457ca62c4f302a1 100644 --- a/source/dnode/mgmt/node_mgmt/src/dmEnv.c +++ b/source/dnode/mgmt/node_mgmt/src/dmEnv.c @@ -169,6 +169,8 @@ static int32_t dmProcessCreateNodeReq(EDndNodeType ntype, SRpcMsg *pMsg) { return -1; } + dInfo("start to process create-node-request"); + pWrapper = &pDnode->wrappers[ntype]; if (taosMkDir(pWrapper->path) != 0) { dmReleaseWrapper(pWrapper); @@ -198,6 +200,64 @@ static int32_t dmProcessCreateNodeReq(EDndNodeType ntype, SRpcMsg *pMsg) { return code; } +static int32_t dmProcessAlterNodeTypeReq(EDndNodeType ntype, SRpcMsg *pMsg) { + SDnode *pDnode = dmInstance(); + + SMgmtWrapper *pWrapper = dmAcquireWrapper(pDnode, ntype); + if (pWrapper == NULL) { + dError("fail to process alter node type since node not exist"); + return -1; + } + dmReleaseWrapper(pWrapper); + + dInfo("node:%s, start to process alter-node-type-request", pWrapper->name); + + pWrapper = &pDnode->wrappers[ntype]; + + if(pWrapper->func.isCatchUpFp != NULL){ + dInfo("node:%s, checking node catch up", pWrapper->name); + if(!(*pWrapper->func.isCatchUpFp)(pWrapper->pMgmt) == 0){ + return -1; + } + } + + dInfo("node:%s, catched up leader, continue to process alter-node-type-request", pWrapper->name); + + taosThreadMutexLock(&pDnode->mutex); + + dInfo("node:%s, stopping node", pWrapper->name); + dmStopNode(pWrapper); + dInfo("node:%s, closing node", pWrapper->name); + dmCloseNode(pWrapper); + + pWrapper = &pDnode->wrappers[ntype]; + if (taosMkDir(pWrapper->path) != 0) { + dmReleaseWrapper(pWrapper); + terrno = TAOS_SYSTEM_ERROR(errno); + dError("failed to create dir:%s since %s", pWrapper->path, terrstr()); + return -1; + } + + SMgmtInputOpt input = dmBuildMgmtInputOpt(pWrapper); + + dInfo("node:%s, start to create", pWrapper->name); + int32_t code = (*pWrapper->func.createFp)(&input, pMsg); + if (code != 0) { + dError("node:%s, failed to create since %s", pWrapper->name, terrstr()); + } else { + dInfo("node:%s, has been created", pWrapper->name); + code = dmOpenNode(pWrapper); + if (code == 0) { + code = dmStartNode(pWrapper); + } + pWrapper->deployed = true; + pWrapper->required = true; + } + + taosThreadMutexUnlock(&pDnode->mutex); + return code; +} + static int32_t dmProcessDropNodeReq(EDndNodeType ntype, SRpcMsg *pMsg) { SDnode *pDnode = dmInstance(); @@ -251,6 +311,7 @@ SMgmtInputOpt dmBuildMgmtInputOpt(SMgmtWrapper *pWrapper) { .name = pWrapper->name, .pData = &pWrapper->pDnode->data, .processCreateNodeFp = dmProcessCreateNodeReq, + .processAlterNodeTypeFp = dmProcessAlterNodeTypeReq, .processDropNodeFp = dmProcessDropNodeReq, .sendMonitorReportFp = dmSendMonitorReport, .getVnodeLoadsFp = dmGetVnodeLoads, diff --git a/source/dnode/mgmt/node_util/inc/dmUtil.h b/source/dnode/mgmt/node_util/inc/dmUtil.h index cfdea40477b1dae1cabc35e8566c1ff7ad86a148..000ce8120797e7246c9a85dc80a2ccd0d40c7ad1 100644 --- a/source/dnode/mgmt/node_util/inc/dmUtil.h +++ b/source/dnode/mgmt/node_util/inc/dmUtil.h @@ -89,6 +89,7 @@ typedef void (*SendMonitorReportFp)(); typedef void (*GetVnodeLoadsFp)(SMonVloadInfo *pInfo); typedef void (*GetMnodeLoadsFp)(SMonMloadInfo *pInfo); typedef void (*GetQnodeLoadsFp)(SQnodeLoad *pInfo); +typedef int32_t (*ProcessAlterNodeTypeFp)(EDndNodeType ntype, SRpcMsg *pMsg); typedef struct { int32_t dnodeId; @@ -112,6 +113,7 @@ typedef struct { SDnodeData *pData; SMsgCb msgCb; ProcessCreateNodeFp processCreateNodeFp; + ProcessAlterNodeTypeFp processAlterNodeTypeFp; ProcessDropNodeFp processDropNodeFp; SendMonitorReportFp sendMonitorReportFp; GetVnodeLoadsFp getVnodeLoadsFp; @@ -132,6 +134,7 @@ typedef int32_t (*NodeCreateFp)(const SMgmtInputOpt *pInput, SRpcMsg *pMsg); typedef int32_t (*NodeDropFp)(const SMgmtInputOpt *pInput, SRpcMsg *pMsg); typedef int32_t (*NodeRequireFp)(const SMgmtInputOpt *pInput, bool *required); typedef SArray *(*NodeGetHandlesFp)(); // array of SMgmtHandle +typedef bool (*NodeIsCatchUpFp)(void *pMgmt); typedef struct { NodeOpenFp openFp; @@ -142,6 +145,7 @@ typedef struct { NodeDropFp dropFp; NodeRequireFp requiredFp; NodeGetHandlesFp getHandlesFp; + NodeIsCatchUpFp isCatchUpFp; } SMgmtFunc; typedef struct { diff --git a/source/dnode/mnode/impl/inc/mndDef.h b/source/dnode/mnode/impl/inc/mndDef.h index 3a4f06f6fa3ecf4645ab0d67c52b2e2e3d04dd47..cc295f0d24d185a21d876b76bc00d62682a045b9 100644 --- a/source/dnode/mnode/impl/inc/mndDef.h +++ b/source/dnode/mnode/impl/inc/mndDef.h @@ -215,6 +215,8 @@ typedef struct { bool syncRestore; int64_t stateStartTime; SDnodeObj* pDnode; + int32_t role; + SyncIndex lastIndex; } SMnodeObj; typedef struct { @@ -341,6 +343,7 @@ typedef struct { ESyncState syncState; bool syncRestore; bool syncCanRead; + ESyncRole nodeRole; } SVnodeGid; typedef struct { @@ -361,7 +364,7 @@ typedef struct { int8_t compact; int8_t isTsma; int8_t replica; - SVnodeGid vnodeGid[TSDB_MAX_REPLICA]; + SVnodeGid vnodeGid[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA]; void* pTsma; int32_t numOfCachedTables; } SVgObj; diff --git a/source/dnode/mnode/impl/inc/mndInt.h b/source/dnode/mnode/impl/inc/mndInt.h index ffb2443808b9efdd2e0a3dfbe89662b88962f6be..ec83f8a7e6daed8e7a0685087d35c5aad90e390e 100644 --- a/source/dnode/mnode/impl/inc/mndInt.h +++ b/source/dnode/mnode/impl/inc/mndInt.h @@ -92,8 +92,11 @@ typedef struct { int64_t transSeq; TdThreadMutex lock; int8_t selfIndex; + int8_t numOfTotalReplicas; int8_t numOfReplicas; - SReplica replicas[TSDB_MAX_REPLICA]; + SReplica replicas[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA]; + ESyncRole nodeRoles[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA]; + SyncIndex lastIndex; } SSyncMgmt; typedef struct { diff --git a/source/dnode/mnode/impl/src/mndMain.c b/source/dnode/mnode/impl/src/mndMain.c index 5c20887cf5b548fc291d5ae37aac10e9a1f506b1..13ae4a11d5c61938c15e26a12b09c4c9fd057231 100644 --- a/source/dnode/mnode/impl/src/mndMain.c +++ b/source/dnode/mnode/impl/src/mndMain.c @@ -491,7 +491,10 @@ static void mndSetOptions(SMnode *pMnode, const SMnodeOpt *pOption) { pMnode->selfDnodeId = pOption->dnodeId; pMnode->syncMgmt.selfIndex = pOption->selfIndex; pMnode->syncMgmt.numOfReplicas = pOption->numOfReplicas; + pMnode->syncMgmt.numOfTotalReplicas = pOption->numOfTotalReplicas; + pMnode->syncMgmt.lastIndex = pOption->lastIndex; memcpy(pMnode->syncMgmt.replicas, pOption->replicas, sizeof(pOption->replicas)); + memcpy(pMnode->syncMgmt.nodeRoles, pOption->nodeRoles, sizeof(pOption->nodeRoles)); } SMnode *mndOpen(const char *path, const SMnodeOpt *pOption) { @@ -582,6 +585,11 @@ int32_t mndStart(SMnode *pMnode) { return mndInitTimer(pMnode); } +int32_t mndIsCatchUp(SMnode *pMnode) { + int64_t rid = pMnode->syncMgmt.sync; + return syncIsCatchUp(rid); +} + void mndStop(SMnode *pMnode) { mndSetStop(pMnode); mndSyncStop(pMnode); diff --git a/source/dnode/mnode/impl/src/mndMnode.c b/source/dnode/mnode/impl/src/mndMnode.c index 50fab447e3c7b8fb85ece7c025059358f6ab9e6f..53baf843de7d0a83a26fa1129ada5329f7862238 100644 --- a/source/dnode/mnode/impl/src/mndMnode.c +++ b/source/dnode/mnode/impl/src/mndMnode.c @@ -23,7 +23,7 @@ #include "mndTrans.h" #include "tmisce.h" -#define MNODE_VER_NUMBER 1 +#define MNODE_VER_NUMBER 2 #define MNODE_RESERVE_SIZE 64 static int32_t mndCreateDefaultMnode(SMnode *pMnode); @@ -53,6 +53,7 @@ int32_t mndInitMnode(SMnode *pMnode) { mndSetMsgHandle(pMnode, TDMT_MND_CREATE_MNODE, mndProcessCreateMnodeReq); mndSetMsgHandle(pMnode, TDMT_DND_CREATE_MNODE_RSP, mndTransProcessRsp); + mndSetMsgHandle(pMnode, TDMT_DND_ALTER_MNODE_TYPE_RSP, mndTransProcessRsp); mndSetMsgHandle(pMnode, TDMT_MND_ALTER_MNODE, mndProcessAlterMnodeReq); mndSetMsgHandle(pMnode, TDMT_MND_ALTER_MNODE_RSP, mndTransProcessRsp); mndSetMsgHandle(pMnode, TDMT_MND_DROP_MNODE, mndProcessDropMnodeReq); @@ -126,6 +127,8 @@ static SSdbRaw *mndMnodeActionEncode(SMnodeObj *pObj) { SDB_SET_INT32(pRaw, dataPos, pObj->id, _OVER) SDB_SET_INT64(pRaw, dataPos, pObj->createdTime, _OVER) SDB_SET_INT64(pRaw, dataPos, pObj->updateTime, _OVER) + SDB_SET_INT32(pRaw, dataPos, pObj->role, _OVER) + SDB_SET_INT64(pRaw, dataPos, pObj->lastIndex, _OVER) SDB_SET_RESERVE(pRaw, dataPos, MNODE_RESERVE_SIZE, _OVER) terrno = 0; @@ -149,7 +152,7 @@ static SSdbRow *mndMnodeActionDecode(SSdbRaw *pRaw) { int8_t sver = 0; if (sdbGetRawSoftVer(pRaw, &sver) != 0) return NULL; - if (sver != MNODE_VER_NUMBER) { + if (sver != 1 && sver != 2) { terrno = TSDB_CODE_SDB_INVALID_DATA_VER; goto _OVER; } @@ -164,6 +167,10 @@ static SSdbRow *mndMnodeActionDecode(SSdbRaw *pRaw) { SDB_GET_INT32(pRaw, dataPos, &pObj->id, _OVER) SDB_GET_INT64(pRaw, dataPos, &pObj->createdTime, _OVER) SDB_GET_INT64(pRaw, dataPos, &pObj->updateTime, _OVER) + if(sver >=2){ + SDB_GET_INT32(pRaw, dataPos, &pObj->role, _OVER) + SDB_GET_INT64(pRaw, dataPos, &pObj->lastIndex, _OVER) + } SDB_GET_RESERVE(pRaw, dataPos, MNODE_RESERVE_SIZE, _OVER) terrno = 0; @@ -204,7 +211,9 @@ static int32_t mndMnodeActionDelete(SSdb *pSdb, SMnodeObj *pObj) { static int32_t mndMnodeActionUpdate(SSdb *pSdb, SMnodeObj *pOld, SMnodeObj *pNew) { mTrace("mnode:%d, perform update action, old row:%p new row:%p", pOld->id, pOld, pNew); + pOld->role = pNew->role; pOld->updateTime = pNew->updateTime; + pOld->lastIndex = pNew->lastIndex; mndReloadSyncConfig(pSdb->pMnode); return 0; @@ -302,6 +311,27 @@ static int32_t mndBuildCreateMnodeRedoAction(STrans *pTrans, SDCreateMnodeReq *p return 0; } +static int32_t mndBuildAlterMnodeTypeRedoAction(STrans *pTrans, + SDAlterMnodeTypeReq *pAlterMnodeTypeReq, SEpSet *pAlterMnodeTypeEpSet) { + int32_t contLen = tSerializeSDCreateMnodeReq(NULL, 0, pAlterMnodeTypeReq); + void *pReq = taosMemoryMalloc(contLen); + tSerializeSDCreateMnodeReq(pReq, contLen, pAlterMnodeTypeReq); + + STransAction action = { + .epSet = *pAlterMnodeTypeEpSet, + .pCont = pReq, + .contLen = contLen, + .msgType = TDMT_DND_ALTER_MNODE_TYPE, + .acceptableCode = TSDB_CODE_MNODE_ALREADY_DEPLOYED, + }; + + if (mndTransAppendRedoAction(pTrans, &action) != 0) { + taosMemoryFree(pReq); + return -1; + } + return 0; +} + static int32_t mndBuildAlterMnodeRedoAction(STrans *pTrans, SDCreateMnodeReq *pAlterReq, SEpSet *pAlterEpSet) { int32_t contLen = tSerializeSDCreateMnodeReq(NULL, 0, pAlterReq); void *pReq = taosMemoryMalloc(contLen); @@ -347,6 +377,7 @@ static int32_t mndSetCreateMnodeRedoActions(SMnode *pMnode, STrans *pTrans, SDno SSdb *pSdb = pMnode->pSdb; void *pIter = NULL; int32_t numOfReplicas = 0; + int32_t numOfLearnerReplicas = 0; SDCreateMnodeReq createReq = {0}; SEpSet createEpset = {0}; @@ -355,18 +386,29 @@ static int32_t mndSetCreateMnodeRedoActions(SMnode *pMnode, STrans *pTrans, SDno pIter = sdbFetch(pSdb, SDB_MNODE, pIter, (void **)&pMObj); if (pIter == NULL) break; - createReq.replicas[numOfReplicas].id = pMObj->id; - createReq.replicas[numOfReplicas].port = pMObj->pDnode->port; - memcpy(createReq.replicas[numOfReplicas].fqdn, pMObj->pDnode->fqdn, TSDB_FQDN_LEN); + if(pMObj->role == TAOS_SYNC_ROLE_VOTER){ + createReq.replicas[numOfReplicas].id = pMObj->id; + createReq.replicas[numOfReplicas].port = pMObj->pDnode->port; + memcpy(createReq.replicas[numOfReplicas].fqdn, pMObj->pDnode->fqdn, TSDB_FQDN_LEN); + numOfReplicas++; + } + else{ + createReq.learnerReplicas[numOfLearnerReplicas].id = pMObj->id; + createReq.learnerReplicas[numOfLearnerReplicas].port = pMObj->pDnode->port; + memcpy(createReq.learnerReplicas[numOfLearnerReplicas].fqdn, pMObj->pDnode->fqdn, TSDB_FQDN_LEN); + numOfLearnerReplicas++; + } - numOfReplicas++; sdbRelease(pSdb, pMObj); } - createReq.replica = numOfReplicas + 1; - createReq.replicas[numOfReplicas].id = pDnode->id; - createReq.replicas[numOfReplicas].port = pDnode->port; - memcpy(createReq.replicas[numOfReplicas].fqdn, pDnode->fqdn, TSDB_FQDN_LEN); + createReq.replica = numOfReplicas; + createReq.learnerReplica = numOfLearnerReplicas + 1; + createReq.learnerReplicas[numOfLearnerReplicas].id = pDnode->id; + createReq.learnerReplicas[numOfLearnerReplicas].port = pDnode->port; + memcpy(createReq.learnerReplicas[numOfLearnerReplicas].fqdn, pDnode->fqdn, TSDB_FQDN_LEN); + + createReq.lastIndex = pObj->lastIndex; createEpset.inUse = 0; createEpset.numOfEps = 1; @@ -378,23 +420,78 @@ static int32_t mndSetCreateMnodeRedoActions(SMnode *pMnode, STrans *pTrans, SDno return 0; } +static int32_t mndSetAlterMnodeTypeRedoActions(SMnode *pMnode, STrans *pTrans, SDnodeObj *pDnode, SMnodeObj *pObj) { + SSdb *pSdb = pMnode->pSdb; + void *pIter = NULL; + SDAlterMnodeTypeReq alterReq = {0}; + SEpSet createEpset = {0}; + + while (1) { + SMnodeObj *pMObj = NULL; + pIter = sdbFetch(pSdb, SDB_MNODE, pIter, (void **)&pMObj); + if (pIter == NULL) break; + + if(pMObj->role == TAOS_SYNC_ROLE_VOTER){ + alterReq.replicas[alterReq.replica].id = pMObj->id; + alterReq.replicas[alterReq.replica].port = pMObj->pDnode->port; + memcpy(alterReq.replicas[alterReq.replica].fqdn, pMObj->pDnode->fqdn, TSDB_FQDN_LEN); + alterReq.replica++; + } + else{ + alterReq.learnerReplicas[alterReq.learnerReplica].id = pMObj->id; + alterReq.learnerReplicas[alterReq.learnerReplica].port = pMObj->pDnode->port; + memcpy(alterReq.learnerReplicas[alterReq.learnerReplica].fqdn, pMObj->pDnode->fqdn, TSDB_FQDN_LEN); + alterReq.learnerReplica++; + } + + sdbRelease(pSdb, pMObj); + } + + alterReq.replicas[alterReq.replica].id = pDnode->id; + alterReq.replicas[alterReq.replica].port = pDnode->port; + memcpy(alterReq.replicas[alterReq.replica].fqdn, pDnode->fqdn, TSDB_FQDN_LEN); + alterReq.replica++; + + alterReq.lastIndex = pObj->lastIndex; + + createEpset.inUse = 0; + createEpset.numOfEps = 1; + createEpset.eps[0].port = pDnode->port; + memcpy(createEpset.eps[0].fqdn, pDnode->fqdn, TSDB_FQDN_LEN); + + if (mndBuildAlterMnodeTypeRedoAction(pTrans, &alterReq, &createEpset) != 0) return -1; + + return 0; +} + static int32_t mndCreateMnode(SMnode *pMnode, SRpcMsg *pReq, SDnodeObj *pDnode, SMCreateMnodeReq *pCreate) { int32_t code = -1; - SMnodeObj mnodeObj = {0}; - mnodeObj.id = pDnode->id; - mnodeObj.createdTime = taosGetTimestampMs(); - mnodeObj.updateTime = mnodeObj.createdTime; - STrans *pTrans = mndTransCreate(pMnode, TRN_POLICY_RETRY, TRN_CONFLICT_GLOBAL, pReq, "create-mnode"); if (pTrans == NULL) goto _OVER; mndTransSetSerial(pTrans); mInfo("trans:%d, used to create mnode:%d", pTrans->id, pCreate->dnodeId); if (mndTrancCheckConflict(pMnode, pTrans) != 0) goto _OVER; + SMnodeObj mnodeObj = {0}; + mnodeObj.id = pDnode->id; + mnodeObj.createdTime = taosGetTimestampMs(); + mnodeObj.updateTime = mnodeObj.createdTime; + mnodeObj.role = TAOS_SYNC_ROLE_LEARNER; + mnodeObj.lastIndex = pMnode->applied; + if (mndSetCreateMnodeRedoActions(pMnode, pTrans, pDnode, &mnodeObj) != 0) goto _OVER; if (mndSetCreateMnodeRedoLogs(pMnode, pTrans, &mnodeObj) != 0) goto _OVER; - if (mndSetCreateMnodeCommitLogs(pMnode, pTrans, &mnodeObj) != 0) goto _OVER; + + SMnodeObj mnodeLeaderObj = {0}; + mnodeLeaderObj.id = pDnode->id; + mnodeLeaderObj.createdTime = taosGetTimestampMs(); + mnodeLeaderObj.updateTime = mnodeLeaderObj.createdTime; + mnodeLeaderObj.role = TAOS_SYNC_ROLE_VOTER; + mnodeLeaderObj.lastIndex = pMnode->applied + 1; + + if (mndSetAlterMnodeTypeRedoActions(pMnode, pTrans, pDnode, &mnodeLeaderObj) != 0) goto _OVER; + if (mndSetCreateMnodeCommitLogs(pMnode, pTrans, &mnodeLeaderObj) != 0) goto _OVER; if (mndTransPrepare(pMnode, pTrans) != 0) goto _OVER; code = 0; @@ -514,6 +611,7 @@ static int32_t mndSetDropMnodeRedoActions(SMnode *pMnode, STrans *pTrans, SDnode int32_t mndSetDropMnodeInfoToTrans(SMnode *pMnode, STrans *pTrans, SMnodeObj *pObj, bool force) { if (pObj == NULL) return 0; + pObj->lastIndex = pMnode->applied; if (mndSetDropMnodeRedoActions(pMnode, pTrans, pObj->pDnode, pObj, force) != 0) return -1; if (mndSetDropMnodeCommitLogs(pMnode, pTrans, pObj) != 0) return -1; return 0; @@ -730,7 +828,8 @@ static void mndReloadSyncConfig(SMnode *pMnode) { void *pIter = NULL; int32_t updatingMnodes = 0; int32_t readyMnodes = 0; - SSyncCfg cfg = {.myIndex = -1}; + SSyncCfg cfg = {.myIndex = -1, .lastIndex = 0,}; + SyncIndex maxIndex = 0; while (1) { pIter = sdbFetchAll(pSdb, SDB_MNODE, pIter, (void **)&pObj, &objStatus, false); @@ -745,26 +844,41 @@ static void mndReloadSyncConfig(SMnode *pMnode) { } if (objStatus == SDB_STATUS_READY || objStatus == SDB_STATUS_CREATING) { - SNodeInfo *pNode = &cfg.nodeInfo[cfg.replicaNum]; + SNodeInfo *pNode = &cfg.nodeInfo[cfg.totalReplicaNum]; pNode->nodeId = pObj->pDnode->id; pNode->clusterId = mndGetClusterId(pMnode); pNode->nodePort = pObj->pDnode->port; + pNode->nodeRole = pObj->role; tstrncpy(pNode->nodeFqdn, pObj->pDnode->fqdn, TSDB_FQDN_LEN); (void)tmsgUpdateDnodeInfo(&pNode->nodeId, &pNode->clusterId, pNode->nodeFqdn, &pNode->nodePort); mInfo("vgId:1, ep:%s:%u dnode:%d", pNode->nodeFqdn, pNode->nodePort, pNode->nodeId); if (pObj->pDnode->id == pMnode->selfDnodeId) { - cfg.myIndex = cfg.replicaNum; + cfg.myIndex = cfg.totalReplicaNum; + } + if(pNode->nodeRole == TAOS_SYNC_ROLE_VOTER){ + cfg.replicaNum++; + } + cfg.totalReplicaNum++; + if(pObj->lastIndex > cfg.lastIndex){ + cfg.lastIndex = pObj->lastIndex; + } + } + + if (objStatus == SDB_STATUS_DROPPING) { + if(pObj->lastIndex > cfg.lastIndex){ + cfg.lastIndex = pObj->lastIndex; } - cfg.replicaNum++; } + mInfo("vgId:1, mnode:%d, role:%d, lastIndex:%" PRId64, pObj->id, pObj->role, pObj->lastIndex); + sdbReleaseLock(pSdb, pObj, false); } - if (readyMnodes <= 0 || updatingMnodes <= 0) { - mInfo("vgId:1, mnode sync not reconfig since readyMnodes:%d updatingMnodes:%d", readyMnodes, updatingMnodes); - return; - } + //if (readyMnodes <= 0 || updatingMnodes <= 0) { + // mInfo("vgId:1, mnode sync not reconfig since readyMnodes:%d updatingMnodes:%d", readyMnodes, updatingMnodes); + // return; + //} if (cfg.myIndex == -1) { #if 1 @@ -777,12 +891,14 @@ static void mndReloadSyncConfig(SMnode *pMnode) { return; } - if (updatingMnodes > 0) { - mInfo("vgId:1, mnode sync reconfig, replica:%d myIndex:%d", cfg.replicaNum, cfg.myIndex); - for (int32_t i = 0; i < cfg.replicaNum; ++i) { + if (pMnode->syncMgmt.sync > 0) { + mInfo("vgId:1, mnode sync reconfig, totalReplica:%d replica:%d myIndex:%d", + cfg.totalReplicaNum, cfg.replicaNum, cfg.myIndex); + + for (int32_t i = 0; i < cfg.totalReplicaNum; ++i) { SNodeInfo *pNode = &cfg.nodeInfo[i]; - mInfo("vgId:1, index:%d, ep:%s:%u dnode:%d cluster:%" PRId64, i, pNode->nodeFqdn, pNode->nodePort, pNode->nodeId, - pNode->clusterId); + mInfo("vgId:1, index:%d, ep:%s:%u dnode:%d cluster:%" PRId64 " role:%d", i, pNode->nodeFqdn, pNode->nodePort, + pNode->nodeId, pNode->clusterId, pNode->nodeRole); } int32_t code = syncReconfig(pMnode->syncMgmt.sync, &cfg); diff --git a/source/dnode/mnode/impl/src/mndSync.c b/source/dnode/mnode/impl/src/mndSync.c index 4965d5c34a2f09b3ce25edf4bc6ed902a2f23a42..0a6df02f5fbf2636de06358831b577917846b797 100644 --- a/source/dnode/mnode/impl/src/mndSync.c +++ b/source/dnode/mnode/impl/src/mndSync.c @@ -237,6 +237,23 @@ static void mndBecomeFollower(const SSyncFSM *pFsm) { taosThreadMutexUnlock(&pMgmt->lock); } +static void mndBecomeLearner(const SSyncFSM *pFsm) { + SMnode *pMnode = pFsm->data; + SSyncMgmt *pMgmt = &pMnode->syncMgmt; + mInfo("vgId:1, become learner"); + + taosThreadMutexLock(&pMgmt->lock); + if (pMgmt->transId != 0) { + mInfo("vgId:1, become learner and post sem, trans:%d, failed to propose since not leader", pMgmt->transId); + pMgmt->transId = 0; + pMgmt->transSec = 0; + pMgmt->transSeq = 0; + pMgmt->errCode = TSDB_CODE_SYN_NOT_LEADER; + tsem_post(&pMgmt->syncSem); + } + taosThreadMutexUnlock(&pMgmt->lock); +} + static void mndBecomeLeader(const SSyncFSM *pFsm) { mInfo("vgId:1, become leader"); SMnode *pMnode = pFsm->data; @@ -278,6 +295,7 @@ SSyncFSM *mndSyncMakeFsm(SMnode *pMnode) { pFsm->FpReConfigCb = NULL; pFsm->FpBecomeLeaderCb = mndBecomeLeader; pFsm->FpBecomeFollowerCb = mndBecomeFollower; + pFsm->FpBecomeLearnerCb = mndBecomeLearner; pFsm->FpGetSnapshot = mndSyncGetSnapshot; pFsm->FpGetSnapshotInfo = mndSyncGetSnapshotInfo; pFsm->FpSnapshotStartRead = mndSnapshotStartRead; @@ -317,13 +335,16 @@ int32_t mndInitSync(SMnode *pMnode) { mInfo("vgId:1, start to open sync, replica:%d selfIndex:%d", pMgmt->numOfReplicas, pMgmt->selfIndex); SSyncCfg *pCfg = &syncInfo.syncCfg; + pCfg->totalReplicaNum = pMgmt->numOfTotalReplicas; pCfg->replicaNum = pMgmt->numOfReplicas; pCfg->myIndex = pMgmt->selfIndex; - for (int32_t i = 0; i < pMgmt->numOfReplicas; ++i) { + pCfg->lastIndex = pMgmt->lastIndex; + for (int32_t i = 0; i < pMgmt->numOfTotalReplicas; ++i) { SNodeInfo *pNode = &pCfg->nodeInfo[i]; pNode->nodeId = pMgmt->replicas[i].id; pNode->nodePort = pMgmt->replicas[i].port; tstrncpy(pNode->nodeFqdn, pMgmt->replicas[i].fqdn, sizeof(pNode->nodeFqdn)); + pNode->nodeRole = pMgmt->nodeRoles[i]; (void)tmsgUpdateDnodeInfo(&pNode->nodeId, &pNode->clusterId, pNode->nodeFqdn, &pNode->nodePort); mInfo("vgId:1, index:%d ep:%s:%u dnode:%d cluster:%" PRId64, i, pNode->nodeFqdn, pNode->nodePort, pNode->nodeId, pNode->clusterId); diff --git a/source/dnode/mnode/impl/src/mndVgroup.c b/source/dnode/mnode/impl/src/mndVgroup.c index ed1fddb63f9310b5c0aae6d99d249c47a20a2a7d..d674db5b4bdc2571104f14c0843440594e267d12 100644 --- a/source/dnode/mnode/impl/src/mndVgroup.c +++ b/source/dnode/mnode/impl/src/mndVgroup.c @@ -62,6 +62,7 @@ int32_t mndInitVgroup(SMnode *pMnode) { mndSetMsgHandle(pMnode, TDMT_VND_COMPACT_RSP, mndTransProcessRsp); mndSetMsgHandle(pMnode, TDMT_VND_DISABLE_WRITE_RSP, mndTransProcessRsp); mndSetMsgHandle(pMnode, TDMT_SYNC_FORCE_FOLLOWER_RSP, mndTransProcessRsp); + mndSetMsgHandle(pMnode, TDMT_DND_ALTER_VNODE_TYPE_RSP, mndTransProcessRsp); mndSetMsgHandle(pMnode, TDMT_MND_REDISTRIBUTE_VGROUP, mndProcessRedistributeVgroupMsg); mndSetMsgHandle(pMnode, TDMT_MND_SPLIT_VGROUP, mndProcessSplitVgroupMsg); @@ -203,7 +204,7 @@ static int32_t mndVgroupActionUpdate(SSdb *pSdb, SVgObj *pOld, SVgObj *pNew) { pNew->compStorage = pOld->compStorage; pNew->pointsWritten = pOld->pointsWritten; pNew->compact = pOld->compact; - memcpy(pOld->vnodeGid, pNew->vnodeGid, TSDB_MAX_REPLICA * sizeof(SVnodeGid)); + memcpy(pOld->vnodeGid, pNew->vnodeGid, (TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA) * sizeof(SVnodeGid)); return 0; } @@ -244,8 +245,10 @@ void *mndBuildCreateVnodeReq(SMnode *pMnode, SDnodeObj *pDnode, SDbObj *pDb, SVg createReq.compression = pDb->cfg.compression; createReq.strict = pDb->cfg.strict; createReq.cacheLast = pDb->cfg.cacheLast; - createReq.replica = pVgroup->replica; + createReq.replica = 0; + createReq.learnerReplica = 0; createReq.selfIndex = -1; + createReq.learnerSelfIndex = -1; createReq.hashBegin = pVgroup->hashBegin; createReq.hashEnd = pVgroup->hashEnd; createReq.hashMethod = pDb->cfg.hashMethod; @@ -263,7 +266,15 @@ void *mndBuildCreateVnodeReq(SMnode *pMnode, SDnodeObj *pDnode, SDbObj *pDb, SVg createReq.tsdbPageSize = pDb->cfg.tsdbPageSize; for (int32_t v = 0; v < pVgroup->replica; ++v) { - SReplica *pReplica = &createReq.replicas[v]; + SReplica *pReplica = NULL; + + if(pVgroup->vnodeGid[v].nodeRole == TAOS_SYNC_ROLE_VOTER){ + pReplica = &createReq.replicas[createReq.replica]; + } + else{ + pReplica = &createReq.learnerReplicas[createReq.learnerReplica]; + } + SVnodeGid *pVgid = &pVgroup->vnodeGid[v]; SDnodeObj *pVgidDnode = mndAcquireDnode(pMnode, pVgid->dnodeId); if (pVgidDnode == NULL) { @@ -275,21 +286,40 @@ void *mndBuildCreateVnodeReq(SMnode *pMnode, SDnodeObj *pDnode, SDbObj *pDb, SVg memcpy(pReplica->fqdn, pVgidDnode->fqdn, TSDB_FQDN_LEN); mndReleaseDnode(pMnode, pVgidDnode); - if (pDnode->id == pVgid->dnodeId) { - createReq.selfIndex = v; + if(pVgroup->vnodeGid[v].nodeRole == TAOS_SYNC_ROLE_VOTER){ + if (pDnode->id == pVgid->dnodeId) { + createReq.selfIndex = createReq.replica; + } + } + else{ + if (pDnode->id == pVgid->dnodeId) { + createReq.learnerSelfIndex = createReq.learnerReplica; + } + } + + if(pVgroup->vnodeGid[v].nodeRole == TAOS_SYNC_ROLE_VOTER){ + createReq.replica++; + } + else{ + createReq.learnerReplica++; } } - if (createReq.selfIndex == -1) { + if (createReq.selfIndex == -1 && createReq.learnerSelfIndex == -1) { terrno = TSDB_CODE_APP_ERROR; return NULL; } - mInfo("vgId:%d, build create vnode req, replica:%d selfIndex:%d strict:%d", createReq.vgId, createReq.replica, - createReq.selfIndex, createReq.strict); + mInfo("vgId:%d, build create vnode req, replica:%d selfIndex:%d learnerReplica:%d learnerSelfIndex:%d strict:%d", + createReq.vgId, createReq.replica, createReq.selfIndex, createReq.learnerReplica, + createReq.learnerReplica, createReq.strict); for (int32_t i = 0; i < createReq.replica; ++i) { mInfo("vgId:%d, replica:%d ep:%s:%u", createReq.vgId, i, createReq.replicas[i].fqdn, createReq.replicas[i].port); } + for (int32_t i = 0; i < createReq.learnerReplica; ++i) { + mInfo("vgId:%d, replica:%d ep:%s:%u", createReq.vgId, i, createReq.learnerReplicas[i].fqdn, + createReq.learnerReplicas[i].port); + } int32_t contLen = tSerializeSCreateVnodeReq(NULL, 0, &createReq); if (contLen < 0) { @@ -356,12 +386,24 @@ static void *mndBuildAlterVnodeReplicaReq(SMnode *pMnode, SDbObj *pDb, SVgObj *p SAlterVnodeReplicaReq alterReq = { .vgId = pVgroup->vgId, .strict = pDb->cfg.strict, - .replica = pVgroup->replica, + .replica = 0, + .learnerReplica = 0, .selfIndex = -1, + .learnerSelfIndex = -1, }; for (int32_t v = 0; v < pVgroup->replica; ++v) { - SReplica *pReplica = &alterReq.replicas[v]; + SReplica *pReplica = NULL; + + if(pVgroup->vnodeGid[v].nodeRole == TAOS_SYNC_ROLE_VOTER){ + pReplica = &alterReq.replicas[alterReq.replica]; + alterReq.replica++; + } + else{ + pReplica = &alterReq.learnerReplicas[alterReq.learnerReplica]; + alterReq.learnerReplica++; + } + SVnodeGid *pVgid = &pVgroup->vnodeGid[v]; SDnodeObj *pVgidDnode = mndAcquireDnode(pMnode, pVgid->dnodeId); if (pVgidDnode == NULL) return NULL; @@ -371,18 +413,30 @@ static void *mndBuildAlterVnodeReplicaReq(SMnode *pMnode, SDbObj *pDb, SVgObj *p memcpy(pReplica->fqdn, pVgidDnode->fqdn, TSDB_FQDN_LEN); mndReleaseDnode(pMnode, pVgidDnode); - if (dnodeId == pVgid->dnodeId) { - alterReq.selfIndex = v; + if(pVgroup->vnodeGid[v].nodeRole == TAOS_SYNC_ROLE_VOTER){ + if (dnodeId == pVgid->dnodeId) { + alterReq.selfIndex = v; + } + } + else{ + if (dnodeId == pVgid->dnodeId) { + alterReq.learnerSelfIndex = v; + } } } - alterReq.replica = pVgroup->replica; - mInfo("vgId:%d, build alter vnode req, replica:%d selfIndex:%d strict:%d", alterReq.vgId, alterReq.replica, - alterReq.selfIndex, alterReq.strict); + + mInfo("vgId:%d, build alter vnode req, replica:%d selfIndex:%d learnerReplica:%d learnerSelfIndex:%d strict:%d", + alterReq.vgId, alterReq.replica, alterReq.selfIndex, alterReq.learnerReplica, + alterReq.learnerSelfIndex, alterReq.strict); for (int32_t i = 0; i < alterReq.replica; ++i) { mInfo("vgId:%d, replica:%d ep:%s:%u", alterReq.vgId, i, alterReq.replicas[i].fqdn, alterReq.replicas[i].port); } + for (int32_t i = 0; i < alterReq.learnerReplica; ++i) { + mInfo("vgId:%d, learnerReplica:%d ep:%s:%u", alterReq.vgId, i, + alterReq.learnerReplicas[i].fqdn, alterReq.learnerReplicas[i].port); + } - if (alterReq.selfIndex == -1) { + if (alterReq.selfIndex == -1 && alterReq.learnerSelfIndex == -1) { terrno = TSDB_CODE_APP_ERROR; return NULL; } @@ -1194,6 +1248,30 @@ int32_t mndAddAlterVnodeReplicaAction(SMnode *pMnode, STrans *pTrans, SDbObj *pD return 0; } +int32_t mndAddAlterVnodeTypeAction(SMnode *pMnode, STrans *pTrans, SDbObj *pDb, SVgObj *pVgroup, int32_t dnodeId) { + SDnodeObj *pDnode = mndAcquireDnode(pMnode, dnodeId); + if (pDnode == NULL) return -1; + + STransAction action = {0}; + action.epSet = mndGetDnodeEpset(pDnode); + mndReleaseDnode(pMnode, pDnode); + + int32_t contLen = 0; + void *pReq = mndBuildAlterVnodeReplicaReq(pMnode, pDb, pVgroup, dnodeId, &contLen); + if (pReq == NULL) return -1; + + action.pCont = pReq; + action.contLen = contLen; + action.msgType = TDMT_DND_ALTER_VNODE_TYPE; + + if (mndTransAppendRedoAction(pTrans, &action) != 0) { + taosMemoryFree(pReq); + return -1; + } + + return 0; +} + static int32_t mndAddDisableVnodeWriteAction(SMnode *pMnode, STrans *pTrans, SDbObj *pDb, SVgObj *pVgroup, int32_t dnodeId) { SDnodeObj *pDnode = mndAcquireDnode(pMnode, dnodeId); @@ -1953,18 +2031,33 @@ int32_t mndBuildAlterVgroupAction(SMnode *pMnode, STrans *pTrans, SDbObj *pOldDb mInfo("db:%s, vgId:%d, will add 2 vnodes, vn:0 dnode:%d", pVgroup->dbName, pVgroup->vgId, pVgroup->vnodeGid[0].dnodeId); + //add first if (mndAddVnodeToVgroup(pMnode, pTrans, &newVgroup, pArray) != 0) return -1; + + newVgroup.vnodeGid[0].nodeRole = TAOS_SYNC_ROLE_VOTER; + newVgroup.vnodeGid[1].nodeRole = TAOS_SYNC_ROLE_LEARNER; if (mndAddAlterVnodeReplicaAction(pMnode, pTrans, pNewDb, &newVgroup, newVgroup.vnodeGid[0].dnodeId) != 0) return -1; + if (mndAddCreateVnodeAction(pMnode, pTrans, pNewDb, &newVgroup, &newVgroup.vnodeGid[1]) != 0) return -1; + newVgroup.vnodeGid[1].nodeRole = TAOS_SYNC_ROLE_VOTER; + if (mndAddAlterVnodeTypeAction(pMnode, pTrans, pNewDb, &newVgroup, newVgroup.vnodeGid[1].dnodeId) != 0) + return -1; + if (mndAddAlterVnodeConfirmAction(pMnode, pTrans, pNewDb, &newVgroup) != 0) return -1; + //add second if (mndAddVnodeToVgroup(pMnode, pTrans, &newVgroup, pArray) != 0) return -1; + newVgroup.vnodeGid[0].nodeRole = TAOS_SYNC_ROLE_VOTER; + newVgroup.vnodeGid[1].nodeRole = TAOS_SYNC_ROLE_VOTER; + newVgroup.vnodeGid[2].nodeRole = TAOS_SYNC_ROLE_VOTER; + if (mndAddAlterVnodeReplicaAction(pMnode, pTrans, pNewDb, &newVgroup, newVgroup.vnodeGid[0].dnodeId) != 0) return -1; if (mndAddAlterVnodeReplicaAction(pMnode, pTrans, pNewDb, &newVgroup, newVgroup.vnodeGid[1].dnodeId) != 0) return -1; if (mndAddCreateVnodeAction(pMnode, pTrans, pNewDb, &newVgroup, &newVgroup.vnodeGid[2]) != 0) return -1; + if (mndAddAlterVnodeConfirmAction(pMnode, pTrans, pNewDb, &newVgroup) != 0) return -1; } else if (newVgroup.replica == 3 && pNewDb->cfg.replications == 1) { mInfo("db:%s, vgId:%d, will remove 2 vnodes, vn:0 dnode:%d vn:1 dnode:%d vn:2 dnode:%d", pVgroup->dbName, diff --git a/source/dnode/vnode/inc/vnode.h b/source/dnode/vnode/inc/vnode.h index fb2c2f4be340461276dddca80eb565318a0f7e87..7dfaf1508df1c6c3d930789f4193d919a63c4cf7 100644 --- a/source/dnode/vnode/inc/vnode.h +++ b/source/dnode/vnode/inc/vnode.h @@ -68,6 +68,7 @@ void vnodeGetSnapshot(SVnode *pVnode, SSnapshot *pSnapshot); void vnodeGetInfo(SVnode *pVnode, const char **dbname, int32_t *vgId); int32_t vnodeProcessCreateTSma(SVnode *pVnode, void *pCont, uint32_t contLen); int32_t vnodeGetAllTableList(SVnode *pVnode, uint64_t uid, SArray *list); +int32_t vnodeIsCatchUp(SVnode *pVnode); int32_t vnodeGetCtbIdList(SVnode *pVnode, int64_t suid, SArray *list); int32_t vnodeGetCtbIdListByFilter(SVnode *pVnode, int64_t suid, SArray *list, bool (*filter)(void *arg), void *arg); diff --git a/source/dnode/vnode/src/vnd/vnodeCfg.c b/source/dnode/vnode/src/vnd/vnodeCfg.c index c326c8bfacbd897fc15527edaef5cd8e77c40f5f..a760fc03c22dfac4b7c9ad8a8cf4a81fdb6dc775 100644 --- a/source/dnode/vnode/src/vnd/vnodeCfg.c +++ b/source/dnode/vnode/src/vnd/vnodeCfg.c @@ -57,6 +57,28 @@ int vnodeCheckCfg(const SVnodeCfg *pCfg) { return 0; } +const char* vnodeRoleToStr(ESyncRole role) { + switch (role) { + case TAOS_SYNC_ROLE_VOTER: + return "voter"; + case TAOS_SYNC_ROLE_LEARNER: + return "learner"; + default: + return "unknown"; + } +} + +const ESyncRole vnodeStrToRole(char* str) { + if(strcmp(str, "voter") == 0){ + return TAOS_SYNC_ROLE_VOTER; + } + if(strcmp(str, "learner") == 0){ + return TAOS_SYNC_ROLE_LEARNER; + } + + return TAOS_SYNC_ROLE_ERROR; +} + int vnodeEncodeConfig(const void *pObj, SJson *pJson) { const SVnodeCfg *pCfg = (SVnodeCfg *)pObj; @@ -117,6 +139,7 @@ int vnodeEncodeConfig(const void *pObj, SJson *pJson) { if (tjsonAddIntegerToObject(pJson, "hashSuffix", pCfg->hashSuffix) < 0) return -1; if (tjsonAddIntegerToObject(pJson, "syncCfg.replicaNum", pCfg->syncCfg.replicaNum) < 0) return -1; + if (tjsonAddIntegerToObject(pJson, "syncCfg.totalReplicaNum", pCfg->syncCfg.totalReplicaNum) < 0) return -1; if (tjsonAddIntegerToObject(pJson, "syncCfg.myIndex", pCfg->syncCfg.myIndex) < 0) return -1; if (tjsonAddIntegerToObject(pJson, "vndStats.stables", pCfg->vndStats.numOfSTables) < 0) return -1; @@ -128,9 +151,9 @@ int vnodeEncodeConfig(const void *pObj, SJson *pJson) { SJson *nodeInfo = tjsonCreateArray(); if (nodeInfo == NULL) return -1; if (tjsonAddItemToObject(pJson, "syncCfg.nodeInfo", nodeInfo) < 0) return -1; - vDebug("vgId:%d, encode config, replicas:%d selfIndex:%d", pCfg->vgId, pCfg->syncCfg.replicaNum, - pCfg->syncCfg.myIndex); - for (int i = 0; i < pCfg->syncCfg.replicaNum; ++i) { + vDebug("vgId:%d, encode config, replicas:%d totalReplicas:%d selfIndex:%d", pCfg->vgId, pCfg->syncCfg.replicaNum, + pCfg->syncCfg.totalReplicaNum, pCfg->syncCfg.myIndex); + for (int i = 0; i < pCfg->syncCfg.totalReplicaNum; ++i) { SJson *info = tjsonCreateObject(); SNodeInfo *pNode = (SNodeInfo *)&pCfg->syncCfg.nodeInfo[i]; if (info == NULL) return -1; @@ -138,6 +161,7 @@ int vnodeEncodeConfig(const void *pObj, SJson *pJson) { if (tjsonAddStringToObject(info, "nodeFqdn", pNode->nodeFqdn) < 0) return -1; if (tjsonAddIntegerToObject(info, "nodeId", pNode->nodeId) < 0) return -1; if (tjsonAddIntegerToObject(info, "clusterId", pNode->clusterId) < 0) return -1; + if (tjsonAddStringToObject(info, "nodeRole", vnodeRoleToStr(pNode->nodeRole)) < 0) return -1; if (tjsonAddItemToArray(nodeInfo, info) < 0) return -1; vDebug("vgId:%d, encode config, replica:%d ep:%s:%u dnode:%d", pCfg->vgId, i, pNode->nodeFqdn, pNode->nodePort, pNode->nodeId); @@ -235,6 +259,8 @@ int vnodeDecodeConfig(const SJson *pJson, void *pObj) { tjsonGetNumberValue(pJson, "syncCfg.replicaNum", pCfg->syncCfg.replicaNum, code); if (code < 0) return -1; + tjsonGetNumberValue(pJson, "syncCfg.totalReplicaNum", pCfg->syncCfg.totalReplicaNum, code); + if (code < 0) return -1; tjsonGetNumberValue(pJson, "syncCfg.myIndex", pCfg->syncCfg.myIndex, code); if (code < 0) return -1; @@ -251,10 +277,10 @@ int vnodeDecodeConfig(const SJson *pJson, void *pObj) { SJson *nodeInfo = tjsonGetObjectItem(pJson, "syncCfg.nodeInfo"); int arraySize = tjsonGetArraySize(nodeInfo); - if (arraySize != pCfg->syncCfg.replicaNum) return -1; + if (arraySize != pCfg->syncCfg.totalReplicaNum) return -1; - vDebug("vgId:%d, decode config, replicas:%d selfIndex:%d", pCfg->vgId, pCfg->syncCfg.replicaNum, - pCfg->syncCfg.myIndex); + vDebug("vgId:%d, decode config, replicas:%d totalReplicas:%d selfIndex:%d", pCfg->vgId, pCfg->syncCfg.replicaNum, + pCfg->syncCfg.totalReplicaNum, pCfg->syncCfg.myIndex); for (int i = 0; i < arraySize; ++i) { SJson *info = tjsonGetArrayItem(nodeInfo, i); SNodeInfo *pNode = &pCfg->syncCfg.nodeInfo[i]; @@ -266,6 +292,9 @@ int vnodeDecodeConfig(const SJson *pJson, void *pObj) { if (code < 0) return -1; tjsonGetNumberValue(info, "clusterId", pNode->clusterId, code); if (code < 0) return -1; + char role[10]; + tjsonGetStringValue(info, "nodeRole", role); + pNode->nodeRole = vnodeStrToRole(role); vDebug("vgId:%d, decode config, replica:%d ep:%s:%u dnode:%d", pCfg->vgId, i, pNode->nodeFqdn, pNode->nodePort, pNode->nodeId); } diff --git a/source/dnode/vnode/src/vnd/vnodeOpen.c b/source/dnode/vnode/src/vnd/vnodeOpen.c index c7d155be0d50600e419940b03297d2bad6d29528..b432e8ba4eadccd8e0821be3c7b035e04a7f1674 100644 --- a/source/dnode/vnode/src/vnd/vnodeOpen.c +++ b/source/dnode/vnode/src/vnd/vnodeOpen.c @@ -76,19 +76,41 @@ int32_t vnodeAlterReplica(const char *path, SAlterVnodeReplicaReq *pReq, STfs *p } SSyncCfg *pCfg = &info.config.syncCfg; - pCfg->myIndex = pReq->selfIndex; - pCfg->replicaNum = pReq->replica; + + pCfg->replicaNum = 0; + pCfg->totalReplicaNum = 0; memset(&pCfg->nodeInfo, 0, sizeof(pCfg->nodeInfo)); - vInfo("vgId:%d, save config while alter, replicas:%d selfIndex:%d", pReq->vgId, pCfg->replicaNum, pCfg->myIndex); for (int i = 0; i < pReq->replica; ++i) { SNodeInfo *pNode = &pCfg->nodeInfo[i]; pNode->nodeId = pReq->replicas[i].id; pNode->nodePort = pReq->replicas[i].port; tstrncpy(pNode->nodeFqdn, pReq->replicas[i].fqdn, sizeof(pNode->nodeFqdn)); + pNode->nodeRole = TAOS_SYNC_ROLE_VOTER; (void)tmsgUpdateDnodeInfo(&pNode->nodeId, &pNode->clusterId, pNode->nodeFqdn, &pNode->nodePort); vInfo("vgId:%d, replica:%d ep:%s:%u dnode:%d", pReq->vgId, i, pNode->nodeFqdn, pNode->nodePort, pNode->nodeId); + pCfg->replicaNum++; } + if(pReq->selfIndex != -1){ + pCfg->myIndex = pReq->selfIndex; + } + for (int i = pCfg->replicaNum; i < pReq->replica + pReq->learnerReplica; ++i) { + SNodeInfo *pNode = &pCfg->nodeInfo[i]; + pNode->nodeId = pReq->learnerReplicas[pCfg->totalReplicaNum].id; + pNode->nodePort = pReq->learnerReplicas[pCfg->totalReplicaNum].port; + pNode->nodeRole = TAOS_SYNC_ROLE_LEARNER; + tstrncpy(pNode->nodeFqdn, pReq->learnerReplicas[pCfg->totalReplicaNum].fqdn, sizeof(pNode->nodeFqdn)); + (void)tmsgUpdateDnodeInfo(&pNode->nodeId, &pNode->clusterId, pNode->nodeFqdn, &pNode->nodePort); + vInfo("vgId:%d, replica:%d ep:%s:%u dnode:%d", pReq->vgId, i, pNode->nodeFqdn, pNode->nodePort, pNode->nodeId); + pCfg->totalReplicaNum++; + } + pCfg->totalReplicaNum += pReq->replica; + if(pReq->learnerSelfIndex != -1){ + pCfg->myIndex = pReq->replica + pReq->learnerSelfIndex; + } + + vInfo("vgId:%d, save config while alter, replicas:%d totalReplicas:%d selfIndex:%d", + pReq->vgId, pCfg->replicaNum, pCfg->totalReplicaNum, pCfg->myIndex); info.config.syncCfg = *pCfg; ret = vnodeSaveInfo(dir, &info); @@ -181,6 +203,7 @@ int32_t vnodeAlterHashRange(const char *srcPath, const char *dstPath, SAlterVnod SSyncCfg *pCfg = &info.config.syncCfg; pCfg->myIndex = 0; pCfg->replicaNum = 1; + pCfg->totalReplicaNum = 1; memset(&pCfg->nodeInfo, 0, sizeof(pCfg->nodeInfo)); vInfo("vgId:%d, alter vnode replicas to 1", pReq->srcVgId); @@ -248,7 +271,7 @@ SVnode *vnodeOpen(const char *path, STfs *pTfs, SMsgCb msgCb) { // save vnode info on dnode ep changed bool updated = false; SSyncCfg *pCfg = &info.config.syncCfg; - for (int32_t i = 0; i < pCfg->replicaNum; ++i) { + for (int32_t i = 0; i < pCfg->totalReplicaNum; ++i) { SNodeInfo *pNode = &pCfg->nodeInfo[i]; if (tmsgUpdateDnodeInfo(&pNode->nodeId, &pNode->clusterId, pNode->nodeFqdn, &pNode->nodePort)) { updated = true; @@ -406,6 +429,10 @@ void vnodeClose(SVnode *pVnode) { // start the sync timer after the queue is ready int32_t vnodeStart(SVnode *pVnode) { return vnodeSyncStart(pVnode); } +int32_t vnodeIsCatchUp(SVnode *pVnode){ + return syncIsCatchUp(pVnode->sync); +} + void vnodeStop(SVnode *pVnode) {} int64_t vnodeGetSyncHandle(SVnode *pVnode) { return pVnode->sync; } diff --git a/source/dnode/vnode/src/vnd/vnodeSvr.c b/source/dnode/vnode/src/vnd/vnodeSvr.c index b62bf27def54c75558bd184a330655a6745d871d..1adb75ac8853ee5a6ebf0a229cf7d8147315a63d 100644 --- a/source/dnode/vnode/src/vnd/vnodeSvr.c +++ b/source/dnode/vnode/src/vnd/vnodeSvr.c @@ -1447,7 +1447,8 @@ int32_t vnodeProcessCreateTSma(SVnode *pVnode, void *pCont, uint32_t contLen) { } static int32_t vnodeProcessAlterConfirmReq(SVnode *pVnode, int64_t version, void *pReq, int32_t len, SRpcMsg *pRsp) { - vInfo("vgId:%d, alter replica confim msg is processed", TD_VID(pVnode)); + vInfo("vgId:%d, vnode management handle msgType:alter-confirm, alter replica confim msg is processed", + TD_VID(pVnode)); pRsp->msgType = TDMT_VND_ALTER_CONFIRM_RSP; pRsp->code = TSDB_CODE_SUCCESS; pRsp->pCont = NULL; diff --git a/source/dnode/vnode/src/vnd/vnodeSync.c b/source/dnode/vnode/src/vnd/vnodeSync.c index d4a394b584edb4886984f285f261e1115cc4c444..bc69378487869fa22988fb0bf9cfbe51b391bfe6 100644 --- a/source/dnode/vnode/src/vnd/vnodeSync.c +++ b/source/dnode/vnode/src/vnd/vnodeSync.c @@ -571,6 +571,19 @@ static void vnodeBecomeFollower(const SSyncFSM *pFsm) { taosThreadMutexUnlock(&pVnode->lock); } +static void vnodeBecomeLearner(const SSyncFSM *pFsm) { + SVnode *pVnode = pFsm->data; + vInfo("vgId:%d, become learner", pVnode->config.vgId); + + taosThreadMutexLock(&pVnode->lock); + if (pVnode->blocked) { + pVnode->blocked = false; + vDebug("vgId:%d, become learner and post block", pVnode->config.vgId); + tsem_post(&pVnode->syncSem); + } + taosThreadMutexUnlock(&pVnode->lock); +} + static void vnodeBecomeLeader(const SSyncFSM *pFsm) { SVnode *pVnode = pFsm->data; vDebug("vgId:%d, become leader", pVnode->config.vgId); @@ -612,6 +625,7 @@ static SSyncFSM *vnodeSyncMakeFsm(SVnode *pVnode) { pFsm->FpApplyQueueItems = vnodeApplyQueueItems; pFsm->FpBecomeLeaderCb = vnodeBecomeLeader; pFsm->FpBecomeFollowerCb = vnodeBecomeFollower; + pFsm->FpBecomeLearnerCb = vnodeBecomeLearner; pFsm->FpReConfigCb = NULL; pFsm->FpSnapshotStartRead = vnodeSnapshotStartRead; pFsm->FpSnapshotStopRead = vnodeSnapshotStopRead; @@ -644,7 +658,7 @@ int32_t vnodeSyncOpen(SVnode *pVnode, char *path) { SSyncCfg *pCfg = &syncInfo.syncCfg; vInfo("vgId:%d, start to open sync, replica:%d selfIndex:%d", pVnode->config.vgId, pCfg->replicaNum, pCfg->myIndex); - for (int32_t i = 0; i < pCfg->replicaNum; ++i) { + for (int32_t i = 0; i < pCfg->totalReplicaNum; ++i) { SNodeInfo *pNode = &pCfg->nodeInfo[i]; vInfo("vgId:%d, index:%d ep:%s:%u dnode:%d cluster:%" PRId64, pVnode->config.vgId, i, pNode->nodeFqdn, pNode->nodePort, pNode->nodeId, pNode->clusterId); diff --git a/source/libs/sync/inc/syncIndexMgr.h b/source/libs/sync/inc/syncIndexMgr.h index 4e6ab284f8cf9b10a06f5880abd8f14e7c13de81..a4e83dc4956160648362b55900b5131ae80998cf 100644 --- a/source/libs/sync/inc/syncIndexMgr.h +++ b/source/libs/sync/inc/syncIndexMgr.h @@ -24,12 +24,13 @@ extern "C" { // SIndexMgr ----------------------------- typedef struct SSyncIndexMgr { - SRaftId (*replicas)[TSDB_MAX_REPLICA]; - SyncIndex index[TSDB_MAX_REPLICA]; - SyncTerm privateTerm[TSDB_MAX_REPLICA]; // for advanced function - int64_t startTimeArr[TSDB_MAX_REPLICA]; - int64_t recvTimeArr[TSDB_MAX_REPLICA]; + SRaftId (*replicas)[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA]; + SyncIndex index[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA]; + SyncTerm privateTerm[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA]; // for advanced function + int64_t startTimeArr[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA]; + int64_t recvTimeArr[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA]; int32_t replicaNum; + int32_t totalReplicaNum; SSyncNode *pNode; } SSyncIndexMgr; diff --git a/source/libs/sync/inc/syncInt.h b/source/libs/sync/inc/syncInt.h index 6f2c1a1ad0e76d233d516e36ee419cae43fcc409..7d336c83135a20449f09431c2d9427a6dd6cc0c5 100644 --- a/source/libs/sync/inc/syncInt.h +++ b/source/libs/sync/inc/syncInt.h @@ -54,13 +54,13 @@ typedef struct SSyncLogReplMgr SSyncLogReplMgr; #define MAX_CONFIG_INDEX_COUNT 256 typedef struct SRaftCfg { - SSyncCfg cfg; - int32_t batchSize; - int8_t isStandBy; - int8_t snapshotStrategy; - SyncIndex lastConfigIndex; - int32_t configIndexCount; - SyncIndex configIndexArr[MAX_CONFIG_INDEX_COUNT]; + SSyncCfg cfg; + int32_t batchSize; + int8_t isStandBy; + int8_t snapshotStrategy; + SyncIndex lastConfigIndex; + int32_t configIndexCount; + SyncIndex configIndexArr[MAX_CONFIG_INDEX_COUNT]; } SRaftCfg; typedef struct SRaftId { @@ -127,12 +127,13 @@ typedef struct SSyncNode { SRaftId myRaftId; int32_t peersNum; - SNodeInfo peersNodeInfo[TSDB_MAX_REPLICA]; - SEpSet peersEpset[TSDB_MAX_REPLICA]; - SRaftId peersId[TSDB_MAX_REPLICA]; + SNodeInfo peersNodeInfo[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA]; + SEpSet peersEpset[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA]; + SRaftId peersId[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA]; int32_t replicaNum; - SRaftId replicasId[TSDB_MAX_REPLICA]; + int32_t totalReplicaNum; + SRaftId replicasId[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA]; // raft algorithm SSyncFSM* pFsm; @@ -188,7 +189,7 @@ typedef struct SSyncNode { uint64_t heartbeatTimerCounter; // peer heartbeat timer - SSyncTimer peerHeartbeatTimerArr[TSDB_MAX_REPLICA]; + SSyncTimer peerHeartbeatTimerArr[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA]; // tools SSyncRespMgr* pSyncRespMgr; @@ -196,13 +197,13 @@ typedef struct SSyncNode { // restore state bool restoreFinish; // SSnapshot* pSnapshot; - SSyncSnapshotSender* senders[TSDB_MAX_REPLICA]; + SSyncSnapshotSender* senders[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA]; SSyncSnapshotReceiver* pNewNodeReceiver; // log replication mgr - SSyncLogReplMgr* logReplMgrs[TSDB_MAX_REPLICA]; + SSyncLogReplMgr* logReplMgrs[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA]; - SPeerState peerStates[TSDB_MAX_REPLICA]; + SPeerState peerStates[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA]; // is config changing bool changing; @@ -275,6 +276,7 @@ void syncNodeUpdateTerm(SSyncNode* pSyncNode, SyncTerm term); void syncNodeUpdateTermWithoutStepDown(SSyncNode* pSyncNode, SyncTerm term); void syncNodeStepDown(SSyncNode* pSyncNode, SyncTerm newTerm); void syncNodeBecomeFollower(SSyncNode* pSyncNode, const char* debugStr); +void syncNodeBecomeLearner(SSyncNode* pSyncNode, const char* debugStr); void syncNodeBecomeLeader(SSyncNode* pSyncNode, const char* debugStr); void syncNodeCandidate2Leader(SSyncNode* pSyncNode); void syncNodeFollower2Candidate(SSyncNode* pSyncNode); diff --git a/source/libs/sync/inc/syncMessage.h b/source/libs/sync/inc/syncMessage.h index c3566c7c820b97c4484f15ce59239d423db9d480..f8c96d8be296f136292996daed85f78d2b4aa698 100644 --- a/source/libs/sync/inc/syncMessage.h +++ b/source/libs/sync/inc/syncMessage.h @@ -237,6 +237,7 @@ typedef struct SyncLeaderTransfer { typedef enum { SYNC_LOCAL_CMD_STEP_DOWN = 100, SYNC_LOCAL_CMD_FOLLOWER_CMT, + SYNC_LOCAL_CMD_LEARNER_CMT, } ESyncLocalCmd; typedef struct SyncLocalCmd { diff --git a/source/libs/sync/inc/syncPipeline.h b/source/libs/sync/inc/syncPipeline.h index d709e33cd46aa091255afa84a3e3aac91bbe1d03..02790732a23e02e21a5708169780e222711754c2 100644 --- a/source/libs/sync/inc/syncPipeline.h +++ b/source/libs/sync/inc/syncPipeline.h @@ -56,6 +56,8 @@ typedef struct SSyncLogBuffer { int64_t size; TdThreadMutex mutex; TdThreadMutexAttr attr; + int64_t totalIndex; + bool isCatchup; } SSyncLogBuffer; // SSyncLogRepMgr diff --git a/source/libs/sync/src/syncAppendEntries.c b/source/libs/sync/src/syncAppendEntries.c index 9ab545075cfc38281358e8847447f821a00393aa..b488455dba4cfe5e1c103cbf2319bd730e59e7b3 100644 --- a/source/libs/sync/src/syncAppendEntries.c +++ b/source/libs/sync/src/syncAppendEntries.c @@ -137,8 +137,10 @@ int32_t syncNodeOnAppendEntries(SSyncNode* ths, const SRpcMsg* pRpcMsg) { pReply->term = pMsg->term; } - syncNodeStepDown(ths, pMsg->term); - resetElect = true; + if(ths->raftCfg.cfg.nodeInfo[ths->raftCfg.cfg.myIndex].nodeRole != TAOS_SYNC_ROLE_LEARNER){ + syncNodeStepDown(ths, pMsg->term); + resetElect = true; + } if (pMsg->dataLen < sizeof(SSyncRaftEntry)) { sError("vgId:%d, incomplete append entries received. prev index:%" PRId64 ", term:%" PRId64 ", datalen:%d", diff --git a/source/libs/sync/src/syncElection.c b/source/libs/sync/src/syncElection.c index 3699efbc594c812c798c0c4b6aa38a6ae001edc5..86e28db90c93bf69a5ad40d0a6149bb7a7cad111 100644 --- a/source/libs/sync/src/syncElection.c +++ b/source/libs/sync/src/syncElection.c @@ -41,6 +41,8 @@ static int32_t syncNodeRequestVotePeers(SSyncNode* pNode) { int32_t ret = 0; for (int i = 0; i < pNode->peersNum; ++i) { + if(pNode->peersNodeInfo[i].nodeRole == TAOS_SYNC_ROLE_LEARNER) continue; + SRpcMsg rpcMsg = {0}; ret = syncBuildRequestVote(&rpcMsg, pNode->vgId); if (ret < 0) { diff --git a/source/libs/sync/src/syncIndexMgr.c b/source/libs/sync/src/syncIndexMgr.c index 7ecb9d77825b496ea290d4bbca6f3c873edc716a..e3c3f63a4f89b4b0021325f54a8b595e28f56970 100644 --- a/source/libs/sync/src/syncIndexMgr.c +++ b/source/libs/sync/src/syncIndexMgr.c @@ -26,6 +26,7 @@ SSyncIndexMgr *syncIndexMgrCreate(SSyncNode *pNode) { pIndexMgr->replicas = &pNode->replicasId; pIndexMgr->replicaNum = pNode->replicaNum; + pIndexMgr->totalReplicaNum = pNode->totalReplicaNum; pIndexMgr->pNode = pNode; syncIndexMgrClear(pIndexMgr); @@ -35,6 +36,7 @@ SSyncIndexMgr *syncIndexMgrCreate(SSyncNode *pNode) { void syncIndexMgrUpdate(SSyncIndexMgr *pIndexMgr, SSyncNode *pNode) { pIndexMgr->replicas = &pNode->replicasId; pIndexMgr->replicaNum = pNode->replicaNum; + pIndexMgr->totalReplicaNum = pNode->totalReplicaNum; pIndexMgr->pNode = pNode; syncIndexMgrClear(pIndexMgr); } @@ -50,14 +52,14 @@ void syncIndexMgrClear(SSyncIndexMgr *pIndexMgr) { memset(pIndexMgr->privateTerm, 0, sizeof(pIndexMgr->privateTerm)); int64_t timeNow = taosGetTimestampMs(); - for (int i = 0; i < pIndexMgr->replicaNum; ++i) { + for (int i = 0; i < pIndexMgr->totalReplicaNum; ++i) { pIndexMgr->startTimeArr[i] = 0; pIndexMgr->recvTimeArr[i] = timeNow; } } void syncIndexMgrSetIndex(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftId, SyncIndex index) { - for (int i = 0; i < pIndexMgr->replicaNum; ++i) { + for (int i = 0; i < pIndexMgr->totalReplicaNum; ++i) { if (syncUtilSameId(&((*(pIndexMgr->replicas))[i]), pRaftId)) { (pIndexMgr->index)[i] = index; return; @@ -69,7 +71,7 @@ void syncIndexMgrSetIndex(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftId, Sync } SSyncLogReplMgr *syncNodeGetLogReplMgr(SSyncNode *pNode, SRaftId *pRaftId) { - for (int i = 0; i < pNode->replicaNum; i++) { + for (int i = 0; i < pNode->totalReplicaNum; i++) { if (syncUtilSameId(&pNode->replicasId[i], pRaftId)) { return pNode->logReplMgrs[i]; } @@ -80,7 +82,7 @@ SSyncLogReplMgr *syncNodeGetLogReplMgr(SSyncNode *pNode, SRaftId *pRaftId) { } SyncIndex syncIndexMgrGetIndex(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftId) { - for (int i = 0; i < pIndexMgr->replicaNum; ++i) { + for (int i = 0; i < pIndexMgr->totalReplicaNum; ++i) { if (syncUtilSameId(&((*(pIndexMgr->replicas))[i]), pRaftId)) { SyncIndex idx = (pIndexMgr->index)[i]; return idx; @@ -93,7 +95,7 @@ SyncIndex syncIndexMgrGetIndex(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftId) } void syncIndexMgrSetStartTime(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftId, int64_t startTime) { - for (int i = 0; i < pIndexMgr->replicaNum; ++i) { + for (int i = 0; i < pIndexMgr->totalReplicaNum; ++i) { if (syncUtilSameId(&((*(pIndexMgr->replicas))[i]), pRaftId)) { (pIndexMgr->startTimeArr)[i] = startTime; return; @@ -105,7 +107,7 @@ void syncIndexMgrSetStartTime(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftId, } int64_t syncIndexMgrGetStartTime(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftId) { - for (int i = 0; i < pIndexMgr->replicaNum; ++i) { + for (int i = 0; i < pIndexMgr->totalReplicaNum; ++i) { if (syncUtilSameId(&((*(pIndexMgr->replicas))[i]), pRaftId)) { int64_t startTime = (pIndexMgr->startTimeArr)[i]; return startTime; @@ -118,7 +120,7 @@ int64_t syncIndexMgrGetStartTime(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftI } void syncIndexMgrSetRecvTime(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftId, int64_t recvTime) { - for (int i = 0; i < pIndexMgr->replicaNum; ++i) { + for (int i = 0; i < pIndexMgr->totalReplicaNum; ++i) { if (syncUtilSameId(&((*(pIndexMgr->replicas))[i]), pRaftId)) { (pIndexMgr->recvTimeArr)[i] = recvTime; return; @@ -130,7 +132,7 @@ void syncIndexMgrSetRecvTime(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftId, i } int64_t syncIndexMgrGetRecvTime(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftId) { - for (int i = 0; i < pIndexMgr->replicaNum; ++i) { + for (int i = 0; i < pIndexMgr->totalReplicaNum; ++i) { if (syncUtilSameId(&((*(pIndexMgr->replicas))[i]), pRaftId)) { int64_t recvTime = (pIndexMgr->recvTimeArr)[i]; return recvTime; @@ -143,7 +145,7 @@ int64_t syncIndexMgrGetRecvTime(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftId } void syncIndexMgrSetTerm(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftId, SyncTerm term) { - for (int i = 0; i < pIndexMgr->replicaNum; ++i) { + for (int i = 0; i < pIndexMgr->totalReplicaNum; ++i) { if (syncUtilSameId(&((*(pIndexMgr->replicas))[i]), pRaftId)) { (pIndexMgr->privateTerm)[i] = term; return; @@ -155,7 +157,7 @@ void syncIndexMgrSetTerm(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftId, SyncT } SyncTerm syncIndexMgrGetTerm(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftId) { - for (int i = 0; i < pIndexMgr->replicaNum; ++i) { + for (int i = 0; i < pIndexMgr->totalReplicaNum; ++i) { if (syncUtilSameId(&((*(pIndexMgr->replicas))[i]), pRaftId)) { SyncTerm term = (pIndexMgr->privateTerm)[i]; return term; diff --git a/source/libs/sync/src/syncMain.c b/source/libs/sync/src/syncMain.c index 966b3ed09380fc16410b83a35008d74227bed1af..c4e7833245ca3b656274fe832a053dc858258e6b 100644 --- a/source/libs/sync/src/syncMain.c +++ b/source/libs/sync/src/syncMain.c @@ -141,6 +141,13 @@ int32_t syncReconfig(int64_t rid, SSyncCfg* pNewCfg) { SSyncNode* pSyncNode = syncNodeAcquire(rid); if (pSyncNode == NULL) return -1; + if(pSyncNode->raftCfg.lastConfigIndex >= pNewCfg->lastIndex){ + syncNodeRelease(pSyncNode); + sInfo("vgId:%d, no need Reconfig, current index:%" PRId64 ", new index:%" PRId64, pSyncNode->vgId, + pSyncNode->raftCfg.lastConfigIndex, pNewCfg->lastIndex); + return 0; + } + if (!syncNodeCheckNewConfig(pSyncNode, pNewCfg)) { syncNodeRelease(pSyncNode); terrno = TSDB_CODE_SYN_NEW_CONFIG_ERROR; @@ -149,12 +156,12 @@ int32_t syncReconfig(int64_t rid, SSyncCfg* pNewCfg) { } syncNodeUpdateNewConfigIndex(pSyncNode, pNewCfg); - syncNodeDoConfigChange(pSyncNode, pNewCfg, SYNC_INDEX_INVALID); + syncNodeDoConfigChange(pSyncNode, pNewCfg, pNewCfg->lastIndex); if (pSyncNode->state == TAOS_SYNC_STATE_LEADER) { syncNodeStopHeartbeatTimer(pSyncNode); - for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) { + for (int32_t i = 0; i < TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA; ++i) { syncHbTimerInit(pSyncNode, &pSyncNode->peerHeartbeatTimerArr[i], pSyncNode->replicasId[i]); } @@ -315,8 +322,9 @@ int32_t syncBeginSnapshot(int64_t rid, int64_t lastApplyIndex) { } } - if (pSyncNode->replicaNum > 1) { - if (pSyncNode->state != TAOS_SYNC_STATE_LEADER && pSyncNode->state != TAOS_SYNC_STATE_FOLLOWER) { + if (pSyncNode->totalReplicaNum > 1) { + if (pSyncNode->state != TAOS_SYNC_STATE_LEADER && pSyncNode->state != TAOS_SYNC_STATE_FOLLOWER + && pSyncNode->state != TAOS_SYNC_STATE_LEARNER) { sNTrace(pSyncNode, "new-snapshot-index:%" PRId64 " candidate or unknown state, do not delete wal", lastApplyIndex); syncNodeRelease(pSyncNode); @@ -537,7 +545,8 @@ void syncGetRetryEpSet(int64_t rid, SEpSet* pEpSet) { SSyncNode* pSyncNode = syncNodeAcquire(rid); if (pSyncNode == NULL) return; - for (int32_t i = 0; i < pSyncNode->raftCfg.cfg.replicaNum; ++i) { + for (int32_t i = 0; i < pSyncNode->raftCfg.cfg.totalReplicaNum; ++i) { + if(pSyncNode->raftCfg.cfg.nodeInfo[i].nodeRole == TAOS_SYNC_ROLE_LEARNER) continue; SEp* pEp = &pEpSet->eps[i]; tstrncpy(pEp->fqdn, pSyncNode->raftCfg.cfg.nodeInfo[i].nodeFqdn, TSDB_FQDN_LEN); pEp->port = (pSyncNode->raftCfg.cfg.nodeInfo)[i].nodePort; @@ -564,6 +573,34 @@ int32_t syncPropose(int64_t rid, SRpcMsg* pMsg, bool isWeak, int64_t* seq) { return ret; } +int32_t syncIsCatchUp(int64_t rid) { + SSyncNode* pSyncNode = syncNodeAcquire(rid); + if (pSyncNode == NULL) { + sError("sync Node Acquire error since %d", errno); + return -1; + } + + while(1){ + if(pSyncNode->pLogBuf->totalIndex < 0 || pSyncNode->pLogBuf->commitIndex < 0 || + pSyncNode->pLogBuf->totalIndex < pSyncNode->pLogBuf->commitIndex || + pSyncNode->pLogBuf->totalIndex - pSyncNode->pLogBuf->commitIndex > SYNC_LEARNER_CATCHUP){ + sInfo("vgId:%d, Not catch up, wait one second, totalIndex:%" PRId64 " commitIndex:%" PRId64 " matchIndex:%" PRId64, + pSyncNode->vgId, pSyncNode->pLogBuf->totalIndex, pSyncNode->pLogBuf->commitIndex, + pSyncNode->pLogBuf->matchIndex); + taosSsleep(1); + } + else{ + sInfo("vgId:%d, Catch up, totalIndex:%" PRId64 " commitIndex:%" PRId64 " matchIndex:%" PRId64, + pSyncNode->vgId, pSyncNode->pLogBuf->totalIndex, pSyncNode->pLogBuf->commitIndex, + pSyncNode->pLogBuf->matchIndex); + break; + } + } + + syncNodeRelease(pSyncNode); + return 0; +} + int32_t syncNodePropose(SSyncNode* pSyncNode, SRpcMsg* pMsg, bool isWeak, int64_t* seq) { if (pSyncNode->state != TAOS_SYNC_STATE_LEADER) { terrno = TSDB_CODE_SYN_NOT_LEADER; @@ -655,6 +692,8 @@ static int32_t syncHbTimerStart(SSyncNode* pSyncNode, SSyncTimer* pSyncTimer) { pData->logicClock = pSyncTimer->logicClock; pData->execTime = tsNow + pSyncTimer->timerMS; + sTrace("vgId:%d, start hb timer, rid:%" PRId64 " addr:%" PRId64, pSyncNode->vgId, pData->rid, pData->destId.addr); + taosTmrReset(pSyncTimer->timerCb, pSyncTimer->timerMS / HEARTBEAT_TICK_NUM, (void*)(pData->rid), syncEnv()->pTimerManager, &pSyncTimer->pTimer); } else { @@ -719,7 +758,7 @@ SSyncNode* syncNodeOpen(SSyncInfo* pSyncInfo) { sInfo("vgId:%d, create a new raft config file", pSyncNode->vgId); pSyncNode->raftCfg.isStandBy = pSyncInfo->isStandBy; pSyncNode->raftCfg.snapshotStrategy = pSyncInfo->snapshotStrategy; - pSyncNode->raftCfg.lastConfigIndex = SYNC_INDEX_INVALID; + pSyncNode->raftCfg.lastConfigIndex = pSyncInfo->syncCfg.lastIndex; pSyncNode->raftCfg.batchSize = pSyncInfo->batchSize; pSyncNode->raftCfg.cfg = pSyncInfo->syncCfg; pSyncNode->raftCfg.configIndexCount = 1; @@ -736,7 +775,7 @@ SSyncNode* syncNodeOpen(SSyncInfo* pSyncInfo) { goto _error; } - if (pSyncInfo->syncCfg.replicaNum > 0 && syncIsConfigChanged(&pSyncNode->raftCfg.cfg, &pSyncInfo->syncCfg)) { + if (pSyncInfo->syncCfg.totalReplicaNum > 0 && syncIsConfigChanged(&pSyncNode->raftCfg.cfg, &pSyncInfo->syncCfg)) { sInfo("vgId:%d, use sync config from input options and write to cfg file", pSyncNode->vgId); pSyncNode->raftCfg.cfg = pSyncInfo->syncCfg; if (syncWriteCfgFile(pSyncNode) != 0) { @@ -753,8 +792,9 @@ SSyncNode* syncNodeOpen(SSyncInfo* pSyncInfo) { pSyncNode->vgId = pSyncInfo->vgId; SSyncCfg* pCfg = &pSyncNode->raftCfg.cfg; bool updated = false; - sInfo("vgId:%d, start to open sync node, replica:%d selfIndex:%d", pSyncNode->vgId, pCfg->replicaNum, pCfg->myIndex); - for (int32_t i = 0; i < pCfg->replicaNum; ++i) { + sInfo("vgId:%d, start to open sync node, totalReplicaNum:%d replicaNum:%d selfIndex:%d", + pSyncNode->vgId, pCfg->totalReplicaNum, pCfg->replicaNum, pCfg->myIndex); + for (int32_t i = 0; i < pCfg->totalReplicaNum; ++i) { SNodeInfo* pNode = &pCfg->nodeInfo[i]; if (tmsgUpdateDnodeInfo(&pNode->nodeId, &pNode->clusterId, pNode->nodeFqdn, &pNode->nodePort)) { updated = true; @@ -792,9 +832,9 @@ SSyncNode* syncNodeOpen(SSyncInfo* pSyncInfo) { } // init peersNum, peers, peersId - pSyncNode->peersNum = pSyncNode->raftCfg.cfg.replicaNum - 1; + pSyncNode->peersNum = pSyncNode->raftCfg.cfg.totalReplicaNum - 1; int32_t j = 0; - for (int32_t i = 0; i < pSyncNode->raftCfg.cfg.replicaNum; ++i) { + for (int32_t i = 0; i < pSyncNode->raftCfg.cfg.totalReplicaNum; ++i) { if (i != pSyncNode->raftCfg.cfg.myIndex) { pSyncNode->peersNodeInfo[j] = pSyncNode->raftCfg.cfg.nodeInfo[i]; syncUtilNodeInfo2EpSet(&pSyncNode->peersNodeInfo[j], &pSyncNode->peersEpset[j]); @@ -810,7 +850,8 @@ SSyncNode* syncNodeOpen(SSyncInfo* pSyncInfo) { // init replicaNum, replicasId pSyncNode->replicaNum = pSyncNode->raftCfg.cfg.replicaNum; - for (int32_t i = 0; i < pSyncNode->raftCfg.cfg.replicaNum; ++i) { + pSyncNode->totalReplicaNum = pSyncNode->raftCfg.cfg.totalReplicaNum; + for (int32_t i = 0; i < pSyncNode->raftCfg.cfg.totalReplicaNum; ++i) { if (!syncUtilNodeInfo2RaftId(&pSyncNode->raftCfg.cfg.nodeInfo[i], pSyncNode->vgId, &pSyncNode->replicasId[i])) { sError("vgId:%d, failed to determine raft member id, replica:%d", pSyncNode->vgId, i); goto _error; @@ -934,7 +975,7 @@ SSyncNode* syncNodeOpen(SSyncInfo* pSyncInfo) { pSyncNode->heartbeatTimerCounter = 0; // init peer heartbeat timer - for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) { + for (int32_t i = 0; i < TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA; ++i) { syncHbTimerInit(pSyncNode, &(pSyncNode->peerHeartbeatTimerArr[i]), (pSyncNode->replicasId)[i]); } @@ -949,7 +990,7 @@ SSyncNode* syncNodeOpen(SSyncInfo* pSyncInfo) { pSyncNode->restoreFinish = false; // snapshot senders - for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) { + for (int32_t i = 0; i < TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA; ++i) { SSyncSnapshotSender* pSender = snapshotSenderCreate(pSyncNode, i); if (pSender == NULL) return NULL; @@ -1059,14 +1100,19 @@ int32_t syncNodeRestore(SSyncNode* pSyncNode) { int32_t syncNodeStart(SSyncNode* pSyncNode) { // start raft - if (pSyncNode->replicaNum == 1) { - raftStoreNextTerm(pSyncNode); - syncNodeBecomeLeader(pSyncNode, "one replica start"); + if(pSyncNode->raftCfg.cfg.nodeInfo[pSyncNode->raftCfg.cfg.myIndex].nodeRole == TAOS_SYNC_ROLE_LEARNER){ + syncNodeBecomeLearner(pSyncNode, "first start"); + } + else{ + if (pSyncNode->replicaNum == 1) { + raftStoreNextTerm(pSyncNode); + syncNodeBecomeLeader(pSyncNode, "one replica start"); - // Raft 3.6.2 Committing entries from previous terms - syncNodeAppendNoop(pSyncNode); - } else { - syncNodeBecomeFollower(pSyncNode, "first start"); + // Raft 3.6.2 Committing entries from previous terms + syncNodeAppendNoop(pSyncNode); + } else { + syncNodeBecomeFollower(pSyncNode, "first start"); + } } int32_t ret = 0; @@ -1157,7 +1203,7 @@ void syncNodeClose(SSyncNode* pSyncNode) { syncLogBufferDestroy(pSyncNode->pLogBuf); pSyncNode->pLogBuf = NULL; - for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) { + for (int32_t i = 0; i < TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA; ++i) { if (pSyncNode->senders[i] != NULL) { sDebug("vgId:%d, snapshot sender destroy while close, data:%p", pSyncNode->vgId, pSyncNode->senders[i]); @@ -1350,7 +1396,7 @@ inline bool syncNodeInConfig(SSyncNode* pNode, const SSyncCfg* pCfg) { bool b1 = false; bool b2 = false; - for (int32_t i = 0; i < pCfg->replicaNum; ++i) { + for (int32_t i = 0; i < pCfg->totalReplicaNum; ++i) { if (strcmp(pCfg->nodeInfo[i].nodeFqdn, pNode->myNodeInfo.nodeFqdn) == 0 && pCfg->nodeInfo[i].nodePort == pNode->myNodeInfo.nodePort) { b1 = true; @@ -1358,7 +1404,7 @@ inline bool syncNodeInConfig(SSyncNode* pNode, const SSyncCfg* pCfg) { } } - for (int32_t i = 0; i < pCfg->replicaNum; ++i) { + for (int32_t i = 0; i < pCfg->totalReplicaNum; ++i) { SRaftId raftId = { .addr = SYNC_ADDR(&pCfg->nodeInfo[i]), .vgId = pNode->vgId, @@ -1375,13 +1421,14 @@ inline bool syncNodeInConfig(SSyncNode* pNode, const SSyncCfg* pCfg) { } static bool syncIsConfigChanged(const SSyncCfg* pOldCfg, const SSyncCfg* pNewCfg) { - if (pOldCfg->replicaNum != pNewCfg->replicaNum) return true; + if (pOldCfg->totalReplicaNum != pNewCfg->totalReplicaNum) return true; if (pOldCfg->myIndex != pNewCfg->myIndex) return true; - for (int32_t i = 0; i < pOldCfg->replicaNum; ++i) { + for (int32_t i = 0; i < pOldCfg->totalReplicaNum; ++i) { const SNodeInfo* pOldInfo = &pOldCfg->nodeInfo[i]; const SNodeInfo* pNewInfo = &pNewCfg->nodeInfo[i]; if (strcmp(pOldInfo->nodeFqdn, pNewInfo->nodeFqdn) != 0) return true; if (pOldInfo->nodePort != pNewInfo->nodePort) return true; + if(pOldInfo->nodeRole != pNewInfo->nodeRole) return true; } return false; @@ -1418,8 +1465,10 @@ void syncNodeDoConfigChange(SSyncNode* pSyncNode, SSyncCfg* pNewConfig, SyncInde } // log begin config change - sNInfo(pSyncNode, "begin do config change, from %d to %d, replicas:%d", pSyncNode->vgId, oldConfig.replicaNum, - pNewConfig->replicaNum); + sNInfo(pSyncNode, "begin do config change, from %d to %d, from %" PRId64 " to %" PRId64 ", replicas:%d", + pSyncNode->vgId, + oldConfig.totalReplicaNum, pNewConfig->totalReplicaNum, + oldConfig.lastIndex, pNewConfig->lastIndex); if (IamInNew) { pSyncNode->raftCfg.isStandBy = 0; // change isStandBy to normal @@ -1436,11 +1485,10 @@ void syncNodeDoConfigChange(SSyncNode* pSyncNode, SSyncCfg* pNewConfig, SyncInde int32_t ret = 0; // save snapshot senders - int32_t oldReplicaNum = pSyncNode->replicaNum; - SRaftId oldReplicasId[TSDB_MAX_REPLICA]; + SRaftId oldReplicasId[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA]; memcpy(oldReplicasId, pSyncNode->replicasId, sizeof(oldReplicasId)); - SSyncSnapshotSender* oldSenders[TSDB_MAX_REPLICA]; - for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) { + SSyncSnapshotSender* oldSenders[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA]; + for (int32_t i = 0; i < TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA; ++i) { oldSenders[i] = pSyncNode->senders[i]; sSTrace(oldSenders[i], "snapshot sender save old"); } @@ -1450,9 +1498,9 @@ void syncNodeDoConfigChange(SSyncNode* pSyncNode, SSyncCfg* pNewConfig, SyncInde syncUtilNodeInfo2RaftId(&pSyncNode->myNodeInfo, pSyncNode->vgId, &pSyncNode->myRaftId); // init peersNum, peers, peersId - pSyncNode->peersNum = pSyncNode->raftCfg.cfg.replicaNum - 1; + pSyncNode->peersNum = pSyncNode->raftCfg.cfg.totalReplicaNum - 1; int32_t j = 0; - for (int32_t i = 0; i < pSyncNode->raftCfg.cfg.replicaNum; ++i) { + for (int32_t i = 0; i < pSyncNode->raftCfg.cfg.totalReplicaNum; ++i) { if (i != pSyncNode->raftCfg.cfg.myIndex) { pSyncNode->peersNodeInfo[j] = pSyncNode->raftCfg.cfg.nodeInfo[i]; syncUtilNodeInfo2EpSet(&pSyncNode->peersNodeInfo[j], &pSyncNode->peersEpset[j]); @@ -1465,7 +1513,8 @@ void syncNodeDoConfigChange(SSyncNode* pSyncNode, SSyncCfg* pNewConfig, SyncInde // init replicaNum, replicasId pSyncNode->replicaNum = pSyncNode->raftCfg.cfg.replicaNum; - for (int32_t i = 0; i < pSyncNode->raftCfg.cfg.replicaNum; ++i) { + pSyncNode->totalReplicaNum = pSyncNode->raftCfg.cfg.totalReplicaNum; + for (int32_t i = 0; i < pSyncNode->raftCfg.cfg.totalReplicaNum; ++i) { syncUtilNodeInfo2RaftId(&pSyncNode->raftCfg.cfg.nodeInfo[i], pSyncNode->vgId, &pSyncNode->replicasId[i]); } @@ -1480,15 +1529,15 @@ void syncNodeDoConfigChange(SSyncNode* pSyncNode, SSyncCfg* pNewConfig, SyncInde // reset snapshot senders // clear new - for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) { + for (int32_t i = 0; i < TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA; ++i) { pSyncNode->senders[i] = NULL; } // reset new - for (int32_t i = 0; i < pSyncNode->replicaNum; ++i) { + for (int32_t i = 0; i < pSyncNode->totalReplicaNum; ++i) { // reset sender bool reset = false; - for (int32_t j = 0; j < TSDB_MAX_REPLICA; ++j) { + for (int32_t j = 0; j < TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA; ++j) { if (syncUtilSameId(&(pSyncNode->replicasId)[i], &oldReplicasId[j]) && oldSenders[j] != NULL) { sNTrace(pSyncNode, "snapshot sender reset for:%" PRId64 ", newIndex:%d, dnode:%d, %p", (pSyncNode->replicasId)[i].addr, i, DID(&pSyncNode->replicasId[i]), oldSenders[j]); @@ -1510,7 +1559,7 @@ void syncNodeDoConfigChange(SSyncNode* pSyncNode, SSyncCfg* pNewConfig, SyncInde } // create new - for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) { + for (int32_t i = 0; i < TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA; ++i) { if (pSyncNode->senders[i] == NULL) { pSyncNode->senders[i] = snapshotSenderCreate(pSyncNode, i); if (pSyncNode->senders[i] == NULL) { @@ -1525,7 +1574,7 @@ void syncNodeDoConfigChange(SSyncNode* pSyncNode, SSyncCfg* pNewConfig, SyncInde } // free old - for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) { + for (int32_t i = 0; i < TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA; ++i) { if (oldSenders[i] != NULL) { sSDebug(oldSenders[i], "snapshot sender destroy old, data:%p replica-index:%d", oldSenders[i], i); snapshotSenderDestroy(oldSenders[i]); @@ -1550,12 +1599,12 @@ void syncNodeDoConfigChange(SSyncNode* pSyncNode, SSyncCfg* pNewConfig, SyncInde } else { // persist cfg syncWriteCfgFile(pSyncNode); - sNInfo(pSyncNode, "do not config change from %d to %d", oldConfig.replicaNum, pNewConfig->replicaNum); + sNInfo(pSyncNode, "do not config change from %d to %d", oldConfig.totalReplicaNum, pNewConfig->totalReplicaNum); } _END: // log end config change - sNInfo(pSyncNode, "end do config change, from %d to %d", oldConfig.replicaNum, pNewConfig->replicaNum); + sNInfo(pSyncNode, "end do config change, from %d to %d", oldConfig.totalReplicaNum, pNewConfig->totalReplicaNum); } // raft state change -------------- @@ -1635,6 +1684,27 @@ void syncNodeBecomeFollower(SSyncNode* pSyncNode, const char* debugStr) { syncNodeResetElectTimer(pSyncNode); } +void syncNodeBecomeLearner(SSyncNode* pSyncNode, const char* debugStr) { + pSyncNode->hbSlowNum = 0; + + // state change + pSyncNode->state = TAOS_SYNC_STATE_LEARNER; + + // trace log + sNTrace(pSyncNode, "become learner %s", debugStr); + + // call back + if (pSyncNode->pFsm != NULL && pSyncNode->pFsm->FpBecomeLearnerCb != NULL) { + pSyncNode->pFsm->FpBecomeLearnerCb(pSyncNode->pFsm); + } + + // min match index + pSyncNode->minMatchIndex = SYNC_INDEX_INVALID; + + // reset log buffer + syncLogBufferReset(pSyncNode->pLogBuf, pSyncNode); +} + // TLA+ Spec // \* Candidate i transitions to leader. // BecomeLeader(i) == @@ -1752,7 +1822,7 @@ void syncNodeCandidate2Leader(SSyncNode* pSyncNode) { bool syncNodeIsMnode(SSyncNode* pSyncNode) { return (pSyncNode->vgId == 1); } int32_t syncNodePeerStateInit(SSyncNode* pSyncNode) { - for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) { + for (int32_t i = 0; i < TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA; ++i) { pSyncNode->peerStates[i].lastSendIndex = SYNC_INDEX_INVALID; pSyncNode->peerStates[i].lastSendTime = 0; } @@ -2039,7 +2109,7 @@ static void syncNodeEqHeartbeatTimer(void* param, void* tmrId) { if (!syncIsInit()) return; SSyncNode* pNode = param; - if (pNode->replicaNum > 1) { + if (pNode->totalReplicaNum > 1) { if (atomic_load_64(&pNode->heartbeatTimerLogicClockUser) <= atomic_load_64(&pNode->heartbeatTimerLogicClock)) { SRpcMsg rpcMsg = {0}; int32_t code = syncBuildTimeout(&rpcMsg, SYNC_TIMEOUT_HEARTBEAT, atomic_load_64(&pNode->heartbeatTimerLogicClock), @@ -2074,7 +2144,7 @@ static void syncNodeEqPeerHeartbeatTimer(void* param, void* tmrId) { SSyncHbTimerData* pData = syncHbTimerDataAcquire(hbDataRid); if (pData == NULL) { - sError("hb timer get pData NULL, %" PRId64, hbDataRid); + sError("hb timer get pData NULL, rid:%" PRId64 " addr:%" PRId64, hbDataRid, pData->destId.addr); return; } @@ -2101,9 +2171,9 @@ static void syncNodeEqPeerHeartbeatTimer(void* param, void* tmrId) { return; } - // sTrace("vgId:%d, eq peer hb timer", pSyncNode->vgId); + sTrace("vgId:%d, eq peer hb timer, rid:%" PRId64 " addr:%" PRId64, pSyncNode->vgId, hbDataRid, pData->destId.addr); - if (pSyncNode->replicaNum > 1) { + if (pSyncNode->totalReplicaNum > 1) { int64_t timerLogicClock = atomic_load_64(&pSyncTimer->logicClock); int64_t msgLogicClock = atomic_load_64(&pData->logicClock); @@ -2130,6 +2200,7 @@ static void syncNodeEqPeerHeartbeatTimer(void* param, void* tmrId) { pSyncTimer->timeStamp = tsNow; // send msg + sTrace("vgId:%d, send heartbeat to dnode:%d", pSyncNode->vgId, DID(&(pSyncMsg->destId))); syncLogSendHeartbeat(pSyncNode, pSyncMsg, false, timerElapsed, pData->execTime); syncNodeSendHeartbeat(pSyncNode, &pSyncMsg->destId, &rpcMsg); } else { @@ -2212,7 +2283,7 @@ int32_t syncNodeAppend(SSyncNode* ths, SSyncRaftEntry* pEntry) { } bool syncNodeHeartbeatReplyTimeout(SSyncNode* pSyncNode) { - if (pSyncNode->replicaNum == 1) { + if (pSyncNode->totalReplicaNum == 1) { return false; } @@ -2237,7 +2308,7 @@ bool syncNodeHeartbeatReplyTimeout(SSyncNode* pSyncNode) { bool syncNodeSnapshotSending(SSyncNode* pSyncNode) { if (pSyncNode == NULL) return false; bool b = false; - for (int32_t i = 0; i < pSyncNode->replicaNum; ++i) { + for (int32_t i = 0; i < pSyncNode->totalReplicaNum; ++i) { if (pSyncNode->senders[i] != NULL && pSyncNode->senders[i]->start) { b = true; break; @@ -2328,13 +2399,17 @@ int32_t syncNodeOnHeartbeat(SSyncNode* ths, const SRpcMsg* pRpcMsg) { pMsgReply->startTime = ths->startTime; pMsgReply->timeStamp = tsMs; + sTrace( + "vgId:%d, heartbeat msg from dnode:%d, cluster:%d, Msgterm:%" PRId64 " currentTerm:%" PRId64, + ths->vgId, DID(&(pMsg->srcId)), CID(&(pMsg->srcId)), pMsg->term, currentTerm); + if (pMsg->term == currentTerm && ths->state != TAOS_SYNC_STATE_LEADER) { syncIndexMgrSetRecvTime(ths->pNextIndex, &(pMsg->srcId), tsMs); resetElect = true; ths->minMatchIndex = pMsg->minMatchIndex; - if (ths->state == TAOS_SYNC_STATE_FOLLOWER) { + if (ths->state == TAOS_SYNC_STATE_FOLLOWER || ths->state == TAOS_SYNC_STATE_LEARNER) { SRpcMsg rpcMsgLocalCmd = {0}; (void)syncBuildLocalCmd(&rpcMsgLocalCmd, ths->vgId); @@ -2356,7 +2431,7 @@ int32_t syncNodeOnHeartbeat(SSyncNode* ths, const SRpcMsg* pRpcMsg) { } } - if (pMsg->term >= currentTerm && ths->state != TAOS_SYNC_STATE_FOLLOWER) { + if (pMsg->term >= currentTerm && ths->state == TAOS_SYNC_STATE_LEADER) { SRpcMsg rpcMsgLocalCmd = {0}; (void)syncBuildLocalCmd(&rpcMsgLocalCmd, ths->vgId); @@ -2376,6 +2451,26 @@ int32_t syncNodeOnHeartbeat(SSyncNode* ths, const SRpcMsg* pRpcMsg) { } } + if (pMsg->term >= currentTerm && ths->state == TAOS_SYNC_STATE_LEARNER) { + SRpcMsg rpcMsgLocalCmd = {0}; + (void)syncBuildLocalCmd(&rpcMsgLocalCmd, ths->vgId); + + SyncLocalCmd* pSyncMsg = rpcMsgLocalCmd.pCont; + pSyncMsg->cmd = SYNC_LOCAL_CMD_LEARNER_CMT; + pSyncMsg->currentTerm = pMsg->term; + pSyncMsg->commitIndex = pMsg->commitIndex; + + if (ths->syncEqMsg != NULL && ths->msgcb != NULL) { + int32_t code = ths->syncEqMsg(ths->msgcb, &rpcMsgLocalCmd); + if (code != 0) { + sError("vgId:%d, sync enqueue step-down msg error, code:%d", ths->vgId, code); + rpcFreeCont(rpcMsgLocalCmd.pCont); + } else { + sTrace("vgId:%d, sync enqueue step-down msg, new-term:%" PRId64, ths->vgId, pSyncMsg->currentTerm); + } + } + } + // reply syncNodeSendMsgById(&pMsgReply->destId, ths, &rpcMsg); @@ -2439,7 +2534,20 @@ int32_t syncNodeOnLocalCmd(SSyncNode* ths, const SRpcMsg* pRpcMsg) { sError("vgId:%d, failed to commit raft log since %s. commit index:%" PRId64 "", ths->vgId, terrstr(), ths->commitIndex); } - } else { + } else if (pMsg->cmd == SYNC_LOCAL_CMD_LEARNER_CMT){ + if (syncLogBufferIsEmpty(ths->pLogBuf)) { + sError("vgId:%d, sync log buffer is empty.", ths->vgId); + return 0; + } + raftStoreSetTerm(ths, pMsg->currentTerm); + (void)syncNodeUpdateCommitIndex(ths, pMsg->commitIndex); + sTrace("vgId:%d, start to commit raft log in heartbeat. commit index:%" PRId64 "", ths->vgId, ths->commitIndex); + if (syncLogBufferCommit(ths->pLogBuf, ths, ths->commitIndex) < 0) { + sError("vgId:%d, failed to commit raft log since %s. commit index:%" PRId64 "", ths->vgId, terrstr(), + ths->commitIndex); + } + } + else { sError("error local cmd"); } @@ -2502,13 +2610,15 @@ const char* syncStr(ESyncState state) { return "error"; case TAOS_SYNC_STATE_OFFLINE: return "offline"; + case TAOS_SYNC_STATE_LEARNER: + return "learner"; default: return "unknown"; } } int32_t syncNodeUpdateNewConfigIndex(SSyncNode* ths, SSyncCfg* pNewCfg) { - for (int32_t i = 0; i < pNewCfg->replicaNum; ++i) { + for (int32_t i = 0; i < pNewCfg->totalReplicaNum; ++i) { SRaftId raftId = { .addr = SYNC_ADDR(&pNewCfg->nodeInfo[i]), .vgId = ths->vgId, @@ -2528,7 +2638,7 @@ bool syncNodeIsOptimizedOneReplica(SSyncNode* ths, SRpcMsg* pMsg) { } bool syncNodeInRaftGroup(SSyncNode* ths, SRaftId* pRaftId) { - for (int32_t i = 0; i < ths->replicaNum; ++i) { + for (int32_t i = 0; i < ths->totalReplicaNum; ++i) { if (syncUtilSameId(&((ths->replicasId)[i]), pRaftId)) { return true; } @@ -2538,7 +2648,7 @@ bool syncNodeInRaftGroup(SSyncNode* ths, SRaftId* pRaftId) { SSyncSnapshotSender* syncNodeGetSnapshotSender(SSyncNode* ths, SRaftId* pDestId) { SSyncSnapshotSender* pSender = NULL; - for (int32_t i = 0; i < ths->replicaNum; ++i) { + for (int32_t i = 0; i < ths->totalReplicaNum; ++i) { if (syncUtilSameId(pDestId, &((ths->replicasId)[i]))) { pSender = (ths->senders)[i]; } @@ -2548,7 +2658,7 @@ SSyncSnapshotSender* syncNodeGetSnapshotSender(SSyncNode* ths, SRaftId* pDestId) SSyncTimer* syncNodeGetHbTimer(SSyncNode* ths, SRaftId* pDestId) { SSyncTimer* pTimer = NULL; - for (int32_t i = 0; i < ths->replicaNum; ++i) { + for (int32_t i = 0; i < ths->totalReplicaNum; ++i) { if (syncUtilSameId(pDestId, &((ths->replicasId)[i]))) { pTimer = &((ths->peerHeartbeatTimerArr)[i]); } @@ -2558,7 +2668,7 @@ SSyncTimer* syncNodeGetHbTimer(SSyncNode* ths, SRaftId* pDestId) { SPeerState* syncNodeGetPeerState(SSyncNode* ths, const SRaftId* pDestId) { SPeerState* pState = NULL; - for (int32_t i = 0; i < ths->replicaNum; ++i) { + for (int32_t i = 0; i < ths->totalReplicaNum; ++i) { if (syncUtilSameId(pDestId, &((ths->replicasId)[i]))) { pState = &((ths->peerStates)[i]); } diff --git a/source/libs/sync/src/syncPipeline.c b/source/libs/sync/src/syncPipeline.c index 6bebef77dc75ac0ec0adc0273d23bec4789cf527..81dbead506093998020e1b56726bf9a3bfec6954 100644 --- a/source/libs/sync/src/syncPipeline.c +++ b/source/libs/sync/src/syncPipeline.c @@ -242,6 +242,8 @@ int32_t syncLogBufferInitWithoutLock(SSyncLogBuffer* pBuf, SSyncNode* pNode) { // update startIndex pBuf->startIndex = takeDummy ? index : index + 1; + pBuf->isCatchup = false; + sInfo("vgId:%d, init sync log buffer. buffer: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", pNode->vgId, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); @@ -324,6 +326,15 @@ int32_t syncLogBufferAccept(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEnt goto _out; } + if(pNode->raftCfg.cfg.nodeInfo[pNode->raftCfg.cfg.myIndex].nodeRole == TAOS_SYNC_ROLE_LEARNER && + index > 0 && index > pBuf->totalIndex){ + pBuf->totalIndex = index; + sTrace("vgId:%d, update learner progress. index:%" PRId64 ", term:%" PRId64 ": prevterm:%" PRId64 + " != lastmatch:%" PRId64 ". log buffer: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", + pNode->vgId, pEntry->index, pEntry->term, prevTerm, lastMatchTerm, pBuf->startIndex, pBuf->commitIndex, + pBuf->matchIndex, pBuf->endIndex); + } + if (index - pBuf->startIndex >= pBuf->size) { sWarn("vgId:%d, out of buffer range. index:%" PRId64 ", term:%" PRId64 ". log buffer: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", @@ -483,7 +494,7 @@ _out: } int32_t syncFsmExecute(SSyncNode* pNode, SSyncFSM* pFsm, ESyncState role, SyncTerm term, SSyncRaftEntry* pEntry, - int32_t applyCode) { + int32_t applyCode) { if (pNode->replicaNum == 1 && pNode->restoreFinish && pNode->vgId != 1) { return 0; } @@ -957,7 +968,7 @@ void syncLogReplDestroy(SSyncLogReplMgr* pMgr) { } int32_t syncNodeLogReplInit(SSyncNode* pNode) { - for (int i = 0; i < TSDB_MAX_REPLICA; i++) { + for (int i = 0; i < TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA; i++) { ASSERT(pNode->logReplMgrs[i] == NULL); pNode->logReplMgrs[i] = syncLogReplCreate(); if (pNode->logReplMgrs[i] == NULL) { @@ -970,7 +981,7 @@ int32_t syncNodeLogReplInit(SSyncNode* pNode) { } void syncNodeLogReplDestroy(SSyncNode* pNode) { - for (int i = 0; i < TSDB_MAX_REPLICA; i++) { + for (int i = 0; i < TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA; i++) { syncLogReplDestroy(pNode->logReplMgrs[i]); pNode->logReplMgrs[i] = NULL; } @@ -1100,7 +1111,7 @@ int32_t syncLogBufferReset(SSyncLogBuffer* pBuf, SSyncNode* pNode) { pBuf->endIndex = pBuf->matchIndex + 1; // reset repl mgr - for (int i = 0; i < pNode->replicaNum; i++) { + for (int i = 0; i < pNode->totalReplicaNum; i++) { SSyncLogReplMgr* pMgr = pNode->logReplMgrs[i]; syncLogReplReset(pMgr); } diff --git a/source/libs/sync/src/syncRaftCfg.c b/source/libs/sync/src/syncRaftCfg.c index f780e255ce77eb485ffcd1de8d078718dc1973a5..88a57534f45413d725ef3ec79af342a5d5a35039 100644 --- a/source/libs/sync/src/syncRaftCfg.c +++ b/source/libs/sync/src/syncRaftCfg.c @@ -18,21 +18,45 @@ #include "syncUtil.h" #include "tjson.h" +const char* syncRoleToStr(ESyncRole role) { + switch (role) { + case TAOS_SYNC_ROLE_VOTER: + return "voter"; + case TAOS_SYNC_ROLE_LEARNER: + return "learner"; + default: + return "unknown"; + } +} + +const ESyncRole syncStrToRole(char* str) { + if(strcmp(str, "voter") == 0){ + return TAOS_SYNC_ROLE_VOTER; + } + if(strcmp(str, "learner") == 0){ + return TAOS_SYNC_ROLE_LEARNER; + } + + return TAOS_SYNC_ROLE_ERROR; +} + static int32_t syncEncodeSyncCfg(const void *pObj, SJson *pJson) { SSyncCfg *pCfg = (SSyncCfg *)pObj; + if (tjsonAddDoubleToObject(pJson, "totalReplicaNum", pCfg->totalReplicaNum) < 0) return -1; if (tjsonAddDoubleToObject(pJson, "replicaNum", pCfg->replicaNum) < 0) return -1; if (tjsonAddDoubleToObject(pJson, "myIndex", pCfg->myIndex) < 0) return -1; SJson *nodeInfo = tjsonCreateArray(); if (nodeInfo == NULL) return -1; if (tjsonAddItemToObject(pJson, "nodeInfo", nodeInfo) < 0) return -1; - for (int32_t i = 0; i < pCfg->replicaNum; ++i) { + for (int32_t i = 0; i < pCfg->totalReplicaNum; ++i) { SJson *info = tjsonCreateObject(); if (info == NULL) return -1; if (tjsonAddDoubleToObject(info, "nodePort", pCfg->nodeInfo[i].nodePort) < 0) return -1; if (tjsonAddStringToObject(info, "nodeFqdn", pCfg->nodeInfo[i].nodeFqdn) < 0) return -1; if (tjsonAddIntegerToObject(info, "nodeId", pCfg->nodeInfo[i].nodeId) < 0) return -1; if (tjsonAddIntegerToObject(info, "clusterId", pCfg->nodeInfo[i].clusterId) < 0) return -1; + if (tjsonAddStringToObject(info, "nodeRole", syncRoleToStr(pCfg->nodeInfo[i].nodeRole)) < 0) return -1; if (tjsonAddItemToArray(nodeInfo, info) < 0) return -1; } @@ -90,7 +114,8 @@ int32_t syncWriteCfgFile(SSyncNode *pNode) { if (taosRenameFile(file, realfile) != 0) goto _OVER; code = 0; - sInfo("vgId:%d, succeed to write sync cfg file:%s, len:%d", pNode->vgId, realfile, len); + sInfo("vgId:%d, succeed to write sync cfg file:%s, len:%d, lastConfigIndex:%" PRId64, pNode->vgId, + realfile, len, pNode->raftCfg.lastConfigIndex); _OVER: if (pJson != NULL) tjsonDelete(pJson); @@ -108,6 +133,7 @@ static int32_t syncDecodeSyncCfg(const SJson *pJson, void *pObj) { SSyncCfg *pCfg = (SSyncCfg *)pObj; int32_t code = 0; + tjsonGetInt32ValueFromDouble(pJson, "totalReplicaNum", pCfg->totalReplicaNum, code); tjsonGetInt32ValueFromDouble(pJson, "replicaNum", pCfg->replicaNum, code); if (code < 0) return -1; tjsonGetInt32ValueFromDouble(pJson, "myIndex", pCfg->myIndex, code); @@ -115,9 +141,9 @@ static int32_t syncDecodeSyncCfg(const SJson *pJson, void *pObj) { SJson *nodeInfo = tjsonGetObjectItem(pJson, "nodeInfo"); if (nodeInfo == NULL) return -1; - pCfg->replicaNum = tjsonGetArraySize(nodeInfo); + pCfg->totalReplicaNum = tjsonGetArraySize(nodeInfo); - for (int32_t i = 0; i < pCfg->replicaNum; ++i) { + for (int32_t i = 0; i < pCfg->totalReplicaNum; ++i) { SJson *info = tjsonGetArrayItem(nodeInfo, i); if (info == NULL) return -1; tjsonGetUInt16ValueFromDouble(info, "nodePort", pCfg->nodeInfo[i].nodePort, code); @@ -126,6 +152,10 @@ static int32_t syncDecodeSyncCfg(const SJson *pJson, void *pObj) { if (code < 0) return -1; tjsonGetNumberValue(info, "nodeId", pCfg->nodeInfo[i].nodeId, code); tjsonGetNumberValue(info, "clusterId", pCfg->nodeInfo[i].clusterId, code); + char role[10]; + code = tjsonGetStringValue(info, "nodeRole", role); + if(code < 0) return -1; + pCfg->nodeInfo[i].nodeRole = syncStrToRole(role); } return 0; diff --git a/source/libs/sync/src/syncReplication.c b/source/libs/sync/src/syncReplication.c index 8ac9a860e3e4b1f368fda48fc27b41d6c89f6ae9..77e73402fae7f75b8b2240a58d2ed71f48b19e0e 100644 --- a/source/libs/sync/src/syncReplication.c +++ b/source/libs/sync/src/syncReplication.c @@ -66,10 +66,10 @@ int32_t syncNodeReplicate(SSyncNode* pNode) { } int32_t syncNodeReplicateWithoutLock(SSyncNode* pNode) { - if (pNode->state != TAOS_SYNC_STATE_LEADER || pNode->replicaNum == 1) { + if (pNode->state != TAOS_SYNC_STATE_LEADER || pNode->raftCfg.cfg.totalReplicaNum == 1) { return -1; } - for (int32_t i = 0; i < pNode->replicaNum; i++) { + for (int32_t i = 0; i < pNode->totalReplicaNum; i++) { if (syncUtilSameId(&pNode->replicasId[i], &pNode->myRaftId)) { continue; } diff --git a/source/libs/sync/src/syncSnapshot.c b/source/libs/sync/src/syncSnapshot.c index a9d0ddad17d99f929c51ef60e313d2df463377e3..763d4ec5d6b0fa8015d841231fc094d96661fadc 100644 --- a/source/libs/sync/src/syncSnapshot.c +++ b/source/libs/sync/src/syncSnapshot.c @@ -795,13 +795,18 @@ int32_t syncNodeOnSnapshot(SSyncNode *pSyncNode, const SRpcMsg *pRpcMsg) { return -1; } - if (pMsg->term > raftStoreGetTerm(pSyncNode)) { - syncNodeStepDown(pSyncNode, pMsg->term); + if(pSyncNode->raftCfg.cfg.nodeInfo[pSyncNode->raftCfg.cfg.myIndex].nodeRole != TAOS_SYNC_ROLE_LEARNER){ + if (pMsg->term > raftStoreGetTerm(pSyncNode)) { + syncNodeStepDown(pSyncNode, pMsg->term); + } + } + else{ + syncNodeUpdateTermWithoutStepDown(pSyncNode, pMsg->term); } // state, term, seq/ack int32_t code = 0; - if (pSyncNode->state == TAOS_SYNC_STATE_FOLLOWER) { + if (pSyncNode->state == TAOS_SYNC_STATE_FOLLOWER || pSyncNode->state == TAOS_SYNC_STATE_LEARNER) { if (pMsg->term == raftStoreGetTerm(pSyncNode)) { if (pMsg->seq == SYNC_SNAPSHOT_SEQ_PREP_SNAPSHOT) { syncLogRecvSyncSnapshotSend(pSyncNode, pMsg, "process seq pre-snapshot"); diff --git a/source/libs/sync/src/syncTimeout.c b/source/libs/sync/src/syncTimeout.c index a27be2853e512d73ada59bc224ce028ae208e0cb..5ee67da9ab8d2cef07b7d85ef275199e8cf2ef0e 100644 --- a/source/libs/sync/src/syncTimeout.c +++ b/source/libs/sync/src/syncTimeout.c @@ -58,7 +58,7 @@ static void syncNodeCleanConfigIndex(SSyncNode* ths) { static int32_t syncNodeTimerRoutine(SSyncNode* ths) { ths->tmrRoutineNum++; - if (ths->tmrRoutineNum % 60 == 0 && ths->replicaNum > 1) { + if (ths->tmrRoutineNum % 60 == 0 && ths->totalReplicaNum > 1) { sNInfo(ths, "timer routines"); } else { sNTrace(ths, "timer routines"); diff --git a/source/libs/sync/src/syncVoteMgr.c b/source/libs/sync/src/syncVoteMgr.c index 6bd9625276c898184341cec291abed6c304700fb..83b0dde8e0c77b5c199a72d4a28bda64108b0810 100644 --- a/source/libs/sync/src/syncVoteMgr.c +++ b/source/libs/sync/src/syncVoteMgr.c @@ -30,7 +30,7 @@ SVotesGranted *voteGrantedCreate(SSyncNode *pNode) { return NULL; } - pVotesGranted->replicas = &pNode->replicasId; + pVotesGranted->replicas = (void*)&pNode->replicasId; pVotesGranted->replicaNum = pNode->replicaNum; voteGrantedClearVotes(pVotesGranted); @@ -49,7 +49,7 @@ void voteGrantedDestroy(SVotesGranted *pVotesGranted) { } void voteGrantedUpdate(SVotesGranted *pVotesGranted, SSyncNode *pNode) { - pVotesGranted->replicas = &pNode->replicasId; + pVotesGranted->replicas = (void*)&pNode->replicasId; pVotesGranted->replicaNum = pNode->replicaNum; voteGrantedClearVotes(pVotesGranted); @@ -115,7 +115,7 @@ SVotesRespond *votesRespondCreate(SSyncNode *pNode) { return NULL; } - pVotesRespond->replicas = &pNode->replicasId; + pVotesRespond->replicas = (void*)&pNode->replicasId; pVotesRespond->replicaNum = pNode->replicaNum; pVotesRespond->term = 0; pVotesRespond->pNode = pNode; @@ -130,7 +130,7 @@ void votesRespondDestory(SVotesRespond *pVotesRespond) { } void votesRespondUpdate(SVotesRespond *pVotesRespond, SSyncNode *pNode) { - pVotesRespond->replicas = &pNode->replicasId; + pVotesRespond->replicas = (void*)&pNode->replicasId; pVotesRespond->replicaNum = pNode->replicaNum; pVotesRespond->term = 0; pVotesRespond->pNode = pNode;