未验证 提交 969fdd1b 编写于 作者: X Xiaoyu Wang 提交者: GitHub

Merge pull request #20971 from taosdata/feat/learner

add learner
......@@ -1286,6 +1286,9 @@ typedef struct {
int16_t hashSuffix;
int32_t tsdbPageSize;
int64_t reserved[8];
int8_t learnerReplica;
int8_t learnerSelfIndex;
SReplica learnerReplicas[TSDB_MAX_LEARNER_REPLICA];
} SCreateVnodeReq;
int32_t tSerializeSCreateVnodeReq(void* buf, int32_t bufLen, SCreateVnodeReq* pReq);
......@@ -1357,7 +1360,10 @@ typedef struct {
int8_t replica;
SReplica replicas[TSDB_MAX_REPLICA];
int64_t reserved[8];
} SAlterVnodeReplicaReq;
int8_t learnerSelfIndex;
int8_t learnerReplica;
SReplica learnerReplicas[TSDB_MAX_LEARNER_REPLICA];
} SAlterVnodeReplicaReq, SAlterVnodeTypeReq;
int32_t tSerializeSAlterVnodeReplicaReq(void* buf, int32_t bufLen, SAlterVnodeReplicaReq* pReq);
int32_t tDeserializeSAlterVnodeReplicaReq(void* buf, int32_t bufLen, SAlterVnodeReplicaReq* pReq);
......@@ -1629,7 +1635,10 @@ int32_t tDeserializeSCreateDropMQSNodeReq(void* buf, int32_t bufLen, SMCreateQno
typedef struct {
int8_t replica;
SReplica replicas[TSDB_MAX_REPLICA];
} SDCreateMnodeReq, SDAlterMnodeReq;
int8_t learnerReplica;
SReplica learnerReplicas[TSDB_MAX_LEARNER_REPLICA];
int64_t lastIndex;
} SDCreateMnodeReq, SDAlterMnodeReq, SDAlterMnodeTypeReq;
int32_t tSerializeSDCreateMnodeReq(void* buf, int32_t bufLen, SDCreateMnodeReq* pReq);
int32_t tDeserializeSDCreateMnodeReq(void* buf, int32_t bufLen, SDCreateMnodeReq* pReq);
......
......@@ -83,6 +83,8 @@ enum {
TD_DEF_MSG_TYPE(TDMT_DND_CONFIG_DNODE, "config-dnode", NULL, NULL)
TD_DEF_MSG_TYPE(TDMT_DND_SYSTABLE_RETRIEVE, "dnode-retrieve", NULL, NULL)
TD_DEF_MSG_TYPE(TDMT_DND_MAX_MSG, "dnd-max", NULL, NULL)
TD_DEF_MSG_TYPE(TDMT_DND_ALTER_MNODE_TYPE, "dnode-alter-mnode-type", NULL, NULL)
TD_DEF_MSG_TYPE(TDMT_DND_ALTER_VNODE_TYPE, "dnode-alter-vnode-type", NULL, NULL)
TD_NEW_MSG_SEG(TDMT_MND_MSG)
TD_DEF_MSG_TYPE(TDMT_MND_CONNECT, "connect", NULL, NULL)
......
......@@ -33,8 +33,11 @@ typedef struct {
bool deploy;
int8_t selfIndex;
int8_t numOfReplicas;
SReplica replicas[TSDB_MAX_REPLICA];
int8_t numOfTotalReplicas;
SReplica replicas[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA];
int32_t nodeRoles[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA];
SMsgCb msgCb;
int64_t lastIndex;
} SMnodeOpt;
/* ------------------------ SMnode ------------------------ */
......@@ -69,6 +72,8 @@ int32_t mndStart(SMnode *pMnode);
*/
void mndStop(SMnode *pMnode);
int32_t mndIsCatchUp(SMnode *pMnode);
/**
* @brief Get mnode monitor info.
*
......
......@@ -55,6 +55,8 @@ extern "C" {
#define SYNC_INDEX_INVALID -1
#define SYNC_TERM_INVALID -1
#define SYNC_LEARNER_CATCHUP 10
typedef enum {
SYNC_STRATEGY_NO_SNAPSHOT = 0,
SYNC_STRATEGY_STANDARD_SNAPSHOT = 1,
......@@ -76,19 +78,29 @@ typedef enum {
TAOS_SYNC_STATE_CANDIDATE = 101,
TAOS_SYNC_STATE_LEADER = 102,
TAOS_SYNC_STATE_ERROR = 103,
TAOS_SYNC_STATE_LEARNER = 104,
} ESyncState;
typedef enum {
TAOS_SYNC_ROLE_VOTER = 0,
TAOS_SYNC_ROLE_LEARNER = 1,
TAOS_SYNC_ROLE_ERROR = 2,
} ESyncRole;
typedef struct SNodeInfo {
int64_t clusterId;
int32_t nodeId;
uint16_t nodePort;
char nodeFqdn[TSDB_FQDN_LEN];
int64_t clusterId;
int32_t nodeId;
uint16_t nodePort;
char nodeFqdn[TSDB_FQDN_LEN];
ESyncRole nodeRole;
} SNodeInfo;
typedef struct SSyncCfg {
int32_t totalReplicaNum;
int32_t replicaNum;
int32_t myIndex;
SNodeInfo nodeInfo[TSDB_MAX_REPLICA];
SNodeInfo nodeInfo[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA];
SyncIndex lastIndex;
} SSyncCfg;
typedef struct SFsmCbMeta {
......@@ -155,6 +167,7 @@ typedef struct SSyncFSM {
void (*FpBecomeLeaderCb)(const struct SSyncFSM* pFsm);
void (*FpBecomeFollowerCb)(const struct SSyncFSM* pFsm);
void (*FpBecomeLearnerCb)(const struct SSyncFSM* pFsm);
int32_t (*FpGetSnapshot)(const struct SSyncFSM* pFsm, SSnapshot* pSnapshot, void* pReaderParam, void** ppReader);
void (*FpGetSnapshotInfo)(const struct SSyncFSM* pFsm, SSnapshot* pSnapshot);
......@@ -236,6 +249,7 @@ void syncStop(int64_t rid);
void syncPreStop(int64_t rid);
void syncPostStop(int64_t rid);
int32_t syncPropose(int64_t rid, SRpcMsg* pMsg, bool isWeak, int64_t* seq);
int32_t syncIsCatchUp(int64_t rid);
int32_t syncProcessMsg(int64_t rid, SRpcMsg* pMsg);
int32_t syncReconfig(int64_t rid, SSyncCfg* pCfg);
int32_t syncBeginSnapshot(int64_t rid, int64_t lastApplyIndex);
......
......@@ -285,7 +285,7 @@ typedef enum ELogicConditionType {
#define TSDB_DNODE_ROLE_VNODE 2
#define TSDB_MAX_REPLICA 5
#define TSDB_MAX_LEARNER_REPLICA 10
#define TSDB_SYNC_LOG_BUFFER_SIZE 4096
#define TSDB_SYNC_LOG_BUFFER_RETENTION 256
#define TSDB_SYNC_APPLYQ_SIZE_LIMIT 512
......
......@@ -4129,6 +4129,12 @@ int32_t tSerializeSCreateVnodeReq(void *buf, int32_t bufLen, SCreateVnodeReq *pR
for (int32_t i = 0; i < 8; ++i) {
if (tEncodeI64(&encoder, pReq->reserved[i]) < 0) return -1;
}
if (tEncodeI8(&encoder, pReq->learnerReplica) < 0) return -1;
if (tEncodeI8(&encoder, pReq->learnerSelfIndex) < 0) return -1;
for (int32_t i = 0; i < TSDB_MAX_LEARNER_REPLICA; ++i) {
SReplica *pReplica = &pReq->learnerReplicas[i];
if (tEncodeSReplica(&encoder, pReplica) < 0) return -1;
}
tEndEncode(&encoder);
......@@ -4207,6 +4213,14 @@ int32_t tDeserializeSCreateVnodeReq(void *buf, int32_t bufLen, SCreateVnodeReq *
for (int32_t i = 0; i < 8; ++i) {
if (tDecodeI64(&decoder, &pReq->reserved[i]) < 0) return -1;
}
if (!tDecodeIsEnd(&decoder)) {
if (tDecodeI8(&decoder, &pReq->learnerReplica) < 0) return -1;
if (tDecodeI8(&decoder, &pReq->learnerSelfIndex) < 0) return -1;
for (int32_t i = 0; i < TSDB_MAX_LEARNER_REPLICA; ++i) {
SReplica *pReplica = &pReq->learnerReplicas[i];
if (tDecodeSReplica(&decoder, pReplica) < 0) return -1;
}
}
tEndDecode(&decoder);
tDecoderClear(&decoder);
......@@ -4433,6 +4447,12 @@ int32_t tSerializeSAlterVnodeReplicaReq(void *buf, int32_t bufLen, SAlterVnodeRe
for (int32_t i = 0; i < 8; ++i) {
if (tEncodeI64(&encoder, pReq->reserved[i]) < 0) return -1;
}
if (tEncodeI8(&encoder, pReq->learnerSelfIndex) < 0) return -1;
if (tEncodeI8(&encoder, pReq->learnerReplica) < 0) return -1;
for (int32_t i = 0; i < TSDB_MAX_LEARNER_REPLICA; ++i) {
SReplica *pReplica = &pReq->learnerReplicas[i];
if (tEncodeSReplica(&encoder, pReplica) < 0) return -1;
}
tEndEncode(&encoder);
int32_t tlen = encoder.pos;
......@@ -4456,7 +4476,15 @@ int32_t tDeserializeSAlterVnodeReplicaReq(void *buf, int32_t bufLen, SAlterVnode
for (int32_t i = 0; i < 8; ++i) {
if (tDecodeI64(&decoder, &pReq->reserved[i]) < 0) return -1;
}
if (!tDecodeIsEnd(&decoder)) {
if (tDecodeI8(&decoder, &pReq->learnerSelfIndex) < 0) return -1;
if (tDecodeI8(&decoder, &pReq->learnerReplica) < 0) return -1;
for (int32_t i = 0; i < TSDB_MAX_LEARNER_REPLICA; ++i) {
SReplica *pReplica = &pReq->learnerReplicas[i];
if (tDecodeSReplica(&decoder, pReplica) < 0) return -1;
}
}
tEndDecode(&decoder);
tDecoderClear(&decoder);
return 0;
......@@ -4767,6 +4795,12 @@ int32_t tSerializeSDCreateMnodeReq(void *buf, int32_t bufLen, SDCreateMnodeReq *
SReplica *pReplica = &pReq->replicas[i];
if (tEncodeSReplica(&encoder, pReplica) < 0) return -1;
}
if (tEncodeI8(&encoder, pReq->learnerReplica) < 0) return -1;
for (int32_t i = 0; i < TSDB_MAX_LEARNER_REPLICA; ++i) {
SReplica *pReplica = &pReq->learnerReplicas[i];
if (tEncodeSReplica(&encoder, pReplica) < 0) return -1;
}
if (tEncodeI64(&encoder, pReq->lastIndex) < 0) return -1;
tEndEncode(&encoder);
int32_t tlen = encoder.pos;
......@@ -4784,6 +4818,14 @@ int32_t tDeserializeSDCreateMnodeReq(void *buf, int32_t bufLen, SDCreateMnodeReq
SReplica *pReplica = &pReq->replicas[i];
if (tDecodeSReplica(&decoder, pReplica) < 0) return -1;
}
if (!tDecodeIsEnd(&decoder)) {
if (tDecodeI8(&decoder, &pReq->learnerReplica) < 0) return -1;
for (int32_t i = 0; i < TSDB_MAX_LEARNER_REPLICA; ++i) {
SReplica *pReplica = &pReq->learnerReplicas[i];
if (tDecodeSReplica(&decoder, pReplica) < 0) return -1;
}
if (tDecodeI64(&decoder, &pReq->lastIndex) < 0) return -1;
}
tEndDecode(&decoder);
tDecoderClear(&decoder);
......
......@@ -32,6 +32,7 @@ typedef struct SDnodeMgmt {
TdThread crashReportThread;
SSingleWorker mgmtWorker;
ProcessCreateNodeFp processCreateNodeFp;
ProcessAlterNodeTypeFp processAlterNodeTypeFp;
ProcessDropNodeFp processDropNodeFp;
SendMonitorReportFp sendMonitorReportFp;
GetVnodeLoadsFp getVnodeLoadsFp;
......
......@@ -352,6 +352,7 @@ SArray *dmGetMsgHandles() {
if (dmSetMgmtHandle(pArray, TDMT_DND_CONFIG_DNODE, dmPutNodeMsgToMgmtQueue, 0) == NULL) goto _OVER;
if (dmSetMgmtHandle(pArray, TDMT_DND_SERVER_STATUS, dmPutNodeMsgToMgmtQueue, 0) == NULL) goto _OVER;
if (dmSetMgmtHandle(pArray, TDMT_DND_SYSTABLE_RETRIEVE, dmPutNodeMsgToMgmtQueue, 0) == NULL) goto _OVER;
if (dmSetMgmtHandle(pArray, TDMT_DND_ALTER_MNODE_TYPE, dmPutNodeMsgToMgmtQueue, 0) == NULL) goto _OVER;
// Requests handled by MNODE
if (dmSetMgmtHandle(pArray, TDMT_MND_GRANT, dmPutNodeMsgToMgmtQueue, 0) == NULL) goto _OVER;
......
......@@ -48,6 +48,7 @@ static int32_t dmOpenMgmt(SMgmtInputOpt *pInput, SMgmtOutputOpt *pOutput) {
pMgmt->path = pInput->path;
pMgmt->name = pInput->name;
pMgmt->processCreateNodeFp = pInput->processCreateNodeFp;
pMgmt->processAlterNodeTypeFp = pInput->processAlterNodeTypeFp;
pMgmt->processDropNodeFp = pInput->processDropNodeFp;
pMgmt->sendMonitorReportFp = pInput->sendMonitorReportFp;
pMgmt->getVnodeLoadsFp = pInput->getVnodeLoadsFp;
......
......@@ -227,6 +227,9 @@ static void dmProcessMgmtQueue(SQueueInfo *pInfo, SRpcMsg *pMsg) {
case TDMT_DND_DROP_SNODE:
code = (*pMgmt->processDropNodeFp)(SNODE, pMsg);
break;
case TDMT_DND_ALTER_MNODE_TYPE:
code = (*pMgmt->processAlterNodeTypeFp)(MNODE, pMsg);
break;
case TDMT_DND_SERVER_STATUS:
code = dmProcessServerRunStatus(pMgmt, pMsg);
break;
......
......@@ -24,12 +24,16 @@ static int32_t mmDecodeOption(SJson *pJson, SMnodeOpt *pOption) {
if (code < 0) return -1;
tjsonGetInt32ValueFromDouble(pJson, "selfIndex", pOption->selfIndex, code);
if (code < 0) return 0;
tjsonGetInt32ValueFromDouble(pJson, "lastIndex", pOption->lastIndex, code);
if (code < 0) return 0;
SJson *replicas = tjsonGetObjectItem(pJson, "replicas");
if (replicas == NULL) return 0;
pOption->numOfReplicas = tjsonGetArraySize(replicas);
pOption->numOfTotalReplicas = tjsonGetArraySize(replicas);
pOption->numOfReplicas = 0;
for (int32_t i = 0; i < pOption->numOfReplicas; ++i) {
for (int32_t i = 0; i < pOption->numOfTotalReplicas; ++i) {
SJson *replica = tjsonGetArrayItem(replicas, i);
if (replica == NULL) return -1;
......@@ -40,6 +44,14 @@ static int32_t mmDecodeOption(SJson *pJson, SMnodeOpt *pOption) {
if (code < 0) return -1;
tjsonGetUInt16ValueFromDouble(replica, "port", pReplica->port, code);
if (code < 0) return -1;
tjsonGetInt32ValueFromDouble(replica, "role", pOption->nodeRoles[i], code);
if (code < 0) return -1;
if(pOption->nodeRoles[i] == TAOS_SYNC_ROLE_VOTER){
pOption->numOfReplicas++;
}
}
for (int32_t i = 0; i < pOption->numOfTotalReplicas; ++i) {
}
return 0;
......@@ -112,14 +124,14 @@ _OVER:
}
static int32_t mmEncodeOption(SJson *pJson, const SMnodeOpt *pOption) {
if (pOption->deploy && pOption->numOfReplicas > 0) {
if (pOption->deploy && pOption->numOfTotalReplicas > 0) {
if (tjsonAddDoubleToObject(pJson, "selfIndex", pOption->selfIndex) < 0) return -1;
SJson *replicas = tjsonCreateArray();
if (replicas == NULL) return -1;
if (tjsonAddItemToObject(pJson, "replicas", replicas) < 0) return -1;
for (int32_t i = 0; i < pOption->numOfReplicas; ++i) {
for (int32_t i = 0; i < pOption->numOfTotalReplicas; ++i) {
SJson *replica = tjsonCreateObject();
if (replica == NULL) return -1;
......@@ -127,10 +139,13 @@ static int32_t mmEncodeOption(SJson *pJson, const SMnodeOpt *pOption) {
if (tjsonAddDoubleToObject(replica, "id", pReplica->id) < 0) return -1;
if (tjsonAddStringToObject(replica, "fqdn", pReplica->fqdn) < 0) return -1;
if (tjsonAddDoubleToObject(replica, "port", pReplica->port) < 0) return -1;
if (tjsonAddDoubleToObject(replica, "role", pOption->nodeRoles[i]) < 0) return -1;
if (tjsonAddItemToArray(replicas, replica) < 0) return -1;
}
}
if (tjsonAddDoubleToObject(pJson, "lastIndex", pOption->lastIndex) < 0) return -1;
if (tjsonAddDoubleToObject(pJson, "deployed", pOption->deploy) < 0) return -1;
return 0;
......
......@@ -33,12 +33,24 @@ int32_t mmProcessCreateReq(const SMgmtInputOpt *pInput, SRpcMsg *pMsg) {
return -1;
}
SMnodeOpt option = {.deploy = true, .numOfReplicas = createReq.replica, .selfIndex = -1};
SMnodeOpt option = {.deploy = true, .numOfReplicas = createReq.replica,
.numOfTotalReplicas = createReq.replica + createReq.learnerReplica,
.selfIndex = -1, .lastIndex = createReq.lastIndex};
memcpy(option.replicas, createReq.replicas, sizeof(createReq.replicas));
for (int32_t i = 0; i < option.numOfReplicas; ++i) {
for (int32_t i = 0; i < createReq.replica; ++i) {
if (createReq.replicas[i].id == pInput->pData->dnodeId) {
option.selfIndex = i;
}
option.nodeRoles[i] = TAOS_SYNC_ROLE_VOTER;
}
memcpy(&(option.replicas[createReq.replica]), createReq.learnerReplicas, sizeof(createReq.learnerReplicas));
for (int32_t i = 0; i < createReq.learnerReplica; ++i) {
if (createReq.learnerReplicas[i].id == pInput->pData->dnodeId) {
option.selfIndex = createReq.replica + i;
}
option.nodeRoles[createReq.replica + i] = TAOS_SYNC_ROLE_LEARNER;
}
if (option.selfIndex == -1) {
......@@ -92,6 +104,8 @@ SArray *mmGetMsgHandles() {
if (dmSetMgmtHandle(pArray, TDMT_DND_CREATE_VNODE_RSP, mmPutMsgToWriteQueue, 0) == NULL) goto _OVER;
if (dmSetMgmtHandle(pArray, TDMT_DND_DROP_VNODE_RSP, mmPutMsgToWriteQueue, 0) == NULL) goto _OVER;
if (dmSetMgmtHandle(pArray, TDMT_DND_CONFIG_DNODE_RSP, mmPutMsgToReadQueue, 0) == NULL) goto _OVER;
if (dmSetMgmtHandle(pArray, TDMT_DND_ALTER_MNODE_TYPE_RSP, mmPutMsgToReadQueue, 0) == NULL) goto _OVER;
if (dmSetMgmtHandle(pArray, TDMT_DND_ALTER_VNODE_TYPE_RSP, mmPutMsgToReadQueue, 0) == NULL) goto _OVER;
if (dmSetMgmtHandle(pArray, TDMT_MND_CONNECT, mmPutMsgToReadQueue, 0) == NULL) goto _OVER;
if (dmSetMgmtHandle(pArray, TDMT_MND_CREATE_ACCT, mmPutMsgToWriteQueue, 0) == NULL) goto _OVER;
......
......@@ -45,9 +45,11 @@ static void mmBuildOptionForDeploy(SMnodeMgmt *pMgmt, const SMgmtInputOpt *pInpu
pOption->dnodeId = pMgmt->pData->dnodeId;
pOption->selfIndex = 0;
pOption->numOfReplicas = 1;
pOption->numOfTotalReplicas = 1;
pOption->replicas[0].id = 1;
pOption->replicas[0].port = tsServerPort;
tstrncpy(pOption->replicas[0].fqdn, tsLocalFqdn, TSDB_FQDN_LEN);
pOption->lastIndex = SYNC_INDEX_INVALID;
}
static void mmBuildOptionForOpen(SMnodeMgmt *pMgmt, SMnodeOpt *pOption) {
......@@ -123,9 +125,10 @@ static int32_t mmOpen(SMgmtInputOpt *pInput, SMgmtOutputOpt *pOutput) {
}
tmsgReportStartup("mnode-worker", "initialized");
if (option.numOfReplicas > 0) {
if (option.numOfTotalReplicas > 0) {
option.deploy = true;
option.numOfReplicas = 0;
option.numOfTotalReplicas = 0;
if (mmWriteFile(pMgmt->path, &option) != 0) {
dError("failed to write mnode file since %s", terrstr());
return -1;
......@@ -152,6 +155,10 @@ static void mmStop(SMnodeMgmt *pMgmt) {
mndStop(pMgmt->pMnode);
}
static int32_t mmSyncIsCatchUp(SMnodeMgmt *pMgmt) {
return mndIsCatchUp(pMgmt->pMnode);
}
SMgmtFunc mmGetMgmtFunc() {
SMgmtFunc mgmtFunc = {0};
mgmtFunc.openFp = mmOpen;
......@@ -162,6 +169,7 @@ SMgmtFunc mmGetMgmtFunc() {
mgmtFunc.dropFp = (NodeDropFp)mmProcessDropReq;
mgmtFunc.requiredFp = mmRequire;
mgmtFunc.getHandlesFp = mmGetMsgHandles;
mgmtFunc.isCatchUpFp = (NodeIsCatchUpFp)mmSyncIsCatchUp;
return mgmtFunc;
}
......@@ -90,6 +90,7 @@ int32_t vmProcessDropVnodeReq(SVnodeMgmt *pMgmt, SRpcMsg *pMsg);
int32_t vmProcessAlterVnodeReplicaReq(SVnodeMgmt *pMgmt, SRpcMsg *pMsg);
int32_t vmProcessDisableVnodeWriteReq(SVnodeMgmt *pMgmt, SRpcMsg *pMsg);
int32_t vmProcessAlterHashRangeReq(SVnodeMgmt *pMgmt, SRpcMsg *pMsg);
int32_t vmProcessAlterVnodeTypeReq(SVnodeMgmt *pMgmt, SRpcMsg *pMsg);
// vmFile.c
int32_t vmGetVnodeListFromFile(SVnodeMgmt *pMgmt, SWrapperCfg **ppCfgs, int32_t *numOfVnodes);
......
......@@ -131,15 +131,34 @@ static void vmGenerateVnodeCfg(SCreateVnodeReq *pCreate, SVnodeCfg *pCfg) {
pCfg->tsdbPageSize = pCreate->tsdbPageSize * 1024;
pCfg->standby = 0;
pCfg->syncCfg.myIndex = pCreate->selfIndex;
pCfg->syncCfg.replicaNum = pCreate->replica;
pCfg->syncCfg.replicaNum = 0;
pCfg->syncCfg.totalReplicaNum = 0;
memset(&pCfg->syncCfg.nodeInfo, 0, sizeof(pCfg->syncCfg.nodeInfo));
for (int32_t i = 0; i < pCreate->replica; ++i) {
SNodeInfo *pNode = &pCfg->syncCfg.nodeInfo[i];
pNode->nodeId = pCreate->replicas[i].id;
pNode->nodePort = pCreate->replicas[i].port;
tstrncpy(pNode->nodeFqdn, pCreate->replicas[i].fqdn, TSDB_FQDN_LEN);
pNode->nodeId = pCreate->replicas[pCfg->syncCfg.replicaNum].id;
pNode->nodePort = pCreate->replicas[pCfg->syncCfg.replicaNum].port;
pNode->nodeRole = TAOS_SYNC_ROLE_VOTER;
tstrncpy(pNode->nodeFqdn, pCreate->replicas[pCfg->syncCfg.replicaNum].fqdn, TSDB_FQDN_LEN);
tmsgUpdateDnodeInfo(&pNode->nodeId, &pNode->clusterId, pNode->nodeFqdn, &pNode->nodePort);
pCfg->syncCfg.replicaNum++;
}
if(pCreate->selfIndex != -1){
pCfg->syncCfg.myIndex = pCreate->selfIndex;
}
for (int32_t i = pCfg->syncCfg.replicaNum; i < pCreate->replica + pCreate->learnerReplica; ++i) {
SNodeInfo *pNode = &pCfg->syncCfg.nodeInfo[i];
pNode->nodeId = pCreate->learnerReplicas[pCfg->syncCfg.totalReplicaNum].id;
pNode->nodePort = pCreate->learnerReplicas[pCfg->syncCfg.totalReplicaNum].port;
pNode->nodeRole = TAOS_SYNC_ROLE_LEARNER;
tstrncpy(pNode->nodeFqdn, pCreate->learnerReplicas[pCfg->syncCfg.totalReplicaNum].fqdn, TSDB_FQDN_LEN);
tmsgUpdateDnodeInfo(&pNode->nodeId, &pNode->clusterId, pNode->nodeFqdn, &pNode->nodePort);
pCfg->syncCfg.totalReplicaNum++;
}
pCfg->syncCfg.totalReplicaNum += pCfg->syncCfg.replicaNum;
if(pCreate->learnerSelfIndex != -1){
pCfg->syncCfg.myIndex = pCreate->replica + pCreate->learnerSelfIndex;
}
}
......@@ -182,23 +201,40 @@ int32_t vmProcessCreateVnodeReq(SVnodeMgmt *pMgmt, SRpcMsg *pMsg) {
return -1;
}
dInfo("vgId:%d, start to create vnode, page:%d pageSize:%d buffer:%d szPage:%d szBuf:%" PRIu64
if(req.learnerReplica == 0)
{
req.learnerSelfIndex = -1;
}
dInfo("vgId:%d, vnode management handle msgType:%s, start to create vnode, page:%d pageSize:%d buffer:%d szPage:%d szBuf:%" PRIu64
", cacheLast:%d cacheLastSize:%d sstTrigger:%d tsdbPageSize:%d %d dbname:%s dbId:%" PRId64
", days:%d keep0:%d keep1:%d keep2:%d tsma:%d precision:%d compression:%d minRows:%d maxRows:%d"
", wal fsync:%d level:%d retentionPeriod:%d retentionSize:%" PRId64 " rollPeriod:%d segSize:%" PRId64
", hash method:%d begin:%u end:%u prefix:%d surfix:%d replica:%d selfIndex:%d strict:%d",
req.vgId, req.pages, req.pageSize, req.buffer, req.pageSize * 1024, (uint64_t)req.buffer * 1024 * 1024,
", hash method:%d begin:%u end:%u prefix:%d surfix:%d replica:%d selfIndex:%d "
"learnerReplica:%d learnerSelfIndex:%d strict:%d",
req.vgId, TMSG_INFO(pMsg->msgType), req.pages, req.pageSize, req.buffer, req.pageSize * 1024,
(uint64_t)req.buffer * 1024 * 1024,
req.cacheLast, req.cacheLastSize, req.sstTrigger, req.tsdbPageSize, req.tsdbPageSize * 1024, req.db, req.dbUid,
req.daysPerFile, req.daysToKeep0, req.daysToKeep1, req.daysToKeep2, req.isTsma, req.precision, req.compression,
req.minRows, req.maxRows, req.walFsyncPeriod, req.walLevel, req.walRetentionPeriod, req.walRetentionSize,
req.walRollPeriod, req.walSegmentSize, req.hashMethod, req.hashBegin, req.hashEnd, req.hashPrefix,
req.hashSuffix, req.replica, req.selfIndex, req.strict);
req.hashSuffix, req.replica, req.selfIndex, req.learnerReplica, req.learnerSelfIndex, req.strict);
for (int32_t i = 0; i < req.replica; ++i) {
dInfo("vgId:%d, replica:%d ep:%s:%u dnode:%d", req.vgId, i, req.replicas[i].fqdn, req.replicas[i].port,
req.replicas[i].id);
}
for (int32_t i = 0; i < req.learnerReplica; ++i) {
dInfo("vgId:%d, learnerReplica:%d ep:%s:%u dnode:%d", req.vgId, i, req.learnerReplicas[i].fqdn,
req.learnerReplicas[i].port, req.replicas[i].id);
}
SReplica *pReplica = &req.replicas[req.selfIndex];
SReplica *pReplica = NULL;
if(req.selfIndex != -1){
pReplica = &req.replicas[req.selfIndex];
}
else{
pReplica = &req.learnerReplicas[req.learnerSelfIndex];
}
if (pReplica->id != pMgmt->pData->dnodeId || pReplica->port != tsServerPort ||
strcmp(pReplica->fqdn, tsLocalFqdn) != 0) {
terrno = TSDB_CODE_INVALID_MSG;
......@@ -275,7 +311,8 @@ _OVER:
vnodeClose(pImpl);
vnodeDestroy(path, pMgmt->pTfs);
} else {
dInfo("vgId:%d, vnode is created", req.vgId);
dInfo("vgId:%d, vnode management handle msgType:%s, end to create vnode, vnode is created",
req.vgId, TMSG_INFO(pMsg->msgType));
}
tFreeSCreateVnodeReq(&req);
......@@ -283,6 +320,110 @@ _OVER:
return code;
}
int32_t vmProcessAlterVnodeTypeReq(SVnodeMgmt *pMgmt, SRpcMsg *pMsg) {
SAlterVnodeTypeReq req = {0};
if (tDeserializeSAlterVnodeReplicaReq(pMsg->pCont, pMsg->contLen, &req) != 0) {
terrno = TSDB_CODE_INVALID_MSG;
return -1;
}
if(req.learnerReplicas == 0){
req.learnerSelfIndex = -1;
}
dInfo("vgId:%d, vnode management handle msgType:%s, start to process alter-node-type-request",
req.vgId, TMSG_INFO(pMsg->msgType));
SVnodeObj *pVnode = vmAcquireVnode(pMgmt, req.vgId);
if (pVnode == NULL) {
dError("vgId:%d, failed to alter hashrange since %s", req.vgId, terrstr());
terrno = TSDB_CODE_VND_NOT_EXIST;
return -1;
}
dInfo("vgId:%d, checking node catch up", req.vgId);
if(vnodeIsCatchUp(pVnode->pImpl) != 0){
return -1;
}
dInfo("node:%s, catched up leader, continue to process alter-node-type-request", pMgmt->name);
int32_t vgId = req.vgId;
dInfo("vgId:%d, start to alter vnode type replica:%d selfIndex:%d strict:%d", vgId, req.replica, req.selfIndex,
req.strict);
for (int32_t i = 0; i < req.replica; ++i) {
SReplica *pReplica = &req.replicas[i];
dInfo("vgId:%d, replica:%d ep:%s:%u dnode:%d", vgId, i, pReplica->fqdn, pReplica->port, pReplica->id);
}
for (int32_t i = 0; i < req.learnerReplica; ++i) {
SReplica *pReplica = &req.learnerReplicas[i];
dInfo("vgId:%d, learnerReplicas:%d ep:%s:%u dnode:%d", vgId, i, pReplica->fqdn, pReplica->port, pReplica->id);
}
if (req.replica <= 0 ||
(req.selfIndex < 0 && req.learnerSelfIndex <0)||
req.selfIndex >= req.replica || req.learnerSelfIndex >= req.learnerReplica) {
terrno = TSDB_CODE_INVALID_MSG;
dError("vgId:%d, failed to alter replica since invalid msg", vgId);
return -1;
}
SReplica *pReplica = NULL;
if(req.selfIndex > 0){
pReplica = &req.replicas[req.selfIndex];
}
else{
pReplica = &req.learnerReplicas[req.learnerSelfIndex];
}
if (pReplica->id != pMgmt->pData->dnodeId || pReplica->port != tsServerPort ||
strcmp(pReplica->fqdn, tsLocalFqdn) != 0) {
terrno = TSDB_CODE_INVALID_MSG;
dError("vgId:%d, dnodeId:%d ep:%s:%u not matched with local dnode", vgId, pReplica->id, pReplica->fqdn,
pReplica->port);
return -1;
}
dInfo("vgId:%d, start to close vnode", vgId);
SWrapperCfg wrapperCfg = {
.dropped = pVnode->dropped,
.vgId = pVnode->vgId,
.vgVersion = pVnode->vgVersion,
};
tstrncpy(wrapperCfg.path, pVnode->path, sizeof(wrapperCfg.path));
vmCloseVnode(pMgmt, pVnode, false);
char path[TSDB_FILENAME_LEN] = {0};
snprintf(path, TSDB_FILENAME_LEN, "vnode%svnode%d", TD_DIRSEP, vgId);
dInfo("vgId:%d, start to alter vnode replica at %s", vgId, path);
if (vnodeAlterReplica(path, &req, pMgmt->pTfs) < 0) {
dError("vgId:%d, failed to alter vnode at %s since %s", vgId, path, terrstr());
return -1;
}
dInfo("vgId:%d, begin to open vnode", vgId);
SVnode *pImpl = vnodeOpen(path, pMgmt->pTfs, pMgmt->msgCb);
if (pImpl == NULL) {
dError("vgId:%d, failed to open vnode at %s since %s", vgId, path, terrstr());
return -1;
}
if (vmOpenVnode(pMgmt, &wrapperCfg, pImpl) != 0) {
dError("vgId:%d, failed to open vnode mgmt since %s", vgId, terrstr());
return -1;
}
if (vnodeStart(pImpl) != 0) {
dError("vgId:%d, failed to start sync since %s", vgId, terrstr());
return -1;
}
dInfo("vgId:%d, vnode management handle msgType:%s, end to process alter-node-type-request, vnode config is altered",
req.vgId, TMSG_INFO(pMsg->msgType));
return 0;
}
int32_t vmProcessDisableVnodeWriteReq(SVnodeMgmt *pMgmt, SRpcMsg *pMsg) {
SDisableVnodeWriteReq req = {0};
if (tDeserializeSDisableVnodeWriteReq(pMsg->pCont, pMsg->contLen, &req) != 0) {
......@@ -377,21 +518,40 @@ int32_t vmProcessAlterVnodeReplicaReq(SVnodeMgmt *pMgmt, SRpcMsg *pMsg) {
return -1;
}
if(alterReq.learnerReplica == 0){
alterReq.learnerSelfIndex = -1;
}
int32_t vgId = alterReq.vgId;
dInfo("vgId:%d, start to alter vnode replica:%d selfIndex:%d strict:%d", vgId, alterReq.replica, alterReq.selfIndex,
alterReq.strict);
dInfo("vgId:%d,vnode management handle msgType:%s, start to alter vnode replica:%d selfIndex:%d leanerReplica:%d "
"learnerSelfIndex:%d strict:%d",
vgId, TMSG_INFO(pMsg->msgType), alterReq.replica, alterReq.selfIndex, alterReq.learnerReplica,
alterReq.learnerSelfIndex, alterReq.strict);
for (int32_t i = 0; i < alterReq.replica; ++i) {
SReplica *pReplica = &alterReq.replicas[i];
dInfo("vgId:%d, replica:%d ep:%s:%u dnode:%d", vgId, i, pReplica->fqdn, pReplica->port, pReplica->port);
}
if (alterReq.replica <= 0 || alterReq.selfIndex < 0 || alterReq.selfIndex >= alterReq.replica) {
for (int32_t i = 0; i < alterReq.learnerReplica; ++i) {
SReplica *pReplica = &alterReq.learnerReplicas[i];
dInfo("vgId:%d, learnerReplicas:%d ep:%s:%u dnode:%d", vgId, i, pReplica->fqdn, pReplica->port, pReplica->port);
}
if (alterReq.replica <= 0 ||
(alterReq.selfIndex < 0 && alterReq.learnerSelfIndex <0)||
alterReq.selfIndex >= alterReq.replica || alterReq.learnerSelfIndex >= alterReq.learnerReplica) {
terrno = TSDB_CODE_INVALID_MSG;
dError("vgId:%d, failed to alter replica since invalid msg", vgId);
return -1;
}
SReplica *pReplica = &alterReq.replicas[alterReq.selfIndex];
SReplica *pReplica = NULL;
if(alterReq.selfIndex != -1){
pReplica = &alterReq.replicas[alterReq.selfIndex];
}
else{
pReplica = &alterReq.learnerReplicas[alterReq.learnerSelfIndex];
}
if (pReplica->id != pMgmt->pData->dnodeId || pReplica->port != tsServerPort ||
strcmp(pReplica->fqdn, tsLocalFqdn) != 0) {
terrno = TSDB_CODE_INVALID_MSG;
......@@ -425,7 +585,7 @@ int32_t vmProcessAlterVnodeReplicaReq(SVnodeMgmt *pMgmt, SRpcMsg *pMsg) {
return -1;
}
dInfo("vgId:%d, close vnode", vgId);
dInfo("vgId:%d, begin to open vnode", vgId);
SVnode *pImpl = vnodeOpen(path, pMgmt->pTfs, pMgmt->msgCb);
if (pImpl == NULL) {
dError("vgId:%d, failed to open vnode at %s since %s", vgId, path, terrstr());
......@@ -442,7 +602,10 @@ int32_t vmProcessAlterVnodeReplicaReq(SVnodeMgmt *pMgmt, SRpcMsg *pMsg) {
return -1;
}
dInfo("vgId:%d, vnode config is altered", vgId);
dInfo("vgId:%d, vnode management handle msgType:%s, end to alter vnode replica:%d selfIndex:%d leanerReplica:%d "
"learnerSelfIndex:%d strict:%d",
vgId, TMSG_INFO(pMsg->msgType), alterReq.replica, alterReq.selfIndex, alterReq.learnerReplica,
alterReq.learnerSelfIndex, alterReq.strict);
return 0;
}
......@@ -548,6 +711,7 @@ SArray *vmGetMsgHandles() {
if (dmSetMgmtHandle(pArray, TDMT_VND_TRIM, vmPutMsgToWriteQueue, 0) == NULL) goto _OVER;
if (dmSetMgmtHandle(pArray, TDMT_DND_CREATE_VNODE, vmPutMsgToMgmtQueue, 0) == NULL) goto _OVER;
if (dmSetMgmtHandle(pArray, TDMT_DND_DROP_VNODE, vmPutMsgToMgmtQueue, 0) == NULL) goto _OVER;
if (dmSetMgmtHandle(pArray, TDMT_DND_ALTER_VNODE_TYPE, vmPutMsgToMgmtQueue, 0) == NULL) goto _OVER;
if (dmSetMgmtHandle(pArray, TDMT_SYNC_TIMEOUT_ELECTION, vmPutMsgToSyncQueue, 0) == NULL) goto _OVER;
if (dmSetMgmtHandle(pArray, TDMT_SYNC_CLIENT_REQUEST, vmPutMsgToSyncQueue, 0) == NULL) goto _OVER;
......
......@@ -49,6 +49,9 @@ static void vmProcessMgmtQueue(SQueueInfo *pInfo, SRpcMsg *pMsg) {
case TDMT_VND_ALTER_HASHRANGE:
code = vmProcessAlterHashRangeReq(pMgmt, pMsg);
break;
case TDMT_DND_ALTER_VNODE_TYPE:
code = vmProcessAlterVnodeTypeReq(pMgmt, pMsg);
break;
default:
terrno = TSDB_CODE_MSG_NOT_PROCESSED;
dGError("msg:%p, not processed in vnode-mgmt queue", pMsg);
......
......@@ -169,6 +169,8 @@ static int32_t dmProcessCreateNodeReq(EDndNodeType ntype, SRpcMsg *pMsg) {
return -1;
}
dInfo("start to process create-node-request");
pWrapper = &pDnode->wrappers[ntype];
if (taosMkDir(pWrapper->path) != 0) {
dmReleaseWrapper(pWrapper);
......@@ -198,6 +200,64 @@ static int32_t dmProcessCreateNodeReq(EDndNodeType ntype, SRpcMsg *pMsg) {
return code;
}
static int32_t dmProcessAlterNodeTypeReq(EDndNodeType ntype, SRpcMsg *pMsg) {
SDnode *pDnode = dmInstance();
SMgmtWrapper *pWrapper = dmAcquireWrapper(pDnode, ntype);
if (pWrapper == NULL) {
dError("fail to process alter node type since node not exist");
return -1;
}
dmReleaseWrapper(pWrapper);
dInfo("node:%s, start to process alter-node-type-request", pWrapper->name);
pWrapper = &pDnode->wrappers[ntype];
if(pWrapper->func.isCatchUpFp != NULL){
dInfo("node:%s, checking node catch up", pWrapper->name);
if(!(*pWrapper->func.isCatchUpFp)(pWrapper->pMgmt) == 0){
return -1;
}
}
dInfo("node:%s, catched up leader, continue to process alter-node-type-request", pWrapper->name);
taosThreadMutexLock(&pDnode->mutex);
dInfo("node:%s, stopping node", pWrapper->name);
dmStopNode(pWrapper);
dInfo("node:%s, closing node", pWrapper->name);
dmCloseNode(pWrapper);
pWrapper = &pDnode->wrappers[ntype];
if (taosMkDir(pWrapper->path) != 0) {
dmReleaseWrapper(pWrapper);
terrno = TAOS_SYSTEM_ERROR(errno);
dError("failed to create dir:%s since %s", pWrapper->path, terrstr());
return -1;
}
SMgmtInputOpt input = dmBuildMgmtInputOpt(pWrapper);
dInfo("node:%s, start to create", pWrapper->name);
int32_t code = (*pWrapper->func.createFp)(&input, pMsg);
if (code != 0) {
dError("node:%s, failed to create since %s", pWrapper->name, terrstr());
} else {
dInfo("node:%s, has been created", pWrapper->name);
code = dmOpenNode(pWrapper);
if (code == 0) {
code = dmStartNode(pWrapper);
}
pWrapper->deployed = true;
pWrapper->required = true;
}
taosThreadMutexUnlock(&pDnode->mutex);
return code;
}
static int32_t dmProcessDropNodeReq(EDndNodeType ntype, SRpcMsg *pMsg) {
SDnode *pDnode = dmInstance();
......@@ -251,6 +311,7 @@ SMgmtInputOpt dmBuildMgmtInputOpt(SMgmtWrapper *pWrapper) {
.name = pWrapper->name,
.pData = &pWrapper->pDnode->data,
.processCreateNodeFp = dmProcessCreateNodeReq,
.processAlterNodeTypeFp = dmProcessAlterNodeTypeReq,
.processDropNodeFp = dmProcessDropNodeReq,
.sendMonitorReportFp = dmSendMonitorReport,
.getVnodeLoadsFp = dmGetVnodeLoads,
......
......@@ -89,6 +89,7 @@ typedef void (*SendMonitorReportFp)();
typedef void (*GetVnodeLoadsFp)(SMonVloadInfo *pInfo);
typedef void (*GetMnodeLoadsFp)(SMonMloadInfo *pInfo);
typedef void (*GetQnodeLoadsFp)(SQnodeLoad *pInfo);
typedef int32_t (*ProcessAlterNodeTypeFp)(EDndNodeType ntype, SRpcMsg *pMsg);
typedef struct {
int32_t dnodeId;
......@@ -112,6 +113,7 @@ typedef struct {
SDnodeData *pData;
SMsgCb msgCb;
ProcessCreateNodeFp processCreateNodeFp;
ProcessAlterNodeTypeFp processAlterNodeTypeFp;
ProcessDropNodeFp processDropNodeFp;
SendMonitorReportFp sendMonitorReportFp;
GetVnodeLoadsFp getVnodeLoadsFp;
......@@ -132,6 +134,7 @@ typedef int32_t (*NodeCreateFp)(const SMgmtInputOpt *pInput, SRpcMsg *pMsg);
typedef int32_t (*NodeDropFp)(const SMgmtInputOpt *pInput, SRpcMsg *pMsg);
typedef int32_t (*NodeRequireFp)(const SMgmtInputOpt *pInput, bool *required);
typedef SArray *(*NodeGetHandlesFp)(); // array of SMgmtHandle
typedef bool (*NodeIsCatchUpFp)(void *pMgmt);
typedef struct {
NodeOpenFp openFp;
......@@ -142,6 +145,7 @@ typedef struct {
NodeDropFp dropFp;
NodeRequireFp requiredFp;
NodeGetHandlesFp getHandlesFp;
NodeIsCatchUpFp isCatchUpFp;
} SMgmtFunc;
typedef struct {
......
......@@ -215,6 +215,8 @@ typedef struct {
bool syncRestore;
int64_t stateStartTime;
SDnodeObj* pDnode;
int32_t role;
SyncIndex lastIndex;
} SMnodeObj;
typedef struct {
......@@ -341,6 +343,7 @@ typedef struct {
ESyncState syncState;
bool syncRestore;
bool syncCanRead;
ESyncRole nodeRole;
} SVnodeGid;
typedef struct {
......@@ -361,7 +364,7 @@ typedef struct {
int8_t compact;
int8_t isTsma;
int8_t replica;
SVnodeGid vnodeGid[TSDB_MAX_REPLICA];
SVnodeGid vnodeGid[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA];
void* pTsma;
int32_t numOfCachedTables;
} SVgObj;
......
......@@ -92,8 +92,11 @@ typedef struct {
int64_t transSeq;
TdThreadMutex lock;
int8_t selfIndex;
int8_t numOfTotalReplicas;
int8_t numOfReplicas;
SReplica replicas[TSDB_MAX_REPLICA];
SReplica replicas[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA];
ESyncRole nodeRoles[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA];
SyncIndex lastIndex;
} SSyncMgmt;
typedef struct {
......
......@@ -491,7 +491,10 @@ static void mndSetOptions(SMnode *pMnode, const SMnodeOpt *pOption) {
pMnode->selfDnodeId = pOption->dnodeId;
pMnode->syncMgmt.selfIndex = pOption->selfIndex;
pMnode->syncMgmt.numOfReplicas = pOption->numOfReplicas;
pMnode->syncMgmt.numOfTotalReplicas = pOption->numOfTotalReplicas;
pMnode->syncMgmt.lastIndex = pOption->lastIndex;
memcpy(pMnode->syncMgmt.replicas, pOption->replicas, sizeof(pOption->replicas));
memcpy(pMnode->syncMgmt.nodeRoles, pOption->nodeRoles, sizeof(pOption->nodeRoles));
}
SMnode *mndOpen(const char *path, const SMnodeOpt *pOption) {
......@@ -582,6 +585,11 @@ int32_t mndStart(SMnode *pMnode) {
return mndInitTimer(pMnode);
}
int32_t mndIsCatchUp(SMnode *pMnode) {
int64_t rid = pMnode->syncMgmt.sync;
return syncIsCatchUp(rid);
}
void mndStop(SMnode *pMnode) {
mndSetStop(pMnode);
mndSyncStop(pMnode);
......
......@@ -23,7 +23,7 @@
#include "mndTrans.h"
#include "tmisce.h"
#define MNODE_VER_NUMBER 1
#define MNODE_VER_NUMBER 2
#define MNODE_RESERVE_SIZE 64
static int32_t mndCreateDefaultMnode(SMnode *pMnode);
......@@ -53,6 +53,7 @@ int32_t mndInitMnode(SMnode *pMnode) {
mndSetMsgHandle(pMnode, TDMT_MND_CREATE_MNODE, mndProcessCreateMnodeReq);
mndSetMsgHandle(pMnode, TDMT_DND_CREATE_MNODE_RSP, mndTransProcessRsp);
mndSetMsgHandle(pMnode, TDMT_DND_ALTER_MNODE_TYPE_RSP, mndTransProcessRsp);
mndSetMsgHandle(pMnode, TDMT_MND_ALTER_MNODE, mndProcessAlterMnodeReq);
mndSetMsgHandle(pMnode, TDMT_MND_ALTER_MNODE_RSP, mndTransProcessRsp);
mndSetMsgHandle(pMnode, TDMT_MND_DROP_MNODE, mndProcessDropMnodeReq);
......@@ -126,6 +127,8 @@ static SSdbRaw *mndMnodeActionEncode(SMnodeObj *pObj) {
SDB_SET_INT32(pRaw, dataPos, pObj->id, _OVER)
SDB_SET_INT64(pRaw, dataPos, pObj->createdTime, _OVER)
SDB_SET_INT64(pRaw, dataPos, pObj->updateTime, _OVER)
SDB_SET_INT32(pRaw, dataPos, pObj->role, _OVER)
SDB_SET_INT64(pRaw, dataPos, pObj->lastIndex, _OVER)
SDB_SET_RESERVE(pRaw, dataPos, MNODE_RESERVE_SIZE, _OVER)
terrno = 0;
......@@ -149,7 +152,7 @@ static SSdbRow *mndMnodeActionDecode(SSdbRaw *pRaw) {
int8_t sver = 0;
if (sdbGetRawSoftVer(pRaw, &sver) != 0) return NULL;
if (sver != MNODE_VER_NUMBER) {
if (sver != 1 && sver != 2) {
terrno = TSDB_CODE_SDB_INVALID_DATA_VER;
goto _OVER;
}
......@@ -164,6 +167,10 @@ static SSdbRow *mndMnodeActionDecode(SSdbRaw *pRaw) {
SDB_GET_INT32(pRaw, dataPos, &pObj->id, _OVER)
SDB_GET_INT64(pRaw, dataPos, &pObj->createdTime, _OVER)
SDB_GET_INT64(pRaw, dataPos, &pObj->updateTime, _OVER)
if(sver >=2){
SDB_GET_INT32(pRaw, dataPos, &pObj->role, _OVER)
SDB_GET_INT64(pRaw, dataPos, &pObj->lastIndex, _OVER)
}
SDB_GET_RESERVE(pRaw, dataPos, MNODE_RESERVE_SIZE, _OVER)
terrno = 0;
......@@ -204,7 +211,9 @@ static int32_t mndMnodeActionDelete(SSdb *pSdb, SMnodeObj *pObj) {
static int32_t mndMnodeActionUpdate(SSdb *pSdb, SMnodeObj *pOld, SMnodeObj *pNew) {
mTrace("mnode:%d, perform update action, old row:%p new row:%p", pOld->id, pOld, pNew);
pOld->role = pNew->role;
pOld->updateTime = pNew->updateTime;
pOld->lastIndex = pNew->lastIndex;
mndReloadSyncConfig(pSdb->pMnode);
return 0;
......@@ -302,6 +311,27 @@ static int32_t mndBuildCreateMnodeRedoAction(STrans *pTrans, SDCreateMnodeReq *p
return 0;
}
static int32_t mndBuildAlterMnodeTypeRedoAction(STrans *pTrans,
SDAlterMnodeTypeReq *pAlterMnodeTypeReq, SEpSet *pAlterMnodeTypeEpSet) {
int32_t contLen = tSerializeSDCreateMnodeReq(NULL, 0, pAlterMnodeTypeReq);
void *pReq = taosMemoryMalloc(contLen);
tSerializeSDCreateMnodeReq(pReq, contLen, pAlterMnodeTypeReq);
STransAction action = {
.epSet = *pAlterMnodeTypeEpSet,
.pCont = pReq,
.contLen = contLen,
.msgType = TDMT_DND_ALTER_MNODE_TYPE,
.acceptableCode = TSDB_CODE_MNODE_ALREADY_DEPLOYED,
};
if (mndTransAppendRedoAction(pTrans, &action) != 0) {
taosMemoryFree(pReq);
return -1;
}
return 0;
}
static int32_t mndBuildAlterMnodeRedoAction(STrans *pTrans, SDCreateMnodeReq *pAlterReq, SEpSet *pAlterEpSet) {
int32_t contLen = tSerializeSDCreateMnodeReq(NULL, 0, pAlterReq);
void *pReq = taosMemoryMalloc(contLen);
......@@ -347,6 +377,7 @@ static int32_t mndSetCreateMnodeRedoActions(SMnode *pMnode, STrans *pTrans, SDno
SSdb *pSdb = pMnode->pSdb;
void *pIter = NULL;
int32_t numOfReplicas = 0;
int32_t numOfLearnerReplicas = 0;
SDCreateMnodeReq createReq = {0};
SEpSet createEpset = {0};
......@@ -355,18 +386,29 @@ static int32_t mndSetCreateMnodeRedoActions(SMnode *pMnode, STrans *pTrans, SDno
pIter = sdbFetch(pSdb, SDB_MNODE, pIter, (void **)&pMObj);
if (pIter == NULL) break;
createReq.replicas[numOfReplicas].id = pMObj->id;
createReq.replicas[numOfReplicas].port = pMObj->pDnode->port;
memcpy(createReq.replicas[numOfReplicas].fqdn, pMObj->pDnode->fqdn, TSDB_FQDN_LEN);
if(pMObj->role == TAOS_SYNC_ROLE_VOTER){
createReq.replicas[numOfReplicas].id = pMObj->id;
createReq.replicas[numOfReplicas].port = pMObj->pDnode->port;
memcpy(createReq.replicas[numOfReplicas].fqdn, pMObj->pDnode->fqdn, TSDB_FQDN_LEN);
numOfReplicas++;
}
else{
createReq.learnerReplicas[numOfLearnerReplicas].id = pMObj->id;
createReq.learnerReplicas[numOfLearnerReplicas].port = pMObj->pDnode->port;
memcpy(createReq.learnerReplicas[numOfLearnerReplicas].fqdn, pMObj->pDnode->fqdn, TSDB_FQDN_LEN);
numOfLearnerReplicas++;
}
numOfReplicas++;
sdbRelease(pSdb, pMObj);
}
createReq.replica = numOfReplicas + 1;
createReq.replicas[numOfReplicas].id = pDnode->id;
createReq.replicas[numOfReplicas].port = pDnode->port;
memcpy(createReq.replicas[numOfReplicas].fqdn, pDnode->fqdn, TSDB_FQDN_LEN);
createReq.replica = numOfReplicas;
createReq.learnerReplica = numOfLearnerReplicas + 1;
createReq.learnerReplicas[numOfLearnerReplicas].id = pDnode->id;
createReq.learnerReplicas[numOfLearnerReplicas].port = pDnode->port;
memcpy(createReq.learnerReplicas[numOfLearnerReplicas].fqdn, pDnode->fqdn, TSDB_FQDN_LEN);
createReq.lastIndex = pObj->lastIndex;
createEpset.inUse = 0;
createEpset.numOfEps = 1;
......@@ -378,23 +420,78 @@ static int32_t mndSetCreateMnodeRedoActions(SMnode *pMnode, STrans *pTrans, SDno
return 0;
}
static int32_t mndSetAlterMnodeTypeRedoActions(SMnode *pMnode, STrans *pTrans, SDnodeObj *pDnode, SMnodeObj *pObj) {
SSdb *pSdb = pMnode->pSdb;
void *pIter = NULL;
SDAlterMnodeTypeReq alterReq = {0};
SEpSet createEpset = {0};
while (1) {
SMnodeObj *pMObj = NULL;
pIter = sdbFetch(pSdb, SDB_MNODE, pIter, (void **)&pMObj);
if (pIter == NULL) break;
if(pMObj->role == TAOS_SYNC_ROLE_VOTER){
alterReq.replicas[alterReq.replica].id = pMObj->id;
alterReq.replicas[alterReq.replica].port = pMObj->pDnode->port;
memcpy(alterReq.replicas[alterReq.replica].fqdn, pMObj->pDnode->fqdn, TSDB_FQDN_LEN);
alterReq.replica++;
}
else{
alterReq.learnerReplicas[alterReq.learnerReplica].id = pMObj->id;
alterReq.learnerReplicas[alterReq.learnerReplica].port = pMObj->pDnode->port;
memcpy(alterReq.learnerReplicas[alterReq.learnerReplica].fqdn, pMObj->pDnode->fqdn, TSDB_FQDN_LEN);
alterReq.learnerReplica++;
}
sdbRelease(pSdb, pMObj);
}
alterReq.replicas[alterReq.replica].id = pDnode->id;
alterReq.replicas[alterReq.replica].port = pDnode->port;
memcpy(alterReq.replicas[alterReq.replica].fqdn, pDnode->fqdn, TSDB_FQDN_LEN);
alterReq.replica++;
alterReq.lastIndex = pObj->lastIndex;
createEpset.inUse = 0;
createEpset.numOfEps = 1;
createEpset.eps[0].port = pDnode->port;
memcpy(createEpset.eps[0].fqdn, pDnode->fqdn, TSDB_FQDN_LEN);
if (mndBuildAlterMnodeTypeRedoAction(pTrans, &alterReq, &createEpset) != 0) return -1;
return 0;
}
static int32_t mndCreateMnode(SMnode *pMnode, SRpcMsg *pReq, SDnodeObj *pDnode, SMCreateMnodeReq *pCreate) {
int32_t code = -1;
SMnodeObj mnodeObj = {0};
mnodeObj.id = pDnode->id;
mnodeObj.createdTime = taosGetTimestampMs();
mnodeObj.updateTime = mnodeObj.createdTime;
STrans *pTrans = mndTransCreate(pMnode, TRN_POLICY_RETRY, TRN_CONFLICT_GLOBAL, pReq, "create-mnode");
if (pTrans == NULL) goto _OVER;
mndTransSetSerial(pTrans);
mInfo("trans:%d, used to create mnode:%d", pTrans->id, pCreate->dnodeId);
if (mndTrancCheckConflict(pMnode, pTrans) != 0) goto _OVER;
SMnodeObj mnodeObj = {0};
mnodeObj.id = pDnode->id;
mnodeObj.createdTime = taosGetTimestampMs();
mnodeObj.updateTime = mnodeObj.createdTime;
mnodeObj.role = TAOS_SYNC_ROLE_LEARNER;
mnodeObj.lastIndex = pMnode->applied;
if (mndSetCreateMnodeRedoActions(pMnode, pTrans, pDnode, &mnodeObj) != 0) goto _OVER;
if (mndSetCreateMnodeRedoLogs(pMnode, pTrans, &mnodeObj) != 0) goto _OVER;
if (mndSetCreateMnodeCommitLogs(pMnode, pTrans, &mnodeObj) != 0) goto _OVER;
SMnodeObj mnodeLeaderObj = {0};
mnodeLeaderObj.id = pDnode->id;
mnodeLeaderObj.createdTime = taosGetTimestampMs();
mnodeLeaderObj.updateTime = mnodeLeaderObj.createdTime;
mnodeLeaderObj.role = TAOS_SYNC_ROLE_VOTER;
mnodeLeaderObj.lastIndex = pMnode->applied + 1;
if (mndSetAlterMnodeTypeRedoActions(pMnode, pTrans, pDnode, &mnodeLeaderObj) != 0) goto _OVER;
if (mndSetCreateMnodeCommitLogs(pMnode, pTrans, &mnodeLeaderObj) != 0) goto _OVER;
if (mndTransPrepare(pMnode, pTrans) != 0) goto _OVER;
code = 0;
......@@ -514,6 +611,7 @@ static int32_t mndSetDropMnodeRedoActions(SMnode *pMnode, STrans *pTrans, SDnode
int32_t mndSetDropMnodeInfoToTrans(SMnode *pMnode, STrans *pTrans, SMnodeObj *pObj, bool force) {
if (pObj == NULL) return 0;
pObj->lastIndex = pMnode->applied;
if (mndSetDropMnodeRedoActions(pMnode, pTrans, pObj->pDnode, pObj, force) != 0) return -1;
if (mndSetDropMnodeCommitLogs(pMnode, pTrans, pObj) != 0) return -1;
return 0;
......@@ -730,7 +828,8 @@ static void mndReloadSyncConfig(SMnode *pMnode) {
void *pIter = NULL;
int32_t updatingMnodes = 0;
int32_t readyMnodes = 0;
SSyncCfg cfg = {.myIndex = -1};
SSyncCfg cfg = {.myIndex = -1, .lastIndex = 0,};
SyncIndex maxIndex = 0;
while (1) {
pIter = sdbFetchAll(pSdb, SDB_MNODE, pIter, (void **)&pObj, &objStatus, false);
......@@ -745,26 +844,41 @@ static void mndReloadSyncConfig(SMnode *pMnode) {
}
if (objStatus == SDB_STATUS_READY || objStatus == SDB_STATUS_CREATING) {
SNodeInfo *pNode = &cfg.nodeInfo[cfg.replicaNum];
SNodeInfo *pNode = &cfg.nodeInfo[cfg.totalReplicaNum];
pNode->nodeId = pObj->pDnode->id;
pNode->clusterId = mndGetClusterId(pMnode);
pNode->nodePort = pObj->pDnode->port;
pNode->nodeRole = pObj->role;
tstrncpy(pNode->nodeFqdn, pObj->pDnode->fqdn, TSDB_FQDN_LEN);
(void)tmsgUpdateDnodeInfo(&pNode->nodeId, &pNode->clusterId, pNode->nodeFqdn, &pNode->nodePort);
mInfo("vgId:1, ep:%s:%u dnode:%d", pNode->nodeFqdn, pNode->nodePort, pNode->nodeId);
if (pObj->pDnode->id == pMnode->selfDnodeId) {
cfg.myIndex = cfg.replicaNum;
cfg.myIndex = cfg.totalReplicaNum;
}
if(pNode->nodeRole == TAOS_SYNC_ROLE_VOTER){
cfg.replicaNum++;
}
cfg.totalReplicaNum++;
if(pObj->lastIndex > cfg.lastIndex){
cfg.lastIndex = pObj->lastIndex;
}
}
if (objStatus == SDB_STATUS_DROPPING) {
if(pObj->lastIndex > cfg.lastIndex){
cfg.lastIndex = pObj->lastIndex;
}
cfg.replicaNum++;
}
mInfo("vgId:1, mnode:%d, role:%d, lastIndex:%" PRId64, pObj->id, pObj->role, pObj->lastIndex);
sdbReleaseLock(pSdb, pObj, false);
}
if (readyMnodes <= 0 || updatingMnodes <= 0) {
mInfo("vgId:1, mnode sync not reconfig since readyMnodes:%d updatingMnodes:%d", readyMnodes, updatingMnodes);
return;
}
//if (readyMnodes <= 0 || updatingMnodes <= 0) {
// mInfo("vgId:1, mnode sync not reconfig since readyMnodes:%d updatingMnodes:%d", readyMnodes, updatingMnodes);
// return;
//}
if (cfg.myIndex == -1) {
#if 1
......@@ -777,12 +891,14 @@ static void mndReloadSyncConfig(SMnode *pMnode) {
return;
}
if (updatingMnodes > 0) {
mInfo("vgId:1, mnode sync reconfig, replica:%d myIndex:%d", cfg.replicaNum, cfg.myIndex);
for (int32_t i = 0; i < cfg.replicaNum; ++i) {
if (pMnode->syncMgmt.sync > 0) {
mInfo("vgId:1, mnode sync reconfig, totalReplica:%d replica:%d myIndex:%d",
cfg.totalReplicaNum, cfg.replicaNum, cfg.myIndex);
for (int32_t i = 0; i < cfg.totalReplicaNum; ++i) {
SNodeInfo *pNode = &cfg.nodeInfo[i];
mInfo("vgId:1, index:%d, ep:%s:%u dnode:%d cluster:%" PRId64, i, pNode->nodeFqdn, pNode->nodePort, pNode->nodeId,
pNode->clusterId);
mInfo("vgId:1, index:%d, ep:%s:%u dnode:%d cluster:%" PRId64 " role:%d", i, pNode->nodeFqdn, pNode->nodePort,
pNode->nodeId, pNode->clusterId, pNode->nodeRole);
}
int32_t code = syncReconfig(pMnode->syncMgmt.sync, &cfg);
......
......@@ -237,6 +237,23 @@ static void mndBecomeFollower(const SSyncFSM *pFsm) {
taosThreadMutexUnlock(&pMgmt->lock);
}
static void mndBecomeLearner(const SSyncFSM *pFsm) {
SMnode *pMnode = pFsm->data;
SSyncMgmt *pMgmt = &pMnode->syncMgmt;
mInfo("vgId:1, become learner");
taosThreadMutexLock(&pMgmt->lock);
if (pMgmt->transId != 0) {
mInfo("vgId:1, become learner and post sem, trans:%d, failed to propose since not leader", pMgmt->transId);
pMgmt->transId = 0;
pMgmt->transSec = 0;
pMgmt->transSeq = 0;
pMgmt->errCode = TSDB_CODE_SYN_NOT_LEADER;
tsem_post(&pMgmt->syncSem);
}
taosThreadMutexUnlock(&pMgmt->lock);
}
static void mndBecomeLeader(const SSyncFSM *pFsm) {
mInfo("vgId:1, become leader");
SMnode *pMnode = pFsm->data;
......@@ -278,6 +295,7 @@ SSyncFSM *mndSyncMakeFsm(SMnode *pMnode) {
pFsm->FpReConfigCb = NULL;
pFsm->FpBecomeLeaderCb = mndBecomeLeader;
pFsm->FpBecomeFollowerCb = mndBecomeFollower;
pFsm->FpBecomeLearnerCb = mndBecomeLearner;
pFsm->FpGetSnapshot = mndSyncGetSnapshot;
pFsm->FpGetSnapshotInfo = mndSyncGetSnapshotInfo;
pFsm->FpSnapshotStartRead = mndSnapshotStartRead;
......@@ -317,13 +335,16 @@ int32_t mndInitSync(SMnode *pMnode) {
mInfo("vgId:1, start to open sync, replica:%d selfIndex:%d", pMgmt->numOfReplicas, pMgmt->selfIndex);
SSyncCfg *pCfg = &syncInfo.syncCfg;
pCfg->totalReplicaNum = pMgmt->numOfTotalReplicas;
pCfg->replicaNum = pMgmt->numOfReplicas;
pCfg->myIndex = pMgmt->selfIndex;
for (int32_t i = 0; i < pMgmt->numOfReplicas; ++i) {
pCfg->lastIndex = pMgmt->lastIndex;
for (int32_t i = 0; i < pMgmt->numOfTotalReplicas; ++i) {
SNodeInfo *pNode = &pCfg->nodeInfo[i];
pNode->nodeId = pMgmt->replicas[i].id;
pNode->nodePort = pMgmt->replicas[i].port;
tstrncpy(pNode->nodeFqdn, pMgmt->replicas[i].fqdn, sizeof(pNode->nodeFqdn));
pNode->nodeRole = pMgmt->nodeRoles[i];
(void)tmsgUpdateDnodeInfo(&pNode->nodeId, &pNode->clusterId, pNode->nodeFqdn, &pNode->nodePort);
mInfo("vgId:1, index:%d ep:%s:%u dnode:%d cluster:%" PRId64, i, pNode->nodeFqdn, pNode->nodePort, pNode->nodeId,
pNode->clusterId);
......
......@@ -62,6 +62,7 @@ int32_t mndInitVgroup(SMnode *pMnode) {
mndSetMsgHandle(pMnode, TDMT_VND_COMPACT_RSP, mndTransProcessRsp);
mndSetMsgHandle(pMnode, TDMT_VND_DISABLE_WRITE_RSP, mndTransProcessRsp);
mndSetMsgHandle(pMnode, TDMT_SYNC_FORCE_FOLLOWER_RSP, mndTransProcessRsp);
mndSetMsgHandle(pMnode, TDMT_DND_ALTER_VNODE_TYPE_RSP, mndTransProcessRsp);
mndSetMsgHandle(pMnode, TDMT_MND_REDISTRIBUTE_VGROUP, mndProcessRedistributeVgroupMsg);
mndSetMsgHandle(pMnode, TDMT_MND_SPLIT_VGROUP, mndProcessSplitVgroupMsg);
......@@ -203,7 +204,7 @@ static int32_t mndVgroupActionUpdate(SSdb *pSdb, SVgObj *pOld, SVgObj *pNew) {
pNew->compStorage = pOld->compStorage;
pNew->pointsWritten = pOld->pointsWritten;
pNew->compact = pOld->compact;
memcpy(pOld->vnodeGid, pNew->vnodeGid, TSDB_MAX_REPLICA * sizeof(SVnodeGid));
memcpy(pOld->vnodeGid, pNew->vnodeGid, (TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA) * sizeof(SVnodeGid));
return 0;
}
......@@ -244,8 +245,10 @@ void *mndBuildCreateVnodeReq(SMnode *pMnode, SDnodeObj *pDnode, SDbObj *pDb, SVg
createReq.compression = pDb->cfg.compression;
createReq.strict = pDb->cfg.strict;
createReq.cacheLast = pDb->cfg.cacheLast;
createReq.replica = pVgroup->replica;
createReq.replica = 0;
createReq.learnerReplica = 0;
createReq.selfIndex = -1;
createReq.learnerSelfIndex = -1;
createReq.hashBegin = pVgroup->hashBegin;
createReq.hashEnd = pVgroup->hashEnd;
createReq.hashMethod = pDb->cfg.hashMethod;
......@@ -263,7 +266,15 @@ void *mndBuildCreateVnodeReq(SMnode *pMnode, SDnodeObj *pDnode, SDbObj *pDb, SVg
createReq.tsdbPageSize = pDb->cfg.tsdbPageSize;
for (int32_t v = 0; v < pVgroup->replica; ++v) {
SReplica *pReplica = &createReq.replicas[v];
SReplica *pReplica = NULL;
if(pVgroup->vnodeGid[v].nodeRole == TAOS_SYNC_ROLE_VOTER){
pReplica = &createReq.replicas[createReq.replica];
}
else{
pReplica = &createReq.learnerReplicas[createReq.learnerReplica];
}
SVnodeGid *pVgid = &pVgroup->vnodeGid[v];
SDnodeObj *pVgidDnode = mndAcquireDnode(pMnode, pVgid->dnodeId);
if (pVgidDnode == NULL) {
......@@ -275,21 +286,40 @@ void *mndBuildCreateVnodeReq(SMnode *pMnode, SDnodeObj *pDnode, SDbObj *pDb, SVg
memcpy(pReplica->fqdn, pVgidDnode->fqdn, TSDB_FQDN_LEN);
mndReleaseDnode(pMnode, pVgidDnode);
if (pDnode->id == pVgid->dnodeId) {
createReq.selfIndex = v;
if(pVgroup->vnodeGid[v].nodeRole == TAOS_SYNC_ROLE_VOTER){
if (pDnode->id == pVgid->dnodeId) {
createReq.selfIndex = createReq.replica;
}
}
else{
if (pDnode->id == pVgid->dnodeId) {
createReq.learnerSelfIndex = createReq.learnerReplica;
}
}
if(pVgroup->vnodeGid[v].nodeRole == TAOS_SYNC_ROLE_VOTER){
createReq.replica++;
}
else{
createReq.learnerReplica++;
}
}
if (createReq.selfIndex == -1) {
if (createReq.selfIndex == -1 && createReq.learnerSelfIndex == -1) {
terrno = TSDB_CODE_APP_ERROR;
return NULL;
}
mInfo("vgId:%d, build create vnode req, replica:%d selfIndex:%d strict:%d", createReq.vgId, createReq.replica,
createReq.selfIndex, createReq.strict);
mInfo("vgId:%d, build create vnode req, replica:%d selfIndex:%d learnerReplica:%d learnerSelfIndex:%d strict:%d",
createReq.vgId, createReq.replica, createReq.selfIndex, createReq.learnerReplica,
createReq.learnerReplica, createReq.strict);
for (int32_t i = 0; i < createReq.replica; ++i) {
mInfo("vgId:%d, replica:%d ep:%s:%u", createReq.vgId, i, createReq.replicas[i].fqdn, createReq.replicas[i].port);
}
for (int32_t i = 0; i < createReq.learnerReplica; ++i) {
mInfo("vgId:%d, replica:%d ep:%s:%u", createReq.vgId, i, createReq.learnerReplicas[i].fqdn,
createReq.learnerReplicas[i].port);
}
int32_t contLen = tSerializeSCreateVnodeReq(NULL, 0, &createReq);
if (contLen < 0) {
......@@ -356,12 +386,24 @@ static void *mndBuildAlterVnodeReplicaReq(SMnode *pMnode, SDbObj *pDb, SVgObj *p
SAlterVnodeReplicaReq alterReq = {
.vgId = pVgroup->vgId,
.strict = pDb->cfg.strict,
.replica = pVgroup->replica,
.replica = 0,
.learnerReplica = 0,
.selfIndex = -1,
.learnerSelfIndex = -1,
};
for (int32_t v = 0; v < pVgroup->replica; ++v) {
SReplica *pReplica = &alterReq.replicas[v];
SReplica *pReplica = NULL;
if(pVgroup->vnodeGid[v].nodeRole == TAOS_SYNC_ROLE_VOTER){
pReplica = &alterReq.replicas[alterReq.replica];
alterReq.replica++;
}
else{
pReplica = &alterReq.learnerReplicas[alterReq.learnerReplica];
alterReq.learnerReplica++;
}
SVnodeGid *pVgid = &pVgroup->vnodeGid[v];
SDnodeObj *pVgidDnode = mndAcquireDnode(pMnode, pVgid->dnodeId);
if (pVgidDnode == NULL) return NULL;
......@@ -371,18 +413,30 @@ static void *mndBuildAlterVnodeReplicaReq(SMnode *pMnode, SDbObj *pDb, SVgObj *p
memcpy(pReplica->fqdn, pVgidDnode->fqdn, TSDB_FQDN_LEN);
mndReleaseDnode(pMnode, pVgidDnode);
if (dnodeId == pVgid->dnodeId) {
alterReq.selfIndex = v;
if(pVgroup->vnodeGid[v].nodeRole == TAOS_SYNC_ROLE_VOTER){
if (dnodeId == pVgid->dnodeId) {
alterReq.selfIndex = v;
}
}
else{
if (dnodeId == pVgid->dnodeId) {
alterReq.learnerSelfIndex = v;
}
}
}
alterReq.replica = pVgroup->replica;
mInfo("vgId:%d, build alter vnode req, replica:%d selfIndex:%d strict:%d", alterReq.vgId, alterReq.replica,
alterReq.selfIndex, alterReq.strict);
mInfo("vgId:%d, build alter vnode req, replica:%d selfIndex:%d learnerReplica:%d learnerSelfIndex:%d strict:%d",
alterReq.vgId, alterReq.replica, alterReq.selfIndex, alterReq.learnerReplica,
alterReq.learnerSelfIndex, alterReq.strict);
for (int32_t i = 0; i < alterReq.replica; ++i) {
mInfo("vgId:%d, replica:%d ep:%s:%u", alterReq.vgId, i, alterReq.replicas[i].fqdn, alterReq.replicas[i].port);
}
for (int32_t i = 0; i < alterReq.learnerReplica; ++i) {
mInfo("vgId:%d, learnerReplica:%d ep:%s:%u", alterReq.vgId, i,
alterReq.learnerReplicas[i].fqdn, alterReq.learnerReplicas[i].port);
}
if (alterReq.selfIndex == -1) {
if (alterReq.selfIndex == -1 && alterReq.learnerSelfIndex == -1) {
terrno = TSDB_CODE_APP_ERROR;
return NULL;
}
......@@ -1194,6 +1248,30 @@ int32_t mndAddAlterVnodeReplicaAction(SMnode *pMnode, STrans *pTrans, SDbObj *pD
return 0;
}
int32_t mndAddAlterVnodeTypeAction(SMnode *pMnode, STrans *pTrans, SDbObj *pDb, SVgObj *pVgroup, int32_t dnodeId) {
SDnodeObj *pDnode = mndAcquireDnode(pMnode, dnodeId);
if (pDnode == NULL) return -1;
STransAction action = {0};
action.epSet = mndGetDnodeEpset(pDnode);
mndReleaseDnode(pMnode, pDnode);
int32_t contLen = 0;
void *pReq = mndBuildAlterVnodeReplicaReq(pMnode, pDb, pVgroup, dnodeId, &contLen);
if (pReq == NULL) return -1;
action.pCont = pReq;
action.contLen = contLen;
action.msgType = TDMT_DND_ALTER_VNODE_TYPE;
if (mndTransAppendRedoAction(pTrans, &action) != 0) {
taosMemoryFree(pReq);
return -1;
}
return 0;
}
static int32_t mndAddDisableVnodeWriteAction(SMnode *pMnode, STrans *pTrans, SDbObj *pDb, SVgObj *pVgroup,
int32_t dnodeId) {
SDnodeObj *pDnode = mndAcquireDnode(pMnode, dnodeId);
......@@ -1953,18 +2031,33 @@ int32_t mndBuildAlterVgroupAction(SMnode *pMnode, STrans *pTrans, SDbObj *pOldDb
mInfo("db:%s, vgId:%d, will add 2 vnodes, vn:0 dnode:%d", pVgroup->dbName, pVgroup->vgId,
pVgroup->vnodeGid[0].dnodeId);
//add first
if (mndAddVnodeToVgroup(pMnode, pTrans, &newVgroup, pArray) != 0) return -1;
newVgroup.vnodeGid[0].nodeRole = TAOS_SYNC_ROLE_VOTER;
newVgroup.vnodeGid[1].nodeRole = TAOS_SYNC_ROLE_LEARNER;
if (mndAddAlterVnodeReplicaAction(pMnode, pTrans, pNewDb, &newVgroup, newVgroup.vnodeGid[0].dnodeId) != 0)
return -1;
if (mndAddCreateVnodeAction(pMnode, pTrans, pNewDb, &newVgroup, &newVgroup.vnodeGid[1]) != 0) return -1;
newVgroup.vnodeGid[1].nodeRole = TAOS_SYNC_ROLE_VOTER;
if (mndAddAlterVnodeTypeAction(pMnode, pTrans, pNewDb, &newVgroup, newVgroup.vnodeGid[1].dnodeId) != 0)
return -1;
if (mndAddAlterVnodeConfirmAction(pMnode, pTrans, pNewDb, &newVgroup) != 0) return -1;
//add second
if (mndAddVnodeToVgroup(pMnode, pTrans, &newVgroup, pArray) != 0) return -1;
newVgroup.vnodeGid[0].nodeRole = TAOS_SYNC_ROLE_VOTER;
newVgroup.vnodeGid[1].nodeRole = TAOS_SYNC_ROLE_VOTER;
newVgroup.vnodeGid[2].nodeRole = TAOS_SYNC_ROLE_VOTER;
if (mndAddAlterVnodeReplicaAction(pMnode, pTrans, pNewDb, &newVgroup, newVgroup.vnodeGid[0].dnodeId) != 0)
return -1;
if (mndAddAlterVnodeReplicaAction(pMnode, pTrans, pNewDb, &newVgroup, newVgroup.vnodeGid[1].dnodeId) != 0)
return -1;
if (mndAddCreateVnodeAction(pMnode, pTrans, pNewDb, &newVgroup, &newVgroup.vnodeGid[2]) != 0) return -1;
if (mndAddAlterVnodeConfirmAction(pMnode, pTrans, pNewDb, &newVgroup) != 0) return -1;
} else if (newVgroup.replica == 3 && pNewDb->cfg.replications == 1) {
mInfo("db:%s, vgId:%d, will remove 2 vnodes, vn:0 dnode:%d vn:1 dnode:%d vn:2 dnode:%d", pVgroup->dbName,
......
......@@ -68,6 +68,7 @@ void vnodeGetSnapshot(SVnode *pVnode, SSnapshot *pSnapshot);
void vnodeGetInfo(SVnode *pVnode, const char **dbname, int32_t *vgId);
int32_t vnodeProcessCreateTSma(SVnode *pVnode, void *pCont, uint32_t contLen);
int32_t vnodeGetAllTableList(SVnode *pVnode, uint64_t uid, SArray *list);
int32_t vnodeIsCatchUp(SVnode *pVnode);
int32_t vnodeGetCtbIdList(SVnode *pVnode, int64_t suid, SArray *list);
int32_t vnodeGetCtbIdListByFilter(SVnode *pVnode, int64_t suid, SArray *list, bool (*filter)(void *arg), void *arg);
......
......@@ -57,6 +57,28 @@ int vnodeCheckCfg(const SVnodeCfg *pCfg) {
return 0;
}
const char* vnodeRoleToStr(ESyncRole role) {
switch (role) {
case TAOS_SYNC_ROLE_VOTER:
return "voter";
case TAOS_SYNC_ROLE_LEARNER:
return "learner";
default:
return "unknown";
}
}
const ESyncRole vnodeStrToRole(char* str) {
if(strcmp(str, "voter") == 0){
return TAOS_SYNC_ROLE_VOTER;
}
if(strcmp(str, "learner") == 0){
return TAOS_SYNC_ROLE_LEARNER;
}
return TAOS_SYNC_ROLE_ERROR;
}
int vnodeEncodeConfig(const void *pObj, SJson *pJson) {
const SVnodeCfg *pCfg = (SVnodeCfg *)pObj;
......@@ -117,6 +139,7 @@ int vnodeEncodeConfig(const void *pObj, SJson *pJson) {
if (tjsonAddIntegerToObject(pJson, "hashSuffix", pCfg->hashSuffix) < 0) return -1;
if (tjsonAddIntegerToObject(pJson, "syncCfg.replicaNum", pCfg->syncCfg.replicaNum) < 0) return -1;
if (tjsonAddIntegerToObject(pJson, "syncCfg.totalReplicaNum", pCfg->syncCfg.totalReplicaNum) < 0) return -1;
if (tjsonAddIntegerToObject(pJson, "syncCfg.myIndex", pCfg->syncCfg.myIndex) < 0) return -1;
if (tjsonAddIntegerToObject(pJson, "vndStats.stables", pCfg->vndStats.numOfSTables) < 0) return -1;
......@@ -128,9 +151,9 @@ int vnodeEncodeConfig(const void *pObj, SJson *pJson) {
SJson *nodeInfo = tjsonCreateArray();
if (nodeInfo == NULL) return -1;
if (tjsonAddItemToObject(pJson, "syncCfg.nodeInfo", nodeInfo) < 0) return -1;
vDebug("vgId:%d, encode config, replicas:%d selfIndex:%d", pCfg->vgId, pCfg->syncCfg.replicaNum,
pCfg->syncCfg.myIndex);
for (int i = 0; i < pCfg->syncCfg.replicaNum; ++i) {
vDebug("vgId:%d, encode config, replicas:%d totalReplicas:%d selfIndex:%d", pCfg->vgId, pCfg->syncCfg.replicaNum,
pCfg->syncCfg.totalReplicaNum, pCfg->syncCfg.myIndex);
for (int i = 0; i < pCfg->syncCfg.totalReplicaNum; ++i) {
SJson *info = tjsonCreateObject();
SNodeInfo *pNode = (SNodeInfo *)&pCfg->syncCfg.nodeInfo[i];
if (info == NULL) return -1;
......@@ -138,6 +161,7 @@ int vnodeEncodeConfig(const void *pObj, SJson *pJson) {
if (tjsonAddStringToObject(info, "nodeFqdn", pNode->nodeFqdn) < 0) return -1;
if (tjsonAddIntegerToObject(info, "nodeId", pNode->nodeId) < 0) return -1;
if (tjsonAddIntegerToObject(info, "clusterId", pNode->clusterId) < 0) return -1;
if (tjsonAddStringToObject(info, "nodeRole", vnodeRoleToStr(pNode->nodeRole)) < 0) return -1;
if (tjsonAddItemToArray(nodeInfo, info) < 0) return -1;
vDebug("vgId:%d, encode config, replica:%d ep:%s:%u dnode:%d", pCfg->vgId, i, pNode->nodeFqdn, pNode->nodePort,
pNode->nodeId);
......@@ -235,6 +259,8 @@ int vnodeDecodeConfig(const SJson *pJson, void *pObj) {
tjsonGetNumberValue(pJson, "syncCfg.replicaNum", pCfg->syncCfg.replicaNum, code);
if (code < 0) return -1;
tjsonGetNumberValue(pJson, "syncCfg.totalReplicaNum", pCfg->syncCfg.totalReplicaNum, code);
if (code < 0) return -1;
tjsonGetNumberValue(pJson, "syncCfg.myIndex", pCfg->syncCfg.myIndex, code);
if (code < 0) return -1;
......@@ -251,10 +277,13 @@ int vnodeDecodeConfig(const SJson *pJson, void *pObj) {
SJson *nodeInfo = tjsonGetObjectItem(pJson, "syncCfg.nodeInfo");
int arraySize = tjsonGetArraySize(nodeInfo);
if (arraySize != pCfg->syncCfg.replicaNum) return -1;
if(pCfg->syncCfg.totalReplicaNum == 0 && pCfg->syncCfg.replicaNum > 0){
pCfg->syncCfg.totalReplicaNum = pCfg->syncCfg.replicaNum;
}
if (arraySize != pCfg->syncCfg.totalReplicaNum) return -1;
vDebug("vgId:%d, decode config, replicas:%d selfIndex:%d", pCfg->vgId, pCfg->syncCfg.replicaNum,
pCfg->syncCfg.myIndex);
vDebug("vgId:%d, decode config, replicas:%d totalReplicas:%d selfIndex:%d", pCfg->vgId, pCfg->syncCfg.replicaNum,
pCfg->syncCfg.totalReplicaNum, pCfg->syncCfg.myIndex);
for (int i = 0; i < arraySize; ++i) {
SJson *info = tjsonGetArrayItem(nodeInfo, i);
SNodeInfo *pNode = &pCfg->syncCfg.nodeInfo[i];
......@@ -266,6 +295,15 @@ int vnodeDecodeConfig(const SJson *pJson, void *pObj) {
if (code < 0) return -1;
tjsonGetNumberValue(info, "clusterId", pNode->clusterId, code);
if (code < 0) return -1;
char role[10] = {0};
code = tjsonGetStringValue(info, "nodeRole", role);
if (code < 0) return -1;
if(strlen(role) != 0){
pNode->nodeRole = vnodeStrToRole(role);
}
else{
pNode->nodeRole = TAOS_SYNC_ROLE_VOTER;
}
vDebug("vgId:%d, decode config, replica:%d ep:%s:%u dnode:%d", pCfg->vgId, i, pNode->nodeFqdn, pNode->nodePort,
pNode->nodeId);
}
......
......@@ -76,19 +76,41 @@ int32_t vnodeAlterReplica(const char *path, SAlterVnodeReplicaReq *pReq, STfs *p
}
SSyncCfg *pCfg = &info.config.syncCfg;
pCfg->myIndex = pReq->selfIndex;
pCfg->replicaNum = pReq->replica;
pCfg->replicaNum = 0;
pCfg->totalReplicaNum = 0;
memset(&pCfg->nodeInfo, 0, sizeof(pCfg->nodeInfo));
vInfo("vgId:%d, save config while alter, replicas:%d selfIndex:%d", pReq->vgId, pCfg->replicaNum, pCfg->myIndex);
for (int i = 0; i < pReq->replica; ++i) {
SNodeInfo *pNode = &pCfg->nodeInfo[i];
pNode->nodeId = pReq->replicas[i].id;
pNode->nodePort = pReq->replicas[i].port;
tstrncpy(pNode->nodeFqdn, pReq->replicas[i].fqdn, sizeof(pNode->nodeFqdn));
pNode->nodeRole = TAOS_SYNC_ROLE_VOTER;
(void)tmsgUpdateDnodeInfo(&pNode->nodeId, &pNode->clusterId, pNode->nodeFqdn, &pNode->nodePort);
vInfo("vgId:%d, replica:%d ep:%s:%u dnode:%d", pReq->vgId, i, pNode->nodeFqdn, pNode->nodePort, pNode->nodeId);
pCfg->replicaNum++;
}
if(pReq->selfIndex != -1){
pCfg->myIndex = pReq->selfIndex;
}
for (int i = pCfg->replicaNum; i < pReq->replica + pReq->learnerReplica; ++i) {
SNodeInfo *pNode = &pCfg->nodeInfo[i];
pNode->nodeId = pReq->learnerReplicas[pCfg->totalReplicaNum].id;
pNode->nodePort = pReq->learnerReplicas[pCfg->totalReplicaNum].port;
pNode->nodeRole = TAOS_SYNC_ROLE_LEARNER;
tstrncpy(pNode->nodeFqdn, pReq->learnerReplicas[pCfg->totalReplicaNum].fqdn, sizeof(pNode->nodeFqdn));
(void)tmsgUpdateDnodeInfo(&pNode->nodeId, &pNode->clusterId, pNode->nodeFqdn, &pNode->nodePort);
vInfo("vgId:%d, replica:%d ep:%s:%u dnode:%d", pReq->vgId, i, pNode->nodeFqdn, pNode->nodePort, pNode->nodeId);
pCfg->totalReplicaNum++;
}
pCfg->totalReplicaNum += pReq->replica;
if(pReq->learnerSelfIndex != -1){
pCfg->myIndex = pReq->replica + pReq->learnerSelfIndex;
}
vInfo("vgId:%d, save config while alter, replicas:%d totalReplicas:%d selfIndex:%d",
pReq->vgId, pCfg->replicaNum, pCfg->totalReplicaNum, pCfg->myIndex);
info.config.syncCfg = *pCfg;
ret = vnodeSaveInfo(dir, &info);
......@@ -181,6 +203,7 @@ int32_t vnodeAlterHashRange(const char *srcPath, const char *dstPath, SAlterVnod
SSyncCfg *pCfg = &info.config.syncCfg;
pCfg->myIndex = 0;
pCfg->replicaNum = 1;
pCfg->totalReplicaNum = 1;
memset(&pCfg->nodeInfo, 0, sizeof(pCfg->nodeInfo));
vInfo("vgId:%d, alter vnode replicas to 1", pReq->srcVgId);
......@@ -248,7 +271,7 @@ SVnode *vnodeOpen(const char *path, STfs *pTfs, SMsgCb msgCb) {
// save vnode info on dnode ep changed
bool updated = false;
SSyncCfg *pCfg = &info.config.syncCfg;
for (int32_t i = 0; i < pCfg->replicaNum; ++i) {
for (int32_t i = 0; i < pCfg->totalReplicaNum; ++i) {
SNodeInfo *pNode = &pCfg->nodeInfo[i];
if (tmsgUpdateDnodeInfo(&pNode->nodeId, &pNode->clusterId, pNode->nodeFqdn, &pNode->nodePort)) {
updated = true;
......@@ -404,6 +427,10 @@ void vnodeClose(SVnode *pVnode) {
// start the sync timer after the queue is ready
int32_t vnodeStart(SVnode *pVnode) { return vnodeSyncStart(pVnode); }
int32_t vnodeIsCatchUp(SVnode *pVnode){
return syncIsCatchUp(pVnode->sync);
}
void vnodeStop(SVnode *pVnode) {}
int64_t vnodeGetSyncHandle(SVnode *pVnode) { return pVnode->sync; }
......
......@@ -1447,7 +1447,8 @@ int32_t vnodeProcessCreateTSma(SVnode *pVnode, void *pCont, uint32_t contLen) {
}
static int32_t vnodeProcessAlterConfirmReq(SVnode *pVnode, int64_t version, void *pReq, int32_t len, SRpcMsg *pRsp) {
vInfo("vgId:%d, alter replica confim msg is processed", TD_VID(pVnode));
vInfo("vgId:%d, vnode management handle msgType:alter-confirm, alter replica confim msg is processed",
TD_VID(pVnode));
pRsp->msgType = TDMT_VND_ALTER_CONFIRM_RSP;
pRsp->code = TSDB_CODE_SUCCESS;
pRsp->pCont = NULL;
......
......@@ -566,6 +566,19 @@ static void vnodeBecomeFollower(const SSyncFSM *pFsm) {
taosThreadMutexUnlock(&pVnode->lock);
}
static void vnodeBecomeLearner(const SSyncFSM *pFsm) {
SVnode *pVnode = pFsm->data;
vInfo("vgId:%d, become learner", pVnode->config.vgId);
taosThreadMutexLock(&pVnode->lock);
if (pVnode->blocked) {
pVnode->blocked = false;
vDebug("vgId:%d, become learner and post block", pVnode->config.vgId);
tsem_post(&pVnode->syncSem);
}
taosThreadMutexUnlock(&pVnode->lock);
}
static void vnodeBecomeLeader(const SSyncFSM *pFsm) {
SVnode *pVnode = pFsm->data;
vDebug("vgId:%d, become leader", pVnode->config.vgId);
......@@ -607,6 +620,7 @@ static SSyncFSM *vnodeSyncMakeFsm(SVnode *pVnode) {
pFsm->FpApplyQueueItems = vnodeApplyQueueItems;
pFsm->FpBecomeLeaderCb = vnodeBecomeLeader;
pFsm->FpBecomeFollowerCb = vnodeBecomeFollower;
pFsm->FpBecomeLearnerCb = vnodeBecomeLearner;
pFsm->FpReConfigCb = NULL;
pFsm->FpSnapshotStartRead = vnodeSnapshotStartRead;
pFsm->FpSnapshotStopRead = vnodeSnapshotStopRead;
......@@ -639,7 +653,7 @@ int32_t vnodeSyncOpen(SVnode *pVnode, char *path) {
SSyncCfg *pCfg = &syncInfo.syncCfg;
vInfo("vgId:%d, start to open sync, replica:%d selfIndex:%d", pVnode->config.vgId, pCfg->replicaNum, pCfg->myIndex);
for (int32_t i = 0; i < pCfg->replicaNum; ++i) {
for (int32_t i = 0; i < pCfg->totalReplicaNum; ++i) {
SNodeInfo *pNode = &pCfg->nodeInfo[i];
vInfo("vgId:%d, index:%d ep:%s:%u dnode:%d cluster:%" PRId64, pVnode->config.vgId, i, pNode->nodeFqdn, pNode->nodePort,
pNode->nodeId, pNode->clusterId);
......
......@@ -24,12 +24,13 @@ extern "C" {
// SIndexMgr -----------------------------
typedef struct SSyncIndexMgr {
SRaftId (*replicas)[TSDB_MAX_REPLICA];
SyncIndex index[TSDB_MAX_REPLICA];
SyncTerm privateTerm[TSDB_MAX_REPLICA]; // for advanced function
int64_t startTimeArr[TSDB_MAX_REPLICA];
int64_t recvTimeArr[TSDB_MAX_REPLICA];
SRaftId (*replicas)[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA];
SyncIndex index[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA];
SyncTerm privateTerm[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA]; // for advanced function
int64_t startTimeArr[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA];
int64_t recvTimeArr[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA];
int32_t replicaNum;
int32_t totalReplicaNum;
SSyncNode *pNode;
} SSyncIndexMgr;
......
......@@ -54,13 +54,13 @@ typedef struct SSyncLogReplMgr SSyncLogReplMgr;
#define MAX_CONFIG_INDEX_COUNT 256
typedef struct SRaftCfg {
SSyncCfg cfg;
int32_t batchSize;
int8_t isStandBy;
int8_t snapshotStrategy;
SyncIndex lastConfigIndex;
int32_t configIndexCount;
SyncIndex configIndexArr[MAX_CONFIG_INDEX_COUNT];
SSyncCfg cfg;
int32_t batchSize;
int8_t isStandBy;
int8_t snapshotStrategy;
SyncIndex lastConfigIndex;
int32_t configIndexCount;
SyncIndex configIndexArr[MAX_CONFIG_INDEX_COUNT];
} SRaftCfg;
typedef struct SRaftId {
......@@ -127,12 +127,13 @@ typedef struct SSyncNode {
SRaftId myRaftId;
int32_t peersNum;
SNodeInfo peersNodeInfo[TSDB_MAX_REPLICA];
SEpSet peersEpset[TSDB_MAX_REPLICA];
SRaftId peersId[TSDB_MAX_REPLICA];
SNodeInfo peersNodeInfo[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA];
SEpSet peersEpset[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA];
SRaftId peersId[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA];
int32_t replicaNum;
SRaftId replicasId[TSDB_MAX_REPLICA];
int32_t totalReplicaNum;
SRaftId replicasId[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA];
// raft algorithm
SSyncFSM* pFsm;
......@@ -188,7 +189,7 @@ typedef struct SSyncNode {
uint64_t heartbeatTimerCounter;
// peer heartbeat timer
SSyncTimer peerHeartbeatTimerArr[TSDB_MAX_REPLICA];
SSyncTimer peerHeartbeatTimerArr[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA];
// tools
SSyncRespMgr* pSyncRespMgr;
......@@ -196,13 +197,13 @@ typedef struct SSyncNode {
// restore state
bool restoreFinish;
// SSnapshot* pSnapshot;
SSyncSnapshotSender* senders[TSDB_MAX_REPLICA];
SSyncSnapshotSender* senders[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA];
SSyncSnapshotReceiver* pNewNodeReceiver;
// log replication mgr
SSyncLogReplMgr* logReplMgrs[TSDB_MAX_REPLICA];
SSyncLogReplMgr* logReplMgrs[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA];
SPeerState peerStates[TSDB_MAX_REPLICA];
SPeerState peerStates[TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA];
// is config changing
bool changing;
......@@ -275,6 +276,7 @@ void syncNodeUpdateTerm(SSyncNode* pSyncNode, SyncTerm term);
void syncNodeUpdateTermWithoutStepDown(SSyncNode* pSyncNode, SyncTerm term);
void syncNodeStepDown(SSyncNode* pSyncNode, SyncTerm newTerm);
void syncNodeBecomeFollower(SSyncNode* pSyncNode, const char* debugStr);
void syncNodeBecomeLearner(SSyncNode* pSyncNode, const char* debugStr);
void syncNodeBecomeLeader(SSyncNode* pSyncNode, const char* debugStr);
void syncNodeCandidate2Leader(SSyncNode* pSyncNode);
void syncNodeFollower2Candidate(SSyncNode* pSyncNode);
......
......@@ -237,6 +237,7 @@ typedef struct SyncLeaderTransfer {
typedef enum {
SYNC_LOCAL_CMD_STEP_DOWN = 100,
SYNC_LOCAL_CMD_FOLLOWER_CMT,
SYNC_LOCAL_CMD_LEARNER_CMT,
} ESyncLocalCmd;
typedef struct SyncLocalCmd {
......
......@@ -56,6 +56,8 @@ typedef struct SSyncLogBuffer {
int64_t size;
TdThreadMutex mutex;
TdThreadMutexAttr attr;
int64_t totalIndex;
bool isCatchup;
} SSyncLogBuffer;
// SSyncLogRepMgr
......
......@@ -137,8 +137,10 @@ int32_t syncNodeOnAppendEntries(SSyncNode* ths, const SRpcMsg* pRpcMsg) {
pReply->term = pMsg->term;
}
syncNodeStepDown(ths, pMsg->term);
resetElect = true;
if(ths->raftCfg.cfg.nodeInfo[ths->raftCfg.cfg.myIndex].nodeRole != TAOS_SYNC_ROLE_LEARNER){
syncNodeStepDown(ths, pMsg->term);
resetElect = true;
}
if (pMsg->dataLen < sizeof(SSyncRaftEntry)) {
sError("vgId:%d, incomplete append entries received. prev index:%" PRId64 ", term:%" PRId64 ", datalen:%d",
......
......@@ -41,6 +41,8 @@ static int32_t syncNodeRequestVotePeers(SSyncNode* pNode) {
int32_t ret = 0;
for (int i = 0; i < pNode->peersNum; ++i) {
if(pNode->peersNodeInfo[i].nodeRole == TAOS_SYNC_ROLE_LEARNER) continue;
SRpcMsg rpcMsg = {0};
ret = syncBuildRequestVote(&rpcMsg, pNode->vgId);
if (ret < 0) {
......
......@@ -26,6 +26,7 @@ SSyncIndexMgr *syncIndexMgrCreate(SSyncNode *pNode) {
pIndexMgr->replicas = &pNode->replicasId;
pIndexMgr->replicaNum = pNode->replicaNum;
pIndexMgr->totalReplicaNum = pNode->totalReplicaNum;
pIndexMgr->pNode = pNode;
syncIndexMgrClear(pIndexMgr);
......@@ -35,6 +36,7 @@ SSyncIndexMgr *syncIndexMgrCreate(SSyncNode *pNode) {
void syncIndexMgrUpdate(SSyncIndexMgr *pIndexMgr, SSyncNode *pNode) {
pIndexMgr->replicas = &pNode->replicasId;
pIndexMgr->replicaNum = pNode->replicaNum;
pIndexMgr->totalReplicaNum = pNode->totalReplicaNum;
pIndexMgr->pNode = pNode;
syncIndexMgrClear(pIndexMgr);
}
......@@ -50,14 +52,14 @@ void syncIndexMgrClear(SSyncIndexMgr *pIndexMgr) {
memset(pIndexMgr->privateTerm, 0, sizeof(pIndexMgr->privateTerm));
int64_t timeNow = taosGetTimestampMs();
for (int i = 0; i < pIndexMgr->replicaNum; ++i) {
for (int i = 0; i < pIndexMgr->totalReplicaNum; ++i) {
pIndexMgr->startTimeArr[i] = 0;
pIndexMgr->recvTimeArr[i] = timeNow;
}
}
void syncIndexMgrSetIndex(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftId, SyncIndex index) {
for (int i = 0; i < pIndexMgr->replicaNum; ++i) {
for (int i = 0; i < pIndexMgr->totalReplicaNum; ++i) {
if (syncUtilSameId(&((*(pIndexMgr->replicas))[i]), pRaftId)) {
(pIndexMgr->index)[i] = index;
return;
......@@ -69,7 +71,7 @@ void syncIndexMgrSetIndex(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftId, Sync
}
SSyncLogReplMgr *syncNodeGetLogReplMgr(SSyncNode *pNode, SRaftId *pRaftId) {
for (int i = 0; i < pNode->replicaNum; i++) {
for (int i = 0; i < pNode->totalReplicaNum; i++) {
if (syncUtilSameId(&pNode->replicasId[i], pRaftId)) {
return pNode->logReplMgrs[i];
}
......@@ -80,7 +82,7 @@ SSyncLogReplMgr *syncNodeGetLogReplMgr(SSyncNode *pNode, SRaftId *pRaftId) {
}
SyncIndex syncIndexMgrGetIndex(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftId) {
for (int i = 0; i < pIndexMgr->replicaNum; ++i) {
for (int i = 0; i < pIndexMgr->totalReplicaNum; ++i) {
if (syncUtilSameId(&((*(pIndexMgr->replicas))[i]), pRaftId)) {
SyncIndex idx = (pIndexMgr->index)[i];
return idx;
......@@ -93,7 +95,7 @@ SyncIndex syncIndexMgrGetIndex(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftId)
}
void syncIndexMgrSetStartTime(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftId, int64_t startTime) {
for (int i = 0; i < pIndexMgr->replicaNum; ++i) {
for (int i = 0; i < pIndexMgr->totalReplicaNum; ++i) {
if (syncUtilSameId(&((*(pIndexMgr->replicas))[i]), pRaftId)) {
(pIndexMgr->startTimeArr)[i] = startTime;
return;
......@@ -105,7 +107,7 @@ void syncIndexMgrSetStartTime(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftId,
}
int64_t syncIndexMgrGetStartTime(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftId) {
for (int i = 0; i < pIndexMgr->replicaNum; ++i) {
for (int i = 0; i < pIndexMgr->totalReplicaNum; ++i) {
if (syncUtilSameId(&((*(pIndexMgr->replicas))[i]), pRaftId)) {
int64_t startTime = (pIndexMgr->startTimeArr)[i];
return startTime;
......@@ -118,7 +120,7 @@ int64_t syncIndexMgrGetStartTime(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftI
}
void syncIndexMgrSetRecvTime(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftId, int64_t recvTime) {
for (int i = 0; i < pIndexMgr->replicaNum; ++i) {
for (int i = 0; i < pIndexMgr->totalReplicaNum; ++i) {
if (syncUtilSameId(&((*(pIndexMgr->replicas))[i]), pRaftId)) {
(pIndexMgr->recvTimeArr)[i] = recvTime;
return;
......@@ -130,7 +132,7 @@ void syncIndexMgrSetRecvTime(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftId, i
}
int64_t syncIndexMgrGetRecvTime(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftId) {
for (int i = 0; i < pIndexMgr->replicaNum; ++i) {
for (int i = 0; i < pIndexMgr->totalReplicaNum; ++i) {
if (syncUtilSameId(&((*(pIndexMgr->replicas))[i]), pRaftId)) {
int64_t recvTime = (pIndexMgr->recvTimeArr)[i];
return recvTime;
......@@ -143,7 +145,7 @@ int64_t syncIndexMgrGetRecvTime(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftId
}
void syncIndexMgrSetTerm(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftId, SyncTerm term) {
for (int i = 0; i < pIndexMgr->replicaNum; ++i) {
for (int i = 0; i < pIndexMgr->totalReplicaNum; ++i) {
if (syncUtilSameId(&((*(pIndexMgr->replicas))[i]), pRaftId)) {
(pIndexMgr->privateTerm)[i] = term;
return;
......@@ -155,7 +157,7 @@ void syncIndexMgrSetTerm(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftId, SyncT
}
SyncTerm syncIndexMgrGetTerm(SSyncIndexMgr *pIndexMgr, const SRaftId *pRaftId) {
for (int i = 0; i < pIndexMgr->replicaNum; ++i) {
for (int i = 0; i < pIndexMgr->totalReplicaNum; ++i) {
if (syncUtilSameId(&((*(pIndexMgr->replicas))[i]), pRaftId)) {
SyncTerm term = (pIndexMgr->privateTerm)[i];
return term;
......
此差异已折叠。
......@@ -250,6 +250,8 @@ int32_t syncLogBufferInitWithoutLock(SSyncLogBuffer* pBuf, SSyncNode* pNode) {
// update startIndex
pBuf->startIndex = takeDummy ? index : index + 1;
pBuf->isCatchup = false;
sInfo("vgId:%d, init sync log buffer. buffer: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", pNode->vgId,
pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex);
......@@ -332,6 +334,15 @@ int32_t syncLogBufferAccept(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEnt
goto _out;
}
if(pNode->raftCfg.cfg.nodeInfo[pNode->raftCfg.cfg.myIndex].nodeRole == TAOS_SYNC_ROLE_LEARNER &&
index > 0 && index > pBuf->totalIndex){
pBuf->totalIndex = index;
sTrace("vgId:%d, update learner progress. index:%" PRId64 ", term:%" PRId64 ": prevterm:%" PRId64
" != lastmatch:%" PRId64 ". log buffer: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")",
pNode->vgId, pEntry->index, pEntry->term, prevTerm, lastMatchTerm, pBuf->startIndex, pBuf->commitIndex,
pBuf->matchIndex, pBuf->endIndex);
}
if (index - pBuf->startIndex >= pBuf->size) {
sWarn("vgId:%d, out of buffer range. index:%" PRId64 ", term:%" PRId64 ". log buffer: [%" PRId64 " %" PRId64
" %" PRId64 ", %" PRId64 ")",
......@@ -491,7 +502,7 @@ _out:
}
int32_t syncFsmExecute(SSyncNode* pNode, SSyncFSM* pFsm, ESyncState role, SyncTerm term, SSyncRaftEntry* pEntry,
int32_t applyCode) {
int32_t applyCode) {
if (pNode->replicaNum == 1 && pNode->restoreFinish && pNode->vgId != 1) {
return 0;
}
......@@ -965,7 +976,7 @@ void syncLogReplDestroy(SSyncLogReplMgr* pMgr) {
}
int32_t syncNodeLogReplInit(SSyncNode* pNode) {
for (int i = 0; i < TSDB_MAX_REPLICA; i++) {
for (int i = 0; i < TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA; i++) {
ASSERT(pNode->logReplMgrs[i] == NULL);
pNode->logReplMgrs[i] = syncLogReplCreate();
if (pNode->logReplMgrs[i] == NULL) {
......@@ -978,7 +989,7 @@ int32_t syncNodeLogReplInit(SSyncNode* pNode) {
}
void syncNodeLogReplDestroy(SSyncNode* pNode) {
for (int i = 0; i < TSDB_MAX_REPLICA; i++) {
for (int i = 0; i < TSDB_MAX_REPLICA + TSDB_MAX_LEARNER_REPLICA; i++) {
syncLogReplDestroy(pNode->logReplMgrs[i]);
pNode->logReplMgrs[i] = NULL;
}
......@@ -1108,7 +1119,7 @@ int32_t syncLogBufferReset(SSyncLogBuffer* pBuf, SSyncNode* pNode) {
pBuf->endIndex = pBuf->matchIndex + 1;
// reset repl mgr
for (int i = 0; i < pNode->replicaNum; i++) {
for (int i = 0; i < pNode->totalReplicaNum; i++) {
SSyncLogReplMgr* pMgr = pNode->logReplMgrs[i];
syncLogReplReset(pMgr);
}
......
......@@ -18,21 +18,45 @@
#include "syncUtil.h"
#include "tjson.h"
const char* syncRoleToStr(ESyncRole role) {
switch (role) {
case TAOS_SYNC_ROLE_VOTER:
return "voter";
case TAOS_SYNC_ROLE_LEARNER:
return "learner";
default:
return "unknown";
}
}
const ESyncRole syncStrToRole(char* str) {
if(strcmp(str, "voter") == 0){
return TAOS_SYNC_ROLE_VOTER;
}
if(strcmp(str, "learner") == 0){
return TAOS_SYNC_ROLE_LEARNER;
}
return TAOS_SYNC_ROLE_ERROR;
}
static int32_t syncEncodeSyncCfg(const void *pObj, SJson *pJson) {
SSyncCfg *pCfg = (SSyncCfg *)pObj;
if (tjsonAddDoubleToObject(pJson, "totalReplicaNum", pCfg->totalReplicaNum) < 0) return -1;
if (tjsonAddDoubleToObject(pJson, "replicaNum", pCfg->replicaNum) < 0) return -1;
if (tjsonAddDoubleToObject(pJson, "myIndex", pCfg->myIndex) < 0) return -1;
SJson *nodeInfo = tjsonCreateArray();
if (nodeInfo == NULL) return -1;
if (tjsonAddItemToObject(pJson, "nodeInfo", nodeInfo) < 0) return -1;
for (int32_t i = 0; i < pCfg->replicaNum; ++i) {
for (int32_t i = 0; i < pCfg->totalReplicaNum; ++i) {
SJson *info = tjsonCreateObject();
if (info == NULL) return -1;
if (tjsonAddDoubleToObject(info, "nodePort", pCfg->nodeInfo[i].nodePort) < 0) return -1;
if (tjsonAddStringToObject(info, "nodeFqdn", pCfg->nodeInfo[i].nodeFqdn) < 0) return -1;
if (tjsonAddIntegerToObject(info, "nodeId", pCfg->nodeInfo[i].nodeId) < 0) return -1;
if (tjsonAddIntegerToObject(info, "clusterId", pCfg->nodeInfo[i].clusterId) < 0) return -1;
if (tjsonAddStringToObject(info, "nodeRole", syncRoleToStr(pCfg->nodeInfo[i].nodeRole)) < 0) return -1;
if (tjsonAddItemToArray(nodeInfo, info) < 0) return -1;
}
......@@ -90,7 +114,8 @@ int32_t syncWriteCfgFile(SSyncNode *pNode) {
if (taosRenameFile(file, realfile) != 0) goto _OVER;
code = 0;
sInfo("vgId:%d, succeed to write sync cfg file:%s, len:%d", pNode->vgId, realfile, len);
sInfo("vgId:%d, succeed to write sync cfg file:%s, len:%d, lastConfigIndex:%" PRId64, pNode->vgId,
realfile, len, pNode->raftCfg.lastConfigIndex);
_OVER:
if (pJson != NULL) tjsonDelete(pJson);
......@@ -108,6 +133,7 @@ static int32_t syncDecodeSyncCfg(const SJson *pJson, void *pObj) {
SSyncCfg *pCfg = (SSyncCfg *)pObj;
int32_t code = 0;
tjsonGetInt32ValueFromDouble(pJson, "totalReplicaNum", pCfg->totalReplicaNum, code);
tjsonGetInt32ValueFromDouble(pJson, "replicaNum", pCfg->replicaNum, code);
if (code < 0) return -1;
tjsonGetInt32ValueFromDouble(pJson, "myIndex", pCfg->myIndex, code);
......@@ -115,9 +141,9 @@ static int32_t syncDecodeSyncCfg(const SJson *pJson, void *pObj) {
SJson *nodeInfo = tjsonGetObjectItem(pJson, "nodeInfo");
if (nodeInfo == NULL) return -1;
pCfg->replicaNum = tjsonGetArraySize(nodeInfo);
pCfg->totalReplicaNum = tjsonGetArraySize(nodeInfo);
for (int32_t i = 0; i < pCfg->replicaNum; ++i) {
for (int32_t i = 0; i < pCfg->totalReplicaNum; ++i) {
SJson *info = tjsonGetArrayItem(nodeInfo, i);
if (info == NULL) return -1;
tjsonGetUInt16ValueFromDouble(info, "nodePort", pCfg->nodeInfo[i].nodePort, code);
......@@ -126,6 +152,15 @@ static int32_t syncDecodeSyncCfg(const SJson *pJson, void *pObj) {
if (code < 0) return -1;
tjsonGetNumberValue(info, "nodeId", pCfg->nodeInfo[i].nodeId, code);
tjsonGetNumberValue(info, "clusterId", pCfg->nodeInfo[i].clusterId, code);
char role[10] = {0};
code = tjsonGetStringValue(info, "nodeRole", role);
if(code < 0) return -1;
if(strlen(role) != 0){
pCfg->nodeInfo[i].nodeRole = syncStrToRole(role);
}
else{
pCfg->nodeInfo[i].nodeRole = TAOS_SYNC_ROLE_VOTER;
}
}
return 0;
......
......@@ -66,10 +66,10 @@ int32_t syncNodeReplicate(SSyncNode* pNode) {
}
int32_t syncNodeReplicateWithoutLock(SSyncNode* pNode) {
if (pNode->state != TAOS_SYNC_STATE_LEADER || pNode->replicaNum == 1) {
if (pNode->state != TAOS_SYNC_STATE_LEADER || pNode->raftCfg.cfg.totalReplicaNum == 1) {
return -1;
}
for (int32_t i = 0; i < pNode->replicaNum; i++) {
for (int32_t i = 0; i < pNode->totalReplicaNum; i++) {
if (syncUtilSameId(&pNode->replicasId[i], &pNode->myRaftId)) {
continue;
}
......
......@@ -795,13 +795,18 @@ int32_t syncNodeOnSnapshot(SSyncNode *pSyncNode, const SRpcMsg *pRpcMsg) {
return -1;
}
if (pMsg->term > raftStoreGetTerm(pSyncNode)) {
syncNodeStepDown(pSyncNode, pMsg->term);
if(pSyncNode->raftCfg.cfg.nodeInfo[pSyncNode->raftCfg.cfg.myIndex].nodeRole != TAOS_SYNC_ROLE_LEARNER){
if (pMsg->term > raftStoreGetTerm(pSyncNode)) {
syncNodeStepDown(pSyncNode, pMsg->term);
}
}
else{
syncNodeUpdateTermWithoutStepDown(pSyncNode, pMsg->term);
}
// state, term, seq/ack
int32_t code = 0;
if (pSyncNode->state == TAOS_SYNC_STATE_FOLLOWER) {
if (pSyncNode->state == TAOS_SYNC_STATE_FOLLOWER || pSyncNode->state == TAOS_SYNC_STATE_LEARNER) {
if (pMsg->term == raftStoreGetTerm(pSyncNode)) {
if (pMsg->seq == SYNC_SNAPSHOT_SEQ_PREP_SNAPSHOT) {
syncLogRecvSyncSnapshotSend(pSyncNode, pMsg, "process seq pre-snapshot");
......
......@@ -58,7 +58,7 @@ static void syncNodeCleanConfigIndex(SSyncNode* ths) {
static int32_t syncNodeTimerRoutine(SSyncNode* ths) {
ths->tmrRoutineNum++;
if (ths->tmrRoutineNum % 60 == 0 && ths->replicaNum > 1) {
if (ths->tmrRoutineNum % 60 == 0 && ths->totalReplicaNum > 1) {
sNInfo(ths, "timer routines");
} else {
sNTrace(ths, "timer routines");
......
......@@ -30,7 +30,7 @@ SVotesGranted *voteGrantedCreate(SSyncNode *pNode) {
return NULL;
}
pVotesGranted->replicas = &pNode->replicasId;
pVotesGranted->replicas = (void*)&pNode->replicasId;
pVotesGranted->replicaNum = pNode->replicaNum;
voteGrantedClearVotes(pVotesGranted);
......@@ -49,7 +49,7 @@ void voteGrantedDestroy(SVotesGranted *pVotesGranted) {
}
void voteGrantedUpdate(SVotesGranted *pVotesGranted, SSyncNode *pNode) {
pVotesGranted->replicas = &pNode->replicasId;
pVotesGranted->replicas = (void*)&pNode->replicasId;
pVotesGranted->replicaNum = pNode->replicaNum;
voteGrantedClearVotes(pVotesGranted);
......@@ -115,7 +115,7 @@ SVotesRespond *votesRespondCreate(SSyncNode *pNode) {
return NULL;
}
pVotesRespond->replicas = &pNode->replicasId;
pVotesRespond->replicas = (void*)&pNode->replicasId;
pVotesRespond->replicaNum = pNode->replicaNum;
pVotesRespond->term = 0;
pVotesRespond->pNode = pNode;
......@@ -130,7 +130,7 @@ void votesRespondDestory(SVotesRespond *pVotesRespond) {
}
void votesRespondUpdate(SVotesRespond *pVotesRespond, SSyncNode *pNode) {
pVotesRespond->replicas = &pNode->replicasId;
pVotesRespond->replicas = (void*)&pNode->replicasId;
pVotesRespond->replicaNum = pNode->replicaNum;
pVotesRespond->term = 0;
pVotesRespond->pNode = pNode;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册