syncMain.c 102.8 KB
Newer Older
M
Minghao Li 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/*
 * Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
 *
 * This program is free software: you can use, redistribute, and/or modify
 * it under the terms of the GNU Affero General Public License, version 3
 * or later ("AGPL"), as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

M
Minghao Li 已提交
16
#include "sync.h"
M
Minghao Li 已提交
17 18
#include "syncAppendEntries.h"
#include "syncAppendEntriesReply.h"
M
Minghao Li 已提交
19
#include "syncCommit.h"
M
Minghao Li 已提交
20
#include "syncElection.h"
M
Minghao Li 已提交
21
#include "syncEnv.h"
M
Minghao Li 已提交
22
#include "syncIndexMgr.h"
M
Minghao Li 已提交
23
#include "syncInt.h"
M
Minghao Li 已提交
24
#include "syncMessage.h"
M
Minghao Li 已提交
25
#include "syncRaftCfg.h"
M
Minghao Li 已提交
26
#include "syncRaftLog.h"
M
Minghao Li 已提交
27
#include "syncRaftStore.h"
M
Minghao Li 已提交
28
#include "syncReplication.h"
M
Minghao Li 已提交
29 30
#include "syncRequestVote.h"
#include "syncRequestVoteReply.h"
M
Minghao Li 已提交
31
#include "syncRespMgr.h"
M
Minghao Li 已提交
32
#include "syncSnapshot.h"
M
Minghao Li 已提交
33
#include "syncTimeout.h"
M
Minghao Li 已提交
34
#include "syncUtil.h"
M
Minghao Li 已提交
35
#include "syncVoteMgr.h"
M
Minghao Li 已提交
36
#include "tref.h"
M
Minghao Li 已提交
37

38
bool gRaftDetailLog = false;
39

M
Minghao Li 已提交
40 41 42
static int32_t tsNodeRefId = -1;

// ------ local funciton ---------
M
Minghao Li 已提交
43
// enqueue message ----
M
Minghao Li 已提交
44 45 46 47 48
static void    syncNodeEqPingTimer(void* param, void* tmrId);
static void    syncNodeEqElectTimer(void* param, void* tmrId);
static void    syncNodeEqHeartbeatTimer(void* param, void* tmrId);
static int32_t syncNodeEqNoop(SSyncNode* ths);
static int32_t syncNodeAppendNoop(SSyncNode* ths);
M
Minghao Li 已提交
49

M
Minghao Li 已提交
50
// process message ----
M
Minghao Li 已提交
51 52
int32_t syncNodeOnPingCb(SSyncNode* ths, SyncPing* pMsg);
int32_t syncNodeOnPingReplyCb(SSyncNode* ths, SyncPingReply* pMsg);
M
Minghao Li 已提交
53 54 55

// life cycle
static void syncFreeNode(void* param);
M
Minghao Li 已提交
56 57 58
// ---------------------------------

int32_t syncInit() {
M
Minghao Li 已提交
59 60 61 62 63 64 65 66 67
  int32_t ret = 0;

  if (!syncEnvIsStart()) {
    tsNodeRefId = taosOpenRef(200, syncFreeNode);
    if (tsNodeRefId < 0) {
      sError("failed to init node ref");
      syncCleanUp();
      ret = -1;
    } else {
S
Shengliang Guan 已提交
68
      sDebug("sync rsetId:%" PRId64 " is open", tsNodeRefId);
M
Minghao Li 已提交
69 70
      ret = syncEnvStart();
    }
M
Minghao Li 已提交
71 72
  }

M
Minghao Li 已提交
73
  return ret;
M
Minghao Li 已提交
74
}
M
Minghao Li 已提交
75

M
Minghao Li 已提交
76 77
void syncCleanUp() {
  int32_t ret = syncEnvStop();
M
Minghao Li 已提交
78
  ASSERT(ret == 0);
M
Minghao Li 已提交
79 80

  if (tsNodeRefId != -1) {
S
Shengliang Guan 已提交
81
    sDebug("sync rsetId:%" PRId64 " is closed", tsNodeRefId);
M
Minghao Li 已提交
82 83 84
    taosCloseRef(tsNodeRefId);
    tsNodeRefId = -1;
  }
M
Minghao Li 已提交
85
}
M
Minghao Li 已提交
86

M
Minghao Li 已提交
87
int64_t syncOpen(const SSyncInfo* pSyncInfo) {
M
Minghao Li 已提交
88
  SSyncNode* pSyncNode = syncNodeOpen(pSyncInfo);
M
Minghao Li 已提交
89
  ASSERT(pSyncNode != NULL);
M
Minghao Li 已提交
90

M
Minghao Li 已提交
91 92 93 94 95 96
  pSyncNode->rid = taosAddRef(tsNodeRefId, pSyncNode);
  if (pSyncNode->rid < 0) {
    syncFreeNode(pSyncNode);
    return -1;
  }

S
Shengliang Guan 已提交
97
  sDebug("vgId:%d, sync rid:%" PRId64 " is added to rsetId:%" PRId64, pSyncInfo->vgId, pSyncNode->rid, tsNodeRefId);
M
Minghao Li 已提交
98
  return pSyncNode->rid;
M
Minghao Li 已提交
99
}
M
Minghao Li 已提交
100

M
Minghao Li 已提交
101 102 103 104 105
void syncStart(int64_t rid) {
  SSyncNode* pSyncNode = (SSyncNode*)taosAcquireRef(tsNodeRefId, rid);
  if (pSyncNode == NULL) {
    return;
  }
M
Minghao Li 已提交
106 107 108 109 110 111 112 113 114 115 116 117 118 119 120

  if (pSyncNode->pRaftCfg->isStandBy) {
    syncNodeStartStandBy(pSyncNode);
  } else {
    syncNodeStart(pSyncNode);
  }

  taosReleaseRef(tsNodeRefId, pSyncNode->rid);
}

void syncStartNormal(int64_t rid) {
  SSyncNode* pSyncNode = (SSyncNode*)taosAcquireRef(tsNodeRefId, rid);
  if (pSyncNode == NULL) {
    return;
  }
M
Minghao Li 已提交
121 122 123 124 125
  syncNodeStart(pSyncNode);

  taosReleaseRef(tsNodeRefId, pSyncNode->rid);
}

M
Minghao Li 已提交
126 127 128 129 130 131 132 133 134 135
void syncStartStandBy(int64_t rid) {
  SSyncNode* pSyncNode = (SSyncNode*)taosAcquireRef(tsNodeRefId, rid);
  if (pSyncNode == NULL) {
    return;
  }
  syncNodeStartStandBy(pSyncNode);

  taosReleaseRef(tsNodeRefId, pSyncNode->rid);
}

M
Minghao Li 已提交
136
void syncStop(int64_t rid) {
M
Minghao Li 已提交
137
  SSyncNode* pSyncNode = (SSyncNode*)taosAcquireRef(tsNodeRefId, rid);
S
Shengliang Guan 已提交
138 139 140
  if (pSyncNode == NULL) return;

  int32_t vgId = pSyncNode->vgId;
M
Minghao Li 已提交
141
  syncNodeClose(pSyncNode);
M
Minghao Li 已提交
142 143 144

  taosReleaseRef(tsNodeRefId, pSyncNode->rid);
  taosRemoveRef(tsNodeRefId, rid);
S
Shengliang Guan 已提交
145
  sDebug("vgId:%d, sync rid:%" PRId64 " is removed from rsetId:%" PRId64, vgId, rid, tsNodeRefId);
M
Minghao Li 已提交
146
}
M
Minghao Li 已提交
147

M
Minghao Li 已提交
148 149 150
int32_t syncSetStandby(int64_t rid) {
  SSyncNode* pSyncNode = (SSyncNode*)taosAcquireRef(tsNodeRefId, rid);
  if (pSyncNode == NULL) {
M
Minghao Li 已提交
151
    terrno = TSDB_CODE_SYN_INTERNAL_ERROR;
S
Shengliang Guan 已提交
152
    sError("failed to set standby since accquire ref error, rid:%" PRId64, rid);
M
Minghao Li 已提交
153
    return -1;
M
Minghao Li 已提交
154 155
  }

156
  if (pSyncNode->state != TAOS_SYNC_STATE_FOLLOWER) {
157 158 159 160 161 162
    if (pSyncNode->state == TAOS_SYNC_STATE_LEADER) {
      terrno = TSDB_CODE_SYN_IS_LEADER;
    } else {
      terrno = TSDB_CODE_SYN_STANDBY_NOT_READY;
    }
    sError("failed to set standby since it is not follower, state:%s rid:%" PRId64, syncStr(pSyncNode->state), rid);
M
Minghao Li 已提交
163
    taosReleaseRef(tsNodeRefId, pSyncNode->rid);
M
Minghao Li 已提交
164
    return -1;
M
Minghao Li 已提交
165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
  }

  // state change
  pSyncNode->state = TAOS_SYNC_STATE_FOLLOWER;
  syncNodeStopHeartbeatTimer(pSyncNode);

  // reset elect timer, long enough
  int32_t electMS = TIMER_MAX_MS;
  int32_t ret = syncNodeRestartElectTimer(pSyncNode, electMS);
  ASSERT(ret == 0);

  pSyncNode->pRaftCfg->isStandBy = 1;
  raftCfgPersist(pSyncNode->pRaftCfg);

  taosReleaseRef(tsNodeRefId, pSyncNode->rid);
180
  sInfo("vgId:%d, set to standby", pSyncNode->vgId);
M
Minghao Li 已提交
181 182 183
  return 0;
}

M
Minghao Li 已提交
184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
bool syncNodeCheckNewConfig(SSyncNode* pSyncNode, const SSyncCfg* pNewCfg) {
  bool IamInNew = syncNodeInConfig(pSyncNode, pNewCfg);
  if (!IamInNew) {
    return false;
  }

  if (pNewCfg->replicaNum > pSyncNode->replicaNum + 1) {
    return false;
  }

  if (pNewCfg->replicaNum < pSyncNode->replicaNum - 1) {
    return false;
  }

  return true;
}

M
Minghao Li 已提交
201 202 203
int32_t syncReconfigBuild(int64_t rid, const SSyncCfg* pNewCfg, SRpcMsg* pRpcMsg) {
  SSyncNode* pSyncNode = (SSyncNode*)taosAcquireRef(tsNodeRefId, rid);
  if (pSyncNode == NULL) {
M
Minghao Li 已提交
204 205
    terrno = TSDB_CODE_SYN_INTERNAL_ERROR;
    return -1;
M
Minghao Li 已提交
206 207
  }
  ASSERT(rid == pSyncNode->rid);
M
Minghao Li 已提交
208
  int32_t ret = 0;
209

M
Minghao Li 已提交
210
  if (!syncNodeCheckNewConfig(pSyncNode, pNewCfg)) {
M
Minghao Li 已提交
211
    taosReleaseRef(tsNodeRefId, pSyncNode->rid);
M
Minghao Li 已提交
212 213
    terrno = TSDB_CODE_SYN_NEW_CONFIG_ERROR;
    sError("syncNodeCheckNewConfig error");
M
Minghao Li 已提交
214
    return -1;
M
Minghao Li 已提交
215 216 217 218 219 220 221 222 223
  }

  char* newconfig = syncCfg2Str((SSyncCfg*)pNewCfg);
  pRpcMsg->msgType = TDMT_SYNC_CONFIG_CHANGE;
  pRpcMsg->info.noResp = 1;
  pRpcMsg->contLen = strlen(newconfig) + 1;
  pRpcMsg->pCont = rpcMallocCont(pRpcMsg->contLen);
  snprintf(pRpcMsg->pCont, pRpcMsg->contLen, "%s", newconfig);
  taosMemoryFree(newconfig);
224

M
Minghao Li 已提交
225 226 227 228 229 230 231
  taosReleaseRef(tsNodeRefId, pSyncNode->rid);
  return ret;
}

int32_t syncReconfig(int64_t rid, const SSyncCfg* pNewCfg) {
  SSyncNode* pSyncNode = (SSyncNode*)taosAcquireRef(tsNodeRefId, rid);
  if (pSyncNode == NULL) {
M
Minghao Li 已提交
232 233
    terrno = TSDB_CODE_SYN_INTERNAL_ERROR;
    return -1;
M
Minghao Li 已提交
234 235 236
  }
  ASSERT(rid == pSyncNode->rid);

M
Minghao Li 已提交
237
  if (!syncNodeCheckNewConfig(pSyncNode, pNewCfg)) {
M
Minghao Li 已提交
238
    taosReleaseRef(tsNodeRefId, pSyncNode->rid);
M
Minghao Li 已提交
239 240
    terrno = TSDB_CODE_SYN_NEW_CONFIG_ERROR;
    sError("syncNodeCheckNewConfig error");
M
Minghao Li 已提交
241
    return -1;
M
Minghao Li 已提交
242
  }
243

M
Minghao Li 已提交
244
  char*   newconfig = syncCfg2Str((SSyncCfg*)pNewCfg);
M
Minghao Li 已提交
245 246
  int32_t ret = 0;

M
Minghao Li 已提交
247
  SRpcMsg rpcMsg = {0};
248
  rpcMsg.msgType = TDMT_SYNC_CONFIG_CHANGE;
S
Shengliang Guan 已提交
249
  rpcMsg.info.noResp = 1;
250
  rpcMsg.contLen = strlen(newconfig) + 1;
M
Minghao Li 已提交
251
  rpcMsg.pCont = rpcMallocCont(rpcMsg.contLen);
252 253
  snprintf(rpcMsg.pCont, rpcMsg.contLen, "%s", newconfig);
  taosMemoryFree(newconfig);
M
Minghao Li 已提交
254 255 256
  ret = syncNodePropose(pSyncNode, &rpcMsg, false);

  taosReleaseRef(tsNodeRefId, pSyncNode->rid);
M
Minghao Li 已提交
257 258
  return ret;
}
M
Minghao Li 已提交
259

260
int32_t syncLeaderTransfer(int64_t rid) {
M
Minghao Li 已提交
261 262
  SSyncNode* pSyncNode = (SSyncNode*)taosAcquireRef(tsNodeRefId, rid);
  if (pSyncNode == NULL) {
M
Minghao Li 已提交
263 264
    terrno = TSDB_CODE_SYN_INTERNAL_ERROR;
    return -1;
M
Minghao Li 已提交
265 266 267
  }
  ASSERT(rid == pSyncNode->rid);

M
Minghao Li 已提交
268
  int32_t ret = syncNodeLeaderTransfer(pSyncNode);
M
Minghao Li 已提交
269
  taosReleaseRef(tsNodeRefId, pSyncNode->rid);
270 271 272 273 274 275
  return ret;
}

int32_t syncLeaderTransferTo(int64_t rid, SNodeInfo newLeader) {
  SSyncNode* pSyncNode = (SSyncNode*)taosAcquireRef(tsNodeRefId, rid);
  if (pSyncNode == NULL) {
M
Minghao Li 已提交
276 277
    terrno = TSDB_CODE_SYN_INTERNAL_ERROR;
    return -1;
278
  }
M
Minghao Li 已提交
279
  ASSERT(rid == pSyncNode->rid);
280

M
Minghao Li 已提交
281
  int32_t ret = syncNodeLeaderTransferTo(pSyncNode, newLeader);
282 283 284 285
  taosReleaseRef(tsNodeRefId, pSyncNode->rid);
  return ret;
}

M
Minghao Li 已提交
286 287
int32_t syncNodeLeaderTransfer(SSyncNode* pSyncNode) {
  if (pSyncNode->peersNum == 0) {
288
    sDebug("only one replica, cannot leader transfer");
M
Minghao Li 已提交
289 290 291 292 293 294 295 296 297 298 299 300 301
    terrno = TSDB_CODE_SYN_ONE_REPLICA;
    return -1;
  }

  SNodeInfo newLeader = (pSyncNode->peersNodeInfo)[0];
  int32_t   ret = syncNodeLeaderTransferTo(pSyncNode, newLeader);
  return ret;
}

int32_t syncNodeLeaderTransferTo(SSyncNode* pSyncNode, SNodeInfo newLeader) {
  int32_t ret = 0;

  if (pSyncNode->replicaNum == 1) {
302
    sDebug("only one replica, cannot leader transfer");
M
Minghao Li 已提交
303 304 305 306
    terrno = TSDB_CODE_SYN_ONE_REPLICA;
    return -1;
  }

M
Minghao Li 已提交
307 308 309 310 311 312
  do {
    char logBuf[128];
    snprintf(logBuf, sizeof(logBuf), "begin leader transfer to %s:%u", newLeader.nodeFqdn, newLeader.nodePort);
    syncNodeEventLog(pSyncNode, logBuf);
  } while (0);

M
Minghao Li 已提交
313 314 315 316 317 318 319 320 321 322 323 324 325
  SyncLeaderTransfer* pMsg = syncLeaderTransferBuild(pSyncNode->vgId);
  pMsg->newLeaderId.addr = syncUtilAddr2U64(newLeader.nodeFqdn, newLeader.nodePort);
  pMsg->newLeaderId.vgId = pSyncNode->vgId;
  pMsg->newNodeInfo = newLeader;
  ASSERT(pMsg != NULL);
  SRpcMsg rpcMsg = {0};
  syncLeaderTransfer2RpcMsg(pMsg, &rpcMsg);
  syncLeaderTransferDestroy(pMsg);

  ret = syncNodePropose(pSyncNode, &rpcMsg, false);
  return ret;
}

326 327 328 329 330
bool syncCanLeaderTransfer(int64_t rid) {
  SSyncNode* pSyncNode = (SSyncNode*)taosAcquireRef(tsNodeRefId, rid);
  if (pSyncNode == NULL) {
    return false;
  }
M
Minghao Li 已提交
331
  ASSERT(rid == pSyncNode->rid);
332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357

  if (pSyncNode->replicaNum == 1) {
    taosReleaseRef(tsNodeRefId, pSyncNode->rid);
    return false;
  }

  if (pSyncNode->state == TAOS_SYNC_STATE_FOLLOWER) {
    taosReleaseRef(tsNodeRefId, pSyncNode->rid);
    return true;
  }

  bool matchOK = true;
  if (pSyncNode->state == TAOS_SYNC_STATE_CANDIDATE || pSyncNode->state == TAOS_SYNC_STATE_LEADER) {
    SyncIndex myCommitIndex = pSyncNode->commitIndex;
    for (int i = 0; i < pSyncNode->peersNum; ++i) {
      SyncIndex peerMatchIndex = syncIndexMgrGetIndex(pSyncNode->pMatchIndex, &(pSyncNode->peersId)[i]);
      if (peerMatchIndex < myCommitIndex) {
        matchOK = false;
      }
    }
  }

  taosReleaseRef(tsNodeRefId, pSyncNode->rid);
  return matchOK;
}

358
int32_t syncForwardToPeer(int64_t rid, SRpcMsg* pMsg, bool isWeak) {
M
Minghao Li 已提交
359 360 361
  int32_t ret = syncPropose(rid, pMsg, isWeak);
  return ret;
}
M
Minghao Li 已提交
362

M
Minghao Li 已提交
363 364 365 366 367
ESyncState syncGetMyRole(int64_t rid) {
  SSyncNode* pSyncNode = (SSyncNode*)taosAcquireRef(tsNodeRefId, rid);
  if (pSyncNode == NULL) {
    return TAOS_SYNC_STATE_ERROR;
  }
M
Minghao Li 已提交
368
  ASSERT(rid == pSyncNode->rid);
M
Minghao Li 已提交
369 370 371 372
  ESyncState state = pSyncNode->state;

  taosReleaseRef(tsNodeRefId, pSyncNode->rid);
  return state;
M
Minghao Li 已提交
373 374
}

M
Minghao Li 已提交
375 376 377 378 379
bool syncIsReady(int64_t rid) {
  SSyncNode* pSyncNode = (SSyncNode*)taosAcquireRef(tsNodeRefId, rid);
  if (pSyncNode == NULL) {
    return false;
  }
M
Minghao Li 已提交
380
  ASSERT(rid == pSyncNode->rid);
M
Minghao Li 已提交
381 382
  bool b = (pSyncNode->state == TAOS_SYNC_STATE_LEADER) && pSyncNode->restoreFinish;
  taosReleaseRef(tsNodeRefId, pSyncNode->rid);
383 384 385 386 387 388 389 390 391

  // if false, set error code
  if (false == b) {
    if (pSyncNode->state != TAOS_SYNC_STATE_LEADER) {
      terrno = TSDB_CODE_SYN_NOT_LEADER;
    } else {
      terrno = TSDB_CODE_APP_NOT_READY;
    }
  }
M
Minghao Li 已提交
392 393 394
  return b;
}

M
Minghao Li 已提交
395 396 397 398 399
bool syncIsRestoreFinish(int64_t rid) {
  SSyncNode* pSyncNode = (SSyncNode*)taosAcquireRef(tsNodeRefId, rid);
  if (pSyncNode == NULL) {
    return false;
  }
M
Minghao Li 已提交
400
  ASSERT(rid == pSyncNode->rid);
M
Minghao Li 已提交
401 402 403 404 405 406
  bool b = pSyncNode->restoreFinish;

  taosReleaseRef(tsNodeRefId, pSyncNode->rid);
  return b;
}

407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438
int32_t syncGetSnapshotByIndex(int64_t rid, SyncIndex index, SSnapshot* pSnapshot) {
  if (index < SYNC_INDEX_BEGIN) {
    return -1;
  }

  SSyncNode* pSyncNode = (SSyncNode*)taosAcquireRef(tsNodeRefId, rid);
  if (pSyncNode == NULL) {
    return -1;
  }
  ASSERT(rid == pSyncNode->rid);

  SSyncRaftEntry* pEntry = NULL;
  int32_t         code = pSyncNode->pLogStore->syncLogGetEntry(pSyncNode->pLogStore, index, &pEntry);
  if (code != 0) {
    if (pEntry != NULL) {
      syncEntryDestory(pEntry);
    }
    taosReleaseRef(tsNodeRefId, pSyncNode->rid);
    return -1;
  }
  ASSERT(pEntry != NULL);

  pSnapshot->data = NULL;
  pSnapshot->lastApplyIndex = index;
  pSnapshot->lastApplyTerm = pEntry->term;
  pSnapshot->lastConfigIndex = syncNodeGetSnapshotConfigIndex(pSyncNode, index);

  syncEntryDestory(pEntry);
  taosReleaseRef(tsNodeRefId, pSyncNode->rid);
  return 0;
}

439 440 441 442 443
int32_t syncGetSnapshotMeta(int64_t rid, struct SSnapshotMeta* sMeta) {
  SSyncNode* pSyncNode = (SSyncNode*)taosAcquireRef(tsNodeRefId, rid);
  if (pSyncNode == NULL) {
    return -1;
  }
M
Minghao Li 已提交
444
  ASSERT(rid == pSyncNode->rid);
445 446
  sMeta->lastConfigIndex = pSyncNode->pRaftCfg->lastConfigIndex;

S
Shengliang Guan 已提交
447
  sTrace("vgId:%d, get snapshot meta, lastConfigIndex:%" PRId64, pSyncNode->vgId, pSyncNode->pRaftCfg->lastConfigIndex);
448 449 450 451 452

  taosReleaseRef(tsNodeRefId, pSyncNode->rid);
  return 0;
}

453 454 455 456 457
int32_t syncGetSnapshotMetaByIndex(int64_t rid, SyncIndex snapshotIndex, struct SSnapshotMeta* sMeta) {
  SSyncNode* pSyncNode = (SSyncNode*)taosAcquireRef(tsNodeRefId, rid);
  if (pSyncNode == NULL) {
    return -1;
  }
M
Minghao Li 已提交
458
  ASSERT(rid == pSyncNode->rid);
459 460 461 462 463 464 465 466 467 468 469

  ASSERT(pSyncNode->pRaftCfg->configIndexCount >= 1);
  SyncIndex lastIndex = (pSyncNode->pRaftCfg->configIndexArr)[0];

  for (int i = 0; i < pSyncNode->pRaftCfg->configIndexCount; ++i) {
    if ((pSyncNode->pRaftCfg->configIndexArr)[i] > lastIndex &&
        (pSyncNode->pRaftCfg->configIndexArr)[i] <= snapshotIndex) {
      lastIndex = (pSyncNode->pRaftCfg->configIndexArr)[i];
    }
  }
  sMeta->lastConfigIndex = lastIndex;
470
  sTrace("vgId:%d, get snapshot meta by index:%" PRId64 " lcindex:%" PRId64, pSyncNode->vgId, snapshotIndex,
S
Shengliang Guan 已提交
471
         sMeta->lastConfigIndex);
472 473 474 475 476

  taosReleaseRef(tsNodeRefId, pSyncNode->rid);
  return 0;
}

477 478 479 480 481 482 483 484 485 486
SyncIndex syncNodeGetSnapshotConfigIndex(SSyncNode* pSyncNode, SyncIndex snapshotLastApplyIndex) {
  ASSERT(pSyncNode->pRaftCfg->configIndexCount >= 1);
  SyncIndex lastIndex = (pSyncNode->pRaftCfg->configIndexArr)[0];

  for (int i = 0; i < pSyncNode->pRaftCfg->configIndexCount; ++i) {
    if ((pSyncNode->pRaftCfg->configIndexArr)[i] > lastIndex &&
        (pSyncNode->pRaftCfg->configIndexArr)[i] <= snapshotLastApplyIndex) {
      lastIndex = (pSyncNode->pRaftCfg->configIndexArr)[i];
    }
  }
S
Shengliang Guan 已提交
487
  sTrace("vgId:%d, sync get last config index, index:%" PRId64 " lcindex:%" PRId64, pSyncNode->vgId,
S
Shengliang Guan 已提交
488
         snapshotLastApplyIndex, lastIndex);
489 490 491 492

  return lastIndex;
}

M
Minghao Li 已提交
493 494 495 496 497
const char* syncGetMyRoleStr(int64_t rid) {
  const char* s = syncUtilState2String(syncGetMyRole(rid));
  return s;
}

M
Minghao Li 已提交
498
SyncTerm syncGetMyTerm(int64_t rid) {
M
Minghao Li 已提交
499 500 501 502
  SSyncNode* pSyncNode = (SSyncNode*)taosAcquireRef(tsNodeRefId, rid);
  if (pSyncNode == NULL) {
    return TAOS_SYNC_STATE_ERROR;
  }
M
Minghao Li 已提交
503
  ASSERT(rid == pSyncNode->rid);
M
Minghao Li 已提交
504
  SyncTerm term = pSyncNode->pRaftStore->currentTerm;
M
Minghao Li 已提交
505 506

  taosReleaseRef(tsNodeRefId, pSyncNode->rid);
M
Minghao Li 已提交
507
  return term;
M
Minghao Li 已提交
508 509
}

M
Minghao Li 已提交
510
SyncGroupId syncGetVgId(int64_t rid) {
M
Minghao Li 已提交
511 512
  SSyncNode* pSyncNode = (SSyncNode*)taosAcquireRef(tsNodeRefId, rid);
  if (pSyncNode == NULL) {
M
Minghao Li 已提交
513 514
    return TAOS_SYNC_STATE_ERROR;
  }
M
Minghao Li 已提交
515
  ASSERT(rid == pSyncNode->rid);
M
Minghao Li 已提交
516
  SyncGroupId vgId = pSyncNode->vgId;
M
Minghao Li 已提交
517 518

  taosReleaseRef(tsNodeRefId, pSyncNode->rid);
M
Minghao Li 已提交
519
  return vgId;
M
Minghao Li 已提交
520 521
}

M
Minghao Li 已提交
522 523 524 525 526 527
void syncGetEpSet(int64_t rid, SEpSet* pEpSet) {
  SSyncNode* pSyncNode = (SSyncNode*)taosAcquireRef(tsNodeRefId, rid);
  if (pSyncNode == NULL) {
    memset(pEpSet, 0, sizeof(*pEpSet));
    return;
  }
M
Minghao Li 已提交
528
  ASSERT(rid == pSyncNode->rid);
M
Minghao Li 已提交
529 530
  pEpSet->numOfEps = 0;
  for (int i = 0; i < pSyncNode->pRaftCfg->cfg.replicaNum; ++i) {
M
Minghao Li 已提交
531 532
    snprintf(pEpSet->eps[i].fqdn, sizeof(pEpSet->eps[i].fqdn), "%s", (pSyncNode->pRaftCfg->cfg.nodeInfo)[i].nodeFqdn);
    pEpSet->eps[i].port = (pSyncNode->pRaftCfg->cfg.nodeInfo)[i].nodePort;
M
Minghao Li 已提交
533
    (pEpSet->numOfEps)++;
S
Shengliang Guan 已提交
534
    sInfo("vgId:%d, sync get epset: index:%d %s:%d", pSyncNode->vgId, i, pEpSet->eps[i].fqdn, pEpSet->eps[i].port);
M
Minghao Li 已提交
535 536
  }
  pEpSet->inUse = pSyncNode->pRaftCfg->cfg.myIndex;
S
Shengliang Guan 已提交
537
  sInfo("vgId:%d, sync get epset in-use:%d", pSyncNode->vgId, pEpSet->inUse);
538 539 540

  taosReleaseRef(tsNodeRefId, pSyncNode->rid);
}
M
Minghao Li 已提交
541

542 543 544 545 546 547 548 549 550 551 552 553
void syncGetRetryEpSet(int64_t rid, SEpSet* pEpSet) {
  SSyncNode* pSyncNode = (SSyncNode*)taosAcquireRef(tsNodeRefId, rid);
  if (pSyncNode == NULL) {
    memset(pEpSet, 0, sizeof(*pEpSet));
    return;
  }
  ASSERT(rid == pSyncNode->rid);
  pEpSet->numOfEps = 0;
  for (int i = 0; i < pSyncNode->pRaftCfg->cfg.replicaNum; ++i) {
    snprintf(pEpSet->eps[i].fqdn, sizeof(pEpSet->eps[i].fqdn), "%s", (pSyncNode->pRaftCfg->cfg.nodeInfo)[i].nodeFqdn);
    pEpSet->eps[i].port = (pSyncNode->pRaftCfg->cfg.nodeInfo)[i].nodePort;
    (pEpSet->numOfEps)++;
M
Minghao Li 已提交
554 555
    sInfo("vgId:%d, sync get retry epset: index:%d %s:%d", pSyncNode->vgId, i, pEpSet->eps[i].fqdn,
          pEpSet->eps[i].port);
556 557
  }
  pEpSet->inUse = (pSyncNode->pRaftCfg->cfg.myIndex + 1) % pEpSet->numOfEps;
M
Minghao Li 已提交
558
  sInfo("vgId:%d, sync get retry epset in-use:%d", pSyncNode->vgId, pEpSet->inUse);
M
Minghao Li 已提交
559 560 561 562

  taosReleaseRef(tsNodeRefId, pSyncNode->rid);
}

M
Minghao Li 已提交
563 564 565 566 567
int32_t syncGetRespRpc(int64_t rid, uint64_t index, SRpcMsg* msg) {
  SSyncNode* pSyncNode = (SSyncNode*)taosAcquireRef(tsNodeRefId, rid);
  if (pSyncNode == NULL) {
    return TAOS_SYNC_STATE_ERROR;
  }
M
Minghao Li 已提交
568
  ASSERT(rid == pSyncNode->rid);
M
Minghao Li 已提交
569 570 571 572 573 574 575 576 577 578 579

  SRespStub stub;
  int32_t   ret = syncRespMgrGet(pSyncNode->pSyncRespMgr, index, &stub);
  if (ret == 1) {
    memcpy(msg, &(stub.rpcMsg), sizeof(SRpcMsg));
  }

  taosReleaseRef(tsNodeRefId, pSyncNode->rid);
  return ret;
}

S
Shengliang Guan 已提交
580
int32_t syncGetAndDelRespRpc(int64_t rid, uint64_t index, SRpcHandleInfo* pInfo) {
M
Minghao Li 已提交
581 582 583 584
  SSyncNode* pSyncNode = (SSyncNode*)taosAcquireRef(tsNodeRefId, rid);
  if (pSyncNode == NULL) {
    return TAOS_SYNC_STATE_ERROR;
  }
M
Minghao Li 已提交
585
  ASSERT(rid == pSyncNode->rid);
M
Minghao Li 已提交
586 587 588 589

  SRespStub stub;
  int32_t   ret = syncRespMgrGetAndDel(pSyncNode->pSyncRespMgr, index, &stub);
  if (ret == 1) {
S
Shengliang Guan 已提交
590
    *pInfo = stub.rpcMsg.info;
M
Minghao Li 已提交
591 592
  }

S
Shengliang Guan 已提交
593
  sTrace("vgId:%d, get seq:%" PRIu64 " rpc handle:%p", pSyncNode->vgId, index, pInfo->handle);
M
Minghao Li 已提交
594 595 596 597
  taosReleaseRef(tsNodeRefId, pSyncNode->rid);
  return ret;
}

598
void syncSetMsgCb(int64_t rid, const SMsgCb* msgcb) {
M
Minghao Li 已提交
599 600
  SSyncNode* pSyncNode = (SSyncNode*)taosAcquireRef(tsNodeRefId, rid);
  if (pSyncNode == NULL) {
S
Shengliang Guan 已提交
601
    sTrace("syncSetQ get pSyncNode is NULL, rid:%" PRId64, rid);
M
Minghao Li 已提交
602 603
    return;
  }
M
Minghao Li 已提交
604
  ASSERT(rid == pSyncNode->rid);
S
Shengliang Guan 已提交
605
  pSyncNode->msgcb = msgcb;
M
Minghao Li 已提交
606 607 608 609 610 611 612

  taosReleaseRef(tsNodeRefId, pSyncNode->rid);
}

char* sync2SimpleStr(int64_t rid) {
  SSyncNode* pSyncNode = (SSyncNode*)taosAcquireRef(tsNodeRefId, rid);
  if (pSyncNode == NULL) {
S
Shengliang Guan 已提交
613
    sTrace("syncSetRpc get pSyncNode is NULL, rid:%" PRId64, rid);
M
Minghao Li 已提交
614 615
    return NULL;
  }
M
Minghao Li 已提交
616
  ASSERT(rid == pSyncNode->rid);
M
Minghao Li 已提交
617 618 619 620 621 622 623 624 625 626 627
  char* s = syncNode2SimpleStr(pSyncNode);
  taosReleaseRef(tsNodeRefId, pSyncNode->rid);

  return s;
}

void setPingTimerMS(int64_t rid, int32_t pingTimerMS) {
  SSyncNode* pSyncNode = (SSyncNode*)taosAcquireRef(tsNodeRefId, rid);
  if (pSyncNode == NULL) {
    return;
  }
M
Minghao Li 已提交
628
  ASSERT(rid == pSyncNode->rid);
M
Minghao Li 已提交
629 630 631 632 633 634 635 636 637 638 639
  pSyncNode->pingBaseLine = pingTimerMS;
  pSyncNode->pingTimerMS = pingTimerMS;

  taosReleaseRef(tsNodeRefId, pSyncNode->rid);
}

void setElectTimerMS(int64_t rid, int32_t electTimerMS) {
  SSyncNode* pSyncNode = (SSyncNode*)taosAcquireRef(tsNodeRefId, rid);
  if (pSyncNode == NULL) {
    return;
  }
M
Minghao Li 已提交
640
  ASSERT(rid == pSyncNode->rid);
M
Minghao Li 已提交
641 642 643 644 645 646 647 648 649 650
  pSyncNode->electBaseLine = electTimerMS;

  taosReleaseRef(tsNodeRefId, pSyncNode->rid);
}

void setHeartbeatTimerMS(int64_t rid, int32_t hbTimerMS) {
  SSyncNode* pSyncNode = (SSyncNode*)taosAcquireRef(tsNodeRefId, rid);
  if (pSyncNode == NULL) {
    return;
  }
M
Minghao Li 已提交
651
  ASSERT(rid == pSyncNode->rid);
M
Minghao Li 已提交
652 653 654 655 656 657
  pSyncNode->hbBaseLine = hbTimerMS;
  pSyncNode->heartbeatTimerMS = hbTimerMS;

  taosReleaseRef(tsNodeRefId, pSyncNode->rid);
}

M
Minghao Li 已提交
658
int32_t syncPropose(int64_t rid, SRpcMsg* pMsg, bool isWeak) {
659
  SSyncNode* pSyncNode = taosAcquireRef(tsNodeRefId, rid);
660
  if (pSyncNode == NULL) {
M
Minghao Li 已提交
661
    taosReleaseRef(tsNodeRefId, rid);
M
Minghao Li 已提交
662 663
    terrno = TSDB_CODE_SYN_INTERNAL_ERROR;
    return -1;
664
  }
M
Minghao Li 已提交
665
  ASSERT(rid == pSyncNode->rid);
M
Minghao Li 已提交
666

667
  int32_t ret = syncNodePropose(pSyncNode, pMsg, isWeak);
M
Minghao Li 已提交
668 669 670 671
  taosReleaseRef(tsNodeRefId, pSyncNode->rid);
  return ret;
}

672
int32_t syncProposeBatch(int64_t rid, SRpcMsg** pMsgPArr, bool* pIsWeakArr, int32_t arrSize) {
M
Minghao Li 已提交
673 674 675 676 677 678 679 680 681 682 683 684
  if (arrSize < 0) {
    terrno = TSDB_CODE_SYN_INTERNAL_ERROR;
    return -1;
  }

  SSyncNode* pSyncNode = taosAcquireRef(tsNodeRefId, rid);
  if (pSyncNode == NULL) {
    terrno = TSDB_CODE_SYN_INTERNAL_ERROR;
    return -1;
  }
  ASSERT(rid == pSyncNode->rid);

685
  int32_t ret = syncNodeProposeBatch(pSyncNode, pMsgPArr, pIsWeakArr, arrSize);
M
Minghao Li 已提交
686 687 688 689
  taosReleaseRef(tsNodeRefId, pSyncNode->rid);
  return ret;
}

690
static bool syncNodeBatchOK(SRpcMsg** pMsgPArr, int32_t arrSize) {
M
Minghao Li 已提交
691
  for (int32_t i = 0; i < arrSize; ++i) {
692
    if (pMsgPArr[i]->msgType == TDMT_SYNC_CONFIG_CHANGE) {
M
Minghao Li 已提交
693 694 695
      return false;
    }

696
    if (pMsgPArr[i]->msgType == TDMT_SYNC_CONFIG_CHANGE_FINISH) {
M
Minghao Li 已提交
697 698 699 700 701 702 703
      return false;
    }
  }

  return true;
}

704 705
int32_t syncNodeProposeBatch(SSyncNode* pSyncNode, SRpcMsg** pMsgPArr, bool* pIsWeakArr, int32_t arrSize) {
  if (!syncNodeBatchOK(pMsgPArr, arrSize)) {
M
Minghao Li 已提交
706 707 708 709 710 711
    syncNodeErrorLog(pSyncNode, "sync propose batch error");
    terrno = TSDB_CODE_SYN_BATCH_ERROR;
    return -1;
  }

  if (arrSize > SYNC_MAX_BATCH_SIZE) {
M
Minghao Li 已提交
712
    syncNodeErrorLog(pSyncNode, "sync propose batch error");
M
Minghao Li 已提交
713 714 715 716
    terrno = TSDB_CODE_SYN_BATCH_ERROR;
    return -1;
  }

M
Minghao Li 已提交
717
  if (pSyncNode->state != TAOS_SYNC_STATE_LEADER) {
M
Minghao Li 已提交
718 719 720 721 722 723 724 725 726 727 728 729 730
    syncNodeErrorLog(pSyncNode, "sync propose not leader");
    terrno = TSDB_CODE_SYN_NOT_LEADER;
    return -1;
  }

  if (pSyncNode->changing) {
    syncNodeErrorLog(pSyncNode, "sync propose not ready");
    terrno = TSDB_CODE_SYN_PROPOSE_NOT_READY;
    return -1;
  }

  SRaftMeta raftArr[SYNC_MAX_BATCH_SIZE];
  for (int i = 0; i < arrSize; ++i) {
M
Minghao Li 已提交
731 732
    do {
      char eventLog[128];
733 734
      snprintf(eventLog, sizeof(eventLog), "propose message, type:%s batch:%d", TMSG_INFO(pMsgPArr[i]->msgType),
               arrSize);
M
Minghao Li 已提交
735 736 737
      syncNodeEventLog(pSyncNode, eventLog);
    } while (0);

M
Minghao Li 已提交
738 739
    SRespStub stub;
    stub.createTime = taosGetTimestampMs();
740
    stub.rpcMsg = *(pMsgPArr[i]);
M
Minghao Li 已提交
741 742 743 744 745 746
    uint64_t seqNum = syncRespMgrAdd(pSyncNode->pSyncRespMgr, &stub);

    raftArr[i].isWeak = pIsWeakArr[i];
    raftArr[i].seqNum = seqNum;
  }

747
  SyncClientRequestBatch* pSyncMsg = syncClientRequestBatchBuild(pMsgPArr, raftArr, arrSize, pSyncNode->vgId);
M
Minghao Li 已提交
748 749 750 751 752 753
  ASSERT(pSyncMsg != NULL);

  SRpcMsg rpcMsg;
  syncClientRequestBatch2RpcMsg(pSyncMsg, &rpcMsg);
  taosMemoryFree(pSyncMsg);  // only free msg body, do not free rpc msg content

M
Minghao Li 已提交
754 755 756 757 758 759 760
  if (pSyncNode->replicaNum == 1 && pSyncNode->vgId != 1) {
    int32_t code = syncNodeOnClientRequestBatchCb(pSyncNode, pSyncMsg);
    if (code == 0) {
      // update rpc msg applyIndex
      SRpcMsg* msgArr = syncClientRequestBatchRpcMsgArr(pSyncMsg);
      ASSERT(arrSize == pSyncMsg->dataCount);
      for (int i = 0; i < arrSize; ++i) {
761
        pMsgPArr[i]->info.conn.applyIndex = msgArr[i].info.conn.applyIndex;
M
Minghao Li 已提交
762 763 764 765 766 767 768 769 770 771 772
        syncRespMgrDel(pSyncNode->pSyncRespMgr, raftArr[i].seqNum);
      }

      rpcFreeCont(rpcMsg.pCont);
      terrno = 0;
      return 1;

    } else {
      terrno = TSDB_CODE_SYN_INTERNAL_ERROR;
      return -1;
    }
M
Minghao Li 已提交
773 774

  } else {
M
Minghao Li 已提交
775 776 777 778 779 780 781 782 783
    if (pSyncNode->FpEqMsg != NULL && (*pSyncNode->FpEqMsg)(pSyncNode->msgcb, &rpcMsg) == 0) {
      // enqueue msg ok
      return 0;

    } else {
      sError("vgId:%d, enqueue msg error, FpEqMsg is NULL", pSyncNode->vgId);
      terrno = TSDB_CODE_SYN_INTERNAL_ERROR;
      return -1;
    }
M
Minghao Li 已提交
784 785 786 787
  }

  return 0;
}
788

789
int32_t syncNodePropose(SSyncNode* pSyncNode, SRpcMsg* pMsg, bool isWeak) {
M
Minghao Li 已提交
790
  int32_t ret = 0;
M
Minghao Li 已提交
791

M
Minghao Li 已提交
792 793
  do {
    char eventLog[128];
S
Shengliang Guan 已提交
794
    snprintf(eventLog, sizeof(eventLog), "propose message, type:%s", TMSG_INFO(pMsg->msgType));
M
Minghao Li 已提交
795 796
    syncNodeEventLog(pSyncNode, eventLog);
  } while (0);
M
Minghao Li 已提交
797

M
Minghao Li 已提交
798
  if (pSyncNode->state == TAOS_SYNC_STATE_LEADER) {
M
Minghao Li 已提交
799 800 801
    if (pSyncNode->changing && pMsg->msgType != TDMT_SYNC_CONFIG_CHANGE_FINISH) {
      ret = -1;
      terrno = TSDB_CODE_SYN_PROPOSE_NOT_READY;
S
Shengliang Guan 已提交
802
      sError("vgId:%d, failed to sync propose since not ready, type:%s", pSyncNode->vgId, TMSG_INFO(pMsg->msgType));
M
Minghao Li 已提交
803 804 805 806 807 808 809 810
      goto _END;
    }

    // config change
    if (pMsg->msgType == TDMT_SYNC_CONFIG_CHANGE) {
      if (!syncNodeCanChange(pSyncNode)) {
        ret = -1;
        terrno = TSDB_CODE_SYN_RECONFIG_NOT_READY;
S
Shengliang Guan 已提交
811
        sError("vgId:%d, failed to sync reconfig since not ready, type:%s", pSyncNode->vgId, TMSG_INFO(pMsg->msgType));
M
Minghao Li 已提交
812 813 814 815 816 817 818
        goto _END;
      }

      ASSERT(!pSyncNode->changing);
      pSyncNode->changing = true;
    }

M
Minghao Li 已提交
819 820 821 822 823 824
    SRespStub stub;
    stub.createTime = taosGetTimestampMs();
    stub.rpcMsg = *pMsg;
    uint64_t seqNum = syncRespMgrAdd(pSyncNode->pSyncRespMgr, &stub);

    SyncClientRequest* pSyncMsg = syncClientRequestBuild2(pMsg, seqNum, isWeak, pSyncNode->vgId);
M
Minghao Li 已提交
825 826
    SRpcMsg            rpcMsg;
    syncClientRequest2RpcMsg(pSyncMsg, &rpcMsg);
827

828 829 830 831 832 833
    // optimized one replica
    if (syncNodeIsOptimizedOneReplica(pSyncNode, pMsg)) {
      SyncIndex retIndex;
      int32_t   code = syncNodeOnClientRequestCb(pSyncNode, pSyncMsg, &retIndex);
      if (code == 0) {
        pMsg->info.conn.applyIndex = retIndex;
M
Minghao Li 已提交
834
        pMsg->info.conn.applyTerm = pSyncNode->pRaftStore->currentTerm;
835 836 837
        rpcFreeCont(rpcMsg.pCont);
        syncRespMgrDel(pSyncNode->pSyncRespMgr, seqNum);
        ret = 1;
838 839
        sDebug("vgId:%d, sync optimize index:%" PRId64 ", type:%s", pSyncNode->vgId, retIndex,
               TMSG_INFO(pMsg->msgType));
840 841 842
      } else {
        ret = -1;
        terrno = TSDB_CODE_SYN_INTERNAL_ERROR;
S
Shengliang Guan 已提交
843 844
        sError("vgId:%d, failed to sync optimize index:%" PRId64 ", type:%s", pSyncNode->vgId, retIndex,
               TMSG_INFO(pMsg->msgType));
845 846
      }

M
Minghao Li 已提交
847
    } else {
848 849 850 851 852
      if (pSyncNode->FpEqMsg != NULL && (*pSyncNode->FpEqMsg)(pSyncNode->msgcb, &rpcMsg) == 0) {
        ret = 0;
      } else {
        ret = -1;
        terrno = TSDB_CODE_SYN_INTERNAL_ERROR;
S
Shengliang Guan 已提交
853
        sError("vgId:%d, failed to enqueue msg since its null", pSyncNode->vgId);
854
      }
M
Minghao Li 已提交
855
    }
856

M
Minghao Li 已提交
857
    syncClientRequestDestroy(pSyncMsg);
M
Minghao Li 已提交
858 859
    goto _END;

M
Minghao Li 已提交
860
  } else {
M
Minghao Li 已提交
861 862
    ret = -1;
    terrno = TSDB_CODE_SYN_NOT_LEADER;
S
Shengliang Guan 已提交
863 864
    sError("vgId:%d, sync propose not leader, %s, type:%s", pSyncNode->vgId, syncUtilState2String(pSyncNode->state),
           TMSG_INFO(pMsg->msgType));
M
Minghao Li 已提交
865
    goto _END;
M
Minghao Li 已提交
866
  }
M
Minghao Li 已提交
867

M
Minghao Li 已提交
868
_END:
M
Minghao Li 已提交
869 870 871
  return ret;
}

M
Minghao Li 已提交
872
// open/close --------------
873 874 875
SSyncNode* syncNodeOpen(const SSyncInfo* pOldSyncInfo) {
  SSyncInfo* pSyncInfo = (SSyncInfo*)pOldSyncInfo;

wafwerar's avatar
wafwerar 已提交
876
  SSyncNode* pSyncNode = (SSyncNode*)taosMemoryMalloc(sizeof(SSyncNode));
M
Minghao Li 已提交
877
  ASSERT(pSyncNode != NULL);
M
Minghao Li 已提交
878
  memset(pSyncNode, 0, sizeof(SSyncNode));
M
Minghao Li 已提交
879

M
Minghao Li 已提交
880 881 882 883 884 885 886
  int32_t ret = 0;
  if (!taosDirExist((char*)(pSyncInfo->path))) {
    if (taosMkDir(pSyncInfo->path) != 0) {
      terrno = TAOS_SYSTEM_ERROR(errno);
      sError("failed to create dir:%s since %s", pSyncInfo->path, terrstr());
      return NULL;
    }
887
  }
M
Minghao Li 已提交
888

S
Shengliang Guan 已提交
889
  snprintf(pSyncNode->configPath, sizeof(pSyncNode->configPath), "%s%sraft_config.json", pSyncInfo->path, TD_DIRSEP);
890
  if (!taosCheckExistFile(pSyncNode->configPath)) {
M
Minghao Li 已提交
891 892 893
    // create a new raft config file
    SRaftCfgMeta meta;
    meta.isStandBy = pSyncInfo->isStandBy;
M
Minghao Li 已提交
894
    meta.snapshotStrategy = pSyncInfo->snapshotStrategy;
895
    meta.lastConfigIndex = SYNC_INDEX_INVALID;
M
Minghao Li 已提交
896
    meta.batchSize = pSyncInfo->batchSize;
M
Minghao Li 已提交
897
    ret = raftCfgCreateFile((SSyncCfg*)&(pSyncInfo->syncCfg), meta, pSyncNode->configPath);
M
Minghao Li 已提交
898
    ASSERT(ret == 0);
899 900 901 902

  } else {
    // update syncCfg by raft_config.json
    pSyncNode->pRaftCfg = raftCfgOpen(pSyncNode->configPath);
M
Minghao Li 已提交
903
    ASSERT(pSyncNode->pRaftCfg != NULL);
904 905 906
    pSyncInfo->syncCfg = pSyncNode->pRaftCfg->cfg;

    raftCfgClose(pSyncNode->pRaftCfg);
M
Minghao Li 已提交
907 908
  }

M
Minghao Li 已提交
909
  // init by SSyncInfo
M
Minghao Li 已提交
910 911
  pSyncNode->vgId = pSyncInfo->vgId;
  memcpy(pSyncNode->path, pSyncInfo->path, sizeof(pSyncNode->path));
S
Shengliang Guan 已提交
912 913 914
  snprintf(pSyncNode->raftStorePath, sizeof(pSyncNode->raftStorePath), "%s%sraft_store.json", pSyncInfo->path,
           TD_DIRSEP);
  snprintf(pSyncNode->configPath, sizeof(pSyncNode->configPath), "%s%sraft_config.json", pSyncInfo->path, TD_DIRSEP);
M
Minghao Li 已提交
915

M
Minghao Li 已提交
916
  pSyncNode->pWal = pSyncInfo->pWal;
S
Shengliang Guan 已提交
917
  pSyncNode->msgcb = pSyncInfo->msgcb;
M
Minghao Li 已提交
918
  pSyncNode->FpSendMsg = pSyncInfo->FpSendMsg;
M
Minghao Li 已提交
919
  pSyncNode->FpEqMsg = pSyncInfo->FpEqMsg;
M
Minghao Li 已提交
920

M
Minghao Li 已提交
921 922
  // init raft config
  pSyncNode->pRaftCfg = raftCfgOpen(pSyncNode->configPath);
M
Minghao Li 已提交
923
  ASSERT(pSyncNode->pRaftCfg != NULL);
M
Minghao Li 已提交
924

M
Minghao Li 已提交
925
  // init internal
M
Minghao Li 已提交
926 927
  pSyncNode->myNodeInfo = pSyncNode->pRaftCfg->cfg.nodeInfo[pSyncNode->pRaftCfg->cfg.myIndex];
  syncUtilnodeInfo2raftId(&pSyncNode->myNodeInfo, pSyncNode->vgId, &pSyncNode->myRaftId);
M
Minghao Li 已提交
928

M
Minghao Li 已提交
929
  // init peersNum, peers, peersId
M
Minghao Li 已提交
930
  pSyncNode->peersNum = pSyncNode->pRaftCfg->cfg.replicaNum - 1;
M
Minghao Li 已提交
931
  int j = 0;
M
Minghao Li 已提交
932 933 934
  for (int i = 0; i < pSyncNode->pRaftCfg->cfg.replicaNum; ++i) {
    if (i != pSyncNode->pRaftCfg->cfg.myIndex) {
      pSyncNode->peersNodeInfo[j] = pSyncNode->pRaftCfg->cfg.nodeInfo[i];
M
Minghao Li 已提交
935 936 937
      j++;
    }
  }
M
Minghao Li 已提交
938
  for (int i = 0; i < pSyncNode->peersNum; ++i) {
M
Minghao Li 已提交
939
    syncUtilnodeInfo2raftId(&pSyncNode->peersNodeInfo[i], pSyncNode->vgId, &pSyncNode->peersId[i]);
M
Minghao Li 已提交
940
  }
M
Minghao Li 已提交
941

M
Minghao Li 已提交
942
  // init replicaNum, replicasId
M
Minghao Li 已提交
943 944 945
  pSyncNode->replicaNum = pSyncNode->pRaftCfg->cfg.replicaNum;
  for (int i = 0; i < pSyncNode->pRaftCfg->cfg.replicaNum; ++i) {
    syncUtilnodeInfo2raftId(&pSyncNode->pRaftCfg->cfg.nodeInfo[i], pSyncNode->vgId, &pSyncNode->replicasId[i]);
M
Minghao Li 已提交
946 947
  }

M
Minghao Li 已提交
948
  // init raft algorithm
M
Minghao Li 已提交
949
  pSyncNode->pFsm = pSyncInfo->pFsm;
M
Minghao Li 已提交
950
  pSyncNode->quorum = syncUtilQuorum(pSyncNode->pRaftCfg->cfg.replicaNum);
M
Minghao Li 已提交
951 952
  pSyncNode->leaderCache = EMPTY_RAFT_ID;

M
Minghao Li 已提交
953
  // init life cycle outside
M
Minghao Li 已提交
954

M
Minghao Li 已提交
955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978
  // TLA+ Spec
  // InitHistoryVars == /\ elections = {}
  //                    /\ allLogs   = {}
  //                    /\ voterLog  = [i \in Server |-> [j \in {} |-> <<>>]]
  // InitServerVars == /\ currentTerm = [i \in Server |-> 1]
  //                   /\ state       = [i \in Server |-> Follower]
  //                   /\ votedFor    = [i \in Server |-> Nil]
  // InitCandidateVars == /\ votesResponded = [i \in Server |-> {}]
  //                      /\ votesGranted   = [i \in Server |-> {}]
  // \* The values nextIndex[i][i] and matchIndex[i][i] are never read, since the
  // \* leader does not send itself messages. It's still easier to include these
  // \* in the functions.
  // InitLeaderVars == /\ nextIndex  = [i \in Server |-> [j \in Server |-> 1]]
  //                   /\ matchIndex = [i \in Server |-> [j \in Server |-> 0]]
  // InitLogVars == /\ log          = [i \in Server |-> << >>]
  //                /\ commitIndex  = [i \in Server |-> 0]
  // Init == /\ messages = [m \in {} |-> 0]
  //         /\ InitHistoryVars
  //         /\ InitServerVars
  //         /\ InitCandidateVars
  //         /\ InitLeaderVars
  //         /\ InitLogVars
  //

M
Minghao Li 已提交
979
  // init TLA+ server vars
M
syncInt  
Minghao Li 已提交
980
  pSyncNode->state = TAOS_SYNC_STATE_FOLLOWER;
M
Minghao Li 已提交
981
  pSyncNode->pRaftStore = raftStoreOpen(pSyncNode->raftStorePath);
M
Minghao Li 已提交
982
  ASSERT(pSyncNode->pRaftStore != NULL);
M
Minghao Li 已提交
983

M
Minghao Li 已提交
984
  // init TLA+ candidate vars
M
Minghao Li 已提交
985
  pSyncNode->pVotesGranted = voteGrantedCreate(pSyncNode);
M
Minghao Li 已提交
986
  ASSERT(pSyncNode->pVotesGranted != NULL);
M
Minghao Li 已提交
987
  pSyncNode->pVotesRespond = votesRespondCreate(pSyncNode);
M
Minghao Li 已提交
988
  ASSERT(pSyncNode->pVotesRespond != NULL);
M
Minghao Li 已提交
989

M
Minghao Li 已提交
990 991
  // init TLA+ leader vars
  pSyncNode->pNextIndex = syncIndexMgrCreate(pSyncNode);
M
Minghao Li 已提交
992
  ASSERT(pSyncNode->pNextIndex != NULL);
M
Minghao Li 已提交
993
  pSyncNode->pMatchIndex = syncIndexMgrCreate(pSyncNode);
M
Minghao Li 已提交
994
  ASSERT(pSyncNode->pMatchIndex != NULL);
M
Minghao Li 已提交
995 996 997

  // init TLA+ log vars
  pSyncNode->pLogStore = logStoreCreate(pSyncNode);
M
Minghao Li 已提交
998
  ASSERT(pSyncNode->pLogStore != NULL);
999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010

  SyncIndex commitIndex = SYNC_INDEX_INVALID;
  if (pSyncNode->pFsm != NULL && pSyncNode->pFsm->FpGetSnapshotInfo != NULL) {
    SSnapshot snapshot = {0};
    int32_t   code = pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot);
    ASSERT(code == 0);
    if (snapshot.lastApplyIndex > commitIndex) {
      commitIndex = snapshot.lastApplyIndex;
      syncNodeEventLog(pSyncNode, "reset commit index by snapshot");
    }
  }
  pSyncNode->commitIndex = commitIndex;
M
Minghao Li 已提交
1011

M
Minghao Li 已提交
1012 1013 1014 1015 1016
  // timer ms init
  pSyncNode->pingBaseLine = PING_TIMER_MS;
  pSyncNode->electBaseLine = ELECT_TIMER_MS_MIN;
  pSyncNode->hbBaseLine = HEARTBEAT_TIMER_MS;

M
Minghao Li 已提交
1017
  // init ping timer
M
Minghao Li 已提交
1018
  pSyncNode->pPingTimer = NULL;
M
Minghao Li 已提交
1019
  pSyncNode->pingTimerMS = pSyncNode->pingBaseLine;
M
Minghao Li 已提交
1020 1021
  atomic_store_64(&pSyncNode->pingTimerLogicClock, 0);
  atomic_store_64(&pSyncNode->pingTimerLogicClockUser, 0);
M
Minghao Li 已提交
1022
  pSyncNode->FpPingTimerCB = syncNodeEqPingTimer;
M
Minghao Li 已提交
1023
  pSyncNode->pingTimerCounter = 0;
M
Minghao Li 已提交
1024

M
Minghao Li 已提交
1025 1026
  // init elect timer
  pSyncNode->pElectTimer = NULL;
M
Minghao Li 已提交
1027
  pSyncNode->electTimerMS = syncUtilElectRandomMS(pSyncNode->electBaseLine, 2 * pSyncNode->electBaseLine);
M
Minghao Li 已提交
1028 1029
  atomic_store_64(&pSyncNode->electTimerLogicClock, 0);
  atomic_store_64(&pSyncNode->electTimerLogicClockUser, 0);
M
Minghao Li 已提交
1030
  pSyncNode->FpElectTimerCB = syncNodeEqElectTimer;
M
Minghao Li 已提交
1031 1032 1033 1034
  pSyncNode->electTimerCounter = 0;

  // init heartbeat timer
  pSyncNode->pHeartbeatTimer = NULL;
M
Minghao Li 已提交
1035
  pSyncNode->heartbeatTimerMS = pSyncNode->hbBaseLine;
M
Minghao Li 已提交
1036 1037
  atomic_store_64(&pSyncNode->heartbeatTimerLogicClock, 0);
  atomic_store_64(&pSyncNode->heartbeatTimerLogicClockUser, 0);
M
Minghao Li 已提交
1038
  pSyncNode->FpHeartbeatTimerCB = syncNodeEqHeartbeatTimer;
M
Minghao Li 已提交
1039 1040
  pSyncNode->heartbeatTimerCounter = 0;

M
Minghao Li 已提交
1041
  // init callback
M
Minghao Li 已提交
1042 1043
  pSyncNode->FpOnPing = syncNodeOnPingCb;
  pSyncNode->FpOnPingReply = syncNodeOnPingReplyCb;
M
Minghao Li 已提交
1044
  pSyncNode->FpOnClientRequest = syncNodeOnClientRequestCb;
M
Minghao Li 已提交
1045
  pSyncNode->FpOnTimeout = syncNodeOnTimeoutCb;
M
Minghao Li 已提交
1046

M
Minghao Li 已提交
1047 1048 1049
  pSyncNode->FpOnSnapshotSend = syncNodeOnSnapshotSendCb;
  pSyncNode->FpOnSnapshotRsp = syncNodeOnSnapshotRspCb;

M
Minghao Li 已提交
1050
  if (pSyncNode->pRaftCfg->snapshotStrategy) {
S
Shengliang Guan 已提交
1051
    sInfo("vgId:%d, sync node use snapshot", pSyncNode->vgId);
M
Minghao Li 已提交
1052 1053 1054 1055
    pSyncNode->FpOnRequestVote = syncNodeOnRequestVoteSnapshotCb;
    pSyncNode->FpOnRequestVoteReply = syncNodeOnRequestVoteReplySnapshotCb;
    pSyncNode->FpOnAppendEntries = syncNodeOnAppendEntriesSnapshotCb;
    pSyncNode->FpOnAppendEntriesReply = syncNodeOnAppendEntriesReplySnapshotCb;
M
Minghao Li 已提交
1056 1057

  } else {
S
Shengliang Guan 已提交
1058
    sInfo("vgId:%d, sync node do not use snapshot", pSyncNode->vgId);
M
Minghao Li 已提交
1059 1060 1061 1062
    pSyncNode->FpOnRequestVote = syncNodeOnRequestVoteCb;
    pSyncNode->FpOnRequestVoteReply = syncNodeOnRequestVoteReplyCb;
    pSyncNode->FpOnAppendEntries = syncNodeOnAppendEntriesCb;
    pSyncNode->FpOnAppendEntriesReply = syncNodeOnAppendEntriesReplyCb;
M
Minghao Li 已提交
1063 1064
  }

M
Minghao Li 已提交
1065
  // tools
M
Minghao Li 已提交
1066
  pSyncNode->pSyncRespMgr = syncRespMgrCreate(pSyncNode, SYNC_RESP_TTL_MS);
M
Minghao Li 已提交
1067
  ASSERT(pSyncNode->pSyncRespMgr != NULL);
M
Minghao Li 已提交
1068

1069 1070
  // restore state
  pSyncNode->restoreFinish = false;
1071

M
Minghao Li 已提交
1072 1073 1074 1075 1076 1077 1078 1079
  // snapshot senders
  for (int i = 0; i < TSDB_MAX_REPLICA; ++i) {
    SSyncSnapshotSender* pSender = snapshotSenderCreate(pSyncNode, i);
    // ASSERT(pSender != NULL);
    (pSyncNode->senders)[i] = pSender;
  }

  // snapshot receivers
1080
  pSyncNode->pNewNodeReceiver = snapshotReceiverCreate(pSyncNode, EMPTY_RAFT_ID);
M
Minghao Li 已提交
1081

M
Minghao Li 已提交
1082 1083 1084
  // is config changing
  pSyncNode->changing = false;

M
Minghao Li 已提交
1085
  // start in syncNodeStart
M
Minghao Li 已提交
1086
  // start raft
M
Minghao Li 已提交
1087
  // syncNodeBecomeFollower(pSyncNode);
M
Minghao Li 已提交
1088

M
Minghao Li 已提交
1089
  syncNodeEventLog(pSyncNode, "sync open");
1090

M
Minghao Li 已提交
1091 1092 1093
  return pSyncNode;
}

M
Minghao Li 已提交
1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104
void syncNodeMaybeUpdateCommitBySnapshot(SSyncNode* pSyncNode) {
  if (pSyncNode->pFsm != NULL && pSyncNode->pFsm->FpGetSnapshotInfo != NULL) {
    SSnapshot snapshot;
    int32_t   code = pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot);
    ASSERT(code == 0);
    if (snapshot.lastApplyIndex > pSyncNode->commitIndex) {
      pSyncNode->commitIndex = snapshot.lastApplyIndex;
    }
  }
}

M
Minghao Li 已提交
1105 1106
void syncNodeStart(SSyncNode* pSyncNode) {
  // start raft
1107
  if (pSyncNode->replicaNum == 1) {
M
Minghao Li 已提交
1108
    raftStoreNextTerm(pSyncNode->pRaftStore);
1109
    syncNodeBecomeLeader(pSyncNode, "one replica start");
M
format  
Minghao Li 已提交
1110

1111
    // Raft 3.6.2 Committing entries from previous terms
1112 1113
    syncNodeAppendNoop(pSyncNode);
    syncMaybeAdvanceCommitIndex(pSyncNode);
M
Minghao Li 已提交
1114 1115
  } else {
    syncNodeBecomeFollower(pSyncNode, "first start");
1116 1117
  }

M
Minghao Li 已提交
1118
  int32_t ret = 0;
1119
  // ret = syncNodeStartPingTimer(pSyncNode);
M
Minghao Li 已提交
1120
  ASSERT(ret == 0);
M
Minghao Li 已提交
1121 1122
}

M
Minghao Li 已提交
1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133
void syncNodeStartStandBy(SSyncNode* pSyncNode) {
  // state change
  pSyncNode->state = TAOS_SYNC_STATE_FOLLOWER;
  syncNodeStopHeartbeatTimer(pSyncNode);

  // reset elect timer, long enough
  int32_t electMS = TIMER_MAX_MS;
  int32_t ret = syncNodeRestartElectTimer(pSyncNode, electMS);
  ASSERT(ret == 0);
}

M
Minghao Li 已提交
1134
void syncNodeClose(SSyncNode* pSyncNode) {
M
Minghao Li 已提交
1135
  syncNodeEventLog(pSyncNode, "sync close");
1136

M
Minghao Li 已提交
1137
  int32_t ret;
M
Minghao Li 已提交
1138
  ASSERT(pSyncNode != NULL);
M
Minghao Li 已提交
1139 1140

  ret = raftStoreClose(pSyncNode->pRaftStore);
M
Minghao Li 已提交
1141
  ASSERT(ret == 0);
M
Minghao Li 已提交
1142

M
Minghao Li 已提交
1143
  syncRespMgrDestroy(pSyncNode->pSyncRespMgr);
M
Minghao Li 已提交
1144 1145 1146 1147 1148
  voteGrantedDestroy(pSyncNode->pVotesGranted);
  votesRespondDestory(pSyncNode->pVotesRespond);
  syncIndexMgrDestroy(pSyncNode->pNextIndex);
  syncIndexMgrDestroy(pSyncNode->pMatchIndex);
  logStoreDestory(pSyncNode->pLogStore);
M
Minghao Li 已提交
1149
  raftCfgClose(pSyncNode->pRaftCfg);
M
Minghao Li 已提交
1150 1151 1152 1153 1154

  syncNodeStopPingTimer(pSyncNode);
  syncNodeStopElectTimer(pSyncNode);
  syncNodeStopHeartbeatTimer(pSyncNode);

M
Minghao Li 已提交
1155 1156 1157 1158
  if (pSyncNode->pFsm != NULL) {
    taosMemoryFree(pSyncNode->pFsm);
  }

M
Minghao Li 已提交
1159 1160 1161 1162 1163 1164 1165
  for (int i = 0; i < TSDB_MAX_REPLICA; ++i) {
    if ((pSyncNode->senders)[i] != NULL) {
      snapshotSenderDestroy((pSyncNode->senders)[i]);
      (pSyncNode->senders)[i] = NULL;
    }
  }

M
Minghao Li 已提交
1166 1167 1168 1169 1170
  if (pSyncNode->pNewNodeReceiver != NULL) {
    snapshotReceiverDestroy(pSyncNode->pNewNodeReceiver);
    pSyncNode->pNewNodeReceiver = NULL;
  }

M
Minghao Li 已提交
1171 1172
  // free memory in syncFreeNode
  // taosMemoryFree(pSyncNode);
M
Minghao Li 已提交
1173 1174
}

M
Minghao Li 已提交
1175
// option
M
Minghao Li 已提交
1176 1177
// bool syncNodeSnapshotEnable(SSyncNode* pSyncNode) { return pSyncNode->pRaftCfg->snapshotEnable; }

M
Minghao Li 已提交
1178
ESyncStrategy syncNodeStrategy(SSyncNode* pSyncNode) { return pSyncNode->pRaftCfg->snapshotStrategy; }
M
Minghao Li 已提交
1179

M
Minghao Li 已提交
1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194
// ping --------------
int32_t syncNodePing(SSyncNode* pSyncNode, const SRaftId* destRaftId, SyncPing* pMsg) {
  syncPingLog2((char*)"==syncNodePing==", pMsg);
  int32_t ret = 0;

  SRpcMsg rpcMsg;
  syncPing2RpcMsg(pMsg, &rpcMsg);
  syncRpcMsgLog2((char*)"==syncNodePing==", &rpcMsg);

  ret = syncNodeSendMsgById(destRaftId, pSyncNode, &rpcMsg);
  return ret;
}

int32_t syncNodePingSelf(SSyncNode* pSyncNode) {
  int32_t   ret = 0;
M
Minghao Li 已提交
1195
  SyncPing* pMsg = syncPingBuild3(&pSyncNode->myRaftId, &pSyncNode->myRaftId, pSyncNode->vgId);
M
Minghao Li 已提交
1196
  ret = syncNodePing(pSyncNode, &pMsg->destId, pMsg);
M
Minghao Li 已提交
1197
  ASSERT(ret == 0);
M
Minghao Li 已提交
1198 1199 1200 1201 1202 1203 1204 1205

  syncPingDestroy(pMsg);
  return ret;
}

int32_t syncNodePingPeers(SSyncNode* pSyncNode) {
  int32_t ret = 0;
  for (int i = 0; i < pSyncNode->peersNum; ++i) {
M
Minghao Li 已提交
1206 1207 1208
    SRaftId*  destId = &(pSyncNode->peersId[i]);
    SyncPing* pMsg = syncPingBuild3(&pSyncNode->myRaftId, destId, pSyncNode->vgId);
    ret = syncNodePing(pSyncNode, destId, pMsg);
M
Minghao Li 已提交
1209
    ASSERT(ret == 0);
M
Minghao Li 已提交
1210 1211 1212 1213 1214 1215 1216
    syncPingDestroy(pMsg);
  }
  return ret;
}

int32_t syncNodePingAll(SSyncNode* pSyncNode) {
  int32_t ret = 0;
M
Minghao Li 已提交
1217 1218 1219 1220
  for (int i = 0; i < pSyncNode->pRaftCfg->cfg.replicaNum; ++i) {
    SRaftId*  destId = &(pSyncNode->replicasId[i]);
    SyncPing* pMsg = syncPingBuild3(&pSyncNode->myRaftId, destId, pSyncNode->vgId);
    ret = syncNodePing(pSyncNode, destId, pMsg);
M
Minghao Li 已提交
1221
    ASSERT(ret == 0);
M
Minghao Li 已提交
1222 1223 1224 1225 1226 1227 1228 1229
    syncPingDestroy(pMsg);
  }
  return ret;
}

// timer control --------------
int32_t syncNodeStartPingTimer(SSyncNode* pSyncNode) {
  int32_t ret = 0;
1230 1231 1232 1233 1234
  if (syncEnvIsStart()) {
    taosTmrReset(pSyncNode->FpPingTimerCB, pSyncNode->pingTimerMS, pSyncNode, gSyncEnv->pTimerManager,
                 &pSyncNode->pPingTimer);
    atomic_store_64(&pSyncNode->pingTimerLogicClock, pSyncNode->pingTimerLogicClockUser);
  } else {
M
Minghao Li 已提交
1235
    sError("vgId:%d, start ping timer error, sync env is stop", pSyncNode->vgId);
1236
  }
M
Minghao Li 已提交
1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249
  return ret;
}

int32_t syncNodeStopPingTimer(SSyncNode* pSyncNode) {
  int32_t ret = 0;
  atomic_add_fetch_64(&pSyncNode->pingTimerLogicClockUser, 1);
  taosTmrStop(pSyncNode->pPingTimer);
  pSyncNode->pPingTimer = NULL;
  return ret;
}

int32_t syncNodeStartElectTimer(SSyncNode* pSyncNode, int32_t ms) {
  int32_t ret = 0;
1250 1251 1252 1253 1254
  if (syncEnvIsStart()) {
    pSyncNode->electTimerMS = ms;
    taosTmrReset(pSyncNode->FpElectTimerCB, pSyncNode->electTimerMS, pSyncNode, gSyncEnv->pTimerManager,
                 &pSyncNode->pElectTimer);
    atomic_store_64(&pSyncNode->electTimerLogicClock, pSyncNode->electTimerLogicClockUser);
1255 1256 1257 1258 1259 1260 1261

    do {
      char logBuf[128];
      snprintf(logBuf, sizeof(logBuf), "elect timer reset, ms:%d", ms);
      syncNodeEventLog(pSyncNode, logBuf);
    } while (0);

1262
  } else {
M
Minghao Li 已提交
1263
    sError("vgId:%d, start elect timer error, sync env is stop", pSyncNode->vgId);
1264
  }
M
Minghao Li 已提交
1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282
  return ret;
}

int32_t syncNodeStopElectTimer(SSyncNode* pSyncNode) {
  int32_t ret = 0;
  atomic_add_fetch_64(&pSyncNode->electTimerLogicClockUser, 1);
  taosTmrStop(pSyncNode->pElectTimer);
  pSyncNode->pElectTimer = NULL;
  return ret;
}

int32_t syncNodeRestartElectTimer(SSyncNode* pSyncNode, int32_t ms) {
  int32_t ret = 0;
  syncNodeStopElectTimer(pSyncNode);
  syncNodeStartElectTimer(pSyncNode, ms);
  return ret;
}

M
Minghao Li 已提交
1283 1284
int32_t syncNodeResetElectTimer(SSyncNode* pSyncNode) {
  int32_t ret = 0;
M
Minghao Li 已提交
1285 1286 1287 1288 1289 1290 1291
  int32_t electMS;

  if (pSyncNode->pRaftCfg->isStandBy) {
    electMS = TIMER_MAX_MS;
  } else {
    electMS = syncUtilElectRandomMS(pSyncNode->electBaseLine, 2 * pSyncNode->electBaseLine);
  }
M
Minghao Li 已提交
1292
  ret = syncNodeRestartElectTimer(pSyncNode, electMS);
1293 1294 1295 1296 1297 1298 1299 1300

  do {
    char logBuf[128];
    snprintf(logBuf, sizeof(logBuf), "reset elect timer, min:%d, max:%d, ms:%d", pSyncNode->electBaseLine,
             2 * pSyncNode->electBaseLine, electMS);
    syncNodeEventLog(pSyncNode, logBuf);
  } while (0);

M
Minghao Li 已提交
1301 1302 1303
  return ret;
}

M
Minghao Li 已提交
1304 1305
int32_t syncNodeStartHeartbeatTimer(SSyncNode* pSyncNode) {
  int32_t ret = 0;
1306 1307 1308 1309 1310
  if (syncEnvIsStart()) {
    taosTmrReset(pSyncNode->FpHeartbeatTimerCB, pSyncNode->heartbeatTimerMS, pSyncNode, gSyncEnv->pTimerManager,
                 &pSyncNode->pHeartbeatTimer);
    atomic_store_64(&pSyncNode->heartbeatTimerLogicClock, pSyncNode->heartbeatTimerLogicClockUser);
  } else {
M
Minghao Li 已提交
1311
    sError("vgId:%d, start heartbeat timer error, sync env is stop", pSyncNode->vgId);
1312
  }
1313 1314 1315 1316 1317 1318 1319

  do {
    char logBuf[128];
    snprintf(logBuf, sizeof(logBuf), "start heartbeat timer, ms:%d", pSyncNode->heartbeatTimerMS);
    syncNodeEventLog(pSyncNode, logBuf);
  } while (0);

M
Minghao Li 已提交
1320 1321 1322
  return ret;
}

1323 1324 1325 1326 1327 1328 1329 1330
int32_t syncNodeStartNowHeartbeatTimer(SSyncNode* pSyncNode) {
  int32_t ret = 0;
  if (syncEnvIsStart()) {
    taosTmrReset(pSyncNode->FpHeartbeatTimerCB, 1, pSyncNode, gSyncEnv->pTimerManager, &pSyncNode->pHeartbeatTimer);
    atomic_store_64(&pSyncNode->heartbeatTimerLogicClock, pSyncNode->heartbeatTimerLogicClockUser);
  } else {
    sError("vgId:%d, start heartbeat timer error, sync env is stop", pSyncNode->vgId);
  }
1331 1332 1333 1334 1335 1336 1337

  do {
    char logBuf[128];
    snprintf(logBuf, sizeof(logBuf), "start heartbeat timer, ms:%d", 1);
    syncNodeEventLog(pSyncNode, logBuf);
  } while (0);

1338 1339 1340
  return ret;
}

M
Minghao Li 已提交
1341 1342 1343 1344 1345
int32_t syncNodeStopHeartbeatTimer(SSyncNode* pSyncNode) {
  int32_t ret = 0;
  atomic_add_fetch_64(&pSyncNode->heartbeatTimerLogicClockUser, 1);
  taosTmrStop(pSyncNode->pHeartbeatTimer);
  pSyncNode->pHeartbeatTimer = NULL;
1346 1347
  sTrace("vgId:%d, stop heartbeat timer", pSyncNode->vgId);

M
Minghao Li 已提交
1348 1349 1350
  return ret;
}

1351 1352 1353 1354 1355 1356
int32_t syncNodeRestartHeartbeatTimer(SSyncNode* pSyncNode) {
  syncNodeStopHeartbeatTimer(pSyncNode);
  syncNodeStartHeartbeatTimer(pSyncNode);
  return 0;
}

1357 1358 1359 1360 1361 1362
int32_t syncNodeRestartNowHeartbeatTimer(SSyncNode* pSyncNode) {
  syncNodeStopHeartbeatTimer(pSyncNode);
  syncNodeStartNowHeartbeatTimer(pSyncNode);
  return 0;
}

M
Minghao Li 已提交
1363 1364 1365 1366
// utils --------------
int32_t syncNodeSendMsgById(const SRaftId* destRaftId, SSyncNode* pSyncNode, SRpcMsg* pMsg) {
  SEpSet epSet;
  syncUtilraftId2EpSet(destRaftId, &epSet);
M
Minghao Li 已提交
1367 1368 1369 1370
  if (pSyncNode->FpSendMsg != NULL) {
    // htonl
    syncUtilMsgHtoN(pMsg->pCont);

1371
    pMsg->info.noResp = 1;
S
Shengliang Guan 已提交
1372
    pSyncNode->FpSendMsg(&epSet, pMsg);
M
Minghao Li 已提交
1373
  } else {
M
Minghao Li 已提交
1374 1375
    sError("vgId:%d, sync send msg by id error, fp-send-msg is null", pSyncNode->vgId);
    return -1;
M
Minghao Li 已提交
1376
  }
M
Minghao Li 已提交
1377

M
Minghao Li 已提交
1378 1379 1380 1381 1382 1383
  return 0;
}

int32_t syncNodeSendMsgByInfo(const SNodeInfo* nodeInfo, SSyncNode* pSyncNode, SRpcMsg* pMsg) {
  SEpSet epSet;
  syncUtilnodeInfo2EpSet(nodeInfo, &epSet);
M
Minghao Li 已提交
1384 1385 1386 1387
  if (pSyncNode->FpSendMsg != NULL) {
    // htonl
    syncUtilMsgHtoN(pMsg->pCont);

1388
    pMsg->info.noResp = 1;
S
Shengliang Guan 已提交
1389
    pSyncNode->FpSendMsg(&epSet, pMsg);
M
Minghao Li 已提交
1390
  } else {
M
Minghao Li 已提交
1391
    sError("vgId:%d, sync send msg by info error, fp-send-msg is null", pSyncNode->vgId);
M
Minghao Li 已提交
1392
  }
M
Minghao Li 已提交
1393 1394 1395
  return 0;
}

M
Minghao Li 已提交
1396
cJSON* syncNode2Json(const SSyncNode* pSyncNode) {
C
Cary Xu 已提交
1397
  char   u64buf[128] = {0};
M
Minghao Li 已提交
1398 1399
  cJSON* pRoot = cJSON_CreateObject();

M
Minghao Li 已提交
1400 1401 1402
  if (pSyncNode != NULL) {
    // init by SSyncInfo
    cJSON_AddNumberToObject(pRoot, "vgId", pSyncNode->vgId);
M
Minghao Li 已提交
1403
    cJSON_AddItemToObject(pRoot, "SRaftCfg", raftCfg2Json(pSyncNode->pRaftCfg));
M
Minghao Li 已提交
1404
    cJSON_AddStringToObject(pRoot, "path", pSyncNode->path);
M
Minghao Li 已提交
1405 1406 1407
    cJSON_AddStringToObject(pRoot, "raftStorePath", pSyncNode->raftStorePath);
    cJSON_AddStringToObject(pRoot, "configPath", pSyncNode->configPath);

M
Minghao Li 已提交
1408 1409 1410
    snprintf(u64buf, sizeof(u64buf), "%p", pSyncNode->pWal);
    cJSON_AddStringToObject(pRoot, "pWal", u64buf);

S
Shengliang Guan 已提交
1411
    snprintf(u64buf, sizeof(u64buf), "%p", pSyncNode->msgcb);
M
Minghao Li 已提交
1412 1413 1414 1415
    cJSON_AddStringToObject(pRoot, "rpcClient", u64buf);
    snprintf(u64buf, sizeof(u64buf), "%p", pSyncNode->FpSendMsg);
    cJSON_AddStringToObject(pRoot, "FpSendMsg", u64buf);

S
Shengliang Guan 已提交
1416
    snprintf(u64buf, sizeof(u64buf), "%p", pSyncNode->msgcb);
M
Minghao Li 已提交
1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437
    cJSON_AddStringToObject(pRoot, "queue", u64buf);
    snprintf(u64buf, sizeof(u64buf), "%p", pSyncNode->FpEqMsg);
    cJSON_AddStringToObject(pRoot, "FpEqMsg", u64buf);

    // init internal
    cJSON* pMe = syncUtilNodeInfo2Json(&pSyncNode->myNodeInfo);
    cJSON_AddItemToObject(pRoot, "myNodeInfo", pMe);
    cJSON* pRaftId = syncUtilRaftId2Json(&pSyncNode->myRaftId);
    cJSON_AddItemToObject(pRoot, "myRaftId", pRaftId);

    cJSON_AddNumberToObject(pRoot, "peersNum", pSyncNode->peersNum);
    cJSON* pPeers = cJSON_CreateArray();
    cJSON_AddItemToObject(pRoot, "peersNodeInfo", pPeers);
    for (int i = 0; i < pSyncNode->peersNum; ++i) {
      cJSON_AddItemToArray(pPeers, syncUtilNodeInfo2Json(&pSyncNode->peersNodeInfo[i]));
    }
    cJSON* pPeersId = cJSON_CreateArray();
    cJSON_AddItemToObject(pRoot, "peersId", pPeersId);
    for (int i = 0; i < pSyncNode->peersNum; ++i) {
      cJSON_AddItemToArray(pPeersId, syncUtilRaftId2Json(&pSyncNode->peersId[i]));
    }
M
Minghao Li 已提交
1438

M
Minghao Li 已提交
1439 1440 1441 1442 1443 1444
    cJSON_AddNumberToObject(pRoot, "replicaNum", pSyncNode->replicaNum);
    cJSON* pReplicasId = cJSON_CreateArray();
    cJSON_AddItemToObject(pRoot, "replicasId", pReplicasId);
    for (int i = 0; i < pSyncNode->replicaNum; ++i) {
      cJSON_AddItemToArray(pReplicasId, syncUtilRaftId2Json(&pSyncNode->replicasId[i]));
    }
M
Minghao Li 已提交
1445

M
Minghao Li 已提交
1446 1447 1448 1449 1450 1451 1452
    // raft algorithm
    snprintf(u64buf, sizeof(u64buf), "%p", pSyncNode->pFsm);
    cJSON_AddStringToObject(pRoot, "pFsm", u64buf);
    cJSON_AddNumberToObject(pRoot, "quorum", pSyncNode->quorum);
    cJSON* pLaderCache = syncUtilRaftId2Json(&pSyncNode->leaderCache);
    cJSON_AddItemToObject(pRoot, "leaderCache", pLaderCache);

M
Minghao Li 已提交
1453
    // life cycle
S
Shengliang Guan 已提交
1454
    snprintf(u64buf, sizeof(u64buf), "%" PRId64, pSyncNode->rid);
M
Minghao Li 已提交
1455 1456
    cJSON_AddStringToObject(pRoot, "rid", u64buf);

M
Minghao Li 已提交
1457 1458 1459
    // tla+ server vars
    cJSON_AddNumberToObject(pRoot, "state", pSyncNode->state);
    cJSON_AddStringToObject(pRoot, "state_str", syncUtilState2String(pSyncNode->state));
M
Minghao Li 已提交
1460
    cJSON_AddItemToObject(pRoot, "pRaftStore", raftStore2Json(pSyncNode->pRaftStore));
M
Minghao Li 已提交
1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471

    // tla+ candidate vars
    cJSON_AddItemToObject(pRoot, "pVotesGranted", voteGranted2Json(pSyncNode->pVotesGranted));
    cJSON_AddItemToObject(pRoot, "pVotesRespond", votesRespond2Json(pSyncNode->pVotesRespond));

    // tla+ leader vars
    cJSON_AddItemToObject(pRoot, "pNextIndex", syncIndexMgr2Json(pSyncNode->pNextIndex));
    cJSON_AddItemToObject(pRoot, "pMatchIndex", syncIndexMgr2Json(pSyncNode->pMatchIndex));

    // tla+ log vars
    cJSON_AddItemToObject(pRoot, "pLogStore", logStore2Json(pSyncNode->pLogStore));
S
Shengliang Guan 已提交
1472
    snprintf(u64buf, sizeof(u64buf), "%" PRId64, pSyncNode->commitIndex);
M
Minghao Li 已提交
1473 1474
    cJSON_AddStringToObject(pRoot, "commitIndex", u64buf);

M
Minghao Li 已提交
1475 1476 1477 1478 1479
    // timer ms init
    cJSON_AddNumberToObject(pRoot, "pingBaseLine", pSyncNode->pingBaseLine);
    cJSON_AddNumberToObject(pRoot, "electBaseLine", pSyncNode->electBaseLine);
    cJSON_AddNumberToObject(pRoot, "hbBaseLine", pSyncNode->hbBaseLine);

M
Minghao Li 已提交
1480 1481 1482 1483
    // ping timer
    snprintf(u64buf, sizeof(u64buf), "%p", pSyncNode->pPingTimer);
    cJSON_AddStringToObject(pRoot, "pPingTimer", u64buf);
    cJSON_AddNumberToObject(pRoot, "pingTimerMS", pSyncNode->pingTimerMS);
S
Shengliang Guan 已提交
1484
    snprintf(u64buf, sizeof(u64buf), "%" PRIu64, pSyncNode->pingTimerLogicClock);
M
Minghao Li 已提交
1485
    cJSON_AddStringToObject(pRoot, "pingTimerLogicClock", u64buf);
S
Shengliang Guan 已提交
1486
    snprintf(u64buf, sizeof(u64buf), "%" PRIu64, pSyncNode->pingTimerLogicClockUser);
M
Minghao Li 已提交
1487 1488 1489
    cJSON_AddStringToObject(pRoot, "pingTimerLogicClockUser", u64buf);
    snprintf(u64buf, sizeof(u64buf), "%p", pSyncNode->FpPingTimerCB);
    cJSON_AddStringToObject(pRoot, "FpPingTimerCB", u64buf);
S
Shengliang Guan 已提交
1490
    snprintf(u64buf, sizeof(u64buf), "%" PRIu64, pSyncNode->pingTimerCounter);
M
Minghao Li 已提交
1491 1492 1493 1494 1495 1496
    cJSON_AddStringToObject(pRoot, "pingTimerCounter", u64buf);

    // elect timer
    snprintf(u64buf, sizeof(u64buf), "%p", pSyncNode->pElectTimer);
    cJSON_AddStringToObject(pRoot, "pElectTimer", u64buf);
    cJSON_AddNumberToObject(pRoot, "electTimerMS", pSyncNode->electTimerMS);
S
Shengliang Guan 已提交
1497
    snprintf(u64buf, sizeof(u64buf), "%" PRIu64, pSyncNode->electTimerLogicClock);
M
Minghao Li 已提交
1498
    cJSON_AddStringToObject(pRoot, "electTimerLogicClock", u64buf);
S
Shengliang Guan 已提交
1499
    snprintf(u64buf, sizeof(u64buf), "%" PRIu64, pSyncNode->electTimerLogicClockUser);
M
Minghao Li 已提交
1500 1501 1502
    cJSON_AddStringToObject(pRoot, "electTimerLogicClockUser", u64buf);
    snprintf(u64buf, sizeof(u64buf), "%p", pSyncNode->FpElectTimerCB);
    cJSON_AddStringToObject(pRoot, "FpElectTimerCB", u64buf);
S
Shengliang Guan 已提交
1503
    snprintf(u64buf, sizeof(u64buf), "%" PRIu64, pSyncNode->electTimerCounter);
M
Minghao Li 已提交
1504 1505 1506 1507 1508 1509
    cJSON_AddStringToObject(pRoot, "electTimerCounter", u64buf);

    // heartbeat timer
    snprintf(u64buf, sizeof(u64buf), "%p", pSyncNode->pHeartbeatTimer);
    cJSON_AddStringToObject(pRoot, "pHeartbeatTimer", u64buf);
    cJSON_AddNumberToObject(pRoot, "heartbeatTimerMS", pSyncNode->heartbeatTimerMS);
S
Shengliang Guan 已提交
1510
    snprintf(u64buf, sizeof(u64buf), "%" PRIu64, pSyncNode->heartbeatTimerLogicClock);
M
Minghao Li 已提交
1511
    cJSON_AddStringToObject(pRoot, "heartbeatTimerLogicClock", u64buf);
S
Shengliang Guan 已提交
1512
    snprintf(u64buf, sizeof(u64buf), "%" PRIu64, pSyncNode->heartbeatTimerLogicClockUser);
M
Minghao Li 已提交
1513 1514 1515
    cJSON_AddStringToObject(pRoot, "heartbeatTimerLogicClockUser", u64buf);
    snprintf(u64buf, sizeof(u64buf), "%p", pSyncNode->FpHeartbeatTimerCB);
    cJSON_AddStringToObject(pRoot, "FpHeartbeatTimerCB", u64buf);
S
Shengliang Guan 已提交
1516
    snprintf(u64buf, sizeof(u64buf), "%" PRIu64, pSyncNode->heartbeatTimerCounter);
M
Minghao Li 已提交
1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533
    cJSON_AddStringToObject(pRoot, "heartbeatTimerCounter", u64buf);

    // callback
    snprintf(u64buf, sizeof(u64buf), "%p", pSyncNode->FpOnPing);
    cJSON_AddStringToObject(pRoot, "FpOnPing", u64buf);
    snprintf(u64buf, sizeof(u64buf), "%p", pSyncNode->FpOnPingReply);
    cJSON_AddStringToObject(pRoot, "FpOnPingReply", u64buf);
    snprintf(u64buf, sizeof(u64buf), "%p", pSyncNode->FpOnRequestVote);
    cJSON_AddStringToObject(pRoot, "FpOnRequestVote", u64buf);
    snprintf(u64buf, sizeof(u64buf), "%p", pSyncNode->FpOnRequestVoteReply);
    cJSON_AddStringToObject(pRoot, "FpOnRequestVoteReply", u64buf);
    snprintf(u64buf, sizeof(u64buf), "%p", pSyncNode->FpOnAppendEntries);
    cJSON_AddStringToObject(pRoot, "FpOnAppendEntries", u64buf);
    snprintf(u64buf, sizeof(u64buf), "%p", pSyncNode->FpOnAppendEntriesReply);
    cJSON_AddStringToObject(pRoot, "FpOnAppendEntriesReply", u64buf);
    snprintf(u64buf, sizeof(u64buf), "%p", pSyncNode->FpOnTimeout);
    cJSON_AddStringToObject(pRoot, "FpOnTimeout", u64buf);
M
Minghao Li 已提交
1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546

    // restoreFinish
    cJSON_AddNumberToObject(pRoot, "restoreFinish", pSyncNode->restoreFinish);

    // snapshot senders
    cJSON* pSenders = cJSON_CreateArray();
    cJSON_AddItemToObject(pRoot, "senders", pSenders);
    for (int i = 0; i < TSDB_MAX_REPLICA; ++i) {
      cJSON_AddItemToArray(pSenders, snapshotSender2Json((pSyncNode->senders)[i]));
    }

    // snapshot receivers
    cJSON* pReceivers = cJSON_CreateArray();
1547
    cJSON_AddItemToObject(pRoot, "receiver", snapshotReceiver2Json(pSyncNode->pNewNodeReceiver));
M
Minghao Li 已提交
1548 1549 1550

    // changing
    cJSON_AddNumberToObject(pRoot, "changing", pSyncNode->changing);
M
Minghao Li 已提交
1551 1552 1553 1554 1555 1556 1557
  }

  cJSON* pJson = cJSON_CreateObject();
  cJSON_AddItemToObject(pJson, "SSyncNode", pRoot);
  return pJson;
}

M
Minghao Li 已提交
1558 1559 1560 1561 1562 1563 1564
char* syncNode2Str(const SSyncNode* pSyncNode) {
  cJSON* pJson = syncNode2Json(pSyncNode);
  char*  serialized = cJSON_Print(pJson);
  cJSON_Delete(pJson);
  return serialized;
}

M
Minghao Li 已提交
1565 1566
void syncNodeEventLog(const SSyncNode* pSyncNode, char* str) {
  int32_t userStrLen = strlen(str);
M
Minghao Li 已提交
1567 1568

  SSnapshot snapshot = {.data = NULL, .lastApplyIndex = -1, .lastApplyTerm = 0};
M
Minghao Li 已提交
1569
  if (pSyncNode->pFsm != NULL && pSyncNode->pFsm->FpGetSnapshotInfo != NULL) {
M
Minghao Li 已提交
1570 1571
    pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot);
  }
M
Minghao Li 已提交
1572 1573 1574 1575 1576 1577 1578

  SyncIndex logLastIndex = SYNC_INDEX_INVALID;
  SyncIndex logBeginIndex = SYNC_INDEX_INVALID;
  if (pSyncNode->pLogStore != NULL) {
    logLastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);
    logBeginIndex = pSyncNode->pLogStore->syncLogBeginIndex(pSyncNode->pLogStore);
  }
M
Minghao Li 已提交
1579

M
Minghao Li 已提交
1580
  char* pCfgStr = syncCfg2SimpleStr(&(pSyncNode->pRaftCfg->cfg));
1581 1582 1583 1584
  char* printStr = "";
  if (pCfgStr != NULL) {
    printStr = pCfgStr;
  }
M
Minghao Li 已提交
1585

M
Minghao Li 已提交
1586
  if (userStrLen < 256) {
M
Minghao Li 已提交
1587
    char logBuf[256 + 256];
1588 1589
    if (pSyncNode != NULL && pSyncNode->pRaftCfg != NULL && pSyncNode->pRaftStore != NULL) {
      snprintf(logBuf, sizeof(logBuf),
M
Minghao Li 已提交
1590 1591 1592 1593 1594
               "vgId:%d, sync %s %s, tm:%" PRIu64 ", cmt:%" PRId64 ", fst:%" PRId64 ", lst:%" PRId64 ", snap:%" PRId64
               ", snap-tm:%" PRIu64
               ", sby:%d, "
               "stgy:%d, bch:%d, "
               "r-num:%d, "
1595
               "lcfg:%" PRId64 ", chging:%d, rsto:%d, elt:%" PRId64 ", hb:%" PRId64 ", %s",
1596
               pSyncNode->vgId, syncUtilState2String(pSyncNode->state), str, pSyncNode->pRaftStore->currentTerm,
M
Minghao Li 已提交
1597
               pSyncNode->commitIndex, logBeginIndex, logLastIndex, snapshot.lastApplyIndex, snapshot.lastApplyTerm,
1598 1599
               pSyncNode->pRaftCfg->isStandBy, pSyncNode->pRaftCfg->snapshotStrategy, pSyncNode->pRaftCfg->batchSize,
               pSyncNode->replicaNum, pSyncNode->pRaftCfg->lastConfigIndex, pSyncNode->changing,
1600 1601
               pSyncNode->restoreFinish, pSyncNode->electTimerLogicClockUser, pSyncNode->heartbeatTimerLogicClockUser,
               printStr);
1602 1603 1604
    } else {
      snprintf(logBuf, sizeof(logBuf), "%s", str);
    }
1605
    // sDebug("%s", logBuf);
M
Minghao Li 已提交
1606 1607
    // sInfo("%s", logBuf);
    sTrace("%s", logBuf);
M
Minghao Li 已提交
1608

M
Minghao Li 已提交
1609
  } else {
M
Minghao Li 已提交
1610
    int   len = 256 + userStrLen;
M
Minghao Li 已提交
1611
    char* s = (char*)taosMemoryMalloc(len);
1612 1613
    if (pSyncNode != NULL && pSyncNode->pRaftCfg != NULL && pSyncNode->pRaftStore != NULL) {
      snprintf(s, len,
M
Minghao Li 已提交
1614 1615 1616 1617 1618 1619
               "vgId:%d, sync %s %s, tm:%" PRIu64 ", cmt:%" PRId64 ", fst:%" PRId64 ", lst:%" PRId64 ", snap:%" PRId64
               ", snap-tm:%" PRIu64
               ", sby:%d, "
               "stgy:%d, bch:%d, "
               "r-num:%d, "
               "lcfg:%" PRId64 ", chging:%d, rsto:%d, %s",
1620
               pSyncNode->vgId, syncUtilState2String(pSyncNode->state), str, pSyncNode->pRaftStore->currentTerm,
M
Minghao Li 已提交
1621
               pSyncNode->commitIndex, logBeginIndex, logLastIndex, snapshot.lastApplyIndex, snapshot.lastApplyTerm,
1622 1623 1624
               pSyncNode->pRaftCfg->isStandBy, pSyncNode->pRaftCfg->snapshotStrategy, pSyncNode->pRaftCfg->batchSize,
               pSyncNode->replicaNum, pSyncNode->pRaftCfg->lastConfigIndex, pSyncNode->changing,
               pSyncNode->restoreFinish, printStr);
1625 1626 1627
    } else {
      snprintf(s, len, "%s", str);
    }
1628
    // sDebug("%s", s);
M
Minghao Li 已提交
1629 1630
    // sInfo("%s", s);
    sTrace("%s", s);
M
Minghao Li 已提交
1631 1632
    taosMemoryFree(s);
  }
M
Minghao Li 已提交
1633 1634

  taosMemoryFree(pCfgStr);
M
Minghao Li 已提交
1635 1636
}

M
Minghao Li 已提交
1637 1638 1639 1640
void syncNodeErrorLog(const SSyncNode* pSyncNode, char* str) {
  int32_t userStrLen = strlen(str);

  SSnapshot snapshot = {.data = NULL, .lastApplyIndex = -1, .lastApplyTerm = 0};
M
Minghao Li 已提交
1641
  if (pSyncNode->pFsm != NULL && pSyncNode->pFsm->FpGetSnapshotInfo != NULL) {
M
Minghao Li 已提交
1642 1643
    pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot);
  }
M
Minghao Li 已提交
1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656

  SyncIndex logLastIndex = SYNC_INDEX_INVALID;
  SyncIndex logBeginIndex = SYNC_INDEX_INVALID;
  if (pSyncNode->pLogStore != NULL) {
    logLastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);
    logBeginIndex = pSyncNode->pLogStore->syncLogBeginIndex(pSyncNode->pLogStore);
  }

  char* pCfgStr = syncCfg2SimpleStr(&(pSyncNode->pRaftCfg->cfg));
  char* printStr = "";
  if (pCfgStr != NULL) {
    printStr = pCfgStr;
  }
M
Minghao Li 已提交
1657 1658

  if (userStrLen < 256) {
M
Minghao Li 已提交
1659
    char logBuf[256 + 256];
1660 1661
    if (pSyncNode != NULL && pSyncNode->pRaftCfg != NULL && pSyncNode->pRaftStore != NULL) {
      snprintf(logBuf, sizeof(logBuf),
M
Minghao Li 已提交
1662 1663 1664 1665 1666 1667
               "vgId:%d, sync %s %s, tm:%" PRIu64 ", cmt:%" PRId64 ", fst:%" PRId64 ", lst:%" PRId64 ", snap:%" PRId64
               ", snap-tm:%" PRIu64
               ", sby:%d, "
               "stgy:%d, bch:%d, "
               "r-num:%d, "
               "lcfg:%" PRId64 ", chging:%d, rsto:%d, %s",
1668
               pSyncNode->vgId, syncUtilState2String(pSyncNode->state), str, pSyncNode->pRaftStore->currentTerm,
M
Minghao Li 已提交
1669 1670 1671 1672
               pSyncNode->commitIndex, logBeginIndex, logLastIndex, snapshot.lastApplyIndex, snapshot.lastApplyTerm,
               pSyncNode->pRaftCfg->isStandBy, pSyncNode->pRaftCfg->snapshotStrategy, pSyncNode->pRaftCfg->batchSize,
               pSyncNode->replicaNum, pSyncNode->pRaftCfg->lastConfigIndex, pSyncNode->changing,
               pSyncNode->restoreFinish, printStr);
1673 1674 1675
    } else {
      snprintf(logBuf, sizeof(logBuf), "%s", str);
    }
M
Minghao Li 已提交
1676 1677 1678
    sError("%s", logBuf);

  } else {
M
Minghao Li 已提交
1679
    int   len = 256 + userStrLen;
M
Minghao Li 已提交
1680
    char* s = (char*)taosMemoryMalloc(len);
1681 1682
    if (pSyncNode != NULL && pSyncNode->pRaftCfg != NULL && pSyncNode->pRaftStore != NULL) {
      snprintf(s, len,
M
Minghao Li 已提交
1683 1684 1685 1686 1687 1688
               "vgId:%d, sync %s %s, tm:%" PRIu64 ", cmt:%" PRId64 ", fst:%" PRId64 ", lst:%" PRId64 ", snap:%" PRId64
               ", snap-tm:%" PRIu64
               ", sby:%d, "
               "stgy:%d, bch:%d, "
               "r-num:%d, "
               "lcfg:%" PRId64 ", chging:%d, rsto:%d, %s",
1689
               pSyncNode->vgId, syncUtilState2String(pSyncNode->state), str, pSyncNode->pRaftStore->currentTerm,
M
Minghao Li 已提交
1690 1691 1692 1693
               pSyncNode->commitIndex, logBeginIndex, logLastIndex, snapshot.lastApplyIndex, snapshot.lastApplyTerm,
               pSyncNode->pRaftCfg->isStandBy, pSyncNode->pRaftCfg->snapshotStrategy, pSyncNode->pRaftCfg->batchSize,
               pSyncNode->replicaNum, pSyncNode->pRaftCfg->lastConfigIndex, pSyncNode->changing,
               pSyncNode->restoreFinish, printStr);
1694 1695 1696
    } else {
      snprintf(s, len, "%s", str);
    }
M
Minghao Li 已提交
1697 1698 1699
    sError("%s", s);
    taosMemoryFree(s);
  }
M
Minghao Li 已提交
1700 1701

  taosMemoryFree(pCfgStr);
M
Minghao Li 已提交
1702 1703
}

M
Minghao Li 已提交
1704 1705 1706
char* syncNode2SimpleStr(const SSyncNode* pSyncNode) {
  int   len = 256;
  char* s = (char*)taosMemoryMalloc(len);
M
Minghao Li 已提交
1707 1708 1709 1710 1711 1712 1713 1714

  SSnapshot snapshot = {.data = NULL, .lastApplyIndex = -1, .lastApplyTerm = 0};
  if (pSyncNode->pFsm->FpGetSnapshotInfo != NULL) {
    pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot);
  }
  SyncIndex logLastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);
  SyncIndex logBeginIndex = pSyncNode->pLogStore->syncLogBeginIndex(pSyncNode->pLogStore);

M
Minghao Li 已提交
1715
  snprintf(s, len,
M
Minghao Li 已提交
1716 1717 1718 1719
           "vgId:%d, sync %s, tm:%" PRIu64 ", cmt:%" PRId64 ", fst:%" PRId64 ", lst:%" PRId64 ", snap:%" PRId64
           ", sby:%d, "
           "r-num:%d, "
           "lcfg:%" PRId64 ", chging:%d, rsto:%d",
M
Minghao Li 已提交
1720 1721 1722 1723
           pSyncNode->vgId, syncUtilState2String(pSyncNode->state), pSyncNode->pRaftStore->currentTerm,
           pSyncNode->commitIndex, logBeginIndex, logLastIndex, snapshot.lastApplyIndex, pSyncNode->pRaftCfg->isStandBy,
           pSyncNode->replicaNum, pSyncNode->pRaftCfg->lastConfigIndex, pSyncNode->changing, pSyncNode->restoreFinish);

M
Minghao Li 已提交
1724 1725 1726
  return s;
}

1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753
bool syncNodeInConfig(SSyncNode* pSyncNode, const SSyncCfg* config) {
  bool b1 = false;
  bool b2 = false;

  for (int i = 0; i < config->replicaNum; ++i) {
    if (strcmp((config->nodeInfo)[i].nodeFqdn, pSyncNode->myNodeInfo.nodeFqdn) == 0 &&
        (config->nodeInfo)[i].nodePort == pSyncNode->myNodeInfo.nodePort) {
      b1 = true;
      break;
    }
  }

  for (int i = 0; i < config->replicaNum; ++i) {
    SRaftId raftId;
    raftId.addr = syncUtilAddr2U64((config->nodeInfo)[i].nodeFqdn, (config->nodeInfo)[i].nodePort);
    raftId.vgId = pSyncNode->vgId;

    if (syncUtilSameId(&raftId, &(pSyncNode->myRaftId))) {
      b2 = true;
      break;
    }
  }

  ASSERT(b1 == b2);
  return b1;
}

M
Minghao Li 已提交
1754
void syncNodeDoConfigChange(SSyncNode* pSyncNode, SSyncCfg* pNewConfig, SyncIndex lastConfigChangeIndex) {
1755
  SSyncCfg oldConfig = pSyncNode->pRaftCfg->cfg;
1756
  pSyncNode->pRaftCfg->cfg = *pNewConfig;
1757 1758
  pSyncNode->pRaftCfg->lastConfigIndex = lastConfigChangeIndex;

M
Minghao Li 已提交
1759 1760
  bool IamInOld = syncNodeInConfig(pSyncNode, &oldConfig);
  bool IamInNew = syncNodeInConfig(pSyncNode, pNewConfig);
M
Minghao Li 已提交
1761

M
Minghao Li 已提交
1762 1763
  bool isDrop = false;
  bool isAdd = false;
M
Minghao Li 已提交
1764

M
Minghao Li 已提交
1765 1766 1767 1768
  if (IamInOld && !IamInNew) {
    isDrop = true;
  } else {
    isDrop = false;
1769
  }
1770

M
Minghao Li 已提交
1771 1772 1773 1774 1775
  if (!IamInOld && IamInNew) {
    isAdd = true;
  } else {
    isAdd = false;
  }
M
Minghao Li 已提交
1776

M
Minghao Li 已提交
1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787
  // log begin config change
  do {
    char  eventLog[256];
    char* pOldCfgStr = syncCfg2SimpleStr(&oldConfig);
    char* pNewCfgStr = syncCfg2SimpleStr(pNewConfig);
    snprintf(eventLog, sizeof(eventLog), "begin do config change, from %s to %s", pOldCfgStr, pNewCfgStr);
    syncNodeEventLog(pSyncNode, eventLog);
    taosMemoryFree(pOldCfgStr);
    taosMemoryFree(pNewCfgStr);
  } while (0);

M
Minghao Li 已提交
1788 1789
  if (IamInNew) {
    pSyncNode->pRaftCfg->isStandBy = 0;  // change isStandBy to normal
M
Minghao Li 已提交
1790
  }
M
Minghao Li 已提交
1791 1792
  if (isDrop) {
    pSyncNode->pRaftCfg->isStandBy = 1;  // set standby
M
Minghao Li 已提交
1793 1794
  }

M
Minghao Li 已提交
1795
  // add last config index
M
Minghao Li 已提交
1796
  raftCfgAddConfigIndex(pSyncNode->pRaftCfg, lastConfigChangeIndex);
M
Minghao Li 已提交
1797

M
Minghao Li 已提交
1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808
  if (IamInNew) {
    //-----------------------------------------
    int32_t ret = 0;

    // save snapshot senders
    int32_t oldReplicaNum = pSyncNode->replicaNum;
    SRaftId oldReplicasId[TSDB_MAX_REPLICA];
    memcpy(oldReplicasId, pSyncNode->replicasId, sizeof(oldReplicasId));
    SSyncSnapshotSender* oldSenders[TSDB_MAX_REPLICA];
    for (int i = 0; i < TSDB_MAX_REPLICA; ++i) {
      oldSenders[i] = (pSyncNode->senders)[i];
M
Minghao Li 已提交
1809

M
Minghao Li 已提交
1810 1811 1812 1813
      char* eventLog = snapshotSender2SimpleStr(oldSenders[i], "snapshot sender save old");
      syncNodeEventLog(pSyncNode, eventLog);
      taosMemoryFree(eventLog);
    }
1814

M
Minghao Li 已提交
1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830
    // init internal
    pSyncNode->myNodeInfo = pSyncNode->pRaftCfg->cfg.nodeInfo[pSyncNode->pRaftCfg->cfg.myIndex];
    syncUtilnodeInfo2raftId(&pSyncNode->myNodeInfo, pSyncNode->vgId, &pSyncNode->myRaftId);

    // init peersNum, peers, peersId
    pSyncNode->peersNum = pSyncNode->pRaftCfg->cfg.replicaNum - 1;
    int j = 0;
    for (int i = 0; i < pSyncNode->pRaftCfg->cfg.replicaNum; ++i) {
      if (i != pSyncNode->pRaftCfg->cfg.myIndex) {
        pSyncNode->peersNodeInfo[j] = pSyncNode->pRaftCfg->cfg.nodeInfo[i];
        j++;
      }
    }
    for (int i = 0; i < pSyncNode->peersNum; ++i) {
      syncUtilnodeInfo2raftId(&pSyncNode->peersNodeInfo[i], pSyncNode->vgId, &pSyncNode->peersId[i]);
    }
1831

M
Minghao Li 已提交
1832 1833 1834 1835 1836
    // init replicaNum, replicasId
    pSyncNode->replicaNum = pSyncNode->pRaftCfg->cfg.replicaNum;
    for (int i = 0; i < pSyncNode->pRaftCfg->cfg.replicaNum; ++i) {
      syncUtilnodeInfo2raftId(&pSyncNode->pRaftCfg->cfg.nodeInfo[i], pSyncNode->vgId, &pSyncNode->replicasId[i]);
    }
1837

M
Minghao Li 已提交
1838 1839 1840 1841
    syncIndexMgrUpdate(pSyncNode->pNextIndex, pSyncNode);
    syncIndexMgrUpdate(pSyncNode->pMatchIndex, pSyncNode);
    voteGrantedUpdate(pSyncNode->pVotesGranted, pSyncNode);
    votesRespondUpdate(pSyncNode->pVotesRespond, pSyncNode);
M
Minghao Li 已提交
1842

M
Minghao Li 已提交
1843
    pSyncNode->quorum = syncUtilQuorum(pSyncNode->pRaftCfg->cfg.replicaNum);
M
Minghao Li 已提交
1844

M
Minghao Li 已提交
1845
    // reset snapshot senders
1846

M
Minghao Li 已提交
1847 1848 1849 1850
    // clear new
    for (int i = 0; i < TSDB_MAX_REPLICA; ++i) {
      (pSyncNode->senders)[i] = NULL;
    }
M
Minghao Li 已提交
1851

M
Minghao Li 已提交
1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863
    // reset new
    for (int i = 0; i < pSyncNode->replicaNum; ++i) {
      // reset sender
      bool reset = false;
      for (int j = 0; j < TSDB_MAX_REPLICA; ++j) {
        if (syncUtilSameId(&(pSyncNode->replicasId)[i], &oldReplicasId[j])) {
          char     host[128];
          uint16_t port;
          syncUtilU642Addr((pSyncNode->replicasId)[i].addr, host, sizeof(host), &port);

          do {
            char eventLog[256];
S
Shengliang Guan 已提交
1864
            snprintf(eventLog, sizeof(eventLog), "snapshot sender reset for: %" PRIu64 ", newIndex:%d, %s:%d, %p",
M
Minghao Li 已提交
1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884
                     (pSyncNode->replicasId)[i].addr, i, host, port, oldSenders[j]);
            syncNodeEventLog(pSyncNode, eventLog);
          } while (0);

          (pSyncNode->senders)[i] = oldSenders[j];
          oldSenders[j] = NULL;
          reset = true;

          // reset replicaIndex
          int32_t oldreplicaIndex = (pSyncNode->senders)[i]->replicaIndex;
          (pSyncNode->senders)[i]->replicaIndex = i;

          do {
            char eventLog[256];
            snprintf(eventLog, sizeof(eventLog),
                     "snapshot sender udpate replicaIndex from %d to %d, %s:%d, %p, reset:%d", oldreplicaIndex, i, host,
                     port, (pSyncNode->senders)[i], reset);
            syncNodeEventLog(pSyncNode, eventLog);
          } while (0);
        }
1885 1886
      }
    }
1887

M
Minghao Li 已提交
1888 1889 1890 1891
    // create new
    for (int i = 0; i < TSDB_MAX_REPLICA; ++i) {
      if ((pSyncNode->senders)[i] == NULL) {
        (pSyncNode->senders)[i] = snapshotSenderCreate(pSyncNode, i);
M
Minghao Li 已提交
1892

M
Minghao Li 已提交
1893 1894 1895 1896
        char* eventLog = snapshotSender2SimpleStr((pSyncNode->senders)[i], "snapshot sender create new");
        syncNodeEventLog(pSyncNode, eventLog);
        taosMemoryFree(eventLog);
      }
1897 1898
    }

M
Minghao Li 已提交
1899 1900 1901 1902
    // free old
    for (int i = 0; i < TSDB_MAX_REPLICA; ++i) {
      if (oldSenders[i] != NULL) {
        snapshotSenderDestroy(oldSenders[i]);
M
Minghao Li 已提交
1903

M
Minghao Li 已提交
1904 1905 1906 1907 1908
        do {
          char eventLog[128];
          snprintf(eventLog, sizeof(eventLog), "snapshot sender delete old %p replica-index:%d", oldSenders[i], i);
          syncNodeEventLog(pSyncNode, eventLog);
        } while (0);
M
Minghao Li 已提交
1909

M
Minghao Li 已提交
1910 1911
        oldSenders[i] = NULL;
      }
1912 1913
    }

1914
    // persist cfg
M
Minghao Li 已提交
1915
    raftCfgPersist(pSyncNode->pRaftCfg);
1916

M
Minghao Li 已提交
1917 1918 1919
    char  tmpbuf[512];
    char* oldStr = syncCfg2SimpleStr(&oldConfig);
    char* newStr = syncCfg2SimpleStr(pNewConfig);
1920 1921
    snprintf(tmpbuf, sizeof(tmpbuf), "config change from %d to %d, index:%" PRId64 ", %s  -->  %s",
             oldConfig.replicaNum, pNewConfig->replicaNum, lastConfigChangeIndex, oldStr, newStr);
M
Minghao Li 已提交
1922 1923
    taosMemoryFree(oldStr);
    taosMemoryFree(newStr);
M
Minghao Li 已提交
1924

M
Minghao Li 已提交
1925 1926 1927
    // change isStandBy to normal (election timeout)
    if (pSyncNode->state == TAOS_SYNC_STATE_LEADER) {
      syncNodeBecomeLeader(pSyncNode, tmpbuf);
1928 1929 1930

      // Raft 3.6.2 Committing entries from previous terms
      syncNodeAppendNoop(pSyncNode);
1931
#if 0  // simon
1932
      syncNodeReplicate(pSyncNode);
1933
#endif
1934 1935
      syncMaybeAdvanceCommitIndex(pSyncNode);

M
Minghao Li 已提交
1936 1937 1938 1939
    } else {
      syncNodeBecomeFollower(pSyncNode, tmpbuf);
    }
  } else {
1940
    // persist cfg
M
Minghao Li 已提交
1941
    raftCfgPersist(pSyncNode->pRaftCfg);
1942

M
Minghao Li 已提交
1943 1944 1945
    char  tmpbuf[512];
    char* oldStr = syncCfg2SimpleStr(&oldConfig);
    char* newStr = syncCfg2SimpleStr(pNewConfig);
1946 1947
    snprintf(tmpbuf, sizeof(tmpbuf), "do not config change from %d to %d, index:%" PRId64 ", %s  -->  %s",
             oldConfig.replicaNum, pNewConfig->replicaNum, lastConfigChangeIndex, oldStr, newStr);
M
Minghao Li 已提交
1948 1949 1950
    taosMemoryFree(oldStr);
    taosMemoryFree(newStr);
    syncNodeEventLog(pSyncNode, tmpbuf);
1951
  }
1952

M
Minghao Li 已提交
1953
_END:
M
Minghao Li 已提交
1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964

  // log end config change
  do {
    char  eventLog[256];
    char* pOldCfgStr = syncCfg2SimpleStr(&oldConfig);
    char* pNewCfgStr = syncCfg2SimpleStr(pNewConfig);
    snprintf(eventLog, sizeof(eventLog), "end do config change, from %s to %s", pOldCfgStr, pNewCfgStr);
    syncNodeEventLog(pSyncNode, eventLog);
    taosMemoryFree(pOldCfgStr);
    taosMemoryFree(pNewCfgStr);
  } while (0);
M
Minghao Li 已提交
1965
  return;
M
Minghao Li 已提交
1966 1967
}

M
Minghao Li 已提交
1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978
SSyncNode* syncNodeAcquire(int64_t rid) {
  SSyncNode* pNode = taosAcquireRef(tsNodeRefId, rid);
  if (pNode == NULL) {
    sTrace("failed to acquire node from refId:%" PRId64, rid);
  }

  return pNode;
}

void syncNodeRelease(SSyncNode* pNode) { taosReleaseRef(tsNodeRefId, pNode->rid); }

M
Minghao Li 已提交
1979 1980 1981 1982
// raft state change --------------
void syncNodeUpdateTerm(SSyncNode* pSyncNode, SyncTerm term) {
  if (term > pSyncNode->pRaftStore->currentTerm) {
    raftStoreSetTerm(pSyncNode->pRaftStore, term);
1983
    char tmpBuf[64];
S
Shengliang Guan 已提交
1984
    snprintf(tmpBuf, sizeof(tmpBuf), "update term to %" PRIu64, term);
1985
    syncNodeBecomeFollower(pSyncNode, tmpBuf);
M
Minghao Li 已提交
1986 1987 1988 1989
    raftStoreClearVote(pSyncNode->pRaftStore);
  }
}

1990
void syncNodeBecomeFollower(SSyncNode* pSyncNode, const char* debugStr) {
M
Minghao Li 已提交
1991
  // maybe clear leader cache
M
Minghao Li 已提交
1992 1993 1994 1995
  if (pSyncNode->state == TAOS_SYNC_STATE_LEADER) {
    pSyncNode->leaderCache = EMPTY_RAFT_ID;
  }

M
Minghao Li 已提交
1996
  // state change
M
Minghao Li 已提交
1997 1998 1999
  pSyncNode->state = TAOS_SYNC_STATE_FOLLOWER;
  syncNodeStopHeartbeatTimer(pSyncNode);

M
Minghao Li 已提交
2000 2001
  // reset elect timer
  syncNodeResetElectTimer(pSyncNode);
M
Minghao Li 已提交
2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016

  // trace log
  do {
    int32_t debugStrLen = strlen(debugStr);
    if (debugStrLen < 256) {
      char eventLog[256 + 64];
      snprintf(eventLog, sizeof(eventLog), "become follower %s", debugStr);
      syncNodeEventLog(pSyncNode, eventLog);
    } else {
      char* eventLog = taosMemoryMalloc(debugStrLen + 64);
      snprintf(eventLog, debugStrLen, "become follower %s", debugStr);
      syncNodeEventLog(pSyncNode, eventLog);
      taosMemoryFree(eventLog);
    }
  } while (0);
M
Minghao Li 已提交
2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036
}

// TLA+ Spec
// \* Candidate i transitions to leader.
// BecomeLeader(i) ==
//     /\ state[i] = Candidate
//     /\ votesGranted[i] \in Quorum
//     /\ state'      = [state EXCEPT ![i] = Leader]
//     /\ nextIndex'  = [nextIndex EXCEPT ![i] =
//                          [j \in Server |-> Len(log[i]) + 1]]
//     /\ matchIndex' = [matchIndex EXCEPT ![i] =
//                          [j \in Server |-> 0]]
//     /\ elections'  = elections \cup
//                          {[eterm     |-> currentTerm[i],
//                            eleader   |-> i,
//                            elog      |-> log[i],
//                            evotes    |-> votesGranted[i],
//                            evoterLog |-> voterLog[i]]}
//     /\ UNCHANGED <<messages, currentTerm, votedFor, candidateVars, logVars>>
//
2037
void syncNodeBecomeLeader(SSyncNode* pSyncNode, const char* debugStr) {
2038 2039 2040
  // reset restoreFinish
  pSyncNode->restoreFinish = false;

M
Minghao Li 已提交
2041
  // state change
M
Minghao Li 已提交
2042
  pSyncNode->state = TAOS_SYNC_STATE_LEADER;
M
Minghao Li 已提交
2043 2044

  // set leader cache
M
Minghao Li 已提交
2045 2046 2047
  pSyncNode->leaderCache = pSyncNode->myRaftId;

  for (int i = 0; i < pSyncNode->pNextIndex->replicaNum; ++i) {
M
Minghao Li 已提交
2048 2049
    // maybe overwrite myself, no harm
    // just do it!
2050 2051 2052 2053 2054 2055 2056 2057 2058

    // pSyncNode->pNextIndex->index[i] = pSyncNode->pLogStore->getLastIndex(pSyncNode->pLogStore) + 1;

    // maybe wal is deleted
    SyncIndex lastIndex;
    SyncTerm  lastTerm;
    int32_t   code = syncNodeGetLastIndexTerm(pSyncNode, &lastIndex, &lastTerm);
    ASSERT(code == 0);
    pSyncNode->pNextIndex->index[i] = lastIndex + 1;
M
Minghao Li 已提交
2059 2060 2061
  }

  for (int i = 0; i < pSyncNode->pMatchIndex->replicaNum; ++i) {
M
Minghao Li 已提交
2062 2063
    // maybe overwrite myself, no harm
    // just do it!
M
Minghao Li 已提交
2064 2065 2066
    pSyncNode->pMatchIndex->index[i] = SYNC_INDEX_INVALID;
  }

2067 2068
  // update sender private term
  SSyncSnapshotSender* pMySender = syncNodeGetSnapshotSender(pSyncNode, &(pSyncNode->myRaftId));
2069 2070 2071 2072 2073
  if (pMySender != NULL) {
    for (int i = 0; i < pSyncNode->pMatchIndex->replicaNum; ++i) {
      if ((pSyncNode->senders)[i]->privateTerm > pMySender->privateTerm) {
        pMySender->privateTerm = (pSyncNode->senders)[i]->privateTerm;
      }
2074
    }
2075
    (pMySender->privateTerm) += 100;
2076 2077
  }

M
Minghao Li 已提交
2078
  // stop elect timer
M
Minghao Li 已提交
2079
  syncNodeStopElectTimer(pSyncNode);
M
Minghao Li 已提交
2080

M
Minghao Li 已提交
2081 2082
  // start heartbeat timer
  syncNodeStartHeartbeatTimer(pSyncNode);
M
Minghao Li 已提交
2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097

  // trace log
  do {
    int32_t debugStrLen = strlen(debugStr);
    if (debugStrLen < 256) {
      char eventLog[256 + 64];
      snprintf(eventLog, sizeof(eventLog), "become leader %s", debugStr);
      syncNodeEventLog(pSyncNode, eventLog);
    } else {
      char* eventLog = taosMemoryMalloc(debugStrLen + 64);
      snprintf(eventLog, debugStrLen, "become leader %s", debugStr);
      syncNodeEventLog(pSyncNode, eventLog);
      taosMemoryFree(eventLog);
    }
  } while (0);
M
Minghao Li 已提交
2098 2099 2100
}

void syncNodeCandidate2Leader(SSyncNode* pSyncNode) {
M
Minghao Li 已提交
2101 2102
  ASSERT(pSyncNode->state == TAOS_SYNC_STATE_CANDIDATE);
  ASSERT(voteGrantedMajority(pSyncNode->pVotesGranted));
2103
  syncNodeBecomeLeader(pSyncNode, "candidate to leader");
M
Minghao Li 已提交
2104

M
Minghao Li 已提交
2105 2106
  syncNodeLog2("==state change syncNodeCandidate2Leader==", pSyncNode);

M
Minghao Li 已提交
2107
  // Raft 3.6.2 Committing entries from previous terms
2108
  syncNodeAppendNoop(pSyncNode);
2109
#if 0  // simon
2110
  syncNodeReplicate(pSyncNode);
2111
#endif
2112
  syncMaybeAdvanceCommitIndex(pSyncNode);
M
Minghao Li 已提交
2113 2114 2115
}

void syncNodeFollower2Candidate(SSyncNode* pSyncNode) {
M
Minghao Li 已提交
2116
  ASSERT(pSyncNode->state == TAOS_SYNC_STATE_FOLLOWER);
M
Minghao Li 已提交
2117
  pSyncNode->state = TAOS_SYNC_STATE_CANDIDATE;
M
Minghao Li 已提交
2118

M
Minghao Li 已提交
2119
  syncNodeEventLog(pSyncNode, "follower to candidate");
M
Minghao Li 已提交
2120 2121 2122
}

void syncNodeLeader2Follower(SSyncNode* pSyncNode) {
M
Minghao Li 已提交
2123
  ASSERT(pSyncNode->state == TAOS_SYNC_STATE_LEADER);
2124
  syncNodeBecomeFollower(pSyncNode, "leader to follower");
M
Minghao Li 已提交
2125

M
Minghao Li 已提交
2126
  syncNodeEventLog(pSyncNode, "leader to follower");
M
Minghao Li 已提交
2127 2128 2129
}

void syncNodeCandidate2Follower(SSyncNode* pSyncNode) {
M
Minghao Li 已提交
2130
  ASSERT(pSyncNode->state == TAOS_SYNC_STATE_CANDIDATE);
2131
  syncNodeBecomeFollower(pSyncNode, "candidate to follower");
M
Minghao Li 已提交
2132

M
Minghao Li 已提交
2133
  syncNodeEventLog(pSyncNode, "candidate to follower");
M
Minghao Li 已提交
2134 2135 2136
}

// raft vote --------------
M
Minghao Li 已提交
2137 2138 2139

// just called by syncNodeVoteForSelf
// need assert
M
Minghao Li 已提交
2140
void syncNodeVoteForTerm(SSyncNode* pSyncNode, SyncTerm term, SRaftId* pRaftId) {
M
Minghao Li 已提交
2141 2142
  ASSERT(term == pSyncNode->pRaftStore->currentTerm);
  ASSERT(!raftStoreHasVoted(pSyncNode->pRaftStore));
M
Minghao Li 已提交
2143 2144 2145 2146

  raftStoreVote(pSyncNode->pRaftStore, pRaftId);
}

M
Minghao Li 已提交
2147
// simulate get vote from outside
M
Minghao Li 已提交
2148 2149 2150
void syncNodeVoteForSelf(SSyncNode* pSyncNode) {
  syncNodeVoteForTerm(pSyncNode, pSyncNode->pRaftStore->currentTerm, &(pSyncNode->myRaftId));

M
Minghao Li 已提交
2151
  SyncRequestVoteReply* pMsg = syncRequestVoteReplyBuild(pSyncNode->vgId);
M
Minghao Li 已提交
2152 2153 2154 2155 2156 2157 2158 2159 2160 2161
  pMsg->srcId = pSyncNode->myRaftId;
  pMsg->destId = pSyncNode->myRaftId;
  pMsg->term = pSyncNode->pRaftStore->currentTerm;
  pMsg->voteGranted = true;

  voteGrantedVote(pSyncNode->pVotesGranted, pMsg);
  votesRespondAdd(pSyncNode->pVotesRespond, pMsg);
  syncRequestVoteReplyDestroy(pMsg);
}

M
Minghao Li 已提交
2162
// snapshot --------------
M
Minghao Li 已提交
2163 2164

// return if has a snapshot
M
Minghao Li 已提交
2165 2166
bool syncNodeHasSnapshot(SSyncNode* pSyncNode) {
  bool      ret = false;
2167
  SSnapshot snapshot = {.data = NULL, .lastApplyIndex = -1, .lastApplyTerm = 0, .lastConfigIndex = -1};
2168 2169
  if (pSyncNode->pFsm->FpGetSnapshotInfo != NULL) {
    pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot);
M
Minghao Li 已提交
2170 2171 2172 2173 2174 2175 2176
    if (snapshot.lastApplyIndex >= SYNC_INDEX_BEGIN) {
      ret = true;
    }
  }
  return ret;
}

M
Minghao Li 已提交
2177 2178
// return max(logLastIndex, snapshotLastIndex)
// if no snapshot and log, return -1
M
Minghao Li 已提交
2179
SyncIndex syncNodeGetLastIndex(SSyncNode* pSyncNode) {
M
Minghao Li 已提交
2180
  SSnapshot snapshot = {.data = NULL, .lastApplyIndex = -1, .lastApplyTerm = 0, .lastConfigIndex = -1};
2181 2182
  if (pSyncNode->pFsm->FpGetSnapshotInfo != NULL) {
    pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot);
M
Minghao Li 已提交
2183 2184 2185 2186 2187 2188 2189
  }
  SyncIndex logLastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);

  SyncIndex lastIndex = logLastIndex > snapshot.lastApplyIndex ? logLastIndex : snapshot.lastApplyIndex;
  return lastIndex;
}

M
Minghao Li 已提交
2190 2191
// return the last term of snapshot and log
// if error, return SYNC_TERM_INVALID (by syncLogLastTerm)
M
Minghao Li 已提交
2192 2193
SyncTerm syncNodeGetLastTerm(SSyncNode* pSyncNode) {
  SyncTerm lastTerm = 0;
M
Minghao Li 已提交
2194 2195
  if (syncNodeHasSnapshot(pSyncNode)) {
    // has snapshot
2196
    SSnapshot snapshot = {.data = NULL, .lastApplyIndex = -1, .lastApplyTerm = 0, .lastConfigIndex = -1};
2197 2198
    if (pSyncNode->pFsm->FpGetSnapshotInfo != NULL) {
      pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot);
M
Minghao Li 已提交
2199 2200
    }

M
Minghao Li 已提交
2201 2202 2203
    SyncIndex logLastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);
    if (logLastIndex > snapshot.lastApplyIndex) {
      lastTerm = pSyncNode->pLogStore->syncLogLastTerm(pSyncNode->pLogStore);
M
Minghao Li 已提交
2204 2205 2206 2207
    } else {
      lastTerm = snapshot.lastApplyTerm;
    }

M
Minghao Li 已提交
2208
  } else {
M
Minghao Li 已提交
2209 2210
    // no snapshot
    lastTerm = pSyncNode->pLogStore->syncLogLastTerm(pSyncNode->pLogStore);
2211
  }
M
Minghao Li 已提交
2212

M
Minghao Li 已提交
2213 2214 2215 2216 2217 2218 2219
  return lastTerm;
}

// get last index and term along with snapshot
int32_t syncNodeGetLastIndexTerm(SSyncNode* pSyncNode, SyncIndex* pLastIndex, SyncTerm* pLastTerm) {
  *pLastIndex = syncNodeGetLastIndex(pSyncNode);
  *pLastTerm = syncNodeGetLastTerm(pSyncNode);
2220 2221
  return 0;
}
M
Minghao Li 已提交
2222

M
Minghao Li 已提交
2223
// return append-entries first try index
M
Minghao Li 已提交
2224 2225 2226 2227 2228
SyncIndex syncNodeSyncStartIndex(SSyncNode* pSyncNode) {
  SyncIndex syncStartIndex = syncNodeGetLastIndex(pSyncNode) + 1;
  return syncStartIndex;
}

M
Minghao Li 已提交
2229 2230
// if index > 0, return index - 1
// else, return -1
2231 2232 2233 2234 2235 2236 2237 2238 2239
SyncIndex syncNodeGetPreIndex(SSyncNode* pSyncNode, SyncIndex index) {
  SyncIndex preIndex = index - 1;
  if (preIndex < SYNC_INDEX_INVALID) {
    preIndex = SYNC_INDEX_INVALID;
  }

  return preIndex;
}

M
Minghao Li 已提交
2240 2241 2242 2243
// if index < 0, return SYNC_TERM_INVALID
// if index == 0, return 0
// if index > 0, return preTerm
// if error, return SYNC_TERM_INVALID
2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256
SyncTerm syncNodeGetPreTerm(SSyncNode* pSyncNode, SyncIndex index) {
  if (index < SYNC_INDEX_BEGIN) {
    return SYNC_TERM_INVALID;
  }

  if (index == SYNC_INDEX_BEGIN) {
    return 0;
  }

  SyncTerm        preTerm = 0;
  SyncIndex       preIndex = index - 1;
  SSyncRaftEntry* pPreEntry = NULL;
  int32_t         code = pSyncNode->pLogStore->syncLogGetEntry(pSyncNode->pLogStore, preIndex, &pPreEntry);
M
Minghao Li 已提交
2257 2258 2259 2260 2261 2262

  SSnapshot snapshot = {.data = NULL,
                        .lastApplyIndex = SYNC_INDEX_INVALID,
                        .lastApplyTerm = SYNC_TERM_INVALID,
                        .lastConfigIndex = SYNC_INDEX_INVALID};

2263 2264 2265 2266 2267 2268
  if (code == 0) {
    ASSERT(pPreEntry != NULL);
    preTerm = pPreEntry->term;
    taosMemoryFree(pPreEntry);
    return preTerm;
  } else {
2269 2270 2271 2272
    if (pSyncNode->pFsm->FpGetSnapshotInfo != NULL) {
      pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot);
      if (snapshot.lastApplyIndex == preIndex) {
        return snapshot.lastApplyTerm;
2273 2274 2275 2276
      }
    }
  }

2277 2278
  do {
    char logBuf[128];
S
Shengliang Guan 已提交
2279 2280
    snprintf(logBuf, sizeof(logBuf),
             "sync node get pre term error, index:%" PRId64 ", snap-index:%" PRId64 ", snap-term:%" PRIu64, index,
M
Minghao Li 已提交
2281
             snapshot.lastApplyIndex, snapshot.lastApplyTerm);
2282 2283 2284
    syncNodeErrorLog(pSyncNode, logBuf);
  } while (0);

2285 2286
  return SYNC_TERM_INVALID;
}
M
Minghao Li 已提交
2287 2288 2289 2290

// get pre index and term of "index"
int32_t syncNodeGetPreIndexTerm(SSyncNode* pSyncNode, SyncIndex index, SyncIndex* pPreIndex, SyncTerm* pPreTerm) {
  *pPreIndex = syncNodeGetPreIndex(pSyncNode, index);
M
Minghao Li 已提交
2291
  *pPreTerm = syncNodeGetPreTerm(pSyncNode, index);
M
Minghao Li 已提交
2292 2293 2294
  return 0;
}

M
Minghao Li 已提交
2295 2296 2297
// for debug --------------
void syncNodePrint(SSyncNode* pObj) {
  char* serialized = syncNode2Str(pObj);
S
Shengliang Guan 已提交
2298
  printf("syncNodePrint | len:%" PRIu64 " | %s \n", strlen(serialized), serialized);
M
Minghao Li 已提交
2299
  fflush(NULL);
wafwerar's avatar
wafwerar 已提交
2300
  taosMemoryFree(serialized);
M
Minghao Li 已提交
2301 2302 2303 2304
}

void syncNodePrint2(char* s, SSyncNode* pObj) {
  char* serialized = syncNode2Str(pObj);
S
Shengliang Guan 已提交
2305
  printf("syncNodePrint2 | len:%" PRIu64 " | %s | %s \n", strlen(serialized), s, serialized);
M
Minghao Li 已提交
2306
  fflush(NULL);
wafwerar's avatar
wafwerar 已提交
2307
  taosMemoryFree(serialized);
M
Minghao Li 已提交
2308 2309 2310 2311
}

void syncNodeLog(SSyncNode* pObj) {
  char* serialized = syncNode2Str(pObj);
S
Shengliang Guan 已提交
2312
  sTraceLong("syncNodeLog | len:%" PRIu64 " | %s", strlen(serialized), serialized);
wafwerar's avatar
wafwerar 已提交
2313
  taosMemoryFree(serialized);
M
Minghao Li 已提交
2314 2315 2316
}

void syncNodeLog2(char* s, SSyncNode* pObj) {
2317 2318
  if (gRaftDetailLog) {
    char* serialized = syncNode2Str(pObj);
S
Shengliang Guan 已提交
2319
    sTraceLong("syncNodeLog2 | len:%" PRIu64 " | %s | %s", strlen(serialized), s, serialized);
2320 2321
    taosMemoryFree(serialized);
  }
M
Minghao Li 已提交
2322 2323
}

M
Minghao Li 已提交
2324 2325
void syncNodeLog3(char* s, SSyncNode* pObj) {
  char* serialized = syncNode2Str(pObj);
S
Shengliang Guan 已提交
2326
  sTraceLong("syncNodeLog3 | len:%" PRIu64 " | %s | %s", strlen(serialized), s, serialized);
M
Minghao Li 已提交
2327 2328 2329
  taosMemoryFree(serialized);
}

M
Minghao Li 已提交
2330
// ------ local funciton ---------
M
Minghao Li 已提交
2331
// enqueue message ----
M
Minghao Li 已提交
2332 2333
static void syncNodeEqPingTimer(void* param, void* tmrId) {
  SSyncNode* pSyncNode = (SSyncNode*)param;
M
Minghao Li 已提交
2334
  if (atomic_load_64(&pSyncNode->pingTimerLogicClockUser) <= atomic_load_64(&pSyncNode->pingTimerLogicClock)) {
M
Minghao Li 已提交
2335
    SyncTimeout* pSyncMsg = syncTimeoutBuild2(SYNC_TIMEOUT_PING, atomic_load_64(&pSyncNode->pingTimerLogicClock),
M
Minghao Li 已提交
2336
                                              pSyncNode->pingTimerMS, pSyncNode->vgId, pSyncNode);
M
Minghao Li 已提交
2337 2338
    SRpcMsg      rpcMsg;
    syncTimeout2RpcMsg(pSyncMsg, &rpcMsg);
M
Minghao Li 已提交
2339
    syncRpcMsgLog2((char*)"==syncNodeEqPingTimer==", &rpcMsg);
M
Minghao Li 已提交
2340
    if (pSyncNode->FpEqMsg != NULL) {
2341 2342
      int32_t code = pSyncNode->FpEqMsg(pSyncNode->msgcb, &rpcMsg);
      if (code != 0) {
S
Shengliang Guan 已提交
2343
        sError("vgId:%d, sync enqueue ping msg error, code:%d", pSyncNode->vgId, code);
2344 2345 2346 2347
        rpcFreeCont(rpcMsg.pCont);
        syncTimeoutDestroy(pSyncMsg);
        return;
      }
M
Minghao Li 已提交
2348 2349 2350
    } else {
      sTrace("syncNodeEqPingTimer pSyncNode->FpEqMsg is NULL");
    }
M
Minghao Li 已提交
2351 2352
    syncTimeoutDestroy(pSyncMsg);

2353 2354 2355 2356 2357 2358 2359
    if (syncEnvIsStart()) {
      taosTmrReset(syncNodeEqPingTimer, pSyncNode->pingTimerMS, pSyncNode, gSyncEnv->pTimerManager,
                   &pSyncNode->pPingTimer);
    } else {
      sError("sync env is stop, syncNodeEqPingTimer");
    }

M
Minghao Li 已提交
2360
  } else {
S
Shengliang Guan 已提交
2361
    sTrace("==syncNodeEqPingTimer== pingTimerLogicClock:%" PRIu64 ", pingTimerLogicClockUser:%" PRIu64,
M
Minghao Li 已提交
2362
           pSyncNode->pingTimerLogicClock, pSyncNode->pingTimerLogicClockUser);
M
Minghao Li 已提交
2363 2364 2365 2366 2367 2368 2369
  }
}

static void syncNodeEqElectTimer(void* param, void* tmrId) {
  SSyncNode* pSyncNode = (SSyncNode*)param;
  if (atomic_load_64(&pSyncNode->electTimerLogicClockUser) <= atomic_load_64(&pSyncNode->electTimerLogicClock)) {
    SyncTimeout* pSyncMsg = syncTimeoutBuild2(SYNC_TIMEOUT_ELECTION, atomic_load_64(&pSyncNode->electTimerLogicClock),
M
Minghao Li 已提交
2370
                                              pSyncNode->electTimerMS, pSyncNode->vgId, pSyncNode);
M
Minghao Li 已提交
2371
    SRpcMsg      rpcMsg;
M
Minghao Li 已提交
2372
    syncTimeout2RpcMsg(pSyncMsg, &rpcMsg);
M
Minghao Li 已提交
2373
    syncRpcMsgLog2((char*)"==syncNodeEqElectTimer==", &rpcMsg);
M
Minghao Li 已提交
2374
    if (pSyncNode->FpEqMsg != NULL) {
2375 2376
      int32_t code = pSyncNode->FpEqMsg(pSyncNode->msgcb, &rpcMsg);
      if (code != 0) {
S
Shengliang Guan 已提交
2377
        sError("vgId:%d, sync enqueue elect msg error, code:%d", pSyncNode->vgId, code);
2378 2379 2380 2381
        rpcFreeCont(rpcMsg.pCont);
        syncTimeoutDestroy(pSyncMsg);
        return;
      }
M
Minghao Li 已提交
2382
    } else {
2383
      sTrace("syncNodeEqElectTimer FpEqMsg is NULL");
M
Minghao Li 已提交
2384
    }
M
Minghao Li 已提交
2385 2386
    syncTimeoutDestroy(pSyncMsg);

M
Minghao Li 已提交
2387
    // reset timer ms
2388
    if (syncEnvIsStart()) {
2389 2390 2391 2392
      pSyncNode->electTimerMS = syncUtilElectRandomMS(pSyncNode->electBaseLine, 2 * pSyncNode->electBaseLine);
      taosTmrReset(syncNodeEqElectTimer, pSyncNode->electTimerMS, pSyncNode, gSyncEnv->pTimerManager,
                   &pSyncNode->pElectTimer);
    } else {
2393
      sError("sync env is stop, syncNodeEqElectTimer");
2394
    }
M
Minghao Li 已提交
2395
  } else {
S
Shengliang Guan 已提交
2396
    sTrace("==syncNodeEqElectTimer== electTimerLogicClock:%" PRIu64 ", electTimerLogicClockUser:%" PRIu64,
M
Minghao Li 已提交
2397
           pSyncNode->electTimerLogicClock, pSyncNode->electTimerLogicClockUser);
M
Minghao Li 已提交
2398 2399 2400
  }
}

M
Minghao Li 已提交
2401 2402
static void syncNodeEqHeartbeatTimer(void* param, void* tmrId) {
  SSyncNode* pSyncNode = (SSyncNode*)param;
2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414
  if (pSyncNode->replicaNum > 1) {
    if (atomic_load_64(&pSyncNode->heartbeatTimerLogicClockUser) <=
        atomic_load_64(&pSyncNode->heartbeatTimerLogicClock)) {
      SyncTimeout* pSyncMsg =
          syncTimeoutBuild2(SYNC_TIMEOUT_HEARTBEAT, atomic_load_64(&pSyncNode->heartbeatTimerLogicClock),
                            pSyncNode->heartbeatTimerMS, pSyncNode->vgId, pSyncNode);
      SRpcMsg rpcMsg;
      syncTimeout2RpcMsg(pSyncMsg, &rpcMsg);
      syncRpcMsgLog2((char*)"==syncNodeEqHeartbeatTimer==", &rpcMsg);
      if (pSyncNode->FpEqMsg != NULL) {
        int32_t code = pSyncNode->FpEqMsg(pSyncNode->msgcb, &rpcMsg);
        if (code != 0) {
S
Shengliang Guan 已提交
2415
          sError("vgId:%d, sync enqueue timer msg error, code:%d", pSyncNode->vgId, code);
2416 2417 2418 2419 2420 2421
          rpcFreeCont(rpcMsg.pCont);
          syncTimeoutDestroy(pSyncMsg);
          return;
        }
      } else {
        sError("syncNodeEqHeartbeatTimer FpEqMsg is NULL");
2422
      }
2423
      syncTimeoutDestroy(pSyncMsg);
M
Minghao Li 已提交
2424

2425 2426 2427 2428 2429 2430
      if (syncEnvIsStart()) {
        taosTmrReset(syncNodeEqHeartbeatTimer, pSyncNode->heartbeatTimerMS, pSyncNode, gSyncEnv->pTimerManager,
                     &pSyncNode->pHeartbeatTimer);
      } else {
        sError("sync env is stop, syncNodeEqHeartbeatTimer");
      }
2431
    } else {
2432 2433 2434
      sTrace("==syncNodeEqHeartbeatTimer== heartbeatTimerLogicClock:%" PRIu64 ", heartbeatTimerLogicClockUser:%" PRIu64
             "",
             pSyncNode->heartbeatTimerLogicClock, pSyncNode->heartbeatTimerLogicClockUser);
2435
    }
M
Minghao Li 已提交
2436 2437 2438
  }
}

M
Minghao Li 已提交
2439 2440
static int32_t syncNodeEqNoop(SSyncNode* ths) {
  int32_t ret = 0;
M
Minghao Li 已提交
2441
  ASSERT(ths->state == TAOS_SYNC_STATE_LEADER);
M
Minghao Li 已提交
2442

2443
  SyncIndex       index = ths->pLogStore->syncLogWriteIndex(ths->pLogStore);
M
Minghao Li 已提交
2444
  SyncTerm        term = ths->pRaftStore->currentTerm;
M
Minghao Li 已提交
2445
  SSyncRaftEntry* pEntry = syncEntryBuildNoop(term, index, ths->vgId);
M
Minghao Li 已提交
2446
  ASSERT(pEntry != NULL);
M
Minghao Li 已提交
2447 2448 2449 2450

  uint32_t           entryLen;
  char*              serialized = syncEntrySerialize(pEntry, &entryLen);
  SyncClientRequest* pSyncMsg = syncClientRequestBuild(entryLen);
M
Minghao Li 已提交
2451
  ASSERT(pSyncMsg->dataLen == entryLen);
M
Minghao Li 已提交
2452 2453
  memcpy(pSyncMsg->data, serialized, entryLen);

S
Shengliang Guan 已提交
2454
  SRpcMsg rpcMsg = {0};
M
Minghao Li 已提交
2455
  syncClientRequest2RpcMsg(pSyncMsg, &rpcMsg);
M
Minghao Li 已提交
2456
  if (ths->FpEqMsg != NULL) {
S
Shengliang Guan 已提交
2457
    ths->FpEqMsg(ths->msgcb, &rpcMsg);
M
Minghao Li 已提交
2458 2459 2460
  } else {
    sTrace("syncNodeEqNoop pSyncNode->FpEqMsg is NULL");
  }
M
Minghao Li 已提交
2461

wafwerar's avatar
wafwerar 已提交
2462
  taosMemoryFree(serialized);
M
Minghao Li 已提交
2463 2464 2465 2466 2467 2468 2469 2470
  syncClientRequestDestroy(pSyncMsg);

  return ret;
}

static int32_t syncNodeAppendNoop(SSyncNode* ths) {
  int32_t ret = 0;

2471
  SyncIndex       index = ths->pLogStore->syncLogWriteIndex(ths->pLogStore);
M
Minghao Li 已提交
2472
  SyncTerm        term = ths->pRaftStore->currentTerm;
M
Minghao Li 已提交
2473
  SSyncRaftEntry* pEntry = syncEntryBuildNoop(term, index, ths->vgId);
M
Minghao Li 已提交
2474
  ASSERT(pEntry != NULL);
M
Minghao Li 已提交
2475 2476

  if (ths->state == TAOS_SYNC_STATE_LEADER) {
2477 2478
    int32_t code = ths->pLogStore->syncLogAppendEntry(ths->pLogStore, pEntry);
    ASSERT(code == 0);
M
Minghao Li 已提交
2479 2480 2481
    syncNodeReplicate(ths);
  }

M
Minghao Li 已提交
2482
  syncEntryDestory(pEntry);
M
Minghao Li 已提交
2483 2484 2485
  return ret;
}

M
Minghao Li 已提交
2486
// on message ----
M
Minghao Li 已提交
2487 2488
int32_t syncNodeOnPingCb(SSyncNode* ths, SyncPing* pMsg) {
  // log state
2489
  char logBuf[1024] = {0};
M
Minghao Li 已提交
2490
  snprintf(logBuf, sizeof(logBuf),
2491 2492
           "==syncNodeOnPingCb== vgId:%d, state: %d, %s, term:%" PRIu64 " electTimerLogicClock:%" PRIu64
           ", "
S
Shengliang Guan 已提交
2493
           "electTimerLogicClockUser:%" PRIu64 ", electTimerMS:%d",
M
Minghao Li 已提交
2494 2495 2496
           ths->vgId, ths->state, syncUtilState2String(ths->state), ths->pRaftStore->currentTerm,
           ths->electTimerLogicClock, ths->electTimerLogicClockUser, ths->electTimerMS);

M
Minghao Li 已提交
2497
  int32_t ret = 0;
M
Minghao Li 已提交
2498 2499
  syncPingLog2(logBuf, pMsg);
  SyncPingReply* pMsgReply = syncPingReplyBuild3(&ths->myRaftId, &pMsg->srcId, ths->vgId);
M
Minghao Li 已提交
2500 2501
  SRpcMsg        rpcMsg;
  syncPingReply2RpcMsg(pMsgReply, &rpcMsg);
M
Minghao Li 已提交
2502 2503 2504 2505 2506 2507 2508 2509

  /*
    // htonl
    SMsgHead* pHead = rpcMsg.pCont;
    pHead->contLen = htonl(pHead->contLen);
    pHead->vgId = htonl(pHead->vgId);
  */

M
Minghao Li 已提交
2510 2511 2512 2513 2514
  syncNodeSendMsgById(&pMsgReply->destId, ths, &rpcMsg);

  return ret;
}

M
Minghao Li 已提交
2515
int32_t syncNodeOnPingReplyCb(SSyncNode* ths, SyncPingReply* pMsg) {
M
Minghao Li 已提交
2516 2517 2518 2519
  int32_t ret = 0;
  syncPingReplyLog2("==syncNodeOnPingReplyCb==", pMsg);
  return ret;
}
M
Minghao Li 已提交
2520

M
Minghao Li 已提交
2521 2522 2523 2524 2525 2526 2527 2528 2529 2530
// TLA+ Spec
// ClientRequest(i, v) ==
//     /\ state[i] = Leader
//     /\ LET entry == [term  |-> currentTerm[i],
//                      value |-> v]
//            newLog == Append(log[i], entry)
//        IN  log' = [log EXCEPT ![i] = newLog]
//     /\ UNCHANGED <<messages, serverVars, candidateVars,
//                    leaderVars, commitIndex>>
//
2531
int32_t syncNodeOnClientRequestCb(SSyncNode* ths, SyncClientRequest* pMsg, SyncIndex* pRetIndex) {
M
Minghao Li 已提交
2532
  int32_t ret = 0;
2533
  int32_t code = 0;
M
Minghao Li 已提交
2534 2535
  syncClientRequestLog2("==syncNodeOnClientRequestCb==", pMsg);

M
Minghao Li 已提交
2536
  SyncIndex       index = ths->pLogStore->syncLogWriteIndex(ths->pLogStore);
M
Minghao Li 已提交
2537 2538
  SyncTerm        term = ths->pRaftStore->currentTerm;
  SSyncRaftEntry* pEntry = syncEntryBuild2((SyncClientRequest*)pMsg, term, index);
M
Minghao Li 已提交
2539
  ASSERT(pEntry != NULL);
M
Minghao Li 已提交
2540

M
Minghao Li 已提交
2541
  if (ths->state == TAOS_SYNC_STATE_LEADER) {
2542 2543 2544 2545 2546 2547 2548
    // append entry
    code = ths->pLogStore->syncLogAppendEntry(ths->pLogStore, pEntry);
    if (code != 0) {
      // del resp mgr, call FpCommitCb
      ASSERT(0);
      return -1;
    }
M
Minghao Li 已提交
2549

2550 2551 2552
    // if mulit replica, start replicate right now
    if (ths->replicaNum > 1) {
      syncNodeReplicate(ths);
M
Minghao Li 已提交
2553

2554 2555 2556
      // pre commit
      syncNodePreCommit(ths, pEntry, 0);
    }
2557

2558 2559 2560 2561
    // if only myself, maybe commit right now
    if (ths->replicaNum == 1) {
      syncMaybeAdvanceCommitIndex(ths);
    }
M
Minghao Li 已提交
2562 2563
  }

2564 2565 2566 2567 2568 2569 2570 2571
  if (pRetIndex != NULL) {
    if (ret == 0 && pEntry != NULL) {
      *pRetIndex = pEntry->index;
    } else {
      *pRetIndex = SYNC_INDEX_INVALID;
    }
  }

M
Minghao Li 已提交
2572
  syncEntryDestory(pEntry);
M
Minghao Li 已提交
2573
  return ret;
2574
}
M
Minghao Li 已提交
2575

M
Minghao Li 已提交
2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608
int32_t syncNodeOnClientRequestBatchCb(SSyncNode* ths, SyncClientRequestBatch* pMsg) {
  int32_t code = 0;

  if (ths->state != TAOS_SYNC_STATE_LEADER) {
    // call FpCommitCb, delete resp mgr
    return -1;
  }

  SyncIndex index = ths->pLogStore->syncLogWriteIndex(ths->pLogStore);
  SyncTerm  term = ths->pRaftStore->currentTerm;

  int32_t    raftMetaArrayLen = sizeof(SRaftMeta) * pMsg->dataCount;
  int32_t    rpcArrayLen = sizeof(SRpcMsg) * pMsg->dataCount;
  SRaftMeta* raftMetaArr = (SRaftMeta*)(pMsg->data);
  SRpcMsg*   msgArr = (SRpcMsg*)((char*)(pMsg->data) + raftMetaArrayLen);
  for (int32_t i = 0; i < pMsg->dataCount; ++i) {
    SSyncRaftEntry* pEntry = syncEntryBuild(msgArr[i].contLen);
    ASSERT(pEntry != NULL);

    pEntry->originalRpcType = msgArr[i].msgType;
    pEntry->seqNum = raftMetaArr[i].seqNum;
    pEntry->isWeak = raftMetaArr[i].isWeak;
    pEntry->term = term;
    pEntry->index = index;
    memcpy(pEntry->data, msgArr[i].pCont, msgArr[i].contLen);
    ASSERT(msgArr[i].contLen == pEntry->dataLen);

    code = ths->pLogStore->syncLogAppendEntry(ths->pLogStore, pEntry);
    if (code != 0) {
      // del resp mgr, call FpCommitCb
      ASSERT(0);
      return -1;
    }
M
Minghao Li 已提交
2609 2610 2611

    // update rpc msg conn apply.index
    msgArr[i].info.conn.applyIndex = pEntry->index;
M
Minghao Li 已提交
2612 2613 2614 2615 2616 2617 2618
  }

  // fsync once
  SSyncLogStoreData* pData = ths->pLogStore->data;
  SWal*              pWal = pData->pWal;
  walFsync(pWal, true);

M
Minghao Li 已提交
2619
  if (ths->replicaNum > 1) {
M
Minghao Li 已提交
2620
    // if multi replica, start replicate right now
M
Minghao Li 已提交
2621 2622 2623 2624 2625 2626 2627
    syncNodeReplicate(ths);

  } else if (ths->replicaNum == 1) {
    // one replica
    syncMaybeAdvanceCommitIndex(ths);
  }

M
Minghao Li 已提交
2628 2629 2630
  return 0;
}

M
Minghao Li 已提交
2631 2632
static void syncFreeNode(void* param) {
  SSyncNode* pNode = param;
M
Minghao Li 已提交
2633 2634
  // inner object already free
  // syncNodePrint2((char*)"==syncFreeNode==", pNode);
M
Minghao Li 已提交
2635

wafwerar's avatar
wafwerar 已提交
2636
  taosMemoryFree(pNode);
M
Minghao Li 已提交
2637
}
S
Shengliang Guan 已提交
2638 2639 2640 2641

const char* syncStr(ESyncState state) {
  switch (state) {
    case TAOS_SYNC_STATE_FOLLOWER:
2642
      return "follower";
S
Shengliang Guan 已提交
2643
    case TAOS_SYNC_STATE_CANDIDATE:
2644
      return "candidate";
S
Shengliang Guan 已提交
2645
    case TAOS_SYNC_STATE_LEADER:
2646
      return "leader";
S
Shengliang Guan 已提交
2647
    default:
2648
      return "error";
S
Shengliang Guan 已提交
2649
  }
M
Minghao Li 已提交
2650
}
2651

2652
int32_t syncDoLeaderTransfer(SSyncNode* ths, SRpcMsg* pRpcMsg, SSyncRaftEntry* pEntry) {
M
Minghao Li 已提交
2653 2654
  SyncLeaderTransfer* pSyncLeaderTransfer = syncLeaderTransferFromRpcMsg2(pRpcMsg);

2655 2656 2657 2658
  if (ths->state != TAOS_SYNC_STATE_FOLLOWER) {
    syncNodeEventLog(ths, "I am not follower, can not do leader transfer");
    return 0;
  }
M
Minghao Li 已提交
2659
  syncNodeEventLog(ths, "do leader transfer");
M
Minghao Li 已提交
2660

M
Minghao Li 已提交
2661 2662 2663
  bool sameId = syncUtilSameId(&(pSyncLeaderTransfer->newLeaderId), &(ths->myRaftId));
  bool sameNodeInfo = strcmp(pSyncLeaderTransfer->newNodeInfo.nodeFqdn, ths->myNodeInfo.nodeFqdn) == 0 &&
                      pSyncLeaderTransfer->newNodeInfo.nodePort == ths->myNodeInfo.nodePort;
M
Minghao Li 已提交
2664

M
Minghao Li 已提交
2665 2666
  bool same = sameId || sameNodeInfo;
  if (same) {
M
Minghao Li 已提交
2667 2668 2669 2670
    // reset elect timer now!
    int32_t electMS = 1;
    int32_t ret = syncNodeRestartElectTimer(ths, electMS);
    ASSERT(ret == 0);
M
Minghao Li 已提交
2671 2672

    char eventLog[256];
S
Shengliang Guan 已提交
2673
    snprintf(eventLog, sizeof(eventLog), "maybe leader transfer to %s:%d %" PRIu64,
M
Minghao Li 已提交
2674 2675 2676
             pSyncLeaderTransfer->newNodeInfo.nodeFqdn, pSyncLeaderTransfer->newNodeInfo.nodePort,
             pSyncLeaderTransfer->newLeaderId.addr);
    syncNodeEventLog(ths, eventLog);
2677 2678
  }

M
Minghao Li 已提交
2679
  if (ths->pFsm->FpLeaderTransferCb != NULL) {
2680
    SFsmCbMeta cbMeta = {0};
M
Minghao Li 已提交
2681 2682 2683 2684
    cbMeta.code = 0;
    cbMeta.currentTerm = ths->pRaftStore->currentTerm;
    cbMeta.flag = 0;
    cbMeta.index = pEntry->index;
2685
    cbMeta.lastConfigIndex = syncNodeGetSnapshotConfigIndex(ths, cbMeta.index);
M
Minghao Li 已提交
2686 2687 2688 2689 2690
    cbMeta.isWeak = pEntry->isWeak;
    cbMeta.seqNum = pEntry->seqNum;
    cbMeta.state = ths->state;
    cbMeta.term = pEntry->term;
    ths->pFsm->FpLeaderTransferCb(ths->pFsm, pRpcMsg, cbMeta);
2691 2692
  }

M
Minghao Li 已提交
2693
  syncLeaderTransferDestroy(pSyncLeaderTransfer);
2694 2695 2696
  return 0;
}

2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711
int32_t syncNodeUpdateNewConfigIndex(SSyncNode* ths, SSyncCfg* pNewCfg) {
  for (int i = 0; i < pNewCfg->replicaNum; ++i) {
    SRaftId raftId;
    raftId.addr = syncUtilAddr2U64((pNewCfg->nodeInfo)[i].nodeFqdn, (pNewCfg->nodeInfo)[i].nodePort);
    raftId.vgId = ths->vgId;

    if (syncUtilSameId(&(ths->myRaftId), &raftId)) {
      pNewCfg->myIndex = i;
      return 0;
    }
  }

  return -1;
}

M
Minghao Li 已提交
2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736
static int32_t syncNodeConfigChangeFinish(SSyncNode* ths, SRpcMsg* pRpcMsg, SSyncRaftEntry* pEntry) {
  SyncReconfigFinish* pFinish = syncReconfigFinishFromRpcMsg2(pRpcMsg);
  ASSERT(pFinish);

  if (ths->pFsm->FpReConfigCb != NULL) {
    SReConfigCbMeta cbMeta = {0};
    cbMeta.code = 0;
    cbMeta.index = pEntry->index;
    cbMeta.term = pEntry->term;
    cbMeta.seqNum = pEntry->seqNum;
    cbMeta.lastConfigIndex = syncNodeGetSnapshotConfigIndex(ths, pEntry->index);
    cbMeta.state = ths->state;
    cbMeta.currentTerm = ths->pRaftStore->currentTerm;
    cbMeta.isWeak = pEntry->isWeak;
    cbMeta.flag = 0;

    cbMeta.oldCfg = pFinish->oldCfg;
    cbMeta.newCfg = pFinish->newCfg;
    cbMeta.newCfgIndex = pFinish->newCfgIndex;
    cbMeta.newCfgTerm = pFinish->newCfgTerm;
    cbMeta.newCfgSeqNum = pFinish->newCfgSeqNum;

    ths->pFsm->FpReConfigCb(ths->pFsm, pRpcMsg, cbMeta);
  }

2737
  // clear changing
M
Minghao Li 已提交
2738 2739 2740 2741 2742
  ths->changing = false;

  char  tmpbuf[512];
  char* oldStr = syncCfg2SimpleStr(&(pFinish->oldCfg));
  char* newStr = syncCfg2SimpleStr(&(pFinish->newCfg));
S
Shengliang Guan 已提交
2743
  snprintf(tmpbuf, sizeof(tmpbuf), "config change finish from %d to %d, index:%" PRId64 ", %s  -->  %s",
M
Minghao Li 已提交
2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755
           pFinish->oldCfg.replicaNum, pFinish->newCfg.replicaNum, pFinish->newCfgIndex, oldStr, newStr);
  taosMemoryFree(oldStr);
  taosMemoryFree(newStr);
  syncNodeEventLog(ths, tmpbuf);

  syncReconfigFinishDestroy(pFinish);

  return 0;
}

static int32_t syncNodeConfigChange(SSyncNode* ths, SRpcMsg* pRpcMsg, SSyncRaftEntry* pEntry,
                                    SyncReconfigFinish* pFinish) {
2756 2757 2758
  // set changing
  ths->changing = true;

M
Minghao Li 已提交
2759
  // old config
2760 2761
  SSyncCfg oldSyncCfg = ths->pRaftCfg->cfg;

M
Minghao Li 已提交
2762
  // new config
2763 2764 2765 2766 2767
  SSyncCfg newSyncCfg;
  int32_t  ret = syncCfgFromStr(pRpcMsg->pCont, &newSyncCfg);
  ASSERT(ret == 0);

  // update new config myIndex
2768 2769
  syncNodeUpdateNewConfigIndex(ths, &newSyncCfg);

M
Minghao Li 已提交
2770 2771
  // do config change
  syncNodeDoConfigChange(ths, &newSyncCfg, pEntry->index);
2772

M
Minghao Li 已提交
2773 2774 2775 2776 2777 2778
  // set pFinish
  pFinish->oldCfg = oldSyncCfg;
  pFinish->newCfg = newSyncCfg;
  pFinish->newCfgIndex = pEntry->index;
  pFinish->newCfgTerm = pEntry->term;
  pFinish->newCfgSeqNum = pEntry->seqNum;
2779

M
Minghao Li 已提交
2780 2781
  return 0;
}
2782

M
Minghao Li 已提交
2783 2784 2785 2786 2787 2788 2789 2790 2791
static int32_t syncNodeProposeConfigChangeFinish(SSyncNode* ths, SyncReconfigFinish* pFinish) {
  SRpcMsg rpcMsg;
  syncReconfigFinish2RpcMsg(pFinish, &rpcMsg);

  int32_t code = syncNodePropose(ths, &rpcMsg, false);
  if (code != 0) {
    sError("syncNodeProposeConfigChangeFinish error");
    ths->changing = false;
  }
2792 2793 2794
  return 0;
}

2795 2796 2797 2798
bool syncNodeIsOptimizedOneReplica(SSyncNode* ths, SRpcMsg* pMsg) {
  return (ths->replicaNum == 1 && syncUtilUserCommit(pMsg->msgType) && ths->vgId != 1);
}

2799
int32_t syncNodeCommit(SSyncNode* ths, SyncIndex beginIndex, SyncIndex endIndex, uint64_t flag) {
2800 2801
  int32_t    code = 0;
  ESyncState state = flag;
M
Minghao Li 已提交
2802 2803

  char eventLog[128];
S
Shengliang Guan 已提交
2804
  snprintf(eventLog, sizeof(eventLog), "commit wal from index:%" PRId64 " to index:%" PRId64, beginIndex, endIndex);
M
Minghao Li 已提交
2805
  syncNodeEventLog(ths, eventLog);
2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818

  // execute fsm
  if (ths->pFsm != NULL) {
    for (SyncIndex i = beginIndex; i <= endIndex; ++i) {
      if (i != SYNC_INDEX_INVALID) {
        SSyncRaftEntry* pEntry;
        code = ths->pLogStore->syncLogGetEntry(ths->pLogStore, i, &pEntry);
        ASSERT(code == 0);
        ASSERT(pEntry != NULL);

        SRpcMsg rpcMsg;
        syncEntry2OriginalRpc(pEntry, &rpcMsg);

2819
        // user commit
2820 2821
        if ((ths->pFsm->FpCommitCb != NULL) && syncUtilUserCommit(pEntry->originalRpcType)) {
          bool internalExecute = true;
S
Shengliang Guan 已提交
2822
          if ((ths->replicaNum == 1) && ths->restoreFinish && ths->vgId != 1) {
2823 2824 2825 2826 2827
            internalExecute = false;
          }

          do {
            char logBuf[128];
S
Shengliang Guan 已提交
2828
            snprintf(logBuf, sizeof(logBuf), "commit index:%" PRId64 ", internal:%d", i, internalExecute);
2829 2830
            syncNodeEventLog(ths, logBuf);
          } while (0);
2831

2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846
          // execute fsm in apply thread, or execute outside syncPropose
          if (internalExecute) {
            SFsmCbMeta cbMeta = {0};
            cbMeta.index = pEntry->index;
            cbMeta.lastConfigIndex = syncNodeGetSnapshotConfigIndex(ths, cbMeta.index);
            cbMeta.isWeak = pEntry->isWeak;
            cbMeta.code = 0;
            cbMeta.state = ths->state;
            cbMeta.seqNum = pEntry->seqNum;
            cbMeta.term = pEntry->term;
            cbMeta.currentTerm = ths->pRaftStore->currentTerm;
            cbMeta.flag = flag;

            ths->pFsm->FpCommitCb(ths->pFsm, &rpcMsg, cbMeta);
          }
2847 2848 2849
        }

        // config change
2850
        if (pEntry->originalRpcType == TDMT_SYNC_CONFIG_CHANGE) {
M
Minghao Li 已提交
2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865
          SyncReconfigFinish* pFinish = syncReconfigFinishBuild(ths->vgId);
          ASSERT(pFinish != NULL);

          code = syncNodeConfigChange(ths, &rpcMsg, pEntry, pFinish);
          ASSERT(code == 0);

          if (ths->state == TAOS_SYNC_STATE_LEADER) {
            syncNodeProposeConfigChangeFinish(ths, pFinish);
          }
          syncReconfigFinishDestroy(pFinish);
        }

        // config change finish
        if (pEntry->originalRpcType == TDMT_SYNC_CONFIG_CHANGE_FINISH) {
          code = syncNodeConfigChangeFinish(ths, &rpcMsg, pEntry);
2866 2867
          ASSERT(code == 0);
        }
2868

2869 2870
#if 0
        // execute in pre-commit
M
Minghao Li 已提交
2871
        // leader transfer
2872 2873 2874
        if (pEntry->originalRpcType == TDMT_SYNC_LEADER_TRANSFER) {
          code = syncDoLeaderTransfer(ths, &rpcMsg, pEntry);
          ASSERT(code == 0);
2875
        }
2876
#endif
2877 2878

        // restore finish
2879
        // if only snapshot, a noop entry will be append, so syncLogLastIndex is always ok
2880 2881 2882 2883 2884 2885
        if (pEntry->index == ths->pLogStore->syncLogLastIndex(ths->pLogStore)) {
          if (ths->restoreFinish == false) {
            if (ths->pFsm->FpRestoreFinishCb != NULL) {
              ths->pFsm->FpRestoreFinishCb(ths->pFsm);
            }
            ths->restoreFinish = true;
M
Minghao Li 已提交
2886 2887

            char eventLog[128];
S
Shengliang Guan 已提交
2888
            snprintf(eventLog, sizeof(eventLog), "restore finish, index:%" PRId64, pEntry->index);
M
Minghao Li 已提交
2889
            syncNodeEventLog(ths, eventLog);
2890 2891 2892 2893 2894 2895 2896 2897 2898
          }
        }

        rpcFreeCont(rpcMsg.pCont);
        syncEntryDestory(pEntry);
      }
    }
  }
  return 0;
2899 2900 2901 2902 2903 2904 2905 2906 2907
}

bool syncNodeInRaftGroup(SSyncNode* ths, SRaftId* pRaftId) {
  for (int i = 0; i < ths->replicaNum; ++i) {
    if (syncUtilSameId(&((ths->replicasId)[i]), pRaftId)) {
      return true;
    }
  }
  return false;
M
Minghao Li 已提交
2908 2909 2910 2911 2912 2913 2914 2915 2916 2917
}

SSyncSnapshotSender* syncNodeGetSnapshotSender(SSyncNode* ths, SRaftId* pDestId) {
  SSyncSnapshotSender* pSender = NULL;
  for (int i = 0; i < ths->replicaNum; ++i) {
    if (syncUtilSameId(pDestId, &((ths->replicasId)[i]))) {
      pSender = (ths->senders)[i];
    }
  }
  return pSender;
M
Minghao Li 已提交
2918
}
M
Minghao Li 已提交
2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942

bool syncNodeCanChange(SSyncNode* pSyncNode) {
  if (pSyncNode->changing) {
    sError("sync cannot change");
    return false;
  }

  if ((pSyncNode->commitIndex >= SYNC_INDEX_BEGIN)) {
    SyncIndex lastIndex = syncNodeGetLastIndex(pSyncNode);
    if (pSyncNode->commitIndex != lastIndex) {
      sError("sync cannot change2");
      return false;
    }
  }

  for (int i = 0; i < pSyncNode->peersNum; ++i) {
    SSyncSnapshotSender* pSender = syncNodeGetSnapshotSender(pSyncNode, &(pSyncNode->peersId)[i]);
    if (pSender->start) {
      sError("sync cannot change3");
      return false;
    }
  }

  return true;
M
Minghao Li 已提交
2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993
}

void syncLogSendRequestVote(SSyncNode* pSyncNode, const SyncRequestVote* pMsg, const char* s) {
  char     host[64];
  uint16_t port;
  syncUtilU642Addr(pMsg->destId.addr, host, sizeof(host), &port);
  char logBuf[256];
  snprintf(logBuf, sizeof(logBuf),
           "send sync-request-vote to %s:%d {term:%" PRIu64 ", lindex:%" PRId64 ", lterm:%" PRIu64 "}, %s", host, port,
           pMsg->term, pMsg->lastLogIndex, pMsg->lastLogTerm, s);
  syncNodeEventLog(pSyncNode, logBuf);
}

void syncLogRecvRequestVote(SSyncNode* pSyncNode, const SyncRequestVote* pMsg, const char* s) {
  char     logBuf[256];
  char     host[64];
  uint16_t port;
  syncUtilU642Addr(pMsg->srcId.addr, host, sizeof(host), &port);
  snprintf(logBuf, sizeof(logBuf),
           "recv sync-request-vote from %s:%d, {term:%" PRIu64 ", lindex:%" PRId64 ", lterm:%" PRIu64 "}, %s", host,
           port, pMsg->term, pMsg->lastLogIndex, pMsg->lastLogTerm, s);
  syncNodeEventLog(pSyncNode, logBuf);
}

void syncLogSendRequestVoteReply(SSyncNode* pSyncNode, const SyncRequestVoteReply* pMsg, const char* s) {
  char     host[64];
  uint16_t port;
  syncUtilU642Addr(pMsg->destId.addr, host, sizeof(host), &port);
  char logBuf[256];
  snprintf(logBuf, sizeof(logBuf), "send sync-request-vote-reply to %s:%d {term:%" PRIu64 ", grant:%d}, %s", host, port,
           pMsg->term, pMsg->voteGranted, s);
  syncNodeEventLog(pSyncNode, logBuf);
}

void syncLogRecvRequestVoteReply(SSyncNode* pSyncNode, const SyncRequestVoteReply* pMsg, const char* s) {
  char     host[64];
  uint16_t port;
  syncUtilU642Addr(pMsg->srcId.addr, host, sizeof(host), &port);
  char logBuf[256];
  snprintf(logBuf, sizeof(logBuf), "recv sync-request-vote-reply from %s:%d {term:%" PRIu64 ", grant:%d}, %s", host,
           port, pMsg->term, pMsg->voteGranted, s);
  syncNodeEventLog(pSyncNode, logBuf);
}

void syncLogSendAppendEntries(SSyncNode* pSyncNode, const SyncAppendEntries* pMsg, const char* s) {
  char     host[64];
  uint16_t port;
  syncUtilU642Addr(pMsg->destId.addr, host, sizeof(host), &port);
  char logBuf[256];
  snprintf(logBuf, sizeof(logBuf),
           "send sync-append-entries to %s:%d, {term:%" PRIu64 ", pre-index:%" PRId64 ", pre-term:%" PRIu64
M
Minghao Li 已提交
2994
           ", pterm:%" PRIu64 ", cmt:%" PRId64
M
Minghao Li 已提交
2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007
           ", "
           "datalen:%d}, %s",
           host, port, pMsg->term, pMsg->prevLogIndex, pMsg->prevLogTerm, pMsg->privateTerm, pMsg->commitIndex,
           pMsg->dataLen, s);
  syncNodeEventLog(pSyncNode, logBuf);
}

void syncLogRecvAppendEntries(SSyncNode* pSyncNode, const SyncAppendEntries* pMsg, const char* s) {
  char     host[64];
  uint16_t port;
  syncUtilU642Addr(pMsg->srcId.addr, host, sizeof(host), &port);
  char logBuf[256];
  snprintf(logBuf, sizeof(logBuf),
M
Minghao Li 已提交
3008
           "recv sync-append-entries from %s:%d {term:%" PRIu64 ", pre-index:%" PRIu64 ", pre-term:%" PRIu64
M
Minghao Li 已提交
3009
           ", cmt:%" PRIu64 ", pterm:%" PRIu64
M
Minghao Li 已提交
3010 3011 3012 3013
           ", "
           "datalen:%d}, %s",
           host, port, pMsg->term, pMsg->prevLogIndex, pMsg->prevLogTerm, pMsg->commitIndex, pMsg->privateTerm,
           pMsg->dataLen, s);
M
Minghao Li 已提交
3014
  syncNodeEventLog(pSyncNode, logBuf);
M
Minghao Li 已提交
3015 3016 3017 3018 3019 3020 3021 3022 3023
}

void syncLogSendAppendEntriesBatch(SSyncNode* pSyncNode, const SyncAppendEntriesBatch* pMsg, const char* s) {
  char     host[64];
  uint16_t port;
  syncUtilU642Addr(pMsg->destId.addr, host, sizeof(host), &port);
  char logBuf[256];
  snprintf(logBuf, sizeof(logBuf),
           "send sync-append-entries-batch to %s:%d, {term:%" PRIu64 ", pre-index:%" PRId64 ", pre-term:%" PRIu64
M
Minghao Li 已提交
3024
           ", pterm:%" PRIu64 ", cmt:%" PRId64 ", datalen:%d, count:%d}, %s",
M
Minghao Li 已提交
3025 3026
           host, port, pMsg->term, pMsg->prevLogIndex, pMsg->prevLogTerm, pMsg->privateTerm, pMsg->commitIndex,
           pMsg->dataLen, pMsg->dataCount, s);
M
Minghao Li 已提交
3027 3028 3029 3030 3031 3032 3033 3034 3035 3036
  syncNodeEventLog(pSyncNode, logBuf);
}

void syncLogRecvAppendEntriesBatch(SSyncNode* pSyncNode, const SyncAppendEntriesBatch* pMsg, const char* s) {
  char     host[64];
  uint16_t port;
  syncUtilU642Addr(pMsg->srcId.addr, host, sizeof(host), &port);
  char logBuf[256];
  snprintf(logBuf, sizeof(logBuf),
           "recv sync-append-entries-batch from %s:%d, {term:%" PRIu64 ", pre-index:%" PRId64 ", pre-term:%" PRIu64
M
Minghao Li 已提交
3037
           ", pterm:%" PRIu64 ", cmt:%" PRId64 ", datalen:%d, count:%d}, %s",
M
Minghao Li 已提交
3038 3039
           host, port, pMsg->term, pMsg->prevLogIndex, pMsg->prevLogTerm, pMsg->privateTerm, pMsg->commitIndex,
           pMsg->dataLen, pMsg->dataCount, s);
M
Minghao Li 已提交
3040
  syncNodeEventLog(pSyncNode, logBuf);
M
Minghao Li 已提交
3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060
}

void syncLogSendAppendEntriesReply(SSyncNode* pSyncNode, const SyncAppendEntriesReply* pMsg, const char* s) {
  char     host[64];
  uint16_t port;
  syncUtilU642Addr(pMsg->destId.addr, host, sizeof(host), &port);
  char logBuf[256];
  snprintf(logBuf, sizeof(logBuf),
           "send sync-append-entries-reply to %s:%d, {term:%" PRIu64 ", pterm:%" PRIu64 ", success:%d, match:%" PRId64
           "}, %s",
           host, port, pMsg->term, pMsg->privateTerm, pMsg->success, pMsg->matchIndex, s);
  syncNodeEventLog(pSyncNode, logBuf);
}

void syncLogRecvAppendEntriesReply(SSyncNode* pSyncNode, const SyncAppendEntriesReply* pMsg, const char* s) {
  char     host[64];
  uint16_t port;
  syncUtilU642Addr(pMsg->srcId.addr, host, sizeof(host), &port);
  char logBuf[256];
  snprintf(logBuf, sizeof(logBuf),
M
Minghao Li 已提交
3061
           "recv sync-append-entries-reply from %s:%d {term:%" PRIu64 ", pterm:%" PRIu64 ", success:%d, match:%" PRId64
M
Minghao Li 已提交
3062 3063
           "}, %s",
           host, port, pMsg->term, pMsg->privateTerm, pMsg->success, pMsg->matchIndex, s);
M
Minghao Li 已提交
3064
  syncNodeEventLog(pSyncNode, logBuf);
M
Minghao Li 已提交
3065
}