syncMain.c 86.7 KB
Newer Older
M
Minghao Li 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/*
 * Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
 *
 * This program is free software: you can use, redistribute, and/or modify
 * it under the terms of the GNU Affero General Public License, version 3
 * or later ("AGPL"), as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

S
Shengliang Guan 已提交
16
#define _DEFAULT_SOURCE
M
Minghao Li 已提交
17
#include "sync.h"
M
Minghao Li 已提交
18 19
#include "syncAppendEntries.h"
#include "syncAppendEntriesReply.h"
M
Minghao Li 已提交
20
#include "syncCommit.h"
M
Minghao Li 已提交
21
#include "syncElection.h"
M
Minghao Li 已提交
22
#include "syncEnv.h"
M
Minghao Li 已提交
23
#include "syncIndexMgr.h"
M
Minghao Li 已提交
24
#include "syncInt.h"
M
Minghao Li 已提交
25
#include "syncMessage.h"
26
#include "syncPipeline.h"
M
Minghao Li 已提交
27
#include "syncRaftCfg.h"
M
Minghao Li 已提交
28
#include "syncRaftLog.h"
M
Minghao Li 已提交
29
#include "syncRaftStore.h"
M
Minghao Li 已提交
30
#include "syncReplication.h"
M
Minghao Li 已提交
31 32
#include "syncRequestVote.h"
#include "syncRequestVoteReply.h"
M
Minghao Li 已提交
33
#include "syncRespMgr.h"
M
Minghao Li 已提交
34
#include "syncSnapshot.h"
M
Minghao Li 已提交
35
#include "syncTimeout.h"
M
Minghao Li 已提交
36
#include "syncUtil.h"
M
Minghao Li 已提交
37
#include "syncVoteMgr.h"
38
#include "tglobal.h"
M
Minghao Li 已提交
39
#include "tref.h"
M
Minghao Li 已提交
40

M
Minghao Li 已提交
41 42 43 44
static void    syncNodeEqPingTimer(void* param, void* tmrId);
static void    syncNodeEqElectTimer(void* param, void* tmrId);
static void    syncNodeEqHeartbeatTimer(void* param, void* tmrId);
static int32_t syncNodeAppendNoop(SSyncNode* ths);
45
static void    syncNodeEqPeerHeartbeatTimer(void* param, void* tmrId);
S
Shengliang Guan 已提交
46
static bool    syncIsConfigChanged(const SSyncCfg* pOldCfg, const SSyncCfg* pNewCfg);
S
Shengliang Guan 已提交
47 48 49
static int32_t syncHbTimerInit(SSyncNode* pSyncNode, SSyncTimer* pSyncTimer, SRaftId destId);
static int32_t syncHbTimerStart(SSyncNode* pSyncNode, SSyncTimer* pSyncTimer);
static int32_t syncHbTimerStop(SSyncNode* pSyncNode, SSyncTimer* pSyncTimer);
S
Shengliang Guan 已提交
50 51 52 53 54 55 56 57 58 59 60
static int32_t syncNodeUpdateNewConfigIndex(SSyncNode* ths, SSyncCfg* pNewCfg);
static bool    syncNodeInConfig(SSyncNode* pSyncNode, const SSyncCfg* config);
static void    syncNodeDoConfigChange(SSyncNode* pSyncNode, SSyncCfg* newConfig, SyncIndex lastConfigChangeIndex);
static bool    syncNodeIsOptimizedOneReplica(SSyncNode* ths, SRpcMsg* pMsg);

static bool    syncNodeCanChange(SSyncNode* pSyncNode);
static int32_t syncNodeLeaderTransfer(SSyncNode* pSyncNode);
static int32_t syncNodeLeaderTransferTo(SSyncNode* pSyncNode, SNodeInfo newLeader);
static int32_t syncDoLeaderTransfer(SSyncNode* ths, SRpcMsg* pRpcMsg, SSyncRaftEntry* pEntry);

static ESyncStrategy syncNodeStrategy(SSyncNode* pSyncNode);
M
Minghao Li 已提交
61

62
int64_t syncOpen(SSyncInfo* pSyncInfo) {
M
Minghao Li 已提交
63
  SSyncNode* pSyncNode = syncNodeOpen(pSyncInfo);
64
  if (pSyncNode == NULL) {
S
Shengliang Guan 已提交
65
    sError("vgId:%d, failed to open sync node", pSyncInfo->vgId);
66 67
    return -1;
  }
M
Minghao Li 已提交
68

S
Shengliang Guan 已提交
69
  pSyncNode->rid = syncNodeAdd(pSyncNode);
M
Minghao Li 已提交
70
  if (pSyncNode->rid < 0) {
71
    syncNodeClose(pSyncNode);
M
Minghao Li 已提交
72 73 74
    return -1;
  }

S
Shengliang Guan 已提交
75 76 77 78 79 80
  pSyncNode->pingBaseLine = pSyncInfo->pingMs;
  pSyncNode->pingTimerMS = pSyncInfo->pingMs;
  pSyncNode->electBaseLine = pSyncInfo->electMs;
  pSyncNode->hbBaseLine = pSyncInfo->heartbeatMs;
  pSyncNode->heartbeatTimerMS = pSyncInfo->heartbeatMs;
  pSyncNode->msgcb = pSyncInfo->msgcb;
M
Minghao Li 已提交
81
  return pSyncNode->rid;
M
Minghao Li 已提交
82
}
M
Minghao Li 已提交
83

B
Benguang Zhao 已提交
84
int32_t syncStart(int64_t rid) {
S
Shengliang Guan 已提交
85
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
M
Minghao Li 已提交
86
  if (pSyncNode == NULL) {
S
Shengliang Guan 已提交
87
    sError("failed to acquire rid:%" PRId64 " of tsNodeReftId for pSyncNode", rid);
B
Benguang Zhao 已提交
88 89 90 91
    return -1;
  }

  if (syncNodeRestore(pSyncNode) < 0) {
92
    sError("vgId:%d, failed to restore sync log buffer since %s", pSyncNode->vgId, terrstr());
93
    goto _err;
M
Minghao Li 已提交
94
  }
M
Minghao Li 已提交
95

B
Benguang Zhao 已提交
96 97 98 99
  if (syncNodeStart(pSyncNode) < 0) {
    sError("vgId:%d, failed to start sync node since %s", pSyncNode->vgId, terrstr());
    goto _err;
  }
M
Minghao Li 已提交
100

B
Benguang Zhao 已提交
101 102
  syncNodeRelease(pSyncNode);
  return 0;
M
Minghao Li 已提交
103

104 105 106
_err:
  syncNodeRelease(pSyncNode);
  return -1;
M
Minghao Li 已提交
107 108
}

M
Minghao Li 已提交
109
void syncStop(int64_t rid) {
S
Shengliang Guan 已提交
110
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
S
Shengliang Guan 已提交
111
  if (pSyncNode != NULL) {
112
    pSyncNode->isStart = false;
S
Shengliang Guan 已提交
113
    syncNodeRelease(pSyncNode);
S
Shengliang Guan 已提交
114
    syncNodeRemove(rid);
M
Minghao Li 已提交
115 116 117
  }
}

M
Minghao Li 已提交
118 119
void syncPreStop(int64_t rid) {
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
120 121 122
  if (pSyncNode != NULL) {
    syncNodePreClose(pSyncNode);
    syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
123 124 125
  }
}

126 127 128 129 130 131 132 133
void syncPostStop(int64_t rid) {
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
  if (pSyncNode != NULL) {
    syncNodePostClose(pSyncNode);
    syncNodeRelease(pSyncNode);
  }
}

S
Shengliang Guan 已提交
134 135 136
static bool syncNodeCheckNewConfig(SSyncNode* pSyncNode, const SSyncCfg* pCfg) {
  if (!syncNodeInConfig(pSyncNode, pCfg)) return false;
  return abs(pCfg->replicaNum - pSyncNode->replicaNum) <= 1;
M
Minghao Li 已提交
137 138
}

S
Shengliang Guan 已提交
139
int32_t syncReconfig(int64_t rid, SSyncCfg* pNewCfg) {
S
Shengliang Guan 已提交
140
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
S
Shengliang Guan 已提交
141
  if (pSyncNode == NULL) return -1;
M
Minghao Li 已提交
142

M
Minghao Li 已提交
143
  if (!syncNodeCheckNewConfig(pSyncNode, pNewCfg)) {
S
Shengliang Guan 已提交
144
    syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
145
    terrno = TSDB_CODE_SYN_NEW_CONFIG_ERROR;
S
Shengliang Guan 已提交
146
    sError("vgId:%d, failed to reconfig since invalid new config", pSyncNode->vgId);
M
Minghao Li 已提交
147
    return -1;
M
Minghao Li 已提交
148
  }
149

S
Shengliang Guan 已提交
150 151
  syncNodeUpdateNewConfigIndex(pSyncNode, pNewCfg);
  syncNodeDoConfigChange(pSyncNode, pNewCfg, SYNC_INDEX_INVALID);
S
Shengliang Guan 已提交
152

M
Minghao Li 已提交
153 154 155 156
  if (pSyncNode->state == TAOS_SYNC_STATE_LEADER) {
    syncNodeStopHeartbeatTimer(pSyncNode);

    for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
S
Shengliang Guan 已提交
157
      syncHbTimerInit(pSyncNode, &pSyncNode->peerHeartbeatTimerArr[i], pSyncNode->replicasId[i]);
M
Minghao Li 已提交
158 159 160
    }

    syncNodeStartHeartbeatTimer(pSyncNode);
S
Shengliang Guan 已提交
161
    // syncNodeReplicate(pSyncNode);
M
Minghao Li 已提交
162
  }
S
Shengliang Guan 已提交
163

S
Shengliang Guan 已提交
164
  syncNodeRelease(pSyncNode);
S
Shengliang Guan 已提交
165
  return 0;
M
Minghao Li 已提交
166
}
M
Minghao Li 已提交
167

S
Shengliang Guan 已提交
168 169 170 171
int32_t syncProcessMsg(int64_t rid, SRpcMsg* pMsg) {
  int32_t code = -1;
  if (!syncIsInit()) return code;

S
Shengliang Guan 已提交
172
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
S
Shengliang Guan 已提交
173 174
  if (pSyncNode == NULL) return code;

S
Shengliang Guan 已提交
175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203
  switch (pMsg->msgType) {
    case TDMT_SYNC_HEARTBEAT:
      code = syncNodeOnHeartbeat(pSyncNode, pMsg);
      break;
    case TDMT_SYNC_HEARTBEAT_REPLY:
      code = syncNodeOnHeartbeatReply(pSyncNode, pMsg);
      break;
    case TDMT_SYNC_TIMEOUT:
      code = syncNodeOnTimeout(pSyncNode, pMsg);
      break;
    case TDMT_SYNC_CLIENT_REQUEST:
      code = syncNodeOnClientRequest(pSyncNode, pMsg, NULL);
      break;
    case TDMT_SYNC_REQUEST_VOTE:
      code = syncNodeOnRequestVote(pSyncNode, pMsg);
      break;
    case TDMT_SYNC_REQUEST_VOTE_REPLY:
      code = syncNodeOnRequestVoteReply(pSyncNode, pMsg);
      break;
    case TDMT_SYNC_APPEND_ENTRIES:
      code = syncNodeOnAppendEntries(pSyncNode, pMsg);
      break;
    case TDMT_SYNC_APPEND_ENTRIES_REPLY:
      code = syncNodeOnAppendEntriesReply(pSyncNode, pMsg);
      break;
    case TDMT_SYNC_SNAPSHOT_SEND:
      code = syncNodeOnSnapshot(pSyncNode, pMsg);
      break;
    case TDMT_SYNC_SNAPSHOT_RSP:
S
Shengliang Guan 已提交
204
      code = syncNodeOnSnapshotRsp(pSyncNode, pMsg);
S
Shengliang Guan 已提交
205 206 207 208 209
      break;
    case TDMT_SYNC_LOCAL_CMD:
      code = syncNodeOnLocalCmd(pSyncNode, pMsg);
      break;
    default:
210
      terrno = TSDB_CODE_MSG_NOT_PROCESSED;
S
Shengliang Guan 已提交
211
      code = -1;
M
Minghao Li 已提交
212 213
  }

S
Shengliang Guan 已提交
214
  syncNodeRelease(pSyncNode);
215 216 217 218
  if (code != 0) {
    sDebug("vgId:%d, failed to process sync msg:%p type:%s since 0x%x", pSyncNode->vgId, pMsg, TMSG_INFO(pMsg->msgType),
           terrno);
  }
S
Shengliang Guan 已提交
219
  return code;
220 221
}

S
Shengliang Guan 已提交
222
int32_t syncLeaderTransfer(int64_t rid) {
S
Shengliang Guan 已提交
223
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
S
Shengliang Guan 已提交
224
  if (pSyncNode == NULL) return -1;
225

S
Shengliang Guan 已提交
226
  int32_t ret = syncNodeLeaderTransfer(pSyncNode);
S
Shengliang Guan 已提交
227
  syncNodeRelease(pSyncNode);
228 229 230
  return ret;
}

231
int32_t syncSendTimeoutRsp(int64_t rid, int64_t seq) {
S
Shengliang Guan 已提交
232
  SSyncNode* pNode = syncNodeAcquire(rid);
233
  if (pNode == NULL) return -1;
S
Shengliang Guan 已提交
234 235

  SRpcMsg rpcMsg = {0};
236
  int32_t ret = syncRespMgrGetAndDel(pNode->pSyncRespMgr, seq, &rpcMsg.info);
S
Shengliang Guan 已提交
237 238 239
  rpcMsg.code = TSDB_CODE_SYN_TIMEOUT;

  syncNodeRelease(pNode);
240
  if (ret == 1) {
241
    sInfo("send timeout response, seq:%" PRId64 " handle:%p ahandle:%p", seq, rpcMsg.info.handle, rpcMsg.info.ahandle);
242
    rpcSendResponse(&rpcMsg);
243 244
    return 0;
  } else {
245
    sError("no message handle to send timeout response, seq:%" PRId64, seq);
246
    return -1;
247
  }
S
Shengliang Guan 已提交
248 249
}

M
Minghao Li 已提交
250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265
SyncIndex syncMinMatchIndex(SSyncNode* pSyncNode) {
  SyncIndex minMatchIndex = SYNC_INDEX_INVALID;

  if (pSyncNode->peersNum > 0) {
    minMatchIndex = syncIndexMgrGetIndex(pSyncNode->pMatchIndex, &(pSyncNode->peersId[0]));
  }

  for (int32_t i = 1; i < pSyncNode->peersNum; ++i) {
    SyncIndex matchIndex = syncIndexMgrGetIndex(pSyncNode->pMatchIndex, &(pSyncNode->peersId[i]));
    if (matchIndex < minMatchIndex) {
      minMatchIndex = matchIndex;
    }
  }
  return minMatchIndex;
}

266
int32_t syncBeginSnapshot(int64_t rid, int64_t lastApplyIndex) {
S
Shengliang Guan 已提交
267
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
268
  if (pSyncNode == NULL) {
269
    sError("sync begin snapshot error");
270 271
    return -1;
  }
272

273 274
  int32_t code = 0;

M
Minghao Li 已提交
275
  if (syncNodeIsMnode(pSyncNode)) {
M
Minghao Li 已提交
276 277 278
    // mnode
    int64_t logRetention = SYNC_MNODE_LOG_RETENTION;

M
Minghao Li 已提交
279 280 281
    SyncIndex beginIndex = pSyncNode->pLogStore->syncLogBeginIndex(pSyncNode->pLogStore);
    SyncIndex endIndex = pSyncNode->pLogStore->syncLogEndIndex(pSyncNode->pLogStore);
    int64_t   logNum = endIndex - beginIndex;
M
Minghao Li 已提交
282 283 284
    bool      isEmpty = pSyncNode->pLogStore->syncLogIsEmpty(pSyncNode->pLogStore);

    if (isEmpty || (!isEmpty && logNum < logRetention)) {
S
Shengliang Guan 已提交
285 286
      sNTrace(pSyncNode, "new-snapshot-index:%" PRId64 ", log-num:%" PRId64 ", empty:%d, do not delete wal",
              lastApplyIndex, logNum, isEmpty);
S
Shengliang Guan 已提交
287
      syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
288 289 290
      return 0;
    }

M
Minghao Li 已提交
291 292 293
    goto _DEL_WAL;

  } else {
294 295 296 297 298 299 300 301 302 303
    SyncIndex beginIndex = pSyncNode->pLogStore->syncLogBeginIndex(pSyncNode->pLogStore);
    SyncIndex endIndex = pSyncNode->pLogStore->syncLogEndIndex(pSyncNode->pLogStore);
    bool      isEmpty = pSyncNode->pLogStore->syncLogIsEmpty(pSyncNode->pLogStore);

    if (isEmpty || !(lastApplyIndex >= beginIndex && lastApplyIndex <= endIndex)) {
      sNTrace(pSyncNode, "new-snapshot-index:%" PRId64 ", empty:%d, do not delete wal", lastApplyIndex, isEmpty);
      syncNodeRelease(pSyncNode);
      return 0;
    }

M
Minghao Li 已提交
304 305 306 307
    // vnode
    if (pSyncNode->replicaNum > 1) {
      // multi replicas

308
      lastApplyIndex = TMAX(lastApplyIndex - SYNC_VNODE_LOG_RETENTION, beginIndex - 1);
309

M
Minghao Li 已提交
310 311 312 313 314 315
      if (pSyncNode->state == TAOS_SYNC_STATE_LEADER) {
        pSyncNode->minMatchIndex = syncMinMatchIndex(pSyncNode);

        for (int32_t i = 0; i < pSyncNode->peersNum; ++i) {
          int64_t matchIndex = syncIndexMgrGetIndex(pSyncNode->pMatchIndex, &(pSyncNode->peersId[i]));
          if (lastApplyIndex > matchIndex) {
316 317 318 319
            sNTrace(pSyncNode,
                    "new-snapshot-index:%" PRId64 " is greater than match-index:%" PRId64
                    " of dnode:%d, do not delete wal",
                    lastApplyIndex, matchIndex, DID(&pSyncNode->peersId[i]));
M
Minghao Li 已提交
320

S
Shengliang Guan 已提交
321
            syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
322 323 324 325 326 327
            return 0;
          }
        }

      } else if (pSyncNode->state == TAOS_SYNC_STATE_FOLLOWER) {
        if (lastApplyIndex > pSyncNode->minMatchIndex) {
S
Shengliang Guan 已提交
328 329 330
          sNTrace(pSyncNode,
                  "new-snapshot-index:%" PRId64 " is greater than min-match-index:%" PRId64 ", do not delete wal",
                  lastApplyIndex, pSyncNode->minMatchIndex);
S
Shengliang Guan 已提交
331
          syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
332 333 334 335
          return 0;
        }

      } else if (pSyncNode->state == TAOS_SYNC_STATE_CANDIDATE) {
S
Shengliang Guan 已提交
336
        sNTrace(pSyncNode, "new-snapshot-index:%" PRId64 " candidate, do not delete wal", lastApplyIndex);
S
Shengliang Guan 已提交
337
        syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
338 339 340
        return 0;

      } else {
S
Shengliang Guan 已提交
341
        sNTrace(pSyncNode, "new-snapshot-index:%" PRId64 " unknown state, do not delete wal", lastApplyIndex);
S
Shengliang Guan 已提交
342
        syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
343 344 345 346 347 348 349 350 351
        return 0;
      }

      goto _DEL_WAL;

    } else {
      // one replica

      goto _DEL_WAL;
352 353 354
    }
  }

M
Minghao Li 已提交
355
_DEL_WAL:
356

M
Minghao Li 已提交
357
  do {
358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377
    SSyncLogStoreData* pData = pSyncNode->pLogStore->data;
    SyncIndex          snapshotVer = walGetSnapshotVer(pData->pWal);
    SyncIndex          walCommitVer = walGetCommittedVer(pData->pWal);
    SyncIndex          wallastVer = walGetLastVer(pData->pWal);
    if (lastApplyIndex <= walCommitVer) {
      SyncIndex snapshottingIndex = atomic_load_64(&pSyncNode->snapshottingIndex);

      if (snapshottingIndex == SYNC_INDEX_INVALID) {
        atomic_store_64(&pSyncNode->snapshottingIndex, lastApplyIndex);
        pSyncNode->snapshottingTime = taosGetTimestampMs();

        code = walBeginSnapshot(pData->pWal, lastApplyIndex);
        if (code == 0) {
          sNTrace(pSyncNode, "wal snapshot begin, index:%" PRId64 ", last apply index:%" PRId64,
                  pSyncNode->snapshottingIndex, lastApplyIndex);
        } else {
          sNError(pSyncNode, "wal snapshot begin error since:%s, index:%" PRId64 ", last apply index:%" PRId64,
                  terrstr(terrno), pSyncNode->snapshottingIndex, lastApplyIndex);
          atomic_store_64(&pSyncNode->snapshottingIndex, SYNC_INDEX_INVALID);
        }
378

M
Minghao Li 已提交
379
      } else {
380 381
        sNTrace(pSyncNode, "snapshotting for %" PRId64 ", do not delete wal for new-snapshot-index:%" PRId64,
                snapshottingIndex, lastApplyIndex);
M
Minghao Li 已提交
382
      }
383
    }
M
Minghao Li 已提交
384
  } while (0);
385

S
Shengliang Guan 已提交
386
  syncNodeRelease(pSyncNode);
387 388 389 390
  return code;
}

int32_t syncEndSnapshot(int64_t rid) {
S
Shengliang Guan 已提交
391
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
392
  if (pSyncNode == NULL) {
393
    sError("sync end snapshot error");
394 395 396
    return -1;
  }

397 398 399 400
  int32_t code = 0;
  if (atomic_load_64(&pSyncNode->snapshottingIndex) != SYNC_INDEX_INVALID) {
    SSyncLogStoreData* pData = pSyncNode->pLogStore->data;
    code = walEndSnapshot(pData->pWal);
M
Minghao Li 已提交
401
    if (code != 0) {
402
      sNError(pSyncNode, "wal snapshot end error since:%s", terrstr());
S
Shengliang Guan 已提交
403
      syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
404 405
      return -1;
    } else {
S
Shengliang Guan 已提交
406
      sNTrace(pSyncNode, "wal snapshot end, index:%" PRId64, atomic_load_64(&pSyncNode->snapshottingIndex));
M
Minghao Li 已提交
407 408
      atomic_store_64(&pSyncNode->snapshottingIndex, SYNC_INDEX_INVALID);
    }
409
  }
410

S
Shengliang Guan 已提交
411
  syncNodeRelease(pSyncNode);
412 413 414
  return code;
}

M
Minghao Li 已提交
415
int32_t syncStepDown(int64_t rid, SyncTerm newTerm) {
S
Shengliang Guan 已提交
416
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
M
Minghao Li 已提交
417
  if (pSyncNode == NULL) {
418
    sError("sync step down error");
M
Minghao Li 已提交
419 420 421
    return -1;
  }

M
Minghao Li 已提交
422
  syncNodeStepDown(pSyncNode, newTerm);
S
Shengliang Guan 已提交
423
  syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
424
  return 0;
M
Minghao Li 已提交
425 426
}

427
bool syncNodeIsReadyForRead(SSyncNode* pSyncNode) {
428
  if (pSyncNode == NULL) {
429
    terrno = TSDB_CODE_SYN_INTERNAL_ERROR;
430
    sError("sync ready for read error");
431 432
    return false;
  }
M
Minghao Li 已提交
433

434 435 436 437 438 439
  if (pSyncNode->state != TAOS_SYNC_STATE_LEADER) {
    terrno = TSDB_CODE_SYN_NOT_LEADER;
    return false;
  }

  if (pSyncNode->restoreFinish) {
440
    return true;
M
Minghao Li 已提交
441 442
  }

443
  bool ready = false;
444 445 446
  if (!pSyncNode->pFsm->FpApplyQueueEmptyCb(pSyncNode->pFsm)) {
    // apply queue not empty
    ready = false;
M
Minghao Li 已提交
447

448 449 450 451 452 453 454 455 456 457 458 459 460
  } else {
    if (!pSyncNode->pLogStore->syncLogIsEmpty(pSyncNode->pLogStore)) {
      SyncIndex       lastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);
      SSyncRaftEntry* pEntry = NULL;
      SLRUCache*      pCache = pSyncNode->pLogStore->pCache;
      LRUHandle*      h = taosLRUCacheLookup(pCache, &lastIndex, sizeof(lastIndex));
      int32_t         code = 0;
      if (h) {
        pEntry = (SSyncRaftEntry*)taosLRUCacheValue(pCache, h);
        code = 0;

        pSyncNode->pLogStore->cacheHit++;
        sNTrace(pSyncNode, "hit cache index:%" PRId64 ", bytes:%u, %p", lastIndex, pEntry->bytes, pEntry);
M
Minghao Li 已提交
461

462 463 464
      } else {
        pSyncNode->pLogStore->cacheMiss++;
        sNTrace(pSyncNode, "miss cache index:%" PRId64, lastIndex);
M
Minghao Li 已提交
465

466 467
        code = pSyncNode->pLogStore->syncLogGetEntry(pSyncNode->pLogStore, lastIndex, &pEntry);
      }
468

469
      if (code == 0 && pEntry != NULL) {
470
        if (pEntry->originalRpcType == TDMT_SYNC_NOOP && pEntry->term == raftStoreGetTerm(pSyncNode)) {
471
          ready = true;
472
        }
473

474 475 476 477
        if (h) {
          taosLRUCacheRelease(pCache, h, false);
        } else {
          syncEntryDestroy(pEntry);
478
        }
479 480 481 482
      }
    }
  }

483
  if (!ready) {
484
    terrno = TSDB_CODE_SYN_RESTORING;
485
  }
486

487 488 489 490 491 492 493 494 495 496 497 498
  return ready;
}

bool syncIsReadyForRead(int64_t rid) {
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
  if (pSyncNode == NULL) {
    sError("sync ready for read error");
    return false;
  }

  bool ready = syncNodeIsReadyForRead(pSyncNode);

499 500
  syncNodeRelease(pSyncNode);
  return ready;
M
Minghao Li 已提交
501
}
M
Minghao Li 已提交
502

503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524
bool syncSnapshotSending(int64_t rid) {
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
  if (pSyncNode == NULL) {
    return false;
  }

  bool b = syncNodeSnapshotSending(pSyncNode);
  syncNodeRelease(pSyncNode);
  return b;
}

bool syncSnapshotRecving(int64_t rid) {
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
  if (pSyncNode == NULL) {
    return false;
  }

  bool b = syncNodeSnapshotRecving(pSyncNode);
  syncNodeRelease(pSyncNode);
  return b;
}

M
Minghao Li 已提交
525 526
int32_t syncNodeLeaderTransfer(SSyncNode* pSyncNode) {
  if (pSyncNode->peersNum == 0) {
S
Shengliang Guan 已提交
527
    sDebug("vgId:%d, only one replica, cannot leader transfer", pSyncNode->vgId);
M
Minghao Li 已提交
528 529
    terrno = TSDB_CODE_SYN_ONE_REPLICA;
    return -1;
M
Minghao Li 已提交
530
  }
M
Minghao Li 已提交
531

532
  int32_t ret = 0;
533
  if (pSyncNode->state == TAOS_SYNC_STATE_LEADER && pSyncNode->replicaNum > 1) {
534
    SNodeInfo newLeader = (pSyncNode->peersNodeInfo)[0];
535 536 537 538 539 540 541
    if (pSyncNode->peersNum == 2) {
      SyncIndex matchIndex0 = syncIndexMgrGetIndex(pSyncNode->pMatchIndex, &(pSyncNode->peersId[0]));
      SyncIndex matchIndex1 = syncIndexMgrGetIndex(pSyncNode->pMatchIndex, &(pSyncNode->peersId[1]));
      if (matchIndex1 > matchIndex0) {
        newLeader = (pSyncNode->peersNodeInfo)[1];
      }
    }
542 543 544
    ret = syncNodeLeaderTransferTo(pSyncNode, newLeader);
  }

M
Minghao Li 已提交
545
  return ret;
M
Minghao Li 已提交
546 547
}

M
Minghao Li 已提交
548 549
int32_t syncNodeLeaderTransferTo(SSyncNode* pSyncNode, SNodeInfo newLeader) {
  if (pSyncNode->replicaNum == 1) {
S
Shengliang Guan 已提交
550
    sDebug("vgId:%d, only one replica, cannot leader transfer", pSyncNode->vgId);
M
Minghao Li 已提交
551 552
    terrno = TSDB_CODE_SYN_ONE_REPLICA;
    return -1;
M
Minghao Li 已提交
553
  }
554

S
Shengliang Guan 已提交
555
  sNTrace(pSyncNode, "begin leader transfer to %s:%u", newLeader.nodeFqdn, newLeader.nodePort);
M
Minghao Li 已提交
556

557 558 559 560
  SRpcMsg rpcMsg = {0};
  (void)syncBuildLeaderTransfer(&rpcMsg, pSyncNode->vgId);

  SyncLeaderTransfer* pMsg = rpcMsg.pCont;
561
  pMsg->newLeaderId.addr = SYNC_ADDR(&newLeader);
M
Minghao Li 已提交
562 563 564
  pMsg->newLeaderId.vgId = pSyncNode->vgId;
  pMsg->newNodeInfo = newLeader;

S
Shengliang Guan 已提交
565
  int32_t ret = syncNodePropose(pSyncNode, &rpcMsg, false, NULL);
S
Shengliang Guan 已提交
566 567
  rpcFreeCont(rpcMsg.pCont);
  return ret;
M
Minghao Li 已提交
568 569
}

570 571
SSyncState syncGetState(int64_t rid) {
  SSyncState state = {.state = TAOS_SYNC_STATE_ERROR};
M
Minghao Li 已提交
572

S
Shengliang Guan 已提交
573
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
574 575 576
  if (pSyncNode != NULL) {
    state.state = pSyncNode->state;
    state.restored = pSyncNode->restoreFinish;
577 578 579 580 581
    if (pSyncNode->vgId != 1) {
      state.canRead = syncNodeIsReadyForRead(pSyncNode);
    } else {
      state.canRead = state.restored;
    }
582
    syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
583 584
  }

585
  return state;
M
Minghao Li 已提交
586 587
}

588
SyncIndex syncNodeGetSnapshotConfigIndex(SSyncNode* pSyncNode, SyncIndex snapshotLastApplyIndex) {
589 590
  ASSERT(pSyncNode->raftCfg.configIndexCount >= 1);
  SyncIndex lastIndex = (pSyncNode->raftCfg.configIndexArr)[0];
591

592 593 594 595
  for (int32_t i = 0; i < pSyncNode->raftCfg.configIndexCount; ++i) {
    if ((pSyncNode->raftCfg.configIndexArr)[i] > lastIndex &&
        (pSyncNode->raftCfg.configIndexArr)[i] <= snapshotLastApplyIndex) {
      lastIndex = (pSyncNode->raftCfg.configIndexArr)[i];
596 597
    }
  }
S
Shengliang Guan 已提交
598
  sTrace("vgId:%d, sync get last config index, index:%" PRId64 " lcindex:%" PRId64, pSyncNode->vgId,
S
Shengliang Guan 已提交
599
         snapshotLastApplyIndex, lastIndex);
600 601 602 603

  return lastIndex;
}

604 605
void syncGetRetryEpSet(int64_t rid, SEpSet* pEpSet) {
  pEpSet->numOfEps = 0;
M
Minghao Li 已提交
606

S
Shengliang Guan 已提交
607
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
S
Shengliang Guan 已提交
608
  if (pSyncNode == NULL) return;
M
Minghao Li 已提交
609

610
  for (int32_t i = 0; i < pSyncNode->raftCfg.cfg.replicaNum; ++i) {
S
Shengliang Guan 已提交
611
    SEp* pEp = &pEpSet->eps[i];
612 613
    tstrncpy(pEp->fqdn, pSyncNode->raftCfg.cfg.nodeInfo[i].nodeFqdn, TSDB_FQDN_LEN);
    pEp->port = (pSyncNode->raftCfg.cfg.nodeInfo)[i].nodePort;
S
Shengliang Guan 已提交
614
    pEpSet->numOfEps++;
615
    sDebug("vgId:%d, sync get retry epset, index:%d %s:%d", pSyncNode->vgId, i, pEp->fqdn, pEp->port);
M
Minghao Li 已提交
616
  }
M
Minghao Li 已提交
617
  if (pEpSet->numOfEps > 0) {
618
    pEpSet->inUse = (pSyncNode->raftCfg.cfg.myIndex + 1) % pEpSet->numOfEps;
M
Minghao Li 已提交
619 620
  }

S
Shengliang Guan 已提交
621
  sInfo("vgId:%d, sync get retry epset numOfEps:%d inUse:%d", pSyncNode->vgId, pEpSet->numOfEps, pEpSet->inUse);
S
Shengliang Guan 已提交
622
  syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
623 624
}

S
Shengliang Guan 已提交
625
int32_t syncPropose(int64_t rid, SRpcMsg* pMsg, bool isWeak, int64_t* seq) {
S
Shengliang Guan 已提交
626
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
627
  if (pSyncNode == NULL) {
628
    sError("sync propose error");
M
Minghao Li 已提交
629
    return -1;
630
  }
631

S
Shengliang Guan 已提交
632
  int32_t ret = syncNodePropose(pSyncNode, pMsg, isWeak, seq);
S
Shengliang Guan 已提交
633
  syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
634 635
  return ret;
}
M
Minghao Li 已提交
636

S
Shengliang Guan 已提交
637
int32_t syncNodePropose(SSyncNode* pSyncNode, SRpcMsg* pMsg, bool isWeak, int64_t* seq) {
S
Shengliang Guan 已提交
638 639
  if (pSyncNode->state != TAOS_SYNC_STATE_LEADER) {
    terrno = TSDB_CODE_SYN_NOT_LEADER;
S
Shengliang Guan 已提交
640
    sNError(pSyncNode, "sync propose not leader, type:%s", TMSG_INFO(pMsg->msgType));
S
Shengliang Guan 已提交
641 642
    return -1;
  }
643

S
Shengliang Guan 已提交
644 645 646 647 648 649 650
  // not restored, vnode enable
  if (!pSyncNode->restoreFinish && pSyncNode->vgId != 1) {
    terrno = TSDB_CODE_SYN_PROPOSE_NOT_READY;
    sNError(pSyncNode, "failed to sync propose since not ready, type:%s, last:%" PRId64 ", cmt:%" PRId64,
            TMSG_INFO(pMsg->msgType), syncNodeGetLastIndex(pSyncNode), pSyncNode->commitIndex);
    return -1;
  }
651

652
  // heartbeat timeout
653
  if (syncNodeHeartbeatReplyTimeout(pSyncNode)) {
654 655 656 657 658 659
    terrno = TSDB_CODE_SYN_PROPOSE_NOT_READY;
    sNError(pSyncNode, "failed to sync propose since hearbeat timeout, type:%s, last:%" PRId64 ", cmt:%" PRId64,
            TMSG_INFO(pMsg->msgType), syncNodeGetLastIndex(pSyncNode), pSyncNode->commitIndex);
    return -1;
  }

S
Shengliang Guan 已提交
660 661 662
  // optimized one replica
  if (syncNodeIsOptimizedOneReplica(pSyncNode, pMsg)) {
    SyncIndex retIndex;
663
    int32_t   code = syncNodeOnClientRequest(pSyncNode, pMsg, &retIndex);
S
Shengliang Guan 已提交
664 665
    if (code == 0) {
      pMsg->info.conn.applyIndex = retIndex;
666
      pMsg->info.conn.applyTerm = raftStoreGetTerm(pSyncNode);
667 668 669
      sTrace("vgId:%d, propose optimized msg, index:%" PRId64 " type:%s", pSyncNode->vgId, retIndex,
             TMSG_INFO(pMsg->msgType));
      return 1;
M
Minghao Li 已提交
670
    } else {
S
Shengliang Guan 已提交
671
      terrno = TSDB_CODE_SYN_INTERNAL_ERROR;
672
      sError("vgId:%d, failed to propose optimized msg, index:%" PRId64 " type:%s", pSyncNode->vgId, retIndex,
S
Shengliang Guan 已提交
673
             TMSG_INFO(pMsg->msgType));
674
      return -1;
675
    }
S
Shengliang Guan 已提交
676
  } else {
S
Shengliang Guan 已提交
677 678
    SRespStub stub = {.createTime = taosGetTimestampMs(), .rpcMsg = *pMsg};
    uint64_t  seqNum = syncRespMgrAdd(pSyncNode->pSyncRespMgr, &stub);
679
    SRpcMsg   rpcMsg = {0};
S
Shengliang Guan 已提交
680
    int32_t   code = syncBuildClientRequest(&rpcMsg, pMsg, seqNum, isWeak, pSyncNode->vgId);
681 682 683 684
    if (code != 0) {
      sError("vgId:%d, failed to propose msg while serialize since %s", pSyncNode->vgId, terrstr());
      (void)syncRespMgrDel(pSyncNode->pSyncRespMgr, seqNum);
      return -1;
M
Minghao Li 已提交
685
    }
686

687 688 689 690 691
    sNTrace(pSyncNode, "propose msg, type:%s", TMSG_INFO(pMsg->msgType));
    code = (*pSyncNode->syncEqMsg)(pSyncNode->msgcb, &rpcMsg);
    if (code != 0) {
      sError("vgId:%d, failed to propose msg while enqueue since %s", pSyncNode->vgId, terrstr());
      (void)syncRespMgrDel(pSyncNode->pSyncRespMgr, seqNum);
M
Minghao Li 已提交
692
    }
M
Minghao Li 已提交
693

S
Shengliang Guan 已提交
694
    if (seq != NULL) *seq = seqNum;
695
    return code;
M
Minghao Li 已提交
696
  }
M
Minghao Li 已提交
697 698
}

S
Shengliang Guan 已提交
699
static int32_t syncHbTimerInit(SSyncNode* pSyncNode, SSyncTimer* pSyncTimer, SRaftId destId) {
700 701 702 703 704
  pSyncTimer->pTimer = NULL;
  pSyncTimer->counter = 0;
  pSyncTimer->timerMS = pSyncNode->hbBaseLine;
  pSyncTimer->timerCb = syncNodeEqPeerHeartbeatTimer;
  pSyncTimer->destId = destId;
M
Minghao Li 已提交
705
  pSyncTimer->timeStamp = taosGetTimestampMs();
706 707 708 709
  atomic_store_64(&pSyncTimer->logicClock, 0);
  return 0;
}

S
Shengliang Guan 已提交
710
static int32_t syncHbTimerStart(SSyncNode* pSyncNode, SSyncTimer* pSyncTimer) {
711
  int32_t ret = 0;
S
Shengliang Guan 已提交
712
  int64_t tsNow = taosGetTimestampMs();
S
Shengliang Guan 已提交
713
  if (syncIsInit()) {
714 715 716 717 718 719
    SSyncHbTimerData* pData = syncHbTimerDataAcquire(pSyncTimer->hbDataRid);
    if (pData == NULL) {
      pData = taosMemoryMalloc(sizeof(SSyncHbTimerData));
      pData->rid = syncHbTimerDataAdd(pData);
    }
    pSyncTimer->hbDataRid = pData->rid;
S
Shengliang Guan 已提交
720
    pSyncTimer->timeStamp = tsNow;
721 722

    pData->syncNodeRid = pSyncNode->rid;
723 724 725
    pData->pTimer = pSyncTimer;
    pData->destId = pSyncTimer->destId;
    pData->logicClock = pSyncTimer->logicClock;
S
Shengliang Guan 已提交
726
    pData->execTime = tsNow + pSyncTimer->timerMS;
M
Minghao Li 已提交
727

728 729
    taosTmrReset(pSyncTimer->timerCb, pSyncTimer->timerMS / HEARTBEAT_TICK_NUM, (void*)(pData->rid),
                 syncEnv()->pTimerManager, &pSyncTimer->pTimer);
730 731 732 733 734 735
  } else {
    sError("vgId:%d, start ctrl hb timer error, sync env is stop", pSyncNode->vgId);
  }
  return ret;
}

S
Shengliang Guan 已提交
736
static int32_t syncHbTimerStop(SSyncNode* pSyncNode, SSyncTimer* pSyncTimer) {
737 738 739 740
  int32_t ret = 0;
  atomic_add_fetch_64(&pSyncTimer->logicClock, 1);
  taosTmrStop(pSyncTimer->pTimer);
  pSyncTimer->pTimer = NULL;
741 742
  syncHbTimerDataRemove(pSyncTimer->hbDataRid);
  pSyncTimer->hbDataRid = -1;
743 744 745
  return ret;
}

746
int32_t syncNodeLogStoreRestoreOnNeed(SSyncNode* pNode) {
S
Shengliang Guan 已提交
747 748 749
  ASSERTS(pNode->pLogStore != NULL, "log store not created");
  ASSERTS(pNode->pFsm != NULL, "pFsm not registered");
  ASSERTS(pNode->pFsm->FpGetSnapshotInfo != NULL, "FpGetSnapshotInfo not registered");
750 751 752
  SSnapshot snapshot = {0};
  pNode->pFsm->FpGetSnapshotInfo(pNode->pFsm, &snapshot);

753 754 755 756 757
  SyncIndex commitIndex = snapshot.lastApplyIndex;
  SyncIndex firstVer = pNode->pLogStore->syncLogBeginIndex(pNode->pLogStore);
  SyncIndex lastVer = pNode->pLogStore->syncLogLastIndex(pNode->pLogStore);
  if (lastVer < commitIndex || firstVer > commitIndex + 1) {
    if (pNode->pLogStore->syncLogRestoreFromSnapshot(pNode->pLogStore, commitIndex)) {
S
Shengliang Guan 已提交
758
      sError("vgId:%d, failed to restore log store from snapshot since %s. lastVer:%" PRId64 ", snapshotVer:%" PRId64,
759 760 761 762 763 764 765
             pNode->vgId, terrstr(), lastVer, commitIndex);
      return -1;
    }
  }
  return 0;
}

M
Minghao Li 已提交
766
// open/close --------------
S
Shengliang Guan 已提交
767 768
SSyncNode* syncNodeOpen(SSyncInfo* pSyncInfo) {
  SSyncNode* pSyncNode = taosMemoryCalloc(1, sizeof(SSyncNode));
769 770 771 772
  if (pSyncNode == NULL) {
    terrno = TSDB_CODE_OUT_OF_MEMORY;
    goto _error;
  }
M
Minghao Li 已提交
773

M
Minghao Li 已提交
774 775 776 777
  if (!taosDirExist((char*)(pSyncInfo->path))) {
    if (taosMkDir(pSyncInfo->path) != 0) {
      terrno = TAOS_SYSTEM_ERROR(errno);
      sError("failed to create dir:%s since %s", pSyncInfo->path, terrstr());
778
      goto _error;
M
Minghao Li 已提交
779
    }
780
  }
M
Minghao Li 已提交
781

782 783 784
  memcpy(pSyncNode->path, pSyncInfo->path, sizeof(pSyncNode->path));
  snprintf(pSyncNode->raftStorePath, sizeof(pSyncNode->raftStorePath), "%s%sraft_store.json", pSyncInfo->path,
           TD_DIRSEP);
S
Shengliang Guan 已提交
785
  snprintf(pSyncNode->configPath, sizeof(pSyncNode->configPath), "%s%sraft_config.json", pSyncInfo->path, TD_DIRSEP);
786

787
  if (!taosCheckExistFile(pSyncNode->configPath)) {
M
Minghao Li 已提交
788
    // create a new raft config file
789
    sInfo("vgId:%d, create a new raft config file", pSyncNode->vgId);
790 791 792 793 794 795 796 797 798 799
    pSyncNode->raftCfg.isStandBy = pSyncInfo->isStandBy;
    pSyncNode->raftCfg.snapshotStrategy = pSyncInfo->snapshotStrategy;
    pSyncNode->raftCfg.lastConfigIndex = SYNC_INDEX_INVALID;
    pSyncNode->raftCfg.batchSize = pSyncInfo->batchSize;
    pSyncNode->raftCfg.cfg = pSyncInfo->syncCfg;
    pSyncNode->raftCfg.configIndexCount = 1;
    pSyncNode->raftCfg.configIndexArr[0] = -1;

    if (syncWriteCfgFile(pSyncNode) != 0) {
      sError("vgId:%d, failed to create sync cfg file", pSyncNode->vgId);
H
Hongze Cheng 已提交
800
      goto _error;
801
    }
802 803
  } else {
    // update syncCfg by raft_config.json
804 805
    if (syncReadCfgFile(pSyncNode) != 0) {
      sError("vgId:%d, failed to read sync cfg file", pSyncNode->vgId);
H
Hongze Cheng 已提交
806
      goto _error;
807
    }
S
Shengliang Guan 已提交
808

809
    if (pSyncInfo->syncCfg.replicaNum > 0 && syncIsConfigChanged(&pSyncNode->raftCfg.cfg, &pSyncInfo->syncCfg)) {
S
Shengliang Guan 已提交
810
      sInfo("vgId:%d, use sync config from input options and write to cfg file", pSyncNode->vgId);
811 812 813
      pSyncNode->raftCfg.cfg = pSyncInfo->syncCfg;
      if (syncWriteCfgFile(pSyncNode) != 0) {
        sError("vgId:%d, failed to write sync cfg file", pSyncNode->vgId);
S
Shengliang Guan 已提交
814 815
        goto _error;
      }
S
Shengliang Guan 已提交
816
    } else {
817 818
      sInfo("vgId:%d, use sync config from sync cfg file", pSyncNode->vgId);
      pSyncInfo->syncCfg = pSyncNode->raftCfg.cfg;
S
Shengliang Guan 已提交
819
    }
M
Minghao Li 已提交
820 821
  }

M
Minghao Li 已提交
822
  // init by SSyncInfo
M
Minghao Li 已提交
823
  pSyncNode->vgId = pSyncInfo->vgId;
S
Shengliang Guan 已提交
824
  SSyncCfg* pCfg = &pSyncNode->raftCfg.cfg;
S
Shengliang Guan 已提交
825
  sInfo("vgId:%d, start to open sync node, replica:%d selfIndex:%d", pSyncNode->vgId, pCfg->replicaNum, pCfg->myIndex);
S
Shengliang Guan 已提交
826 827
  for (int32_t i = 0; i < pCfg->replicaNum; ++i) {
    SNodeInfo* pNode = &pCfg->nodeInfo[i];
828
    tmsgUpdateDnodeInfo(&pNode->nodeId, &pNode->clusterId, pNode->nodeFqdn, &pNode->nodePort);
829 830
    sInfo("vgId:%d, index:%d ep:%s:%u dnode:%d cluster:%" PRId64, pSyncNode->vgId, i, pNode->nodeFqdn, pNode->nodePort,
          pNode->nodeId, pNode->clusterId);
S
Shengliang Guan 已提交
831 832
  }

M
Minghao Li 已提交
833
  pSyncNode->pWal = pSyncInfo->pWal;
S
Shengliang Guan 已提交
834
  pSyncNode->msgcb = pSyncInfo->msgcb;
S
Shengliang Guan 已提交
835 836 837
  pSyncNode->syncSendMSg = pSyncInfo->syncSendMSg;
  pSyncNode->syncEqMsg = pSyncInfo->syncEqMsg;
  pSyncNode->syncEqCtrlMsg = pSyncInfo->syncEqCtrlMsg;
M
Minghao Li 已提交
838

B
Benguang Zhao 已提交
839 840 841
  // create raft log ring buffer
  pSyncNode->pLogBuf = syncLogBufferCreate();
  if (pSyncNode->pLogBuf == NULL) {
842
    sError("failed to init sync log buffer since %s. vgId:%d", terrstr(), pSyncNode->vgId);
B
Benguang Zhao 已提交
843 844 845
    goto _error;
  }

M
Minghao Li 已提交
846
  // init internal
847
  pSyncNode->myNodeInfo = pSyncNode->raftCfg.cfg.nodeInfo[pSyncNode->raftCfg.cfg.myIndex];
848
  if (!syncUtilNodeInfo2RaftId(&pSyncNode->myNodeInfo, pSyncNode->vgId, &pSyncNode->myRaftId)) {
S
Shengliang Guan 已提交
849
    sError("vgId:%d, failed to determine my raft member id", pSyncNode->vgId);
H
Hongze Cheng 已提交
850
    goto _error;
851
  }
M
Minghao Li 已提交
852

M
Minghao Li 已提交
853
  // init peersNum, peers, peersId
854
  pSyncNode->peersNum = pSyncNode->raftCfg.cfg.replicaNum - 1;
S
Shengliang Guan 已提交
855
  int32_t j = 0;
856 857 858 859
  for (int32_t i = 0; i < pSyncNode->raftCfg.cfg.replicaNum; ++i) {
    if (i != pSyncNode->raftCfg.cfg.myIndex) {
      pSyncNode->peersNodeInfo[j] = pSyncNode->raftCfg.cfg.nodeInfo[i];
      syncUtilNodeInfo2EpSet(&pSyncNode->peersNodeInfo[j], &pSyncNode->peersEpset[j]);
M
Minghao Li 已提交
860 861 862
      j++;
    }
  }
S
Shengliang Guan 已提交
863
  for (int32_t i = 0; i < pSyncNode->peersNum; ++i) {
864
    if (!syncUtilNodeInfo2RaftId(&pSyncNode->peersNodeInfo[i], pSyncNode->vgId, &pSyncNode->peersId[i])) {
S
Shengliang Guan 已提交
865
      sError("vgId:%d, failed to determine raft member id, peer:%d", pSyncNode->vgId, i);
H
Hongze Cheng 已提交
866
      goto _error;
867
    }
M
Minghao Li 已提交
868
  }
M
Minghao Li 已提交
869

M
Minghao Li 已提交
870
  // init replicaNum, replicasId
871 872 873
  pSyncNode->replicaNum = pSyncNode->raftCfg.cfg.replicaNum;
  for (int32_t i = 0; i < pSyncNode->raftCfg.cfg.replicaNum; ++i) {
    if (!syncUtilNodeInfo2RaftId(&pSyncNode->raftCfg.cfg.nodeInfo[i], pSyncNode->vgId, &pSyncNode->replicasId[i])) {
S
Shengliang Guan 已提交
874
      sError("vgId:%d, failed to determine raft member id, replica:%d", pSyncNode->vgId, i);
H
Hongze Cheng 已提交
875
      goto _error;
876
    }
M
Minghao Li 已提交
877 878
  }

M
Minghao Li 已提交
879
  // init raft algorithm
M
Minghao Li 已提交
880
  pSyncNode->pFsm = pSyncInfo->pFsm;
881
  pSyncInfo->pFsm = NULL;
882
  pSyncNode->quorum = syncUtilQuorum(pSyncNode->raftCfg.cfg.replicaNum);
M
Minghao Li 已提交
883 884
  pSyncNode->leaderCache = EMPTY_RAFT_ID;

M
Minghao Li 已提交
885
  // init life cycle outside
M
Minghao Li 已提交
886

M
Minghao Li 已提交
887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910
  // TLA+ Spec
  // InitHistoryVars == /\ elections = {}
  //                    /\ allLogs   = {}
  //                    /\ voterLog  = [i \in Server |-> [j \in {} |-> <<>>]]
  // InitServerVars == /\ currentTerm = [i \in Server |-> 1]
  //                   /\ state       = [i \in Server |-> Follower]
  //                   /\ votedFor    = [i \in Server |-> Nil]
  // InitCandidateVars == /\ votesResponded = [i \in Server |-> {}]
  //                      /\ votesGranted   = [i \in Server |-> {}]
  // \* The values nextIndex[i][i] and matchIndex[i][i] are never read, since the
  // \* leader does not send itself messages. It's still easier to include these
  // \* in the functions.
  // InitLeaderVars == /\ nextIndex  = [i \in Server |-> [j \in Server |-> 1]]
  //                   /\ matchIndex = [i \in Server |-> [j \in Server |-> 0]]
  // InitLogVars == /\ log          = [i \in Server |-> << >>]
  //                /\ commitIndex  = [i \in Server |-> 0]
  // Init == /\ messages = [m \in {} |-> 0]
  //         /\ InitHistoryVars
  //         /\ InitServerVars
  //         /\ InitCandidateVars
  //         /\ InitLeaderVars
  //         /\ InitLogVars
  //

M
Minghao Li 已提交
911
  // init TLA+ server vars
M
syncInt  
Minghao Li 已提交
912
  pSyncNode->state = TAOS_SYNC_STATE_FOLLOWER;
913
  if (raftStoreOpen(pSyncNode) != 0) {
S
Shengliang Guan 已提交
914
    sError("vgId:%d, failed to open raft store at path %s", pSyncNode->vgId, pSyncNode->raftStorePath);
915 916
    goto _error;
  }
M
Minghao Li 已提交
917

M
Minghao Li 已提交
918
  // init TLA+ candidate vars
M
Minghao Li 已提交
919
  pSyncNode->pVotesGranted = voteGrantedCreate(pSyncNode);
920
  if (pSyncNode->pVotesGranted == NULL) {
S
Shengliang Guan 已提交
921
    sError("vgId:%d, failed to create VotesGranted", pSyncNode->vgId);
922 923
    goto _error;
  }
M
Minghao Li 已提交
924
  pSyncNode->pVotesRespond = votesRespondCreate(pSyncNode);
925
  if (pSyncNode->pVotesRespond == NULL) {
S
Shengliang Guan 已提交
926
    sError("vgId:%d, failed to create VotesRespond", pSyncNode->vgId);
927 928
    goto _error;
  }
M
Minghao Li 已提交
929

M
Minghao Li 已提交
930 931
  // init TLA+ leader vars
  pSyncNode->pNextIndex = syncIndexMgrCreate(pSyncNode);
932
  if (pSyncNode->pNextIndex == NULL) {
S
Shengliang Guan 已提交
933
    sError("vgId:%d, failed to create SyncIndexMgr", pSyncNode->vgId);
934 935
    goto _error;
  }
M
Minghao Li 已提交
936
  pSyncNode->pMatchIndex = syncIndexMgrCreate(pSyncNode);
937
  if (pSyncNode->pMatchIndex == NULL) {
S
Shengliang Guan 已提交
938
    sError("vgId:%d, failed to create SyncIndexMgr", pSyncNode->vgId);
939 940
    goto _error;
  }
M
Minghao Li 已提交
941 942 943

  // init TLA+ log vars
  pSyncNode->pLogStore = logStoreCreate(pSyncNode);
944
  if (pSyncNode->pLogStore == NULL) {
S
Shengliang Guan 已提交
945
    sError("vgId:%d, failed to create SyncLogStore", pSyncNode->vgId);
946 947
    goto _error;
  }
948 949 950 951

  SyncIndex commitIndex = SYNC_INDEX_INVALID;
  if (pSyncNode->pFsm != NULL && pSyncNode->pFsm->FpGetSnapshotInfo != NULL) {
    SSnapshot snapshot = {0};
952
    pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot);
953 954
    if (snapshot.lastApplyIndex > commitIndex) {
      commitIndex = snapshot.lastApplyIndex;
S
Shengliang Guan 已提交
955
      sNTrace(pSyncNode, "reset commit index by snapshot");
956 957 958
    }
  }
  pSyncNode->commitIndex = commitIndex;
959
  sInfo("vgId:%d, sync node commitIndex initialized as %" PRId64, pSyncNode->vgId, pSyncNode->commitIndex);
M
Minghao Li 已提交
960

961
  // restore log store on need
962
  if (syncNodeLogStoreRestoreOnNeed(pSyncNode) < 0) {
963
    sError("vgId:%d, failed to restore log store since %s.", pSyncNode->vgId, terrstr());
964 965
    goto _error;
  }
966

M
Minghao Li 已提交
967 968
  // timer ms init
  pSyncNode->pingBaseLine = PING_TIMER_MS;
969 970
  pSyncNode->electBaseLine = tsElectInterval;
  pSyncNode->hbBaseLine = tsHeartbeatInterval;
M
Minghao Li 已提交
971

M
Minghao Li 已提交
972
  // init ping timer
M
Minghao Li 已提交
973
  pSyncNode->pPingTimer = NULL;
M
Minghao Li 已提交
974
  pSyncNode->pingTimerMS = pSyncNode->pingBaseLine;
M
Minghao Li 已提交
975 976
  atomic_store_64(&pSyncNode->pingTimerLogicClock, 0);
  atomic_store_64(&pSyncNode->pingTimerLogicClockUser, 0);
M
Minghao Li 已提交
977
  pSyncNode->FpPingTimerCB = syncNodeEqPingTimer;
M
Minghao Li 已提交
978
  pSyncNode->pingTimerCounter = 0;
M
Minghao Li 已提交
979

M
Minghao Li 已提交
980 981
  // init elect timer
  pSyncNode->pElectTimer = NULL;
M
Minghao Li 已提交
982
  pSyncNode->electTimerMS = syncUtilElectRandomMS(pSyncNode->electBaseLine, 2 * pSyncNode->electBaseLine);
M
Minghao Li 已提交
983
  atomic_store_64(&pSyncNode->electTimerLogicClock, 0);
M
Minghao Li 已提交
984
  pSyncNode->FpElectTimerCB = syncNodeEqElectTimer;
M
Minghao Li 已提交
985 986 987 988
  pSyncNode->electTimerCounter = 0;

  // init heartbeat timer
  pSyncNode->pHeartbeatTimer = NULL;
M
Minghao Li 已提交
989
  pSyncNode->heartbeatTimerMS = pSyncNode->hbBaseLine;
M
Minghao Li 已提交
990 991
  atomic_store_64(&pSyncNode->heartbeatTimerLogicClock, 0);
  atomic_store_64(&pSyncNode->heartbeatTimerLogicClockUser, 0);
M
Minghao Li 已提交
992
  pSyncNode->FpHeartbeatTimerCB = syncNodeEqHeartbeatTimer;
M
Minghao Li 已提交
993 994
  pSyncNode->heartbeatTimerCounter = 0;

995 996 997 998 999
  // init peer heartbeat timer
  for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
    syncHbTimerInit(pSyncNode, &(pSyncNode->peerHeartbeatTimerArr[i]), (pSyncNode->replicasId)[i]);
  }

M
Minghao Li 已提交
1000
  // tools
M
Minghao Li 已提交
1001
  pSyncNode->pSyncRespMgr = syncRespMgrCreate(pSyncNode, SYNC_RESP_TTL_MS);
1002
  if (pSyncNode->pSyncRespMgr == NULL) {
S
Shengliang Guan 已提交
1003
    sError("vgId:%d, failed to create SyncRespMgr", pSyncNode->vgId);
1004 1005
    goto _error;
  }
M
Minghao Li 已提交
1006

1007 1008
  // restore state
  pSyncNode->restoreFinish = false;
1009

M
Minghao Li 已提交
1010
  // snapshot senders
S
Shengliang Guan 已提交
1011
  for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
M
Minghao Li 已提交
1012
    SSyncSnapshotSender* pSender = snapshotSenderCreate(pSyncNode, i);
1013 1014 1015 1016
    if (pSender == NULL) return NULL;

    pSyncNode->senders[i] = pSender;
    sSDebug(pSender, "snapshot sender create while open sync node, data:%p", pSender);
M
Minghao Li 已提交
1017 1018 1019
  }

  // snapshot receivers
1020
  pSyncNode->pNewNodeReceiver = snapshotReceiverCreate(pSyncNode, EMPTY_RAFT_ID);
1021 1022 1023
  if (pSyncNode->pNewNodeReceiver == NULL) return NULL;
  sRDebug(pSyncNode->pNewNodeReceiver, "snapshot receiver create while open sync node, data:%p",
          pSyncNode->pNewNodeReceiver);
M
Minghao Li 已提交
1024

M
Minghao Li 已提交
1025 1026 1027
  // is config changing
  pSyncNode->changing = false;

B
Benguang Zhao 已提交
1028
  // replication mgr
1029 1030 1031 1032
  if (syncNodeLogReplMgrInit(pSyncNode) < 0) {
    sError("vgId:%d, failed to init repl mgr since %s.", pSyncNode->vgId, terrstr());
    goto _error;
  }
B
Benguang Zhao 已提交
1033

M
Minghao Li 已提交
1034
  // peer state
1035 1036 1037 1038
  if (syncNodePeerStateInit(pSyncNode) < 0) {
    sError("vgId:%d, failed to init peer stat since %s.", pSyncNode->vgId, terrstr());
    goto _error;
  }
M
Minghao Li 已提交
1039

B
Benguang Zhao 已提交
1040
  //
M
Minghao Li 已提交
1041 1042 1043
  // min match index
  pSyncNode->minMatchIndex = SYNC_INDEX_INVALID;

M
Minghao Li 已提交
1044
  // start in syncNodeStart
M
Minghao Li 已提交
1045
  // start raft
M
Minghao Li 已提交
1046
  // syncNodeBecomeFollower(pSyncNode);
M
Minghao Li 已提交
1047

M
Minghao Li 已提交
1048 1049
  int64_t timeNow = taosGetTimestampMs();
  pSyncNode->startTime = timeNow;
1050
  pSyncNode->leaderTime = timeNow;
M
Minghao Li 已提交
1051 1052
  pSyncNode->lastReplicateTime = timeNow;

1053 1054 1055
  // snapshotting
  atomic_store_64(&pSyncNode->snapshottingIndex, SYNC_INDEX_INVALID);

B
Benguang Zhao 已提交
1056 1057
  // init log buffer
  if (syncLogBufferInit(pSyncNode->pLogBuf, pSyncNode) < 0) {
1058
    sError("vgId:%d, failed to init sync log buffer since %s", pSyncNode->vgId, terrstr());
1059
    goto _error;
B
Benguang Zhao 已提交
1060 1061
  }

1062
  pSyncNode->isStart = true;
1063 1064 1065
  pSyncNode->electNum = 0;
  pSyncNode->becomeLeaderNum = 0;
  pSyncNode->configChangeNum = 0;
1066 1067
  pSyncNode->hbSlowNum = 0;
  pSyncNode->hbrSlowNum = 0;
M
Minghao Li 已提交
1068
  pSyncNode->tmrRoutineNum = 0;
1069

1070 1071
  sNInfo(pSyncNode, "sync open, node:%p electInterval:%d heartbeatInterval:%d heartbeatTimeout:%d", pSyncNode,
         tsElectInterval, tsHeartbeatInterval, tsHeartbeatTimeout);
M
Minghao Li 已提交
1072
  return pSyncNode;
1073 1074 1075

_error:
  if (pSyncInfo->pFsm) {
H
Hongze Cheng 已提交
1076 1077
    taosMemoryFree(pSyncInfo->pFsm);
    pSyncInfo->pFsm = NULL;
1078 1079 1080 1081
  }
  syncNodeClose(pSyncNode);
  pSyncNode = NULL;
  return NULL;
M
Minghao Li 已提交
1082 1083
}

M
Minghao Li 已提交
1084 1085
void syncNodeMaybeUpdateCommitBySnapshot(SSyncNode* pSyncNode) {
  if (pSyncNode->pFsm != NULL && pSyncNode->pFsm->FpGetSnapshotInfo != NULL) {
1086 1087
    SSnapshot snapshot = {0};
    pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot);
M
Minghao Li 已提交
1088 1089 1090 1091 1092 1093
    if (snapshot.lastApplyIndex > pSyncNode->commitIndex) {
      pSyncNode->commitIndex = snapshot.lastApplyIndex;
    }
  }
}

B
Benguang Zhao 已提交
1094
int32_t syncNodeRestore(SSyncNode* pSyncNode) {
S
Shengliang Guan 已提交
1095 1096
  ASSERTS(pSyncNode->pLogStore != NULL, "log store not created");
  ASSERTS(pSyncNode->pLogBuf != NULL, "ring log buffer not created");
B
Benguang Zhao 已提交
1097 1098 1099 1100

  SyncIndex lastVer = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);
  SyncIndex commitIndex = pSyncNode->pLogStore->syncLogCommitIndex(pSyncNode->pLogStore);
  SyncIndex endIndex = pSyncNode->pLogBuf->endIndex;
1101 1102
  if (lastVer != -1 && endIndex != lastVer + 1) {
    terrno = TSDB_CODE_WAL_LOG_INCOMPLETE;
S
Shengliang Guan 已提交
1103
    sError("vgId:%d, failed to restore sync node since %s. expected lastLogIndex:%" PRId64 ", lastVer:%" PRId64 "",
1104 1105 1106
           pSyncNode->vgId, terrstr(), endIndex - 1, lastVer);
    return -1;
  }
B
Benguang Zhao 已提交
1107

1108
  ASSERT(endIndex == lastVer + 1);
1109 1110
  pSyncNode->commitIndex = TMAX(pSyncNode->commitIndex, commitIndex);
  sInfo("vgId:%d, restore sync until commitIndex:%" PRId64, pSyncNode->vgId, pSyncNode->commitIndex);
B
Benguang Zhao 已提交
1111

1112
  if (syncLogBufferCommit(pSyncNode->pLogBuf, pSyncNode, pSyncNode->commitIndex) < 0) {
B
Benguang Zhao 已提交
1113 1114 1115 1116 1117 1118 1119 1120 1121
    return -1;
  }

  return 0;
}

int32_t syncNodeStart(SSyncNode* pSyncNode) {
  // start raft
  if (pSyncNode->replicaNum == 1) {
S
Shengliang Guan 已提交
1122
    raftStoreNextTerm(pSyncNode);
B
Benguang Zhao 已提交
1123 1124 1125 1126 1127 1128 1129 1130 1131 1132
    syncNodeBecomeLeader(pSyncNode, "one replica start");

    // Raft 3.6.2 Committing entries from previous terms
    syncNodeAppendNoop(pSyncNode);
  } else {
    syncNodeBecomeFollower(pSyncNode, "first start");
  }

  int32_t ret = 0;
  ret = syncNodeStartPingTimer(pSyncNode);
1133 1134
  if (ret != 0) {
    sError("vgId:%d, failed to start ping timer since %s", pSyncNode->vgId, terrstr());
1135
  }
1136
  return ret;
M
Minghao Li 已提交
1137 1138
}

B
Benguang Zhao 已提交
1139
int32_t syncNodeStartStandBy(SSyncNode* pSyncNode) {
M
Minghao Li 已提交
1140 1141 1142 1143 1144 1145 1146
  // state change
  pSyncNode->state = TAOS_SYNC_STATE_FOLLOWER;
  syncNodeStopHeartbeatTimer(pSyncNode);

  // reset elect timer, long enough
  int32_t electMS = TIMER_MAX_MS;
  int32_t ret = syncNodeRestartElectTimer(pSyncNode, electMS);
1147 1148 1149 1150
  if (ret < 0) {
    sError("vgId:%d, failed to restart elect timer since %s", pSyncNode->vgId, terrstr());
    return -1;
  }
1151

1152
  ret = syncNodeStartPingTimer(pSyncNode);
1153 1154 1155 1156
  if (ret < 0) {
    sError("vgId:%d, failed to start ping timer since %s", pSyncNode->vgId, terrstr());
    return -1;
  }
B
Benguang Zhao 已提交
1157
  return ret;
M
Minghao Li 已提交
1158 1159
}

M
Minghao Li 已提交
1160
void syncNodePreClose(SSyncNode* pSyncNode) {
1161 1162 1163 1164
  if (pSyncNode != NULL && pSyncNode->pFsm != NULL && pSyncNode->pFsm->FpApplyQueueItems != NULL) {
    while (1) {
      int32_t aqItems = pSyncNode->pFsm->FpApplyQueueItems(pSyncNode->pFsm);
      sTrace("vgId:%d, pre close, %d items in apply queue", pSyncNode->vgId, aqItems);
1165
      if (aqItems == 0 || aqItems == -1) {
1166 1167 1168 1169 1170 1171
        break;
      }
      taosMsleep(20);
    }
  }

1172
#if 0
1173 1174
  if (pSyncNode->pNewNodeReceiver != NULL) {
    if (snapshotReceiverIsStart(pSyncNode->pNewNodeReceiver)) {
S
Shengliang Guan 已提交
1175
      snapshotReceiverStop(pSyncNode->pNewNodeReceiver);
1176 1177
    }

1178 1179
    sDebug("vgId:%d, snapshot receiver destroy while preclose sync node, data:%p", pSyncNode->vgId,
           pSyncNode->pNewNodeReceiver);
1180 1181 1182
    snapshotReceiverDestroy(pSyncNode->pNewNodeReceiver);
    pSyncNode->pNewNodeReceiver = NULL;
  }
1183
#endif
1184

M
Minghao Li 已提交
1185 1186 1187 1188 1189
  // stop elect timer
  syncNodeStopElectTimer(pSyncNode);

  // stop heartbeat timer
  syncNodeStopHeartbeatTimer(pSyncNode);
1190 1191 1192

  // clean rsp
  syncRespCleanRsp(pSyncNode->pSyncRespMgr);
M
Minghao Li 已提交
1193 1194
}

1195 1196 1197
void syncNodePostClose(SSyncNode* pSyncNode) {
  if (pSyncNode->pNewNodeReceiver != NULL) {
    if (snapshotReceiverIsStart(pSyncNode->pNewNodeReceiver)) {
S
Shengliang Guan 已提交
1198
      snapshotReceiverStop(pSyncNode->pNewNodeReceiver);
1199 1200 1201 1202 1203 1204 1205
    }

    sDebug("vgId:%d, snapshot receiver destroy while preclose sync node, data:%p", pSyncNode->vgId,
           pSyncNode->pNewNodeReceiver);
    snapshotReceiverDestroy(pSyncNode->pNewNodeReceiver);
    pSyncNode->pNewNodeReceiver = NULL;
  }
M
Minghao Li 已提交
1206 1207
}

1208
void syncHbTimerDataFree(SSyncHbTimerData* pData) { taosMemoryFree(pData); }
M
Minghao Li 已提交
1209

M
Minghao Li 已提交
1210
void syncNodeClose(SSyncNode* pSyncNode) {
S
Shengliang Guan 已提交
1211
  if (pSyncNode == NULL) return;
1212
  sNInfo(pSyncNode, "sync close, node:%p", pSyncNode);
M
Minghao Li 已提交
1213

1214 1215 1216 1217
  syncNodeStopPingTimer(pSyncNode);
  syncNodeStopElectTimer(pSyncNode);
  syncNodeStopHeartbeatTimer(pSyncNode);

B
Benguang Zhao 已提交
1218
  syncNodeLogReplMgrDestroy(pSyncNode);
1219

M
Minghao Li 已提交
1220
  syncRespMgrDestroy(pSyncNode->pSyncRespMgr);
1221
  pSyncNode->pSyncRespMgr = NULL;
M
Minghao Li 已提交
1222
  voteGrantedDestroy(pSyncNode->pVotesGranted);
1223
  pSyncNode->pVotesGranted = NULL;
M
Minghao Li 已提交
1224
  votesRespondDestory(pSyncNode->pVotesRespond);
1225
  pSyncNode->pVotesRespond = NULL;
M
Minghao Li 已提交
1226
  syncIndexMgrDestroy(pSyncNode->pNextIndex);
1227
  pSyncNode->pNextIndex = NULL;
M
Minghao Li 已提交
1228
  syncIndexMgrDestroy(pSyncNode->pMatchIndex);
1229
  pSyncNode->pMatchIndex = NULL;
M
Minghao Li 已提交
1230
  logStoreDestory(pSyncNode->pLogStore);
1231
  pSyncNode->pLogStore = NULL;
B
Benguang Zhao 已提交
1232 1233
  syncLogBufferDestroy(pSyncNode->pLogBuf);
  pSyncNode->pLogBuf = NULL;
M
Minghao Li 已提交
1234

S
Shengliang Guan 已提交
1235
  for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
1236 1237
    if (pSyncNode->senders[i] != NULL) {
      sDebug("vgId:%d, snapshot sender destroy while close, data:%p", pSyncNode->vgId, pSyncNode->senders[i]);
1238

1239 1240
      if (snapshotSenderIsStart(pSyncNode->senders[i])) {
        snapshotSenderStop(pSyncNode->senders[i], false);
1241 1242
      }

1243 1244
      snapshotSenderDestroy(pSyncNode->senders[i]);
      pSyncNode->senders[i] = NULL;
M
Minghao Li 已提交
1245 1246 1247
    }
  }

M
Minghao Li 已提交
1248
  if (pSyncNode->pNewNodeReceiver != NULL) {
1249
    if (snapshotReceiverIsStart(pSyncNode->pNewNodeReceiver)) {
S
Shengliang Guan 已提交
1250
      snapshotReceiverStop(pSyncNode->pNewNodeReceiver);
1251 1252
    }

1253
    sDebug("vgId:%d, snapshot receiver destroy while close, data:%p", pSyncNode->vgId, pSyncNode->pNewNodeReceiver);
M
Minghao Li 已提交
1254 1255 1256 1257
    snapshotReceiverDestroy(pSyncNode->pNewNodeReceiver);
    pSyncNode->pNewNodeReceiver = NULL;
  }

1258 1259 1260 1261
  if (pSyncNode->pFsm != NULL) {
    taosMemoryFree(pSyncNode->pFsm);
  }

1262 1263
  raftStoreClose(pSyncNode);

1264
  taosMemoryFree(pSyncNode);
M
Minghao Li 已提交
1265 1266
}

1267
ESyncStrategy syncNodeStrategy(SSyncNode* pSyncNode) { return pSyncNode->raftCfg.snapshotStrategy; }
M
Minghao Li 已提交
1268

M
Minghao Li 已提交
1269 1270 1271
// timer control --------------
int32_t syncNodeStartPingTimer(SSyncNode* pSyncNode) {
  int32_t ret = 0;
S
Shengliang Guan 已提交
1272 1273
  if (syncIsInit()) {
    taosTmrReset(pSyncNode->FpPingTimerCB, pSyncNode->pingTimerMS, pSyncNode, syncEnv()->pTimerManager,
1274 1275 1276
                 &pSyncNode->pPingTimer);
    atomic_store_64(&pSyncNode->pingTimerLogicClock, pSyncNode->pingTimerLogicClockUser);
  } else {
M
Minghao Li 已提交
1277
    sError("vgId:%d, start ping timer error, sync env is stop", pSyncNode->vgId);
1278
  }
M
Minghao Li 已提交
1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291
  return ret;
}

int32_t syncNodeStopPingTimer(SSyncNode* pSyncNode) {
  int32_t ret = 0;
  atomic_add_fetch_64(&pSyncNode->pingTimerLogicClockUser, 1);
  taosTmrStop(pSyncNode->pPingTimer);
  pSyncNode->pPingTimer = NULL;
  return ret;
}

int32_t syncNodeStartElectTimer(SSyncNode* pSyncNode, int32_t ms) {
  int32_t ret = 0;
S
Shengliang Guan 已提交
1292
  if (syncIsInit()) {
1293
    pSyncNode->electTimerMS = ms;
S
Shengliang Guan 已提交
1294

1295 1296 1297 1298 1299
    int64_t execTime = taosGetTimestampMs() + ms;
    atomic_store_64(&(pSyncNode->electTimerParam.executeTime), execTime);
    atomic_store_64(&(pSyncNode->electTimerParam.logicClock), pSyncNode->electTimerLogicClock);
    pSyncNode->electTimerParam.pSyncNode = pSyncNode;
    pSyncNode->electTimerParam.pData = NULL;
S
Shengliang Guan 已提交
1300

M
Minghao Li 已提交
1301
    taosTmrReset(pSyncNode->FpElectTimerCB, pSyncNode->electTimerMS, (void*)(pSyncNode->rid), syncEnv()->pTimerManager,
1302
                 &pSyncNode->pElectTimer);
1303

1304
  } else {
M
Minghao Li 已提交
1305
    sError("vgId:%d, start elect timer error, sync env is stop", pSyncNode->vgId);
1306
  }
M
Minghao Li 已提交
1307 1308 1309 1310 1311
  return ret;
}

int32_t syncNodeStopElectTimer(SSyncNode* pSyncNode) {
  int32_t ret = 0;
M
Minghao Li 已提交
1312
  atomic_add_fetch_64(&pSyncNode->electTimerLogicClock, 1);
M
Minghao Li 已提交
1313 1314
  taosTmrStop(pSyncNode->pElectTimer);
  pSyncNode->pElectTimer = NULL;
1315

M
Minghao Li 已提交
1316 1317 1318 1319 1320 1321 1322 1323 1324 1325
  return ret;
}

int32_t syncNodeRestartElectTimer(SSyncNode* pSyncNode, int32_t ms) {
  int32_t ret = 0;
  syncNodeStopElectTimer(pSyncNode);
  syncNodeStartElectTimer(pSyncNode, ms);
  return ret;
}

1326
void syncNodeResetElectTimer(SSyncNode* pSyncNode) {
M
Minghao Li 已提交
1327 1328
  int32_t electMS;

1329
  if (pSyncNode->raftCfg.isStandBy) {
M
Minghao Li 已提交
1330 1331 1332 1333
    electMS = TIMER_MAX_MS;
  } else {
    electMS = syncUtilElectRandomMS(pSyncNode->electBaseLine, 2 * pSyncNode->electBaseLine);
  }
1334 1335

  (void)syncNodeRestartElectTimer(pSyncNode, electMS);
1336

S
Shengliang Guan 已提交
1337 1338
  sNTrace(pSyncNode, "reset elect timer, min:%d, max:%d, ms:%d", pSyncNode->electBaseLine, 2 * pSyncNode->electBaseLine,
          electMS);
M
Minghao Li 已提交
1339 1340
}

M
Minghao Li 已提交
1341
static int32_t syncNodeDoStartHeartbeatTimer(SSyncNode* pSyncNode) {
M
Minghao Li 已提交
1342
  int32_t ret = 0;
S
Shengliang Guan 已提交
1343 1344
  if (syncIsInit()) {
    taosTmrReset(pSyncNode->FpHeartbeatTimerCB, pSyncNode->heartbeatTimerMS, pSyncNode, syncEnv()->pTimerManager,
1345 1346 1347
                 &pSyncNode->pHeartbeatTimer);
    atomic_store_64(&pSyncNode->heartbeatTimerLogicClock, pSyncNode->heartbeatTimerLogicClockUser);
  } else {
M
Minghao Li 已提交
1348
    sError("vgId:%d, start heartbeat timer error, sync env is stop", pSyncNode->vgId);
1349
  }
1350

S
Shengliang Guan 已提交
1351
  sNTrace(pSyncNode, "start heartbeat timer, ms:%d", pSyncNode->heartbeatTimerMS);
M
Minghao Li 已提交
1352 1353 1354
  return ret;
}

M
Minghao Li 已提交
1355
int32_t syncNodeStartHeartbeatTimer(SSyncNode* pSyncNode) {
1356
  int32_t ret = 0;
M
Minghao Li 已提交
1357

1358
#if 0
M
Minghao Li 已提交
1359
  pSyncNode->heartbeatTimerMS = pSyncNode->hbBaseLine;
1360 1361
  ret = syncNodeDoStartHeartbeatTimer(pSyncNode);
#endif
1362

S
Shengliang Guan 已提交
1363
  for (int32_t i = 0; i < pSyncNode->peersNum; ++i) {
1364
    SSyncTimer* pSyncTimer = syncNodeGetHbTimer(pSyncNode, &(pSyncNode->peersId[i]));
M
Minghao Li 已提交
1365 1366 1367
    if (pSyncTimer != NULL) {
      syncHbTimerStart(pSyncNode, pSyncTimer);
    }
1368
  }
1369

M
Minghao Li 已提交
1370 1371 1372
  return ret;
}

M
Minghao Li 已提交
1373 1374
int32_t syncNodeStopHeartbeatTimer(SSyncNode* pSyncNode) {
  int32_t ret = 0;
1375 1376

#if 0
M
Minghao Li 已提交
1377 1378 1379
  atomic_add_fetch_64(&pSyncNode->heartbeatTimerLogicClockUser, 1);
  taosTmrStop(pSyncNode->pHeartbeatTimer);
  pSyncNode->pHeartbeatTimer = NULL;
1380
#endif
1381

S
Shengliang Guan 已提交
1382
  for (int32_t i = 0; i < pSyncNode->peersNum; ++i) {
1383
    SSyncTimer* pSyncTimer = syncNodeGetHbTimer(pSyncNode, &(pSyncNode->peersId[i]));
M
Minghao Li 已提交
1384 1385 1386
    if (pSyncTimer != NULL) {
      syncHbTimerStop(pSyncNode, pSyncTimer);
    }
1387
  }
1388

M
Minghao Li 已提交
1389 1390 1391
  return ret;
}

1392 1393 1394 1395 1396 1397
int32_t syncNodeRestartHeartbeatTimer(SSyncNode* pSyncNode) {
  syncNodeStopHeartbeatTimer(pSyncNode);
  syncNodeStartHeartbeatTimer(pSyncNode);
  return 0;
}

1398 1399 1400 1401 1402 1403 1404 1405
int32_t syncNodeSendMsgById(const SRaftId* destRaftId, SSyncNode* pNode, SRpcMsg* pMsg) {
  SEpSet* epSet = NULL;
  for (int32_t i = 0; i < pNode->peersNum; ++i) {
    if (destRaftId->addr == pNode->peersId[i].addr) {
      epSet = &pNode->peersEpset[i];
      break;
    }
  }
1406

S
Shengliang Guan 已提交
1407
  int32_t code = -1;
1408
  if (pNode->syncSendMSg != NULL && epSet != NULL) {
M
Minghao Li 已提交
1409
    syncUtilMsgHtoN(pMsg->pCont);
1410
    pMsg->info.noResp = 1;
S
Shengliang Guan 已提交
1411 1412 1413 1414 1415 1416
    code = pNode->syncSendMSg(epSet, pMsg);
  }

  if (code < 0) {
    sError("vgId:%d, sync send msg by id error, epset:%p dnode:%d addr:%" PRId64 " err:0x%x", pNode->vgId, epSet,
           DID(destRaftId), destRaftId->addr, terrno);
S
Shengliang Guan 已提交
1417
    rpcFreeCont(pMsg->pCont);
1418
    terrno = TSDB_CODE_SYN_INTERNAL_ERROR;
M
Minghao Li 已提交
1419
  }
S
Shengliang Guan 已提交
1420 1421

  return code;
M
Minghao Li 已提交
1422 1423
}

1424
inline bool syncNodeInConfig(SSyncNode* pNode, const SSyncCfg* pCfg) {
1425 1426 1427
  bool b1 = false;
  bool b2 = false;

1428 1429 1430
  for (int32_t i = 0; i < pCfg->replicaNum; ++i) {
    if (strcmp(pCfg->nodeInfo[i].nodeFqdn, pNode->myNodeInfo.nodeFqdn) == 0 &&
        pCfg->nodeInfo[i].nodePort == pNode->myNodeInfo.nodePort) {
1431 1432 1433 1434 1435
      b1 = true;
      break;
    }
  }

1436 1437 1438 1439 1440
  for (int32_t i = 0; i < pCfg->replicaNum; ++i) {
    SRaftId raftId = {
        .addr = SYNC_ADDR(&pCfg->nodeInfo[i]),
        .vgId = pNode->vgId,
    };
1441

1442
    if (syncUtilSameId(&raftId, &pNode->myRaftId)) {
1443 1444 1445 1446 1447
      b2 = true;
      break;
    }
  }

1448
  ASSERT(b1 == b2);
1449 1450 1451
  return b1;
}

1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464
static bool syncIsConfigChanged(const SSyncCfg* pOldCfg, const SSyncCfg* pNewCfg) {
  if (pOldCfg->replicaNum != pNewCfg->replicaNum) return true;
  if (pOldCfg->myIndex != pNewCfg->myIndex) return true;
  for (int32_t i = 0; i < pOldCfg->replicaNum; ++i) {
    const SNodeInfo* pOldInfo = &pOldCfg->nodeInfo[i];
    const SNodeInfo* pNewInfo = &pNewCfg->nodeInfo[i];
    if (strcmp(pOldInfo->nodeFqdn, pNewInfo->nodeFqdn) != 0) return true;
    if (pOldInfo->nodePort != pNewInfo->nodePort) return true;
  }

  return false;
}

M
Minghao Li 已提交
1465
void syncNodeDoConfigChange(SSyncNode* pSyncNode, SSyncCfg* pNewConfig, SyncIndex lastConfigChangeIndex) {
1466
  SSyncCfg oldConfig = pSyncNode->raftCfg.cfg;
1467 1468 1469 1470
  if (!syncIsConfigChanged(&oldConfig, pNewConfig)) {
    sInfo("vgId:1, sync not reconfig since not changed");
    return;
  }
S
Shengliang Guan 已提交
1471

1472 1473
  pSyncNode->raftCfg.cfg = *pNewConfig;
  pSyncNode->raftCfg.lastConfigIndex = lastConfigChangeIndex;
1474

1475 1476
  pSyncNode->configChangeNum++;

M
Minghao Li 已提交
1477 1478
  bool IamInOld = syncNodeInConfig(pSyncNode, &oldConfig);
  bool IamInNew = syncNodeInConfig(pSyncNode, pNewConfig);
M
Minghao Li 已提交
1479

M
Minghao Li 已提交
1480 1481
  bool isDrop = false;
  bool isAdd = false;
M
Minghao Li 已提交
1482

M
Minghao Li 已提交
1483 1484 1485 1486
  if (IamInOld && !IamInNew) {
    isDrop = true;
  } else {
    isDrop = false;
1487
  }
1488

M
Minghao Li 已提交
1489 1490 1491 1492 1493
  if (!IamInOld && IamInNew) {
    isAdd = true;
  } else {
    isAdd = false;
  }
M
Minghao Li 已提交
1494

M
Minghao Li 已提交
1495
  // log begin config change
1496 1497
  sNInfo(pSyncNode, "begin do config change, from %d to %d", pSyncNode->vgId, oldConfig.replicaNum,
         pNewConfig->replicaNum);
M
Minghao Li 已提交
1498

M
Minghao Li 已提交
1499
  if (IamInNew) {
1500
    pSyncNode->raftCfg.isStandBy = 0;  // change isStandBy to normal
M
Minghao Li 已提交
1501
  }
M
Minghao Li 已提交
1502
  if (isDrop) {
1503
    pSyncNode->raftCfg.isStandBy = 1;  // set standby
M
Minghao Li 已提交
1504 1505
  }

M
Minghao Li 已提交
1506
  // add last config index
1507
  syncAddCfgIndex(pSyncNode, lastConfigChangeIndex);
M
Minghao Li 已提交
1508

M
Minghao Li 已提交
1509 1510 1511 1512 1513 1514 1515 1516 1517
  if (IamInNew) {
    //-----------------------------------------
    int32_t ret = 0;

    // save snapshot senders
    int32_t oldReplicaNum = pSyncNode->replicaNum;
    SRaftId oldReplicasId[TSDB_MAX_REPLICA];
    memcpy(oldReplicasId, pSyncNode->replicasId, sizeof(oldReplicasId));
    SSyncSnapshotSender* oldSenders[TSDB_MAX_REPLICA];
S
Shengliang Guan 已提交
1518
    for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
1519
      oldSenders[i] = pSyncNode->senders[i];
S
Shengliang Guan 已提交
1520
      sSTrace(oldSenders[i], "snapshot sender save old");
M
Minghao Li 已提交
1521
    }
1522

M
Minghao Li 已提交
1523
    // init internal
1524
    pSyncNode->myNodeInfo = pSyncNode->raftCfg.cfg.nodeInfo[pSyncNode->raftCfg.cfg.myIndex];
1525
    syncUtilNodeInfo2RaftId(&pSyncNode->myNodeInfo, pSyncNode->vgId, &pSyncNode->myRaftId);
M
Minghao Li 已提交
1526 1527

    // init peersNum, peers, peersId
1528
    pSyncNode->peersNum = pSyncNode->raftCfg.cfg.replicaNum - 1;
S
Shengliang Guan 已提交
1529
    int32_t j = 0;
1530 1531 1532 1533
    for (int32_t i = 0; i < pSyncNode->raftCfg.cfg.replicaNum; ++i) {
      if (i != pSyncNode->raftCfg.cfg.myIndex) {
        pSyncNode->peersNodeInfo[j] = pSyncNode->raftCfg.cfg.nodeInfo[i];
        syncUtilNodeInfo2EpSet(&pSyncNode->peersNodeInfo[j], &pSyncNode->peersEpset[j]);
M
Minghao Li 已提交
1534 1535 1536
        j++;
      }
    }
S
Shengliang Guan 已提交
1537
    for (int32_t i = 0; i < pSyncNode->peersNum; ++i) {
1538
      syncUtilNodeInfo2RaftId(&pSyncNode->peersNodeInfo[i], pSyncNode->vgId, &pSyncNode->peersId[i]);
M
Minghao Li 已提交
1539
    }
1540

M
Minghao Li 已提交
1541
    // init replicaNum, replicasId
1542 1543 1544
    pSyncNode->replicaNum = pSyncNode->raftCfg.cfg.replicaNum;
    for (int32_t i = 0; i < pSyncNode->raftCfg.cfg.replicaNum; ++i) {
      syncUtilNodeInfo2RaftId(&pSyncNode->raftCfg.cfg.nodeInfo[i], pSyncNode->vgId, &pSyncNode->replicasId[i]);
M
Minghao Li 已提交
1545
    }
1546

1547
    // update quorum first
1548
    pSyncNode->quorum = syncUtilQuorum(pSyncNode->raftCfg.cfg.replicaNum);
1549

M
Minghao Li 已提交
1550 1551 1552 1553
    syncIndexMgrUpdate(pSyncNode->pNextIndex, pSyncNode);
    syncIndexMgrUpdate(pSyncNode->pMatchIndex, pSyncNode);
    voteGrantedUpdate(pSyncNode->pVotesGranted, pSyncNode);
    votesRespondUpdate(pSyncNode->pVotesRespond, pSyncNode);
M
Minghao Li 已提交
1554

M
Minghao Li 已提交
1555
    // reset snapshot senders
1556

M
Minghao Li 已提交
1557
    // clear new
S
Shengliang Guan 已提交
1558
    for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
1559
      pSyncNode->senders[i] = NULL;
M
Minghao Li 已提交
1560
    }
M
Minghao Li 已提交
1561

M
Minghao Li 已提交
1562
    // reset new
S
Shengliang Guan 已提交
1563
    for (int32_t i = 0; i < pSyncNode->replicaNum; ++i) {
M
Minghao Li 已提交
1564 1565
      // reset sender
      bool reset = false;
S
Shengliang Guan 已提交
1566
      for (int32_t j = 0; j < TSDB_MAX_REPLICA; ++j) {
M
Minghao Li 已提交
1567
        if (syncUtilSameId(&(pSyncNode->replicasId)[i], &oldReplicasId[j]) && oldSenders[j] != NULL) {
1568 1569
          sNTrace(pSyncNode, "snapshot sender reset for:%" PRId64 ", newIndex:%d, dnode:%d, %p",
                  (pSyncNode->replicasId)[i].addr, i, DID(&pSyncNode->replicasId[i]), oldSenders[j]);
M
Minghao Li 已提交
1570

1571
          pSyncNode->senders[i] = oldSenders[j];
M
Minghao Li 已提交
1572 1573 1574 1575
          oldSenders[j] = NULL;
          reset = true;

          // reset replicaIndex
1576 1577
          int32_t oldreplicaIndex = pSyncNode->senders[i]->replicaIndex;
          pSyncNode->senders[i]->replicaIndex = i;
M
Minghao Li 已提交
1578

1579 1580
          sNTrace(pSyncNode, "snapshot sender udpate replicaIndex from %d to %d, dnode:%d, %p, reset:%d",
                  oldreplicaIndex, i, DID(&pSyncNode->replicasId[i]), pSyncNode->senders[i], reset);
M
Minghao Li 已提交
1581 1582

          break;
M
Minghao Li 已提交
1583
        }
1584 1585
      }
    }
1586

M
Minghao Li 已提交
1587
    // create new
S
Shengliang Guan 已提交
1588
    for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
1589 1590 1591 1592 1593 1594 1595 1596
      if (pSyncNode->senders[i] == NULL) {
        pSyncNode->senders[i] = snapshotSenderCreate(pSyncNode, i);
        if (pSyncNode->senders[i] == NULL) {
          // will be created later while send snapshot
          sSError(pSyncNode->senders[i], "snapshot sender create failed while reconfig");
        } else {
          sSDebug(pSyncNode->senders[i], "snapshot sender create while reconfig, data:%p", pSyncNode->senders[i]);
        }
S
Shengliang Guan 已提交
1597
      } else {
1598
        sSDebug(pSyncNode->senders[i], "snapshot sender already exist, data:%p", pSyncNode->senders[i]);
M
Minghao Li 已提交
1599
      }
1600 1601
    }

M
Minghao Li 已提交
1602
    // free old
S
Shengliang Guan 已提交
1603
    for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
M
Minghao Li 已提交
1604
      if (oldSenders[i] != NULL) {
1605
        sSDebug(oldSenders[i], "snapshot sender destroy old, data:%p replica-index:%d", oldSenders[i], i);
M
Minghao Li 已提交
1606 1607 1608
        snapshotSenderDestroy(oldSenders[i]);
        oldSenders[i] = NULL;
      }
1609 1610
    }

1611
    // persist cfg
1612
    syncWriteCfgFile(pSyncNode);
1613

M
Minghao Li 已提交
1614 1615
    // change isStandBy to normal (election timeout)
    if (pSyncNode->state == TAOS_SYNC_STATE_LEADER) {
1616
      syncNodeBecomeLeader(pSyncNode, "");
1617 1618 1619

      // Raft 3.6.2 Committing entries from previous terms
      syncNodeAppendNoop(pSyncNode);
1620
      // syncMaybeAdvanceCommitIndex(pSyncNode);
1621

M
Minghao Li 已提交
1622
    } else {
1623
      syncNodeBecomeFollower(pSyncNode, "");
M
Minghao Li 已提交
1624 1625
    }
  } else {
1626
    // persist cfg
1627 1628
    syncWriteCfgFile(pSyncNode);
    sNInfo(pSyncNode, "do not config change from %d to %d", oldConfig.replicaNum, pNewConfig->replicaNum);
1629
  }
1630

M
Minghao Li 已提交
1631
_END:
M
Minghao Li 已提交
1632
  // log end config change
S
Shengliang Guan 已提交
1633
  sNInfo(pSyncNode, "end do config change, from %d to %d", oldConfig.replicaNum, pNewConfig->replicaNum);
M
Minghao Li 已提交
1634 1635
}

M
Minghao Li 已提交
1636 1637
// raft state change --------------
void syncNodeUpdateTerm(SSyncNode* pSyncNode, SyncTerm term) {
1638
  if (term > raftStoreGetTerm(pSyncNode)) {
S
Shengliang Guan 已提交
1639
    raftStoreSetTerm(pSyncNode, term);
1640
    char tmpBuf[64];
1641
    snprintf(tmpBuf, sizeof(tmpBuf), "update term to %" PRId64, term);
1642
    syncNodeBecomeFollower(pSyncNode, tmpBuf);
S
Shengliang Guan 已提交
1643
    raftStoreClearVote(pSyncNode);
M
Minghao Li 已提交
1644 1645 1646
  }
}

1647
void syncNodeUpdateTermWithoutStepDown(SSyncNode* pSyncNode, SyncTerm term) {
1648
  if (term > raftStoreGetTerm(pSyncNode)) {
S
Shengliang Guan 已提交
1649
    raftStoreSetTerm(pSyncNode, term);
1650 1651 1652
  }
}

M
Minghao Li 已提交
1653
void syncNodeStepDown(SSyncNode* pSyncNode, SyncTerm newTerm) {
1654 1655 1656
  SyncTerm currentTerm = raftStoreGetTerm(pSyncNode);
  if (currentTerm > newTerm) {
    sNTrace(pSyncNode, "step down, ignore, new-term:%" PRId64 ", current-term:%" PRId64, newTerm, currentTerm);
M
Minghao Li 已提交
1657 1658
    return;
  }
M
Minghao Li 已提交
1659 1660

  do {
1661
    sNTrace(pSyncNode, "step down, new-term:%" PRId64 ", current-term:%" PRId64, newTerm, currentTerm);
M
Minghao Li 已提交
1662 1663
  } while (0);

1664
  if (currentTerm < newTerm) {
S
Shengliang Guan 已提交
1665
    raftStoreSetTerm(pSyncNode, newTerm);
M
Minghao Li 已提交
1666
    char tmpBuf[64];
1667
    snprintf(tmpBuf, sizeof(tmpBuf), "step down, update term to %" PRId64, newTerm);
M
Minghao Li 已提交
1668
    syncNodeBecomeFollower(pSyncNode, tmpBuf);
S
Shengliang Guan 已提交
1669
    raftStoreClearVote(pSyncNode);
M
Minghao Li 已提交
1670 1671 1672 1673 1674 1675 1676 1677

  } else {
    if (pSyncNode->state != TAOS_SYNC_STATE_FOLLOWER) {
      syncNodeBecomeFollower(pSyncNode, "step down");
    }
  }
}

1678 1679
void syncNodeLeaderChangeRsp(SSyncNode* pSyncNode) { syncRespCleanRsp(pSyncNode->pSyncRespMgr); }

1680
void syncNodeBecomeFollower(SSyncNode* pSyncNode, const char* debugStr) {
M
Minghao Li 已提交
1681
  // maybe clear leader cache
M
Minghao Li 已提交
1682 1683 1684 1685
  if (pSyncNode->state == TAOS_SYNC_STATE_LEADER) {
    pSyncNode->leaderCache = EMPTY_RAFT_ID;
  }

1686 1687
  pSyncNode->hbSlowNum = 0;

M
Minghao Li 已提交
1688
  // state change
M
Minghao Li 已提交
1689 1690 1691
  pSyncNode->state = TAOS_SYNC_STATE_FOLLOWER;
  syncNodeStopHeartbeatTimer(pSyncNode);

M
Minghao Li 已提交
1692 1693
  // reset elect timer
  syncNodeResetElectTimer(pSyncNode);
M
Minghao Li 已提交
1694

1695 1696 1697
  // send rsp to client
  syncNodeLeaderChangeRsp(pSyncNode);

1698 1699 1700 1701 1702
  // call back
  if (pSyncNode->pFsm != NULL && pSyncNode->pFsm->FpBecomeFollowerCb != NULL) {
    pSyncNode->pFsm->FpBecomeFollowerCb(pSyncNode->pFsm);
  }

M
Minghao Li 已提交
1703 1704 1705
  // min match index
  pSyncNode->minMatchIndex = SYNC_INDEX_INVALID;

B
Benguang Zhao 已提交
1706 1707 1708
  // reset log buffer
  syncLogBufferReset(pSyncNode->pLogBuf, pSyncNode);

M
Minghao Li 已提交
1709
  // trace log
S
Shengliang Guan 已提交
1710
  sNTrace(pSyncNode, "become follower %s", debugStr);
M
Minghao Li 已提交
1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730
}

// TLA+ Spec
// \* Candidate i transitions to leader.
// BecomeLeader(i) ==
//     /\ state[i] = Candidate
//     /\ votesGranted[i] \in Quorum
//     /\ state'      = [state EXCEPT ![i] = Leader]
//     /\ nextIndex'  = [nextIndex EXCEPT ![i] =
//                          [j \in Server |-> Len(log[i]) + 1]]
//     /\ matchIndex' = [matchIndex EXCEPT ![i] =
//                          [j \in Server |-> 0]]
//     /\ elections'  = elections \cup
//                          {[eterm     |-> currentTerm[i],
//                            eleader   |-> i,
//                            elog      |-> log[i],
//                            evotes    |-> votesGranted[i],
//                            evoterLog |-> voterLog[i]]}
//     /\ UNCHANGED <<messages, currentTerm, votedFor, candidateVars, logVars>>
//
1731
void syncNodeBecomeLeader(SSyncNode* pSyncNode, const char* debugStr) {
1732 1733
  pSyncNode->leaderTime = taosGetTimestampMs();

1734
  pSyncNode->becomeLeaderNum++;
1735
  pSyncNode->hbrSlowNum = 0;
1736

1737 1738 1739
  // reset restoreFinish
  pSyncNode->restoreFinish = false;

M
Minghao Li 已提交
1740
  // state change
M
Minghao Li 已提交
1741
  pSyncNode->state = TAOS_SYNC_STATE_LEADER;
M
Minghao Li 已提交
1742 1743

  // set leader cache
M
Minghao Li 已提交
1744 1745
  pSyncNode->leaderCache = pSyncNode->myRaftId;

S
Shengliang Guan 已提交
1746
  for (int32_t i = 0; i < pSyncNode->pNextIndex->replicaNum; ++i) {
1747 1748 1749
    SyncIndex lastIndex;
    SyncTerm  lastTerm;
    int32_t   code = syncNodeGetLastIndexTerm(pSyncNode, &lastIndex, &lastTerm);
1750
    ASSERT(code == 0);
1751
    pSyncNode->pNextIndex->index[i] = lastIndex + 1;
M
Minghao Li 已提交
1752 1753
  }

S
Shengliang Guan 已提交
1754
  for (int32_t i = 0; i < pSyncNode->pMatchIndex->replicaNum; ++i) {
M
Minghao Li 已提交
1755 1756
    // maybe overwrite myself, no harm
    // just do it!
M
Minghao Li 已提交
1757 1758 1759
    pSyncNode->pMatchIndex->index[i] = SYNC_INDEX_INVALID;
  }

M
Minghao Li 已提交
1760 1761 1762
  // init peer mgr
  syncNodePeerStateInit(pSyncNode);

M
Minghao Li 已提交
1763
#if 0
1764 1765
  // update sender private term
  SSyncSnapshotSender* pMySender = syncNodeGetSnapshotSender(pSyncNode, &(pSyncNode->myRaftId));
1766
  if (pMySender != NULL) {
S
Shengliang Guan 已提交
1767
    for (int32_t i = 0; i < pSyncNode->pMatchIndex->replicaNum; ++i) {
1768 1769
      if (pSyncNode->senders[i]->privateTerm > pMySender->privateTerm) {
        pMySender->privateTerm = pSyncNode->senders[i]->privateTerm;
1770
      }
1771
    }
1772
    (pMySender->privateTerm) += 100;
1773
  }
M
Minghao Li 已提交
1774
#endif
1775

1776
  // close receiver
M
Minghao Li 已提交
1777 1778
  if (pSyncNode != NULL && pSyncNode->pNewNodeReceiver != NULL &&
      snapshotReceiverIsStart(pSyncNode->pNewNodeReceiver)) {
S
Shengliang Guan 已提交
1779
    snapshotReceiverStop(pSyncNode->pNewNodeReceiver);
1780 1781
  }

M
Minghao Li 已提交
1782
  // stop elect timer
M
Minghao Li 已提交
1783
  syncNodeStopElectTimer(pSyncNode);
M
Minghao Li 已提交
1784

M
Minghao Li 已提交
1785 1786
  // start heartbeat timer
  syncNodeStartHeartbeatTimer(pSyncNode);
M
Minghao Li 已提交
1787

M
Minghao Li 已提交
1788 1789
  // send heartbeat right now
  syncNodeHeartbeatPeers(pSyncNode);
M
Minghao Li 已提交
1790

1791 1792 1793 1794 1795
  // call back
  if (pSyncNode->pFsm != NULL && pSyncNode->pFsm->FpBecomeLeaderCb != NULL) {
    pSyncNode->pFsm->FpBecomeLeaderCb(pSyncNode->pFsm);
  }

M
Minghao Li 已提交
1796 1797 1798
  // min match index
  pSyncNode->minMatchIndex = SYNC_INDEX_INVALID;

B
Benguang Zhao 已提交
1799 1800 1801
  // reset log buffer
  syncLogBufferReset(pSyncNode->pLogBuf, pSyncNode);

M
Minghao Li 已提交
1802
  // trace log
1803
  sNInfo(pSyncNode, "become leader %s", debugStr);
M
Minghao Li 已提交
1804 1805 1806
}

void syncNodeCandidate2Leader(SSyncNode* pSyncNode) {
1807
  ASSERT(pSyncNode->state == TAOS_SYNC_STATE_CANDIDATE);
1808 1809 1810 1811 1812
  bool granted = voteGrantedMajority(pSyncNode->pVotesGranted);
  if (!granted) {
    sError("vgId:%d, not granted by majority.", pSyncNode->vgId);
    return;
  }
1813
  syncNodeBecomeLeader(pSyncNode, "candidate to leader");
M
Minghao Li 已提交
1814

S
Shengliang Guan 已提交
1815
  sNTrace(pSyncNode, "state change syncNodeCandidate2Leader");
M
Minghao Li 已提交
1816

B
Benguang Zhao 已提交
1817
  int32_t ret = syncNodeAppendNoop(pSyncNode);
1818 1819 1820 1821
  if (ret < 0) {
    sError("vgId:%d, failed to append noop entry since %s", pSyncNode->vgId, terrstr());
  }

B
Benguang Zhao 已提交
1822
  SyncIndex lastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);
1823
  ASSERT(lastIndex >= 0);
1824 1825
  sInfo("vgId:%d, become leader. term:%" PRId64 ", commit index:%" PRId64 ", last index:%" PRId64 "", pSyncNode->vgId,
        raftStoreGetTerm(pSyncNode), pSyncNode->commitIndex, lastIndex);
B
Benguang Zhao 已提交
1826 1827
}

M
Minghao Li 已提交
1828 1829
bool syncNodeIsMnode(SSyncNode* pSyncNode) { return (pSyncNode->vgId == 1); }

M
Minghao Li 已提交
1830
int32_t syncNodePeerStateInit(SSyncNode* pSyncNode) {
S
Shengliang Guan 已提交
1831
  for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
M
Minghao Li 已提交
1832 1833 1834 1835 1836
    pSyncNode->peerStates[i].lastSendIndex = SYNC_INDEX_INVALID;
    pSyncNode->peerStates[i].lastSendTime = 0;
  }

  return 0;
M
Minghao Li 已提交
1837 1838 1839
}

void syncNodeFollower2Candidate(SSyncNode* pSyncNode) {
1840
  ASSERT(pSyncNode->state == TAOS_SYNC_STATE_FOLLOWER);
M
Minghao Li 已提交
1841
  pSyncNode->state = TAOS_SYNC_STATE_CANDIDATE;
B
Benguang Zhao 已提交
1842
  SyncIndex lastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);
S
Shengliang Guan 已提交
1843
  sInfo("vgId:%d, become candidate from follower. term:%" PRId64 ", commit index:%" PRId64 ", last index:%" PRId64,
1844
        pSyncNode->vgId, raftStoreGetTerm(pSyncNode), pSyncNode->commitIndex, lastIndex);
M
Minghao Li 已提交
1845

S
Shengliang Guan 已提交
1846
  sNTrace(pSyncNode, "follower to candidate");
M
Minghao Li 已提交
1847 1848 1849
}

void syncNodeLeader2Follower(SSyncNode* pSyncNode) {
1850
  ASSERT(pSyncNode->state == TAOS_SYNC_STATE_LEADER);
1851
  syncNodeBecomeFollower(pSyncNode, "leader to follower");
B
Benguang Zhao 已提交
1852
  SyncIndex lastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);
S
Shengliang Guan 已提交
1853
  sInfo("vgId:%d, become follower from leader. term:%" PRId64 ", commit index:%" PRId64 ", last index:%" PRId64,
1854
        pSyncNode->vgId, raftStoreGetTerm(pSyncNode), pSyncNode->commitIndex, lastIndex);
B
Benguang Zhao 已提交
1855

S
Shengliang Guan 已提交
1856
  sNTrace(pSyncNode, "leader to follower");
M
Minghao Li 已提交
1857 1858 1859
}

void syncNodeCandidate2Follower(SSyncNode* pSyncNode) {
1860
  ASSERT(pSyncNode->state == TAOS_SYNC_STATE_CANDIDATE);
1861
  syncNodeBecomeFollower(pSyncNode, "candidate to follower");
B
Benguang Zhao 已提交
1862
  SyncIndex lastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);
S
Shengliang Guan 已提交
1863
  sInfo("vgId:%d, become follower from candidate. term:%" PRId64 ", commit index:%" PRId64 ", last index:%" PRId64,
1864
        pSyncNode->vgId, raftStoreGetTerm(pSyncNode), pSyncNode->commitIndex, lastIndex);
B
Benguang Zhao 已提交
1865

S
Shengliang Guan 已提交
1866
  sNTrace(pSyncNode, "candidate to follower");
M
Minghao Li 已提交
1867 1868
}

M
Minghao Li 已提交
1869 1870
// just called by syncNodeVoteForSelf
// need assert
M
Minghao Li 已提交
1871
void syncNodeVoteForTerm(SSyncNode* pSyncNode, SyncTerm term, SRaftId* pRaftId) {
1872
  ASSERT(term == raftStoreGetTerm(pSyncNode));
1873 1874
  bool voted = raftStoreHasVoted(pSyncNode);
  ASSERT(!voted);
M
Minghao Li 已提交
1875

S
Shengliang Guan 已提交
1876
  raftStoreVote(pSyncNode, pRaftId);
M
Minghao Li 已提交
1877 1878
}

M
Minghao Li 已提交
1879
// simulate get vote from outside
1880 1881
void syncNodeVoteForSelf(SSyncNode* pSyncNode, SyncTerm currentTerm) {
  syncNodeVoteForTerm(pSyncNode, currentTerm, &pSyncNode->myRaftId);
M
Minghao Li 已提交
1882

S
Shengliang Guan 已提交
1883 1884
  SRpcMsg rpcMsg = {0};
  int32_t ret = syncBuildRequestVoteReply(&rpcMsg, pSyncNode->vgId);
S
Shengliang Guan 已提交
1885
  if (ret != 0) return;
M
Minghao Li 已提交
1886

S
Shengliang Guan 已提交
1887
  SyncRequestVoteReply* pMsg = rpcMsg.pCont;
M
Minghao Li 已提交
1888 1889
  pMsg->srcId = pSyncNode->myRaftId;
  pMsg->destId = pSyncNode->myRaftId;
1890
  pMsg->term = currentTerm;
M
Minghao Li 已提交
1891 1892 1893 1894
  pMsg->voteGranted = true;

  voteGrantedVote(pSyncNode->pVotesGranted, pMsg);
  votesRespondAdd(pSyncNode->pVotesRespond, pMsg);
S
Shengliang Guan 已提交
1895
  rpcFreeCont(rpcMsg.pCont);
M
Minghao Li 已提交
1896 1897
}

M
Minghao Li 已提交
1898
// return if has a snapshot
M
Minghao Li 已提交
1899 1900
bool syncNodeHasSnapshot(SSyncNode* pSyncNode) {
  bool      ret = false;
1901
  SSnapshot snapshot = {.data = NULL, .lastApplyIndex = -1, .lastApplyTerm = 0, .lastConfigIndex = -1};
1902 1903
  if (pSyncNode->pFsm->FpGetSnapshotInfo != NULL) {
    pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot);
M
Minghao Li 已提交
1904 1905 1906 1907 1908 1909 1910
    if (snapshot.lastApplyIndex >= SYNC_INDEX_BEGIN) {
      ret = true;
    }
  }
  return ret;
}

M
Minghao Li 已提交
1911 1912
// return max(logLastIndex, snapshotLastIndex)
// if no snapshot and log, return -1
1913
SyncIndex syncNodeGetLastIndex(const SSyncNode* pSyncNode) {
M
Minghao Li 已提交
1914
  SSnapshot snapshot = {.data = NULL, .lastApplyIndex = -1, .lastApplyTerm = 0, .lastConfigIndex = -1};
1915 1916
  if (pSyncNode->pFsm->FpGetSnapshotInfo != NULL) {
    pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot);
M
Minghao Li 已提交
1917 1918 1919 1920 1921 1922 1923
  }
  SyncIndex logLastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);

  SyncIndex lastIndex = logLastIndex > snapshot.lastApplyIndex ? logLastIndex : snapshot.lastApplyIndex;
  return lastIndex;
}

M
Minghao Li 已提交
1924 1925
// return the last term of snapshot and log
// if error, return SYNC_TERM_INVALID (by syncLogLastTerm)
M
Minghao Li 已提交
1926 1927
SyncTerm syncNodeGetLastTerm(SSyncNode* pSyncNode) {
  SyncTerm lastTerm = 0;
M
Minghao Li 已提交
1928 1929
  if (syncNodeHasSnapshot(pSyncNode)) {
    // has snapshot
1930
    SSnapshot snapshot = {.data = NULL, .lastApplyIndex = -1, .lastApplyTerm = 0, .lastConfigIndex = -1};
1931 1932
    if (pSyncNode->pFsm->FpGetSnapshotInfo != NULL) {
      pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot);
M
Minghao Li 已提交
1933 1934
    }

M
Minghao Li 已提交
1935 1936 1937
    SyncIndex logLastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);
    if (logLastIndex > snapshot.lastApplyIndex) {
      lastTerm = pSyncNode->pLogStore->syncLogLastTerm(pSyncNode->pLogStore);
M
Minghao Li 已提交
1938 1939 1940 1941
    } else {
      lastTerm = snapshot.lastApplyTerm;
    }

M
Minghao Li 已提交
1942
  } else {
M
Minghao Li 已提交
1943 1944
    // no snapshot
    lastTerm = pSyncNode->pLogStore->syncLogLastTerm(pSyncNode->pLogStore);
1945
  }
M
Minghao Li 已提交
1946

M
Minghao Li 已提交
1947 1948 1949 1950 1951 1952 1953
  return lastTerm;
}

// get last index and term along with snapshot
int32_t syncNodeGetLastIndexTerm(SSyncNode* pSyncNode, SyncIndex* pLastIndex, SyncTerm* pLastTerm) {
  *pLastIndex = syncNodeGetLastIndex(pSyncNode);
  *pLastTerm = syncNodeGetLastTerm(pSyncNode);
1954 1955
  return 0;
}
M
Minghao Li 已提交
1956

M
Minghao Li 已提交
1957
// return append-entries first try index
M
Minghao Li 已提交
1958 1959 1960 1961 1962
SyncIndex syncNodeSyncStartIndex(SSyncNode* pSyncNode) {
  SyncIndex syncStartIndex = syncNodeGetLastIndex(pSyncNode) + 1;
  return syncStartIndex;
}

M
Minghao Li 已提交
1963 1964
// if index > 0, return index - 1
// else, return -1
1965 1966 1967 1968 1969 1970 1971 1972 1973
SyncIndex syncNodeGetPreIndex(SSyncNode* pSyncNode, SyncIndex index) {
  SyncIndex preIndex = index - 1;
  if (preIndex < SYNC_INDEX_INVALID) {
    preIndex = SYNC_INDEX_INVALID;
  }

  return preIndex;
}

M
Minghao Li 已提交
1974 1975 1976 1977
// if index < 0, return SYNC_TERM_INVALID
// if index == 0, return 0
// if index > 0, return preTerm
// if error, return SYNC_TERM_INVALID
1978 1979 1980 1981 1982 1983 1984 1985 1986
SyncTerm syncNodeGetPreTerm(SSyncNode* pSyncNode, SyncIndex index) {
  if (index < SYNC_INDEX_BEGIN) {
    return SYNC_TERM_INVALID;
  }

  if (index == SYNC_INDEX_BEGIN) {
    return 0;
  }

1987 1988 1989
  SyncTerm  preTerm = 0;
  SyncIndex preIndex = index - 1;

1990
  SSyncRaftEntry* pPreEntry = NULL;
1991 1992 1993 1994 1995 1996 1997
  SLRUCache*      pCache = pSyncNode->pLogStore->pCache;
  LRUHandle*      h = taosLRUCacheLookup(pCache, &preIndex, sizeof(preIndex));
  int32_t         code = 0;
  if (h) {
    pPreEntry = (SSyncRaftEntry*)taosLRUCacheValue(pCache, h);
    code = 0;

1998
    pSyncNode->pLogStore->cacheHit++;
1999 2000 2001
    sNTrace(pSyncNode, "hit cache index:%" PRId64 ", bytes:%u, %p", preIndex, pPreEntry->bytes, pPreEntry);

  } else {
2002
    pSyncNode->pLogStore->cacheMiss++;
2003 2004 2005 2006
    sNTrace(pSyncNode, "miss cache index:%" PRId64, preIndex);

    code = pSyncNode->pLogStore->syncLogGetEntry(pSyncNode->pLogStore, preIndex, &pPreEntry);
  }
M
Minghao Li 已提交
2007 2008 2009 2010 2011 2012

  SSnapshot snapshot = {.data = NULL,
                        .lastApplyIndex = SYNC_INDEX_INVALID,
                        .lastApplyTerm = SYNC_TERM_INVALID,
                        .lastConfigIndex = SYNC_INDEX_INVALID};

2013
  if (code == 0) {
2014
    ASSERT(pPreEntry != NULL);
2015
    preTerm = pPreEntry->term;
2016 2017 2018 2019

    if (h) {
      taosLRUCacheRelease(pCache, h, false);
    } else {
2020
      syncEntryDestroy(pPreEntry);
2021 2022
    }

2023 2024
    return preTerm;
  } else {
2025 2026 2027 2028
    if (pSyncNode->pFsm->FpGetSnapshotInfo != NULL) {
      pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot);
      if (snapshot.lastApplyIndex == preIndex) {
        return snapshot.lastApplyTerm;
2029 2030 2031 2032
      }
    }
  }

2033
  sNError(pSyncNode, "sync node get pre term error, index:%" PRId64 ", snap-index:%" PRId64 ", snap-term:%" PRId64,
S
Shengliang Guan 已提交
2034
          index, snapshot.lastApplyIndex, snapshot.lastApplyTerm);
2035 2036
  return SYNC_TERM_INVALID;
}
M
Minghao Li 已提交
2037 2038 2039 2040

// get pre index and term of "index"
int32_t syncNodeGetPreIndexTerm(SSyncNode* pSyncNode, SyncIndex index, SyncIndex* pPreIndex, SyncTerm* pPreTerm) {
  *pPreIndex = syncNodeGetPreIndex(pSyncNode, index);
M
Minghao Li 已提交
2041
  *pPreTerm = syncNodeGetPreTerm(pSyncNode, index);
M
Minghao Li 已提交
2042 2043 2044
  return 0;
}

M
Minghao Li 已提交
2045
static void syncNodeEqPingTimer(void* param, void* tmrId) {
S
Shengliang Guan 已提交
2046
  if (!syncIsInit()) return;
M
Minghao Li 已提交
2047

S
Shengliang Guan 已提交
2048 2049 2050
  SSyncNode* pNode = param;
  if (atomic_load_64(&pNode->pingTimerLogicClockUser) <= atomic_load_64(&pNode->pingTimerLogicClock)) {
    SRpcMsg rpcMsg = {0};
S
Shengliang Guan 已提交
2051
    int32_t code = syncBuildTimeout(&rpcMsg, SYNC_TIMEOUT_PING, atomic_load_64(&pNode->pingTimerLogicClock),
S
Shengliang Guan 已提交
2052 2053
                                    pNode->pingTimerMS, pNode);
    if (code != 0) {
M
Minghao Li 已提交
2054
      sError("failed to build ping msg");
S
Shengliang Guan 已提交
2055 2056
      rpcFreeCont(rpcMsg.pCont);
      return;
M
Minghao Li 已提交
2057
    }
M
Minghao Li 已提交
2058

M
Minghao Li 已提交
2059
    // sTrace("enqueue ping msg");
S
Shengliang Guan 已提交
2060 2061
    code = pNode->syncEqMsg(pNode->msgcb, &rpcMsg);
    if (code != 0) {
M
Minghao Li 已提交
2062
      sError("failed to sync enqueue ping msg since %s", terrstr());
S
Shengliang Guan 已提交
2063 2064
      rpcFreeCont(rpcMsg.pCont);
      return;
2065
    }
M
Minghao Li 已提交
2066

S
Shengliang Guan 已提交
2067
    taosTmrReset(syncNodeEqPingTimer, pNode->pingTimerMS, pNode, syncEnv()->pTimerManager, &pNode->pPingTimer);
2068
  }
M
Minghao Li 已提交
2069 2070
}

M
Minghao Li 已提交
2071
static void syncNodeEqElectTimer(void* param, void* tmrId) {
S
Shengliang Guan 已提交
2072
  if (!syncIsInit()) return;
M
Minghao Li 已提交
2073

M
Minghao Li 已提交
2074 2075
  int64_t    rid = (int64_t)param;
  SSyncNode* pNode = syncNodeAcquire(rid);
M
Minghao Li 已提交
2076

2077
  if (pNode == NULL) return;
M
Minghao Li 已提交
2078 2079 2080 2081 2082

  if (pNode->syncEqMsg == NULL) {
    syncNodeRelease(pNode);
    return;
  }
2083

2084
  int64_t tsNow = taosGetTimestampMs();
M
Minghao Li 已提交
2085 2086 2087 2088
  if (tsNow < pNode->electTimerParam.executeTime) {
    syncNodeRelease(pNode);
    return;
  }
M
Minghao Li 已提交
2089

S
Shengliang Guan 已提交
2090
  SRpcMsg rpcMsg = {0};
2091 2092
  int32_t code =
      syncBuildTimeout(&rpcMsg, SYNC_TIMEOUT_ELECTION, pNode->electTimerParam.logicClock, pNode->electTimerMS, pNode);
S
Shengliang Guan 已提交
2093

S
Shengliang Guan 已提交
2094
  if (code != 0) {
M
Minghao Li 已提交
2095
    sError("failed to build elect msg");
M
Minghao Li 已提交
2096
    syncNodeRelease(pNode);
S
Shengliang Guan 已提交
2097
    return;
M
Minghao Li 已提交
2098 2099
  }

S
Shengliang Guan 已提交
2100
  SyncTimeout* pTimeout = rpcMsg.pCont;
S
Shengliang Guan 已提交
2101
  sNTrace(pNode, "enqueue elect msg lc:%" PRId64, pTimeout->logicClock);
S
Shengliang Guan 已提交
2102 2103 2104

  code = pNode->syncEqMsg(pNode->msgcb, &rpcMsg);
  if (code != 0) {
M
Minghao Li 已提交
2105
    sError("failed to sync enqueue elect msg since %s", terrstr());
S
Shengliang Guan 已提交
2106
    rpcFreeCont(rpcMsg.pCont);
M
Minghao Li 已提交
2107
    syncNodeRelease(pNode);
2108
    return;
M
Minghao Li 已提交
2109
  }
M
Minghao Li 已提交
2110 2111

  syncNodeRelease(pNode);
M
Minghao Li 已提交
2112 2113
}

M
Minghao Li 已提交
2114
static void syncNodeEqHeartbeatTimer(void* param, void* tmrId) {
S
Shengliang Guan 已提交
2115
  if (!syncIsInit()) return;
2116

S
Shengliang Guan 已提交
2117 2118 2119 2120
  SSyncNode* pNode = param;
  if (pNode->replicaNum > 1) {
    if (atomic_load_64(&pNode->heartbeatTimerLogicClockUser) <= atomic_load_64(&pNode->heartbeatTimerLogicClock)) {
      SRpcMsg rpcMsg = {0};
S
Shengliang Guan 已提交
2121
      int32_t code = syncBuildTimeout(&rpcMsg, SYNC_TIMEOUT_HEARTBEAT, atomic_load_64(&pNode->heartbeatTimerLogicClock),
S
Shengliang Guan 已提交
2122 2123 2124
                                      pNode->heartbeatTimerMS, pNode);

      if (code != 0) {
M
Minghao Li 已提交
2125
        sError("failed to build heartbeat msg");
S
Shengliang Guan 已提交
2126
        return;
2127
      }
M
Minghao Li 已提交
2128

2129
      sTrace("vgId:%d, enqueue heartbeat timer", pNode->vgId);
S
Shengliang Guan 已提交
2130 2131
      code = pNode->syncEqMsg(pNode->msgcb, &rpcMsg);
      if (code != 0) {
M
Minghao Li 已提交
2132
        sError("failed to enqueue heartbeat msg since %s", terrstr());
S
Shengliang Guan 已提交
2133 2134
        rpcFreeCont(rpcMsg.pCont);
        return;
2135
      }
S
Shengliang Guan 已提交
2136 2137 2138 2139

      taosTmrReset(syncNodeEqHeartbeatTimer, pNode->heartbeatTimerMS, pNode, syncEnv()->pTimerManager,
                   &pNode->pHeartbeatTimer);

2140
    } else {
S
Shengliang Guan 已提交
2141 2142
      sTrace("==syncNodeEqHeartbeatTimer== heartbeatTimerLogicClock:%" PRId64 ", heartbeatTimerLogicClockUser:%" PRId64,
             pNode->heartbeatTimerLogicClock, pNode->heartbeatTimerLogicClockUser);
2143
    }
M
Minghao Li 已提交
2144 2145 2146
  }
}

2147
static void syncNodeEqPeerHeartbeatTimer(void* param, void* tmrId) {
2148
  int64_t hbDataRid = (int64_t)param;
2149
  int64_t tsNow = taosGetTimestampMs();
2150

2151 2152
  SSyncHbTimerData* pData = syncHbTimerDataAcquire(hbDataRid);
  if (pData == NULL) {
M
Minghao Li 已提交
2153
    sError("hb timer get pData NULL, %" PRId64, hbDataRid);
2154 2155
    return;
  }
2156

2157
  SSyncNode* pSyncNode = syncNodeAcquire(pData->syncNodeRid);
M
Minghao Li 已提交
2158
  if (pSyncNode == NULL) {
2159
    syncHbTimerDataRelease(pData);
M
Minghao Li 已提交
2160
    sError("hb timer get pSyncNode NULL");
2161 2162 2163 2164 2165 2166 2167 2168
    return;
  }

  SSyncTimer* pSyncTimer = pData->pTimer;

  if (!pSyncNode->isStart) {
    syncNodeRelease(pSyncNode);
    syncHbTimerDataRelease(pData);
M
Minghao Li 已提交
2169
    sError("vgId:%d, hb timer sync node already stop", pSyncNode->vgId);
M
Minghao Li 已提交
2170 2171 2172
    return;
  }

M
Minghao Li 已提交
2173
  if (pSyncNode->state != TAOS_SYNC_STATE_LEADER) {
2174 2175
    syncNodeRelease(pSyncNode);
    syncHbTimerDataRelease(pData);
M
Minghao Li 已提交
2176
    sError("vgId:%d, hb timer sync node not leader", pSyncNode->vgId);
M
Minghao Li 已提交
2177 2178 2179
    return;
  }

M
Minghao Li 已提交
2180
  // sTrace("vgId:%d, eq peer hb timer", pSyncNode->vgId);
2181 2182

  if (pSyncNode->replicaNum > 1) {
M
Minghao Li 已提交
2183 2184 2185
    int64_t timerLogicClock = atomic_load_64(&pSyncTimer->logicClock);
    int64_t msgLogicClock = atomic_load_64(&pData->logicClock);

2186
    if (timerLogicClock == msgLogicClock) {
2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202
      if (tsNow > pData->execTime) {
#if 0        
        sTrace(
            "vgId:%d, hbDataRid:%ld,  EXECUTE this step-------- heartbeat tsNow:%ld, exec:%ld, tsNow-exec:%ld, "
            "---------",
            pSyncNode->vgId, hbDataRid, tsNow, pData->execTime, tsNow - pData->execTime);
#endif

        pData->execTime += pSyncTimer->timerMS;

        SRpcMsg rpcMsg = {0};
        (void)syncBuildHeartbeat(&rpcMsg, pSyncNode->vgId);

        SyncHeartbeat* pSyncMsg = rpcMsg.pCont;
        pSyncMsg->srcId = pSyncNode->myRaftId;
        pSyncMsg->destId = pData->destId;
2203
        pSyncMsg->term = raftStoreGetTerm(pSyncNode);
2204 2205 2206
        pSyncMsg->commitIndex = pSyncNode->commitIndex;
        pSyncMsg->minMatchIndex = syncMinMatchIndex(pSyncNode);
        pSyncMsg->privateTerm = 0;
2207
        pSyncMsg->timeStamp = tsNow;
2208 2209 2210 2211 2212 2213

        // update reset time
        int64_t timerElapsed = tsNow - pSyncTimer->timeStamp;
        pSyncTimer->timeStamp = tsNow;

        // send msg
2214 2215
        syncLogSendHeartbeat(pSyncNode, pSyncMsg, false, timerElapsed, pData->execTime);
        syncNodeSendHeartbeat(pSyncNode, &pSyncMsg->destId, &rpcMsg);
2216 2217 2218 2219 2220 2221 2222 2223
      } else {
#if 0        
        sTrace(
            "vgId:%d, hbDataRid:%ld,  pass this step-------- heartbeat tsNow:%ld, exec:%ld, tsNow-exec:%ld, ---------",
            pSyncNode->vgId, hbDataRid, tsNow, pData->execTime, tsNow - pData->execTime);
#endif
      }

M
Minghao Li 已提交
2224 2225
      if (syncIsInit()) {
        // sTrace("vgId:%d, reset peer hb timer", pSyncNode->vgId);
2226 2227
        taosTmrReset(syncNodeEqPeerHeartbeatTimer, pSyncTimer->timerMS / HEARTBEAT_TICK_NUM, (void*)hbDataRid,
                     syncEnv()->pTimerManager, &pSyncTimer->pTimer);
M
Minghao Li 已提交
2228 2229 2230 2231
      } else {
        sError("sync env is stop, reset peer hb timer error");
      }

2232
    } else {
M
Minghao Li 已提交
2233 2234
      sTrace("vgId:%d, do not send hb, timerLogicClock:%" PRId64 ", msgLogicClock:%" PRId64 "", pSyncNode->vgId,
             timerLogicClock, msgLogicClock);
2235 2236
    }
  }
2237 2238 2239

  syncHbTimerDataRelease(pData);
  syncNodeRelease(pSyncNode);
2240 2241
}

2242 2243
static void deleteCacheEntry(const void* key, size_t keyLen, void* value) { taosMemoryFree(value); }

2244 2245 2246 2247
int32_t syncCacheEntry(SSyncLogStore* pLogStore, SSyncRaftEntry* pEntry, LRUHandle** h) {
  SSyncLogStoreData* pData = pLogStore->data;
  sNTrace(pData->pSyncNode, "in cache index:%" PRId64 ", bytes:%u, %p", pEntry->index, pEntry->bytes, pEntry);

S
Shengliang Guan 已提交
2248 2249
  int32_t   code = 0;
  int32_t   entryLen = sizeof(*pEntry) + pEntry->dataLen;
2250 2251 2252 2253 2254 2255 2256 2257 2258
  LRUStatus status = taosLRUCacheInsert(pLogStore->pCache, &pEntry->index, sizeof(pEntry->index), pEntry, entryLen,
                                        deleteCacheEntry, h, TAOS_LRU_PRIORITY_LOW);
  if (status != TAOS_LRU_STATUS_OK) {
    code = -1;
  }

  return code;
}

B
Benguang Zhao 已提交
2259
int32_t syncNodeAppend(SSyncNode* ths, SSyncRaftEntry* pEntry) {
2260 2261 2262 2263 2264 2265 2266
  if (pEntry->dataLen < sizeof(SMsgHead)) {
    sError("vgId:%d, cannot append an invalid client request with no msg head. type:%s, dataLen:%d", ths->vgId,
           TMSG_INFO(pEntry->originalRpcType), pEntry->dataLen);
    syncEntryDestroy(pEntry);
    return -1;
  }

B
Benguang Zhao 已提交
2267 2268
  // append to log buffer
  if (syncLogBufferAppend(ths->pLogBuf, ths, pEntry) < 0) {
2269 2270
    sError("vgId:%d, failed to enqueue sync log buffer, index:%" PRId64, ths->vgId, pEntry->index);
    terrno = TSDB_CODE_SYN_BUFFER_FULL;
2271
    (void)syncLogFsmExecute(ths, ths->pFsm, ths->state, raftStoreGetTerm(ths), pEntry, TSDB_CODE_SYN_BUFFER_FULL);
2272
    syncEntryDestroy(pEntry);
B
Benguang Zhao 已提交
2273 2274 2275 2276
    return -1;
  }

  // proceed match index, with replicating on needed
2277
  SyncIndex matchIndex = syncLogBufferProceed(ths->pLogBuf, ths, NULL);
B
Benguang Zhao 已提交
2278

S
Shengliang Guan 已提交
2279
  sTrace("vgId:%d, append raft entry. index:%" PRId64 ", term:%" PRId64 " pBuf: [%" PRId64 " %" PRId64 " %" PRId64
2280 2281 2282
         ", %" PRId64 ")",
         ths->vgId, pEntry->index, pEntry->term, ths->pLogBuf->startIndex, ths->pLogBuf->commitIndex,
         ths->pLogBuf->matchIndex, ths->pLogBuf->endIndex);
B
Benguang Zhao 已提交
2283

B
Benguang Zhao 已提交
2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299
  // multi replica
  if (ths->replicaNum > 1) {
    return 0;
  }

  // single replica
  (void)syncNodeUpdateCommitIndex(ths, matchIndex);

  if (syncLogBufferCommit(ths->pLogBuf, ths, ths->commitIndex) < 0) {
    sError("vgId:%d, failed to commit until commitIndex:%" PRId64 "", ths->vgId, ths->commitIndex);
    return -1;
  }

  return 0;
}

2300
bool syncNodeHeartbeatReplyTimeout(SSyncNode* pSyncNode) {
2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312
  if (pSyncNode->replicaNum == 1) {
    return false;
  }

  int32_t toCount = 0;
  int64_t tsNow = taosGetTimestampMs();
  for (int32_t i = 0; i < pSyncNode->peersNum; ++i) {
    int64_t recvTime = syncIndexMgrGetRecvTime(pSyncNode->pMatchIndex, &(pSyncNode->peersId[i]));
    if (recvTime == 0 || recvTime == -1) {
      continue;
    }

2313
    if (tsNow - recvTime > tsHeartbeatTimeout) {
2314 2315 2316 2317 2318 2319 2320 2321 2322
      toCount++;
    }
  }

  bool b = (toCount >= pSyncNode->quorum ? true : false);

  return b;
}

2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341
bool syncNodeSnapshotSending(SSyncNode* pSyncNode) {
  if (pSyncNode == NULL) return false;
  bool b = false;
  for (int32_t i = 0; i < pSyncNode->replicaNum; ++i) {
    if (pSyncNode->senders[i] != NULL && pSyncNode->senders[i]->start) {
      b = true;
      break;
    }
  }
  return b;
}

bool syncNodeSnapshotRecving(SSyncNode* pSyncNode) {
  if (pSyncNode == NULL) return false;
  if (pSyncNode->pNewNodeReceiver == NULL) return false;
  if (pSyncNode->pNewNodeReceiver->start) return true;
  return false;
}

M
Minghao Li 已提交
2342
static int32_t syncNodeAppendNoop(SSyncNode* ths) {
B
Benguang Zhao 已提交
2343
  SyncIndex index = syncLogBufferGetEndIndex(ths->pLogBuf);
2344
  SyncTerm  term = raftStoreGetTerm(ths);
B
Benguang Zhao 已提交
2345 2346 2347 2348 2349 2350 2351

  SSyncRaftEntry* pEntry = syncEntryBuildNoop(term, index, ths->vgId);
  if (pEntry == NULL) {
    terrno = TSDB_CODE_OUT_OF_MEMORY;
    return -1;
  }

B
Benguang Zhao 已提交
2352 2353
  int32_t ret = syncNodeAppend(ths, pEntry);
  return 0;
B
Benguang Zhao 已提交
2354 2355 2356
}

static int32_t syncNodeAppendNoopOld(SSyncNode* ths) {
M
Minghao Li 已提交
2357 2358
  int32_t ret = 0;

2359
  SyncIndex       index = ths->pLogStore->syncLogWriteIndex(ths->pLogStore);
2360
  SyncTerm        term = raftStoreGetTerm(ths);
M
Minghao Li 已提交
2361
  SSyncRaftEntry* pEntry = syncEntryBuildNoop(term, index, ths->vgId);
2362
  ASSERT(pEntry != NULL);
M
Minghao Li 已提交
2363

2364 2365
  LRUHandle* h = NULL;

M
Minghao Li 已提交
2366
  if (ths->state == TAOS_SYNC_STATE_LEADER) {
2367
    int32_t code = ths->pLogStore->syncLogAppendEntry(ths->pLogStore, pEntry, false);
2368
    if (code != 0) {
M
Minghao Li 已提交
2369
      sError("append noop error");
2370 2371
      return -1;
    }
2372 2373

    syncCacheEntry(ths->pLogStore, pEntry, &h);
M
Minghao Li 已提交
2374 2375
  }

2376 2377 2378
  if (h) {
    taosLRUCacheRelease(ths->pLogStore->pCache, h, false);
  } else {
B
Benguang Zhao 已提交
2379
    syncEntryDestroy(pEntry);
2380 2381
  }

M
Minghao Li 已提交
2382 2383 2384
  return ret;
}

S
Shengliang Guan 已提交
2385 2386
int32_t syncNodeOnHeartbeat(SSyncNode* ths, const SRpcMsg* pRpcMsg) {
  SyncHeartbeat* pMsg = pRpcMsg->pCont;
2387

M
Minghao Li 已提交
2388 2389 2390 2391
  const STraceId* trace = &pRpcMsg->info.traceId;
  char            tbuf[40] = {0};
  TRACE_TO_STR(trace, tbuf);

2392
  int64_t tsMs = taosGetTimestampMs();
M
Minghao Li 已提交
2393
  int64_t timeDiff = tsMs - pMsg->timeStamp;
M
Minghao Li 已提交
2394
  syncLogRecvHeartbeat(ths, pMsg, timeDiff, tbuf);
2395

2396 2397
  SRpcMsg rpcMsg = {0};
  (void)syncBuildHeartbeatReply(&rpcMsg, ths->vgId);
2398
  SyncTerm currentTerm = raftStoreGetTerm(ths);
2399 2400

  SyncHeartbeatReply* pMsgReply = rpcMsg.pCont;
2401 2402
  pMsgReply->destId = pMsg->srcId;
  pMsgReply->srcId = ths->myRaftId;
2403
  pMsgReply->term = currentTerm;
2404
  pMsgReply->privateTerm = 8864;  // magic number
2405
  pMsgReply->startTime = ths->startTime;
2406
  pMsgReply->timeStamp = tsMs;
2407

2408
  if (pMsg->term == currentTerm && ths->state != TAOS_SYNC_STATE_LEADER) {
2409 2410
    syncIndexMgrSetRecvTime(ths->pNextIndex, &(pMsg->srcId), tsMs);

2411
    syncNodeResetElectTimer(ths);
M
Minghao Li 已提交
2412
    ths->minMatchIndex = pMsg->minMatchIndex;
2413 2414

    if (ths->state == TAOS_SYNC_STATE_FOLLOWER) {
2415
      // syncNodeFollowerCommit(ths, pMsg->commitIndex);
S
Shengliang Guan 已提交
2416 2417 2418 2419
      SRpcMsg rpcMsgLocalCmd = {0};
      (void)syncBuildLocalCmd(&rpcMsgLocalCmd, ths->vgId);

      SyncLocalCmd* pSyncMsg = rpcMsgLocalCmd.pCont;
2420
      pSyncMsg->cmd = SYNC_LOCAL_CMD_FOLLOWER_CMT;
2421 2422 2423
      pSyncMsg->commitIndex = pMsg->commitIndex;
      pSyncMsg->currentTerm = pMsg->term;
      SyncIndex fcIndex = pSyncMsg->commitIndex;
2424 2425 2426 2427 2428 2429 2430

      if (ths->syncEqMsg != NULL && ths->msgcb != NULL) {
        int32_t code = ths->syncEqMsg(ths->msgcb, &rpcMsgLocalCmd);
        if (code != 0) {
          sError("vgId:%d, sync enqueue fc-commit msg error, code:%d", ths->vgId, code);
          rpcFreeCont(rpcMsgLocalCmd.pCont);
        } else {
2431
          sTrace("vgId:%d, sync enqueue fc-commit msg, fc-index:%" PRId64, ths->vgId, fcIndex);
2432 2433
        }
      }
2434 2435 2436
    }
  }

2437
  if (pMsg->term >= currentTerm && ths->state != TAOS_SYNC_STATE_FOLLOWER) {
2438
    // syncNodeStepDown(ths, pMsg->term);
S
Shengliang Guan 已提交
2439 2440 2441 2442
    SRpcMsg rpcMsgLocalCmd = {0};
    (void)syncBuildLocalCmd(&rpcMsgLocalCmd, ths->vgId);

    SyncLocalCmd* pSyncMsg = rpcMsgLocalCmd.pCont;
2443
    pSyncMsg->cmd = SYNC_LOCAL_CMD_STEP_DOWN;
2444 2445
    pSyncMsg->currentTerm = pMsg->term;
    pSyncMsg->commitIndex = pMsg->commitIndex;
2446

S
Shengliang Guan 已提交
2447 2448
    if (ths->syncEqMsg != NULL && ths->msgcb != NULL) {
      int32_t code = ths->syncEqMsg(ths->msgcb, &rpcMsgLocalCmd);
2449 2450 2451 2452
      if (code != 0) {
        sError("vgId:%d, sync enqueue step-down msg error, code:%d", ths->vgId, code);
        rpcFreeCont(rpcMsgLocalCmd.pCont);
      } else {
S
Shengliang Guan 已提交
2453
        sTrace("vgId:%d, sync enqueue step-down msg, new-term:%" PRId64, ths->vgId, pSyncMsg->currentTerm);
2454
      }
2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469
    }
  }

  /*
    // htonl
    SMsgHead* pHead = rpcMsg.pCont;
    pHead->contLen = htonl(pHead->contLen);
    pHead->vgId = htonl(pHead->vgId);
  */

  // reply
  syncNodeSendMsgById(&pMsgReply->destId, ths, &rpcMsg);
  return 0;
}

2470
int32_t syncNodeOnHeartbeatReply(SSyncNode* ths, const SRpcMsg* pRpcMsg) {
S
Shengliang Guan 已提交
2471 2472 2473 2474
  const STraceId* trace = &pRpcMsg->info.traceId;
  char            tbuf[40] = {0};
  TRACE_TO_STR(trace, tbuf);

2475
  SyncHeartbeatReply* pMsg = pRpcMsg->pCont;
B
Benguang Zhao 已提交
2476
  SSyncLogReplMgr*    pMgr = syncNodeGetLogReplMgr(ths, &pMsg->srcId);
2477 2478 2479 2480
  if (pMgr == NULL) {
    sError("vgId:%d, failed to get log repl mgr for the peer at addr 0x016%" PRIx64 "", ths->vgId, pMsg->srcId.addr);
    return -1;
  }
2481 2482

  int64_t tsMs = taosGetTimestampMs();
S
Shengliang Guan 已提交
2483
  syncLogRecvHeartbeatReply(ths, pMsg, tsMs - pMsg->timeStamp, tbuf);
2484

2485 2486
  syncIndexMgrSetRecvTime(ths->pMatchIndex, &pMsg->srcId, tsMs);

2487 2488 2489
  return syncLogReplMgrProcessHeartbeatReply(pMgr, ths, pMsg);
}

2490
int32_t syncNodeOnHeartbeatReplyOld(SSyncNode* ths, const SRpcMsg* pRpcMsg) {
2491
  SyncHeartbeatReply* pMsg = pRpcMsg->pCont;
2492

M
Minghao Li 已提交
2493 2494 2495 2496
  const STraceId* trace = &pRpcMsg->info.traceId;
  char            tbuf[40] = {0};
  TRACE_TO_STR(trace, tbuf);

M
Minghao Li 已提交
2497
  int64_t tsMs = taosGetTimestampMs();
M
Minghao Li 已提交
2498
  int64_t timeDiff = tsMs - pMsg->timeStamp;
M
Minghao Li 已提交
2499
  syncLogRecvHeartbeatReply(ths, pMsg, timeDiff, tbuf);
M
Minghao Li 已提交
2500

2501
  // update last reply time, make decision whether the other node is alive or not
M
Minghao Li 已提交
2502
  syncIndexMgrSetRecvTime(ths->pMatchIndex, &pMsg->srcId, tsMs);
2503 2504 2505
  return 0;
}

S
Shengliang Guan 已提交
2506 2507
int32_t syncNodeOnLocalCmd(SSyncNode* ths, const SRpcMsg* pRpcMsg) {
  SyncLocalCmd* pMsg = pRpcMsg->pCont;
2508 2509
  syncLogRecvLocalCmd(ths, pMsg, "");

2510
  if (pMsg->cmd == SYNC_LOCAL_CMD_STEP_DOWN) {
2511
    syncNodeStepDown(ths, pMsg->currentTerm);
2512 2513

  } else if (pMsg->cmd == SYNC_LOCAL_CMD_FOLLOWER_CMT) {
2514 2515 2516 2517
    SyncTerm matchTerm = syncLogBufferGetLastMatchTerm(ths->pLogBuf);
    if (pMsg->currentTerm == matchTerm) {
      (void)syncNodeUpdateCommitIndex(ths, pMsg->commitIndex);
    }
2518
    if (syncLogBufferCommit(ths->pLogBuf, ths, ths->commitIndex) < 0) {
S
Shengliang Guan 已提交
2519
      sError("vgId:%d, failed to commit raft log since %s. commit index:%" PRId64 "", ths->vgId, terrstr(),
2520 2521 2522 2523 2524 2525 2526 2527 2528
             ths->commitIndex);
    }
  } else {
    sError("error local cmd");
  }

  return 0;
}

M
Minghao Li 已提交
2529 2530 2531 2532 2533 2534 2535 2536 2537 2538
// TLA+ Spec
// ClientRequest(i, v) ==
//     /\ state[i] = Leader
//     /\ LET entry == [term  |-> currentTerm[i],
//                      value |-> v]
//            newLog == Append(log[i], entry)
//        IN  log' = [log EXCEPT ![i] = newLog]
//     /\ UNCHANGED <<messages, serverVars, candidateVars,
//                    leaderVars, commitIndex>>
//
M
Minghao Li 已提交
2539

2540
int32_t syncNodeOnClientRequest(SSyncNode* ths, SRpcMsg* pMsg, SyncIndex* pRetIndex) {
S
Shengliang Guan 已提交
2541
  sNTrace(ths, "on client request");
2542

B
Benguang Zhao 已提交
2543 2544
  int32_t code = 0;

B
Benguang Zhao 已提交
2545
  SyncIndex       index = syncLogBufferGetEndIndex(ths->pLogBuf);
2546
  SyncTerm        term = raftStoreGetTerm(ths);
B
Benguang Zhao 已提交
2547
  SSyncRaftEntry* pEntry = NULL;
2548 2549 2550 2551
  if (pMsg->msgType == TDMT_SYNC_CLIENT_REQUEST) {
    pEntry = syncEntryBuildFromClientRequest(pMsg->pCont, term, index);
  } else {
    pEntry = syncEntryBuildFromRpcMsg(pMsg, term, index);
B
Benguang Zhao 已提交
2552 2553
  }

2554 2555 2556 2557 2558
  if (pEntry == NULL) {
    sError("vgId:%d, failed to process client request since %s.", ths->vgId, terrstr());
    return -1;
  }

B
Benguang Zhao 已提交
2559 2560 2561 2562 2563
  if (ths->state == TAOS_SYNC_STATE_LEADER) {
    if (pRetIndex) {
      (*pRetIndex) = index;
    }

2564 2565
    int32_t code = syncNodeAppend(ths, pEntry);
    return code;
2566 2567 2568
  } else {
    syncEntryDestroy(pEntry);
    pEntry = NULL;
2569
    return -1;
B
Benguang Zhao 已提交
2570 2571 2572
  }
}

S
Shengliang Guan 已提交
2573 2574 2575
const char* syncStr(ESyncState state) {
  switch (state) {
    case TAOS_SYNC_STATE_FOLLOWER:
2576
      return "follower";
S
Shengliang Guan 已提交
2577
    case TAOS_SYNC_STATE_CANDIDATE:
2578
      return "candidate";
S
Shengliang Guan 已提交
2579
    case TAOS_SYNC_STATE_LEADER:
2580
      return "leader";
S
Shengliang Guan 已提交
2581
    case TAOS_SYNC_STATE_ERROR:
2582
      return "error";
S
Shengliang Guan 已提交
2583 2584 2585 2586
    case TAOS_SYNC_STATE_OFFLINE:
      return "offline";
    default:
      return "unknown";
S
Shengliang Guan 已提交
2587
  }
M
Minghao Li 已提交
2588
}
2589

2590
int32_t syncNodeUpdateNewConfigIndex(SSyncNode* ths, SSyncCfg* pNewCfg) {
S
Shengliang Guan 已提交
2591
  for (int32_t i = 0; i < pNewCfg->replicaNum; ++i) {
2592 2593 2594 2595
    SRaftId raftId = {
        .addr = SYNC_ADDR(&pNewCfg->nodeInfo[i]),
        .vgId = ths->vgId,
    };
2596 2597 2598 2599 2600 2601 2602 2603 2604 2605

    if (syncUtilSameId(&(ths->myRaftId), &raftId)) {
      pNewCfg->myIndex = i;
      return 0;
    }
  }

  return -1;
}

2606 2607 2608 2609
bool syncNodeIsOptimizedOneReplica(SSyncNode* ths, SRpcMsg* pMsg) {
  return (ths->replicaNum == 1 && syncUtilUserCommit(pMsg->msgType) && ths->vgId != 1);
}

2610
bool syncNodeInRaftGroup(SSyncNode* ths, SRaftId* pRaftId) {
S
Shengliang Guan 已提交
2611
  for (int32_t i = 0; i < ths->replicaNum; ++i) {
2612 2613 2614 2615 2616
    if (syncUtilSameId(&((ths->replicasId)[i]), pRaftId)) {
      return true;
    }
  }
  return false;
M
Minghao Li 已提交
2617 2618 2619 2620
}

SSyncSnapshotSender* syncNodeGetSnapshotSender(SSyncNode* ths, SRaftId* pDestId) {
  SSyncSnapshotSender* pSender = NULL;
S
Shengliang Guan 已提交
2621
  for (int32_t i = 0; i < ths->replicaNum; ++i) {
M
Minghao Li 已提交
2622 2623 2624 2625 2626
    if (syncUtilSameId(pDestId, &((ths->replicasId)[i]))) {
      pSender = (ths->senders)[i];
    }
  }
  return pSender;
M
Minghao Li 已提交
2627
}
M
Minghao Li 已提交
2628

2629 2630
SSyncTimer* syncNodeGetHbTimer(SSyncNode* ths, SRaftId* pDestId) {
  SSyncTimer* pTimer = NULL;
S
Shengliang Guan 已提交
2631
  for (int32_t i = 0; i < ths->replicaNum; ++i) {
2632 2633 2634 2635 2636 2637 2638
    if (syncUtilSameId(pDestId, &((ths->replicasId)[i]))) {
      pTimer = &((ths->peerHeartbeatTimerArr)[i]);
    }
  }
  return pTimer;
}

M
Minghao Li 已提交
2639 2640
SPeerState* syncNodeGetPeerState(SSyncNode* ths, const SRaftId* pDestId) {
  SPeerState* pState = NULL;
S
Shengliang Guan 已提交
2641
  for (int32_t i = 0; i < ths->replicaNum; ++i) {
M
Minghao Li 已提交
2642 2643 2644 2645 2646 2647 2648 2649 2650
    if (syncUtilSameId(pDestId, &((ths->replicasId)[i]))) {
      pState = &((ths->peerStates)[i]);
    }
  }
  return pState;
}

bool syncNodeNeedSendAppendEntries(SSyncNode* ths, const SRaftId* pDestId, const SyncAppendEntries* pMsg) {
  SPeerState* pState = syncNodeGetPeerState(ths, pDestId);
M
Minghao Li 已提交
2651
  if (pState == NULL) {
2652
    sError("vgId:%d, replica maybe dropped", ths->vgId);
M
Minghao Li 已提交
2653 2654
    return false;
  }
M
Minghao Li 已提交
2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665

  SyncIndex sendIndex = pMsg->prevLogIndex + 1;
  int64_t   tsNow = taosGetTimestampMs();

  if (pState->lastSendIndex == sendIndex && tsNow - pState->lastSendTime < SYNC_APPEND_ENTRIES_TIMEOUT_MS) {
    return false;
  }

  return true;
}

M
Minghao Li 已提交
2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679
bool syncNodeCanChange(SSyncNode* pSyncNode) {
  if (pSyncNode->changing) {
    sError("sync cannot change");
    return false;
  }

  if ((pSyncNode->commitIndex >= SYNC_INDEX_BEGIN)) {
    SyncIndex lastIndex = syncNodeGetLastIndex(pSyncNode);
    if (pSyncNode->commitIndex != lastIndex) {
      sError("sync cannot change2");
      return false;
    }
  }

S
Shengliang Guan 已提交
2680
  for (int32_t i = 0; i < pSyncNode->peersNum; ++i) {
M
Minghao Li 已提交
2681
    SSyncSnapshotSender* pSender = syncNodeGetSnapshotSender(pSyncNode, &(pSyncNode->peersId)[i]);
M
Minghao Li 已提交
2682
    if (pSender != NULL && pSender->start) {
M
Minghao Li 已提交
2683 2684 2685 2686 2687 2688
      sError("sync cannot change3");
      return false;
    }
  }

  return true;
M
Minghao Li 已提交
2689
}