syncMain.c 98.1 KB
Newer Older
M
Minghao Li 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/*
 * Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
 *
 * This program is free software: you can use, redistribute, and/or modify
 * it under the terms of the GNU Affero General Public License, version 3
 * or later ("AGPL"), as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

S
Shengliang Guan 已提交
16
#define _DEFAULT_SOURCE
M
Minghao Li 已提交
17
#include "sync.h"
M
Minghao Li 已提交
18 19
#include "syncAppendEntries.h"
#include "syncAppendEntriesReply.h"
M
Minghao Li 已提交
20
#include "syncCommit.h"
M
Minghao Li 已提交
21
#include "syncElection.h"
M
Minghao Li 已提交
22
#include "syncEnv.h"
M
Minghao Li 已提交
23
#include "syncIndexMgr.h"
M
Minghao Li 已提交
24
#include "syncInt.h"
M
Minghao Li 已提交
25
#include "syncMessage.h"
26
#include "syncPipeline.h"
M
Minghao Li 已提交
27
#include "syncRaftCfg.h"
M
Minghao Li 已提交
28
#include "syncRaftLog.h"
M
Minghao Li 已提交
29
#include "syncRaftStore.h"
M
Minghao Li 已提交
30
#include "syncReplication.h"
M
Minghao Li 已提交
31 32
#include "syncRequestVote.h"
#include "syncRequestVoteReply.h"
M
Minghao Li 已提交
33
#include "syncRespMgr.h"
M
Minghao Li 已提交
34
#include "syncSnapshot.h"
M
Minghao Li 已提交
35
#include "syncTimeout.h"
M
Minghao Li 已提交
36
#include "syncUtil.h"
M
Minghao Li 已提交
37
#include "syncVoteMgr.h"
38
#include "tglobal.h"
M
Minghao Li 已提交
39
#include "tref.h"
M
Minghao Li 已提交
40

M
Minghao Li 已提交
41 42 43 44 45
static void    syncNodeEqPingTimer(void* param, void* tmrId);
static void    syncNodeEqElectTimer(void* param, void* tmrId);
static void    syncNodeEqHeartbeatTimer(void* param, void* tmrId);
static int32_t syncNodeEqNoop(SSyncNode* ths);
static int32_t syncNodeAppendNoop(SSyncNode* ths);
46
static void    syncNodeEqPeerHeartbeatTimer(void* param, void* tmrId);
S
Shengliang Guan 已提交
47
static bool    syncIsConfigChanged(const SSyncCfg* pOldCfg, const SSyncCfg* pNewCfg);
S
Shengliang Guan 已提交
48 49 50
static int32_t syncHbTimerInit(SSyncNode* pSyncNode, SSyncTimer* pSyncTimer, SRaftId destId);
static int32_t syncHbTimerStart(SSyncNode* pSyncNode, SSyncTimer* pSyncTimer);
static int32_t syncHbTimerStop(SSyncNode* pSyncNode, SSyncTimer* pSyncTimer);
S
Shengliang Guan 已提交
51 52 53 54 55 56 57 58 59 60 61
static int32_t syncNodeUpdateNewConfigIndex(SSyncNode* ths, SSyncCfg* pNewCfg);
static bool    syncNodeInConfig(SSyncNode* pSyncNode, const SSyncCfg* config);
static void    syncNodeDoConfigChange(SSyncNode* pSyncNode, SSyncCfg* newConfig, SyncIndex lastConfigChangeIndex);
static bool    syncNodeIsOptimizedOneReplica(SSyncNode* ths, SRpcMsg* pMsg);

static bool    syncNodeCanChange(SSyncNode* pSyncNode);
static int32_t syncNodeLeaderTransfer(SSyncNode* pSyncNode);
static int32_t syncNodeLeaderTransferTo(SSyncNode* pSyncNode, SNodeInfo newLeader);
static int32_t syncDoLeaderTransfer(SSyncNode* ths, SRpcMsg* pRpcMsg, SSyncRaftEntry* pEntry);

static ESyncStrategy syncNodeStrategy(SSyncNode* pSyncNode);
M
Minghao Li 已提交
62

63
/*
 * Open a sync node described by pSyncInfo and register it.
 *
 * After registration the ping / elect / heartbeat baselines and the message
 * callbacks are copied from pSyncInfo onto the node.
 *
 * Returns the rid assigned by syncNodeAdd(), or -1 when the node cannot be
 * opened or registered (in the latter case the node is closed again).
 */
int64_t syncOpen(SSyncInfo* pSyncInfo) {
  SSyncNode* pNode = syncNodeOpen(pSyncInfo);
  if (pNode == NULL) {
    sError("vgId:%d, failed to open sync node", pSyncInfo->vgId);
    return -1;
  }

  pNode->rid = syncNodeAdd(pNode);
  if (pNode->rid < 0) {
    syncNodeClose(pNode);
    return -1;
  }

  // ping
  pNode->pingBaseLine = pSyncInfo->pingMs;
  pNode->pingTimerMS = pSyncInfo->pingMs;

  // election
  pNode->electBaseLine = pSyncInfo->electMs;

  // heartbeat
  pNode->hbBaseLine = pSyncInfo->heartbeatMs;
  pNode->heartbeatTimerMS = pSyncInfo->heartbeatMs;

  pNode->msgcb = pSyncInfo->msgcb;

  return pNode->rid;
}
M
Minghao Li 已提交
84

B
Benguang Zhao 已提交
85
int32_t syncStart(int64_t rid) {
S
Shengliang Guan 已提交
86
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
M
Minghao Li 已提交
87
  if (pSyncNode == NULL) {
B
Benguang Zhao 已提交
88 89 90 91 92
    sError("failed to acquire rid: %" PRId64 " of tsNodeReftId for pSyncNode", rid);
    return -1;
  }

  if (syncNodeRestore(pSyncNode) < 0) {
93
    sError("vgId:%d, failed to restore sync log buffer since %s", pSyncNode->vgId, terrstr());
94
    goto _err;
M
Minghao Li 已提交
95
  }
M
Minghao Li 已提交
96

B
Benguang Zhao 已提交
97 98 99 100
  if (syncNodeStart(pSyncNode) < 0) {
    sError("vgId:%d, failed to start sync node since %s", pSyncNode->vgId, terrstr());
    goto _err;
  }
M
Minghao Li 已提交
101

B
Benguang Zhao 已提交
102 103
  syncNodeRelease(pSyncNode);
  return 0;
M
Minghao Li 已提交
104

105 106 107
_err:
  syncNodeRelease(pSyncNode);
  return -1;
M
Minghao Li 已提交
108 109
}

M
Minghao Li 已提交
110
void syncStop(int64_t rid) {
S
Shengliang Guan 已提交
111
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
S
Shengliang Guan 已提交
112
  if (pSyncNode != NULL) {
113
    pSyncNode->isStart = false;
S
Shengliang Guan 已提交
114
    syncNodeRelease(pSyncNode);
S
Shengliang Guan 已提交
115
    syncNodeRemove(rid);
M
Minghao Li 已提交
116 117 118
  }
}

M
Minghao Li 已提交
119 120
void syncPreStop(int64_t rid) {
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
121 122 123
  if (pSyncNode != NULL) {
    syncNodePreClose(pSyncNode);
    syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
124 125 126
  }
}

S
Shengliang Guan 已提交
127 128 129
/*
 * A new config is accepted only when syncNodeInConfig() holds for it and its
 * replica count differs from the node's current replica count by at most one.
 */
static bool syncNodeCheckNewConfig(SSyncNode* pSyncNode, const SSyncCfg* pCfg) {
  if (syncNodeInConfig(pSyncNode, pCfg)) {
    int replicaDelta = abs(pCfg->replicaNum - pSyncNode->replicaNum);
    return replicaDelta <= 1;
  }
  return false;
}

S
Shengliang Guan 已提交
132
/*
 * Apply a new replica configuration to the sync node identified by rid.
 *
 * The config is rejected when syncNodeCheckNewConfig() fails; terrno is then
 * set to TSDB_CODE_SYN_NEW_CONFIG_ERROR. Otherwise the new config index is
 * recorded and the change is applied. If this node is currently the leader,
 * its heartbeat timer is stopped, every per-peer heartbeat timer slot is
 * re-initialised, the heartbeat timer is restarted and replication is
 * triggered.
 *
 * Returns 0 on success, -1 when the rid cannot be acquired or the config is
 * invalid.
 */
int32_t syncReconfig(int64_t rid, SSyncCfg* pNewCfg) {
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
  if (pSyncNode == NULL) return -1;

  if (!syncNodeCheckNewConfig(pSyncNode, pNewCfg)) {
    // Log BEFORE dropping the reference: once syncNodeRelease() returns the
    // node must no longer be dereferenced.
    sError("vgId:%d, failed to reconfig since invalid new config", pSyncNode->vgId);
    syncNodeRelease(pSyncNode);
    terrno = TSDB_CODE_SYN_NEW_CONFIG_ERROR;
    return -1;
  }

  syncNodeUpdateNewConfigIndex(pSyncNode, pNewCfg);
  syncNodeDoConfigChange(pSyncNode, pNewCfg, SYNC_INDEX_INVALID);

  if (pSyncNode->state == TAOS_SYNC_STATE_LEADER) {
    syncNodeStopHeartbeatTimer(pSyncNode);

    for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
      syncHbTimerInit(pSyncNode, &pSyncNode->peerHeartbeatTimerArr[i], pSyncNode->replicasId[i]);
    }

    syncNodeStartHeartbeatTimer(pSyncNode);
    syncNodeReplicate(pSyncNode);
  }

  syncNodeRelease(pSyncNode);
  return 0;
}
M
Minghao Li 已提交
160

S
Shengliang Guan 已提交
161 162 163 164
/*
 * Dispatch pMsg to the handler matching pMsg->msgType on the sync node
 * identified by rid.
 *
 * Returns the handler's result, or -1 when the sync module is not initialised,
 * the rid cannot be acquired, or the message type is not handled here.
 */
int32_t syncProcessMsg(int64_t rid, SRpcMsg* pMsg) {
  if (!syncIsInit()) {
    return -1;
  }

  SSyncNode* pNode = syncNodeAcquire(rid);
  if (pNode == NULL) {
    return -1;
  }

  int32_t result = -1;
  switch (pMsg->msgType) {
    case TDMT_SYNC_HEARTBEAT:
      result = syncNodeOnHeartbeat(pNode, pMsg);
      break;
    case TDMT_SYNC_HEARTBEAT_REPLY:
      result = syncNodeOnHeartbeatReply(pNode, pMsg);
      break;
    case TDMT_SYNC_TIMEOUT:
      result = syncNodeOnTimeout(pNode, pMsg);
      break;
    case TDMT_SYNC_CLIENT_REQUEST:
      result = syncNodeOnClientRequest(pNode, pMsg, NULL);
      break;
    case TDMT_SYNC_REQUEST_VOTE:
      result = syncNodeOnRequestVote(pNode, pMsg);
      break;
    case TDMT_SYNC_REQUEST_VOTE_REPLY:
      result = syncNodeOnRequestVoteReply(pNode, pMsg);
      break;
    case TDMT_SYNC_APPEND_ENTRIES:
      result = syncNodeOnAppendEntries(pNode, pMsg);
      break;
    case TDMT_SYNC_APPEND_ENTRIES_REPLY:
      result = syncNodeOnAppendEntriesReply(pNode, pMsg);
      break;
    case TDMT_SYNC_SNAPSHOT_SEND:
      result = syncNodeOnSnapshot(pNode, pMsg);
      break;
    case TDMT_SYNC_SNAPSHOT_RSP:
      result = syncNodeOnSnapshotReply(pNode, pMsg);
      break;
    case TDMT_SYNC_LOCAL_CMD:
      result = syncNodeOnLocalCmd(pNode, pMsg);
      break;
    default:
      sError("vgId:%d, failed to process msg:%p since invalid type:%s", pNode->vgId, pMsg,
             TMSG_INFO(pMsg->msgType));
      result = -1;
      break;
  }

  syncNodeRelease(pNode);
  return result;
}

S
Shengliang Guan 已提交
212
int32_t syncLeaderTransfer(int64_t rid) {
S
Shengliang Guan 已提交
213
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
S
Shengliang Guan 已提交
214
  if (pSyncNode == NULL) return -1;
215

S
Shengliang Guan 已提交
216
  int32_t ret = syncNodeLeaderTransfer(pSyncNode);
S
Shengliang Guan 已提交
217
  syncNodeRelease(pSyncNode);
218 219 220
  return ret;
}

M
Minghao Li 已提交
221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236
/*
 * Return the smallest match index recorded for any peer of pSyncNode,
 * or SYNC_INDEX_INVALID when the node has no peers.
 */
SyncIndex syncMinMatchIndex(SSyncNode* pSyncNode) {
  if (pSyncNode->peersNum <= 0) {
    return SYNC_INDEX_INVALID;
  }

  SyncIndex lowest = syncIndexMgrGetIndex(pSyncNode->pMatchIndex, &(pSyncNode->peersId[0]));
  for (int32_t peer = 1; peer < pSyncNode->peersNum; ++peer) {
    SyncIndex current = syncIndexMgrGetIndex(pSyncNode->pMatchIndex, &(pSyncNode->peersId[peer]));
    if (current < lowest) {
      lowest = current;
    }
  }
  return lowest;
}

237
int32_t syncBeginSnapshot(int64_t rid, int64_t lastApplyIndex) {
S
Shengliang Guan 已提交
238
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
239
  if (pSyncNode == NULL) {
240
    sError("sync begin snapshot error");
241 242
    return -1;
  }
243

244 245
  int32_t code = 0;

M
Minghao Li 已提交
246
  if (syncNodeIsMnode(pSyncNode)) {
M
Minghao Li 已提交
247 248 249
    // mnode
    int64_t logRetention = SYNC_MNODE_LOG_RETENTION;

M
Minghao Li 已提交
250 251 252
    SyncIndex beginIndex = pSyncNode->pLogStore->syncLogBeginIndex(pSyncNode->pLogStore);
    SyncIndex endIndex = pSyncNode->pLogStore->syncLogEndIndex(pSyncNode->pLogStore);
    int64_t   logNum = endIndex - beginIndex;
M
Minghao Li 已提交
253 254 255
    bool      isEmpty = pSyncNode->pLogStore->syncLogIsEmpty(pSyncNode->pLogStore);

    if (isEmpty || (!isEmpty && logNum < logRetention)) {
S
Shengliang Guan 已提交
256 257
      sNTrace(pSyncNode, "new-snapshot-index:%" PRId64 ", log-num:%" PRId64 ", empty:%d, do not delete wal",
              lastApplyIndex, logNum, isEmpty);
S
Shengliang Guan 已提交
258
      syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
259 260 261
      return 0;
    }

M
Minghao Li 已提交
262 263 264
    goto _DEL_WAL;

  } else {
265 266 267 268 269 270 271 272 273 274 275 276
    lastApplyIndex -= SYNC_VNODE_LOG_RETENTION;

    SyncIndex beginIndex = pSyncNode->pLogStore->syncLogBeginIndex(pSyncNode->pLogStore);
    SyncIndex endIndex = pSyncNode->pLogStore->syncLogEndIndex(pSyncNode->pLogStore);
    bool      isEmpty = pSyncNode->pLogStore->syncLogIsEmpty(pSyncNode->pLogStore);

    if (isEmpty || !(lastApplyIndex >= beginIndex && lastApplyIndex <= endIndex)) {
      sNTrace(pSyncNode, "new-snapshot-index:%" PRId64 ", empty:%d, do not delete wal", lastApplyIndex, isEmpty);
      syncNodeRelease(pSyncNode);
      return 0;
    }

M
Minghao Li 已提交
277 278 279 280 281 282 283 284 285 286 287 288 289 290
    // vnode
    if (pSyncNode->replicaNum > 1) {
      // multi replicas

      if (pSyncNode->state == TAOS_SYNC_STATE_LEADER) {
        pSyncNode->minMatchIndex = syncMinMatchIndex(pSyncNode);

        for (int32_t i = 0; i < pSyncNode->peersNum; ++i) {
          int64_t matchIndex = syncIndexMgrGetIndex(pSyncNode->pMatchIndex, &(pSyncNode->peersId[i]));
          if (lastApplyIndex > matchIndex) {
            do {
              char     host[64];
              uint16_t port;
              syncUtilU642Addr(pSyncNode->peersId[i].addr, host, sizeof(host), &port);
S
Shengliang Guan 已提交
291 292 293 294
              sNTrace(pSyncNode,
                      "new-snapshot-index:%" PRId64 " is greater than match-index:%" PRId64
                      " of %s:%d, do not delete wal",
                      lastApplyIndex, matchIndex, host, port);
M
Minghao Li 已提交
295 296
            } while (0);

S
Shengliang Guan 已提交
297
            syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
298 299 300 301 302 303
            return 0;
          }
        }

      } else if (pSyncNode->state == TAOS_SYNC_STATE_FOLLOWER) {
        if (lastApplyIndex > pSyncNode->minMatchIndex) {
S
Shengliang Guan 已提交
304 305 306
          sNTrace(pSyncNode,
                  "new-snapshot-index:%" PRId64 " is greater than min-match-index:%" PRId64 ", do not delete wal",
                  lastApplyIndex, pSyncNode->minMatchIndex);
S
Shengliang Guan 已提交
307
          syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
308 309 310 311
          return 0;
        }

      } else if (pSyncNode->state == TAOS_SYNC_STATE_CANDIDATE) {
S
Shengliang Guan 已提交
312
        sNTrace(pSyncNode, "new-snapshot-index:%" PRId64 " candidate, do not delete wal", lastApplyIndex);
S
Shengliang Guan 已提交
313
        syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
314 315 316
        return 0;

      } else {
S
Shengliang Guan 已提交
317
        sNTrace(pSyncNode, "new-snapshot-index:%" PRId64 " unknown state, do not delete wal", lastApplyIndex);
S
Shengliang Guan 已提交
318
        syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
319 320 321 322 323 324 325 326 327
        return 0;
      }

      goto _DEL_WAL;

    } else {
      // one replica

      goto _DEL_WAL;
328 329 330
    }
  }

M
Minghao Li 已提交
331
_DEL_WAL:
332

M
Minghao Li 已提交
333
  do {
334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353
    SSyncLogStoreData* pData = pSyncNode->pLogStore->data;
    SyncIndex          snapshotVer = walGetSnapshotVer(pData->pWal);
    SyncIndex          walCommitVer = walGetCommittedVer(pData->pWal);
    SyncIndex          wallastVer = walGetLastVer(pData->pWal);
    if (lastApplyIndex <= walCommitVer) {
      SyncIndex snapshottingIndex = atomic_load_64(&pSyncNode->snapshottingIndex);

      if (snapshottingIndex == SYNC_INDEX_INVALID) {
        atomic_store_64(&pSyncNode->snapshottingIndex, lastApplyIndex);
        pSyncNode->snapshottingTime = taosGetTimestampMs();

        code = walBeginSnapshot(pData->pWal, lastApplyIndex);
        if (code == 0) {
          sNTrace(pSyncNode, "wal snapshot begin, index:%" PRId64 ", last apply index:%" PRId64,
                  pSyncNode->snapshottingIndex, lastApplyIndex);
        } else {
          sNError(pSyncNode, "wal snapshot begin error since:%s, index:%" PRId64 ", last apply index:%" PRId64,
                  terrstr(terrno), pSyncNode->snapshottingIndex, lastApplyIndex);
          atomic_store_64(&pSyncNode->snapshottingIndex, SYNC_INDEX_INVALID);
        }
354

M
Minghao Li 已提交
355
      } else {
356 357
        sNTrace(pSyncNode, "snapshotting for %" PRId64 ", do not delete wal for new-snapshot-index:%" PRId64,
                snapshottingIndex, lastApplyIndex);
M
Minghao Li 已提交
358
      }
359
    }
M
Minghao Li 已提交
360
  } while (0);
361

S
Shengliang Guan 已提交
362
  syncNodeRelease(pSyncNode);
363 364 365 366
  return code;
}

int32_t syncEndSnapshot(int64_t rid) {
S
Shengliang Guan 已提交
367
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
368
  if (pSyncNode == NULL) {
369
    sError("sync end snapshot error");
370 371 372
    return -1;
  }

373 374 375 376
  int32_t code = 0;
  if (atomic_load_64(&pSyncNode->snapshottingIndex) != SYNC_INDEX_INVALID) {
    SSyncLogStoreData* pData = pSyncNode->pLogStore->data;
    code = walEndSnapshot(pData->pWal);
M
Minghao Li 已提交
377
    if (code != 0) {
378
      sNError(pSyncNode, "wal snapshot end error since:%s", terrstr());
S
Shengliang Guan 已提交
379
      syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
380 381
      return -1;
    } else {
S
Shengliang Guan 已提交
382
      sNTrace(pSyncNode, "wal snapshot end, index:%" PRId64, atomic_load_64(&pSyncNode->snapshottingIndex));
M
Minghao Li 已提交
383 384
      atomic_store_64(&pSyncNode->snapshottingIndex, SYNC_INDEX_INVALID);
    }
385
  }
386

S
Shengliang Guan 已提交
387
  syncNodeRelease(pSyncNode);
388 389 390
  return code;
}

M
Minghao Li 已提交
391
/*
 * Public wrapper: acquire the node for rid and call syncNodeStepDown() with
 * newTerm. Returns 0, or -1 when the rid cannot be acquired.
 */
int32_t syncStepDown(int64_t rid, SyncTerm newTerm) {
  SSyncNode* pNode = syncNodeAcquire(rid);
  if (pNode != NULL) {
    syncNodeStepDown(pNode, newTerm);
    syncNodeRelease(pNode);
    return 0;
  }

  sError("sync step down error");
  return -1;
}

403
/*
 * Tell whether pSyncNode can currently serve reads.
 *
 * - NULL node            -> false, terrno = TSDB_CODE_SYN_INTERNAL_ERROR
 * - node is not leader   -> false, terrno = TSDB_CODE_SYN_NOT_LEADER
 * - restoreFinish is set -> true
 * - otherwise the node is ready only if the fsm apply queue is empty, the log
 *   is not empty, and the last log entry is a TDMT_SYNC_NOOP entry written in
 *   the current term; if not, terrno = TSDB_CODE_SYN_RESTORING.
 *
 * The last entry is looked up in the log store's LRU cache first and fetched
 * through syncLogGetEntry() on a miss.
 */
bool syncNodeIsReadyForRead(SSyncNode* pSyncNode) {
  if (pSyncNode == NULL) {
    terrno = TSDB_CODE_SYN_INTERNAL_ERROR;
    sError("sync ready for read error");
    return false;
  }

  if (pSyncNode->state != TAOS_SYNC_STATE_LEADER) {
    terrno = TSDB_CODE_SYN_NOT_LEADER;
    return false;
  }

  if (pSyncNode->restoreFinish) {
    return true;
  }

  bool ready = false;

  // only inspect the log once the apply queue has drained
  if (pSyncNode->pFsm->FpApplyQueueEmptyCb(pSyncNode->pFsm) &&
      !pSyncNode->pLogStore->syncLogIsEmpty(pSyncNode->pLogStore)) {
    SyncIndex       lastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);
    SLRUCache*      pCache = pSyncNode->pLogStore->pCache;
    LRUHandle*      hCached = taosLRUCacheLookup(pCache, &lastIndex, sizeof(lastIndex));
    SSyncRaftEntry* pLast = NULL;
    int32_t         rc = 0;

    if (hCached != NULL) {
      pLast = (SSyncRaftEntry*)taosLRUCacheValue(pCache, hCached);

      pSyncNode->pLogStore->cacheHit++;
      sNTrace(pSyncNode, "hit cache index:%" PRId64 ", bytes:%u, %p", lastIndex, pLast->bytes, pLast);
    } else {
      pSyncNode->pLogStore->cacheMiss++;
      sNTrace(pSyncNode, "miss cache index:%" PRId64, lastIndex);

      rc = pSyncNode->pLogStore->syncLogGetEntry(pSyncNode->pLogStore, lastIndex, &pLast);
    }

    if (rc == 0 && pLast != NULL) {
      if (pLast->originalRpcType == TDMT_SYNC_NOOP && pLast->term == pSyncNode->pRaftStore->currentTerm) {
        ready = true;
      }

      // a cached entry is handed back to the cache, a fetched one is destroyed
      if (hCached != NULL) {
        taosLRUCacheRelease(pCache, hCached, false);
      } else {
        syncEntryDestroy(pLast);
      }
    }
  }

  if (!ready) {
    terrno = TSDB_CODE_SYN_RESTORING;
  }

  return ready;
}

/*
 * Public wrapper around syncNodeIsReadyForRead() for the node identified by
 * rid. Returns false when the rid cannot be acquired.
 */
bool syncIsReadyForRead(int64_t rid) {
  SSyncNode* pNode = syncNodeAcquire(rid);
  if (pNode == NULL) {
    sError("sync ready for read error");
    return false;
  }

  bool isReady = syncNodeIsReadyForRead(pNode);
  syncNodeRelease(pNode);
  return isReady;
}
M
Minghao Li 已提交
478

479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500
/*
 * Public wrapper around syncNodeSnapshotSending() for the node identified by
 * rid. Returns false when the rid cannot be acquired.
 */
bool syncSnapshotSending(int64_t rid) {
  bool       sending = false;
  SSyncNode* pNode = syncNodeAcquire(rid);
  if (pNode != NULL) {
    sending = syncNodeSnapshotSending(pNode);
    syncNodeRelease(pNode);
  }
  return sending;
}

/*
 * Public wrapper around syncNodeSnapshotRecving() for the node identified by
 * rid. Returns false when the rid cannot be acquired.
 */
bool syncSnapshotRecving(int64_t rid) {
  bool       receiving = false;
  SSyncNode* pNode = syncNodeAcquire(rid);
  if (pNode != NULL) {
    receiving = syncNodeSnapshotRecving(pNode);
    syncNodeRelease(pNode);
  }
  return receiving;
}

M
Minghao Li 已提交
501 502
/*
 * Pick a peer and hand leadership over to it.
 *
 * Fails with terrno = TSDB_CODE_SYN_ONE_REPLICA when the node has no peers.
 * Returns 0 without doing anything when the node is not the leader or has a
 * single replica. The first peer is the default target; with exactly two
 * peers, the second one is chosen when its match index is strictly larger.
 */
int32_t syncNodeLeaderTransfer(SSyncNode* pSyncNode) {
  if (pSyncNode->peersNum == 0) {
    sDebug("vgId:%d, only one replica, cannot leader transfer", pSyncNode->vgId);
    terrno = TSDB_CODE_SYN_ONE_REPLICA;
    return -1;
  }

  if (pSyncNode->state != TAOS_SYNC_STATE_LEADER || pSyncNode->replicaNum <= 1) {
    return 0;
  }

  int32_t target = 0;
  if (pSyncNode->peersNum == 2) {
    SyncIndex firstMatch = syncIndexMgrGetIndex(pSyncNode->pMatchIndex, &(pSyncNode->peersId[0]));
    SyncIndex secondMatch = syncIndexMgrGetIndex(pSyncNode->pMatchIndex, &(pSyncNode->peersId[1]));
    if (secondMatch > firstMatch) {
      target = 1;
    }
  }

  return syncNodeLeaderTransferTo(pSyncNode, (pSyncNode->peersNodeInfo)[target]);
}

M
Minghao Li 已提交
524 525
/*
 * Propose a leader-transfer message that names newLeader as the next leader.
 *
 * Fails with terrno = TSDB_CODE_SYN_ONE_REPLICA when the node has a single
 * replica, and with -1 when the transfer message cannot be built. Otherwise
 * returns the result of syncNodePropose(). The message content is freed here
 * after the propose call.
 */
int32_t syncNodeLeaderTransferTo(SSyncNode* pSyncNode, SNodeInfo newLeader) {
  if (pSyncNode->replicaNum == 1) {
    sDebug("vgId:%d, only one replica, cannot leader transfer", pSyncNode->vgId);
    terrno = TSDB_CODE_SYN_ONE_REPLICA;
    return -1;
  }

  sNTrace(pSyncNode, "begin leader transfer to %s:%u", newLeader.nodeFqdn, newLeader.nodePort);

  SRpcMsg rpcMsg = {0};
  // NOTE(review): assumes syncBuildLeaderTransfer() returns 0 on success, the
  // same convention syncNodePropose() applies to syncBuildClientRequest() — confirm.
  int32_t code = syncBuildLeaderTransfer(&rpcMsg, pSyncNode->vgId);
  if (code != 0 || rpcMsg.pCont == NULL) {
    // pCont is dereferenced below; bail out instead of crashing on a failed build
    sDebug("vgId:%d, failed to build leader transfer msg", pSyncNode->vgId);
    if (rpcMsg.pCont != NULL) {
      rpcFreeCont(rpcMsg.pCont);
    }
    return -1;
  }

  SyncLeaderTransfer* pMsg = rpcMsg.pCont;
  pMsg->newLeaderId.addr = syncUtilAddr2U64(newLeader.nodeFqdn, newLeader.nodePort);
  pMsg->newLeaderId.vgId = pSyncNode->vgId;
  pMsg->newNodeInfo = newLeader;

  int32_t ret = syncNodePropose(pSyncNode, &rpcMsg, false);
  rpcFreeCont(rpcMsg.pCont);
  return ret;
}

546 547
SSyncState syncGetState(int64_t rid) {
  SSyncState state = {.state = TAOS_SYNC_STATE_ERROR};
M
Minghao Li 已提交
548

S
Shengliang Guan 已提交
549
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
550 551 552
  if (pSyncNode != NULL) {
    state.state = pSyncNode->state;
    state.restored = pSyncNode->restoreFinish;
553 554 555 556 557
    if (pSyncNode->vgId != 1) {
      state.canRead = syncNodeIsReadyForRead(pSyncNode);
    } else {
      state.canRead = state.restored;
    }
558
    syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
559 560
  }

561
  return state;
M
Minghao Li 已提交
562 563
}

564
#if 0
// Disabled code: the three snapshot-meta getters below are compiled out.

// Fill pSnapshot with the term and config index that belong to log entry `index`.
int32_t syncGetSnapshotByIndex(int64_t rid, SyncIndex index, SSnapshot* pSnapshot) {
  if (index < SYNC_INDEX_BEGIN) {
    return -1;
  }

  SSyncNode* pNode = syncNodeAcquire(rid);
  if (pNode == NULL) {
    return -1;
  }
  tAssert(rid == pNode->rid);

  SSyncRaftEntry* pEntry = NULL;
  if (pNode->pLogStore->syncLogGetEntry(pNode->pLogStore, index, &pEntry) != 0) {
    if (pEntry != NULL) {
      syncEntryDestroy(pEntry);
    }
    syncNodeRelease(pNode);
    return -1;
  }
  tAssert(pEntry != NULL);

  pSnapshot->data = NULL;
  pSnapshot->lastApplyIndex = index;
  pSnapshot->lastApplyTerm = pEntry->term;
  pSnapshot->lastConfigIndex = syncNodeGetSnapshotConfigIndex(pNode, index);

  syncEntryDestroy(pEntry);
  syncNodeRelease(pNode);
  return 0;
}

// Report the node's last config index through sMeta.
int32_t syncGetSnapshotMeta(int64_t rid, struct SSnapshotMeta* sMeta) {
  SSyncNode* pNode = syncNodeAcquire(rid);
  if (pNode == NULL) {
    return -1;
  }
  tAssert(rid == pNode->rid);

  sMeta->lastConfigIndex = pNode->pRaftCfg->lastConfigIndex;
  sTrace("vgId:%d, get snapshot meta, lastConfigIndex:%" PRId64, pNode->vgId, pNode->pRaftCfg->lastConfigIndex);

  syncNodeRelease(pNode);
  return 0;
}

// Report the largest recorded config index that does not exceed snapshotIndex
// (falling back to the first recorded one) through sMeta.
int32_t syncGetSnapshotMetaByIndex(int64_t rid, SyncIndex snapshotIndex, struct SSnapshotMeta* sMeta) {
  SSyncNode* pNode = syncNodeAcquire(rid);
  if (pNode == NULL) {
    return -1;
  }
  tAssert(rid == pNode->rid);

  tAssert(pNode->pRaftCfg->configIndexCount >= 1);
  SyncIndex best = (pNode->pRaftCfg->configIndexArr)[0];

  for (int32_t k = 0; k < pNode->pRaftCfg->configIndexCount; ++k) {
    if ((pNode->pRaftCfg->configIndexArr)[k] > best && (pNode->pRaftCfg->configIndexArr)[k] <= snapshotIndex) {
      best = (pNode->pRaftCfg->configIndexArr)[k];
    }
  }
  sMeta->lastConfigIndex = best;
  sTrace("vgId:%d, get snapshot meta by index:%" PRId64 " lcindex:%" PRId64, pNode->vgId, snapshotIndex,
         sMeta->lastConfigIndex);

  syncNodeRelease(pNode);
  return 0;
}
#endif
635

636
/*
 * Return the largest entry of pRaftCfg->configIndexArr that is
 * <= snapshotLastApplyIndex. If no later entry qualifies, the first entry of
 * the array is returned unchanged. Asserts that at least one config index is
 * recorded.
 */
SyncIndex syncNodeGetSnapshotConfigIndex(SSyncNode* pSyncNode, SyncIndex snapshotLastApplyIndex) {
  tAssert(pSyncNode->pRaftCfg->configIndexCount >= 1);

  SyncIndex best = (pSyncNode->pRaftCfg->configIndexArr)[0];
  int32_t   k = 0;
  while (k < pSyncNode->pRaftCfg->configIndexCount) {
    if ((pSyncNode->pRaftCfg->configIndexArr)[k] > best &&
        (pSyncNode->pRaftCfg->configIndexArr)[k] <= snapshotLastApplyIndex) {
      best = (pSyncNode->pRaftCfg->configIndexArr)[k];
    }
    ++k;
  }

  sTrace("vgId:%d, sync get last config index, index:%" PRId64 " lcindex:%" PRId64, pSyncNode->vgId,
         snapshotLastApplyIndex, best);

  return best;
}

652 653
/*
 * Fill pEpSet with the fqdn/port of every replica in the node's raft config.
 *
 * numOfEps is reset to 0 first and stays 0 when the rid cannot be acquired.
 * When at least one endpoint was copied, inUse is set to the slot after this
 * node's own index, wrapping around at numOfEps.
 */
void syncGetRetryEpSet(int64_t rid, SEpSet* pEpSet) {
  pEpSet->numOfEps = 0;

  SSyncNode* pNode = syncNodeAcquire(rid);
  if (pNode == NULL) {
    return;
  }

  for (int32_t slot = 0; slot < pNode->pRaftCfg->cfg.replicaNum; ++slot) {
    SEp* pEp = &pEpSet->eps[slot];
    tstrncpy(pEp->fqdn, pNode->pRaftCfg->cfg.nodeInfo[slot].nodeFqdn, TSDB_FQDN_LEN);
    pEp->port = (pNode->pRaftCfg->cfg.nodeInfo)[slot].nodePort;
    pEpSet->numOfEps++;
    sDebug("vgId:%d, sync get retry epset, index:%d %s:%d", pNode->vgId, slot, pEp->fqdn, pEp->port);
  }

  if (pEpSet->numOfEps > 0) {
    pEpSet->inUse = (pNode->pRaftCfg->cfg.myIndex + 1) % pEpSet->numOfEps;
  }

  sInfo("vgId:%d, sync get retry epset numOfEps:%d inUse:%d", pNode->vgId, pEpSet->numOfEps, pEpSet->inUse);
  syncNodeRelease(pNode);
}

M
Minghao Li 已提交
673
/*
 * Public wrapper around syncNodePropose() for the node identified by rid.
 * Returns that function's result, or -1 when the rid cannot be acquired.
 */
int32_t syncPropose(int64_t rid, SRpcMsg* pMsg, bool isWeak) {
  SSyncNode* pNode = syncNodeAcquire(rid);
  if (pNode != NULL) {
    int32_t result = syncNodePropose(pNode, pMsg, isWeak);
    syncNodeRelease(pNode);
    return result;
  }

  sError("sync propose error");
  return -1;
}
M
Minghao Li 已提交
684

685
/*
 * Propose pMsg on pSyncNode.
 *
 * Rejected with -1 when
 *   - the node is not the leader            (terrno = TSDB_CODE_SYN_NOT_LEADER),
 *   - restore is unfinished and vgId != 1   (terrno = TSDB_CODE_SYN_PROPOSE_NOT_READY),
 *   - syncNodeHeartbeatReplyTimeout() holds (terrno = TSDB_CODE_SYN_PROPOSE_NOT_READY).
 *
 * Optimized one-replica path: the request is handled inline through
 * syncNodeOnClientRequest(); on success the apply index/term are stored in
 * pMsg->info.conn and 1 is returned, on failure terrno is set to
 * TSDB_CODE_SYN_INTERNAL_ERROR and -1 is returned.
 *
 * Regular path: a response stub is registered, a client request is built and
 * enqueued through pSyncNode->syncEqMsg. The stub is removed again when
 * building or enqueueing fails. Returns -1 on a build failure, otherwise the
 * enqueue result.
 */
int32_t syncNodePropose(SSyncNode* pSyncNode, SRpcMsg* pMsg, bool isWeak) {
  if (pSyncNode->state != TAOS_SYNC_STATE_LEADER) {
    terrno = TSDB_CODE_SYN_NOT_LEADER;
    sNError(pSyncNode, "sync propose not leader, %s, type:%s", syncStr(pSyncNode->state), TMSG_INFO(pMsg->msgType));
    return -1;
  }

  // not restored, vnode enable
  if (!pSyncNode->restoreFinish && pSyncNode->vgId != 1) {
    terrno = TSDB_CODE_SYN_PROPOSE_NOT_READY;
    sNError(pSyncNode, "failed to sync propose since not ready, type:%s, last:%" PRId64 ", cmt:%" PRId64,
            TMSG_INFO(pMsg->msgType), syncNodeGetLastIndex(pSyncNode), pSyncNode->commitIndex);
    return -1;
  }

  // heartbeat timeout
  if (syncNodeHeartbeatReplyTimeout(pSyncNode)) {
    terrno = TSDB_CODE_SYN_PROPOSE_NOT_READY;
    sNError(pSyncNode, "failed to sync propose since hearbeat timeout, type:%s, last:%" PRId64 ", cmt:%" PRId64,
            TMSG_INFO(pMsg->msgType), syncNodeGetLastIndex(pSyncNode), pSyncNode->commitIndex);
    return -1;
  }

  // optimized one replica
  if (syncNodeIsOptimizedOneReplica(pSyncNode, pMsg)) {
    // Placeholder value: the failure branch below logs retIndex, so it must
    // never be read while indeterminate. It only reaches pMsg on success.
    SyncIndex retIndex = -1;
    int32_t   code = syncNodeOnClientRequest(pSyncNode, pMsg, &retIndex);
    if (code == 0) {
      pMsg->info.conn.applyIndex = retIndex;
      pMsg->info.conn.applyTerm = pSyncNode->pRaftStore->currentTerm;
      sTrace("vgId:%d, propose optimized msg, index:%" PRId64 " type:%s", pSyncNode->vgId, retIndex,
             TMSG_INFO(pMsg->msgType));
      return 1;
    } else {
      terrno = TSDB_CODE_SYN_INTERNAL_ERROR;
      sError("vgId:%d, failed to propose optimized msg, index:%" PRId64 " type:%s", pSyncNode->vgId, retIndex,
             TMSG_INFO(pMsg->msgType));
      return -1;
    }
  } else {
    SRespStub stub = {.createTime = taosGetTimestampMs(), .rpcMsg = *pMsg};
    uint64_t  seqNum = syncRespMgrAdd(pSyncNode->pSyncRespMgr, &stub);
    SRpcMsg   rpcMsg = {0};
    int32_t   code = syncBuildClientRequest(&rpcMsg, pMsg, seqNum, isWeak, pSyncNode->vgId);
    if (code != 0) {
      sError("vgId:%d, failed to propose msg while serialize since %s", pSyncNode->vgId, terrstr());
      (void)syncRespMgrDel(pSyncNode->pSyncRespMgr, seqNum);
      return -1;
    }

    sNTrace(pSyncNode, "propose msg, type:%s", TMSG_INFO(pMsg->msgType));
    code = (*pSyncNode->syncEqMsg)(pSyncNode->msgcb, &rpcMsg);
    if (code != 0) {
      sError("vgId:%d, failed to propose msg while enqueue since %s", pSyncNode->vgId, terrstr());
      (void)syncRespMgrDel(pSyncNode->pSyncRespMgr, seqNum);
    }

    return code;
  }
}

S
Shengliang Guan 已提交
746
/*
 * Reset a per-peer heartbeat timer: no underlying timer, counter and logic
 * clock at 0, interval taken from pSyncNode->hbBaseLine, callback set to
 * syncNodeEqPeerHeartbeatTimer, destination destId, timestamp "now".
 * Always returns 0.
 */
static int32_t syncHbTimerInit(SSyncNode* pSyncNode, SSyncTimer* pSyncTimer, SRaftId destId) {
  pSyncTimer->pTimer = NULL;
  pSyncTimer->timerCb = syncNodeEqPeerHeartbeatTimer;
  pSyncTimer->timerMS = pSyncNode->hbBaseLine;
  pSyncTimer->destId = destId;
  pSyncTimer->counter = 0;
  pSyncTimer->timeStamp = taosGetTimestampMs();
  atomic_store_64(&pSyncTimer->logicClock, 0);

  return 0;
}

S
Shengliang Guan 已提交
757
// Arm (or re-arm) the heartbeat timer of one peer.
//   pSyncNode  - owning node; its rid is recorded in the timer data
//   pSyncTimer - per-peer timer slot to start
// The timer data object (SSyncHbTimerData) is looked up by pSyncTimer->hbDataRid and
// created on demand; its rid is what gets passed to the timer callback as parameter.
// Returns 0 on success or when the sync env is not initialized (only an error is
// logged in that case), -1 with terrno set if the timer data cannot be allocated.
static int32_t syncHbTimerStart(SSyncNode* pSyncNode, SSyncTimer* pSyncTimer) {
  int64_t tsNow = taosGetTimestampMs();

  if (!syncIsInit()) {
    sError("vgId:%d, start ctrl hb timer error, sync env is stop", pSyncNode->vgId);
    return 0;
  }

  SSyncHbTimerData* pData = syncHbTimerDataAcquire(pSyncTimer->hbDataRid);
  if (pData == NULL) {
    pData = taosMemoryMalloc(sizeof(SSyncHbTimerData));
    if (pData == NULL) {
      // the original dereferenced the allocation unconditionally
      terrno = TSDB_CODE_OUT_OF_MEMORY;
      sError("vgId:%d, failed to start ctrl hb timer since %s", pSyncNode->vgId, terrstr());
      return -1;
    }
    pData->rid = syncHbTimerDataAdd(pData);
  }
  pSyncTimer->hbDataRid = pData->rid;
  pSyncTimer->timeStamp = tsNow;

  // snapshot of the timer state handed to the callback
  pData->syncNodeRid = pSyncNode->rid;
  pData->pTimer = pSyncTimer;
  pData->destId = pSyncTimer->destId;
  pData->logicClock = pSyncTimer->logicClock;
  pData->execTime = tsNow + pSyncTimer->timerMS;

  // the timer fires every timerMS / HEARTBEAT_TICK_NUM ms; execTime above holds the full interval
  taosTmrReset(pSyncTimer->timerCb, pSyncTimer->timerMS / HEARTBEAT_TICK_NUM, (void*)(pData->rid),
               syncEnv()->pTimerManager, &pSyncTimer->pTimer);
  return 0;
}

S
Shengliang Guan 已提交
783
// Stop the heartbeat timer of one peer and drop its timer data.
//   pSyncNode  - owning node (not read here)
//   pSyncTimer - per-peer timer slot to stop
// Always returns 0.
static int32_t syncHbTimerStop(SSyncNode* pSyncNode, SSyncTimer* pSyncTimer) {
  // advance the logic clock first; presumably lets a callback that is already
  // scheduled detect that it is stale (it received a copy of the old clock)
  atomic_add_fetch_64(&pSyncTimer->logicClock, 1);

  taosTmrStop(pSyncTimer->pTimer);
  pSyncTimer->pTimer = NULL;

  syncHbTimerDataRemove(pSyncTimer->hbDataRid);
  pSyncTimer->hbDataRid = -1;

  return 0;
}

793
// Re-seed the log store from the FSM snapshot when the two do not line up.
// The log store is restored from the snapshot when its last index is behind the
// snapshot's lastApplyIndex, or when its first index is beyond lastApplyIndex + 1
// (i.e. there is a gap between snapshot and log).
//   pNode - sync node with pLogStore and pFsm (incl. FpGetSnapshotInfo) already set
// Returns 0 on success or when no restore is needed, -1 on failure.
int32_t syncNodeLogStoreRestoreOnNeed(SSyncNode* pNode) {
  tAssertS(pNode->pLogStore != NULL, "log store not created");
  tAssertS(pNode->pFsm != NULL, "pFsm not registered");
  tAssertS(pNode->pFsm->FpGetSnapshotInfo != NULL, "FpGetSnapshotInfo not registered");

  // zero-init so that fields the callback does not fill are never read as garbage
  SSnapshot snapshot = {0};
  if (pNode->pFsm->FpGetSnapshotInfo(pNode->pFsm, &snapshot) < 0) {
    sError("vgId:%d, failed to get snapshot info since %s", pNode->vgId, terrstr());
    return -1;
  }

  SyncIndex commitIndex = snapshot.lastApplyIndex;
  SyncIndex firstVer = pNode->pLogStore->syncLogBeginIndex(pNode->pLogStore);
  SyncIndex lastVer = pNode->pLogStore->syncLogLastIndex(pNode->pLogStore);

  if (lastVer < commitIndex || firstVer > commitIndex + 1) {
    if (pNode->pLogStore->syncLogRestoreFromSnapshot(pNode->pLogStore, commitIndex)) {
      sError("vgId:%d, failed to restore log store from snapshot since %s. lastVer: %" PRId64 ", snapshotVer: %" PRId64,
             pNode->vgId, terrstr(), lastVer, commitIndex);
      return -1;
    }
  }
  return 0;
}

M
Minghao Li 已提交
815
// open/close --------------
S
Shengliang Guan 已提交
816 817
// Allocate and initialize a sync node from the given options.
//   pSyncInfo - open options. On return pSyncInfo->syncCfg holds the effective
//               replica config (either the input one or the one loaded from
//               raft_config.json). Ownership of pSyncInfo->pFsm is taken: it is
//               moved into the node on success, and freed on failure.
// Returns the new node, or NULL on failure (everything allocated so far is
// released through syncNodeClose).
// The node is only initialized here; raft itself is started later (see the
// "start in syncNodeStart" note near the end of the function).
SSyncNode* syncNodeOpen(SSyncInfo* pSyncInfo) {
  SSyncNode* pSyncNode = taosMemoryCalloc(1, sizeof(SSyncNode));
  if (pSyncNode == NULL) {
    terrno = TSDB_CODE_OUT_OF_MEMORY;
    goto _error;
  }

  // set the vgId first so that every log line below carries the real id
  pSyncNode->vgId = pSyncInfo->vgId;

  if (!taosDirExist((char*)(pSyncInfo->path))) {
    if (taosMkDir(pSyncInfo->path) != 0) {
      terrno = TAOS_SYSTEM_ERROR(errno);
      sError("failed to create dir:%s since %s", pSyncInfo->path, terrstr());
      goto _error;
    }
  }

  snprintf(pSyncNode->configPath, sizeof(pSyncNode->configPath), "%s%sraft_config.json", pSyncInfo->path, TD_DIRSEP);
  if (!taosCheckExistFile(pSyncNode->configPath)) {
    // create a new raft config file
    SRaftCfgMeta meta = {0};
    meta.isStandBy = pSyncInfo->isStandBy;
    meta.snapshotStrategy = pSyncInfo->snapshotStrategy;
    meta.lastConfigIndex = SYNC_INDEX_INVALID;
    meta.batchSize = pSyncInfo->batchSize;
    if (raftCfgCreateFile(&pSyncInfo->syncCfg, meta, pSyncNode->configPath) != 0) {
      sError("vgId:%d, failed to create raft cfg file at %s", pSyncNode->vgId, pSyncNode->configPath);
      goto _error;
    }
    if (pSyncInfo->syncCfg.replicaNum == 0) {
      sInfo("vgId:%d, sync config not input", pSyncNode->vgId);
      // pRaftCfg is not open yet at this point (it used to be dereferenced while
      // still NULL); load the file that was just written, as the else-branch does
      pSyncNode->pRaftCfg = raftCfgOpen(pSyncNode->configPath);
      if (pSyncNode->pRaftCfg == NULL) {
        sError("vgId:%d, failed to open raft cfg file at %s", pSyncNode->vgId, pSyncNode->configPath);
        goto _error;
      }
      pSyncInfo->syncCfg = pSyncNode->pRaftCfg->cfg;
      raftCfgClose(pSyncNode->pRaftCfg);
      pSyncNode->pRaftCfg = NULL;
    }
  } else {
    // update syncCfg by raft_config.json
    pSyncNode->pRaftCfg = raftCfgOpen(pSyncNode->configPath);
    if (pSyncNode->pRaftCfg == NULL) {
      sError("vgId:%d, failed to open raft cfg file at %s", pSyncNode->vgId, pSyncNode->configPath);
      goto _error;
    }

    if (pSyncInfo->syncCfg.replicaNum > 0 && syncIsConfigChanged(&pSyncNode->pRaftCfg->cfg, &pSyncInfo->syncCfg)) {
      sInfo("vgId:%d, use sync config from input options and write to cfg file", pSyncNode->vgId);
      pSyncNode->pRaftCfg->cfg = pSyncInfo->syncCfg;
      if (raftCfgPersist(pSyncNode->pRaftCfg) != 0) {
        sError("vgId:%d, failed to persist raft cfg file at %s", pSyncNode->vgId, pSyncNode->configPath);
        goto _error;
      }
    } else {
      sInfo("vgId:%d, use sync config from raft cfg file", pSyncNode->vgId);
      pSyncInfo->syncCfg = pSyncNode->pRaftCfg->cfg;
    }

    // the cfg is re-opened below for the lifetime of the node
    raftCfgClose(pSyncNode->pRaftCfg);
    pSyncNode->pRaftCfg = NULL;
  }

  // init by SSyncInfo
  SSyncCfg* pCfg = &pSyncInfo->syncCfg;
  sDebug("vgId:%d, replica:%d selfIndex:%d", pSyncNode->vgId, pCfg->replicaNum, pCfg->myIndex);
  for (int32_t i = 0; i < pCfg->replicaNum; ++i) {
    SNodeInfo* pNode = &pCfg->nodeInfo[i];
    sDebug("vgId:%d, index:%d ep:%s:%u", pSyncNode->vgId, i, pNode->nodeFqdn, pNode->nodePort);
  }

  // bounded string copy: never reads past the end of the source string and always
  // NUL-terminates the destination (the path is already used as a "%s" string above)
  snprintf(pSyncNode->path, sizeof(pSyncNode->path), "%s", pSyncInfo->path);
  snprintf(pSyncNode->raftStorePath, sizeof(pSyncNode->raftStorePath), "%s%sraft_store.json", pSyncInfo->path,
           TD_DIRSEP);
  snprintf(pSyncNode->configPath, sizeof(pSyncNode->configPath), "%s%sraft_config.json", pSyncInfo->path, TD_DIRSEP);

  pSyncNode->pWal = pSyncInfo->pWal;
  pSyncNode->msgcb = pSyncInfo->msgcb;
  pSyncNode->syncSendMSg = pSyncInfo->syncSendMSg;
  pSyncNode->syncEqMsg = pSyncInfo->syncEqMsg;
  pSyncNode->syncEqCtrlMsg = pSyncInfo->syncEqCtrlMsg;

  // create raft log ring buffer
  pSyncNode->pLogBuf = syncLogBufferCreate();
  if (pSyncNode->pLogBuf == NULL) {
    sError("failed to init sync log buffer since %s. vgId:%d", terrstr(), pSyncNode->vgId);
    goto _error;
  }

  // init raft config
  pSyncNode->pRaftCfg = raftCfgOpen(pSyncNode->configPath);
  if (pSyncNode->pRaftCfg == NULL) {
    sError("vgId:%d, failed to open raft cfg file at %s", pSyncNode->vgId, pSyncNode->configPath);
    goto _error;
  }

  // init internal
  pSyncNode->myNodeInfo = pSyncNode->pRaftCfg->cfg.nodeInfo[pSyncNode->pRaftCfg->cfg.myIndex];
  if (!syncUtilNodeInfo2RaftId(&pSyncNode->myNodeInfo, pSyncNode->vgId, &pSyncNode->myRaftId)) {
    sError("vgId:%d, failed to determine my raft member id", pSyncNode->vgId);
    goto _error;
  }

  // init peersNum, peers, peersId
  pSyncNode->peersNum = pSyncNode->pRaftCfg->cfg.replicaNum - 1;
  int32_t j = 0;
  for (int32_t i = 0; i < pSyncNode->pRaftCfg->cfg.replicaNum; ++i) {
    if (i != pSyncNode->pRaftCfg->cfg.myIndex) {
      pSyncNode->peersNodeInfo[j] = pSyncNode->pRaftCfg->cfg.nodeInfo[i];
      j++;
    }
  }
  for (int32_t i = 0; i < pSyncNode->peersNum; ++i) {
    if (!syncUtilNodeInfo2RaftId(&pSyncNode->peersNodeInfo[i], pSyncNode->vgId, &pSyncNode->peersId[i])) {
      sError("vgId:%d, failed to determine raft member id, peer:%d", pSyncNode->vgId, i);
      goto _error;
    }
  }

  // init replicaNum, replicasId
  pSyncNode->replicaNum = pSyncNode->pRaftCfg->cfg.replicaNum;
  for (int32_t i = 0; i < pSyncNode->pRaftCfg->cfg.replicaNum; ++i) {
    if (!syncUtilNodeInfo2RaftId(&pSyncNode->pRaftCfg->cfg.nodeInfo[i], pSyncNode->vgId, &pSyncNode->replicasId[i])) {
      sError("vgId:%d, failed to determine raft member id, replica:%d", pSyncNode->vgId, i);
      goto _error;
    }
  }

  // init raft algorithm
  // from here on the FSM belongs to the node; syncNodeClose frees it
  pSyncNode->pFsm = pSyncInfo->pFsm;
  pSyncInfo->pFsm = NULL;
  pSyncNode->quorum = syncUtilQuorum(pSyncNode->pRaftCfg->cfg.replicaNum);
  pSyncNode->leaderCache = EMPTY_RAFT_ID;

  // init life cycle outside

  // TLA+ Spec
  // InitHistoryVars == /\ elections = {}
  //                    /\ allLogs   = {}
  //                    /\ voterLog  = [i \in Server |-> [j \in {} |-> <<>>]]
  // InitServerVars == /\ currentTerm = [i \in Server |-> 1]
  //                   /\ state       = [i \in Server |-> Follower]
  //                   /\ votedFor    = [i \in Server |-> Nil]
  // InitCandidateVars == /\ votesResponded = [i \in Server |-> {}]
  //                      /\ votesGranted   = [i \in Server |-> {}]
  // \* The values nextIndex[i][i] and matchIndex[i][i] are never read, since the
  // \* leader does not send itself messages. It's still easier to include these
  // \* in the functions.
  // InitLeaderVars == /\ nextIndex  = [i \in Server |-> [j \in Server |-> 1]]
  //                   /\ matchIndex = [i \in Server |-> [j \in Server |-> 0]]
  // InitLogVars == /\ log          = [i \in Server |-> << >>]
  //                /\ commitIndex  = [i \in Server |-> 0]
  // Init == /\ messages = [m \in {} |-> 0]
  //         /\ InitHistoryVars
  //         /\ InitServerVars
  //         /\ InitCandidateVars
  //         /\ InitLeaderVars
  //         /\ InitLogVars
  //

  // init TLA+ server vars
  pSyncNode->state = TAOS_SYNC_STATE_FOLLOWER;
  pSyncNode->pRaftStore = raftStoreOpen(pSyncNode->raftStorePath);
  if (pSyncNode->pRaftStore == NULL) {
    sError("vgId:%d, failed to open raft store at path %s", pSyncNode->vgId, pSyncNode->raftStorePath);
    goto _error;
  }

  // init TLA+ candidate vars
  pSyncNode->pVotesGranted = voteGrantedCreate(pSyncNode);
  if (pSyncNode->pVotesGranted == NULL) {
    sError("vgId:%d, failed to create VotesGranted", pSyncNode->vgId);
    goto _error;
  }
  pSyncNode->pVotesRespond = votesRespondCreate(pSyncNode);
  if (pSyncNode->pVotesRespond == NULL) {
    sError("vgId:%d, failed to create VotesRespond", pSyncNode->vgId);
    goto _error;
  }

  // init TLA+ leader vars
  pSyncNode->pNextIndex = syncIndexMgrCreate(pSyncNode);
  if (pSyncNode->pNextIndex == NULL) {
    sError("vgId:%d, failed to create SyncIndexMgr", pSyncNode->vgId);
    goto _error;
  }
  pSyncNode->pMatchIndex = syncIndexMgrCreate(pSyncNode);
  if (pSyncNode->pMatchIndex == NULL) {
    sError("vgId:%d, failed to create SyncIndexMgr", pSyncNode->vgId);
    goto _error;
  }

  // init TLA+ log vars
  pSyncNode->pLogStore = logStoreCreate(pSyncNode);
  if (pSyncNode->pLogStore == NULL) {
    sError("vgId:%d, failed to create SyncLogStore", pSyncNode->vgId);
    goto _error;
  }

  // the commit index starts from the snapshot's last applied index, if there is one
  SyncIndex commitIndex = SYNC_INDEX_INVALID;
  if (pSyncNode->pFsm != NULL && pSyncNode->pFsm->FpGetSnapshotInfo != NULL) {
    SSnapshot snapshot = {0};
    int32_t   code = pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot);
    if (code != 0) {
      sError("vgId:%d, failed to get snapshot info, code:%d", pSyncNode->vgId, code);
      goto _error;
    }
    if (snapshot.lastApplyIndex > commitIndex) {
      commitIndex = snapshot.lastApplyIndex;
      sNTrace(pSyncNode, "reset commit index by snapshot");
    }
  }
  pSyncNode->commitIndex = commitIndex;

  if (syncNodeLogStoreRestoreOnNeed(pSyncNode) < 0) {
    goto _error;
  }

  // timer ms init
  pSyncNode->pingBaseLine = PING_TIMER_MS;
  pSyncNode->electBaseLine = tsElectInterval;
  pSyncNode->hbBaseLine = tsHeartbeatInterval;

  // init ping timer
  pSyncNode->pPingTimer = NULL;
  pSyncNode->pingTimerMS = pSyncNode->pingBaseLine;
  atomic_store_64(&pSyncNode->pingTimerLogicClock, 0);
  atomic_store_64(&pSyncNode->pingTimerLogicClockUser, 0);
  pSyncNode->FpPingTimerCB = syncNodeEqPingTimer;
  pSyncNode->pingTimerCounter = 0;

  // init elect timer
  pSyncNode->pElectTimer = NULL;
  pSyncNode->electTimerMS = syncUtilElectRandomMS(pSyncNode->electBaseLine, 2 * pSyncNode->electBaseLine);
  atomic_store_64(&pSyncNode->electTimerLogicClock, 0);
  pSyncNode->FpElectTimerCB = syncNodeEqElectTimer;
  pSyncNode->electTimerCounter = 0;

  // init heartbeat timer
  pSyncNode->pHeartbeatTimer = NULL;
  pSyncNode->heartbeatTimerMS = pSyncNode->hbBaseLine;
  atomic_store_64(&pSyncNode->heartbeatTimerLogicClock, 0);
  atomic_store_64(&pSyncNode->heartbeatTimerLogicClockUser, 0);
  pSyncNode->FpHeartbeatTimerCB = syncNodeEqHeartbeatTimer;
  pSyncNode->heartbeatTimerCounter = 0;

  // init peer heartbeat timer
  for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
    syncHbTimerInit(pSyncNode, &(pSyncNode->peerHeartbeatTimerArr[i]), (pSyncNode->replicasId)[i]);
  }

  // tools
  pSyncNode->pSyncRespMgr = syncRespMgrCreate(pSyncNode, SYNC_RESP_TTL_MS);
  if (pSyncNode->pSyncRespMgr == NULL) {
    sError("vgId:%d, failed to create SyncRespMgr", pSyncNode->vgId);
    goto _error;
  }

  // restore state
  pSyncNode->restoreFinish = false;

  // snapshot senders
  // NOTE(review): a NULL sender is tolerated here (the assert below is disabled and
  // the close path checks for NULL) - confirm sSTrace is safe with a NULL sender
  for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
    SSyncSnapshotSender* pSender = snapshotSenderCreate(pSyncNode, i);
    // tAssert(pSender != NULL);
    (pSyncNode->senders)[i] = pSender;
    sSTrace(pSender, "snapshot sender create new while open, data:%p", pSender);
  }

  // snapshot receivers
  pSyncNode->pNewNodeReceiver = snapshotReceiverCreate(pSyncNode, EMPTY_RAFT_ID);

  // is config changing
  pSyncNode->changing = false;

  // replication mgr
  syncNodeLogReplMgrInit(pSyncNode);

  // peer state
  syncNodePeerStateInit(pSyncNode);

  //
  // min match index
  pSyncNode->minMatchIndex = SYNC_INDEX_INVALID;

  // start in syncNodeStart
  // start raft
  // syncNodeBecomeFollower(pSyncNode);

  int64_t timeNow = taosGetTimestampMs();
  pSyncNode->startTime = timeNow;
  pSyncNode->leaderTime = timeNow;
  pSyncNode->lastReplicateTime = timeNow;

  // snapshotting
  atomic_store_64(&pSyncNode->snapshottingIndex, SYNC_INDEX_INVALID);

  // init log buffer
  if (syncLogBufferInit(pSyncNode->pLogBuf, pSyncNode) < 0) {
    sError("vgId:%d, failed to init sync log buffer since %s", pSyncNode->vgId, terrstr());
    goto _error;
  }

  pSyncNode->isStart = true;
  pSyncNode->electNum = 0;
  pSyncNode->becomeLeaderNum = 0;
  pSyncNode->configChangeNum = 0;
  pSyncNode->hbSlowNum = 0;
  pSyncNode->hbrSlowNum = 0;
  pSyncNode->tmrRoutineNum = 0;

  sNInfo(pSyncNode, "sync open, node:%p", pSyncNode);
  sTrace("vgId:%d, tsElectInterval:%d, tsHeartbeatInterval:%d, tsHeartbeatTimeout:%d", pSyncNode->vgId, tsElectInterval,
         tsHeartbeatInterval, tsHeartbeatTimeout);

  return pSyncNode;

_error:
  // the FSM is freed here only if it has not been moved into the node yet;
  // otherwise syncNodeClose releases it together with the node
  if (pSyncInfo->pFsm) {
    taosMemoryFree(pSyncInfo->pFsm);
    pSyncInfo->pFsm = NULL;
  }
  syncNodeClose(pSyncNode);
  pSyncNode = NULL;
  return NULL;
}

M
Minghao Li 已提交
1135 1136 1137 1138
// Raise the node's commit index to the snapshot's last applied index when the
// snapshot is ahead. Does nothing if no FSM or no FpGetSnapshotInfo is registered.
void syncNodeMaybeUpdateCommitBySnapshot(SSyncNode* pSyncNode) {
  if (pSyncNode->pFsm == NULL || pSyncNode->pFsm->FpGetSnapshotInfo == NULL) {
    return;
  }

  SSnapshot snapshot;
  int32_t   code = pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot);
  tAssert(code == 0);

  // the commit index only ever moves forward here
  if (pSyncNode->commitIndex < snapshot.lastApplyIndex) {
    pSyncNode->commitIndex = snapshot.lastApplyIndex;
  }
}

B
Benguang Zhao 已提交
1146
// Check that the log buffer lines up with the log store and commit up to the
// larger of the node's commit index and the log store's commit index.
// Returns 0 on success; -1 if the buffer's end index does not follow the last
// stored index (terrno = TSDB_CODE_WAL_LOG_INCOMPLETE) or if the commit fails.
int32_t syncNodeRestore(SSyncNode* pSyncNode) {
  tAssertS(pSyncNode->pLogStore != NULL, "log store not created");
  tAssertS(pSyncNode->pLogBuf != NULL, "ring log buffer not created");

  SyncIndex lastVer = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);
  SyncIndex storeCommitIndex = pSyncNode->pLogStore->syncLogCommitIndex(pSyncNode->pLogStore);
  SyncIndex endIndex = pSyncNode->pLogBuf->endIndex;

  // a lastVer of -1 skips this check and falls through to the assertion below
  bool mismatch = (endIndex != lastVer + 1);
  if (lastVer != -1 && mismatch) {
    terrno = TSDB_CODE_WAL_LOG_INCOMPLETE;
    sError("vgId:%d, failed to restore sync node since %s. expected lastLogIndex: %" PRId64 ", lastVer: %" PRId64 "",
           pSyncNode->vgId, terrstr(), endIndex - 1, lastVer);
    return -1;
  }

  tAssert(endIndex == lastVer + 1);

  SyncIndex commitIndex = TMAX(pSyncNode->commitIndex, storeCommitIndex);
  return (syncLogBufferCommit(pSyncNode->pLogBuf, pSyncNode, commitIndex) < 0) ? -1 : 0;
}

// Start raft on an opened node and arm its ping timer.
// A single-replica node advances its term, becomes leader right away and appends
// a noop entry; any other node starts as follower.
// Returns the result of syncNodeStartPingTimer (asserted to be 0).
int32_t syncNodeStart(SSyncNode* pSyncNode) {
  // start raft
  if (pSyncNode->replicaNum != 1) {
    syncNodeBecomeFollower(pSyncNode, "first start");
  } else {
    raftStoreNextTerm(pSyncNode->pRaftStore);
    syncNodeBecomeLeader(pSyncNode, "one replica start");

    // Raft 3.6.2 Committing entries from previous terms
    syncNodeAppendNoop(pSyncNode);
  }

  int32_t ret = syncNodeStartPingTimer(pSyncNode);
  tAssert(ret == 0);
  return ret;
}

// Older start routine: same as syncNodeStart, except that a single-replica leader
// also tries to advance the commit index after appending the noop, and nothing
// is returned.
void syncNodeStartOld(SSyncNode* pSyncNode) {
  // start raft
  if (pSyncNode->replicaNum != 1) {
    syncNodeBecomeFollower(pSyncNode, "first start");
  } else {
    raftStoreNextTerm(pSyncNode->pRaftStore);
    syncNodeBecomeLeader(pSyncNode, "one replica start");

    // Raft 3.6.2 Committing entries from previous terms
    syncNodeAppendNoop(pSyncNode);
    syncMaybeAdvanceCommitIndex(pSyncNode);
  }

  int32_t ret = syncNodeStartPingTimer(pSyncNode);
  tAssert(ret == 0);
}

B
Benguang Zhao 已提交
1207
// Start the node in stand-by mode: follower state, heartbeat timers stopped,
// election timer pushed out to TIMER_MAX_MS, ping timer armed.
// Returns the result of syncNodeStartPingTimer (asserted to be 0).
int32_t syncNodeStartStandBy(SSyncNode* pSyncNode) {
  // state change
  pSyncNode->state = TAOS_SYNC_STATE_FOLLOWER;
  syncNodeStopHeartbeatTimer(pSyncNode);

  // reset elect timer, long enough
  int32_t electRet = syncNodeRestartElectTimer(pSyncNode, TIMER_MAX_MS);
  tAssert(electRet == 0);

  int32_t pingRet = syncNodeStartPingTimer(pSyncNode);
  tAssert(pingRet == 0);
  return pingRet;
}

M
Minghao Li 已提交
1223
// First phase of closing a node: wait for the FSM apply queue to drain, tear down
// the snapshot receiver, and stop the election and heartbeat timers.
// A NULL node is a no-op (the original only guarded the first step against NULL
// and then dereferenced the pointer anyway).
void syncNodePreClose(SSyncNode* pSyncNode) {
  if (pSyncNode == NULL) {
    return;
  }

  if (pSyncNode->pFsm != NULL && pSyncNode->pFsm->FpApplyQueueItems != NULL) {
    // poll every 20ms until the callback reports 0 items (or -1)
    while (1) {
      int32_t aqItems = pSyncNode->pFsm->FpApplyQueueItems(pSyncNode->pFsm);
      sTrace("vgId:%d, pre close, %d items in apply queue", pSyncNode->vgId, aqItems);
      if (aqItems == 0 || aqItems == -1) {
        break;
      }
      taosMsleep(20);
    }
  }

  if (pSyncNode->pNewNodeReceiver != NULL) {
    if (snapshotReceiverIsStart(pSyncNode->pNewNodeReceiver)) {
      snapshotReceiverForceStop(pSyncNode->pNewNodeReceiver);
    }

    snapshotReceiverDestroy(pSyncNode->pNewNodeReceiver);
    pSyncNode->pNewNodeReceiver = NULL;
  }

  // stop elect timer
  syncNodeStopElectTimer(pSyncNode);

  // stop heartbeat timer
  syncNodeStopHeartbeatTimer(pSyncNode);
}

1251
// Release a heartbeat timer data object allocated with taosMemoryMalloc.
void syncHbTimerDataFree(SSyncHbTimerData* pData) {
  taosMemoryFree(pData);
}
M
Minghao Li 已提交
1252

M
Minghao Li 已提交
1253
// Destroy a sync node and everything it owns, then free the node itself.
// Safe to call with NULL. Also used by syncNodeOpen to unwind a partially
// initialized node, so members may still be NULL when this runs.
void syncNodeClose(SSyncNode* pSyncNode) {
  if (pSyncNode == NULL) {
    return;
  }
  sNInfo(pSyncNode, "sync close, node:%p", pSyncNode);

  int32_t ret = raftStoreClose(pSyncNode->pRaftStore);
  tAssert(ret == 0);
  pSyncNode->pRaftStore = NULL;

  syncNodeLogReplMgrDestroy(pSyncNode);

  syncRespMgrDestroy(pSyncNode->pSyncRespMgr);
  pSyncNode->pSyncRespMgr = NULL;

  voteGrantedDestroy(pSyncNode->pVotesGranted);
  pSyncNode->pVotesGranted = NULL;

  votesRespondDestory(pSyncNode->pVotesRespond);
  pSyncNode->pVotesRespond = NULL;

  syncIndexMgrDestroy(pSyncNode->pNextIndex);
  pSyncNode->pNextIndex = NULL;

  syncIndexMgrDestroy(pSyncNode->pMatchIndex);
  pSyncNode->pMatchIndex = NULL;

  logStoreDestory(pSyncNode->pLogStore);
  pSyncNode->pLogStore = NULL;

  syncLogBufferDestroy(pSyncNode->pLogBuf);
  pSyncNode->pLogBuf = NULL;

  raftCfgClose(pSyncNode->pRaftCfg);
  pSyncNode->pRaftCfg = NULL;

  syncNodeStopPingTimer(pSyncNode);
  syncNodeStopElectTimer(pSyncNode);
  syncNodeStopHeartbeatTimer(pSyncNode);

  if (pSyncNode->pFsm != NULL) {
    taosMemoryFree(pSyncNode->pFsm);
  }

  // snapshot senders: stop the ones still running, then destroy
  for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
    SSyncSnapshotSender* pSender = (pSyncNode->senders)[i];
    if (pSender == NULL) {
      continue;
    }
    sSTrace(pSender, "snapshot sender destroy while close, data:%p", pSender);

    if (snapshotSenderIsStart(pSender)) {
      snapshotSenderStop(pSender, false);
    }

    snapshotSenderDestroy(pSender);
    (pSyncNode->senders)[i] = NULL;
  }

  // snapshot receiver: force-stop if running, then destroy
  if (pSyncNode->pNewNodeReceiver != NULL) {
    if (snapshotReceiverIsStart(pSyncNode->pNewNodeReceiver)) {
      snapshotReceiverForceStop(pSyncNode->pNewNodeReceiver);
    }

    snapshotReceiverDestroy(pSyncNode->pNewNodeReceiver);
    pSyncNode->pNewNodeReceiver = NULL;
  }

  taosMemoryFree(pSyncNode);
}

M
Minghao Li 已提交
1312
// Return the snapshot strategy stored in the node's raft config.
// The node's pRaftCfg must be open; it is dereferenced without a check.
ESyncStrategy syncNodeStrategy(SSyncNode* pSyncNode) {
  return pSyncNode->pRaftCfg->snapshotStrategy;
}
M
Minghao Li 已提交
1313

M
Minghao Li 已提交
1314 1315 1316
// timer control --------------
// Arm the node's ping timer and sync its logic clock with the user clock.
// Only logs an error when the sync env is not initialized. Always returns 0.
int32_t syncNodeStartPingTimer(SSyncNode* pSyncNode) {
  if (!syncIsInit()) {
    sError("vgId:%d, start ping timer error, sync env is stop", pSyncNode->vgId);
    return 0;
  }

  taosTmrReset(pSyncNode->FpPingTimerCB, pSyncNode->pingTimerMS, pSyncNode, syncEnv()->pTimerManager,
               &pSyncNode->pPingTimer);
  atomic_store_64(&pSyncNode->pingTimerLogicClock, pSyncNode->pingTimerLogicClockUser);
  return 0;
}

// Stop the node's ping timer. The user logic clock is incremented before the
// timer is stopped. Always returns 0.
int32_t syncNodeStopPingTimer(SSyncNode* pSyncNode) {
  atomic_add_fetch_64(&pSyncNode->pingTimerLogicClockUser, 1);

  taosTmrStop(pSyncNode->pPingTimer);
  pSyncNode->pPingTimer = NULL;

  return 0;
}

// Arm the election timer to fire after `ms` milliseconds.
// The expected execution time and the current logic clock are recorded in
// electTimerParam before the timer is (re)set; the node's rid is what the
// timer callback receives. Only logs an error when the sync env is not
// initialized. Always returns 0.
int32_t syncNodeStartElectTimer(SSyncNode* pSyncNode, int32_t ms) {
  if (!syncIsInit()) {
    sError("vgId:%d, start elect timer error, sync env is stop", pSyncNode->vgId);
    return 0;
  }

  pSyncNode->electTimerMS = ms;

  int64_t execTime = taosGetTimestampMs() + ms;
  atomic_store_64(&(pSyncNode->electTimerParam.executeTime), execTime);
  atomic_store_64(&(pSyncNode->electTimerParam.logicClock), pSyncNode->electTimerLogicClock);
  pSyncNode->electTimerParam.pSyncNode = pSyncNode;
  pSyncNode->electTimerParam.pData = NULL;

  taosTmrReset(pSyncNode->FpElectTimerCB, pSyncNode->electTimerMS, (void*)(pSyncNode->rid), syncEnv()->pTimerManager,
               &pSyncNode->pElectTimer);
  return 0;
}

// Stop the election timer. The election logic clock is incremented before the
// timer is stopped. Always returns 0.
int32_t syncNodeStopElectTimer(SSyncNode* pSyncNode) {
  atomic_add_fetch_64(&pSyncNode->electTimerLogicClock, 1);

  taosTmrStop(pSyncNode->pElectTimer);
  pSyncNode->pElectTimer = NULL;

  return 0;
}

// Stop the election timer and arm it again with a timeout of `ms` milliseconds.
// The results of the stop/start calls are not propagated. Always returns 0.
int32_t syncNodeRestartElectTimer(SSyncNode* pSyncNode, int32_t ms) {
  syncNodeStopElectTimer(pSyncNode);
  syncNodeStartElectTimer(pSyncNode, ms);
  return 0;
}

M
Minghao Li 已提交
1371 1372
// Restart the election timer with a fresh timeout: TIMER_MAX_MS for a stand-by
// node, otherwise a random value drawn via syncUtilElectRandomMS from
// electBaseLine and 2 * electBaseLine.
// Returns the result of syncNodeRestartElectTimer.
int32_t syncNodeResetElectTimer(SSyncNode* pSyncNode) {
  int32_t electMS = pSyncNode->pRaftCfg->isStandBy
                        ? TIMER_MAX_MS
                        : syncUtilElectRandomMS(pSyncNode->electBaseLine, 2 * pSyncNode->electBaseLine);

  int32_t ret = syncNodeRestartElectTimer(pSyncNode, electMS);

  sNTrace(pSyncNode, "reset elect timer, min:%d, max:%d, ms:%d", pSyncNode->electBaseLine, 2 * pSyncNode->electBaseLine,
          electMS);
  return ret;
}

M
Minghao Li 已提交
1387
// Arm the node-wide heartbeat timer and sync its logic clock with the user
// clock. Logs an error instead when the sync env is not initialized; the trace
// line is emitted in both cases. Always returns 0.
static int32_t syncNodeDoStartHeartbeatTimer(SSyncNode* pSyncNode) {
  if (!syncIsInit()) {
    sError("vgId:%d, start heartbeat timer error, sync env is stop", pSyncNode->vgId);
  } else {
    taosTmrReset(pSyncNode->FpHeartbeatTimerCB, pSyncNode->heartbeatTimerMS, pSyncNode, syncEnv()->pTimerManager,
                 &pSyncNode->pHeartbeatTimer);
    atomic_store_64(&pSyncNode->heartbeatTimerLogicClock, pSyncNode->heartbeatTimerLogicClockUser);
  }

  sNTrace(pSyncNode, "start heartbeat timer, ms:%d", pSyncNode->heartbeatTimerMS);
  return 0;
}

M
Minghao Li 已提交
1401
// Start the per-peer heartbeat timer of every peer that has one.
// The node-wide heartbeat timer path is compiled out (#if 0).
// Results of the individual timer starts are not propagated. Always returns 0.
int32_t syncNodeStartHeartbeatTimer(SSyncNode* pSyncNode) {
  int32_t ret = 0;

#if 0
  pSyncNode->heartbeatTimerMS = pSyncNode->hbBaseLine;
  ret = syncNodeDoStartHeartbeatTimer(pSyncNode);
#endif

  for (int32_t peer = 0; peer < pSyncNode->peersNum; ++peer) {
    SSyncTimer* pHbTimer = syncNodeGetHbTimer(pSyncNode, &(pSyncNode->peersId[peer]));
    if (pHbTimer == NULL) {
      continue;
    }
    syncHbTimerStart(pSyncNode, pHbTimer);
  }

  return ret;
}

M
Minghao Li 已提交
1419 1420
// Stop the per-peer heartbeat timer of every peer that has one.
// The node-wide heartbeat timer path is compiled out (#if 0).
// Always returns 0.
int32_t syncNodeStopHeartbeatTimer(SSyncNode* pSyncNode) {
  int32_t ret = 0;

#if 0
  atomic_add_fetch_64(&pSyncNode->heartbeatTimerLogicClockUser, 1);
  taosTmrStop(pSyncNode->pHeartbeatTimer);
  pSyncNode->pHeartbeatTimer = NULL;
#endif

  for (int32_t peer = 0; peer < pSyncNode->peersNum; ++peer) {
    SSyncTimer* pHbTimer = syncNodeGetHbTimer(pSyncNode, &(pSyncNode->peersId[peer]));
    if (pHbTimer == NULL) {
      continue;
    }
    syncHbTimerStop(pSyncNode, pHbTimer);
  }

  return ret;
}

1438 1439 1440 1441 1442 1443
// Stop and then start the heartbeat timers of all peers.
// The results of both steps are not propagated. Always returns 0.
int32_t syncNodeRestartHeartbeatTimer(SSyncNode* pSyncNode) {
  (void)syncNodeStopHeartbeatTimer(pSyncNode);
  (void)syncNodeStartHeartbeatTimer(pSyncNode);
  return 0;
}

M
Minghao Li 已提交
1444 1445 1446
// utils --------------
// Send pMsg to the node identified by destRaftId through the node's send callback.
//
// Returns 0 after handing the message to the callback.
// Returns -1 when no send callback is installed; in that case the message
// content (pMsg->pCont) is freed here.
int32_t syncNodeSendMsgById(const SRaftId* destRaftId, SSyncNode* pSyncNode, SRpcMsg* pMsg) {
  SEpSet epSet;
  syncUtilRaftId2EpSet(destRaftId, &epSet);

  if (pSyncNode->syncSendMSg == NULL) {
    sError("vgId:%d, sync send msg by id error, fp-send-msg is null", pSyncNode->vgId);
    rpcFreeCont(pMsg->pCont);
    return -1;
  }

  // convert the message content to network byte order before sending
  syncUtilMsgHtoN(pMsg->pCont);

  pMsg->info.noResp = 1;
  pSyncNode->syncSendMSg(&epSet, pMsg);

  return 0;
}

// Send pMsg to the node described by nodeInfo through the node's send callback.
//
// Always returns 0, even when no send callback is installed (only an error is
// logged in that case).
// NOTE(review): unlike syncNodeSendMsgById, the failure path neither frees
// pMsg->pCont nor reports an error code - confirm against callers whether that
// is intended.
int32_t syncNodeSendMsgByInfo(const SNodeInfo* nodeInfo, SSyncNode* pSyncNode, SRpcMsg* pMsg) {
  SEpSet epSet;
  syncUtilNodeInfo2EpSet(nodeInfo, &epSet);

  if (pSyncNode->syncSendMSg == NULL) {
    sError("vgId:%d, sync send msg by info error, fp-send-msg is null", pSyncNode->vgId);
    return 0;
  }

  // convert the message content to network byte order before sending
  syncUtilMsgHtoN(pMsg->pCont);

  pMsg->info.noResp = 1;
  pSyncNode->syncSendMSg(&epSet, pMsg);

  return 0;
}

1478
// Tell whether this node is a member of the given configuration.
//
// Membership is determined twice: once by comparing fqdn and port with the
// node's own node info, once by comparing raft ids. Both answers are asserted
// to agree, and that answer is returned.
inline bool syncNodeInConfig(SSyncNode* pSyncNode, const SSyncCfg* config) {
  bool foundByEndpoint = false;
  bool foundByRaftId = false;

  // look for my fqdn:port among the replicas
  for (int32_t i = 0; i < config->replicaNum && !foundByEndpoint; ++i) {
    const SNodeInfo* pInfo = &config->nodeInfo[i];
    if (strcmp(pInfo->nodeFqdn, pSyncNode->myNodeInfo.nodeFqdn) == 0 &&
        pInfo->nodePort == pSyncNode->myNodeInfo.nodePort) {
      foundByEndpoint = true;
    }
  }

  // look for my raft id among the replicas
  for (int32_t i = 0; i < config->replicaNum && !foundByRaftId; ++i) {
    const SNodeInfo* pInfo = &config->nodeInfo[i];

    SRaftId candidateId;
    candidateId.addr = syncUtilAddr2U64(pInfo->nodeFqdn, pInfo->nodePort);
    candidateId.vgId = pSyncNode->vgId;

    if (syncUtilSameId(&candidateId, &(pSyncNode->myRaftId))) {
      foundByRaftId = true;
    }
  }

  tAssert(foundByEndpoint == foundByRaftId);
  return foundByEndpoint;
}

1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517
// Compare two configurations.
// Returns true when the replica count, my index, or the fqdn/port of any
// replica (compared position by position) differs; false otherwise.
static bool syncIsConfigChanged(const SSyncCfg* pOldCfg, const SSyncCfg* pNewCfg) {
  if (pOldCfg->replicaNum != pNewCfg->replicaNum || pOldCfg->myIndex != pNewCfg->myIndex) {
    return true;
  }

  for (int32_t idx = 0; idx < pOldCfg->replicaNum; ++idx) {
    const SNodeInfo* pBefore = &pOldCfg->nodeInfo[idx];
    const SNodeInfo* pAfter = &pNewCfg->nodeInfo[idx];

    if (strcmp(pBefore->nodeFqdn, pAfter->nodeFqdn) != 0 || pBefore->nodePort != pAfter->nodePort) {
      return true;
    }
  }

  return false;
}

M
Minghao Li 已提交
1518
// Install a new membership configuration on this node and persist it.
//
// pSyncNode             node whose configuration is replaced
// pNewConfig            configuration to install; it is copied into pRaftCfg->cfg
// lastConfigChangeIndex index recorded as pRaftCfg->lastConfigIndex and added to
//                       the list of config indexes
//
// Nothing happens when the new configuration equals the current one.
// When this node is part of the new configuration, its peer/replica tables,
// quorum, index managers, vote managers and snapshot senders are rebuilt, the
// configuration is persisted, and the node re-enters its role (leader stays
// leader and appends a noop, anything else becomes follower).
// When this node is not part of the new configuration, only the configuration
// is persisted.
void syncNodeDoConfigChange(SSyncNode* pSyncNode, SSyncCfg* pNewConfig, SyncIndex lastConfigChangeIndex) {
  SSyncCfg oldConfig = pSyncNode->pRaftCfg->cfg;
  if (!syncIsConfigChanged(&oldConfig, pNewConfig)) {
    sInfo("vgId:%d, sync not reconfig since not changed", pSyncNode->vgId);
    return;
  }

  pSyncNode->pRaftCfg->cfg = *pNewConfig;
  pSyncNode->pRaftCfg->lastConfigIndex = lastConfigChangeIndex;

  pSyncNode->configChangeNum++;

  // membership is evaluated with the node identity from before the change
  bool IamInOld = syncNodeInConfig(pSyncNode, &oldConfig);
  bool IamInNew = syncNodeInConfig(pSyncNode, pNewConfig);

  // this node is dropped when it was a member before and is not one any more
  bool isDrop = IamInOld && !IamInNew;

  // log begin config change
  char oldCfgStr[1024] = {0};
  char newCfgStr[1024] = {0};
  syncCfg2SimpleStr(&oldConfig, oldCfgStr, sizeof(oldCfgStr));
  syncCfg2SimpleStr(pNewConfig, newCfgStr, sizeof(newCfgStr));
  sNInfo(pSyncNode, "begin do config change, from %s to %s", oldCfgStr, newCfgStr);

  if (IamInNew) {
    pSyncNode->pRaftCfg->isStandBy = 0;  // change isStandBy to normal
  }
  if (isDrop) {
    pSyncNode->pRaftCfg->isStandBy = 1;  // set standby
  }

  // add last config index
  raftCfgAddConfigIndex(pSyncNode->pRaftCfg, lastConfigChangeIndex);

  if (IamInNew) {
    // save snapshot senders together with the replica ids they belonged to
    SRaftId oldReplicasId[TSDB_MAX_REPLICA];
    memcpy(oldReplicasId, pSyncNode->replicasId, sizeof(oldReplicasId));
    SSyncSnapshotSender* oldSenders[TSDB_MAX_REPLICA];
    for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
      oldSenders[i] = (pSyncNode->senders)[i];
      sSTrace(oldSenders[i], "snapshot sender save old");
    }

    // init internal
    pSyncNode->myNodeInfo = pSyncNode->pRaftCfg->cfg.nodeInfo[pSyncNode->pRaftCfg->cfg.myIndex];
    syncUtilNodeInfo2RaftId(&pSyncNode->myNodeInfo, pSyncNode->vgId, &pSyncNode->myRaftId);

    // init peersNum, peers, peersId
    pSyncNode->peersNum = pSyncNode->pRaftCfg->cfg.replicaNum - 1;
    int32_t peerIdx = 0;
    for (int32_t i = 0; i < pSyncNode->pRaftCfg->cfg.replicaNum; ++i) {
      if (i != pSyncNode->pRaftCfg->cfg.myIndex) {
        pSyncNode->peersNodeInfo[peerIdx] = pSyncNode->pRaftCfg->cfg.nodeInfo[i];
        peerIdx++;
      }
    }
    for (int32_t i = 0; i < pSyncNode->peersNum; ++i) {
      syncUtilNodeInfo2RaftId(&pSyncNode->peersNodeInfo[i], pSyncNode->vgId, &pSyncNode->peersId[i]);
    }

    // init replicaNum, replicasId
    pSyncNode->replicaNum = pSyncNode->pRaftCfg->cfg.replicaNum;
    for (int32_t i = 0; i < pSyncNode->pRaftCfg->cfg.replicaNum; ++i) {
      syncUtilNodeInfo2RaftId(&pSyncNode->pRaftCfg->cfg.nodeInfo[i], pSyncNode->vgId, &pSyncNode->replicasId[i]);
    }

    // update quorum first
    pSyncNode->quorum = syncUtilQuorum(pSyncNode->pRaftCfg->cfg.replicaNum);

    syncIndexMgrUpdate(pSyncNode->pNextIndex, pSyncNode);
    syncIndexMgrUpdate(pSyncNode->pMatchIndex, pSyncNode);
    voteGrantedUpdate(pSyncNode->pVotesGranted, pSyncNode);
    votesRespondUpdate(pSyncNode->pVotesRespond, pSyncNode);

    // reset snapshot senders

    // clear new
    for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
      (pSyncNode->senders)[i] = NULL;
    }

    // reset new: reuse the old sender of every replica that is still a member
    for (int32_t i = 0; i < pSyncNode->replicaNum; ++i) {
      bool reset = false;
      for (int32_t oldIdx = 0; oldIdx < TSDB_MAX_REPLICA; ++oldIdx) {
        if (syncUtilSameId(&(pSyncNode->replicasId)[i], &oldReplicasId[oldIdx]) && oldSenders[oldIdx] != NULL) {
          char     host[128];
          uint16_t port;
          syncUtilU642Addr((pSyncNode->replicasId)[i].addr, host, sizeof(host), &port);
          sNTrace(pSyncNode, "snapshot sender reset for: %" PRId64 ", newIndex:%d, %s:%d, %p",
                  (pSyncNode->replicasId)[i].addr, i, host, port, oldSenders[oldIdx]);

          // ownership moves to the new slot, so the old slot must not be freed below
          (pSyncNode->senders)[i] = oldSenders[oldIdx];
          oldSenders[oldIdx] = NULL;
          reset = true;

          // reset replicaIndex
          int32_t oldreplicaIndex = (pSyncNode->senders)[i]->replicaIndex;
          (pSyncNode->senders)[i]->replicaIndex = i;

          sNTrace(pSyncNode, "snapshot sender udpate replicaIndex from %d to %d, %s:%d, %p, reset:%d", oldreplicaIndex,
                  i, host, port, (pSyncNode->senders)[i], reset);

          break;
        }
      }
    }

    // create new
    for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
      if ((pSyncNode->senders)[i] == NULL) {
        (pSyncNode->senders)[i] = snapshotSenderCreate(pSyncNode, i);
        sSTrace((pSyncNode->senders)[i], "snapshot sender create new while reconfig, data:%p", (pSyncNode->senders)[i]);
      } else {
        sSTrace((pSyncNode->senders)[i], "snapshot sender already exist, data:%p", (pSyncNode->senders)[i]);
      }
    }

    // free old: whatever was not reused above
    for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
      if (oldSenders[i] != NULL) {
        sNTrace(pSyncNode, "snapshot sender destroy old, data:%p replica-index:%d", oldSenders[i], i);
        snapshotSenderDestroy(oldSenders[i]);
        oldSenders[i] = NULL;
      }
    }

    // persist cfg
    raftCfgPersist(pSyncNode->pRaftCfg);

    char tmpbuf[1024] = {0};
    snprintf(tmpbuf, sizeof(tmpbuf), "config change from %d to %d, index:%" PRId64 ", %s  -->  %s",
             oldConfig.replicaNum, pNewConfig->replicaNum, lastConfigChangeIndex, oldCfgStr, newCfgStr);

    // change isStandBy to normal (election timeout)
    if (pSyncNode->state == TAOS_SYNC_STATE_LEADER) {
      syncNodeBecomeLeader(pSyncNode, tmpbuf);

      // Raft 3.6.2 Committing entries from previous terms
      syncNodeAppendNoop(pSyncNode);
      // syncMaybeAdvanceCommitIndex(pSyncNode);

    } else {
      syncNodeBecomeFollower(pSyncNode, tmpbuf);
    }
  } else {
    // persist cfg
    raftCfgPersist(pSyncNode->pRaftCfg);
    sNInfo(pSyncNode, "do not config change from %d to %d, index:%" PRId64 ", %s  -->  %s", oldConfig.replicaNum,
           pNewConfig->replicaNum, lastConfigChangeIndex, oldCfgStr, newCfgStr);
  }

  // log end config change
  sNInfo(pSyncNode, "end do config change, from %s to %s", oldCfgStr, newCfgStr);
}

M
Minghao Li 已提交
1694 1695 1696 1697
// raft state change --------------
// Adopt a newer term: store it, become follower and clear the recorded vote.
// Terms that are not greater than the current term are ignored.
void syncNodeUpdateTerm(SSyncNode* pSyncNode, SyncTerm term) {
  if (term <= pSyncNode->pRaftStore->currentTerm) {
    return;
  }

  raftStoreSetTerm(pSyncNode->pRaftStore, term);

  char reason[64];
  snprintf(reason, sizeof(reason), "update term to %" PRId64, term);
  syncNodeBecomeFollower(pSyncNode, reason);

  raftStoreClearVote(pSyncNode->pRaftStore);
}

1705 1706 1707 1708 1709 1710
// Store a newer term without changing the node's role or vote.
// Terms that are not greater than the current term are ignored.
void syncNodeUpdateTermWithoutStepDown(SSyncNode* pSyncNode, SyncTerm term) {
  if (term <= pSyncNode->pRaftStore->currentTerm) {
    return;
  }

  raftStoreSetTerm(pSyncNode->pRaftStore, term);
}

M
Minghao Li 已提交
1711
// Step down in reaction to newTerm.
//  - newTerm older than the current term: ignored (trace only)
//  - newTerm newer than the current term: store it, become follower, clear vote
//  - newTerm equal to the current term: become follower unless already one
void syncNodeStepDown(SSyncNode* pSyncNode, SyncTerm newTerm) {
  if (pSyncNode->pRaftStore->currentTerm > newTerm) {
    sNTrace(pSyncNode, "step down, ignore, new-term:%" PRId64 ", current-term:%" PRId64, newTerm,
            pSyncNode->pRaftStore->currentTerm);
    return;
  }

  sNTrace(pSyncNode, "step down, new-term:%" PRId64 ", current-term:%" PRId64, newTerm,
          pSyncNode->pRaftStore->currentTerm);

  if (pSyncNode->pRaftStore->currentTerm < newTerm) {
    raftStoreSetTerm(pSyncNode->pRaftStore, newTerm);

    char reason[64];
    snprintf(reason, sizeof(reason), "step down, update term to %" PRId64, newTerm);
    syncNodeBecomeFollower(pSyncNode, reason);

    raftStoreClearVote(pSyncNode->pRaftStore);
    return;
  }

  // same term: only the role may need to change
  if (pSyncNode->state != TAOS_SYNC_STATE_FOLLOWER) {
    syncNodeBecomeFollower(pSyncNode, "step down");
  }
}

1737 1738
// Clean the node's response manager via syncRespCleanRsp.
void syncNodeLeaderChangeRsp(SSyncNode* pSyncNode) {
  syncRespCleanRsp(pSyncNode->pSyncRespMgr);
}

1739
// Put this node into the follower state.
// debugStr is only used in the trace line written at the end.
void syncNodeBecomeFollower(SSyncNode* pSyncNode, const char* debugStr) {
  // a node leaving the leader role forgets the cached leader
  if (TAOS_SYNC_STATE_LEADER == pSyncNode->state) {
    pSyncNode->leaderCache = EMPTY_RAFT_ID;
  }

  pSyncNode->hbSlowNum = 0;

  // switch state and stop sending heartbeats
  pSyncNode->state = TAOS_SYNC_STATE_FOLLOWER;
  syncNodeStopHeartbeatTimer(pSyncNode);

  // reset elect timer
  syncNodeResetElectTimer(pSyncNode);

  // send rsp to client
  syncNodeLeaderChangeRsp(pSyncNode);

  // notify the state machine when it registered a callback
  if (pSyncNode->pFsm != NULL && pSyncNode->pFsm->FpBecomeFollowerCb != NULL) {
    pSyncNode->pFsm->FpBecomeFollowerCb(pSyncNode->pFsm);
  }

  // min match index
  pSyncNode->minMatchIndex = SYNC_INDEX_INVALID;

  // reset log buffer
  syncLogBufferReset(pSyncNode->pLogBuf, pSyncNode);

  // trace log
  sNTrace(pSyncNode, "become follower %s", debugStr);
}

// TLA+ Spec
// \* Candidate i transitions to leader.
// BecomeLeader(i) ==
//     /\ state[i] = Candidate
//     /\ votesGranted[i] \in Quorum
//     /\ state'      = [state EXCEPT ![i] = Leader]
//     /\ nextIndex'  = [nextIndex EXCEPT ![i] =
//                          [j \in Server |-> Len(log[i]) + 1]]
//     /\ matchIndex' = [matchIndex EXCEPT ![i] =
//                          [j \in Server |-> 0]]
//     /\ elections'  = elections \cup
//                          {[eterm     |-> currentTerm[i],
//                            eleader   |-> i,
//                            elog      |-> log[i],
//                            evotes    |-> votesGranted[i],
//                            evoterLog |-> voterLog[i]]}
//     /\ UNCHANGED <<messages, currentTerm, votedFor, candidateVars, logVars>>
//
1790
// Put this node into the leader state.
// debugStr is only used in the info line written at the end.
void syncNodeBecomeLeader(SSyncNode* pSyncNode, const char* debugStr) {
  pSyncNode->leaderTime = taosGetTimestampMs();

  pSyncNode->becomeLeaderNum++;
  pSyncNode->hbrSlowNum = 0;

  // reset restoreFinish
  pSyncNode->restoreFinish = false;

  // state change
  pSyncNode->state = TAOS_SYNC_STATE_LEADER;

  // set leader cache
  pSyncNode->leaderCache = pSyncNode->myRaftId;

  // next index of every replica (my own slot included, which is harmless)
  // starts right after the last index of snapshot and log
  for (int32_t replicaIdx = 0; replicaIdx < pSyncNode->pNextIndex->replicaNum; ++replicaIdx) {
    // pSyncNode->pNextIndex->index[i] = pSyncNode->pLogStore->getLastIndex(pSyncNode->pLogStore) + 1;

    // maybe wal is deleted
    SyncIndex lastIndex;
    SyncTerm  lastTerm;
    int32_t   code = syncNodeGetLastIndexTerm(pSyncNode, &lastIndex, &lastTerm);
    tAssert(code == 0);
    pSyncNode->pNextIndex->index[replicaIdx] = lastIndex + 1;
  }

  // match index of every replica (my own slot included, which is harmless)
  // starts as invalid
  for (int32_t replicaIdx = 0; replicaIdx < pSyncNode->pMatchIndex->replicaNum; ++replicaIdx) {
    pSyncNode->pMatchIndex->index[replicaIdx] = SYNC_INDEX_INVALID;
  }

  // init peer mgr
  syncNodePeerStateInit(pSyncNode);

#if 0
  // update sender private term
  SSyncSnapshotSender* pMySender = syncNodeGetSnapshotSender(pSyncNode, &(pSyncNode->myRaftId));
  if (pMySender != NULL) {
    for (int32_t i = 0; i < pSyncNode->pMatchIndex->replicaNum; ++i) {
      if ((pSyncNode->senders)[i]->privateTerm > pMySender->privateTerm) {
        pMySender->privateTerm = (pSyncNode->senders)[i]->privateTerm;
      }
    }
    (pMySender->privateTerm) += 100;
  }
#endif

  // close receiver
  if (snapshotReceiverIsStart(pSyncNode->pNewNodeReceiver)) {
    snapshotReceiverForceStop(pSyncNode->pNewNodeReceiver);
  }

  // stop elect timer
  syncNodeStopElectTimer(pSyncNode);

  // start heartbeat timer
  syncNodeStartHeartbeatTimer(pSyncNode);

  // send heartbeat right now
  syncNodeHeartbeatPeers(pSyncNode);

  // notify the state machine when it registered a callback
  if (pSyncNode->pFsm != NULL && pSyncNode->pFsm->FpBecomeLeaderCb != NULL) {
    pSyncNode->pFsm->FpBecomeLeaderCb(pSyncNode->pFsm);
  }

  // min match index
  pSyncNode->minMatchIndex = SYNC_INDEX_INVALID;

  // reset log buffer
  syncLogBufferReset(pSyncNode->pLogBuf, pSyncNode);

  // trace log
  sNInfo(pSyncNode, "become leader %s", debugStr);
}

// Candidate -> leader transition.
// Asserts that the node is a candidate holding a majority of votes, becomes
// leader, appends a noop entry (a failure is only logged) and logs the result.
void syncNodeCandidate2Leader(SSyncNode* pSyncNode) {
  tAssert(pSyncNode->state == TAOS_SYNC_STATE_CANDIDATE);
  tAssert(voteGrantedMajority(pSyncNode->pVotesGranted));

  syncNodeBecomeLeader(pSyncNode, "candidate to leader");
  sNTrace(pSyncNode, "state change syncNodeCandidate2Leader");

  if (syncNodeAppendNoop(pSyncNode) < 0) {
    sError("vgId:%d, failed to append noop entry since %s", pSyncNode->vgId, terrstr());
  }

  SyncIndex logLastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);
  tAssert(logLastIndex >= 0);
  sInfo("vgId:%d, become leader. term: %" PRId64 ", commit index: %" PRId64 ", last index: %" PRId64 "",
        pSyncNode->vgId, pSyncNode->pRaftStore->currentTerm, pSyncNode->commitIndex, logLastIndex);
}

// Older candidate -> leader transition.
// Asserts that the node is a candidate holding a majority of votes, becomes
// leader, appends a noop entry, tries to advance the commit index and, when
// there is more than one replica, starts replication.
void syncNodeCandidate2LeaderOld(SSyncNode* pSyncNode) {
  tAssert(pSyncNode->state == TAOS_SYNC_STATE_CANDIDATE);
  tAssert(voteGrantedMajority(pSyncNode->pVotesGranted));

  syncNodeBecomeLeader(pSyncNode, "candidate to leader");

  // Raft 3.6.2 Committing entries from previous terms
  syncNodeAppendNoop(pSyncNode);
  syncMaybeAdvanceCommitIndex(pSyncNode);

  if (pSyncNode->replicaNum <= 1) {
    // single replica: nobody to replicate to
    return;
  }

  syncNodeReplicate(pSyncNode);
}

M
Minghao Li 已提交
1902 1903
// A node is treated as the mnode when its vgId is 1.
bool syncNodeIsMnode(SSyncNode* pSyncNode) {
  return 1 == pSyncNode->vgId;
}

M
Minghao Li 已提交
1904
// Reset the send bookkeeping of all TSDB_MAX_REPLICA peer-state slots:
// last send index becomes invalid, last send time becomes 0. Always returns 0.
int32_t syncNodePeerStateInit(SSyncNode* pSyncNode) {
  int32_t slot = 0;
  while (slot < TSDB_MAX_REPLICA) {
    pSyncNode->peerStates[slot].lastSendIndex = SYNC_INDEX_INVALID;
    pSyncNode->peerStates[slot].lastSendTime = 0;
    ++slot;
  }

  return 0;
}

// Follower -> candidate transition.
// Asserts that the node is a follower, switches the state and logs term,
// commit index and last log index.
void syncNodeFollower2Candidate(SSyncNode* pSyncNode) {
  tAssert(pSyncNode->state == TAOS_SYNC_STATE_FOLLOWER);
  pSyncNode->state = TAOS_SYNC_STATE_CANDIDATE;

  SyncIndex logLastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);
  sInfo("vgId:%d, become candidate from follower. term: %" PRId64 ", commit index: %" PRId64 ", last index: %" PRId64,
        pSyncNode->vgId, pSyncNode->pRaftStore->currentTerm, pSyncNode->commitIndex, logLastIndex);

  sNTrace(pSyncNode, "follower to candidate");
}

// Leader -> follower transition.
// Asserts that the node is a leader, becomes follower and logs term, commit
// index and last log index.
void syncNodeLeader2Follower(SSyncNode* pSyncNode) {
  tAssert(pSyncNode->state == TAOS_SYNC_STATE_LEADER);
  syncNodeBecomeFollower(pSyncNode, "leader to follower");

  SyncIndex logLastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);
  sInfo("vgId:%d, become follower from leader. term: %" PRId64 ", commit index: %" PRId64 ", last index: %" PRId64,
        pSyncNode->vgId, pSyncNode->pRaftStore->currentTerm, pSyncNode->commitIndex, logLastIndex);

  sNTrace(pSyncNode, "leader to follower");
}

// Candidate -> follower transition.
// Asserts that the node is a candidate, becomes follower and logs term, commit
// index and last log index.
void syncNodeCandidate2Follower(SSyncNode* pSyncNode) {
  tAssert(pSyncNode->state == TAOS_SYNC_STATE_CANDIDATE);
  syncNodeBecomeFollower(pSyncNode, "candidate to follower");

  SyncIndex logLastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);
  sInfo("vgId:%d, become follower from candidate. term: %" PRId64 ", commit index: %" PRId64 ", last index: %" PRId64,
        pSyncNode->vgId, pSyncNode->pRaftStore->currentTerm, pSyncNode->commitIndex, logLastIndex);

  sNTrace(pSyncNode, "candidate to follower");
}

M
Minghao Li 已提交
1943 1944
// just called by syncNodeVoteForSelf
// need assert
M
Minghao Li 已提交
1945
// Record a vote for pRaftId in the raft store.
// Asserts that term is the current term and that no vote has been cast yet.
void syncNodeVoteForTerm(SSyncNode* pSyncNode, SyncTerm term, SRaftId* pRaftId) {
  tAssert(pSyncNode->pRaftStore->currentTerm == term);
  tAssert(!raftStoreHasVoted(pSyncNode->pRaftStore));

  raftStoreVote(pSyncNode->pRaftStore, pRaftId);
}

M
Minghao Li 已提交
1952
// simulate get vote from outside
M
Minghao Li 已提交
1953
// Vote for myself in the current term and feed a locally built, granted
// request-vote reply (from me to me) into the vote managers, as if it had
// arrived from outside. The reply is freed before returning.
// Returns early, after the vote was recorded, when the reply cannot be built.
void syncNodeVoteForSelf(SSyncNode* pSyncNode) {
  syncNodeVoteForTerm(pSyncNode, pSyncNode->pRaftStore->currentTerm, &pSyncNode->myRaftId);

  SRpcMsg rpcMsg = {0};
  if (syncBuildRequestVoteReply(&rpcMsg, pSyncNode->vgId) != 0) {
    return;
  }

  SyncRequestVoteReply* pReply = rpcMsg.pCont;
  pReply->srcId = pSyncNode->myRaftId;
  pReply->destId = pSyncNode->myRaftId;
  pReply->term = pSyncNode->pRaftStore->currentTerm;
  pReply->voteGranted = true;

  voteGrantedVote(pSyncNode->pVotesGranted, pReply);
  votesRespondAdd(pSyncNode->pVotesRespond, pReply);

  rpcFreeCont(rpcMsg.pCont);
}

M
Minghao Li 已提交
1971
// return if has a snapshot
M
Minghao Li 已提交
1972 1973
// Returns true when the state machine provides snapshot info and the reported
// lastApplyIndex is at least SYNC_INDEX_BEGIN; false otherwise.
bool syncNodeHasSnapshot(SSyncNode* pSyncNode) {
  if (pSyncNode->pFsm->FpGetSnapshotInfo == NULL) {
    return false;
  }

  SSnapshot snapshot = {.data = NULL, .lastApplyIndex = -1, .lastApplyTerm = 0, .lastConfigIndex = -1};
  pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot);

  return snapshot.lastApplyIndex >= SYNC_INDEX_BEGIN;
}

M
Minghao Li 已提交
1984 1985
// return max(logLastIndex, snapshotLastIndex)
// if no snapshot and log, return -1
1986
SyncIndex syncNodeGetLastIndex(const SSyncNode* pSyncNode) {
M
Minghao Li 已提交
1987
  SSnapshot snapshot = {.data = NULL, .lastApplyIndex = -1, .lastApplyTerm = 0, .lastConfigIndex = -1};
1988 1989
  if (pSyncNode->pFsm->FpGetSnapshotInfo != NULL) {
    pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot);
M
Minghao Li 已提交
1990 1991 1992 1993 1994 1995 1996
  }
  SyncIndex logLastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);

  SyncIndex lastIndex = logLastIndex > snapshot.lastApplyIndex ? logLastIndex : snapshot.lastApplyIndex;
  return lastIndex;
}

M
Minghao Li 已提交
1997 1998
// return the last term of snapshot and log
// if error, return SYNC_TERM_INVALID (by syncLogLastTerm)
M
Minghao Li 已提交
1999 2000
// Returns the term that belongs to the last index of snapshot and log:
//  - no snapshot: the log's last term
//  - snapshot present and log reaches beyond it: the log's last term
//  - otherwise: the snapshot's lastApplyTerm
SyncTerm syncNodeGetLastTerm(SSyncNode* pSyncNode) {
  if (!syncNodeHasSnapshot(pSyncNode)) {
    // no snapshot
    return pSyncNode->pLogStore->syncLogLastTerm(pSyncNode->pLogStore);
  }

  // has snapshot
  SSnapshot snapshot = {.data = NULL, .lastApplyIndex = -1, .lastApplyTerm = 0, .lastConfigIndex = -1};
  if (pSyncNode->pFsm->FpGetSnapshotInfo != NULL) {
    pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot);
  }

  SyncTerm  lastTerm = snapshot.lastApplyTerm;
  SyncIndex logLastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);
  if (logLastIndex > snapshot.lastApplyIndex) {
    lastTerm = pSyncNode->pLogStore->syncLogLastTerm(pSyncNode->pLogStore);
  }

  return lastTerm;
}

// get last index and term along with snapshot
// Write the last index and last term (snapshot taken into account) to the
// output parameters. Always returns 0.
int32_t syncNodeGetLastIndexTerm(SSyncNode* pSyncNode, SyncIndex* pLastIndex, SyncTerm* pLastTerm) {
  SyncIndex lastIndex = syncNodeGetLastIndex(pSyncNode);
  *pLastIndex = lastIndex;

  SyncTerm lastTerm = syncNodeGetLastTerm(pSyncNode);
  *pLastTerm = lastTerm;

  return 0;
}
M
Minghao Li 已提交
2029

M
Minghao Li 已提交
2030
// return append-entries first try index
M
Minghao Li 已提交
2031 2032 2033 2034 2035
// Returns the index right after the node's last index.
SyncIndex syncNodeSyncStartIndex(SSyncNode* pSyncNode) {
  return syncNodeGetLastIndex(pSyncNode) + 1;
}

M
Minghao Li 已提交
2036 2037
// if index > 0, return index - 1
// else, return -1
2038 2039 2040 2041 2042 2043 2044 2045 2046
// Returns index - 1, but never less than SYNC_INDEX_INVALID.
// pSyncNode is not used.
SyncIndex syncNodeGetPreIndex(SSyncNode* pSyncNode, SyncIndex index) {
  SyncIndex preIndex = index - 1;
  return (preIndex < SYNC_INDEX_INVALID) ? SYNC_INDEX_INVALID : preIndex;
}

M
Minghao Li 已提交
2047 2048 2049 2050
// if index < 0, return SYNC_TERM_INVALID
// if index == 0, return 0
// if index > 0, return preTerm
// if error, return SYNC_TERM_INVALID
2051 2052 2053 2054 2055 2056 2057 2058 2059
// Returns the term of the entry right before index.
//  - index below SYNC_INDEX_BEGIN: SYNC_TERM_INVALID
//  - index equal to SYNC_INDEX_BEGIN: 0
//  - otherwise the term of entry index-1, taken from the log-store cache or,
//    on a cache miss, from the log store; when the entry cannot be read and
//    the snapshot ends exactly at index-1, the snapshot's lastApplyTerm
//  - SYNC_TERM_INVALID when none of the above yields a term
SyncTerm syncNodeGetPreTerm(SSyncNode* pSyncNode, SyncIndex index) {
  if (index < SYNC_INDEX_BEGIN) return SYNC_TERM_INVALID;
  if (index == SYNC_INDEX_BEGIN) return 0;

  SyncIndex       preIndex = index - 1;
  SSyncRaftEntry* pPreEntry = NULL;
  SLRUCache*      pCache = pSyncNode->pLogStore->pCache;
  LRUHandle*      hCache = taosLRUCacheLookup(pCache, &preIndex, sizeof(preIndex));
  int32_t         code = 0;

  if (hCache == NULL) {
    pSyncNode->pLogStore->cacheMiss++;
    sNTrace(pSyncNode, "miss cache index:%" PRId64, preIndex);

    code = pSyncNode->pLogStore->syncLogGetEntry(pSyncNode->pLogStore, preIndex, &pPreEntry);
  } else {
    pPreEntry = (SSyncRaftEntry*)taosLRUCacheValue(pCache, hCache);

    pSyncNode->pLogStore->cacheHit++;
    sNTrace(pSyncNode, "hit cache index:%" PRId64 ", bytes:%u, %p", preIndex, pPreEntry->bytes, pPreEntry);
  }

  if (code == 0) {
    tAssert(pPreEntry != NULL);
    SyncTerm preTerm = pPreEntry->term;

    // a cached entry is handed back to the cache, a loaded one is destroyed
    if (hCache != NULL) {
      taosLRUCacheRelease(pCache, hCache, false);
    } else {
      syncEntryDestroy(pPreEntry);
    }

    return preTerm;
  }

  // the entry could not be read, fall back to the snapshot
  SSnapshot snapshot = {.data = NULL,
                        .lastApplyIndex = SYNC_INDEX_INVALID,
                        .lastApplyTerm = SYNC_TERM_INVALID,
                        .lastConfigIndex = SYNC_INDEX_INVALID};

  if (pSyncNode->pFsm->FpGetSnapshotInfo != NULL) {
    pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot);
    if (snapshot.lastApplyIndex == preIndex) {
      return snapshot.lastApplyTerm;
    }
  }

  sNError(pSyncNode, "sync node get pre term error, index:%" PRId64 ", snap-index:%" PRId64 ", snap-term:%" PRId64,
          index, snapshot.lastApplyIndex, snapshot.lastApplyTerm);
  return SYNC_TERM_INVALID;
}
M
Minghao Li 已提交
2110 2111 2112 2113

// get pre index and term of "index"
// Write the index and term of the entry right before index to the output
// parameters. Always returns 0; an invalid term is reported through *pPreTerm.
int32_t syncNodeGetPreIndexTerm(SSyncNode* pSyncNode, SyncIndex index, SyncIndex* pPreIndex, SyncTerm* pPreTerm) {
  SyncIndex preIndex = syncNodeGetPreIndex(pSyncNode, index);
  *pPreIndex = preIndex;

  SyncTerm preTerm = syncNodeGetPreTerm(pSyncNode, index);
  *pPreTerm = preTerm;

  return 0;
}

M
Minghao Li 已提交
2118
static void syncNodeEqPingTimer(void* param, void* tmrId) {
S
Shengliang Guan 已提交
2119
  if (!syncIsInit()) return;
M
Minghao Li 已提交
2120

S
Shengliang Guan 已提交
2121 2122 2123
  SSyncNode* pNode = param;
  if (atomic_load_64(&pNode->pingTimerLogicClockUser) <= atomic_load_64(&pNode->pingTimerLogicClock)) {
    SRpcMsg rpcMsg = {0};
S
Shengliang Guan 已提交
2124
    int32_t code = syncBuildTimeout(&rpcMsg, SYNC_TIMEOUT_PING, atomic_load_64(&pNode->pingTimerLogicClock),
S
Shengliang Guan 已提交
2125 2126
                                    pNode->pingTimerMS, pNode);
    if (code != 0) {
M
Minghao Li 已提交
2127
      sError("failed to build ping msg");
S
Shengliang Guan 已提交
2128 2129
      rpcFreeCont(rpcMsg.pCont);
      return;
M
Minghao Li 已提交
2130
    }
M
Minghao Li 已提交
2131

M
Minghao Li 已提交
2132
    // sTrace("enqueue ping msg");
S
Shengliang Guan 已提交
2133 2134
    code = pNode->syncEqMsg(pNode->msgcb, &rpcMsg);
    if (code != 0) {
M
Minghao Li 已提交
2135
      sError("failed to sync enqueue ping msg since %s", terrstr());
S
Shengliang Guan 已提交
2136 2137
      rpcFreeCont(rpcMsg.pCont);
      return;
2138
    }
M
Minghao Li 已提交
2139

S
Shengliang Guan 已提交
2140
    taosTmrReset(syncNodeEqPingTimer, pNode->pingTimerMS, pNode, syncEnv()->pTimerManager, &pNode->pPingTimer);
2141
  }
M
Minghao Li 已提交
2142 2143
}

M
Minghao Li 已提交
2144
static void syncNodeEqElectTimer(void* param, void* tmrId) {
S
Shengliang Guan 已提交
2145
  if (!syncIsInit()) return;
M
Minghao Li 已提交
2146

M
Minghao Li 已提交
2147 2148
  int64_t    rid = (int64_t)param;
  SSyncNode* pNode = syncNodeAcquire(rid);
M
Minghao Li 已提交
2149

2150
  if (pNode == NULL) return;
M
Minghao Li 已提交
2151 2152 2153 2154 2155

  if (pNode->syncEqMsg == NULL) {
    syncNodeRelease(pNode);
    return;
  }
2156

2157
  int64_t tsNow = taosGetTimestampMs();
M
Minghao Li 已提交
2158 2159 2160 2161
  if (tsNow < pNode->electTimerParam.executeTime) {
    syncNodeRelease(pNode);
    return;
  }
M
Minghao Li 已提交
2162

S
Shengliang Guan 已提交
2163
  SRpcMsg rpcMsg = {0};
2164 2165
  int32_t code =
      syncBuildTimeout(&rpcMsg, SYNC_TIMEOUT_ELECTION, pNode->electTimerParam.logicClock, pNode->electTimerMS, pNode);
S
Shengliang Guan 已提交
2166

S
Shengliang Guan 已提交
2167
  if (code != 0) {
M
Minghao Li 已提交
2168
    sError("failed to build elect msg");
M
Minghao Li 已提交
2169
    syncNodeRelease(pNode);
S
Shengliang Guan 已提交
2170
    return;
M
Minghao Li 已提交
2171 2172
  }

S
Shengliang Guan 已提交
2173
  SyncTimeout* pTimeout = rpcMsg.pCont;
S
Shengliang Guan 已提交
2174
  sNTrace(pNode, "enqueue elect msg lc:%" PRId64, pTimeout->logicClock);
S
Shengliang Guan 已提交
2175 2176 2177

  code = pNode->syncEqMsg(pNode->msgcb, &rpcMsg);
  if (code != 0) {
M
Minghao Li 已提交
2178
    sError("failed to sync enqueue elect msg since %s", terrstr());
S
Shengliang Guan 已提交
2179
    rpcFreeCont(rpcMsg.pCont);
M
Minghao Li 已提交
2180
    syncNodeRelease(pNode);
2181
    return;
M
Minghao Li 已提交
2182
  }
M
Minghao Li 已提交
2183 2184

  syncNodeRelease(pNode);
M
Minghao Li 已提交
2185 2186
}

M
Minghao Li 已提交
2187
static void syncNodeEqHeartbeatTimer(void* param, void* tmrId) {
S
Shengliang Guan 已提交
2188
  if (!syncIsInit()) return;
2189

S
Shengliang Guan 已提交
2190 2191 2192 2193
  SSyncNode* pNode = param;
  if (pNode->replicaNum > 1) {
    if (atomic_load_64(&pNode->heartbeatTimerLogicClockUser) <= atomic_load_64(&pNode->heartbeatTimerLogicClock)) {
      SRpcMsg rpcMsg = {0};
S
Shengliang Guan 已提交
2194
      int32_t code = syncBuildTimeout(&rpcMsg, SYNC_TIMEOUT_HEARTBEAT, atomic_load_64(&pNode->heartbeatTimerLogicClock),
S
Shengliang Guan 已提交
2195 2196 2197
                                      pNode->heartbeatTimerMS, pNode);

      if (code != 0) {
M
Minghao Li 已提交
2198
        sError("failed to build heartbeat msg");
S
Shengliang Guan 已提交
2199
        return;
2200
      }
M
Minghao Li 已提交
2201

2202
      sTrace("vgId:%d, enqueue heartbeat timer", pNode->vgId);
S
Shengliang Guan 已提交
2203 2204
      code = pNode->syncEqMsg(pNode->msgcb, &rpcMsg);
      if (code != 0) {
M
Minghao Li 已提交
2205
        sError("failed to enqueue heartbeat msg since %s", terrstr());
S
Shengliang Guan 已提交
2206 2207
        rpcFreeCont(rpcMsg.pCont);
        return;
2208
      }
S
Shengliang Guan 已提交
2209 2210 2211 2212

      taosTmrReset(syncNodeEqHeartbeatTimer, pNode->heartbeatTimerMS, pNode, syncEnv()->pTimerManager,
                   &pNode->pHeartbeatTimer);

2213
    } else {
S
Shengliang Guan 已提交
2214 2215
      sTrace("==syncNodeEqHeartbeatTimer== heartbeatTimerLogicClock:%" PRId64 ", heartbeatTimerLogicClockUser:%" PRId64,
             pNode->heartbeatTimerLogicClock, pNode->heartbeatTimerLogicClockUser);
2216
    }
M
Minghao Li 已提交
2217 2218 2219
  }
}

2220
static void syncNodeEqPeerHeartbeatTimer(void* param, void* tmrId) {
2221
  int64_t hbDataRid = (int64_t)param;
2222
  int64_t tsNow = taosGetTimestampMs();
2223

2224 2225
  SSyncHbTimerData* pData = syncHbTimerDataAcquire(hbDataRid);
  if (pData == NULL) {
M
Minghao Li 已提交
2226
    sError("hb timer get pData NULL, %" PRId64, hbDataRid);
2227 2228
    return;
  }
2229

2230
  SSyncNode* pSyncNode = syncNodeAcquire(pData->syncNodeRid);
M
Minghao Li 已提交
2231
  if (pSyncNode == NULL) {
2232
    syncHbTimerDataRelease(pData);
M
Minghao Li 已提交
2233
    sError("hb timer get pSyncNode NULL");
2234 2235 2236 2237 2238 2239 2240 2241
    return;
  }

  SSyncTimer* pSyncTimer = pData->pTimer;

  if (!pSyncNode->isStart) {
    syncNodeRelease(pSyncNode);
    syncHbTimerDataRelease(pData);
M
Minghao Li 已提交
2242
    sError("vgId:%d, hb timer sync node already stop", pSyncNode->vgId);
M
Minghao Li 已提交
2243 2244 2245
    return;
  }

M
Minghao Li 已提交
2246
  if (pSyncNode->state != TAOS_SYNC_STATE_LEADER) {
2247 2248
    syncNodeRelease(pSyncNode);
    syncHbTimerDataRelease(pData);
M
Minghao Li 已提交
2249
    sError("vgId:%d, hb timer sync node not leader", pSyncNode->vgId);
M
Minghao Li 已提交
2250 2251 2252
    return;
  }

M
Minghao Li 已提交
2253
  if (pSyncNode->pRaftStore == NULL) {
2254 2255
    syncNodeRelease(pSyncNode);
    syncHbTimerDataRelease(pData);
M
Minghao Li 已提交
2256
    sError("vgId:%d, hb timer raft store already stop", pSyncNode->vgId);
M
Minghao Li 已提交
2257 2258 2259
    return;
  }

M
Minghao Li 已提交
2260
  // sTrace("vgId:%d, eq peer hb timer", pSyncNode->vgId);
2261 2262

  if (pSyncNode->replicaNum > 1) {
M
Minghao Li 已提交
2263 2264 2265
    int64_t timerLogicClock = atomic_load_64(&pSyncTimer->logicClock);
    int64_t msgLogicClock = atomic_load_64(&pData->logicClock);

2266
    if (timerLogicClock == msgLogicClock) {
2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286
      if (tsNow > pData->execTime) {
#if 0        
        sTrace(
            "vgId:%d, hbDataRid:%ld,  EXECUTE this step-------- heartbeat tsNow:%ld, exec:%ld, tsNow-exec:%ld, "
            "---------",
            pSyncNode->vgId, hbDataRid, tsNow, pData->execTime, tsNow - pData->execTime);
#endif

        pData->execTime += pSyncTimer->timerMS;

        SRpcMsg rpcMsg = {0};
        (void)syncBuildHeartbeat(&rpcMsg, pSyncNode->vgId);

        SyncHeartbeat* pSyncMsg = rpcMsg.pCont;
        pSyncMsg->srcId = pSyncNode->myRaftId;
        pSyncMsg->destId = pData->destId;
        pSyncMsg->term = pSyncNode->pRaftStore->currentTerm;
        pSyncMsg->commitIndex = pSyncNode->commitIndex;
        pSyncMsg->minMatchIndex = syncMinMatchIndex(pSyncNode);
        pSyncMsg->privateTerm = 0;
2287
        pSyncMsg->timeStamp = tsNow;
2288 2289 2290 2291 2292 2293

        // update reset time
        int64_t timerElapsed = tsNow - pSyncTimer->timeStamp;
        pSyncTimer->timeStamp = tsNow;

        // send msg
2294 2295
        syncLogSendHeartbeat(pSyncNode, pSyncMsg, false, timerElapsed, pData->execTime);
        syncNodeSendHeartbeat(pSyncNode, &pSyncMsg->destId, &rpcMsg);
2296 2297 2298 2299 2300 2301 2302 2303
      } else {
#if 0        
        sTrace(
            "vgId:%d, hbDataRid:%ld,  pass this step-------- heartbeat tsNow:%ld, exec:%ld, tsNow-exec:%ld, ---------",
            pSyncNode->vgId, hbDataRid, tsNow, pData->execTime, tsNow - pData->execTime);
#endif
      }

M
Minghao Li 已提交
2304 2305
      if (syncIsInit()) {
        // sTrace("vgId:%d, reset peer hb timer", pSyncNode->vgId);
2306 2307
        taosTmrReset(syncNodeEqPeerHeartbeatTimer, pSyncTimer->timerMS / HEARTBEAT_TICK_NUM, (void*)hbDataRid,
                     syncEnv()->pTimerManager, &pSyncTimer->pTimer);
M
Minghao Li 已提交
2308 2309 2310 2311
      } else {
        sError("sync env is stop, reset peer hb timer error");
      }

2312
    } else {
M
Minghao Li 已提交
2313 2314
      sTrace("vgId:%d, do not send hb, timerLogicClock:%" PRId64 ", msgLogicClock:%" PRId64 "", pSyncNode->vgId,
             timerLogicClock, msgLogicClock);
2315 2316
    }
  }
2317 2318 2319

  syncHbTimerDataRelease(pData);
  syncNodeRelease(pSyncNode);
2320 2321
}

2322 2323 2324 2325 2326
// Build a noop raft entry at the log store's current write index and enqueue it as a
// client request through syncEqMsg.
//
// Returns 0 on success. Returns -1 when the node is not the leader (terrno is set to
// TSDB_CODE_SYN_NOT_LEADER), when the entry or the request message cannot be built,
// or the enqueue callback's code when enqueueing fails.
static int32_t syncNodeEqNoop(SSyncNode* pNode) {
  // Only the leader may propose; the original check was inverted and rejected leaders.
  if (pNode->state != TAOS_SYNC_STATE_LEADER) {
    terrno = TSDB_CODE_SYN_NOT_LEADER;
    return -1;
  }

  SyncIndex       index = pNode->pLogStore->syncLogWriteIndex(pNode->pLogStore);
  SyncTerm        term = pNode->pRaftStore->currentTerm;
  SSyncRaftEntry* pEntry = syncEntryBuildNoop(term, index, pNode->vgId);
  if (pEntry == NULL) return -1;

  SRpcMsg rpcMsg = {0};
  int32_t code = syncBuildClientRequestFromNoopEntry(&rpcMsg, pEntry, pNode->vgId);
  // The entry is destroyed right after the request is built from it, as before.
  syncEntryDestroy(pEntry);
  if (code != 0) {
    sError("failed to build noop msg");
    rpcFreeCont(rpcMsg.pCont);
    return -1;
  }

  sNTrace(pNode, "propose msg, type:noop");
  code = (*pNode->syncEqMsg)(pNode->msgcb, &rpcMsg);
  if (code != 0) {
    sError("failed to propose noop msg while enqueue since %s", terrstr());
    // Same cleanup as the timer callbacks in this file: free the message on enqueue failure.
    rpcFreeCont(rpcMsg.pCont);
  }

  return code;
}

2346 2347
// Deleter handed to taosLRUCacheInsert() by syncCacheEntry(): frees the cached value.
// The key and its length are not used.
static void deleteCacheEntry(const void* key, size_t keyLen, void* value) {
  taosMemoryFree(value);
}

2348 2349 2350 2351
// Insert a raft entry into the log store's LRU cache, keyed by the entry index, with
// low priority and deleteCacheEntry() as the deleter. The charge passed to the cache
// is sizeof(*pEntry) + pEntry->dataLen. The handle is reported through "h".
// Returns 0 when the cache reports TAOS_LRU_STATUS_OK, -1 otherwise.
int32_t syncCacheEntry(SSyncLogStore* pLogStore, SSyncRaftEntry* pEntry, LRUHandle** h) {
  SSyncLogStoreData* pData = pLogStore->data;
  sNTrace(pData->pSyncNode, "in cache index:%" PRId64 ", bytes:%u, %p", pEntry->index, pEntry->bytes, pEntry);

  int32_t   entryLen = sizeof(*pEntry) + pEntry->dataLen;
  LRUStatus status = taosLRUCacheInsert(pLogStore->pCache, &pEntry->index, sizeof(pEntry->index), pEntry, entryLen,
                                        deleteCacheEntry, h, TAOS_LRU_PRIORITY_LOW);

  return (status == TAOS_LRU_STATUS_OK) ? 0 : -1;
}

B
Benguang Zhao 已提交
2363 2364 2365
// Append an entry to the sync log buffer and let the buffer proceed its match index.
// With more than one replica the function returns right after that. With a single
// replica the commit index is updated to the new match index and the buffer is asked
// to commit up to it.
// Returns 0 on success, -1 when the append or the single-replica commit fails.
int32_t syncNodeAppend(SSyncNode* ths, SSyncRaftEntry* pEntry) {
  // append to log buffer
  int32_t appendRc = syncLogBufferAppend(ths->pLogBuf, ths, pEntry);
  if (appendRc < 0) {
    sError("vgId:%d, failed to enqueue sync log buffer. index:%" PRId64 "", ths->vgId, pEntry->index);
    return -1;
  }

  // proceed match index, with replicating on needed
  SyncIndex matchIndex = syncLogBufferProceed(ths->pLogBuf, ths, NULL);

  sTrace("vgId:%d, append raft entry. index: %" PRId64 ", term: %" PRId64 " pBuf: [%" PRId64 " %" PRId64 " %" PRId64
         ", %" PRId64 ")",
         ths->vgId, pEntry->index, pEntry->term, ths->pLogBuf->startIndex, ths->pLogBuf->commitIndex,
         ths->pLogBuf->matchIndex, ths->pLogBuf->endIndex);

  bool multiReplica = (ths->replicaNum > 1);
  if (multiReplica) return 0;

  // single replica: nobody else has to acknowledge, commit right away
  (void)syncNodeUpdateCommitIndex(ths, matchIndex);

  int32_t commitRc = syncLogBufferCommit(ths->pLogBuf, ths, ths->commitIndex);
  if (commitRc < 0) {
    sError("vgId:%d, failed to commit until commitIndex:%" PRId64 "", ths->vgId, ths->commitIndex);
    return -1;
  }

  return 0;
}

2394
// Report whether heartbeat replies have timed out on at least "quorum" peers.
// Always false for a single replica. Peers whose recorded receive time is 0 or -1
// are skipped. A peer counts as timed out when the time since its last recorded
// receive time exceeds tsHeartbeatTimeout.
bool syncNodeHeartbeatReplyTimeout(SSyncNode* pSyncNode) {
  if (pSyncNode->replicaNum == 1) return false;

  int32_t expired = 0;
  int64_t nowMs = taosGetTimestampMs();

  for (int32_t i = 0; i < pSyncNode->peersNum; ++i) {
    int64_t lastRecv = syncIndexMgrGetRecvTime(pSyncNode->pMatchIndex, &(pSyncNode->peersId[i]));
    bool    recorded = (lastRecv != 0 && lastRecv != -1);
    if (recorded && (nowMs - lastRecv > tsHeartbeatTimeout)) {
      expired++;
    }
  }

  return expired >= pSyncNode->quorum;
}

2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435
// True when at least one of the node's snapshot senders (one slot per replica) exists
// and has its "start" flag set. A NULL node yields false.
bool syncNodeSnapshotSending(SSyncNode* pSyncNode) {
  if (pSyncNode == NULL) return false;

  for (int32_t i = 0; i < pSyncNode->replicaNum; ++i) {
    SSyncSnapshotSender* pSender = pSyncNode->senders[i];
    if (pSender != NULL && pSender->start) {
      return true;
    }
  }

  return false;
}

// True when the node has a new-node snapshot receiver whose "start" flag is set.
// A NULL node or a missing receiver yields false.
bool syncNodeSnapshotRecving(SSyncNode* pSyncNode) {
  return pSyncNode != NULL && pSyncNode->pNewNodeReceiver != NULL && pSyncNode->pNewNodeReceiver->start;
}

M
Minghao Li 已提交
2436
// Build a noop entry at the end index of the log buffer, stamped with the current
// term, and append it via syncNodeAppend().
//
// Returns -1 with terrno = TSDB_CODE_OUT_OF_MEMORY when the entry cannot be built;
// otherwise returns the result of syncNodeAppend() (the original computed that result
// and then discarded it, hiding append failures from the caller).
static int32_t syncNodeAppendNoop(SSyncNode* ths) {
  SyncIndex index = syncLogBufferGetEndIndex(ths->pLogBuf);
  SyncTerm  term = ths->pRaftStore->currentTerm;

  SSyncRaftEntry* pEntry = syncEntryBuildNoop(term, index, ths->vgId);
  if (pEntry == NULL) {
    terrno = TSDB_CODE_OUT_OF_MEMORY;
    return -1;
  }

  return syncNodeAppend(ths, pEntry);
}

// Legacy variant: build a noop entry at the log store's write index and, when this
// node is leader, append it to the log store and put it into the entry cache.
//
// Returns 0 on success (also when the node is not leader and nothing is appended),
// -1 when the log store append fails.
static int32_t syncNodeAppendNoopOld(SSyncNode* ths) {
  SyncIndex       index = ths->pLogStore->syncLogWriteIndex(ths->pLogStore);
  SyncTerm        term = ths->pRaftStore->currentTerm;
  SSyncRaftEntry* pEntry = syncEntryBuildNoop(term, index, ths->vgId);
  tAssert(pEntry != NULL);

  LRUHandle* h = NULL;

  if (ths->state == TAOS_SYNC_STATE_LEADER) {
    int32_t code = ths->pLogStore->syncLogAppendEntry(ths->pLogStore, pEntry);
    if (code != 0) {
      sError("append noop error");
      // The entry was never cached, so it is still owned here; the original leaked it.
      // syncNodeOnClientRequestOld() destroys its entry on the same failure.
      syncEntryDestroy(pEntry);
      return -1;
    }

    syncCacheEntry(ths->pLogStore, pEntry, &h);
  }

  // With a cache handle, drop our reference on it; without one, the entry is destroyed here.
  if (h) {
    taosLRUCacheRelease(ths->pLogStore->pCache, h, false);
  } else {
    syncEntryDestroy(pEntry);
  }

  return 0;
}

S
Shengliang Guan 已提交
2479 2480
// Handle a heartbeat received from a peer.
//
// - Logs the receipt and prepares a heartbeat reply addressed to the sender.
// - Same term and this node is not leader: records the receive time, resets the elect
//   timer, adopts the sender's minMatchIndex and, if follower, enqueues a
//   SYNC_LOCAL_CMD_FOLLOWER_CMT local command carrying the sender's commit index.
// - Sender's term >= ours and this node is not follower: enqueues a
//   SYNC_LOCAL_CMD_STEP_DOWN local command carrying the sender's term.
// - Sends the reply.
//
// Returns 0 normally, -1 when the reply message could not be built (the heartbeat is
// still processed in that case, only the reply is skipped).
int32_t syncNodeOnHeartbeat(SSyncNode* ths, const SRpcMsg* pRpcMsg) {
  SyncHeartbeat* pMsg = pRpcMsg->pCont;

  const STraceId* trace = &pRpcMsg->info.traceId;
  char            tbuf[40] = {0};
  TRACE_TO_STR(trace, tbuf);

  int64_t tsMs = taosGetTimestampMs();
  int64_t timeDiff = tsMs - pMsg->timeStamp;
  syncLogRecvHeartbeat(ths, pMsg, timeDiff, tbuf);

  // Build the reply up front; a failed build must not be dereferenced.
  SRpcMsg             rpcMsg = {0};
  SyncHeartbeatReply* pMsgReply = NULL;
  if (syncBuildHeartbeatReply(&rpcMsg, ths->vgId) != 0 || rpcMsg.pCont == NULL) {
    sError("vgId:%d, failed to build heartbeat reply", ths->vgId);
  } else {
    pMsgReply = rpcMsg.pCont;
    pMsgReply->destId = pMsg->srcId;
    pMsgReply->srcId = ths->myRaftId;
    pMsgReply->term = ths->pRaftStore->currentTerm;
    pMsgReply->privateTerm = 8864;  // magic number
    pMsgReply->startTime = ths->startTime;
    pMsgReply->timeStamp = tsMs;
  }

  if (pMsg->term == ths->pRaftStore->currentTerm && ths->state != TAOS_SYNC_STATE_LEADER) {
    syncIndexMgrSetRecvTime(ths->pNextIndex, &(pMsg->srcId), tsMs);

    syncNodeResetElectTimer(ths);
    ths->minMatchIndex = pMsg->minMatchIndex;

    if (ths->state == TAOS_SYNC_STATE_FOLLOWER) {
      // The commit itself is done when the local command is processed, not here.
      SRpcMsg rpcMsgLocalCmd = {0};
      if (syncBuildLocalCmd(&rpcMsgLocalCmd, ths->vgId) != 0 || rpcMsgLocalCmd.pCont == NULL) {
        sError("vgId:%d, failed to build fc-commit msg", ths->vgId);
      } else {
        SyncLocalCmd* pSyncMsg = rpcMsgLocalCmd.pCont;
        pSyncMsg->cmd = SYNC_LOCAL_CMD_FOLLOWER_CMT;
        pSyncMsg->fcIndex = pMsg->commitIndex;
        // Keep a copy for logging: the message must not be read after a successful enqueue.
        SyncIndex fcIndex = pSyncMsg->fcIndex;

        if (ths->syncEqMsg != NULL && ths->msgcb != NULL) {
          int32_t code = ths->syncEqMsg(ths->msgcb, &rpcMsgLocalCmd);
          if (code != 0) {
            sError("vgId:%d, sync enqueue fc-commit msg error, code:%d", ths->vgId, code);
            rpcFreeCont(rpcMsgLocalCmd.pCont);
          } else {
            sTrace("vgId:%d, sync enqueue fc-commit msg, fc-index:%" PRId64, ths->vgId, fcIndex);
          }
        } else {
          // No queue to hand the message to: free it instead of leaking it.
          rpcFreeCont(rpcMsgLocalCmd.pCont);
        }
      }
    }
  }

  if (pMsg->term >= ths->pRaftStore->currentTerm && ths->state != TAOS_SYNC_STATE_FOLLOWER) {
    // The step-down itself is done when the local command is processed, not here.
    SRpcMsg rpcMsgLocalCmd = {0};
    if (syncBuildLocalCmd(&rpcMsgLocalCmd, ths->vgId) != 0 || rpcMsgLocalCmd.pCont == NULL) {
      sError("vgId:%d, failed to build step-down msg", ths->vgId);
    } else {
      SyncLocalCmd* pSyncMsg = rpcMsgLocalCmd.pCont;
      pSyncMsg->cmd = SYNC_LOCAL_CMD_STEP_DOWN;
      pSyncMsg->sdNewTerm = pMsg->term;
      // Log from a local copy; the original read pSyncMsg after it had been enqueued.
      SyncTerm sdNewTerm = pSyncMsg->sdNewTerm;

      if (ths->syncEqMsg != NULL && ths->msgcb != NULL) {
        int32_t code = ths->syncEqMsg(ths->msgcb, &rpcMsgLocalCmd);
        if (code != 0) {
          sError("vgId:%d, sync enqueue step-down msg error, code:%d", ths->vgId, code);
          rpcFreeCont(rpcMsgLocalCmd.pCont);
        } else {
          sTrace("vgId:%d, sync enqueue step-down msg, new-term: %" PRId64, ths->vgId, sdNewTerm);
        }
      } else {
        // No queue to hand the message to: free it instead of leaking it.
        rpcFreeCont(rpcMsgLocalCmd.pCont);
      }
    }
  }

  if (pMsgReply == NULL) {
    return -1;
  }

  // reply
  syncNodeSendMsgById(&pMsgReply->destId, ths, &rpcMsg);
  return 0;
}

2561
// Handle a heartbeat reply from a peer: look up the peer's log replication manager,
// log the receipt, record the receive time for the peer in pMatchIndex, and pass the
// reply to the replication manager.
//
// Returns -1 when no replication manager exists for the sender, otherwise the result
// of syncLogReplMgrProcessHeartbeatReply().
int32_t syncNodeOnHeartbeatReply(SSyncNode* ths, const SRpcMsg* pRpcMsg) {
  const STraceId* trace = &pRpcMsg->info.traceId;
  char            tbuf[40] = {0};
  TRACE_TO_STR(trace, tbuf);

  SyncHeartbeatReply* pMsg = pRpcMsg->pCont;
  SSyncLogReplMgr*    pMgr = syncNodeGetLogReplMgr(ths, &pMsg->srcId);
  if (pMgr == NULL) {
    // "%016" must follow the '%': the original "0x016%" printed a literal "016" and an unpadded value.
    sError("vgId:%d, failed to get log repl mgr for the peer at addr 0x%016" PRIx64 "", ths->vgId, pMsg->srcId.addr);
    return -1;
  }

  int64_t tsMs = taosGetTimestampMs();
  syncLogRecvHeartbeatReply(ths, pMsg, tsMs - pMsg->timeStamp, tbuf);

  syncIndexMgrSetRecvTime(ths->pMatchIndex, &pMsg->srcId, tsMs);

  return syncLogReplMgrProcessHeartbeatReply(pMgr, ths, pMsg);
}

2581
// Legacy heartbeat-reply handler: logs the receipt and records the receive time for
// the sender in pMatchIndex. Always returns 0.
int32_t syncNodeOnHeartbeatReplyOld(SSyncNode* ths, const SRpcMsg* pRpcMsg) {
  char            tbuf[40] = {0};
  const STraceId* trace = &pRpcMsg->info.traceId;
  TRACE_TO_STR(trace, tbuf);

  SyncHeartbeatReply* pMsg = pRpcMsg->pCont;
  int64_t             tsMs = taosGetTimestampMs();
  syncLogRecvHeartbeatReply(ths, pMsg, tsMs - pMsg->timeStamp, tbuf);

  // The stored receive time is what syncNodeHeartbeatReplyTimeout() later compares against.
  syncIndexMgrSetRecvTime(ths->pMatchIndex, &pMsg->srcId, tsMs);
  return 0;
}

S
Shengliang Guan 已提交
2597 2598
// Execute a local command that was enqueued earlier.
//   SYNC_LOCAL_CMD_STEP_DOWN    -> step down to the term carried in sdNewTerm
//   SYNC_LOCAL_CMD_FOLLOWER_CMT -> update the commit index to fcIndex and commit the
//                                  log buffer up to the node's commit index
// Any other command is logged as an error. Always returns 0.
int32_t syncNodeOnLocalCmd(SSyncNode* ths, const SRpcMsg* pRpcMsg) {
  SyncLocalCmd* pMsg = pRpcMsg->pCont;
  syncLogRecvLocalCmd(ths, pMsg, "");

  switch (pMsg->cmd) {
    case SYNC_LOCAL_CMD_STEP_DOWN:
      syncNodeStepDown(ths, pMsg->sdNewTerm);
      break;

    case SYNC_LOCAL_CMD_FOLLOWER_CMT:
      (void)syncNodeUpdateCommitIndex(ths, pMsg->fcIndex);
      if (syncLogBufferCommit(ths->pLogBuf, ths, ths->commitIndex) < 0) {
        sError("vgId:%d, failed to commit raft log since %s. commit index: %" PRId64 "", ths->vgId, terrstr(),
               ths->commitIndex);
      }
      break;

    default:
      sError("error local cmd");
      break;
  }

  return 0;
}

// Legacy local-command handler.
//   SYNC_LOCAL_CMD_STEP_DOWN    -> syncNodeStepDown() with sdNewTerm
//   SYNC_LOCAL_CMD_FOLLOWER_CMT -> syncNodeFollowerCommit() with fcIndex
// Any other command is logged as an error. Always returns 0.
int32_t syncNodeOnLocalCmdOld(SSyncNode* ths, const SRpcMsg* pRpcMsg) {
  SyncLocalCmd* pMsg = pRpcMsg->pCont;
  syncLogRecvLocalCmd(ths, pMsg, "");

  if (pMsg->cmd == SYNC_LOCAL_CMD_STEP_DOWN) {
    syncNodeStepDown(ths, pMsg->sdNewTerm);
    return 0;
  }

  if (pMsg->cmd == SYNC_LOCAL_CMD_FOLLOWER_CMT) {
    syncNodeFollowerCommit(ths, pMsg->fcIndex);
    return 0;
  }

  sError("error local cmd");
  return 0;
}

M
Minghao Li 已提交
2634 2635 2636 2637 2638 2639 2640 2641 2642 2643
// TLA+ Spec
// ClientRequest(i, v) ==
//     /\ state[i] = Leader
//     /\ LET entry == [term  |-> currentTerm[i],
//                      value |-> v]
//            newLog == Append(log[i], entry)
//        IN  log' = [log EXCEPT ![i] = newLog]
//     /\ UNCHANGED <<messages, serverVars, candidateVars,
//                    leaderVars, commitIndex>>
//
M
Minghao Li 已提交
2644

2645
// Turn a client request into a raft entry at the end index of the log buffer and, if
// this node is leader, append it via syncNodeAppend().
//
// pRetIndex (optional) receives the entry's index when the node is leader.
// Returns the result of syncNodeAppend() on the leader; -1 when the entry cannot be
// built or when the node is not leader (the entry is destroyed in that case instead
// of being leaked).
int32_t syncNodeOnClientRequest(SSyncNode* ths, SRpcMsg* pMsg, SyncIndex* pRetIndex) {
  sNTrace(ths, "on client request");

  SyncIndex       index = syncLogBufferGetEndIndex(ths->pLogBuf);
  SyncTerm        term = ths->pRaftStore->currentTerm;
  SSyncRaftEntry* pEntry = NULL;
  if (pMsg->msgType == TDMT_SYNC_CLIENT_REQUEST) {
    pEntry = syncEntryBuildFromClientRequest(pMsg->pCont, term, index);
  } else {
    pEntry = syncEntryBuildFromRpcMsg(pMsg, term, index);
  }

  if (pEntry == NULL) {
    sError("vgId:%d, failed to build raft entry for client request", ths->vgId);
    return -1;
  }

  if (ths->state != TAOS_SYNC_STATE_LEADER) {
    // Nothing takes the entry on this path, so it has to be released here.
    syncEntryDestroy(pEntry);
    return -1;
  }

  if (pRetIndex) {
    (*pRetIndex) = index;
  }

  // Evaluate the blocking-message flag before the append.
  // NOTE(review): whether syncNodeAppend() frees the entry on failure is not visible here — confirm.
  bool isBlockMsg = (ths->vgId != 1) && vnodeIsMsgBlock(pEntry->originalRpcType);

  int32_t code = syncNodeAppend(ths, pEntry);
  if (code < 0 && isBlockMsg) {
    tAssertS(false, "failed to append blocking msg");
  }
  return code;
}

2674 2675
// Legacy client-request handler working directly on the log store.
//
// Builds a raft entry at the log store's write index. On the leader the entry is
// appended to the log store, cached, and then either replicated (more than one
// replica) or used to advance the commit index (single replica). On a non-leader
// nothing is appended.
//
// pRetIndex (optional) receives the entry's index on every path that returns 0, and
// SYNC_INDEX_INVALID when the entry cannot be built; it is left untouched when the
// log store append fails.
// Returns 0 on success (also for a non-leader), -1 when the entry cannot be built or
// the append fails. On an append failure with more than one replica, FpCommitCb is
// invoked with code -1 before returning.
int32_t syncNodeOnClientRequestOld(SSyncNode* ths, SRpcMsg* pMsg, SyncIndex* pRetIndex) {
  sNTrace(ths, "on client request");

  SyncIndex       index = ths->pLogStore->syncLogWriteIndex(ths->pLogStore);
  SyncTerm        term = ths->pRaftStore->currentTerm;
  SSyncRaftEntry* pEntry = NULL;

  if (pMsg->msgType == TDMT_SYNC_CLIENT_REQUEST) {
    pEntry = syncEntryBuildFromClientRequest(pMsg->pCont, term, index);
  } else {
    pEntry = syncEntryBuildFromRpcMsg(pMsg, term, index);
  }

  // The original passed a NULL entry straight into the log store on the leader.
  if (pEntry == NULL) {
    sError("vgId:%d, failed to build raft entry for client request", ths->vgId);
    if (pRetIndex != NULL) {
      *pRetIndex = SYNC_INDEX_INVALID;
    }
    return -1;
  }

  LRUHandle* h = NULL;

  if (ths->state == TAOS_SYNC_STATE_LEADER) {
    // append entry
    int32_t code = ths->pLogStore->syncLogAppendEntry(ths->pLogStore, pEntry);
    if (code != 0) {
      if (ths->replicaNum != 1) {
        // del resp mgr, call FpCommitCb
        SFsmCbMeta cbMeta = {
            .index = pEntry->index,
            .lastConfigIndex = SYNC_INDEX_INVALID,
            .isWeak = pEntry->isWeak,
            .code = -1,
            .state = ths->state,
            .seqNum = pEntry->seqNum,
            .term = pEntry->term,
            .currentTerm = ths->pRaftStore->currentTerm,
            .flag = 0,
        };
        ths->pFsm->FpCommitCb(ths->pFsm, pMsg, &cbMeta);
      }

      // The entry has not been cached yet (h is still NULL), so it is destroyed directly.
      syncEntryDestroy(pEntry);
      return -1;
    }

    syncCacheEntry(ths->pLogStore, pEntry, &h);

    // if multi replica, start replicate right now
    if (ths->replicaNum > 1) {
      syncNodeReplicate(ths);
    }

    // if only myself, maybe commit right now
    if (ths->replicaNum == 1) {
      if (syncNodeIsMnode(ths)) {
        syncMaybeAdvanceCommitIndex(ths);
      } else {
        syncOneReplicaAdvance(ths);
      }
    }
  }

  if (pRetIndex != NULL) {
    *pRetIndex = pEntry->index;
  }

  // With a cache handle, drop our reference on it; without one, the entry is destroyed here.
  if (h) {
    taosLRUCacheRelease(ths->pLogStore->pCache, h, false);
  } else {
    syncEntryDestroy(pEntry);
  }

  return 0;
}
M
Minghao Li 已提交
2763

S
Shengliang Guan 已提交
2764 2765 2766
// Map a sync state to its lower-case display name; unrecognized values give "unknown".
const char* syncStr(ESyncState state) {
  const char* name = "unknown";

  switch (state) {
    case TAOS_SYNC_STATE_FOLLOWER:
      name = "follower";
      break;
    case TAOS_SYNC_STATE_CANDIDATE:
      name = "candidate";
      break;
    case TAOS_SYNC_STATE_LEADER:
      name = "leader";
      break;
    case TAOS_SYNC_STATE_ERROR:
      name = "error";
      break;
    case TAOS_SYNC_STATE_OFFLINE:
      name = "offline";
      break;
    default:
      break;
  }

  return name;
}
2780

2781
#if 0
// NOTE: compiled out by the surrounding "#if 0".
// Apply a leader-transfer entry on a follower. Nothing happens unless this node is a
// follower, has finished restoring, and the entry is not older (by term or index)
// than what the node already has. When the transfer target is this node (same raft
// id, or same fqdn and port), the elect timer is restarted with a 1 ms timeout.
// FpLeaderTransferCb, if set, is invoked afterwards. Always returns 0.
int32_t syncDoLeaderTransfer(SSyncNode* ths, SRpcMsg* pRpcMsg, SSyncRaftEntry* pEntry) {
  if (ths->state != TAOS_SYNC_STATE_FOLLOWER) {
    sNTrace(ths, "I am not follower, can not do leader transfer");
    return 0;
  }

  if (!ths->restoreFinish) {
    sNTrace(ths, "restore not finish, can not do leader transfer");
    return 0;
  }

  if (pEntry->term < ths->pRaftStore->currentTerm) {
    sNTrace(ths, "little term:%" PRId64 ", can not do leader transfer", pEntry->term);
    return 0;
  }

  if (pEntry->index < syncNodeGetLastIndex(ths)) {
    sNTrace(ths, "little index:%" PRId64 ", can not do leader transfer", pEntry->index);
    return 0;
  }

  SyncLeaderTransfer* pSyncLeaderTransfer = pRpcMsg->pCont;
  sNTrace(ths, "do leader transfer, index:%" PRId64, pEntry->index);

  bool sameId = syncUtilSameId(&(pSyncLeaderTransfer->newLeaderId), &(ths->myRaftId));
  bool sameNodeInfo = strcmp(pSyncLeaderTransfer->newNodeInfo.nodeFqdn, ths->myNodeInfo.nodeFqdn) == 0 &&
                      pSyncLeaderTransfer->newNodeInfo.nodePort == ths->myNodeInfo.nodePort;

  if (sameId || sameNodeInfo) {
    // reset elect timer now!
    int32_t electMS = 1;
    int32_t ret = syncNodeRestartElectTimer(ths, electMS);
    tAssert(ret == 0);

    sNTrace(ths, "maybe leader transfer to %s:%d %" PRId64, pSyncLeaderTransfer->newNodeInfo.nodeFqdn,
            pSyncLeaderTransfer->newNodeInfo.nodePort, pSyncLeaderTransfer->newLeaderId.addr);
  }

  if (ths->pFsm->FpLeaderTransferCb != NULL) {
    SFsmCbMeta cbMeta = {
        .code = 0,
        .currentTerm = ths->pRaftStore->currentTerm,
        .flag = 0,
        .index = pEntry->index,
        .lastConfigIndex = syncNodeGetSnapshotConfigIndex(ths, pEntry->index),
        .isWeak = pEntry->isWeak,
        .seqNum = pEntry->seqNum,
        .state = ths->state,
        .term = pEntry->term,
    };
    ths->pFsm->FpLeaderTransferCb(ths->pFsm, pRpcMsg, &cbMeta);
  }

  return 0;
}

#endif

2848
// Locate this node inside a new configuration and store its position in
// pNewCfg->myIndex. Each configured node's fqdn/port is converted to a raft id (with
// this node's vgId) and compared with the node's own raft id.
// Returns 0 when found, -1 when this node is not part of the new configuration
// (myIndex is left untouched in that case).
int32_t syncNodeUpdateNewConfigIndex(SSyncNode* ths, SSyncCfg* pNewCfg) {
  int32_t pos = 0;

  while (pos < pNewCfg->replicaNum) {
    SRaftId candidate;
    candidate.addr = syncUtilAddr2U64((pNewCfg->nodeInfo)[pos].nodeFqdn, (pNewCfg->nodeInfo)[pos].nodePort);
    candidate.vgId = ths->vgId;

    if (syncUtilSameId(&(ths->myRaftId), &candidate)) {
      pNewCfg->myIndex = pos;
      return 0;
    }

    ++pos;
  }

  return -1;
}

2863 2864 2865 2866
// True when all three hold: the node has exactly one replica, the message type is a
// user commit according to syncUtilUserCommit(), and the vgId is not 1.
bool syncNodeIsOptimizedOneReplica(SSyncNode* ths, SRpcMsg* pMsg) {
  if (ths->replicaNum != 1) return false;
  if (!syncUtilUserCommit(pMsg->msgType)) return false;
  return ths->vgId != 1;
}

M
Minghao Li 已提交
2867
// Apply the log entries in [beginIndex, endIndex] to the state machine.
//
// NOTE: the function starts with tAssert(false), i.e. it is not expected to be
// called any more; the body is kept as it was.
//
// - If the FSM reports a snapshot whose lastApplyIndex is >= beginIndex, the range
//   starts right after the snapshot instead.
// - Each entry is taken from the LRU cache when present, otherwise read from the log
//   store; an entry that cannot be read is logged and skipped.
// - For user-commit entries FpCommitCb is invoked, except when the node is a restored
//   single replica with vgId != 1.
// - When the last entry of the log store is reached for the first time,
//   FpRestoreFinishCb is invoked and restoreFinish is set.
//
// "flag" is forwarded unchanged in the callback meta.
// Returns 0, or -1 when ths is NULL.
int32_t syncNodeDoCommit(SSyncNode* ths, SyncIndex beginIndex, SyncIndex endIndex, uint64_t flag) {
  tAssert(false);
  if (beginIndex > endIndex) {
    return 0;
  }

  if (ths == NULL) {
    return -1;
  }

  if (ths->pFsm != NULL && ths->pFsm->FpGetSnapshotInfo != NULL) {
    // advance commit index to snapshot first
    SSnapshot snapshot = {0};
    ths->pFsm->FpGetSnapshotInfo(ths->pFsm, &snapshot);
    if (snapshot.lastApplyIndex >= 0 && snapshot.lastApplyIndex >= beginIndex) {
      sNTrace(ths, "commit by snapshot from index:%" PRId64 " to index:%" PRId64, beginIndex, snapshot.lastApplyIndex);

      // update begin index
      beginIndex = snapshot.lastApplyIndex + 1;
    }
  }

  sNTrace(ths, "commit by wal from index:%" PRId64 " to index:%" PRId64, beginIndex, endIndex);

  // execute fsm
  if (ths->pFsm == NULL) {
    return 0;
  }

  for (SyncIndex i = beginIndex; i <= endIndex; ++i) {
    if (i == SYNC_INDEX_INVALID) {
      continue;
    }

    // Initialized so the out-parameter has a defined value even if the read below fails.
    SSyncRaftEntry* pEntry = NULL;
    SLRUCache*      pCache = ths->pLogStore->pCache;
    LRUHandle*      h = taosLRUCacheLookup(pCache, &i, sizeof(i));
    if (h) {
      pEntry = (SSyncRaftEntry*)taosLRUCacheValue(pCache, h);

      ths->pLogStore->cacheHit++;
      sNTrace(ths, "hit cache index:%" PRId64 ", bytes:%u, %p", i, pEntry->bytes, pEntry);

    } else {
      ths->pLogStore->cacheMiss++;
      sNTrace(ths, "miss cache index:%" PRId64, i);

      int32_t code = ths->pLogStore->syncLogGetEntry(ths->pLogStore, i, &pEntry);
      if (code != 0 || pEntry == NULL) {
        sNError(ths, "get log entry error");
        sFatal("vgId:%d, get log entry %" PRId64 " error when commit since %s", ths->vgId, i, terrstr());
        continue;
      }
    }

    SRpcMsg rpcMsg = {0};
    syncEntry2OriginalRpc(pEntry, &rpcMsg);

    sTrace("do commit index:%" PRId64 ", type:%s", i, TMSG_INFO(pEntry->msgType));

    // user commit
    if ((ths->pFsm->FpCommitCb != NULL) && syncUtilUserCommit(pEntry->originalRpcType)) {
      bool internalExecute = true;
      if ((ths->replicaNum == 1) && ths->restoreFinish && ths->vgId != 1) {
        internalExecute = false;
      }

      sNTrace(ths, "user commit index:%" PRId64 ", internal:%d, type:%s", i, internalExecute,
              TMSG_INFO(pEntry->msgType));

      // execute fsm in apply thread, or execute outside syncPropose
      if (internalExecute) {
        SFsmCbMeta cbMeta = {
            .index = pEntry->index,
            .lastConfigIndex = syncNodeGetSnapshotConfigIndex(ths, pEntry->index),
            .isWeak = pEntry->isWeak,
            .code = 0,
            .state = ths->state,
            .seqNum = pEntry->seqNum,
            .term = pEntry->term,
            .currentTerm = ths->pRaftStore->currentTerm,
            .flag = flag,
        };

        syncRespMgrGetAndDel(ths->pSyncRespMgr, cbMeta.seqNum, &rpcMsg.info);
        ths->pFsm->FpCommitCb(ths->pFsm, &rpcMsg, &cbMeta);
      }
    }

    // restore finish
    // if only snapshot, a noop entry will be append, so syncLogLastIndex is always ok
    if (pEntry->index == ths->pLogStore->syncLogLastIndex(ths->pLogStore)) {
      if (ths->restoreFinish == false) {
        if (ths->pFsm->FpRestoreFinishCb != NULL) {
          ths->pFsm->FpRestoreFinishCb(ths->pFsm);
        }
        ths->restoreFinish = true;

        int64_t restoreDelay = taosGetTimestampMs() - ths->leaderTime;
        sNTrace(ths, "restore finish, index:%" PRId64 ", elapsed:%" PRId64 " ms", pEntry->index, restoreDelay);
      }
    }

    rpcFreeCont(rpcMsg.pCont);
    // With a cache handle, drop our reference on it; without one, the entry read from the log store is destroyed here.
    if (h) {
      taosLRUCacheRelease(pCache, h, false);
    } else {
      syncEntryDestroy(pEntry);
    }
  }

  return 0;
}

// Returns true when pRaftId compares equal (per syncUtilSameId) to one of the
// first ths->replicaNum entries of ths->replicasId, false otherwise.
// The scan stops at the first matching entry.
bool syncNodeInRaftGroup(SSyncNode* ths, SRaftId* pRaftId) {
  bool    isMember = false;
  int32_t idx = 0;

  while (!isMember && idx < ths->replicaNum) {
    if (syncUtilSameId(&ths->replicasId[idx], pRaftId)) {
      isMember = true;
    }
    ++idx;
  }

  return isMember;
}

// Looks up the snapshot sender stored at the replica slot whose id compares
// equal (per syncUtilSameId) to pDestId.
// Returns NULL when no slot in [0, ths->replicaNum) matches.
SSyncSnapshotSender* syncNodeGetSnapshotSender(SSyncNode* ths, SRaftId* pDestId) {
  SSyncSnapshotSender* pMatch = NULL;
  int32_t              slot = 0;

  // Every slot is visited; if several ids match, the last matching slot wins.
  while (slot < ths->replicaNum) {
    if (syncUtilSameId(pDestId, &ths->replicasId[slot])) {
      pMatch = ths->senders[slot];
    }
    ++slot;
  }

  return pMatch;
}
M
Minghao Li 已提交
3008

3009 3010
// Returns a pointer into ths->peerHeartbeatTimerArr for the replica slot whose
// id compares equal (per syncUtilSameId) to pDestId, or NULL if none matches.
SSyncTimer* syncNodeGetHbTimer(SSyncNode* ths, SRaftId* pDestId) {
  SSyncTimer* pResult = NULL;

  // No early exit: the whole replica array is scanned and the last match is kept.
  for (int32_t slot = 0; slot < ths->replicaNum; ++slot) {
    if (!syncUtilSameId(pDestId, &ths->replicasId[slot])) {
      continue;
    }
    pResult = &ths->peerHeartbeatTimerArr[slot];
  }

  return pResult;
}

M
Minghao Li 已提交
3019 3020
// Returns a pointer into ths->peerStates for the replica slot whose id compares
// equal (per syncUtilSameId) to pDestId, or NULL if no slot matches.
SPeerState* syncNodeGetPeerState(SSyncNode* ths, const SRaftId* pDestId) {
  SPeerState* pFound = NULL;
  int32_t     slot;

  // The scan always covers all replicaNum slots; a later match overrides an earlier one.
  for (slot = 0; slot < ths->replicaNum; slot++) {
    if (syncUtilSameId(pDestId, &ths->replicasId[slot])) {
      pFound = &ths->peerStates[slot];
    }
  }

  return pFound;
}

// Decides whether pMsg should be sent to pDestId.
//
// Returns false when:
//   - no peer state is found for pDestId (an error is logged), or
//   - the index about to be sent (pMsg->prevLogIndex + 1) equals the peer's
//     lastSendIndex and less than SYNC_APPEND_ENTRIES_TIMEOUT_MS has elapsed
//     since the peer's lastSendTime.
// Returns true in every other case.
bool syncNodeNeedSendAppendEntries(SSyncNode* ths, const SRaftId* pDestId, const SyncAppendEntries* pMsg) {
  SPeerState* pPeer = syncNodeGetPeerState(ths, pDestId);
  if (NULL == pPeer) {
    sError("vgId:%d, replica maybe dropped", ths->vgId);
    return false;
  }

  SyncIndex nextIndex = pMsg->prevLogIndex + 1;
  int64_t   nowMs = taosGetTimestampMs();

  // A different index than the one last sent is never throttled.
  if (pPeer->lastSendIndex != nextIndex) {
    return true;
  }

  // Same index as last time: resend only once the timeout window has passed.
  return (nowMs - pPeer->lastSendTime) >= SYNC_APPEND_ENTRIES_TIMEOUT_MS;
}

M
Minghao Li 已提交
3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059
// Checks three conditions in order and returns false (after logging an error)
// on the first one that fails:
//   1. pSyncNode->changing is set;
//   2. commitIndex is at least SYNC_INDEX_BEGIN but differs from the value
//      returned by syncNodeGetLastIndex;
//   3. some peer has a snapshot sender whose `start` field is set.
// Returns true when none of these holds.
bool syncNodeCanChange(SSyncNode* pSyncNode) {
  if (pSyncNode->changing) {
    sError("sync cannot change");
    return false;
  }

  if (pSyncNode->commitIndex >= SYNC_INDEX_BEGIN) {
    SyncIndex lastLogIndex = syncNodeGetLastIndex(pSyncNode);
    if (lastLogIndex != pSyncNode->commitIndex) {
      sError("sync cannot change2");
      return false;
    }
  }

  int32_t peer = 0;
  while (peer < pSyncNode->peersNum) {
    SSyncSnapshotSender* pSnapSender = syncNodeGetSnapshotSender(pSyncNode, &pSyncNode->peersId[peer]);
    if (pSnapSender != NULL && pSnapSender->start) {
      sError("sync cannot change3");
      return false;
    }
    ++peer;
  }

  return true;
}