syncMain.c 97.4 KB
Newer Older
M
Minghao Li 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/*
 * Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
 *
 * This program is free software: you can use, redistribute, and/or modify
 * it under the terms of the GNU Affero General Public License, version 3
 * or later ("AGPL"), as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

S
Shengliang Guan 已提交
16
#define _DEFAULT_SOURCE
M
Minghao Li 已提交
17
#include "sync.h"
M
Minghao Li 已提交
18 19
#include "syncAppendEntries.h"
#include "syncAppendEntriesReply.h"
M
Minghao Li 已提交
20
#include "syncCommit.h"
M
Minghao Li 已提交
21
#include "syncElection.h"
M
Minghao Li 已提交
22
#include "syncEnv.h"
M
Minghao Li 已提交
23
#include "syncIndexMgr.h"
M
Minghao Li 已提交
24
#include "syncInt.h"
M
Minghao Li 已提交
25
#include "syncMessage.h"
26
#include "syncPipeline.h"
M
Minghao Li 已提交
27
#include "syncRaftCfg.h"
M
Minghao Li 已提交
28
#include "syncRaftLog.h"
M
Minghao Li 已提交
29
#include "syncRaftStore.h"
M
Minghao Li 已提交
30
#include "syncReplication.h"
M
Minghao Li 已提交
31 32
#include "syncRequestVote.h"
#include "syncRequestVoteReply.h"
M
Minghao Li 已提交
33
#include "syncRespMgr.h"
M
Minghao Li 已提交
34
#include "syncSnapshot.h"
M
Minghao Li 已提交
35
#include "syncTimeout.h"
M
Minghao Li 已提交
36
#include "syncUtil.h"
M
Minghao Li 已提交
37
#include "syncVoteMgr.h"
38
#include "tglobal.h"
M
Minghao Li 已提交
39
#include "tref.h"
M
Minghao Li 已提交
40

M
Minghao Li 已提交
41 42 43 44 45
static void    syncNodeEqPingTimer(void* param, void* tmrId);
static void    syncNodeEqElectTimer(void* param, void* tmrId);
static void    syncNodeEqHeartbeatTimer(void* param, void* tmrId);
static int32_t syncNodeEqNoop(SSyncNode* ths);
static int32_t syncNodeAppendNoop(SSyncNode* ths);
46
static void    syncNodeEqPeerHeartbeatTimer(void* param, void* tmrId);
S
Shengliang Guan 已提交
47
static bool    syncIsConfigChanged(const SSyncCfg* pOldCfg, const SSyncCfg* pNewCfg);
S
Shengliang Guan 已提交
48 49 50
static int32_t syncHbTimerInit(SSyncNode* pSyncNode, SSyncTimer* pSyncTimer, SRaftId destId);
static int32_t syncHbTimerStart(SSyncNode* pSyncNode, SSyncTimer* pSyncTimer);
static int32_t syncHbTimerStop(SSyncNode* pSyncNode, SSyncTimer* pSyncTimer);
S
Shengliang Guan 已提交
51 52 53 54 55 56 57 58 59 60 61
static int32_t syncNodeUpdateNewConfigIndex(SSyncNode* ths, SSyncCfg* pNewCfg);
static bool    syncNodeInConfig(SSyncNode* pSyncNode, const SSyncCfg* config);
static void    syncNodeDoConfigChange(SSyncNode* pSyncNode, SSyncCfg* newConfig, SyncIndex lastConfigChangeIndex);
static bool    syncNodeIsOptimizedOneReplica(SSyncNode* ths, SRpcMsg* pMsg);

static bool    syncNodeCanChange(SSyncNode* pSyncNode);
static int32_t syncNodeLeaderTransfer(SSyncNode* pSyncNode);
static int32_t syncNodeLeaderTransferTo(SSyncNode* pSyncNode, SNodeInfo newLeader);
static int32_t syncDoLeaderTransfer(SSyncNode* ths, SRpcMsg* pRpcMsg, SSyncRaftEntry* pEntry);

static ESyncStrategy syncNodeStrategy(SSyncNode* pSyncNode);
M
Minghao Li 已提交
62

63
// Opens a sync node from pSyncInfo, adds it via syncNodeAdd() and returns the
// resulting reference id. Returns -1 if the node cannot be opened or if
// syncNodeAdd() yields a negative id (the node is closed again in that case).
int64_t syncOpen(SSyncInfo* pSyncInfo) {
  SSyncNode* pSyncNode = syncNodeOpen(pSyncInfo);
  if (pSyncNode == NULL) {
    sError("vgId:%d, failed to open sync node", pSyncInfo->vgId);
    return -1;
  }

  const int64_t rid = syncNodeAdd(pSyncNode);
  pSyncNode->rid = rid;
  if (rid < 0) {
    syncNodeClose(pSyncNode);
    return -1;
  }

  // copy the timer settings and message callbacks supplied by the caller
  pSyncNode->msgcb = pSyncInfo->msgcb;
  pSyncNode->heartbeatTimerMS = pSyncInfo->heartbeatMs;
  pSyncNode->hbBaseLine = pSyncInfo->heartbeatMs;
  pSyncNode->electBaseLine = pSyncInfo->electMs;
  pSyncNode->pingTimerMS = pSyncInfo->pingMs;
  pSyncNode->pingBaseLine = pSyncInfo->pingMs;

  return rid;
}
M
Minghao Li 已提交
84

B
Benguang Zhao 已提交
85
int32_t syncStart(int64_t rid) {
S
Shengliang Guan 已提交
86
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
M
Minghao Li 已提交
87
  if (pSyncNode == NULL) {
B
Benguang Zhao 已提交
88 89 90 91 92
    sError("failed to acquire rid: %" PRId64 " of tsNodeReftId for pSyncNode", rid);
    return -1;
  }

  if (syncNodeRestore(pSyncNode) < 0) {
93
    sError("vgId:%d, failed to restore sync log buffer since %s", pSyncNode->vgId, terrstr());
94
    goto _err;
M
Minghao Li 已提交
95
  }
M
Minghao Li 已提交
96

B
Benguang Zhao 已提交
97 98 99 100
  if (syncNodeStart(pSyncNode) < 0) {
    sError("vgId:%d, failed to start sync node since %s", pSyncNode->vgId, terrstr());
    goto _err;
  }
M
Minghao Li 已提交
101

B
Benguang Zhao 已提交
102 103
  syncNodeRelease(pSyncNode);
  return 0;
M
Minghao Li 已提交
104

105 106 107
_err:
  syncNodeRelease(pSyncNode);
  return -1;
M
Minghao Li 已提交
108 109
}

M
Minghao Li 已提交
110
void syncStop(int64_t rid) {
S
Shengliang Guan 已提交
111
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
S
Shengliang Guan 已提交
112
  if (pSyncNode != NULL) {
113
    pSyncNode->isStart = false;
S
Shengliang Guan 已提交
114
    syncNodeRelease(pSyncNode);
S
Shengliang Guan 已提交
115
    syncNodeRemove(rid);
M
Minghao Li 已提交
116 117 118
  }
}

M
Minghao Li 已提交
119 120
// Runs syncNodePreClose() on the node referenced by rid, if it can be acquired.
void syncPreStop(int64_t rid) {
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
  if (pSyncNode == NULL) {
    return;
  }

  syncNodePreClose(pSyncNode);
  syncNodeRelease(pSyncNode);
}

S
Shengliang Guan 已提交
127 128 129
// A new config is accepted only if syncNodeInConfig() holds for this node and
// the replica count differs from the current one by at most one.
static bool syncNodeCheckNewConfig(SSyncNode* pSyncNode, const SSyncCfg* pCfg) {
  if (!syncNodeInConfig(pSyncNode, pCfg)) {
    return false;
  }

  int32_t replicaDelta = pCfg->replicaNum - pSyncNode->replicaNum;
  return replicaDelta >= -1 && replicaDelta <= 1;
}

S
Shengliang Guan 已提交
132
// Applies a new replica configuration to the sync node referenced by rid.
//
// Returns -1 if the node cannot be acquired, or if the new config is rejected
// by syncNodeCheckNewConfig() (terrno = TSDB_CODE_SYN_NEW_CONFIG_ERROR).
// Returns 0 otherwise. When the node is the leader, the heartbeat timers are
// stopped, re-initialised for every replica slot and restarted, and a
// replication round is triggered.
int32_t syncReconfig(int64_t rid, SSyncCfg* pNewCfg) {
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
  if (pSyncNode == NULL) return -1;

  if (!syncNodeCheckNewConfig(pSyncNode, pNewCfg)) {
    terrno = TSDB_CODE_SYN_NEW_CONFIG_ERROR;
    // log while the reference is still held; the node must not be touched after release
    sError("vgId:%d, failed to reconfig since invalid new config", pSyncNode->vgId);
    syncNodeRelease(pSyncNode);
    return -1;
  }

  syncNodeUpdateNewConfigIndex(pSyncNode, pNewCfg);
  syncNodeDoConfigChange(pSyncNode, pNewCfg, SYNC_INDEX_INVALID);

  if (pSyncNode->state == TAOS_SYNC_STATE_LEADER) {
    syncNodeStopHeartbeatTimer(pSyncNode);

    for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
      syncHbTimerInit(pSyncNode, &pSyncNode->peerHeartbeatTimerArr[i], pSyncNode->replicasId[i]);
    }

    syncNodeStartHeartbeatTimer(pSyncNode);
    syncNodeReplicate(pSyncNode);
  }

  syncNodeRelease(pSyncNode);
  return 0;
}
M
Minghao Li 已提交
160

S
Shengliang Guan 已提交
161 162 163 164
// Dispatches an incoming sync message to the handler matching pMsg->msgType
// and returns that handler's result. Returns -1 if the sync module is not
// initialised, the node cannot be acquired, or the message type is unknown.
int32_t syncProcessMsg(int64_t rid, SRpcMsg* pMsg) {
  if (!syncIsInit()) {
    return -1;
  }

  SSyncNode* pSyncNode = syncNodeAcquire(rid);
  if (pSyncNode == NULL) {
    return -1;
  }

  int32_t result = -1;
  switch (pMsg->msgType) {
    case TDMT_SYNC_HEARTBEAT:
      result = syncNodeOnHeartbeat(pSyncNode, pMsg);
      break;
    case TDMT_SYNC_HEARTBEAT_REPLY:
      result = syncNodeOnHeartbeatReply(pSyncNode, pMsg);
      break;
    case TDMT_SYNC_TIMEOUT:
      result = syncNodeOnTimeout(pSyncNode, pMsg);
      break;
    case TDMT_SYNC_CLIENT_REQUEST:
      result = syncNodeOnClientRequest(pSyncNode, pMsg, NULL);
      break;
    case TDMT_SYNC_REQUEST_VOTE:
      result = syncNodeOnRequestVote(pSyncNode, pMsg);
      break;
    case TDMT_SYNC_REQUEST_VOTE_REPLY:
      result = syncNodeOnRequestVoteReply(pSyncNode, pMsg);
      break;
    case TDMT_SYNC_APPEND_ENTRIES:
      result = syncNodeOnAppendEntries(pSyncNode, pMsg);
      break;
    case TDMT_SYNC_APPEND_ENTRIES_REPLY:
      result = syncNodeOnAppendEntriesReply(pSyncNode, pMsg);
      break;
    case TDMT_SYNC_SNAPSHOT_SEND:
      result = syncNodeOnSnapshot(pSyncNode, pMsg);
      break;
    case TDMT_SYNC_SNAPSHOT_RSP:
      result = syncNodeOnSnapshotReply(pSyncNode, pMsg);
      break;
    case TDMT_SYNC_LOCAL_CMD:
      result = syncNodeOnLocalCmd(pSyncNode, pMsg);
      break;
    default:
      sError("vgId:%d, failed to process msg:%p since invalid type:%s", pSyncNode->vgId, pMsg,
             TMSG_INFO(pMsg->msgType));
      result = -1;
      break;
  }

  syncNodeRelease(pSyncNode);
  return result;
}

S
Shengliang Guan 已提交
212
int32_t syncLeaderTransfer(int64_t rid) {
S
Shengliang Guan 已提交
213
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
S
Shengliang Guan 已提交
214
  if (pSyncNode == NULL) return -1;
215

S
Shengliang Guan 已提交
216
  int32_t ret = syncNodeLeaderTransfer(pSyncNode);
S
Shengliang Guan 已提交
217
  syncNodeRelease(pSyncNode);
218 219 220
  return ret;
}

M
Minghao Li 已提交
221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236
// Returns the smallest match index recorded for any peer, or
// SYNC_INDEX_INVALID when the node has no peers.
SyncIndex syncMinMatchIndex(SSyncNode* pSyncNode) {
  SyncIndex lowest = SYNC_INDEX_INVALID;

  for (int32_t peer = 0; peer < pSyncNode->peersNum; ++peer) {
    SyncIndex current = syncIndexMgrGetIndex(pSyncNode->pMatchIndex, &(pSyncNode->peersId[peer]));
    // the first peer seeds the minimum; later peers only lower it
    if (peer == 0 || current < lowest) {
      lowest = current;
    }
  }

  return lowest;
}

237
int32_t syncBeginSnapshot(int64_t rid, int64_t lastApplyIndex) {
S
Shengliang Guan 已提交
238
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
239
  if (pSyncNode == NULL) {
240
    sError("sync begin snapshot error");
241 242
    return -1;
  }
243

244 245
  int32_t code = 0;

M
Minghao Li 已提交
246
  if (syncNodeIsMnode(pSyncNode)) {
M
Minghao Li 已提交
247 248 249
    // mnode
    int64_t logRetention = SYNC_MNODE_LOG_RETENTION;

M
Minghao Li 已提交
250 251 252
    SyncIndex beginIndex = pSyncNode->pLogStore->syncLogBeginIndex(pSyncNode->pLogStore);
    SyncIndex endIndex = pSyncNode->pLogStore->syncLogEndIndex(pSyncNode->pLogStore);
    int64_t   logNum = endIndex - beginIndex;
M
Minghao Li 已提交
253 254 255
    bool      isEmpty = pSyncNode->pLogStore->syncLogIsEmpty(pSyncNode->pLogStore);

    if (isEmpty || (!isEmpty && logNum < logRetention)) {
S
Shengliang Guan 已提交
256 257
      sNTrace(pSyncNode, "new-snapshot-index:%" PRId64 ", log-num:%" PRId64 ", empty:%d, do not delete wal",
              lastApplyIndex, logNum, isEmpty);
S
Shengliang Guan 已提交
258
      syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
259 260 261
      return 0;
    }

M
Minghao Li 已提交
262 263 264
    goto _DEL_WAL;

  } else {
265 266 267 268 269 270 271 272 273 274 275 276
    lastApplyIndex -= SYNC_VNODE_LOG_RETENTION;

    SyncIndex beginIndex = pSyncNode->pLogStore->syncLogBeginIndex(pSyncNode->pLogStore);
    SyncIndex endIndex = pSyncNode->pLogStore->syncLogEndIndex(pSyncNode->pLogStore);
    bool      isEmpty = pSyncNode->pLogStore->syncLogIsEmpty(pSyncNode->pLogStore);

    if (isEmpty || !(lastApplyIndex >= beginIndex && lastApplyIndex <= endIndex)) {
      sNTrace(pSyncNode, "new-snapshot-index:%" PRId64 ", empty:%d, do not delete wal", lastApplyIndex, isEmpty);
      syncNodeRelease(pSyncNode);
      return 0;
    }

M
Minghao Li 已提交
277 278 279 280 281 282 283 284 285 286 287 288 289 290
    // vnode
    if (pSyncNode->replicaNum > 1) {
      // multi replicas

      if (pSyncNode->state == TAOS_SYNC_STATE_LEADER) {
        pSyncNode->minMatchIndex = syncMinMatchIndex(pSyncNode);

        for (int32_t i = 0; i < pSyncNode->peersNum; ++i) {
          int64_t matchIndex = syncIndexMgrGetIndex(pSyncNode->pMatchIndex, &(pSyncNode->peersId[i]));
          if (lastApplyIndex > matchIndex) {
            do {
              char     host[64];
              uint16_t port;
              syncUtilU642Addr(pSyncNode->peersId[i].addr, host, sizeof(host), &port);
S
Shengliang Guan 已提交
291 292 293 294
              sNTrace(pSyncNode,
                      "new-snapshot-index:%" PRId64 " is greater than match-index:%" PRId64
                      " of %s:%d, do not delete wal",
                      lastApplyIndex, matchIndex, host, port);
M
Minghao Li 已提交
295 296
            } while (0);

S
Shengliang Guan 已提交
297
            syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
298 299 300 301 302 303
            return 0;
          }
        }

      } else if (pSyncNode->state == TAOS_SYNC_STATE_FOLLOWER) {
        if (lastApplyIndex > pSyncNode->minMatchIndex) {
S
Shengliang Guan 已提交
304 305 306
          sNTrace(pSyncNode,
                  "new-snapshot-index:%" PRId64 " is greater than min-match-index:%" PRId64 ", do not delete wal",
                  lastApplyIndex, pSyncNode->minMatchIndex);
S
Shengliang Guan 已提交
307
          syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
308 309 310 311
          return 0;
        }

      } else if (pSyncNode->state == TAOS_SYNC_STATE_CANDIDATE) {
S
Shengliang Guan 已提交
312
        sNTrace(pSyncNode, "new-snapshot-index:%" PRId64 " candidate, do not delete wal", lastApplyIndex);
S
Shengliang Guan 已提交
313
        syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
314 315 316
        return 0;

      } else {
S
Shengliang Guan 已提交
317
        sNTrace(pSyncNode, "new-snapshot-index:%" PRId64 " unknown state, do not delete wal", lastApplyIndex);
S
Shengliang Guan 已提交
318
        syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
319 320 321 322 323 324 325 326 327
        return 0;
      }

      goto _DEL_WAL;

    } else {
      // one replica

      goto _DEL_WAL;
328 329 330
    }
  }

M
Minghao Li 已提交
331
_DEL_WAL:
332

M
Minghao Li 已提交
333
  do {
334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353
    SSyncLogStoreData* pData = pSyncNode->pLogStore->data;
    SyncIndex          snapshotVer = walGetSnapshotVer(pData->pWal);
    SyncIndex          walCommitVer = walGetCommittedVer(pData->pWal);
    SyncIndex          wallastVer = walGetLastVer(pData->pWal);
    if (lastApplyIndex <= walCommitVer) {
      SyncIndex snapshottingIndex = atomic_load_64(&pSyncNode->snapshottingIndex);

      if (snapshottingIndex == SYNC_INDEX_INVALID) {
        atomic_store_64(&pSyncNode->snapshottingIndex, lastApplyIndex);
        pSyncNode->snapshottingTime = taosGetTimestampMs();

        code = walBeginSnapshot(pData->pWal, lastApplyIndex);
        if (code == 0) {
          sNTrace(pSyncNode, "wal snapshot begin, index:%" PRId64 ", last apply index:%" PRId64,
                  pSyncNode->snapshottingIndex, lastApplyIndex);
        } else {
          sNError(pSyncNode, "wal snapshot begin error since:%s, index:%" PRId64 ", last apply index:%" PRId64,
                  terrstr(terrno), pSyncNode->snapshottingIndex, lastApplyIndex);
          atomic_store_64(&pSyncNode->snapshottingIndex, SYNC_INDEX_INVALID);
        }
354

M
Minghao Li 已提交
355
      } else {
356 357
        sNTrace(pSyncNode, "snapshotting for %" PRId64 ", do not delete wal for new-snapshot-index:%" PRId64,
                snapshottingIndex, lastApplyIndex);
M
Minghao Li 已提交
358
      }
359
    }
M
Minghao Li 已提交
360
  } while (0);
361

S
Shengliang Guan 已提交
362
  syncNodeRelease(pSyncNode);
363 364 365 366
  return code;
}

int32_t syncEndSnapshot(int64_t rid) {
S
Shengliang Guan 已提交
367
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
368
  if (pSyncNode == NULL) {
369
    sError("sync end snapshot error");
370 371 372
    return -1;
  }

373 374 375 376
  int32_t code = 0;
  if (atomic_load_64(&pSyncNode->snapshottingIndex) != SYNC_INDEX_INVALID) {
    SSyncLogStoreData* pData = pSyncNode->pLogStore->data;
    code = walEndSnapshot(pData->pWal);
M
Minghao Li 已提交
377
    if (code != 0) {
378
      sNError(pSyncNode, "wal snapshot end error since:%s", terrstr());
S
Shengliang Guan 已提交
379
      syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
380 381
      return -1;
    } else {
S
Shengliang Guan 已提交
382
      sNTrace(pSyncNode, "wal snapshot end, index:%" PRId64, atomic_load_64(&pSyncNode->snapshottingIndex));
M
Minghao Li 已提交
383 384
      atomic_store_64(&pSyncNode->snapshottingIndex, SYNC_INDEX_INVALID);
    }
385
  }
386

S
Shengliang Guan 已提交
387
  syncNodeRelease(pSyncNode);
388 389 390
  return code;
}

M
Minghao Li 已提交
391
// Rid-based wrapper around syncNodeStepDown(). Returns -1 if the node cannot
// be acquired, 0 otherwise.
int32_t syncStepDown(int64_t rid, SyncTerm newTerm) {
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
  if (NULL == pSyncNode) {
    sError("sync step down error");
    return -1;
  }

  syncNodeStepDown(pSyncNode, newTerm);

  syncNodeRelease(pSyncNode);
  return 0;
}

403
// Reports whether this node can serve reads.
//
// Returns false with terrno set to:
//   TSDB_CODE_SYN_INTERNAL_ERROR  when pSyncNode is NULL,
//   TSDB_CODE_SYN_NOT_LEADER      when the node is not the leader,
//   TSDB_CODE_SYN_RESTORING       when restore is unfinished and the checks below fail.
// A leader whose restoreFinish flag is set is ready immediately. Otherwise it
// is ready only if the apply queue is empty and the last log entry is a no-op
// written in the current term.
bool syncNodeIsReadyForRead(SSyncNode* pSyncNode) {
  if (pSyncNode == NULL) {
    terrno = TSDB_CODE_SYN_INTERNAL_ERROR;
    sError("sync ready for read error");
    return false;
  }

  if (pSyncNode->state != TAOS_SYNC_STATE_LEADER) {
    terrno = TSDB_CODE_SYN_NOT_LEADER;
    return false;
  }

  if (pSyncNode->restoreFinish) {
    return true;
  }

  bool ready = false;

  // the log is only inspected once the apply queue has drained
  if (pSyncNode->pFsm->FpApplyQueueEmptyCb(pSyncNode->pFsm) &&
      !pSyncNode->pLogStore->syncLogIsEmpty(pSyncNode->pLogStore)) {
    SyncIndex       lastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);
    SLRUCache*      pCache = pSyncNode->pLogStore->pCache;
    LRUHandle*      hCached = taosLRUCacheLookup(pCache, &lastIndex, sizeof(lastIndex));
    SSyncRaftEntry* pEntry = NULL;
    int32_t         code = 0;

    if (hCached != NULL) {
      pEntry = (SSyncRaftEntry*)taosLRUCacheValue(pCache, hCached);

      pSyncNode->pLogStore->cacheHit++;
      sNTrace(pSyncNode, "hit cache index:%" PRId64 ", bytes:%u, %p", lastIndex, pEntry->bytes, pEntry);
    } else {
      pSyncNode->pLogStore->cacheMiss++;
      sNTrace(pSyncNode, "miss cache index:%" PRId64, lastIndex);

      code = pSyncNode->pLogStore->syncLogGetEntry(pSyncNode->pLogStore, lastIndex, &pEntry);
    }

    if (code == 0 && pEntry != NULL) {
      ready = (pEntry->originalRpcType == TDMT_SYNC_NOOP) && (pEntry->term == pSyncNode->pRaftStore->currentTerm);

      // give the entry back the same way it was obtained
      if (hCached != NULL) {
        taosLRUCacheRelease(pCache, hCached, false);
      } else {
        syncEntryDestroy(pEntry);
      }
    }
  }

  if (!ready) {
    terrno = TSDB_CODE_SYN_RESTORING;
  }

  return ready;
}

// Rid-based wrapper around syncNodeIsReadyForRead(). Returns false if the
// node cannot be acquired.
bool syncIsReadyForRead(int64_t rid) {
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
  if (NULL == pSyncNode) {
    sError("sync ready for read error");
    return false;
  }

  bool isReady = syncNodeIsReadyForRead(pSyncNode);
  syncNodeRelease(pSyncNode);

  return isReady;
}
M
Minghao Li 已提交
478

479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500
// Rid-based wrapper around syncNodeSnapshotSending(). Returns false if the
// node cannot be acquired.
bool syncSnapshotSending(int64_t rid) {
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
  if (NULL == pSyncNode) return false;

  bool sending = syncNodeSnapshotSending(pSyncNode);

  syncNodeRelease(pSyncNode);
  return sending;
}

// Rid-based wrapper around syncNodeSnapshotRecving(). Returns false if the
// node cannot be acquired.
bool syncSnapshotRecving(int64_t rid) {
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
  if (NULL == pSyncNode) return false;

  bool receiving = syncNodeSnapshotRecving(pSyncNode);

  syncNodeRelease(pSyncNode);
  return receiving;
}

M
Minghao Li 已提交
501 502
// Picks a peer and proposes a leader transfer to it.
//
// Returns -1 with terrno = TSDB_CODE_SYN_ONE_REPLICA when there are no peers.
// Returns 0 without doing anything when this node is not the leader or has a
// single replica. Otherwise returns the result of syncNodeLeaderTransferTo().
// The target is peer 0, except that with exactly two peers peer 1 is chosen
// when its match index is strictly greater.
int32_t syncNodeLeaderTransfer(SSyncNode* pSyncNode) {
  if (pSyncNode->peersNum == 0) {
    sDebug("vgId:%d, only one replica, cannot leader transfer", pSyncNode->vgId);
    terrno = TSDB_CODE_SYN_ONE_REPLICA;
    return -1;
  }

  if (pSyncNode->state != TAOS_SYNC_STATE_LEADER || pSyncNode->replicaNum <= 1) {
    return 0;
  }

  SNodeInfo target = (pSyncNode->peersNodeInfo)[0];
  if (pSyncNode->peersNum == 2) {
    SyncIndex firstMatch = syncIndexMgrGetIndex(pSyncNode->pMatchIndex, &(pSyncNode->peersId[0]));
    SyncIndex secondMatch = syncIndexMgrGetIndex(pSyncNode->pMatchIndex, &(pSyncNode->peersId[1]));
    if (secondMatch > firstMatch) {
      target = (pSyncNode->peersNodeInfo)[1];
    }
  }

  return syncNodeLeaderTransferTo(pSyncNode, target);
}

M
Minghao Li 已提交
524 525
// Builds a leader-transfer message naming newLeader and proposes it through
// syncNodePropose().
//
// Returns -1 with terrno = TSDB_CODE_SYN_ONE_REPLICA for a single-replica
// group, -1 if the message cannot be built, otherwise the result of
// syncNodePropose(). The message content is freed here in all cases.
int32_t syncNodeLeaderTransferTo(SSyncNode* pSyncNode, SNodeInfo newLeader) {
  if (pSyncNode->replicaNum == 1) {
    sDebug("vgId:%d, only one replica, cannot leader transfer", pSyncNode->vgId);
    terrno = TSDB_CODE_SYN_ONE_REPLICA;
    return -1;
  }

  sNTrace(pSyncNode, "begin leader transfer to %s:%u", newLeader.nodeFqdn, newLeader.nodePort);

  SRpcMsg rpcMsg = {0};
  // the message body is written to below, so a failed build must not be ignored
  if (syncBuildLeaderTransfer(&rpcMsg, pSyncNode->vgId) != 0 || rpcMsg.pCont == NULL) {
    sError("vgId:%d, failed to build leader transfer msg", pSyncNode->vgId);
    if (rpcMsg.pCont != NULL) {
      rpcFreeCont(rpcMsg.pCont);
    }
    return -1;
  }

  SyncLeaderTransfer* pMsg = rpcMsg.pCont;
  pMsg->newLeaderId.addr = syncUtilAddr2U64(newLeader.nodeFqdn, newLeader.nodePort);
  pMsg->newLeaderId.vgId = pSyncNode->vgId;
  pMsg->newNodeInfo = newLeader;

  int32_t ret = syncNodePropose(pSyncNode, &rpcMsg, false);
  rpcFreeCont(rpcMsg.pCont);
  return ret;
}

546 547
SSyncState syncGetState(int64_t rid) {
  SSyncState state = {.state = TAOS_SYNC_STATE_ERROR};
M
Minghao Li 已提交
548

S
Shengliang Guan 已提交
549
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
550 551 552
  if (pSyncNode != NULL) {
    state.state = pSyncNode->state;
    state.restored = pSyncNode->restoreFinish;
553 554 555 556 557
    if (pSyncNode->vgId != 1) {
      state.canRead = syncNodeIsReadyForRead(pSyncNode);
    } else {
      state.canRead = state.restored;
    }
558
    syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
559 560
  }

561
  return state;
M
Minghao Li 已提交
562 563
}

564
#if 0
// NOTE: everything up to the matching #endif is compiled out.

// Fills pSnapshot with the term of the log entry at `index` and the config
// index in effect at that point. Returns -1 if index is below
// SYNC_INDEX_BEGIN, the node cannot be acquired, or the entry cannot be read.
int32_t syncGetSnapshotByIndex(int64_t rid, SyncIndex index, SSnapshot* pSnapshot) {
  if (index < SYNC_INDEX_BEGIN) {
    return -1;
  }

  SSyncNode* pSyncNode = syncNodeAcquire(rid);
  if (pSyncNode == NULL) {
    return -1;
  }
  ASSERT(rid == pSyncNode->rid);

  SSyncRaftEntry* pEntry = NULL;
  if (pSyncNode->pLogStore->syncLogGetEntry(pSyncNode->pLogStore, index, &pEntry) != 0) {
    if (pEntry != NULL) {
      syncEntryDestroy(pEntry);
    }
    syncNodeRelease(pSyncNode);
    return -1;
  }
  ASSERT(pEntry != NULL);

  pSnapshot->data = NULL;
  pSnapshot->lastApplyIndex = index;
  pSnapshot->lastApplyTerm = pEntry->term;
  pSnapshot->lastConfigIndex = syncNodeGetSnapshotConfigIndex(pSyncNode, index);

  syncEntryDestroy(pEntry);
  syncNodeRelease(pSyncNode);
  return 0;
}

// Copies the node's last config index into sMeta. Returns -1 if the node
// cannot be acquired.
int32_t syncGetSnapshotMeta(int64_t rid, struct SSnapshotMeta* sMeta) {
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
  if (pSyncNode == NULL) {
    return -1;
  }
  ASSERT(rid == pSyncNode->rid);

  sMeta->lastConfigIndex = pSyncNode->pRaftCfg->lastConfigIndex;
  sTrace("vgId:%d, get snapshot meta, lastConfigIndex:%" PRId64, pSyncNode->vgId, pSyncNode->pRaftCfg->lastConfigIndex);

  syncNodeRelease(pSyncNode);
  return 0;
}

// Stores in sMeta the largest recorded config index that does not exceed
// snapshotIndex (falling back to the first recorded one). Returns -1 if the
// node cannot be acquired.
int32_t syncGetSnapshotMetaByIndex(int64_t rid, SyncIndex snapshotIndex, struct SSnapshotMeta* sMeta) {
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
  if (pSyncNode == NULL) {
    return -1;
  }
  ASSERT(rid == pSyncNode->rid);

  ASSERT(pSyncNode->pRaftCfg->configIndexCount >= 1);
  SyncIndex best = (pSyncNode->pRaftCfg->configIndexArr)[0];

  for (int32_t pos = 0; pos < pSyncNode->pRaftCfg->configIndexCount; ++pos) {
    SyncIndex candidate = (pSyncNode->pRaftCfg->configIndexArr)[pos];
    if (candidate > best && candidate <= snapshotIndex) {
      best = candidate;
    }
  }
  sMeta->lastConfigIndex = best;
  sTrace("vgId:%d, get snapshot meta by index:%" PRId64 " lcindex:%" PRId64, pSyncNode->vgId, snapshotIndex,
         sMeta->lastConfigIndex);

  syncNodeRelease(pSyncNode);
  return 0;
}
#endif
635

636 637 638 639
// Returns the largest recorded config index that does not exceed
// snapshotLastApplyIndex; when none qualifies, the first recorded config
// index is returned. Asserts that at least one config index is recorded.
SyncIndex syncNodeGetSnapshotConfigIndex(SSyncNode* pSyncNode, SyncIndex snapshotLastApplyIndex) {
  ASSERT(pSyncNode->pRaftCfg->configIndexCount >= 1);

  SyncIndex best = (pSyncNode->pRaftCfg->configIndexArr)[0];
  for (int32_t pos = 0; pos < pSyncNode->pRaftCfg->configIndexCount; ++pos) {
    SyncIndex candidate = (pSyncNode->pRaftCfg->configIndexArr)[pos];
    if (candidate > best && candidate <= snapshotLastApplyIndex) {
      best = candidate;
    }
  }

  sTrace("vgId:%d, sync get last config index, index:%" PRId64 " lcindex:%" PRId64, pSyncNode->vgId,
         snapshotLastApplyIndex, best);

  return best;
}

652 653
// Fills pEpSet with the fqdn/port of every replica in the node's current
// config and points inUse at the entry following this node's own index.
// pEpSet->numOfEps is left at 0 if rid cannot be acquired.
void syncGetRetryEpSet(int64_t rid, SEpSet* pEpSet) {
  pEpSet->numOfEps = 0;

  SSyncNode* pSyncNode = syncNodeAcquire(rid);
  if (pSyncNode == NULL) {
    return;
  }

  int32_t idx = 0;
  while (idx < pSyncNode->pRaftCfg->cfg.replicaNum) {
    SEp* pEp = &pEpSet->eps[idx];
    tstrncpy(pEp->fqdn, pSyncNode->pRaftCfg->cfg.nodeInfo[idx].nodeFqdn, TSDB_FQDN_LEN);
    pEp->port = (pSyncNode->pRaftCfg->cfg.nodeInfo)[idx].nodePort;
    pEpSet->numOfEps++;
    sDebug("vgId:%d, sync get retry epset, index:%d %s:%d", pSyncNode->vgId, idx, pEp->fqdn, pEp->port);
    ++idx;
  }

  // guard the modulo against an empty endpoint set
  if (pEpSet->numOfEps > 0) {
    pEpSet->inUse = (pSyncNode->pRaftCfg->cfg.myIndex + 1) % pEpSet->numOfEps;
  }

  sInfo("vgId:%d, sync get retry epset numOfEps:%d inUse:%d", pSyncNode->vgId, pEpSet->numOfEps, pEpSet->inUse);
  syncNodeRelease(pSyncNode);
}

M
Minghao Li 已提交
673
// Rid-based wrapper around syncNodePropose(). Returns -1 if the node cannot
// be acquired, otherwise the wrapped call's result.
int32_t syncPropose(int64_t rid, SRpcMsg* pMsg, bool isWeak) {
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
  if (NULL == pSyncNode) {
    sError("sync propose error");
    return -1;
  }

  int32_t result = syncNodePropose(pSyncNode, pMsg, isWeak);

  syncNodeRelease(pSyncNode);
  return result;
}
M
Minghao Li 已提交
684

685
// Proposes pMsg on this node.
//
// Returns -1 with terrno set when:
//   - the node is not the leader (TSDB_CODE_SYN_NOT_LEADER);
//   - restore has not finished and vgId != 1 (TSDB_CODE_SYN_PROPOSE_NOT_READY);
//   - syncNodeHeartbeatReplyTimeout() reports a timeout (TSDB_CODE_SYN_PROPOSE_NOT_READY);
//   - the optimized one-replica path fails (TSDB_CODE_SYN_INTERNAL_ERROR).
// Returns 1 when the optimized one-replica path handled the message directly;
// pMsg->info.conn.applyIndex/applyTerm are filled in that case.
// Otherwise the message is wrapped into a client request and enqueued through
// syncEqMsg; the return value is -1 if wrapping fails, else the enqueue result.
int32_t syncNodePropose(SSyncNode* pSyncNode, SRpcMsg* pMsg, bool isWeak) {
  if (pSyncNode->state != TAOS_SYNC_STATE_LEADER) {
    terrno = TSDB_CODE_SYN_NOT_LEADER;
    sNError(pSyncNode, "sync propose not leader, %s, type:%s", syncStr(pSyncNode->state), TMSG_INFO(pMsg->msgType));
    return -1;
  }

  // not restored, vnode enable
  if (!pSyncNode->restoreFinish && pSyncNode->vgId != 1) {
    terrno = TSDB_CODE_SYN_PROPOSE_NOT_READY;
    sNError(pSyncNode, "failed to sync propose since not ready, type:%s, last:%" PRId64 ", cmt:%" PRId64,
            TMSG_INFO(pMsg->msgType), syncNodeGetLastIndex(pSyncNode), pSyncNode->commitIndex);
    return -1;
  }

  // heartbeat timeout
  if (syncNodeHeartbeatReplyTimeout(pSyncNode)) {
    terrno = TSDB_CODE_SYN_PROPOSE_NOT_READY;
    sNError(pSyncNode, "failed to sync propose since hearbeat timeout, type:%s, last:%" PRId64 ", cmt:%" PRId64,
            TMSG_INFO(pMsg->msgType), syncNodeGetLastIndex(pSyncNode), pSyncNode->commitIndex);
    return -1;
  }

  // optimized one replica
  if (syncNodeIsOptimizedOneReplica(pSyncNode, pMsg)) {
    // Start from a defined value: the failure branch logs retIndex, and the
    // callee may fail before writing it.
    SyncIndex retIndex = -1;
    int32_t   code = syncNodeOnClientRequest(pSyncNode, pMsg, &retIndex);
    if (code == 0) {
      pMsg->info.conn.applyIndex = retIndex;
      pMsg->info.conn.applyTerm = pSyncNode->pRaftStore->currentTerm;
      sTrace("vgId:%d, propose optimized msg, index:%" PRId64 " type:%s", pSyncNode->vgId, retIndex,
             TMSG_INFO(pMsg->msgType));
      return 1;
    } else {
      terrno = TSDB_CODE_SYN_INTERNAL_ERROR;
      sError("vgId:%d, failed to propose optimized msg, index:%" PRId64 " type:%s", pSyncNode->vgId, retIndex,
             TMSG_INFO(pMsg->msgType));
      return -1;
    }
  } else {
    // register a response stub first so the sequence number can travel with the request
    SRespStub stub = {.createTime = taosGetTimestampMs(), .rpcMsg = *pMsg};
    uint64_t  seqNum = syncRespMgrAdd(pSyncNode->pSyncRespMgr, &stub);
    SRpcMsg   rpcMsg = {0};
    int32_t   code = syncBuildClientRequest(&rpcMsg, pMsg, seqNum, isWeak, pSyncNode->vgId);
    if (code != 0) {
      sError("vgId:%d, failed to propose msg while serialize since %s", pSyncNode->vgId, terrstr());
      (void)syncRespMgrDel(pSyncNode->pSyncRespMgr, seqNum);
      return -1;
    }

    sNTrace(pSyncNode, "propose msg, type:%s", TMSG_INFO(pMsg->msgType));
    code = (*pSyncNode->syncEqMsg)(pSyncNode->msgcb, &rpcMsg);
    if (code != 0) {
      sError("vgId:%d, failed to propose msg while enqueue since %s", pSyncNode->vgId, terrstr());
      // the stub is useless once the request could not be enqueued
      (void)syncRespMgrDel(pSyncNode->pSyncRespMgr, seqNum);
    }

    return code;
  }
}

S
Shengliang Guan 已提交
746
// Resets a per-peer heartbeat timer: no underlying timer, zero counter and
// logic clock, interval taken from the node's hbBaseLine, callback set to
// syncNodeEqPeerHeartbeatTimer, and the current time as timestamp.
// Always returns 0.
static int32_t syncHbTimerInit(SSyncNode* pSyncNode, SSyncTimer* pSyncTimer, SRaftId destId) {
  pSyncTimer->destId = destId;
  pSyncTimer->timerCb = syncNodeEqPeerHeartbeatTimer;
  pSyncTimer->timerMS = pSyncNode->hbBaseLine;
  pSyncTimer->counter = 0;
  pSyncTimer->pTimer = NULL;

  pSyncTimer->timeStamp = taosGetTimestampMs();
  atomic_store_64(&pSyncTimer->logicClock, 0);

  return 0;
}

S
Shengliang Guan 已提交
757
// Arm (or re-arm) the heartbeat timer of one peer.
//   pSyncNode  - owning node; its rid is stored in the timer data
//   pSyncTimer - per-peer timer slot to start
// The timer data record is looked up by pSyncTimer->hbDataRid and created on demand.
// Returns 0 on success and also when the sync env is not initialized (only an error is
// logged in that case); returns -1 with terrno set if the timer data cannot be allocated.
static int32_t syncHbTimerStart(SSyncNode* pSyncNode, SSyncTimer* pSyncTimer) {
  int32_t ret = 0;
  int64_t tsNow = taosGetTimestampMs();
  if (syncIsInit()) {
    SSyncHbTimerData* pData = syncHbTimerDataAcquire(pSyncTimer->hbDataRid);
    if (pData == NULL) {
      pData = taosMemoryMalloc(sizeof(SSyncHbTimerData));
      if (pData == NULL) {
        // out of memory: bail out instead of dereferencing NULL below
        terrno = TSDB_CODE_OUT_OF_MEMORY;
        sError("vgId:%d, failed to start hb timer since %s", pSyncNode->vgId, terrstr());
        return -1;
      }
      pData->rid = syncHbTimerDataAdd(pData);
    }
    pSyncTimer->hbDataRid = pData->rid;
    pSyncTimer->timeStamp = tsNow;

    pData->syncNodeRid = pSyncNode->rid;
    pData->pTimer = pSyncTimer;
    pData->destId = pSyncTimer->destId;
    pData->logicClock = pSyncTimer->logicClock;
    pData->execTime = tsNow + pSyncTimer->timerMS;

    // the timer ticks HEARTBEAT_TICK_NUM times per heartbeat interval; the rid (not the
    // pointer) of the timer data is handed to the callback as its parameter
    taosTmrReset(pSyncTimer->timerCb, pSyncTimer->timerMS / HEARTBEAT_TICK_NUM, (void*)(pData->rid),
                 syncEnv()->pTimerManager, &pSyncTimer->pTimer);
  } else {
    sError("vgId:%d, start ctrl hb timer error, sync env is stop", pSyncNode->vgId);
  }
  return ret;
}

S
Shengliang Guan 已提交
783
// Stop the heartbeat timer of one peer and drop its timer data record.
// pSyncNode is not used. Always returns 0.
static int32_t syncHbTimerStop(SSyncNode* pSyncNode, SSyncTimer* pSyncTimer) {
  // advance the logic clock before stopping the underlying timer
  atomic_add_fetch_64(&pSyncTimer->logicClock, 1);

  taosTmrStop(pSyncTimer->pTimer);
  pSyncTimer->pTimer = NULL;

  // release the timer data and mark the slot as having none
  syncHbTimerDataRemove(pSyncTimer->hbDataRid);
  pSyncTimer->hbDataRid = -1;

  return 0;
}

793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814
// Re-base the log store on the FSM snapshot when the stored log range does not
// connect to the snapshot's last applied index.
// Requires pLogStore, pFsm and pFsm->FpGetSnapshotInfo to be set (asserted).
// Returns 0 if nothing had to be done or the restore succeeded, -1 on failure.
int32_t syncNodeLogStoreRestoreOnNeed(SSyncNode* pNode) {
  ASSERT(pNode->pLogStore != NULL && "log store not created");
  ASSERT(pNode->pFsm != NULL && "pFsm not registered");
  ASSERT(pNode->pFsm->FpGetSnapshotInfo != NULL && "FpGetSnapshotInfo not registered");

  SSnapshot snapshot;
  if (pNode->pFsm->FpGetSnapshotInfo(pNode->pFsm, &snapshot) < 0) {
    sError("vgId:%d, failed to get snapshot info since %s", pNode->vgId, terrstr());
    return -1;
  }

  SyncIndex snapshotVer = snapshot.lastApplyIndex;
  SyncIndex firstVer = pNode->pLogStore->syncLogBeginIndex(pNode->pLogStore);
  SyncIndex lastVer = pNode->pLogStore->syncLogLastIndex(pNode->pLogStore);

  // the log is usable as-is when it reaches at least the snapshot index and
  // starts no later than the entry right after it
  bool logConnectsToSnapshot = (lastVer >= snapshotVer) && (firstVer <= snapshotVer + 1);
  if (logConnectsToSnapshot) {
    return 0;
  }

  if (pNode->pLogStore->syncLogRestoreFromSnapshot(pNode->pLogStore, snapshotVer) != 0) {
    sError("vgId:%d, failed to restore log store from snapshot since %s. lastVer: %" PRId64 ", snapshotVer: %" PRId64,
           pNode->vgId, terrstr(), lastVer, snapshotVer);
    return -1;
  }

  return 0;
}

M
Minghao Li 已提交
815
// open/close --------------
S
Shengliang Guan 已提交
816 817
// Allocate and initialize a sync node described by pSyncInfo.
//
// Steps, in order: make sure the sync directory and raft_config.json exist (creating the
// file from pSyncInfo->syncCfg when missing, otherwise reconciling the input config with
// the persisted one), copy callbacks and paths, create the log buffer, open the raft
// config and raft store, build peer/replica ids, create vote/index managers and the log
// store, seed commitIndex from the FSM snapshot, initialize all timers' bookkeeping,
// create snapshot senders/receiver and finally initialize the log buffer.
// No timer is started here; that is done by syncNodeStart / syncNodeStartStandBy.
//
// Ownership: on success pSyncInfo->pFsm is moved into the node (pSyncInfo->pFsm is set
// to NULL). On failure pSyncInfo->pFsm, if still set, is freed and the partially built
// node is released through syncNodeClose.
//
// Returns the new node, or NULL on any failure.
SSyncNode* syncNodeOpen(SSyncInfo* pSyncInfo) {
  SSyncNode* pSyncNode = taosMemoryCalloc(1, sizeof(SSyncNode));
  if (pSyncNode == NULL) {
    terrno = TSDB_CODE_OUT_OF_MEMORY;
    goto _error;
  }

  // set the vgId first so that every log line below reports the real vgroup id
  pSyncNode->vgId = pSyncInfo->vgId;

  if (!taosDirExist((char*)(pSyncInfo->path))) {
    if (taosMkDir(pSyncInfo->path) != 0) {
      terrno = TAOS_SYSTEM_ERROR(errno);
      sError("failed to create dir:%s since %s", pSyncInfo->path, terrstr());
      goto _error;
    }
  }

  snprintf(pSyncNode->configPath, sizeof(pSyncNode->configPath), "%s%sraft_config.json", pSyncInfo->path, TD_DIRSEP);
  if (!taosCheckExistFile(pSyncNode->configPath)) {
    // create a new raft config file
    SRaftCfgMeta meta = {0};
    meta.isStandBy = pSyncInfo->isStandBy;
    meta.snapshotStrategy = pSyncInfo->snapshotStrategy;
    meta.lastConfigIndex = SYNC_INDEX_INVALID;
    meta.batchSize = pSyncInfo->batchSize;
    if (raftCfgCreateFile(&pSyncInfo->syncCfg, meta, pSyncNode->configPath) != 0) {
      sError("vgId:%d, failed to create raft cfg file at %s", pSyncNode->vgId, pSyncNode->configPath);
      goto _error;
    }
    if (pSyncInfo->syncCfg.replicaNum == 0) {
      sInfo("vgId:%d, sync config not input", pSyncNode->vgId);
      // pRaftCfg has not been opened yet at this point, so the config must be read back
      // from the file that was just created before it can be copied into pSyncInfo
      pSyncNode->pRaftCfg = raftCfgOpen(pSyncNode->configPath);
      if (pSyncNode->pRaftCfg == NULL) {
        sError("vgId:%d, failed to open raft cfg file at %s", pSyncNode->vgId, pSyncNode->configPath);
        goto _error;
      }
      pSyncInfo->syncCfg = pSyncNode->pRaftCfg->cfg;
      raftCfgClose(pSyncNode->pRaftCfg);
      pSyncNode->pRaftCfg = NULL;
    }
  } else {
    // update syncCfg by raft_config.json
    pSyncNode->pRaftCfg = raftCfgOpen(pSyncNode->configPath);
    if (pSyncNode->pRaftCfg == NULL) {
      sError("vgId:%d, failed to open raft cfg file at %s", pSyncNode->vgId, pSyncNode->configPath);
      goto _error;
    }

    if (pSyncInfo->syncCfg.replicaNum > 0 && syncIsConfigChanged(&pSyncNode->pRaftCfg->cfg, &pSyncInfo->syncCfg)) {
      sInfo("vgId:%d, use sync config from input options and write to cfg file", pSyncNode->vgId);
      pSyncNode->pRaftCfg->cfg = pSyncInfo->syncCfg;
      if (raftCfgPersist(pSyncNode->pRaftCfg) != 0) {
        sError("vgId:%d, failed to persist raft cfg file at %s", pSyncNode->vgId, pSyncNode->configPath);
        goto _error;
      }
    } else {
      sInfo("vgId:%d, use sync config from raft cfg file", pSyncNode->vgId);
      pSyncInfo->syncCfg = pSyncNode->pRaftCfg->cfg;
    }

    // the config is re-opened below once the rest of the node is set up
    raftCfgClose(pSyncNode->pRaftCfg);
    pSyncNode->pRaftCfg = NULL;
  }

  // init by SSyncInfo
  SSyncCfg* pCfg = &pSyncInfo->syncCfg;
  sDebug("vgId:%d, replica:%d selfIndex:%d", pSyncNode->vgId, pCfg->replicaNum, pCfg->myIndex);
  for (int32_t i = 0; i < pCfg->replicaNum; ++i) {
    SNodeInfo* pNode = &pCfg->nodeInfo[i];
    sDebug("vgId:%d, index:%d ep:%s:%u", pSyncNode->vgId, i, pNode->nodeFqdn, pNode->nodePort);
  }

  memcpy(pSyncNode->path, pSyncInfo->path, sizeof(pSyncNode->path));
  snprintf(pSyncNode->raftStorePath, sizeof(pSyncNode->raftStorePath), "%s%sraft_store.json", pSyncInfo->path,
           TD_DIRSEP);
  snprintf(pSyncNode->configPath, sizeof(pSyncNode->configPath), "%s%sraft_config.json", pSyncInfo->path, TD_DIRSEP);

  pSyncNode->pWal = pSyncInfo->pWal;
  pSyncNode->msgcb = pSyncInfo->msgcb;
  pSyncNode->syncSendMSg = pSyncInfo->syncSendMSg;
  pSyncNode->syncEqMsg = pSyncInfo->syncEqMsg;
  pSyncNode->syncEqCtrlMsg = pSyncInfo->syncEqCtrlMsg;

  // create raft log ring buffer
  pSyncNode->pLogBuf = syncLogBufferCreate();
  if (pSyncNode->pLogBuf == NULL) {
    sError("failed to init sync log buffer since %s. vgId:%d", terrstr(), pSyncNode->vgId);
    goto _error;
  }

  // init raft config
  pSyncNode->pRaftCfg = raftCfgOpen(pSyncNode->configPath);
  if (pSyncNode->pRaftCfg == NULL) {
    sError("vgId:%d, failed to open raft cfg file at %s", pSyncNode->vgId, pSyncNode->configPath);
    goto _error;
  }

  // init internal
  pSyncNode->myNodeInfo = pSyncNode->pRaftCfg->cfg.nodeInfo[pSyncNode->pRaftCfg->cfg.myIndex];
  if (!syncUtilNodeInfo2RaftId(&pSyncNode->myNodeInfo, pSyncNode->vgId, &pSyncNode->myRaftId)) {
    sError("vgId:%d, failed to determine my raft member id", pSyncNode->vgId);
    goto _error;
  }

  // init peersNum, peers, peersId
  pSyncNode->peersNum = pSyncNode->pRaftCfg->cfg.replicaNum - 1;
  int32_t j = 0;
  for (int32_t i = 0; i < pSyncNode->pRaftCfg->cfg.replicaNum; ++i) {
    if (i != pSyncNode->pRaftCfg->cfg.myIndex) {
      pSyncNode->peersNodeInfo[j] = pSyncNode->pRaftCfg->cfg.nodeInfo[i];
      j++;
    }
  }
  for (int32_t i = 0; i < pSyncNode->peersNum; ++i) {
    if (!syncUtilNodeInfo2RaftId(&pSyncNode->peersNodeInfo[i], pSyncNode->vgId, &pSyncNode->peersId[i])) {
      sError("vgId:%d, failed to determine raft member id, peer:%d", pSyncNode->vgId, i);
      goto _error;
    }
  }

  // init replicaNum, replicasId
  pSyncNode->replicaNum = pSyncNode->pRaftCfg->cfg.replicaNum;
  for (int32_t i = 0; i < pSyncNode->pRaftCfg->cfg.replicaNum; ++i) {
    if (!syncUtilNodeInfo2RaftId(&pSyncNode->pRaftCfg->cfg.nodeInfo[i], pSyncNode->vgId, &pSyncNode->replicasId[i])) {
      sError("vgId:%d, failed to determine raft member id, replica:%d", pSyncNode->vgId, i);
      goto _error;
    }
  }

  // init raft algorithm
  // the FSM is moved into the node; from here on syncNodeClose is responsible for freeing it
  pSyncNode->pFsm = pSyncInfo->pFsm;
  pSyncInfo->pFsm = NULL;
  pSyncNode->quorum = syncUtilQuorum(pSyncNode->pRaftCfg->cfg.replicaNum);
  pSyncNode->leaderCache = EMPTY_RAFT_ID;

  // init life cycle outside

  // TLA+ Spec
  // InitHistoryVars == /\ elections = {}
  //                    /\ allLogs   = {}
  //                    /\ voterLog  = [i \in Server |-> [j \in {} |-> <<>>]]
  // InitServerVars == /\ currentTerm = [i \in Server |-> 1]
  //                   /\ state       = [i \in Server |-> Follower]
  //                   /\ votedFor    = [i \in Server |-> Nil]
  // InitCandidateVars == /\ votesResponded = [i \in Server |-> {}]
  //                      /\ votesGranted   = [i \in Server |-> {}]
  // \* The values nextIndex[i][i] and matchIndex[i][i] are never read, since the
  // \* leader does not send itself messages. It's still easier to include these
  // \* in the functions.
  // InitLeaderVars == /\ nextIndex  = [i \in Server |-> [j \in Server |-> 1]]
  //                   /\ matchIndex = [i \in Server |-> [j \in Server |-> 0]]
  // InitLogVars == /\ log          = [i \in Server |-> << >>]
  //                /\ commitIndex  = [i \in Server |-> 0]
  // Init == /\ messages = [m \in {} |-> 0]
  //         /\ InitHistoryVars
  //         /\ InitServerVars
  //         /\ InitCandidateVars
  //         /\ InitLeaderVars
  //         /\ InitLogVars
  //

  // init TLA+ server vars
  pSyncNode->state = TAOS_SYNC_STATE_FOLLOWER;
  pSyncNode->pRaftStore = raftStoreOpen(pSyncNode->raftStorePath);
  if (pSyncNode->pRaftStore == NULL) {
    sError("vgId:%d, failed to open raft store at path %s", pSyncNode->vgId, pSyncNode->raftStorePath);
    goto _error;
  }

  // init TLA+ candidate vars
  pSyncNode->pVotesGranted = voteGrantedCreate(pSyncNode);
  if (pSyncNode->pVotesGranted == NULL) {
    sError("vgId:%d, failed to create VotesGranted", pSyncNode->vgId);
    goto _error;
  }
  pSyncNode->pVotesRespond = votesRespondCreate(pSyncNode);
  if (pSyncNode->pVotesRespond == NULL) {
    sError("vgId:%d, failed to create VotesRespond", pSyncNode->vgId);
    goto _error;
  }

  // init TLA+ leader vars
  pSyncNode->pNextIndex = syncIndexMgrCreate(pSyncNode);
  if (pSyncNode->pNextIndex == NULL) {
    sError("vgId:%d, failed to create SyncIndexMgr", pSyncNode->vgId);
    goto _error;
  }
  pSyncNode->pMatchIndex = syncIndexMgrCreate(pSyncNode);
  if (pSyncNode->pMatchIndex == NULL) {
    sError("vgId:%d, failed to create SyncIndexMgr", pSyncNode->vgId);
    goto _error;
  }

  // init TLA+ log vars
  pSyncNode->pLogStore = logStoreCreate(pSyncNode);
  if (pSyncNode->pLogStore == NULL) {
    sError("vgId:%d, failed to create SyncLogStore", pSyncNode->vgId);
    goto _error;
  }

  // seed the commit index from the FSM snapshot when one is available
  SyncIndex commitIndex = SYNC_INDEX_INVALID;
  if (pSyncNode->pFsm != NULL && pSyncNode->pFsm->FpGetSnapshotInfo != NULL) {
    SSnapshot snapshot = {0};
    int32_t   code = pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot);
    if (code != 0) {
      sError("vgId:%d, failed to get snapshot info, code:%d", pSyncNode->vgId, code);
      goto _error;
    }
    if (snapshot.lastApplyIndex > commitIndex) {
      commitIndex = snapshot.lastApplyIndex;
      sNTrace(pSyncNode, "reset commit index by snapshot");
    }
  }
  pSyncNode->commitIndex = commitIndex;

  if (syncNodeLogStoreRestoreOnNeed(pSyncNode) < 0) {
    goto _error;
  }

  // timer ms init
  pSyncNode->pingBaseLine = PING_TIMER_MS;
  pSyncNode->electBaseLine = tsElectInterval;
  pSyncNode->hbBaseLine = tsHeartbeatInterval;

  // init ping timer
  pSyncNode->pPingTimer = NULL;
  pSyncNode->pingTimerMS = pSyncNode->pingBaseLine;
  atomic_store_64(&pSyncNode->pingTimerLogicClock, 0);
  atomic_store_64(&pSyncNode->pingTimerLogicClockUser, 0);
  pSyncNode->FpPingTimerCB = syncNodeEqPingTimer;
  pSyncNode->pingTimerCounter = 0;

  // init elect timer
  pSyncNode->pElectTimer = NULL;
  pSyncNode->electTimerMS = syncUtilElectRandomMS(pSyncNode->electBaseLine, 2 * pSyncNode->electBaseLine);
  atomic_store_64(&pSyncNode->electTimerLogicClock, 0);
  pSyncNode->FpElectTimerCB = syncNodeEqElectTimer;
  pSyncNode->electTimerCounter = 0;

  // init heartbeat timer
  pSyncNode->pHeartbeatTimer = NULL;
  pSyncNode->heartbeatTimerMS = pSyncNode->hbBaseLine;
  atomic_store_64(&pSyncNode->heartbeatTimerLogicClock, 0);
  atomic_store_64(&pSyncNode->heartbeatTimerLogicClockUser, 0);
  pSyncNode->FpHeartbeatTimerCB = syncNodeEqHeartbeatTimer;
  pSyncNode->heartbeatTimerCounter = 0;

  // init peer heartbeat timer
  for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
    syncHbTimerInit(pSyncNode, &(pSyncNode->peerHeartbeatTimerArr[i]), (pSyncNode->replicasId)[i]);
  }

  // tools
  pSyncNode->pSyncRespMgr = syncRespMgrCreate(pSyncNode, SYNC_RESP_TTL_MS);
  if (pSyncNode->pSyncRespMgr == NULL) {
    sError("vgId:%d, failed to create SyncRespMgr", pSyncNode->vgId);
    goto _error;
  }

  // restore state
  pSyncNode->restoreFinish = false;

  // snapshot senders
  for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
    SSyncSnapshotSender* pSender = snapshotSenderCreate(pSyncNode, i);
    // ASSERT(pSender != NULL);
    (pSyncNode->senders)[i] = pSender;
    sSTrace(pSender, "snapshot sender create new while open, data:%p", pSender);
  }

  // snapshot receivers
  pSyncNode->pNewNodeReceiver = snapshotReceiverCreate(pSyncNode, EMPTY_RAFT_ID);

  // is config changing
  pSyncNode->changing = false;

  // replication mgr
  syncNodeLogReplMgrInit(pSyncNode);

  // peer state
  syncNodePeerStateInit(pSyncNode);

  // min match index
  pSyncNode->minMatchIndex = SYNC_INDEX_INVALID;

  // raft itself is not started here; see syncNodeStart

  int64_t timeNow = taosGetTimestampMs();
  pSyncNode->startTime = timeNow;
  pSyncNode->leaderTime = timeNow;
  pSyncNode->lastReplicateTime = timeNow;

  // snapshotting
  atomic_store_64(&pSyncNode->snapshottingIndex, SYNC_INDEX_INVALID);

  // init log buffer
  if (syncLogBufferInit(pSyncNode->pLogBuf, pSyncNode) < 0) {
    sError("vgId:%d, failed to init sync log buffer since %s", pSyncNode->vgId, terrstr());
    goto _error;
  }

  pSyncNode->isStart = true;
  pSyncNode->electNum = 0;
  pSyncNode->becomeLeaderNum = 0;
  pSyncNode->configChangeNum = 0;
  pSyncNode->hbSlowNum = 0;
  pSyncNode->hbrSlowNum = 0;
  pSyncNode->tmrRoutineNum = 0;

  sNInfo(pSyncNode, "sync open, node:%p", pSyncNode);
  sTrace("vgId:%d, tsElectInterval:%d, tsHeartbeatInterval:%d, tsHeartbeatTimeout:%d", pSyncNode->vgId, tsElectInterval,
         tsHeartbeatInterval, tsHeartbeatTimeout);

  return pSyncNode;

_error:
  // the FSM still belongs to pSyncInfo only if the failure happened before the hand-over
  if (pSyncInfo->pFsm) {
    taosMemoryFree(pSyncInfo->pFsm);
    pSyncInfo->pFsm = NULL;
  }
  // syncNodeClose accepts NULL and releases whatever was created so far
  syncNodeClose(pSyncNode);
  pSyncNode = NULL;
  return NULL;
}

M
Minghao Li 已提交
1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145
// Raise pSyncNode->commitIndex to the FSM snapshot's lastApplyIndex if that is larger.
// Does nothing when no FSM or no FpGetSnapshotInfo callback is registered.
void syncNodeMaybeUpdateCommitBySnapshot(SSyncNode* pSyncNode) {
  if (pSyncNode->pFsm != NULL && pSyncNode->pFsm->FpGetSnapshotInfo != NULL) {
    // zero-initialized, as in syncNodeOpen, so no indeterminate field is ever read
    SSnapshot snapshot = {0};
    int32_t   code = pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot);
    ASSERT(code == 0);
    // only trust the snapshot when the callback succeeded (matters if ASSERT is a no-op)
    if (code == 0 && snapshot.lastApplyIndex > pSyncNode->commitIndex) {
      pSyncNode->commitIndex = snapshot.lastApplyIndex;
    }
  }
}

B
Benguang Zhao 已提交
1146 1147 1148 1149 1150 1151 1152
// Check that the log buffer lines up with the log store and commit the buffer up to
// the larger of the node's commit index and the log store's commit index.
// Requires pLogStore and pLogBuf to be set (asserted).
// Returns 0 on success, -1 if the buffer does not end right after the last stored
// entry (terrno = TSDB_CODE_WAL_LOG_INCOMPLETE) or if the commit fails.
int32_t syncNodeRestore(SSyncNode* pSyncNode) {
  ASSERT(pSyncNode->pLogStore != NULL && "log store not created");
  ASSERT(pSyncNode->pLogBuf != NULL && "ring log buffer not created");

  SyncIndex lastVer = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);
  SyncIndex storeCommitIndex = pSyncNode->pLogStore->syncLogCommitIndex(pSyncNode->pLogStore);
  SyncIndex endIndex = pSyncNode->pLogBuf->endIndex;

  bool bufferAligned = (endIndex == lastVer + 1);
  if (lastVer != -1 && !bufferAligned) {
    terrno = TSDB_CODE_WAL_LOG_INCOMPLETE;
    sError("vgId:%d, failed to restore sync node since %s. expected lastLogIndex: %" PRId64 ", lastVer: %" PRId64 "",
           pSyncNode->vgId, terrstr(), endIndex - 1, lastVer);
    return -1;
  }

  ASSERT(endIndex == lastVer + 1);

  SyncIndex commitIndex = TMAX(pSyncNode->commitIndex, storeCommitIndex);
  return (syncLogBufferCommit(pSyncNode->pLogBuf, pSyncNode, commitIndex) < 0) ? -1 : 0;
}

// Start raft on an opened node and arm the ping timer.
// A single-replica node advances its term, becomes leader and appends a no-op entry;
// any other node starts as follower.
// Returns the result of syncNodeStartPingTimer (asserted to be 0).
int32_t syncNodeStart(SSyncNode* pSyncNode) {
  bool singleReplica = (pSyncNode->replicaNum == 1);

  if (!singleReplica) {
    syncNodeBecomeFollower(pSyncNode, "first start");
  } else {
    raftStoreNextTerm(pSyncNode->pRaftStore);
    syncNodeBecomeLeader(pSyncNode, "one replica start");

    // Raft 3.6.2 Committing entries from previous terms
    syncNodeAppendNoop(pSyncNode);
  }

  int32_t ret = syncNodeStartPingTimer(pSyncNode);
  ASSERT(ret == 0);
  return ret;
}

// Older start routine: same as syncNodeStart, but a single-replica leader also calls
// syncMaybeAdvanceCommitIndex after appending the no-op entry, and nothing is returned.
void syncNodeStartOld(SSyncNode* pSyncNode) {
  bool singleReplica = (pSyncNode->replicaNum == 1);

  if (!singleReplica) {
    syncNodeBecomeFollower(pSyncNode, "first start");
  } else {
    raftStoreNextTerm(pSyncNode->pRaftStore);
    syncNodeBecomeLeader(pSyncNode, "one replica start");

    // Raft 3.6.2 Committing entries from previous terms
    syncNodeAppendNoop(pSyncNode);
    syncMaybeAdvanceCommitIndex(pSyncNode);
  }

  int32_t ret = syncNodeStartPingTimer(pSyncNode);
  ASSERT(ret == 0);
}

B
Benguang Zhao 已提交
1207
// Start a node in stand-by mode: force follower state, stop the heartbeat timers,
// restart the elect timer with TIMER_MAX_MS and arm the ping timer.
// Returns the result of syncNodeStartPingTimer (asserted to be 0).
int32_t syncNodeStartStandBy(SSyncNode* pSyncNode) {
  // state change
  pSyncNode->state = TAOS_SYNC_STATE_FOLLOWER;
  syncNodeStopHeartbeatTimer(pSyncNode);

  // reset elect timer, long enough
  int32_t standByElectMS = TIMER_MAX_MS;
  int32_t ret = syncNodeRestartElectTimer(pSyncNode, standByElectMS);
  ASSERT(ret == 0);

  ret = syncNodeStartPingTimer(pSyncNode);
  ASSERT(ret == 0);
  return ret;
}

M
Minghao Li 已提交
1223 1224 1225 1226 1227 1228 1229 1230
// Quiesce a node ahead of syncNodeClose by stopping its elect and heartbeat timers.
// The ping timer is not touched here.
void syncNodePreClose(SSyncNode* pSyncNode) {
  // stop elect timer
  syncNodeStopElectTimer(pSyncNode);

  // stop heartbeat timer
  syncNodeStopHeartbeatTimer(pSyncNode);
}

1231
// Release a heartbeat timer data record.
void syncHbTimerDataFree(SSyncHbTimerData* pData) {
  taosMemoryFree(pData);
}
M
Minghao Li 已提交
1232

M
Minghao Li 已提交
1233
// Tear down a sync node and free it. Accepts NULL (no-op).
// Order: raft store, replication managers, response manager, vote/index managers,
// log store, log buffer, raft config, then the ping/elect/heartbeat timers, the FSM,
// the snapshot senders, the snapshot receiver, and finally the node itself.
void syncNodeClose(SSyncNode* pSyncNode) {
  if (pSyncNode == NULL) return;
  sNInfo(pSyncNode, "sync close, node:%p", pSyncNode);

  int32_t ret = raftStoreClose(pSyncNode->pRaftStore);
  ASSERT(ret == 0);
  pSyncNode->pRaftStore = NULL;

  syncNodeLogReplMgrDestroy(pSyncNode);

  syncRespMgrDestroy(pSyncNode->pSyncRespMgr);
  pSyncNode->pSyncRespMgr = NULL;

  voteGrantedDestroy(pSyncNode->pVotesGranted);
  pSyncNode->pVotesGranted = NULL;

  votesRespondDestory(pSyncNode->pVotesRespond);
  pSyncNode->pVotesRespond = NULL;

  syncIndexMgrDestroy(pSyncNode->pNextIndex);
  pSyncNode->pNextIndex = NULL;

  syncIndexMgrDestroy(pSyncNode->pMatchIndex);
  pSyncNode->pMatchIndex = NULL;

  logStoreDestory(pSyncNode->pLogStore);
  pSyncNode->pLogStore = NULL;

  syncLogBufferDestroy(pSyncNode->pLogBuf);
  pSyncNode->pLogBuf = NULL;

  raftCfgClose(pSyncNode->pRaftCfg);
  pSyncNode->pRaftCfg = NULL;

  syncNodeStopPingTimer(pSyncNode);
  syncNodeStopElectTimer(pSyncNode);
  syncNodeStopHeartbeatTimer(pSyncNode);

  if (pSyncNode->pFsm != NULL) {
    taosMemoryFree(pSyncNode->pFsm);
  }

  // snapshot senders: stop the ones still running, then destroy
  for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
    SSyncSnapshotSender* pSender = (pSyncNode->senders)[i];
    if (pSender == NULL) continue;

    sSTrace(pSender, "snapshot sender destroy while close, data:%p", pSender);

    if (snapshotSenderIsStart(pSender)) {
      snapshotSenderStop(pSender, false);
    }

    snapshotSenderDestroy(pSender);
    (pSyncNode->senders)[i] = NULL;
  }

  // snapshot receiver: force-stop if running, then destroy
  if (pSyncNode->pNewNodeReceiver != NULL) {
    if (snapshotReceiverIsStart(pSyncNode->pNewNodeReceiver)) {
      snapshotReceiverForceStop(pSyncNode->pNewNodeReceiver);
    }

    snapshotReceiverDestroy(pSyncNode->pNewNodeReceiver);
    pSyncNode->pNewNodeReceiver = NULL;
  }

  taosMemoryFree(pSyncNode);
}

M
Minghao Li 已提交
1292
// Return the snapshot strategy recorded in the node's raft config.
ESyncStrategy syncNodeStrategy(SSyncNode* pSyncNode) {
  return pSyncNode->pRaftCfg->snapshotStrategy;
}
M
Minghao Li 已提交
1293

M
Minghao Li 已提交
1294 1295 1296
// timer control --------------
// Arm the ping timer with pingTimerMS and copy the user logic clock into the timer
// logic clock. When the sync env is not initialized only an error is logged.
// Always returns 0.
int32_t syncNodeStartPingTimer(SSyncNode* pSyncNode) {
  if (!syncIsInit()) {
    sError("vgId:%d, start ping timer error, sync env is stop", pSyncNode->vgId);
    return 0;
  }

  taosTmrReset(pSyncNode->FpPingTimerCB, pSyncNode->pingTimerMS, pSyncNode, syncEnv()->pTimerManager,
               &pSyncNode->pPingTimer);
  atomic_store_64(&pSyncNode->pingTimerLogicClock, pSyncNode->pingTimerLogicClockUser);
  return 0;
}

// Stop the ping timer: advance the user logic clock, stop the timer and clear its handle.
// Always returns 0.
int32_t syncNodeStopPingTimer(SSyncNode* pSyncNode) {
  atomic_add_fetch_64(&pSyncNode->pingTimerLogicClockUser, 1);

  taosTmrStop(pSyncNode->pPingTimer);
  pSyncNode->pPingTimer = NULL;

  return 0;
}

// Arm the election timer to fire after `ms` milliseconds.
// Records the timeout, the expected execution time and the current elect logic clock
// in electTimerParam; the node's rid is passed to the timer callback.
// When the sync env is not initialized only an error is logged. Always returns 0.
int32_t syncNodeStartElectTimer(SSyncNode* pSyncNode, int32_t ms) {
  if (!syncIsInit()) {
    sError("vgId:%d, start elect timer error, sync env is stop", pSyncNode->vgId);
    return 0;
  }

  pSyncNode->electTimerMS = ms;

  int64_t execTime = taosGetTimestampMs() + ms;
  atomic_store_64(&(pSyncNode->electTimerParam.executeTime), execTime);
  atomic_store_64(&(pSyncNode->electTimerParam.logicClock), pSyncNode->electTimerLogicClock);
  pSyncNode->electTimerParam.pSyncNode = pSyncNode;
  pSyncNode->electTimerParam.pData = NULL;

  taosTmrReset(pSyncNode->FpElectTimerCB, pSyncNode->electTimerMS, (void*)(pSyncNode->rid), syncEnv()->pTimerManager,
               &pSyncNode->pElectTimer);

  return 0;
}

// Stop the election timer: advance the elect logic clock, stop the timer and clear
// its handle. Always returns 0.
int32_t syncNodeStopElectTimer(SSyncNode* pSyncNode) {
  atomic_add_fetch_64(&pSyncNode->electTimerLogicClock, 1);

  taosTmrStop(pSyncNode->pElectTimer);
  pSyncNode->pElectTimer = NULL;

  return 0;
}

// Stop the election timer and start it again with a timeout of `ms` milliseconds.
// The results of the stop/start calls are not propagated; always returns 0.
int32_t syncNodeRestartElectTimer(SSyncNode* pSyncNode, int32_t ms) {
  syncNodeStopElectTimer(pSyncNode);
  syncNodeStartElectTimer(pSyncNode, ms);
  return 0;
}

M
Minghao Li 已提交
1351 1352
// Restart the election timer with a fresh timeout: TIMER_MAX_MS for a stand-by node,
// otherwise a random value drawn via syncUtilElectRandomMS from
// [electBaseLine, 2 * electBaseLine].
// Returns the result of syncNodeRestartElectTimer.
int32_t syncNodeResetElectTimer(SSyncNode* pSyncNode) {
  int32_t electMS = 0;
  if (!pSyncNode->pRaftCfg->isStandBy) {
    electMS = syncUtilElectRandomMS(pSyncNode->electBaseLine, 2 * pSyncNode->electBaseLine);
  } else {
    electMS = TIMER_MAX_MS;
  }

  int32_t ret = syncNodeRestartElectTimer(pSyncNode, electMS);

  sNTrace(pSyncNode, "reset elect timer, min:%d, max:%d, ms:%d", pSyncNode->electBaseLine, 2 * pSyncNode->electBaseLine,
          electMS);
  return ret;
}

M
Minghao Li 已提交
1367
// Arm the node-level heartbeat timer with heartbeatTimerMS and copy the user logic
// clock into the timer logic clock. When the sync env is not initialized an error is
// logged instead. The trace line is emitted in both cases. Always returns 0.
static int32_t syncNodeDoStartHeartbeatTimer(SSyncNode* pSyncNode) {
  if (!syncIsInit()) {
    sError("vgId:%d, start heartbeat timer error, sync env is stop", pSyncNode->vgId);
  } else {
    taosTmrReset(pSyncNode->FpHeartbeatTimerCB, pSyncNode->heartbeatTimerMS, pSyncNode, syncEnv()->pTimerManager,
                 &pSyncNode->pHeartbeatTimer);
    atomic_store_64(&pSyncNode->heartbeatTimerLogicClock, pSyncNode->heartbeatTimerLogicClockUser);
  }

  sNTrace(pSyncNode, "start heartbeat timer, ms:%d", pSyncNode->heartbeatTimerMS);
  return 0;
}

M
Minghao Li 已提交
1381
// Start the per-peer heartbeat timer of every peer that has one.
// The node-level heartbeat timer path is compiled out (#if 0). Always returns 0.
int32_t syncNodeStartHeartbeatTimer(SSyncNode* pSyncNode) {
  int32_t ret = 0;

#if 0
  pSyncNode->heartbeatTimerMS = pSyncNode->hbBaseLine;
  ret = syncNodeDoStartHeartbeatTimer(pSyncNode);
#endif

  for (int32_t peer = 0; peer < pSyncNode->peersNum; ++peer) {
    SSyncTimer* pPeerTimer = syncNodeGetHbTimer(pSyncNode, &(pSyncNode->peersId[peer]));
    if (pPeerTimer == NULL) continue;

    syncHbTimerStart(pSyncNode, pPeerTimer);
  }

  return ret;
}

M
Minghao Li 已提交
1399 1400
// Stop the per-peer heartbeat timer of every peer that has one.
// The node-level heartbeat timer path is compiled out (#if 0). Always returns 0.
int32_t syncNodeStopHeartbeatTimer(SSyncNode* pSyncNode) {
  int32_t ret = 0;

#if 0
  atomic_add_fetch_64(&pSyncNode->heartbeatTimerLogicClockUser, 1);
  taosTmrStop(pSyncNode->pHeartbeatTimer);
  pSyncNode->pHeartbeatTimer = NULL;
#endif

  for (int32_t peer = 0; peer < pSyncNode->peersNum; ++peer) {
    SSyncTimer* pPeerTimer = syncNodeGetHbTimer(pSyncNode, &(pSyncNode->peersId[peer]));
    if (pPeerTimer == NULL) continue;

    syncHbTimerStop(pSyncNode, pPeerTimer);
  }

  return ret;
}

1418 1419 1420 1421 1422 1423
// Stop and then start the heartbeat timers of all peers.
// The results of the two calls are not propagated; always returns 0.
int32_t syncNodeRestartHeartbeatTimer(SSyncNode* pSyncNode) {
  (void)syncNodeStopHeartbeatTimer(pSyncNode);
  (void)syncNodeStartHeartbeatTimer(pSyncNode);

  return 0;
}

M
Minghao Li 已提交
1424 1425 1426
// utils --------------
// Send pMsg to the peer identified by destRaftId through the node's send callback.
// The message is marked as needing no response (info.noResp = 1).
// Returns 0 after handing the message to the callback. If no callback is registered,
// the message content is freed with rpcFreeCont and -1 is returned.
int32_t syncNodeSendMsgById(const SRaftId* destRaftId, SSyncNode* pSyncNode, SRpcMsg* pMsg) {
  SEpSet epSet;
  syncUtilRaftId2EpSet(destRaftId, &epSet);

  if (pSyncNode->syncSendMSg == NULL) {
    sError("vgId:%d, sync send msg by id error, fp-send-msg is null", pSyncNode->vgId);
    rpcFreeCont(pMsg->pCont);
    return -1;
  }

  // htonl
  syncUtilMsgHtoN(pMsg->pCont);

  pMsg->info.noResp = 1;
  pSyncNode->syncSendMSg(&epSet, pMsg);

  return 0;
}

// Send pMsg to the node described by nodeInfo through the node's send callback.
// The message is marked as needing no response (info.noResp = 1).
// Returns 0 after handing the message to the callback. If no callback is registered,
// the message content is freed with rpcFreeCont and -1 is returned, matching
// syncNodeSendMsgById, so the content is not leaked on that path.
int32_t syncNodeSendMsgByInfo(const SNodeInfo* nodeInfo, SSyncNode* pSyncNode, SRpcMsg* pMsg) {
  SEpSet epSet;
  syncUtilNodeInfo2EpSet(nodeInfo, &epSet);
  if (pSyncNode->syncSendMSg != NULL) {
    // htonl
    syncUtilMsgHtoN(pMsg->pCont);

    pMsg->info.noResp = 1;
    pSyncNode->syncSendMSg(&epSet, pMsg);
  } else {
    sError("vgId:%d, sync send msg by info error, fp-send-msg is null", pSyncNode->vgId);
    // nobody will consume the message, so release its content here
    rpcFreeCont(pMsg->pCont);
    return -1;
  }

  return 0;
}

1458
// Tell whether this node is a member of `config`.
// Membership is checked twice, once by fqdn/port and once by the raft id derived from
// fqdn/port and the node's vgId; the two results are asserted to agree.
inline bool syncNodeInConfig(SSyncNode* pSyncNode, const SSyncCfg* config) {
  // check 1: endpoint (fqdn + port) equality
  bool foundByEp = false;
  for (int32_t i = 0; i < config->replicaNum && !foundByEp; ++i) {
    const SNodeInfo* pInfo = &(config->nodeInfo)[i];
    foundByEp = (strcmp(pInfo->nodeFqdn, pSyncNode->myNodeInfo.nodeFqdn) == 0) &&
                (pInfo->nodePort == pSyncNode->myNodeInfo.nodePort);
  }

  // check 2: raft id equality
  bool foundById = false;
  for (int32_t i = 0; i < config->replicaNum && !foundById; ++i) {
    SRaftId candidate;
    candidate.addr = syncUtilAddr2U64((config->nodeInfo)[i].nodeFqdn, (config->nodeInfo)[i].nodePort);
    candidate.vgId = pSyncNode->vgId;

    if (syncUtilSameId(&candidate, &(pSyncNode->myRaftId))) {
      foundById = true;
    }
  }

  ASSERT(foundByEp == foundById);
  return foundByEp;
}

1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497
// Return true when the two configs differ in replica count, in my index, or in
// the fqdn/port of any replica slot; false when all of those are equal.
static bool syncIsConfigChanged(const SSyncCfg* pOldCfg, const SSyncCfg* pNewCfg) {
  if (pOldCfg->replicaNum != pNewCfg->replicaNum || pOldCfg->myIndex != pNewCfg->myIndex) {
    return true;
  }

  for (int32_t slot = 0; slot < pOldCfg->replicaNum; ++slot) {
    const SNodeInfo* pBefore = &pOldCfg->nodeInfo[slot];
    const SNodeInfo* pAfter = &pNewCfg->nodeInfo[slot];

    if (strcmp(pBefore->nodeFqdn, pAfter->nodeFqdn) != 0) {
      return true;
    }
    if (pBefore->nodePort != pAfter->nodePort) {
      return true;
    }
  }

  return false;
}

M
Minghao Li 已提交
1498
// Apply a new raft configuration to the sync node.
//
// pSyncNode             : node whose configuration is replaced
// pNewConfig            : configuration to install (copied into pRaftCfg->cfg)
// lastConfigChangeIndex : log index recorded as the last config-change index
//
// Nothing happens when the new config equals the current one. Otherwise the
// config is stored and persisted. If this node is part of the new config, its
// peers, replica ids, quorum, index/vote managers and snapshot senders are
// rebuilt (senders of replicas that survive the change are reused), and the
// node re-enters its current role (leader, or follower for any other state).
void syncNodeDoConfigChange(SSyncNode* pSyncNode, SSyncCfg* pNewConfig, SyncIndex lastConfigChangeIndex) {
  SSyncCfg oldConfig = pSyncNode->pRaftCfg->cfg;
  if (!syncIsConfigChanged(&oldConfig, pNewConfig)) {
    sInfo("vgId:%d, sync not reconfig since not changed", pSyncNode->vgId);
    return;
  }

  pSyncNode->pRaftCfg->cfg = *pNewConfig;
  pSyncNode->pRaftCfg->lastConfigIndex = lastConfigChangeIndex;

  pSyncNode->configChangeNum++;

  // membership is evaluated with the node info from before the change
  bool IamInOld = syncNodeInConfig(pSyncNode, &oldConfig);
  bool IamInNew = syncNodeInConfig(pSyncNode, pNewConfig);
  bool isDrop = IamInOld && !IamInNew;

  // log begin config change
  char oldCfgStr[1024] = {0};
  char newCfgStr[1024] = {0};
  syncCfg2SimpleStr(&oldConfig, oldCfgStr, sizeof(oldCfgStr));
  syncCfg2SimpleStr(pNewConfig, newCfgStr, sizeof(newCfgStr));
  sNInfo(pSyncNode, "begin do config change, from %s to %s", oldCfgStr, newCfgStr);

  if (IamInNew) {
    pSyncNode->pRaftCfg->isStandBy = 0;  // change isStandBy to normal
  }
  if (isDrop) {
    pSyncNode->pRaftCfg->isStandBy = 1;  // set standby
  }

  // add last config index
  raftCfgAddConfigIndex(pSyncNode->pRaftCfg, lastConfigChangeIndex);

  if (IamInNew) {
    // save snapshot senders together with the replica ids they belonged to
    SRaftId oldReplicasId[TSDB_MAX_REPLICA];
    memcpy(oldReplicasId, pSyncNode->replicasId, sizeof(oldReplicasId));
    SSyncSnapshotSender* oldSenders[TSDB_MAX_REPLICA];
    for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
      oldSenders[i] = (pSyncNode->senders)[i];
      sSTrace(oldSenders[i], "snapshot sender save old");
    }

    // init internal
    pSyncNode->myNodeInfo = pSyncNode->pRaftCfg->cfg.nodeInfo[pSyncNode->pRaftCfg->cfg.myIndex];
    syncUtilNodeInfo2RaftId(&pSyncNode->myNodeInfo, pSyncNode->vgId, &pSyncNode->myRaftId);

    // init peersNum, peers, peersId
    pSyncNode->peersNum = pSyncNode->pRaftCfg->cfg.replicaNum - 1;
    int32_t peerIdx = 0;
    for (int32_t i = 0; i < pSyncNode->pRaftCfg->cfg.replicaNum; ++i) {
      if (i != pSyncNode->pRaftCfg->cfg.myIndex) {
        pSyncNode->peersNodeInfo[peerIdx] = pSyncNode->pRaftCfg->cfg.nodeInfo[i];
        peerIdx++;
      }
    }
    for (int32_t i = 0; i < pSyncNode->peersNum; ++i) {
      syncUtilNodeInfo2RaftId(&pSyncNode->peersNodeInfo[i], pSyncNode->vgId, &pSyncNode->peersId[i]);
    }

    // init replicaNum, replicasId
    pSyncNode->replicaNum = pSyncNode->pRaftCfg->cfg.replicaNum;
    for (int32_t i = 0; i < pSyncNode->pRaftCfg->cfg.replicaNum; ++i) {
      syncUtilNodeInfo2RaftId(&pSyncNode->pRaftCfg->cfg.nodeInfo[i], pSyncNode->vgId, &pSyncNode->replicasId[i]);
    }

    // update quorum first
    pSyncNode->quorum = syncUtilQuorum(pSyncNode->pRaftCfg->cfg.replicaNum);

    syncIndexMgrUpdate(pSyncNode->pNextIndex, pSyncNode);
    syncIndexMgrUpdate(pSyncNode->pMatchIndex, pSyncNode);
    voteGrantedUpdate(pSyncNode->pVotesGranted, pSyncNode);
    votesRespondUpdate(pSyncNode->pVotesRespond, pSyncNode);

    // reset snapshot senders

    // clear new
    for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
      (pSyncNode->senders)[i] = NULL;
    }

    // reset new: move each surviving replica's old sender into its new slot
    for (int32_t i = 0; i < pSyncNode->replicaNum; ++i) {
      bool reset = false;
      for (int32_t k = 0; k < TSDB_MAX_REPLICA; ++k) {
        if (syncUtilSameId(&(pSyncNode->replicasId)[i], &oldReplicasId[k]) && oldSenders[k] != NULL) {
          char     host[128];
          uint16_t port;
          syncUtilU642Addr((pSyncNode->replicasId)[i].addr, host, sizeof(host), &port);
          sNTrace(pSyncNode, "snapshot sender reset for: %" PRId64 ", newIndex:%d, %s:%d, %p",
                  (pSyncNode->replicasId)[i].addr, i, host, port, oldSenders[k]);

          (pSyncNode->senders)[i] = oldSenders[k];
          oldSenders[k] = NULL;  // ownership moved; must not be destroyed below
          reset = true;

          // reset replicaIndex
          int32_t oldreplicaIndex = (pSyncNode->senders)[i]->replicaIndex;
          (pSyncNode->senders)[i]->replicaIndex = i;

          sNTrace(pSyncNode, "snapshot sender udpate replicaIndex from %d to %d, %s:%d, %p, reset:%d", oldreplicaIndex,
                  i, host, port, (pSyncNode->senders)[i], reset);

          break;
        }
      }
    }

    // create new
    for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
      if ((pSyncNode->senders)[i] == NULL) {
        (pSyncNode->senders)[i] = snapshotSenderCreate(pSyncNode, i);
        sSTrace((pSyncNode->senders)[i], "snapshot sender create new while reconfig, data:%p", (pSyncNode->senders)[i]);
      } else {
        sSTrace((pSyncNode->senders)[i], "snapshot sender already exist, data:%p", (pSyncNode->senders)[i]);
      }
    }

    // free old
    for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
      if (oldSenders[i] != NULL) {
        sNTrace(pSyncNode, "snapshot sender destroy old, data:%p replica-index:%d", oldSenders[i], i);
        snapshotSenderDestroy(oldSenders[i]);
        oldSenders[i] = NULL;
      }
    }

    // persist cfg
    raftCfgPersist(pSyncNode->pRaftCfg);

    char tmpbuf[1024] = {0};
    snprintf(tmpbuf, sizeof(tmpbuf), "config change from %d to %d, index:%" PRId64 ", %s  -->  %s",
             oldConfig.replicaNum, pNewConfig->replicaNum, lastConfigChangeIndex, oldCfgStr, newCfgStr);

    // change isStandBy to normal (election timeout)
    if (pSyncNode->state == TAOS_SYNC_STATE_LEADER) {
      syncNodeBecomeLeader(pSyncNode, tmpbuf);

      // Raft 3.6.2 Committing entries from previous terms
      if (syncNodeAppendNoop(pSyncNode) < 0) {
        sError("vgId:%d, failed to append noop entry since %s", pSyncNode->vgId, terrstr());
      }
    } else {
      syncNodeBecomeFollower(pSyncNode, tmpbuf);
    }
  } else {
    // persist cfg
    raftCfgPersist(pSyncNode->pRaftCfg);
    sNInfo(pSyncNode, "do not config change from %d to %d, index:%" PRId64 ", %s  -->  %s", oldConfig.replicaNum,
           pNewConfig->replicaNum, lastConfigChangeIndex, oldCfgStr, newCfgStr);
  }

  // log end config change
  sNInfo(pSyncNode, "end do config change, from %s to %s", oldCfgStr, newCfgStr);
}

M
Minghao Li 已提交
1674 1675 1676 1677
// raft state change --------------
// If `term` is newer than the stored current term: store it, become follower,
// then clear the recorded vote. Otherwise do nothing.
void syncNodeUpdateTerm(SSyncNode* pSyncNode, SyncTerm term) {
  if (term <= pSyncNode->pRaftStore->currentTerm) {
    return;
  }

  raftStoreSetTerm(pSyncNode->pRaftStore, term);

  char reason[64];
  snprintf(reason, sizeof(reason), "update term to %" PRId64, term);
  syncNodeBecomeFollower(pSyncNode, reason);

  raftStoreClearVote(pSyncNode->pRaftStore);
}

1685 1686 1687 1688 1689 1690
// Store `term` as the current term when it is newer; the node state and the
// recorded vote are left untouched.
void syncNodeUpdateTermWithoutStepDown(SSyncNode* pSyncNode, SyncTerm term) {
  if (term <= pSyncNode->pRaftStore->currentTerm) {
    return;
  }
  raftStoreSetTerm(pSyncNode->pRaftStore, term);
}

M
Minghao Li 已提交
1691
// Step down in response to `newTerm`.
//  - newTerm older than the current term: ignored (trace only).
//  - newTerm newer: store it, become follower, clear the recorded vote.
//  - newTerm equal: become follower only if not already a follower.
void syncNodeStepDown(SSyncNode* pSyncNode, SyncTerm newTerm) {
  if (pSyncNode->pRaftStore->currentTerm > newTerm) {
    sNTrace(pSyncNode, "step down, ignore, new-term:%" PRId64 ", current-term:%" PRId64, newTerm,
            pSyncNode->pRaftStore->currentTerm);
    return;
  }

  sNTrace(pSyncNode, "step down, new-term:%" PRId64 ", current-term:%" PRId64, newTerm,
          pSyncNode->pRaftStore->currentTerm);

  if (pSyncNode->pRaftStore->currentTerm < newTerm) {
    raftStoreSetTerm(pSyncNode->pRaftStore, newTerm);

    char reason[64];
    snprintf(reason, sizeof(reason), "step down, update term to %" PRId64, newTerm);
    syncNodeBecomeFollower(pSyncNode, reason);

    raftStoreClearVote(pSyncNode->pRaftStore);
    return;
  }

  // same term: only a state change may be needed
  if (pSyncNode->state != TAOS_SYNC_STATE_FOLLOWER) {
    syncNodeBecomeFollower(pSyncNode, "step down");
  }
}

1717 1718
// Hand the node's response manager to syncRespCleanRsp.
void syncNodeLeaderChangeRsp(SSyncNode* pSyncNode) {
  syncRespCleanRsp(pSyncNode->pSyncRespMgr);
}

1719
// Switch the node to the follower state. `debugStr` is appended to the trace
// line emitted at the end and is not otherwise interpreted.
void syncNodeBecomeFollower(SSyncNode* pSyncNode, const char* debugStr) {
  // a leader that steps down no longer knows who the leader is
  const bool wasLeader = (pSyncNode->state == TAOS_SYNC_STATE_LEADER);
  if (wasLeader) {
    pSyncNode->leaderCache = EMPTY_RAFT_ID;
  }

  pSyncNode->hbSlowNum = 0;

  // enter follower state; followers do not send heartbeats
  pSyncNode->state = TAOS_SYNC_STATE_FOLLOWER;
  syncNodeStopHeartbeatTimer(pSyncNode);

  // restart the election timer
  syncNodeResetElectTimer(pSyncNode);

  // run the leader-change response cleanup
  syncNodeLeaderChangeRsp(pSyncNode);

  // notify the state machine, if it registered a callback
  if (pSyncNode->pFsm != NULL && pSyncNode->pFsm->FpBecomeFollowerCb != NULL) {
    pSyncNode->pFsm->FpBecomeFollowerCb(pSyncNode->pFsm);
  }

  // forget the min match index
  pSyncNode->minMatchIndex = SYNC_INDEX_INVALID;

  // reset the log buffer
  syncLogBufferReset(pSyncNode->pLogBuf, pSyncNode);

  sNTrace(pSyncNode, "become follower %s", debugStr);
}

// TLA+ Spec
// \* Candidate i transitions to leader.
// BecomeLeader(i) ==
//     /\ state[i] = Candidate
//     /\ votesGranted[i] \in Quorum
//     /\ state'      = [state EXCEPT ![i] = Leader]
//     /\ nextIndex'  = [nextIndex EXCEPT ![i] =
//                          [j \in Server |-> Len(log[i]) + 1]]
//     /\ matchIndex' = [matchIndex EXCEPT ![i] =
//                          [j \in Server |-> 0]]
//     /\ elections'  = elections \cup
//                          {[eterm     |-> currentTerm[i],
//                            eleader   |-> i,
//                            elog      |-> log[i],
//                            evotes    |-> votesGranted[i],
//                            evoterLog |-> voterLog[i]]}
//     /\ UNCHANGED <<messages, currentTerm, votedFor, candidateVars, logVars>>
//
1770
// Switch the node to the leader state. `debugStr` is appended to the info line
// emitted at the end and is not otherwise interpreted.
//
// Every next-index slot is set to (last index + 1), where last index takes the
// snapshot into account (the wal may have been deleted); every match-index
// slot is invalidated. The slot belonging to this node is overwritten too,
// which is harmless.
void syncNodeBecomeLeader(SSyncNode* pSyncNode, const char* debugStr) {
  pSyncNode->leaderTime = taosGetTimestampMs();

  pSyncNode->becomeLeaderNum++;
  pSyncNode->hbrSlowNum = 0;

  // reset restoreFinish
  pSyncNode->restoreFinish = false;

  // state change
  pSyncNode->state = TAOS_SYNC_STATE_LEADER;

  // set leader cache
  pSyncNode->leaderCache = pSyncNode->myRaftId;

  // The last index is the same for every replica, so look it up once instead
  // of once per loop iteration.
  SyncIndex lastIndex;
  SyncTerm  lastTerm;
  int32_t   code = syncNodeGetLastIndexTerm(pSyncNode, &lastIndex, &lastTerm);
  ASSERT(code == 0);

  for (int32_t i = 0; i < pSyncNode->pNextIndex->replicaNum; ++i) {
    pSyncNode->pNextIndex->index[i] = lastIndex + 1;
  }

  for (int32_t i = 0; i < pSyncNode->pMatchIndex->replicaNum; ++i) {
    pSyncNode->pMatchIndex->index[i] = SYNC_INDEX_INVALID;
  }

  // init peer mgr
  syncNodePeerStateInit(pSyncNode);

  // close receiver
  if (snapshotReceiverIsStart(pSyncNode->pNewNodeReceiver)) {
    snapshotReceiverForceStop(pSyncNode->pNewNodeReceiver);
  }

  // stop elect timer
  syncNodeStopElectTimer(pSyncNode);

  // start heartbeat timer
  syncNodeStartHeartbeatTimer(pSyncNode);

  // send heartbeat right now
  syncNodeHeartbeatPeers(pSyncNode);

  // call back
  if (pSyncNode->pFsm != NULL && pSyncNode->pFsm->FpBecomeLeaderCb != NULL) {
    pSyncNode->pFsm->FpBecomeLeaderCb(pSyncNode->pFsm);
  }

  // min match index
  pSyncNode->minMatchIndex = SYNC_INDEX_INVALID;

  // reset log buffer
  syncLogBufferReset(pSyncNode->pLogBuf, pSyncNode);

  // trace log
  sNInfo(pSyncNode, "become leader %s", debugStr);
}

// Candidate -> leader transition. Requires the candidate state and a granted
// majority (both asserted). After becoming leader a noop entry is appended;
// a failure to append is logged and otherwise ignored.
void syncNodeCandidate2Leader(SSyncNode* pSyncNode) {
  ASSERT(pSyncNode->state == TAOS_SYNC_STATE_CANDIDATE);
  ASSERT(voteGrantedMajority(pSyncNode->pVotesGranted));

  syncNodeBecomeLeader(pSyncNode, "candidate to leader");
  sNTrace(pSyncNode, "state change syncNodeCandidate2Leader");

  int32_t appendCode = syncNodeAppendNoop(pSyncNode);
  if (appendCode < 0) {
    sError("vgId:%d, failed to append noop entry since %s", pSyncNode->vgId, terrstr());
  }

  SyncIndex logLastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);
  ASSERT(logLastIndex >= 0);

  sInfo("vgId:%d, become leader. term: %" PRId64 ", commit index: %" PRId64 ", last index: %" PRId64 "",
        pSyncNode->vgId, pSyncNode->pRaftStore->currentTerm, pSyncNode->commitIndex, logLastIndex);
}

// Older candidate -> leader transition: become leader, append a noop entry,
// try to advance the commit index, and replicate when there is more than one
// replica.
void syncNodeCandidate2LeaderOld(SSyncNode* pSyncNode) {
  ASSERT(pSyncNode->state == TAOS_SYNC_STATE_CANDIDATE);
  ASSERT(voteGrantedMajority(pSyncNode->pVotesGranted));

  syncNodeBecomeLeader(pSyncNode, "candidate to leader");

  // Raft 3.6.2 Committing entries from previous terms
  syncNodeAppendNoop(pSyncNode);
  syncMaybeAdvanceCommitIndex(pSyncNode);

  const bool hasPeers = (pSyncNode->replicaNum > 1);
  if (hasPeers) {
    syncNodeReplicate(pSyncNode);
  }
}

M
Minghao Li 已提交
1882 1883
// True when the node's vgId is 1.
bool syncNodeIsMnode(SSyncNode* pSyncNode) {
  const bool isMnode = (pSyncNode->vgId == 1);
  return isMnode;
}

M
Minghao Li 已提交
1884
// Reset every peer-state slot: invalid last-send index, zero last-send time.
// Always returns 0.
int32_t syncNodePeerStateInit(SSyncNode* pSyncNode) {
  int32_t slot = 0;
  while (slot < TSDB_MAX_REPLICA) {
    pSyncNode->peerStates[slot].lastSendIndex = SYNC_INDEX_INVALID;
    pSyncNode->peerStates[slot].lastSendTime = 0;
    ++slot;
  }

  return 0;
}

// Follower -> candidate transition. The follower state is asserted; only the
// state field changes, followed by an info line and a trace line.
void syncNodeFollower2Candidate(SSyncNode* pSyncNode) {
  ASSERT(pSyncNode->state == TAOS_SYNC_STATE_FOLLOWER);
  pSyncNode->state = TAOS_SYNC_STATE_CANDIDATE;

  SyncIndex logLastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);
  sInfo("vgId:%d, become candidate from follower. term: %" PRId64 ", commit index: %" PRId64 ", last index: %" PRId64,
        pSyncNode->vgId, pSyncNode->pRaftStore->currentTerm, pSyncNode->commitIndex, logLastIndex);

  sNTrace(pSyncNode, "follower to candidate");
}

// Leader -> follower transition. The leader state is asserted before the node
// becomes follower; an info line and a trace line follow.
void syncNodeLeader2Follower(SSyncNode* pSyncNode) {
  ASSERT(pSyncNode->state == TAOS_SYNC_STATE_LEADER);
  syncNodeBecomeFollower(pSyncNode, "leader to follower");

  SyncIndex logLastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);
  sInfo("vgId:%d, become follower from leader. term: %" PRId64 ", commit index: %" PRId64 ", last index: %" PRId64,
        pSyncNode->vgId, pSyncNode->pRaftStore->currentTerm, pSyncNode->commitIndex, logLastIndex);

  sNTrace(pSyncNode, "leader to follower");
}

// Candidate -> follower transition. The candidate state is asserted before the
// node becomes follower; an info line and a trace line follow.
void syncNodeCandidate2Follower(SSyncNode* pSyncNode) {
  ASSERT(pSyncNode->state == TAOS_SYNC_STATE_CANDIDATE);
  syncNodeBecomeFollower(pSyncNode, "candidate to follower");

  SyncIndex logLastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);
  sInfo("vgId:%d, become follower from candidate. term: %" PRId64 ", commit index: %" PRId64 ", last index: %" PRId64,
        pSyncNode->vgId, pSyncNode->pRaftStore->currentTerm, pSyncNode->commitIndex, logLastIndex);

  sNTrace(pSyncNode, "candidate to follower");
}

M
Minghao Li 已提交
1923 1924
// just called by syncNodeVoteForSelf
// need assert
M
Minghao Li 已提交
1925
// Record a vote for pRaftId in the raft store.
// Preconditions (asserted): `term` equals the current term and no vote has
// been recorded yet.
void syncNodeVoteForTerm(SSyncNode* pSyncNode, SyncTerm term, SRaftId* pRaftId) {
  ASSERT(term == pSyncNode->pRaftStore->currentTerm);
  ASSERT(!raftStoreHasVoted(pSyncNode->pRaftStore));

  raftStoreVote(pSyncNode->pRaftStore, pRaftId);
}

M
Minghao Li 已提交
1932
// simulate get vote from outside
M
Minghao Li 已提交
1933
// Vote for myself in the current term, then feed a locally built, granted
// request-vote reply (from me, to me) into the vote bookkeeping as if it had
// arrived from outside. The reply buffer is freed before returning.
void syncNodeVoteForSelf(SSyncNode* pSyncNode) {
  syncNodeVoteForTerm(pSyncNode, pSyncNode->pRaftStore->currentTerm, &pSyncNode->myRaftId);

  SRpcMsg replyMsg = {0};
  if (syncBuildRequestVoteReply(&replyMsg, pSyncNode->vgId) != 0) {
    return;
  }

  SyncRequestVoteReply* pReply = replyMsg.pCont;
  pReply->srcId = pSyncNode->myRaftId;
  pReply->destId = pSyncNode->myRaftId;
  pReply->term = pSyncNode->pRaftStore->currentTerm;
  pReply->voteGranted = true;

  voteGrantedVote(pSyncNode->pVotesGranted, pReply);
  votesRespondAdd(pSyncNode->pVotesRespond, pReply);

  rpcFreeCont(replyMsg.pCont);
}

M
Minghao Li 已提交
1951
// return if has a snapshot
M
Minghao Li 已提交
1952 1953
// True when the FSM provides a snapshot-info callback and the snapshot it
// reports has lastApplyIndex >= SYNC_INDEX_BEGIN.
bool syncNodeHasSnapshot(SSyncNode* pSyncNode) {
  if (pSyncNode->pFsm->FpGetSnapshotInfo == NULL) {
    return false;
  }

  SSnapshot snapInfo = {.data = NULL, .lastApplyIndex = -1, .lastApplyTerm = 0, .lastConfigIndex = -1};
  pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapInfo);

  if (snapInfo.lastApplyIndex >= SYNC_INDEX_BEGIN) {
    return true;
  }
  return false;
}

M
Minghao Li 已提交
1964 1965
// return max(logLastIndex, snapshotLastIndex)
// if no snapshot and log, return -1
1966
SyncIndex syncNodeGetLastIndex(const SSyncNode* pSyncNode) {
M
Minghao Li 已提交
1967
  SSnapshot snapshot = {.data = NULL, .lastApplyIndex = -1, .lastApplyTerm = 0, .lastConfigIndex = -1};
1968 1969
  if (pSyncNode->pFsm->FpGetSnapshotInfo != NULL) {
    pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot);
M
Minghao Li 已提交
1970 1971 1972 1973 1974 1975 1976
  }
  SyncIndex logLastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);

  SyncIndex lastIndex = logLastIndex > snapshot.lastApplyIndex ? logLastIndex : snapshot.lastApplyIndex;
  return lastIndex;
}

M
Minghao Li 已提交
1977 1978
// return the last term of snapshot and log
// if error, return SYNC_TERM_INVALID (by syncLogLastTerm)
M
Minghao Li 已提交
1979 1980
// Return the term that goes with the last index of (snapshot, log).
// Without a snapshot this is the log's last term. With one, the log's last
// term is used only when the log extends past the snapshot; otherwise the
// snapshot's last applied term is returned.
SyncTerm syncNodeGetLastTerm(SSyncNode* pSyncNode) {
  SyncTerm result = 0;

  if (!syncNodeHasSnapshot(pSyncNode)) {
    // no snapshot
    result = pSyncNode->pLogStore->syncLogLastTerm(pSyncNode->pLogStore);
    return result;
  }

  // has snapshot
  SSnapshot snapInfo = {.data = NULL, .lastApplyIndex = -1, .lastApplyTerm = 0, .lastConfigIndex = -1};
  if (pSyncNode->pFsm->FpGetSnapshotInfo != NULL) {
    pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapInfo);
  }

  SyncIndex logLastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);
  if (logLastIndex > snapInfo.lastApplyIndex) {
    result = pSyncNode->pLogStore->syncLogLastTerm(pSyncNode->pLogStore);
  } else {
    result = snapInfo.lastApplyTerm;
  }

  return result;
}

// get last index and term along with snapshot
// Fill *pLastIndex and *pLastTerm with the snapshot-aware last index and last
// term. Always returns 0.
int32_t syncNodeGetLastIndexTerm(SSyncNode* pSyncNode, SyncIndex* pLastIndex, SyncTerm* pLastTerm) {
  SyncIndex lastIdx = syncNodeGetLastIndex(pSyncNode);
  *pLastIndex = lastIdx;

  SyncTerm lastTrm = syncNodeGetLastTerm(pSyncNode);
  *pLastTerm = lastTrm;

  return 0;
}
M
Minghao Li 已提交
2009

M
Minghao Li 已提交
2010
// return append-entries first try index
M
Minghao Li 已提交
2011 2012 2013 2014 2015
// First index to try for append-entries: one past the snapshot-aware last index.
SyncIndex syncNodeSyncStartIndex(SSyncNode* pSyncNode) {
  return syncNodeGetLastIndex(pSyncNode) + 1;
}

M
Minghao Li 已提交
2016 2017
// if index > 0, return index - 1
// else, return -1
2018 2019 2020 2021 2022 2023 2024 2025 2026
// Return index - 1, clamped from below at SYNC_INDEX_INVALID.
// pSyncNode is not used.
SyncIndex syncNodeGetPreIndex(SSyncNode* pSyncNode, SyncIndex index) {
  SyncIndex prev = index - 1;
  return (prev < SYNC_INDEX_INVALID) ? SYNC_INDEX_INVALID : prev;
}

M
Minghao Li 已提交
2027 2028 2029 2030
// if index < 0, return SYNC_TERM_INVALID
// if index == 0, return 0
// if index > 0, return preTerm
// if error, return SYNC_TERM_INVALID
2031 2032 2033 2034 2035 2036 2037 2038 2039
// Return the term of the entry at (index - 1).
//  - index <  SYNC_INDEX_BEGIN : SYNC_TERM_INVALID
//  - index == SYNC_INDEX_BEGIN : 0
//  - otherwise the entry is looked up in the log store's LRU cache, then in
//    the log store itself; if that fails, the snapshot's last applied term is
//    used when the snapshot ends exactly at (index - 1).
// SYNC_TERM_INVALID is returned (after an error log) when nothing matched.
SyncTerm syncNodeGetPreTerm(SSyncNode* pSyncNode, SyncIndex index) {
  if (index < SYNC_INDEX_BEGIN) {
    return SYNC_TERM_INVALID;
  }
  if (index == SYNC_INDEX_BEGIN) {
    return 0;
  }

  SyncIndex       prevIdx = index - 1;
  SSyncRaftEntry* pEntry = NULL;
  int32_t         rc = 0;

  SLRUCache* pCache = pSyncNode->pLogStore->pCache;
  LRUHandle* hCache = taosLRUCacheLookup(pCache, &prevIdx, sizeof(prevIdx));
  if (hCache != NULL) {
    pEntry = (SSyncRaftEntry*)taosLRUCacheValue(pCache, hCache);

    pSyncNode->pLogStore->cacheHit++;
    sNTrace(pSyncNode, "hit cache index:%" PRId64 ", bytes:%u, %p", prevIdx, pEntry->bytes, pEntry);
  } else {
    pSyncNode->pLogStore->cacheMiss++;
    sNTrace(pSyncNode, "miss cache index:%" PRId64, prevIdx);

    rc = pSyncNode->pLogStore->syncLogGetEntry(pSyncNode->pLogStore, prevIdx, &pEntry);
  }

  if (rc == 0) {
    ASSERT(pEntry != NULL);
    SyncTerm entryTerm = pEntry->term;

    // a cached entry is only released; a freshly loaded one is destroyed
    if (hCache != NULL) {
      taosLRUCacheRelease(pCache, hCache, false);
    } else {
      syncEntryDestroy(pEntry);
    }

    return entryTerm;
  }

  // entry not available: fall back to the snapshot boundary
  SSnapshot snapInfo = {.data = NULL,
                        .lastApplyIndex = SYNC_INDEX_INVALID,
                        .lastApplyTerm = SYNC_TERM_INVALID,
                        .lastConfigIndex = SYNC_INDEX_INVALID};

  if (pSyncNode->pFsm->FpGetSnapshotInfo != NULL) {
    pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapInfo);
    if (snapInfo.lastApplyIndex == prevIdx) {
      return snapInfo.lastApplyTerm;
    }
  }

  sNError(pSyncNode, "sync node get pre term error, index:%" PRId64 ", snap-index:%" PRId64 ", snap-term:%" PRId64,
          index, snapInfo.lastApplyIndex, snapInfo.lastApplyTerm);
  return SYNC_TERM_INVALID;
}
M
Minghao Li 已提交
2090 2091 2092 2093

// get pre index and term of "index"
// Fill *pPreIndex and *pPreTerm with the index and term that precede `index`.
// Always returns 0; a failed term lookup shows up as the value written to
// *pPreTerm.
int32_t syncNodeGetPreIndexTerm(SSyncNode* pSyncNode, SyncIndex index, SyncIndex* pPreIndex, SyncTerm* pPreTerm) {
  SyncIndex prevIdx = syncNodeGetPreIndex(pSyncNode, index);
  *pPreIndex = prevIdx;

  SyncTerm prevTerm = syncNodeGetPreTerm(pSyncNode, index);
  *pPreTerm = prevTerm;

  return 0;
}

M
Minghao Li 已提交
2098
static void syncNodeEqPingTimer(void* param, void* tmrId) {
S
Shengliang Guan 已提交
2099
  if (!syncIsInit()) return;
M
Minghao Li 已提交
2100

S
Shengliang Guan 已提交
2101 2102 2103
  SSyncNode* pNode = param;
  if (atomic_load_64(&pNode->pingTimerLogicClockUser) <= atomic_load_64(&pNode->pingTimerLogicClock)) {
    SRpcMsg rpcMsg = {0};
S
Shengliang Guan 已提交
2104
    int32_t code = syncBuildTimeout(&rpcMsg, SYNC_TIMEOUT_PING, atomic_load_64(&pNode->pingTimerLogicClock),
S
Shengliang Guan 已提交
2105 2106
                                    pNode->pingTimerMS, pNode);
    if (code != 0) {
M
Minghao Li 已提交
2107
      sError("failed to build ping msg");
S
Shengliang Guan 已提交
2108 2109
      rpcFreeCont(rpcMsg.pCont);
      return;
M
Minghao Li 已提交
2110
    }
M
Minghao Li 已提交
2111

M
Minghao Li 已提交
2112
    // sTrace("enqueue ping msg");
S
Shengliang Guan 已提交
2113 2114
    code = pNode->syncEqMsg(pNode->msgcb, &rpcMsg);
    if (code != 0) {
M
Minghao Li 已提交
2115
      sError("failed to sync enqueue ping msg since %s", terrstr());
S
Shengliang Guan 已提交
2116 2117
      rpcFreeCont(rpcMsg.pCont);
      return;
2118
    }
M
Minghao Li 已提交
2119

S
Shengliang Guan 已提交
2120
    taosTmrReset(syncNodeEqPingTimer, pNode->pingTimerMS, pNode, syncEnv()->pTimerManager, &pNode->pPingTimer);
2121
  }
M
Minghao Li 已提交
2122 2123
}

M
Minghao Li 已提交
2124
static void syncNodeEqElectTimer(void* param, void* tmrId) {
S
Shengliang Guan 已提交
2125
  if (!syncIsInit()) return;
M
Minghao Li 已提交
2126

M
Minghao Li 已提交
2127 2128
  int64_t    rid = (int64_t)param;
  SSyncNode* pNode = syncNodeAcquire(rid);
M
Minghao Li 已提交
2129

2130
  if (pNode == NULL) return;
M
Minghao Li 已提交
2131 2132 2133 2134 2135

  if (pNode->syncEqMsg == NULL) {
    syncNodeRelease(pNode);
    return;
  }
2136

2137
  int64_t tsNow = taosGetTimestampMs();
M
Minghao Li 已提交
2138 2139 2140 2141
  if (tsNow < pNode->electTimerParam.executeTime) {
    syncNodeRelease(pNode);
    return;
  }
M
Minghao Li 已提交
2142

S
Shengliang Guan 已提交
2143
  SRpcMsg rpcMsg = {0};
2144 2145
  int32_t code =
      syncBuildTimeout(&rpcMsg, SYNC_TIMEOUT_ELECTION, pNode->electTimerParam.logicClock, pNode->electTimerMS, pNode);
S
Shengliang Guan 已提交
2146

S
Shengliang Guan 已提交
2147
  if (code != 0) {
M
Minghao Li 已提交
2148
    sError("failed to build elect msg");
M
Minghao Li 已提交
2149
    syncNodeRelease(pNode);
S
Shengliang Guan 已提交
2150
    return;
M
Minghao Li 已提交
2151 2152
  }

S
Shengliang Guan 已提交
2153
  SyncTimeout* pTimeout = rpcMsg.pCont;
S
Shengliang Guan 已提交
2154
  sNTrace(pNode, "enqueue elect msg lc:%" PRId64, pTimeout->logicClock);
S
Shengliang Guan 已提交
2155 2156 2157

  code = pNode->syncEqMsg(pNode->msgcb, &rpcMsg);
  if (code != 0) {
M
Minghao Li 已提交
2158
    sError("failed to sync enqueue elect msg since %s", terrstr());
S
Shengliang Guan 已提交
2159
    rpcFreeCont(rpcMsg.pCont);
M
Minghao Li 已提交
2160
    syncNodeRelease(pNode);
2161
    return;
M
Minghao Li 已提交
2162
  }
M
Minghao Li 已提交
2163 2164

  syncNodeRelease(pNode);
M
Minghao Li 已提交
2165 2166
}

M
Minghao Li 已提交
2167
static void syncNodeEqHeartbeatTimer(void* param, void* tmrId) {
S
Shengliang Guan 已提交
2168
  if (!syncIsInit()) return;
2169

S
Shengliang Guan 已提交
2170 2171 2172 2173
  SSyncNode* pNode = param;
  if (pNode->replicaNum > 1) {
    if (atomic_load_64(&pNode->heartbeatTimerLogicClockUser) <= atomic_load_64(&pNode->heartbeatTimerLogicClock)) {
      SRpcMsg rpcMsg = {0};
S
Shengliang Guan 已提交
2174
      int32_t code = syncBuildTimeout(&rpcMsg, SYNC_TIMEOUT_HEARTBEAT, atomic_load_64(&pNode->heartbeatTimerLogicClock),
S
Shengliang Guan 已提交
2175 2176 2177
                                      pNode->heartbeatTimerMS, pNode);

      if (code != 0) {
M
Minghao Li 已提交
2178
        sError("failed to build heartbeat msg");
S
Shengliang Guan 已提交
2179
        return;
2180
      }
M
Minghao Li 已提交
2181

2182
      sTrace("vgId:%d, enqueue heartbeat timer", pNode->vgId);
S
Shengliang Guan 已提交
2183 2184
      code = pNode->syncEqMsg(pNode->msgcb, &rpcMsg);
      if (code != 0) {
M
Minghao Li 已提交
2185
        sError("failed to enqueue heartbeat msg since %s", terrstr());
S
Shengliang Guan 已提交
2186 2187
        rpcFreeCont(rpcMsg.pCont);
        return;
2188
      }
S
Shengliang Guan 已提交
2189 2190 2191 2192

      taosTmrReset(syncNodeEqHeartbeatTimer, pNode->heartbeatTimerMS, pNode, syncEnv()->pTimerManager,
                   &pNode->pHeartbeatTimer);

2193
    } else {
S
Shengliang Guan 已提交
2194 2195
      sTrace("==syncNodeEqHeartbeatTimer== heartbeatTimerLogicClock:%" PRId64 ", heartbeatTimerLogicClockUser:%" PRId64,
             pNode->heartbeatTimerLogicClock, pNode->heartbeatTimerLogicClockUser);
2196
    }
M
Minghao Li 已提交
2197 2198 2199
  }
}

2200
static void syncNodeEqPeerHeartbeatTimer(void* param, void* tmrId) {
2201
  int64_t hbDataRid = (int64_t)param;
2202
  int64_t tsNow = taosGetTimestampMs();
2203

2204 2205
  SSyncHbTimerData* pData = syncHbTimerDataAcquire(hbDataRid);
  if (pData == NULL) {
M
Minghao Li 已提交
2206
    sError("hb timer get pData NULL, %" PRId64, hbDataRid);
2207 2208
    return;
  }
2209

2210
  SSyncNode* pSyncNode = syncNodeAcquire(pData->syncNodeRid);
M
Minghao Li 已提交
2211
  if (pSyncNode == NULL) {
2212
    syncHbTimerDataRelease(pData);
M
Minghao Li 已提交
2213
    sError("hb timer get pSyncNode NULL");
2214 2215 2216 2217 2218 2219 2220 2221
    return;
  }

  SSyncTimer* pSyncTimer = pData->pTimer;

  if (!pSyncNode->isStart) {
    syncNodeRelease(pSyncNode);
    syncHbTimerDataRelease(pData);
M
Minghao Li 已提交
2222
    sError("vgId:%d, hb timer sync node already stop", pSyncNode->vgId);
M
Minghao Li 已提交
2223 2224 2225
    return;
  }

M
Minghao Li 已提交
2226
  if (pSyncNode->state != TAOS_SYNC_STATE_LEADER) {
2227 2228
    syncNodeRelease(pSyncNode);
    syncHbTimerDataRelease(pData);
M
Minghao Li 已提交
2229
    sError("vgId:%d, hb timer sync node not leader", pSyncNode->vgId);
M
Minghao Li 已提交
2230 2231 2232
    return;
  }

M
Minghao Li 已提交
2233
  if (pSyncNode->pRaftStore == NULL) {
2234 2235
    syncNodeRelease(pSyncNode);
    syncHbTimerDataRelease(pData);
M
Minghao Li 已提交
2236
    sError("vgId:%d, hb timer raft store already stop", pSyncNode->vgId);
M
Minghao Li 已提交
2237 2238 2239
    return;
  }

M
Minghao Li 已提交
2240
  // sTrace("vgId:%d, eq peer hb timer", pSyncNode->vgId);
2241 2242

  if (pSyncNode->replicaNum > 1) {
M
Minghao Li 已提交
2243 2244 2245
    int64_t timerLogicClock = atomic_load_64(&pSyncTimer->logicClock);
    int64_t msgLogicClock = atomic_load_64(&pData->logicClock);

2246
    if (timerLogicClock == msgLogicClock) {
2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266
      if (tsNow > pData->execTime) {
#if 0        
        sTrace(
            "vgId:%d, hbDataRid:%ld,  EXECUTE this step-------- heartbeat tsNow:%ld, exec:%ld, tsNow-exec:%ld, "
            "---------",
            pSyncNode->vgId, hbDataRid, tsNow, pData->execTime, tsNow - pData->execTime);
#endif

        pData->execTime += pSyncTimer->timerMS;

        SRpcMsg rpcMsg = {0};
        (void)syncBuildHeartbeat(&rpcMsg, pSyncNode->vgId);

        SyncHeartbeat* pSyncMsg = rpcMsg.pCont;
        pSyncMsg->srcId = pSyncNode->myRaftId;
        pSyncMsg->destId = pData->destId;
        pSyncMsg->term = pSyncNode->pRaftStore->currentTerm;
        pSyncMsg->commitIndex = pSyncNode->commitIndex;
        pSyncMsg->minMatchIndex = syncMinMatchIndex(pSyncNode);
        pSyncMsg->privateTerm = 0;
2267
        pSyncMsg->timeStamp = tsNow;
2268 2269 2270 2271 2272 2273

        // update reset time
        int64_t timerElapsed = tsNow - pSyncTimer->timeStamp;
        pSyncTimer->timeStamp = tsNow;

        // send msg
2274 2275
        syncLogSendHeartbeat(pSyncNode, pSyncMsg, false, timerElapsed, pData->execTime);
        syncNodeSendHeartbeat(pSyncNode, &pSyncMsg->destId, &rpcMsg);
2276 2277 2278 2279 2280 2281 2282 2283
      } else {
#if 0        
        sTrace(
            "vgId:%d, hbDataRid:%ld,  pass this step-------- heartbeat tsNow:%ld, exec:%ld, tsNow-exec:%ld, ---------",
            pSyncNode->vgId, hbDataRid, tsNow, pData->execTime, tsNow - pData->execTime);
#endif
      }

M
Minghao Li 已提交
2284 2285
      if (syncIsInit()) {
        // sTrace("vgId:%d, reset peer hb timer", pSyncNode->vgId);
2286 2287
        taosTmrReset(syncNodeEqPeerHeartbeatTimer, pSyncTimer->timerMS / HEARTBEAT_TICK_NUM, (void*)hbDataRid,
                     syncEnv()->pTimerManager, &pSyncTimer->pTimer);
M
Minghao Li 已提交
2288 2289 2290 2291
      } else {
        sError("sync env is stop, reset peer hb timer error");
      }

2292
    } else {
M
Minghao Li 已提交
2293 2294
      sTrace("vgId:%d, do not send hb, timerLogicClock:%" PRId64 ", msgLogicClock:%" PRId64 "", pSyncNode->vgId,
             timerLogicClock, msgLogicClock);
2295 2296
    }
  }
2297 2298 2299

  syncHbTimerDataRelease(pData);
  syncNodeRelease(pSyncNode);
2300 2301
}

2302 2303 2304 2305 2306
// Build a noop raft entry at the log store's current write index and enqueue it
// as a client request through the node's syncEqMsg callback.
// Returns 0 on success. Returns -1 when the node is not leader (terrno is set
// to TSDB_CODE_SYN_NOT_LEADER), when the entry or the request message cannot be
// built, and otherwise returns the non-zero code of the enqueue callback.
static int32_t syncNodeEqNoop(SSyncNode* pNode) {
  // only a leader may propose; the original test was inverted and rejected leaders
  if (pNode->state != TAOS_SYNC_STATE_LEADER) {
    terrno = TSDB_CODE_SYN_NOT_LEADER;
    return -1;
  }

  SyncIndex       index = pNode->pLogStore->syncLogWriteIndex(pNode->pLogStore);
  SyncTerm        term = pNode->pRaftStore->currentTerm;
  SSyncRaftEntry* pEntry = syncEntryBuildNoop(term, index, pNode->vgId);
  if (pEntry == NULL) return -1;

  SRpcMsg rpcMsg = {0};
  int32_t code = syncBuildClientRequestFromNoopEntry(&rpcMsg, pEntry, pNode->vgId);
  // the entry is no longer needed once the request message has been built (or failed to build)
  syncEntryDestroy(pEntry);
  if (code != 0) {
    sError("failed to build noop msg since %s", terrstr());
    return -1;
  }

  sNTrace(pNode, "propose msg, type:noop");
  code = (*pNode->syncEqMsg)(pNode->msgcb, &rpcMsg);
  if (code != 0) {
    sError("failed to propose noop msg while enqueue since %s", terrstr());
    // not taken over by the queue: free it, as the timer enqueue paths do
    rpcFreeCont(rpcMsg.pCont);
  }

  return code;
}

2326 2327
// Deleter passed to taosLRUCacheInsert by syncCacheEntry: frees the cached value.
// key and keyLen are not used.
static void deleteCacheEntry(const void* key, size_t keyLen, void* value) {
  (void)key;
  (void)keyLen;
  taosMemoryFree(value);
}

2328 2329 2330 2331
// Insert a raft entry into the log store's LRU cache, keyed by the entry index.
// The charge is sizeof(*pEntry) + pEntry->dataLen, the priority is low, and
// deleteCacheEntry is registered as the deleter. The cache handle is written to *h.
// Returns 0 when the insert status is TAOS_LRU_STATUS_OK, -1 otherwise.
int32_t syncCacheEntry(SSyncLogStore* pLogStore, SSyncRaftEntry* pEntry, LRUHandle** h) {
  SSyncLogStoreData* pStoreData = pLogStore->data;
  sNTrace(pStoreData->pSyncNode, "in cache index:%" PRId64 ", bytes:%u, %p", pEntry->index, pEntry->bytes, pEntry);

  int32_t   chargeBytes = sizeof(*pEntry) + pEntry->dataLen;
  LRUStatus insertStatus = taosLRUCacheInsert(pLogStore->pCache, &pEntry->index, sizeof(pEntry->index), pEntry,
                                              chargeBytes, deleteCacheEntry, h, TAOS_LRU_PRIORITY_LOW);

  return (insertStatus == TAOS_LRU_STATUS_OK) ? 0 : -1;
}

B
Benguang Zhao 已提交
2343 2344 2345
// Append an entry to the node's log buffer and advance the match index.
// With more than one replica nothing else is done here. With a single replica
// the commit index is updated to the new match index and the log buffer is
// committed up to ths->commitIndex.
// Returns 0 on success, -1 when the buffer append or the commit fails.
int32_t syncNodeAppend(SSyncNode* ths, SSyncRaftEntry* pEntry) {
  // step 1: put the entry into the log buffer
  int32_t appendRc = syncLogBufferAppend(ths->pLogBuf, ths, pEntry);
  if (appendRc < 0) {
    sError("vgId:%d, failed to enqueue sync log buffer. index:%" PRId64 "", ths->vgId, pEntry->index);
    return -1;
  }

  // step 2: move the match index forward
  SyncIndex newMatchIndex = syncLogBufferProceed(ths->pLogBuf, ths, NULL);

  sTrace("vgId:%d, append raft entry. index: %" PRId64 ", term: %" PRId64 " pBuf: [%" PRId64 " %" PRId64 " %" PRId64
         ", %" PRId64 ")",
         ths->vgId, pEntry->index, pEntry->term, ths->pLogBuf->startIndex, ths->pLogBuf->commitIndex,
         ths->pLogBuf->matchIndex, ths->pLogBuf->endIndex);

  int32_t result = 0;
  if (!(ths->replicaNum > 1)) {
    // step 3 (single replica only): commit right away
    (void)syncNodeUpdateCommitIndex(ths, newMatchIndex);

    if (syncLogBufferCommit(ths->pLogBuf, ths, ths->commitIndex) < 0) {
      sError("vgId:%d, failed to commit until commitIndex:%" PRId64 "", ths->vgId, ths->commitIndex);
      result = -1;
    }
  }

  return result;
}

2374
// Report whether at least `quorum` peers have a recorded heartbeat-reply time
// older than tsHeartbeatTimeout. Peers whose recorded time is 0 or -1 are
// not counted. Always false for a single-replica node.
bool syncNodeHeartbeatReplyTimeout(SSyncNode* pSyncNode) {
  if (pSyncNode->replicaNum == 1) return false;

  int32_t       expiredPeers = 0;
  const int64_t nowMs = taosGetTimestampMs();

  for (int32_t peer = 0; peer < pSyncNode->peersNum; ++peer) {
    int64_t lastRecvMs = syncIndexMgrGetRecvTime(pSyncNode->pMatchIndex, &(pSyncNode->peersId[peer]));
    bool    hasRecvTime = !(lastRecvMs == 0 || lastRecvMs == -1);

    if (hasRecvTime && nowMs - lastRecvMs > tsHeartbeatTimeout) {
      expiredPeers++;
    }
  }

  return expiredPeers >= pSyncNode->quorum;
}

2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415
// True when any of the node's snapshot senders exists and has its start flag set.
// A NULL node yields false.
bool syncNodeSnapshotSending(SSyncNode* pSyncNode) {
  if (pSyncNode == NULL) {
    return false;
  }

  for (int32_t idx = 0; idx < pSyncNode->replicaNum; ++idx) {
    SSyncSnapshotSender* pSender = pSyncNode->senders[idx];
    if (pSender != NULL && pSender->start) {
      return true;
    }
  }

  return false;
}

// True when the node has a new-node snapshot receiver whose start flag is set.
// A NULL node or a missing receiver yields false.
bool syncNodeSnapshotRecving(SSyncNode* pSyncNode) {
  if (pSyncNode == NULL || pSyncNode->pNewNodeReceiver == NULL) {
    return false;
  }

  return pSyncNode->pNewNodeReceiver->start ? true : false;
}

M
Minghao Li 已提交
2416
// Build a noop entry at the log buffer's end index with the current term and
// append it through syncNodeAppend.
// Returns -1 with terrno = TSDB_CODE_OUT_OF_MEMORY when the entry cannot be
// built; otherwise returns the result of syncNodeAppend, so an append failure
// is no longer reported as success.
static int32_t syncNodeAppendNoop(SSyncNode* ths) {
  SyncIndex index = syncLogBufferGetEndIndex(ths->pLogBuf);
  SyncTerm  term = ths->pRaftStore->currentTerm;

  SSyncRaftEntry* pEntry = syncEntryBuildNoop(term, index, ths->vgId);
  if (pEntry == NULL) {
    terrno = TSDB_CODE_OUT_OF_MEMORY;
    return -1;
  }

  return syncNodeAppend(ths, pEntry);
}

// Legacy noop append: build a noop entry at the log store's write index and,
// when the node is leader, append it to the log store and put it into the
// entry cache.
// Returns 0 on success (also when the node is not leader, in which case the
// entry is simply discarded) and -1 when the log store append fails.
static int32_t syncNodeAppendNoopOld(SSyncNode* ths) {
  SyncIndex       index = ths->pLogStore->syncLogWriteIndex(ths->pLogStore);
  SyncTerm        term = ths->pRaftStore->currentTerm;
  SSyncRaftEntry* pEntry = syncEntryBuildNoop(term, index, ths->vgId);
  ASSERT(pEntry != NULL);

  LRUHandle* h = NULL;

  if (ths->state == TAOS_SYNC_STATE_LEADER) {
    int32_t code = ths->pLogStore->syncLogAppendEntry(ths->pLogStore, pEntry);
    if (code != 0) {
      sError("append noop error");
      // the entry was never cached, so it is still owned here and must be freed
      syncEntryDestroy(pEntry);
      return -1;
    }

    syncCacheEntry(ths->pLogStore, pEntry, &h);
  }

  // with a cache handle the entry is released through the cache; otherwise destroy it directly
  if (h) {
    taosLRUCacheRelease(ths->pLogStore->pCache, h, false);
  } else {
    syncEntryDestroy(pEntry);
  }

  return 0;
}

S
Shengliang Guan 已提交
2459 2460
// Handle an incoming heartbeat.
//  - When the heartbeat's term equals the current term and this node is not
//    leader: record the receive time, reset the election timer, adopt the
//    sender's minMatchIndex and, if this node is a follower, enqueue a
//    SYNC_LOCAL_CMD_FOLLOWER_CMT local command carrying the sender's commit index.
//  - When the heartbeat's term is >= the current term and this node is not a
//    follower: enqueue a SYNC_LOCAL_CMD_STEP_DOWN local command.
//  - Finally send a heartbeat reply back to the sender.
// A local command that cannot be handed to the queue is freed here.
// Returns 0 normally, -1 when the reply message could not be built (the state
// handling above is still performed in that case).
int32_t syncNodeOnHeartbeat(SSyncNode* ths, const SRpcMsg* pRpcMsg) {
  SyncHeartbeat* pMsg = pRpcMsg->pCont;

  const STraceId* trace = &pRpcMsg->info.traceId;
  char            tbuf[40] = {0};
  TRACE_TO_STR(trace, tbuf);

  int64_t tsMs = taosGetTimestampMs();
  int64_t timeDiff = tsMs - pMsg->timeStamp;
  syncLogRecvHeartbeat(ths, pMsg, timeDiff, tbuf);

  SRpcMsg             rpcMsg = {0};
  SyncHeartbeatReply* pMsgReply = NULL;
  bool                replyBuilt = (syncBuildHeartbeatReply(&rpcMsg, ths->vgId) == 0 && rpcMsg.pCont != NULL);
  if (replyBuilt) {
    pMsgReply = rpcMsg.pCont;
    pMsgReply->destId = pMsg->srcId;
    pMsgReply->srcId = ths->myRaftId;
    pMsgReply->term = ths->pRaftStore->currentTerm;
    pMsgReply->privateTerm = 8864;  // magic number
    pMsgReply->startTime = ths->startTime;
    pMsgReply->timeStamp = tsMs;
  } else {
    sError("vgId:%d, failed to build heartbeat reply since %s", ths->vgId, terrstr());
  }

  if (pMsg->term == ths->pRaftStore->currentTerm && ths->state != TAOS_SYNC_STATE_LEADER) {
    syncIndexMgrSetRecvTime(ths->pNextIndex, &(pMsg->srcId), tsMs);

    syncNodeResetElectTimer(ths);
    ths->minMatchIndex = pMsg->minMatchIndex;

    if (ths->state == TAOS_SYNC_STATE_FOLLOWER) {
      SRpcMsg rpcMsgLocalCmd = {0};
      if (syncBuildLocalCmd(&rpcMsgLocalCmd, ths->vgId) != 0 || rpcMsgLocalCmd.pCont == NULL) {
        sError("vgId:%d, failed to build fc-commit msg since %s", ths->vgId, terrstr());
      } else {
        SyncLocalCmd* pSyncMsg = rpcMsgLocalCmd.pCont;
        pSyncMsg->cmd = SYNC_LOCAL_CMD_FOLLOWER_CMT;
        pSyncMsg->fcIndex = pMsg->commitIndex;
        // keep a copy: the message must not be read once it has been enqueued
        SyncIndex fcIndex = pSyncMsg->fcIndex;

        if (ths->syncEqMsg != NULL && ths->msgcb != NULL) {
          int32_t code = ths->syncEqMsg(ths->msgcb, &rpcMsgLocalCmd);
          if (code != 0) {
            sError("vgId:%d, sync enqueue fc-commit msg error, code:%d", ths->vgId, code);
            rpcFreeCont(rpcMsgLocalCmd.pCont);
          } else {
            sTrace("vgId:%d, sync enqueue fc-commit msg, fc-index:%" PRId64, ths->vgId, fcIndex);
          }
        } else {
          // no queue to hand the message to: free it instead of leaking it
          rpcFreeCont(rpcMsgLocalCmd.pCont);
        }
      }
    }
  }

  if (pMsg->term >= ths->pRaftStore->currentTerm && ths->state != TAOS_SYNC_STATE_FOLLOWER) {
    SRpcMsg rpcMsgLocalCmd = {0};
    if (syncBuildLocalCmd(&rpcMsgLocalCmd, ths->vgId) != 0 || rpcMsgLocalCmd.pCont == NULL) {
      sError("vgId:%d, failed to build step-down msg since %s", ths->vgId, terrstr());
    } else {
      SyncLocalCmd* pSyncMsg = rpcMsgLocalCmd.pCont;
      pSyncMsg->cmd = SYNC_LOCAL_CMD_STEP_DOWN;
      pSyncMsg->sdNewTerm = pMsg->term;

      if (ths->syncEqMsg != NULL && ths->msgcb != NULL) {
        int32_t code = ths->syncEqMsg(ths->msgcb, &rpcMsgLocalCmd);
        if (code != 0) {
          sError("vgId:%d, sync enqueue step-down msg error, code:%d", ths->vgId, code);
          rpcFreeCont(rpcMsgLocalCmd.pCont);
        } else {
          // log from the incoming heartbeat, not from the message that was just enqueued
          sTrace("vgId:%d, sync enqueue step-down msg, new-term: %" PRId64, ths->vgId, pMsg->term);
        }
      } else {
        // no queue to hand the message to: free it instead of leaking it
        rpcFreeCont(rpcMsgLocalCmd.pCont);
      }
    }
  }

  if (!replyBuilt) {
    return -1;
  }

  // reply
  syncNodeSendMsgById(&pMsgReply->destId, ths, &rpcMsg);
  return 0;
}

2541
// Handle a heartbeat reply: look up the log replication manager of the
// replying peer, record the receive time for that peer in pMatchIndex, and pass
// the reply to syncLogReplMgrProcessHeartbeatReply.
// Returns -1 when no replication manager exists for the sender, otherwise the
// result of syncLogReplMgrProcessHeartbeatReply.
int32_t syncNodeOnHeartbeatReply(SSyncNode* ths, const SRpcMsg* pRpcMsg) {
  const STraceId* trace = &pRpcMsg->info.traceId;
  char            tbuf[40] = {0};
  TRACE_TO_STR(trace, tbuf);

  SyncHeartbeatReply* pMsg = pRpcMsg->pCont;
  SSyncLogReplMgr*    pMgr = syncNodeGetLogReplMgr(ths, &pMsg->srcId);
  if (pMgr == NULL) {
    // "0x%016" PRIx64: zero-padded 16-digit hex (the original "0x016%" printed a literal "016")
    sError("vgId:%d, failed to get log repl mgr for the peer at addr 0x%016" PRIx64 "", ths->vgId, pMsg->srcId.addr);
    return -1;
  }

  int64_t tsMs = taosGetTimestampMs();
  syncLogRecvHeartbeatReply(ths, pMsg, tsMs - pMsg->timeStamp, tbuf);

  syncIndexMgrSetRecvTime(ths->pMatchIndex, &pMsg->srcId, tsMs);

  return syncLogReplMgrProcessHeartbeatReply(pMgr, ths, pMsg);
}

2561
// Legacy heartbeat-reply handler: logs the reply together with its transit
// time and records the receive time for the replying peer in pMatchIndex.
// Always returns 0.
int32_t syncNodeOnHeartbeatReplyOld(SSyncNode* ths, const SRpcMsg* pRpcMsg) {
  SyncHeartbeatReply* pReply = pRpcMsg->pCont;

  char            tbuf[40] = {0};
  const STraceId* trace = &pRpcMsg->info.traceId;
  TRACE_TO_STR(trace, tbuf);

  const int64_t nowMs = taosGetTimestampMs();
  syncLogRecvHeartbeatReply(ths, pReply, nowMs - pReply->timeStamp, tbuf);

  // remember when this peer last answered
  syncIndexMgrSetRecvTime(ths->pMatchIndex, &pReply->srcId, nowMs);
  return 0;
}

S
Shengliang Guan 已提交
2577 2578
// Execute a local command taken from the node's own queue.
//  SYNC_LOCAL_CMD_STEP_DOWN    -> step down to the term carried in the command
//  SYNC_LOCAL_CMD_FOLLOWER_CMT -> update the commit index from the command and
//                                 commit the log buffer up to ths->commitIndex
// Any other command is logged as an error. Always returns 0.
int32_t syncNodeOnLocalCmd(SSyncNode* ths, const SRpcMsg* pRpcMsg) {
  SyncLocalCmd* pCmd = pRpcMsg->pCont;
  syncLogRecvLocalCmd(ths, pCmd, "");

  switch (pCmd->cmd) {
    case SYNC_LOCAL_CMD_STEP_DOWN:
      syncNodeStepDown(ths, pCmd->sdNewTerm);
      break;

    case SYNC_LOCAL_CMD_FOLLOWER_CMT:
      (void)syncNodeUpdateCommitIndex(ths, pCmd->fcIndex);
      if (syncLogBufferCommit(ths->pLogBuf, ths, ths->commitIndex) < 0) {
        sError("vgId:%d, failed to commit raft log since %s. commit index: %" PRId64 "", ths->vgId, terrstr(),
               ths->commitIndex);
      }
      break;

    default:
      sError("error local cmd");
      break;
  }

  return 0;
}

// Legacy local-command handler: a step-down command calls syncNodeStepDown,
// a follower-commit command calls syncNodeFollowerCommit, anything else is
// logged as an error. Always returns 0.
int32_t syncNodeOnLocalCmdOld(SSyncNode* ths, const SRpcMsg* pRpcMsg) {
  SyncLocalCmd* pCmd = pRpcMsg->pCont;
  syncLogRecvLocalCmd(ths, pCmd, "");

  if (pCmd->cmd == SYNC_LOCAL_CMD_STEP_DOWN) {
    syncNodeStepDown(ths, pCmd->sdNewTerm);
    return 0;
  }

  if (pCmd->cmd == SYNC_LOCAL_CMD_FOLLOWER_CMT) {
    syncNodeFollowerCommit(ths, pCmd->fcIndex);
    return 0;
  }

  sError("error local cmd");
  return 0;
}

M
Minghao Li 已提交
2614 2615 2616 2617 2618 2619 2620 2621 2622 2623
// TLA+ Spec
// ClientRequest(i, v) ==
//     /\ state[i] = Leader
//     /\ LET entry == [term  |-> currentTerm[i],
//                      value |-> v]
//            newLog == Append(log[i], entry)
//        IN  log' = [log EXCEPT ![i] = newLog]
//     /\ UNCHANGED <<messages, serverVars, candidateVars,
//                    leaderVars, commitIndex>>
//
M
Minghao Li 已提交
2624

2625
// Turn a client request into a raft entry at the log buffer's end index and,
// if this node is leader, append it through syncNodeAppend.
// pRetIndex, when not NULL, receives the index assigned to the entry (written
// only on the leader path).
// Returns the result of syncNodeAppend on the leader path; returns -1 when the
// entry cannot be built or when the node is not leader (the entry is destroyed
// in that case instead of being leaked).
int32_t syncNodeOnClientRequest(SSyncNode* ths, SRpcMsg* pMsg, SyncIndex* pRetIndex) {
  sNTrace(ths, "on client request");

  SyncIndex       index = syncLogBufferGetEndIndex(ths->pLogBuf);
  SyncTerm        term = ths->pRaftStore->currentTerm;
  SSyncRaftEntry* pEntry = NULL;
  if (pMsg->msgType == TDMT_SYNC_CLIENT_REQUEST) {
    pEntry = syncEntryBuildFromClientRequest(pMsg->pCont, term, index);
  } else {
    pEntry = syncEntryBuildFromRpcMsg(pMsg, term, index);
  }

  if (pEntry == NULL) {
    sError("vgId:%d, failed to build raft entry for client request since %s", ths->vgId, terrstr());
    return -1;
  }

  if (ths->state != TAOS_SYNC_STATE_LEADER) {
    // the entry was never handed to the log buffer, so it is still owned here
    syncEntryDestroy(pEntry);
    return -1;
  }

  if (pRetIndex) {
    (*pRetIndex) = index;
  }

  int32_t code = syncNodeAppend(ths, pEntry);
  if (code < 0 && ths->vgId != 1 && vnodeIsMsgBlock(pEntry->originalRpcType)) {
    ASSERT(false && "failed to append blocking msg");
  }
  return code;
}

2654 2655
// Legacy client-request handler: build a raft entry at the log store's write
// index and, when this node is leader, append it to the log store, cache it,
// then replicate (multi replica) or advance the commit index (single replica).
// pRetIndex, when not NULL, receives the entry index on success. It is set to
// SYNC_INDEX_INVALID when the entry cannot be built and left untouched when the
// log store append fails.
// Returns 0 on success (also when the node is not leader; the entry is then
// discarded), -1 when the entry cannot be built or the log store append fails.
// On an append failure with more than one replica the FSM commit callback, if
// set, is invoked with code -1.
int32_t syncNodeOnClientRequestOld(SSyncNode* ths, SRpcMsg* pMsg, SyncIndex* pRetIndex) {
  sNTrace(ths, "on client request");

  SyncIndex       index = ths->pLogStore->syncLogWriteIndex(ths->pLogStore);
  SyncTerm        term = ths->pRaftStore->currentTerm;
  SSyncRaftEntry* pEntry = NULL;

  if (pMsg->msgType == TDMT_SYNC_CLIENT_REQUEST) {
    pEntry = syncEntryBuildFromClientRequest(pMsg->pCont, term, index);
  } else {
    pEntry = syncEntryBuildFromRpcMsg(pMsg, term, index);
  }

  if (pEntry == NULL) {
    // nothing to append; the code below dereferences the entry
    sError("vgId:%d, failed to build raft entry for client request since %s", ths->vgId, terrstr());
    if (pRetIndex != NULL) {
      *pRetIndex = SYNC_INDEX_INVALID;
    }
    return -1;
  }

  LRUHandle* h = NULL;

  if (ths->state == TAOS_SYNC_STATE_LEADER) {
    // append entry
    int32_t code = ths->pLogStore->syncLogAppendEntry(ths->pLogStore, pEntry);
    if (code != 0) {
      if (ths->replicaNum != 1 && ths->pFsm != NULL && ths->pFsm->FpCommitCb != NULL) {
        // del resp mgr, call FpCommitCb
        SFsmCbMeta cbMeta = {
            .index = pEntry->index,
            .lastConfigIndex = SYNC_INDEX_INVALID,
            .isWeak = pEntry->isWeak,
            .code = -1,
            .state = ths->state,
            .seqNum = pEntry->seqNum,
            .term = pEntry->term,
            .currentTerm = ths->pRaftStore->currentTerm,
            .flag = 0,
        };
        ths->pFsm->FpCommitCb(ths->pFsm, pMsg, &cbMeta);
      }

      // the entry has not been cached yet (h is still NULL), so destroy it directly
      syncEntryDestroy(pEntry);
      return -1;
    }

    syncCacheEntry(ths->pLogStore, pEntry, &h);

    // if multi replica, start replicate right now
    if (ths->replicaNum > 1) {
      syncNodeReplicate(ths);
    }

    // if only myself, maybe commit right now
    if (ths->replicaNum == 1) {
      if (syncNodeIsMnode(ths)) {
        syncMaybeAdvanceCommitIndex(ths);
      } else {
        syncOneReplicaAdvance(ths);
      }
    }
  }

  if (pRetIndex != NULL) {
    *pRetIndex = pEntry->index;
  }

  // with a cache handle the entry is released through the cache; otherwise destroy it directly
  if (h) {
    taosLRUCacheRelease(ths->pLogStore->pCache, h, false);
  } else {
    syncEntryDestroy(pEntry);
  }

  return 0;
}
M
Minghao Li 已提交
2743

S
Shengliang Guan 已提交
2744 2745 2746
// Map a sync state to its lowercase name; states not listed map to "unknown".
const char* syncStr(ESyncState state) {
  const char* name = "unknown";

  switch (state) {
    case TAOS_SYNC_STATE_FOLLOWER:
      name = "follower";
      break;
    case TAOS_SYNC_STATE_CANDIDATE:
      name = "candidate";
      break;
    case TAOS_SYNC_STATE_LEADER:
      name = "leader";
      break;
    case TAOS_SYNC_STATE_ERROR:
      name = "error";
      break;
    case TAOS_SYNC_STATE_OFFLINE:
      name = "offline";
      break;
    default:
      break;
  }

  return name;
}
2760

2761
#if 0
// (compiled out) Apply a leader-transfer entry on a follower.
// Does nothing and returns 0 unless the node is a follower, has finished
// restoring, and the entry's term and index are not behind the node's.
// When the transfer target matches this node (by raft id, or by fqdn and port),
// the election timer is restarted with a 1 ms timeout. The FSM's
// FpLeaderTransferCb, if set, is then invoked. Always returns 0.
int32_t syncDoLeaderTransfer(SSyncNode* ths, SRpcMsg* pRpcMsg, SSyncRaftEntry* pEntry) {
  if (ths->state != TAOS_SYNC_STATE_FOLLOWER) {
    sNTrace(ths, "I am not follower, can not do leader transfer");
    return 0;
  }
  if (!ths->restoreFinish) {
    sNTrace(ths, "restore not finish, can not do leader transfer");
    return 0;
  }
  if (pEntry->term < ths->pRaftStore->currentTerm) {
    sNTrace(ths, "little term:%" PRId64 ", can not do leader transfer", pEntry->term);
    return 0;
  }
  if (pEntry->index < syncNodeGetLastIndex(ths)) {
    sNTrace(ths, "little index:%" PRId64 ", can not do leader transfer", pEntry->index);
    return 0;
  }

  SyncLeaderTransfer* pTransfer = pRpcMsg->pCont;
  sNTrace(ths, "do leader transfer, index:%" PRId64, pEntry->index);

  bool idMatches = syncUtilSameId(&(pTransfer->newLeaderId), &(ths->myRaftId));
  bool addrMatches = strcmp(pTransfer->newNodeInfo.nodeFqdn, ths->myNodeInfo.nodeFqdn) == 0 &&
                     pTransfer->newNodeInfo.nodePort == ths->myNodeInfo.nodePort;

  if (idMatches || addrMatches) {
    // reset elect timer now!
    int32_t electMS = 1;
    int32_t ret = syncNodeRestartElectTimer(ths, electMS);
    ASSERT(ret == 0);

    sNTrace(ths, "maybe leader transfer to %s:%d %" PRId64, pTransfer->newNodeInfo.nodeFqdn,
            pTransfer->newNodeInfo.nodePort, pTransfer->newLeaderId.addr);
  }

  if (ths->pFsm->FpLeaderTransferCb == NULL) {
    return 0;
  }

  SFsmCbMeta cbMeta = {
      .code = 0,
      .currentTerm = ths->pRaftStore->currentTerm,
      .flag = 0,
      .index = pEntry->index,
      .lastConfigIndex = syncNodeGetSnapshotConfigIndex(ths, pEntry->index),
      .isWeak = pEntry->isWeak,
      .seqNum = pEntry->seqNum,
      .state = ths->state,
      .term = pEntry->term,
  };
  ths->pFsm->FpLeaderTransferCb(ths->pFsm, pRpcMsg, &cbMeta);

  return 0;
}

#endif

2828
// Find this node inside a new configuration and store its position in
// pNewCfg->myIndex. Each replica's raft id is derived from its fqdn/port and
// this node's vgId and compared with ths->myRaftId.
// Returns 0 when found, -1 when this node is not part of the new configuration.
int32_t syncNodeUpdateNewConfigIndex(SSyncNode* ths, SSyncCfg* pNewCfg) {
  int32_t pos = 0;

  while (pos < pNewCfg->replicaNum) {
    SRaftId candidate;
    candidate.addr = syncUtilAddr2U64((pNewCfg->nodeInfo)[pos].nodeFqdn, (pNewCfg->nodeInfo)[pos].nodePort);
    candidate.vgId = ths->vgId;

    if (syncUtilSameId(&(ths->myRaftId), &candidate)) {
      pNewCfg->myIndex = pos;
      return 0;
    }

    ++pos;
  }

  return -1;
}

2843 2844 2845 2846
// True only when the node has exactly one replica, the message type passes
// syncUtilUserCommit, and the node's vgId is not 1.
bool syncNodeIsOptimizedOneReplica(SSyncNode* ths, SRpcMsg* pMsg) {
  if (ths->replicaNum != 1) {
    return false;
  }
  if (!syncUtilUserCommit(pMsg->msgType)) {
    return false;
  }
  return ths->vgId != 1;
}

M
Minghao Li 已提交
2847
// Apply the log entries in [beginIndex, endIndex] to the FSM.
// NOTE: the function starts with ASSERT(false), i.e. it is not expected to be
// called any more; the body is kept as is apart from the cleanups below.
//
// If the FSM reports a snapshot whose lastApplyIndex is >= beginIndex, the
// range start is moved to lastApplyIndex + 1. Each remaining entry is taken
// from the log store's LRU cache or, on a miss, read from the log store; an
// entry that cannot be read is logged and skipped. For entries whose original
// rpc type passes syncUtilUserCommit, FpCommitCb is invoked unless the node is
// a restored single replica with vgId != 1. When the entry at the log store's
// last index is reached for the first time, FpRestoreFinishCb is invoked and
// restoreFinish is set.
//
// flag is passed through to the commit callback in SFsmCbMeta.flag.
// Returns 0, or -1 when ths is NULL.
int32_t syncNodeDoCommit(SSyncNode* ths, SyncIndex beginIndex, SyncIndex endIndex, uint64_t flag) {
  ASSERT(false);
  if (beginIndex > endIndex) {
    return 0;
  }

  if (ths == NULL) {
    return -1;
  }

  if (ths->pFsm != NULL && ths->pFsm->FpGetSnapshotInfo != NULL) {
    // advance commit index to snapshot first
    SSnapshot snapshot = {0};
    ths->pFsm->FpGetSnapshotInfo(ths->pFsm, &snapshot);
    if (snapshot.lastApplyIndex >= 0 && snapshot.lastApplyIndex >= beginIndex) {
      sNTrace(ths, "commit by snapshot from index:%" PRId64 " to index:%" PRId64, beginIndex, snapshot.lastApplyIndex);

      // update begin index
      beginIndex = snapshot.lastApplyIndex + 1;
    }
  }

  sNTrace(ths, "commit by wal from index:%" PRId64 " to index:%" PRId64, beginIndex, endIndex);

  // execute fsm
  if (ths->pFsm == NULL) {
    return 0;
  }

  for (SyncIndex i = beginIndex; i <= endIndex; ++i) {
    if (i == SYNC_INDEX_INVALID) {
      continue;
    }

    // start from NULL so the "pEntry == NULL" test below never reads an indeterminate pointer
    SSyncRaftEntry* pEntry = NULL;
    SLRUCache*      pCache = ths->pLogStore->pCache;
    LRUHandle*      h = taosLRUCacheLookup(pCache, &i, sizeof(i));
    if (h) {
      pEntry = (SSyncRaftEntry*)taosLRUCacheValue(pCache, h);

      ths->pLogStore->cacheHit++;
      sNTrace(ths, "hit cache index:%" PRId64 ", bytes:%u, %p", i, pEntry->bytes, pEntry);

    } else {
      ths->pLogStore->cacheMiss++;
      sNTrace(ths, "miss cache index:%" PRId64, i);

      int32_t code = ths->pLogStore->syncLogGetEntry(ths->pLogStore, i, &pEntry);
      if (code != 0 || pEntry == NULL) {
        sNError(ths, "get log entry error");
        sFatal("vgId:%d, get log entry %" PRId64 " error when commit since %s", ths->vgId, i, terrstr());
        continue;
      }
    }

    SRpcMsg rpcMsg = {0};
    syncEntry2OriginalRpc(pEntry, &rpcMsg);

    sTrace("do commit index:%" PRId64 ", type:%s", i, TMSG_INFO(pEntry->msgType));

    // user commit
    if ((ths->pFsm->FpCommitCb != NULL) && syncUtilUserCommit(pEntry->originalRpcType)) {
      bool internalExecute = true;
      if ((ths->replicaNum == 1) && ths->restoreFinish && ths->vgId != 1) {
        internalExecute = false;
      }

      sNTrace(ths, "user commit index:%" PRId64 ", internal:%d, type:%s", i, internalExecute,
              TMSG_INFO(pEntry->msgType));

      // execute fsm in apply thread, or execute outside syncPropose
      if (internalExecute) {
        SFsmCbMeta cbMeta = {
            .index = pEntry->index,
            .lastConfigIndex = syncNodeGetSnapshotConfigIndex(ths, pEntry->index),
            .isWeak = pEntry->isWeak,
            .code = 0,
            .state = ths->state,
            .seqNum = pEntry->seqNum,
            .term = pEntry->term,
            .currentTerm = ths->pRaftStore->currentTerm,
            .flag = flag,
        };

        syncRespMgrGetAndDel(ths->pSyncRespMgr, cbMeta.seqNum, &rpcMsg.info);
        ths->pFsm->FpCommitCb(ths->pFsm, &rpcMsg, &cbMeta);
      }
    }

    // restore finish
    // if only snapshot, a noop entry will be append, so syncLogLastIndex is always ok
    if (pEntry->index == ths->pLogStore->syncLogLastIndex(ths->pLogStore)) {
      if (ths->restoreFinish == false) {
        if (ths->pFsm->FpRestoreFinishCb != NULL) {
          ths->pFsm->FpRestoreFinishCb(ths->pFsm);
        }
        ths->restoreFinish = true;

        int64_t restoreDelay = taosGetTimestampMs() - ths->leaderTime;
        sNTrace(ths, "restore finish, index:%" PRId64 ", elapsed:%" PRId64 " ms", pEntry->index, restoreDelay);
      }
    }

    rpcFreeCont(rpcMsg.pCont);
    // a cached entry is released through the cache; one read from the log store is destroyed directly
    if (h) {
      taosLRUCacheRelease(pCache, h, false);
    } else {
      syncEntryDestroy(pEntry);
    }
  }

  return 0;
}

// True when pRaftId equals the raft id of one of the node's replicas.
bool syncNodeInRaftGroup(SSyncNode* ths, SRaftId* pRaftId) {
  int32_t idx = 0;

  while (idx < ths->replicaNum) {
    if (syncUtilSameId(&((ths->replicasId)[idx]), pRaftId)) {
      return true;
    }
    ++idx;
  }

  return false;
}

// Return the snapshot sender stored at the replica slot whose raft id equals
// pDestId, or NULL when no replica matches. All slots are scanned; if several
// match, the last one wins.
SSyncSnapshotSender* syncNodeGetSnapshotSender(SSyncNode* ths, SRaftId* pDestId) {
  SSyncSnapshotSender* pFound = NULL;

  for (int32_t slot = 0; slot < ths->replicaNum; ++slot) {
    if (!syncUtilSameId(pDestId, &((ths->replicasId)[slot]))) {
      continue;
    }
    pFound = (ths->senders)[slot];
  }

  return pFound;
}
M
Minghao Li 已提交
2988

2989 2990
// Return the peer heartbeat timer at the replica slot whose raft id equals
// pDestId, or NULL when no replica matches. All slots are scanned; if several
// match, the last one wins.
SSyncTimer* syncNodeGetHbTimer(SSyncNode* ths, SRaftId* pDestId) {
  SSyncTimer* pFound = NULL;

  for (int32_t slot = 0; slot < ths->replicaNum; ++slot) {
    if (!syncUtilSameId(pDestId, &((ths->replicasId)[slot]))) {
      continue;
    }
    pFound = &((ths->peerHeartbeatTimerArr)[slot]);
  }

  return pFound;
}

M
Minghao Li 已提交
2999 3000
// Return the peer state at the replica slot whose raft id equals pDestId, or
// NULL when no replica matches. All slots are scanned; if several match, the
// last one wins.
SPeerState* syncNodeGetPeerState(SSyncNode* ths, const SRaftId* pDestId) {
  SPeerState* pFound = NULL;

  for (int32_t slot = 0; slot < ths->replicaNum; ++slot) {
    if (!syncUtilSameId(pDestId, &((ths->replicasId)[slot]))) {
      continue;
    }
    pFound = &((ths->peerStates)[slot]);
  }

  return pFound;
}

// Decide whether an append-entries message should be sent to pDestId.
// Returns false when no peer state exists for the destination, or when the
// same index (pMsg->prevLogIndex + 1) was already sent less than
// SYNC_APPEND_ENTRIES_TIMEOUT_MS ago; true otherwise.
bool syncNodeNeedSendAppendEntries(SSyncNode* ths, const SRaftId* pDestId, const SyncAppendEntries* pMsg) {
  SPeerState* pPeer = syncNodeGetPeerState(ths, pDestId);
  if (pPeer == NULL) {
    sError("vgId:%d, replica maybe dropped", ths->vgId);
    return false;
  }

  const SyncIndex nextSendIndex = pMsg->prevLogIndex + 1;
  const int64_t   nowMs = taosGetTimestampMs();

  // same index sent again within the timeout window -> suppress the resend
  bool suppressed =
      (pPeer->lastSendIndex == nextSendIndex) && (nowMs - pPeer->lastSendTime < SYNC_APPEND_ENTRIES_TIMEOUT_MS);

  return !suppressed;
}

M
Minghao Li 已提交
3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039
// Report whether the node may change now. Returns false when:
//  - the node's `changing` flag is set,
//  - commitIndex is >= SYNC_INDEX_BEGIN but differs from the node's last index,
//  - a snapshot sender towards any peer has its start flag set.
// Returns true otherwise.
bool syncNodeCanChange(SSyncNode* pSyncNode) {
  if (pSyncNode->changing) {
    sError("sync cannot change");
    return false;
  }

  if ((pSyncNode->commitIndex >= SYNC_INDEX_BEGIN) && pSyncNode->commitIndex != syncNodeGetLastIndex(pSyncNode)) {
    sError("sync cannot change2");
    return false;
  }

  int32_t peer = 0;
  while (peer < pSyncNode->peersNum) {
    SSyncSnapshotSender* pPeerSender = syncNodeGetSnapshotSender(pSyncNode, &(pSyncNode->peersId)[peer]);
    if (pPeerSender != NULL && pPeerSender->start) {
      sError("sync cannot change3");
      return false;
    }
    ++peer;
  }

  return true;
}