syncMain.c 98.1 KB
Newer Older
M
Minghao Li 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/*
 * Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
 *
 * This program is free software: you can use, redistribute, and/or modify
 * it under the terms of the GNU Affero General Public License, version 3
 * or later ("AGPL"), as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

S
Shengliang Guan 已提交
16
#define _DEFAULT_SOURCE
M
Minghao Li 已提交
17
#include "sync.h"
M
Minghao Li 已提交
18 19
#include "syncAppendEntries.h"
#include "syncAppendEntriesReply.h"
M
Minghao Li 已提交
20
#include "syncCommit.h"
M
Minghao Li 已提交
21
#include "syncElection.h"
M
Minghao Li 已提交
22
#include "syncEnv.h"
M
Minghao Li 已提交
23
#include "syncIndexMgr.h"
M
Minghao Li 已提交
24
#include "syncInt.h"
M
Minghao Li 已提交
25
#include "syncMessage.h"
26
#include "syncPipeline.h"
M
Minghao Li 已提交
27
#include "syncRaftCfg.h"
M
Minghao Li 已提交
28
#include "syncRaftLog.h"
M
Minghao Li 已提交
29
#include "syncRaftStore.h"
M
Minghao Li 已提交
30
#include "syncReplication.h"
M
Minghao Li 已提交
31 32
#include "syncRequestVote.h"
#include "syncRequestVoteReply.h"
M
Minghao Li 已提交
33
#include "syncRespMgr.h"
M
Minghao Li 已提交
34
#include "syncSnapshot.h"
M
Minghao Li 已提交
35
#include "syncTimeout.h"
M
Minghao Li 已提交
36
#include "syncUtil.h"
M
Minghao Li 已提交
37
#include "syncVoteMgr.h"
38
#include "tglobal.h"
M
Minghao Li 已提交
39
#include "tref.h"
M
Minghao Li 已提交
40

M
Minghao Li 已提交
41 42 43 44 45
static void    syncNodeEqPingTimer(void* param, void* tmrId);
static void    syncNodeEqElectTimer(void* param, void* tmrId);
static void    syncNodeEqHeartbeatTimer(void* param, void* tmrId);
static int32_t syncNodeEqNoop(SSyncNode* ths);
static int32_t syncNodeAppendNoop(SSyncNode* ths);
46
static void    syncNodeEqPeerHeartbeatTimer(void* param, void* tmrId);
S
Shengliang Guan 已提交
47
static bool    syncIsConfigChanged(const SSyncCfg* pOldCfg, const SSyncCfg* pNewCfg);
S
Shengliang Guan 已提交
48 49 50
static int32_t syncHbTimerInit(SSyncNode* pSyncNode, SSyncTimer* pSyncTimer, SRaftId destId);
static int32_t syncHbTimerStart(SSyncNode* pSyncNode, SSyncTimer* pSyncTimer);
static int32_t syncHbTimerStop(SSyncNode* pSyncNode, SSyncTimer* pSyncTimer);
S
Shengliang Guan 已提交
51 52 53 54 55 56 57 58 59 60 61
static int32_t syncNodeUpdateNewConfigIndex(SSyncNode* ths, SSyncCfg* pNewCfg);
static bool    syncNodeInConfig(SSyncNode* pSyncNode, const SSyncCfg* config);
static void    syncNodeDoConfigChange(SSyncNode* pSyncNode, SSyncCfg* newConfig, SyncIndex lastConfigChangeIndex);
static bool    syncNodeIsOptimizedOneReplica(SSyncNode* ths, SRpcMsg* pMsg);

static bool    syncNodeCanChange(SSyncNode* pSyncNode);
static int32_t syncNodeLeaderTransfer(SSyncNode* pSyncNode);
static int32_t syncNodeLeaderTransferTo(SSyncNode* pSyncNode, SNodeInfo newLeader);
static int32_t syncDoLeaderTransfer(SSyncNode* ths, SRpcMsg* pRpcMsg, SSyncRaftEntry* pEntry);

static ESyncStrategy syncNodeStrategy(SSyncNode* pSyncNode);
M
Minghao Li 已提交
62

63
// Open a sync node described by pSyncInfo and register it with syncNodeAdd().
// Returns the id produced by syncNodeAdd() on success, or -1 if the node could
// not be opened or registered (the node is closed again in the latter case).
int64_t syncOpen(SSyncInfo* pSyncInfo) {
  SSyncNode* pNode = syncNodeOpen(pSyncInfo);
  if (pNode == NULL) {
    sError("vgId:%d, failed to open sync node", pSyncInfo->vgId);
    return -1;
  }

  pNode->rid = syncNodeAdd(pNode);
  if (pNode->rid < 0) {
    syncNodeClose(pNode);
    return -1;
  }

  // Copy the timing parameters and the message callbacks supplied by the caller.
  pNode->pingBaseLine = pSyncInfo->pingMs;
  pNode->pingTimerMS = pSyncInfo->pingMs;

  pNode->electBaseLine = pSyncInfo->electMs;

  pNode->hbBaseLine = pSyncInfo->heartbeatMs;
  pNode->heartbeatTimerMS = pSyncInfo->heartbeatMs;

  pNode->msgcb = pSyncInfo->msgcb;

  return pNode->rid;
}
M
Minghao Li 已提交
84

B
Benguang Zhao 已提交
85
int32_t syncStart(int64_t rid) {
S
Shengliang Guan 已提交
86
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
M
Minghao Li 已提交
87
  if (pSyncNode == NULL) {
B
Benguang Zhao 已提交
88 89 90 91 92
    sError("failed to acquire rid: %" PRId64 " of tsNodeReftId for pSyncNode", rid);
    return -1;
  }

  if (syncNodeRestore(pSyncNode) < 0) {
93
    sError("vgId:%d, failed to restore sync log buffer since %s", pSyncNode->vgId, terrstr());
94
    goto _err;
M
Minghao Li 已提交
95
  }
M
Minghao Li 已提交
96

B
Benguang Zhao 已提交
97 98 99 100
  if (syncNodeStart(pSyncNode) < 0) {
    sError("vgId:%d, failed to start sync node since %s", pSyncNode->vgId, terrstr());
    goto _err;
  }
M
Minghao Li 已提交
101

B
Benguang Zhao 已提交
102 103
  syncNodeRelease(pSyncNode);
  return 0;
M
Minghao Li 已提交
104

105 106 107
_err:
  syncNodeRelease(pSyncNode);
  return -1;
M
Minghao Li 已提交
108 109
}

M
Minghao Li 已提交
110
void syncStop(int64_t rid) {
S
Shengliang Guan 已提交
111
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
S
Shengliang Guan 已提交
112
  if (pSyncNode != NULL) {
113
    pSyncNode->isStart = false;
S
Shengliang Guan 已提交
114
    syncNodeRelease(pSyncNode);
S
Shengliang Guan 已提交
115
    syncNodeRemove(rid);
M
Minghao Li 已提交
116 117 118
  }
}

M
Minghao Li 已提交
119 120
void syncPreStop(int64_t rid) {
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
121 122 123
  if (pSyncNode != NULL) {
    syncNodePreClose(pSyncNode);
    syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
124 125 126
  }
}

S
Shengliang Guan 已提交
127 128 129
// A new config is accepted only if syncNodeInConfig() approves it and the
// replica count differs from the current one by at most one.
static bool syncNodeCheckNewConfig(SSyncNode* pSyncNode, const SSyncCfg* pCfg) {
  if (!syncNodeInConfig(pSyncNode, pCfg)) {
    return false;
  }

  int replicaDelta = abs(pCfg->replicaNum - pSyncNode->replicaNum);
  return replicaDelta <= 1;
}

S
Shengliang Guan 已提交
132
// Apply a new replica configuration to the sync node identified by rid.
//
// Returns 0 on success. Returns -1 when the node cannot be acquired, or when
// syncNodeCheckNewConfig() rejects pNewCfg (terrno is then set to
// TSDB_CODE_SYN_NEW_CONFIG_ERROR).
//
// When the node is currently leader, the heartbeat timer is stopped, every
// per-peer heartbeat timer slot is re-initialized from replicasId[], and the
// heartbeat timer is started again.
int32_t syncReconfig(int64_t rid, SSyncCfg* pNewCfg) {
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
  if (pSyncNode == NULL) return -1;

  if (!syncNodeCheckNewConfig(pSyncNode, pNewCfg)) {
    // Log while the reference is still held: the node must not be
    // dereferenced once it has been released.
    sError("vgId:%d, failed to reconfig since invalid new config", pSyncNode->vgId);
    syncNodeRelease(pSyncNode);
    terrno = TSDB_CODE_SYN_NEW_CONFIG_ERROR;
    return -1;
  }

  syncNodeUpdateNewConfigIndex(pSyncNode, pNewCfg);
  syncNodeDoConfigChange(pSyncNode, pNewCfg, SYNC_INDEX_INVALID);

  if (pSyncNode->state == TAOS_SYNC_STATE_LEADER) {
    syncNodeStopHeartbeatTimer(pSyncNode);

    for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
      syncHbTimerInit(pSyncNode, &pSyncNode->peerHeartbeatTimerArr[i], pSyncNode->replicasId[i]);
    }

    syncNodeStartHeartbeatTimer(pSyncNode);
    // syncNodeReplicate(pSyncNode);
  }

  syncNodeRelease(pSyncNode);
  return 0;
}
M
Minghao Li 已提交
160

S
Shengliang Guan 已提交
161 162 163 164
// Dispatch a sync message to the handler matching pMsg->msgType.
// Returns the handler's result, or -1 when the sync environment is not
// initialized, the node cannot be acquired, or the message type is unknown.
int32_t syncProcessMsg(int64_t rid, SRpcMsg* pMsg) {
  if (!syncIsInit()) {
    return -1;
  }

  SSyncNode* pNode = syncNodeAcquire(rid);
  if (pNode == NULL) {
    return -1;
  }

  int32_t result = -1;
  switch (pMsg->msgType) {
    case TDMT_SYNC_HEARTBEAT:
      result = syncNodeOnHeartbeat(pNode, pMsg);
      break;
    case TDMT_SYNC_HEARTBEAT_REPLY:
      result = syncNodeOnHeartbeatReply(pNode, pMsg);
      break;
    case TDMT_SYNC_TIMEOUT:
      result = syncNodeOnTimeout(pNode, pMsg);
      break;
    case TDMT_SYNC_CLIENT_REQUEST:
      result = syncNodeOnClientRequest(pNode, pMsg, NULL);
      break;
    case TDMT_SYNC_REQUEST_VOTE:
      result = syncNodeOnRequestVote(pNode, pMsg);
      break;
    case TDMT_SYNC_REQUEST_VOTE_REPLY:
      result = syncNodeOnRequestVoteReply(pNode, pMsg);
      break;
    case TDMT_SYNC_APPEND_ENTRIES:
      result = syncNodeOnAppendEntries(pNode, pMsg);
      break;
    case TDMT_SYNC_APPEND_ENTRIES_REPLY:
      result = syncNodeOnAppendEntriesReply(pNode, pMsg);
      break;
    case TDMT_SYNC_SNAPSHOT_SEND:
      result = syncNodeOnSnapshot(pNode, pMsg);
      break;
    case TDMT_SYNC_SNAPSHOT_RSP:
      result = syncNodeOnSnapshotReply(pNode, pMsg);
      break;
    case TDMT_SYNC_LOCAL_CMD:
      result = syncNodeOnLocalCmd(pNode, pMsg);
      break;
    default:
      sError("vgId:%d, failed to process msg:%p since invalid type:%s", pNode->vgId, pMsg,
             TMSG_INFO(pMsg->msgType));
      result = -1;
      break;
  }

  syncNodeRelease(pNode);
  return result;
}

S
Shengliang Guan 已提交
212
int32_t syncLeaderTransfer(int64_t rid) {
S
Shengliang Guan 已提交
213
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
S
Shengliang Guan 已提交
214
  if (pSyncNode == NULL) return -1;
215

S
Shengliang Guan 已提交
216
  int32_t ret = syncNodeLeaderTransfer(pSyncNode);
S
Shengliang Guan 已提交
217
  syncNodeRelease(pSyncNode);
218 219 220
  return ret;
}

M
Minghao Li 已提交
221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236
// Return the smallest match index recorded for any peer of pSyncNode, or
// SYNC_INDEX_INVALID when the node has no peers.
SyncIndex syncMinMatchIndex(SSyncNode* pSyncNode) {
  SyncIndex lowest = SYNC_INDEX_INVALID;

  for (int32_t peer = 0; peer < pSyncNode->peersNum; ++peer) {
    SyncIndex current = syncIndexMgrGetIndex(pSyncNode->pMatchIndex, &(pSyncNode->peersId[peer]));
    // The first peer seeds the minimum; later peers can only lower it.
    if (peer == 0 || current < lowest) {
      lowest = current;
    }
  }

  return lowest;
}

237
int32_t syncBeginSnapshot(int64_t rid, int64_t lastApplyIndex) {
S
Shengliang Guan 已提交
238
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
239
  if (pSyncNode == NULL) {
240
    sError("sync begin snapshot error");
241 242
    return -1;
  }
243

244 245
  int32_t code = 0;

M
Minghao Li 已提交
246
  if (syncNodeIsMnode(pSyncNode)) {
M
Minghao Li 已提交
247 248 249
    // mnode
    int64_t logRetention = SYNC_MNODE_LOG_RETENTION;

M
Minghao Li 已提交
250 251 252
    SyncIndex beginIndex = pSyncNode->pLogStore->syncLogBeginIndex(pSyncNode->pLogStore);
    SyncIndex endIndex = pSyncNode->pLogStore->syncLogEndIndex(pSyncNode->pLogStore);
    int64_t   logNum = endIndex - beginIndex;
M
Minghao Li 已提交
253 254 255
    bool      isEmpty = pSyncNode->pLogStore->syncLogIsEmpty(pSyncNode->pLogStore);

    if (isEmpty || (!isEmpty && logNum < logRetention)) {
S
Shengliang Guan 已提交
256 257
      sNTrace(pSyncNode, "new-snapshot-index:%" PRId64 ", log-num:%" PRId64 ", empty:%d, do not delete wal",
              lastApplyIndex, logNum, isEmpty);
S
Shengliang Guan 已提交
258
      syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
259 260 261
      return 0;
    }

M
Minghao Li 已提交
262 263 264
    goto _DEL_WAL;

  } else {
265 266 267 268 269 270 271 272 273 274 275 276
    lastApplyIndex -= SYNC_VNODE_LOG_RETENTION;

    SyncIndex beginIndex = pSyncNode->pLogStore->syncLogBeginIndex(pSyncNode->pLogStore);
    SyncIndex endIndex = pSyncNode->pLogStore->syncLogEndIndex(pSyncNode->pLogStore);
    bool      isEmpty = pSyncNode->pLogStore->syncLogIsEmpty(pSyncNode->pLogStore);

    if (isEmpty || !(lastApplyIndex >= beginIndex && lastApplyIndex <= endIndex)) {
      sNTrace(pSyncNode, "new-snapshot-index:%" PRId64 ", empty:%d, do not delete wal", lastApplyIndex, isEmpty);
      syncNodeRelease(pSyncNode);
      return 0;
    }

M
Minghao Li 已提交
277 278 279 280 281 282 283 284 285 286 287 288 289 290
    // vnode
    if (pSyncNode->replicaNum > 1) {
      // multi replicas

      if (pSyncNode->state == TAOS_SYNC_STATE_LEADER) {
        pSyncNode->minMatchIndex = syncMinMatchIndex(pSyncNode);

        for (int32_t i = 0; i < pSyncNode->peersNum; ++i) {
          int64_t matchIndex = syncIndexMgrGetIndex(pSyncNode->pMatchIndex, &(pSyncNode->peersId[i]));
          if (lastApplyIndex > matchIndex) {
            do {
              char     host[64];
              uint16_t port;
              syncUtilU642Addr(pSyncNode->peersId[i].addr, host, sizeof(host), &port);
S
Shengliang Guan 已提交
291 292 293 294
              sNTrace(pSyncNode,
                      "new-snapshot-index:%" PRId64 " is greater than match-index:%" PRId64
                      " of %s:%d, do not delete wal",
                      lastApplyIndex, matchIndex, host, port);
M
Minghao Li 已提交
295 296
            } while (0);

S
Shengliang Guan 已提交
297
            syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
298 299 300 301 302 303
            return 0;
          }
        }

      } else if (pSyncNode->state == TAOS_SYNC_STATE_FOLLOWER) {
        if (lastApplyIndex > pSyncNode->minMatchIndex) {
S
Shengliang Guan 已提交
304 305 306
          sNTrace(pSyncNode,
                  "new-snapshot-index:%" PRId64 " is greater than min-match-index:%" PRId64 ", do not delete wal",
                  lastApplyIndex, pSyncNode->minMatchIndex);
S
Shengliang Guan 已提交
307
          syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
308 309 310 311
          return 0;
        }

      } else if (pSyncNode->state == TAOS_SYNC_STATE_CANDIDATE) {
S
Shengliang Guan 已提交
312
        sNTrace(pSyncNode, "new-snapshot-index:%" PRId64 " candidate, do not delete wal", lastApplyIndex);
S
Shengliang Guan 已提交
313
        syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
314 315 316
        return 0;

      } else {
S
Shengliang Guan 已提交
317
        sNTrace(pSyncNode, "new-snapshot-index:%" PRId64 " unknown state, do not delete wal", lastApplyIndex);
S
Shengliang Guan 已提交
318
        syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
319 320 321 322 323 324 325 326 327
        return 0;
      }

      goto _DEL_WAL;

    } else {
      // one replica

      goto _DEL_WAL;
328 329 330
    }
  }

M
Minghao Li 已提交
331
_DEL_WAL:
332

M
Minghao Li 已提交
333
  do {
334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353
    SSyncLogStoreData* pData = pSyncNode->pLogStore->data;
    SyncIndex          snapshotVer = walGetSnapshotVer(pData->pWal);
    SyncIndex          walCommitVer = walGetCommittedVer(pData->pWal);
    SyncIndex          wallastVer = walGetLastVer(pData->pWal);
    if (lastApplyIndex <= walCommitVer) {
      SyncIndex snapshottingIndex = atomic_load_64(&pSyncNode->snapshottingIndex);

      if (snapshottingIndex == SYNC_INDEX_INVALID) {
        atomic_store_64(&pSyncNode->snapshottingIndex, lastApplyIndex);
        pSyncNode->snapshottingTime = taosGetTimestampMs();

        code = walBeginSnapshot(pData->pWal, lastApplyIndex);
        if (code == 0) {
          sNTrace(pSyncNode, "wal snapshot begin, index:%" PRId64 ", last apply index:%" PRId64,
                  pSyncNode->snapshottingIndex, lastApplyIndex);
        } else {
          sNError(pSyncNode, "wal snapshot begin error since:%s, index:%" PRId64 ", last apply index:%" PRId64,
                  terrstr(terrno), pSyncNode->snapshottingIndex, lastApplyIndex);
          atomic_store_64(&pSyncNode->snapshottingIndex, SYNC_INDEX_INVALID);
        }
354

M
Minghao Li 已提交
355
      } else {
356 357
        sNTrace(pSyncNode, "snapshotting for %" PRId64 ", do not delete wal for new-snapshot-index:%" PRId64,
                snapshottingIndex, lastApplyIndex);
M
Minghao Li 已提交
358
      }
359
    }
M
Minghao Li 已提交
360
  } while (0);
361

S
Shengliang Guan 已提交
362
  syncNodeRelease(pSyncNode);
363 364 365 366
  return code;
}

int32_t syncEndSnapshot(int64_t rid) {
S
Shengliang Guan 已提交
367
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
368
  if (pSyncNode == NULL) {
369
    sError("sync end snapshot error");
370 371 372
    return -1;
  }

373 374 375 376
  int32_t code = 0;
  if (atomic_load_64(&pSyncNode->snapshottingIndex) != SYNC_INDEX_INVALID) {
    SSyncLogStoreData* pData = pSyncNode->pLogStore->data;
    code = walEndSnapshot(pData->pWal);
M
Minghao Li 已提交
377
    if (code != 0) {
378
      sNError(pSyncNode, "wal snapshot end error since:%s", terrstr());
S
Shengliang Guan 已提交
379
      syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
380 381
      return -1;
    } else {
S
Shengliang Guan 已提交
382
      sNTrace(pSyncNode, "wal snapshot end, index:%" PRId64, atomic_load_64(&pSyncNode->snapshottingIndex));
M
Minghao Li 已提交
383 384
      atomic_store_64(&pSyncNode->snapshottingIndex, SYNC_INDEX_INVALID);
    }
385
  }
386

S
Shengliang Guan 已提交
387
  syncNodeRelease(pSyncNode);
388 389 390
  return code;
}

M
Minghao Li 已提交
391
// Call syncNodeStepDown() with newTerm on the node identified by rid.
// Returns 0 once the call has been made, -1 if the node cannot be acquired.
int32_t syncStepDown(int64_t rid, SyncTerm newTerm) {
  SSyncNode* pNode = syncNodeAcquire(rid);
  if (pNode != NULL) {
    syncNodeStepDown(pNode, newTerm);
    syncNodeRelease(pNode);
    return 0;
  }

  sError("sync step down error");
  return -1;
}

403
// Tell whether pSyncNode can serve reads.
//
// Returns false with terrno set when:
//  - pSyncNode is NULL            (TSDB_CODE_SYN_INTERNAL_ERROR)
//  - the node is not leader       (TSDB_CODE_SYN_NOT_LEADER)
//  - the node is still restoring  (TSDB_CODE_SYN_RESTORING)
//
// A leader with restoreFinish set is ready immediately. Otherwise it is ready
// only when the apply queue is empty and the last log entry is a
// TDMT_SYNC_NOOP entry written in the current term.
bool syncNodeIsReadyForRead(SSyncNode* pSyncNode) {
  if (pSyncNode == NULL) {
    terrno = TSDB_CODE_SYN_INTERNAL_ERROR;
    sError("sync ready for read error");
    return false;
  }

  if (pSyncNode->state != TAOS_SYNC_STATE_LEADER) {
    terrno = TSDB_CODE_SYN_NOT_LEADER;
    return false;
  }

  if (pSyncNode->restoreFinish) {
    return true;
  }

  bool ready = false;
  if (pSyncNode->pFsm->FpApplyQueueEmptyCb(pSyncNode->pFsm) &&
      !pSyncNode->pLogStore->syncLogIsEmpty(pSyncNode->pLogStore)) {
    SyncIndex       lastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);
    SSyncRaftEntry* pEntry = NULL;
    SLRUCache*      pCache = pSyncNode->pLogStore->pCache;
    LRUHandle*      h = taosLRUCacheLookup(pCache, &lastIndex, sizeof(lastIndex));
    int32_t         code = 0;

    if (h) {
      pEntry = (SSyncRaftEntry*)taosLRUCacheValue(pCache, h);
      pSyncNode->pLogStore->cacheHit++;
      if (pEntry != NULL) {
        sNTrace(pSyncNode, "hit cache index:%" PRId64 ", bytes:%u, %p", lastIndex, pEntry->bytes, pEntry);
      }
    } else {
      pSyncNode->pLogStore->cacheMiss++;
      sNTrace(pSyncNode, "miss cache index:%" PRId64, lastIndex);

      code = pSyncNode->pLogStore->syncLogGetEntry(pSyncNode->pLogStore, lastIndex, &pEntry);
    }

    if (code == 0 && pEntry != NULL) {
      if (pEntry->originalRpcType == TDMT_SYNC_NOOP && pEntry->term == pSyncNode->pRaftStore->currentTerm) {
        ready = true;
      }
    }

    // Give back whatever was obtained above on every path, not only on the
    // success path: a cache handle is released, a fetched entry is destroyed.
    if (h) {
      taosLRUCacheRelease(pCache, h, false);
    } else if (pEntry != NULL) {
      syncEntryDestroy(pEntry);
    }
  }

  if (!ready) {
    terrno = TSDB_CODE_SYN_RESTORING;
  }

  return ready;
}

// rid-based wrapper around syncNodeIsReadyForRead().
// Returns false when the node cannot be acquired.
bool syncIsReadyForRead(int64_t rid) {
  SSyncNode* pNode = syncNodeAcquire(rid);
  if (pNode == NULL) {
    sError("sync ready for read error");
    return false;
  }

  bool isReady = syncNodeIsReadyForRead(pNode);
  syncNodeRelease(pNode);

  return isReady;
}
M
Minghao Li 已提交
478

479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500
// rid-based wrapper around syncNodeSnapshotSending().
// Returns false when the node cannot be acquired.
bool syncSnapshotSending(int64_t rid) {
  bool       sending = false;
  SSyncNode* pNode = syncNodeAcquire(rid);

  if (pNode != NULL) {
    sending = syncNodeSnapshotSending(pNode);
    syncNodeRelease(pNode);
  }

  return sending;
}

// rid-based wrapper around syncNodeSnapshotRecving().
// Returns false when the node cannot be acquired.
bool syncSnapshotRecving(int64_t rid) {
  bool       receiving = false;
  SSyncNode* pNode = syncNodeAcquire(rid);

  if (pNode != NULL) {
    receiving = syncNodeSnapshotRecving(pNode);
    syncNodeRelease(pNode);
  }

  return receiving;
}

M
Minghao Li 已提交
501 502
// Pick a peer and ask it to take over leadership.
// Returns -1 with terrno = TSDB_CODE_SYN_ONE_REPLICA when there are no peers.
// Returns 0 without doing anything when this node is not leader or has a
// single replica; otherwise returns the result of syncNodeLeaderTransferTo().
// With exactly two peers, the one with the larger match index is chosen
// (peer 0 wins ties); in every other case peer 0 is chosen.
int32_t syncNodeLeaderTransfer(SSyncNode* pSyncNode) {
  if (pSyncNode->peersNum == 0) {
    sDebug("vgId:%d, only one replica, cannot leader transfer", pSyncNode->vgId);
    terrno = TSDB_CODE_SYN_ONE_REPLICA;
    return -1;
  }

  if (pSyncNode->state != TAOS_SYNC_STATE_LEADER || pSyncNode->replicaNum <= 1) {
    return 0;
  }

  int32_t target = 0;
  if (pSyncNode->peersNum == 2) {
    SyncIndex firstMatch = syncIndexMgrGetIndex(pSyncNode->pMatchIndex, &(pSyncNode->peersId[0]));
    SyncIndex secondMatch = syncIndexMgrGetIndex(pSyncNode->pMatchIndex, &(pSyncNode->peersId[1]));
    if (secondMatch > firstMatch) {
      target = 1;
    }
  }

  SNodeInfo newLeader = (pSyncNode->peersNodeInfo)[target];
  return syncNodeLeaderTransferTo(pSyncNode, newLeader);
}

M
Minghao Li 已提交
524 525
// Propose a leader-transfer message that names newLeader as the next leader.
//
// Returns -1 with terrno = TSDB_CODE_SYN_ONE_REPLICA when the node has a
// single replica, -1 when the transfer message cannot be built, and otherwise
// the result of syncNodePropose(). The message content is freed here after
// the propose call returns.
int32_t syncNodeLeaderTransferTo(SSyncNode* pSyncNode, SNodeInfo newLeader) {
  if (pSyncNode->replicaNum == 1) {
    sDebug("vgId:%d, only one replica, cannot leader transfer", pSyncNode->vgId);
    terrno = TSDB_CODE_SYN_ONE_REPLICA;
    return -1;
  }

  sNTrace(pSyncNode, "begin leader transfer to %s:%u", newLeader.nodeFqdn, newLeader.nodePort);

  SRpcMsg rpcMsg = {0};
  int32_t code = syncBuildLeaderTransfer(&rpcMsg, pSyncNode->vgId);
  if (code != 0 || rpcMsg.pCont == NULL) {
    // Without a message body there is nothing to fill in or propose.
    sError("vgId:%d, failed to build leader transfer msg since %s", pSyncNode->vgId, terrstr());
    return -1;
  }

  SyncLeaderTransfer* pMsg = rpcMsg.pCont;
  pMsg->newLeaderId.addr = syncUtilAddr2U64(newLeader.nodeFqdn, newLeader.nodePort);
  pMsg->newLeaderId.vgId = pSyncNode->vgId;
  pMsg->newNodeInfo = newLeader;

  int32_t ret = syncNodePropose(pSyncNode, &rpcMsg, false);
  rpcFreeCont(rpcMsg.pCont);
  return ret;
}

546 547
SSyncState syncGetState(int64_t rid) {
  SSyncState state = {.state = TAOS_SYNC_STATE_ERROR};
M
Minghao Li 已提交
548

S
Shengliang Guan 已提交
549
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
550 551 552
  if (pSyncNode != NULL) {
    state.state = pSyncNode->state;
    state.restored = pSyncNode->restoreFinish;
553 554 555 556 557
    if (pSyncNode->vgId != 1) {
      state.canRead = syncNodeIsReadyForRead(pSyncNode);
    } else {
      state.canRead = state.restored;
    }
558
    syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
559 560
  }

561
  return state;
M
Minghao Li 已提交
562 563
}

564
#if 0
// Disabled code: excluded from compilation by the surrounding #if 0.

// Fill pSnapshot with the term and config index that belong to log entry `index`.
int32_t syncGetSnapshotByIndex(int64_t rid, SyncIndex index, SSnapshot* pSnapshot) {
  if (index < SYNC_INDEX_BEGIN) {
    return -1;
  }

  SSyncNode* pNode = syncNodeAcquire(rid);
  if (pNode == NULL) {
    return -1;
  }
  ASSERT(rid == pNode->rid);

  SSyncRaftEntry* pEntry = NULL;
  if (pNode->pLogStore->syncLogGetEntry(pNode->pLogStore, index, &pEntry) != 0) {
    if (pEntry != NULL) {
      syncEntryDestroy(pEntry);
    }
    syncNodeRelease(pNode);
    return -1;
  }
  ASSERT(pEntry != NULL);

  pSnapshot->data = NULL;
  pSnapshot->lastApplyIndex = index;
  pSnapshot->lastApplyTerm = pEntry->term;
  pSnapshot->lastConfigIndex = syncNodeGetSnapshotConfigIndex(pNode, index);

  syncEntryDestroy(pEntry);
  syncNodeRelease(pNode);
  return 0;
}

// Report the node's latest config index through sMeta.
int32_t syncGetSnapshotMeta(int64_t rid, struct SSnapshotMeta* sMeta) {
  SSyncNode* pNode = syncNodeAcquire(rid);
  if (pNode == NULL) {
    return -1;
  }
  ASSERT(rid == pNode->rid);

  sMeta->lastConfigIndex = pNode->pRaftCfg->lastConfigIndex;
  sTrace("vgId:%d, get snapshot meta, lastConfigIndex:%" PRId64, pNode->vgId, pNode->pRaftCfg->lastConfigIndex);

  syncNodeRelease(pNode);
  return 0;
}

// Report through sMeta the largest recorded config index that is <= snapshotIndex,
// falling back to the first recorded config index.
int32_t syncGetSnapshotMetaByIndex(int64_t rid, SyncIndex snapshotIndex, struct SSnapshotMeta* sMeta) {
  SSyncNode* pNode = syncNodeAcquire(rid);
  if (pNode == NULL) {
    return -1;
  }
  ASSERT(rid == pNode->rid);
  ASSERT(pNode->pRaftCfg->configIndexCount >= 1);

  SyncIndex best = (pNode->pRaftCfg->configIndexArr)[0];
  for (int32_t i = 0; i < pNode->pRaftCfg->configIndexCount; ++i) {
    SyncIndex candidate = (pNode->pRaftCfg->configIndexArr)[i];
    if (candidate > best && candidate <= snapshotIndex) {
      best = candidate;
    }
  }
  sMeta->lastConfigIndex = best;
  sTrace("vgId:%d, get snapshot meta by index:%" PRId64 " lcindex:%" PRId64, pNode->vgId, snapshotIndex,
         sMeta->lastConfigIndex);

  syncNodeRelease(pNode);
  return 0;
}
#endif
635

636
// Return the largest recorded config index that does not exceed
// snapshotLastApplyIndex. The first recorded config index is the starting
// value and is returned when no later entry qualifies.
SyncIndex syncNodeGetSnapshotConfigIndex(SSyncNode* pSyncNode, SyncIndex snapshotLastApplyIndex) {
  ASSERT(pSyncNode->pRaftCfg->configIndexCount >= 1);

  SyncIndex best = (pSyncNode->pRaftCfg->configIndexArr)[0];
  for (int32_t slot = 0; slot < pSyncNode->pRaftCfg->configIndexCount; ++slot) {
    SyncIndex candidate = (pSyncNode->pRaftCfg->configIndexArr)[slot];
    if (candidate <= snapshotLastApplyIndex && candidate > best) {
      best = candidate;
    }
  }

  sTrace("vgId:%d, sync get last config index, index:%" PRId64 " lcindex:%" PRId64, pSyncNode->vgId,
         snapshotLastApplyIndex, best);
  return best;
}

652 653
// Fill pEpSet with the endpoints of every replica in the node's raft config.
// numOfEps is reset to 0 first and stays 0 when the node cannot be acquired.
// When at least one endpoint was copied, inUse is set to the slot following
// this node's own index (wrapping around).
void syncGetRetryEpSet(int64_t rid, SEpSet* pEpSet) {
  pEpSet->numOfEps = 0;

  SSyncNode* pNode = syncNodeAcquire(rid);
  if (pNode == NULL) {
    return;
  }

  for (int32_t idx = 0; idx < pNode->pRaftCfg->cfg.replicaNum; ++idx) {
    SEp* pTarget = &pEpSet->eps[idx];
    tstrncpy(pTarget->fqdn, pNode->pRaftCfg->cfg.nodeInfo[idx].nodeFqdn, TSDB_FQDN_LEN);
    pTarget->port = (pNode->pRaftCfg->cfg.nodeInfo)[idx].nodePort;
    pEpSet->numOfEps++;
    sDebug("vgId:%d, sync get retry epset, index:%d %s:%d", pNode->vgId, idx, pTarget->fqdn, pTarget->port);
  }

  if (pEpSet->numOfEps > 0) {
    pEpSet->inUse = (pNode->pRaftCfg->cfg.myIndex + 1) % pEpSet->numOfEps;
  }

  sInfo("vgId:%d, sync get retry epset numOfEps:%d inUse:%d", pNode->vgId, pEpSet->numOfEps, pEpSet->inUse);
  syncNodeRelease(pNode);
}

M
Minghao Li 已提交
673
// rid-based wrapper around syncNodePropose().
// Returns -1 when the node cannot be acquired, otherwise the forwarded result.
int32_t syncPropose(int64_t rid, SRpcMsg* pMsg, bool isWeak) {
  SSyncNode* pNode = syncNodeAcquire(rid);
  if (pNode == NULL) {
    sError("sync propose error");
    return -1;
  }

  int32_t result = syncNodePropose(pNode, pMsg, isWeak);
  syncNodeRelease(pNode);

  return result;
}
M
Minghao Li 已提交
684

685
// Propose a client message on pSyncNode.
//
// Returns -1 with terrno set when the node is not leader
// (TSDB_CODE_SYN_NOT_LEADER), when it has not finished restoring and
// vgId != 1, or when heartbeat replies have timed out
// (TSDB_CODE_SYN_PROPOSE_NOT_READY in both cases).
//
// Optimized one-replica path: the request is handled inline by
// syncNodeOnClientRequest(); returns 1 on success after filling
// pMsg->info.conn.applyIndex/applyTerm, or -1 with
// terrno = TSDB_CODE_SYN_INTERNAL_ERROR on failure.
//
// Regular path: a response stub is registered, the client request is built
// and handed to the syncEqMsg callback. Returns -1 if building fails,
// otherwise the callback's result; the stub is removed again on any failure.
int32_t syncNodePropose(SSyncNode* pSyncNode, SRpcMsg* pMsg, bool isWeak) {
  if (pSyncNode->state != TAOS_SYNC_STATE_LEADER) {
    terrno = TSDB_CODE_SYN_NOT_LEADER;
    sNError(pSyncNode, "sync propose not leader, %s, type:%s", syncStr(pSyncNode->state), TMSG_INFO(pMsg->msgType));
    return -1;
  }

  // not restored, vnode enable
  if (!pSyncNode->restoreFinish && pSyncNode->vgId != 1) {
    terrno = TSDB_CODE_SYN_PROPOSE_NOT_READY;
    sNError(pSyncNode, "failed to sync propose since not ready, type:%s, last:%" PRId64 ", cmt:%" PRId64,
            TMSG_INFO(pMsg->msgType), syncNodeGetLastIndex(pSyncNode), pSyncNode->commitIndex);
    return -1;
  }

  // heartbeat timeout
  if (syncNodeHeartbeatReplyTimeout(pSyncNode)) {
    terrno = TSDB_CODE_SYN_PROPOSE_NOT_READY;
    sNError(pSyncNode, "failed to sync propose since hearbeat timeout, type:%s, last:%" PRId64 ", cmt:%" PRId64,
            TMSG_INFO(pMsg->msgType), syncNodeGetLastIndex(pSyncNode), pSyncNode->commitIndex);
    return -1;
  }

  // optimized one replica
  if (syncNodeIsOptimizedOneReplica(pSyncNode, pMsg)) {
    // Initialized so the failure log below never reads an indeterminate value
    // when the handler returns without writing the index.
    SyncIndex retIndex = SYNC_INDEX_INVALID;
    int32_t   code = syncNodeOnClientRequest(pSyncNode, pMsg, &retIndex);
    if (code == 0) {
      pMsg->info.conn.applyIndex = retIndex;
      pMsg->info.conn.applyTerm = pSyncNode->pRaftStore->currentTerm;
      sTrace("vgId:%d, propose optimized msg, index:%" PRId64 " type:%s", pSyncNode->vgId, retIndex,
             TMSG_INFO(pMsg->msgType));
      return 1;
    } else {
      terrno = TSDB_CODE_SYN_INTERNAL_ERROR;
      sError("vgId:%d, failed to propose optimized msg, index:%" PRId64 " type:%s", pSyncNode->vgId, retIndex,
             TMSG_INFO(pMsg->msgType));
      return -1;
    }
  } else {
    SRespStub stub = {.createTime = taosGetTimestampMs(), .rpcMsg = *pMsg};
    uint64_t  seqNum = syncRespMgrAdd(pSyncNode->pSyncRespMgr, &stub);
    SRpcMsg   rpcMsg = {0};
    int32_t   code = syncBuildClientRequest(&rpcMsg, pMsg, seqNum, isWeak, pSyncNode->vgId);
    if (code != 0) {
      sError("vgId:%d, failed to propose msg while serialize since %s", pSyncNode->vgId, terrstr());
      (void)syncRespMgrDel(pSyncNode->pSyncRespMgr, seqNum);
      return -1;
    }

    sNTrace(pSyncNode, "propose msg, type:%s", TMSG_INFO(pMsg->msgType));
    code = (*pSyncNode->syncEqMsg)(pSyncNode->msgcb, &rpcMsg);
    if (code != 0) {
      sError("vgId:%d, failed to propose msg while enqueue since %s", pSyncNode->vgId, terrstr());
      (void)syncRespMgrDel(pSyncNode->pSyncRespMgr, seqNum);
    }

    return code;
  }
}

S
Shengliang Guan 已提交
746
// Reset a per-peer heartbeat timer slot for destination destId: no timer
// handle, zero counter and logic clock, interval taken from the node's
// hbBaseLine, timestamp set to now. Always returns 0.
static int32_t syncHbTimerInit(SSyncNode* pSyncNode, SSyncTimer* pSyncTimer, SRaftId destId) {
  pSyncTimer->destId = destId;
  pSyncTimer->timerCb = syncNodeEqPeerHeartbeatTimer;
  pSyncTimer->timerMS = pSyncNode->hbBaseLine;

  pSyncTimer->pTimer = NULL;
  pSyncTimer->counter = 0;
  pSyncTimer->timeStamp = taosGetTimestampMs();
  atomic_store_64(&pSyncTimer->logicClock, 0);

  return 0;
}

S
Shengliang Guan 已提交
757
// Arm the heartbeat timer of one peer.
//   pSyncNode  - owning node (its rid is stored in the timer data)
//   pSyncTimer - per-peer timer to arm
// The timer callback receives the rid of an SSyncHbTimerData record. An
// existing record is reused when pSyncTimer->hbDataRid still resolves to one,
// otherwise a new record is allocated and registered.
// Returns 0 on success and also when the sync env is not initialized (only an
// error is logged in that case); returns -1 with terrno set if the timer data
// cannot be allocated.
static int32_t syncHbTimerStart(SSyncNode* pSyncNode, SSyncTimer* pSyncTimer) {
  int32_t ret = 0;
  int64_t tsNow = taosGetTimestampMs();

  if (syncIsInit()) {
    SSyncHbTimerData* pData = syncHbTimerDataAcquire(pSyncTimer->hbDataRid);
    if (pData == NULL) {
      pData = taosMemoryMalloc(sizeof(SSyncHbTimerData));
      if (pData == NULL) {
        // the allocation result was previously dereferenced unchecked
        terrno = TSDB_CODE_OUT_OF_MEMORY;
        sError("vgId:%d, failed to start hb timer since %s", pSyncNode->vgId, terrstr());
        return -1;
      }
      pData->rid = syncHbTimerDataAdd(pData);
    }
    pSyncTimer->hbDataRid = pData->rid;
    pSyncTimer->timeStamp = tsNow;

    // snapshot of the timer state handed to the callback
    pData->syncNodeRid = pSyncNode->rid;
    pData->pTimer = pSyncTimer;
    pData->destId = pSyncTimer->destId;
    pData->logicClock = pSyncTimer->logicClock;
    pData->execTime = tsNow + pSyncTimer->timerMS;

    // the underlying timer ticks HEARTBEAT_TICK_NUM times per heartbeat interval
    taosTmrReset(pSyncTimer->timerCb, pSyncTimer->timerMS / HEARTBEAT_TICK_NUM, (void*)(pData->rid),
                 syncEnv()->pTimerManager, &pSyncTimer->pTimer);
  } else {
    sError("vgId:%d, start ctrl hb timer error, sync env is stop", pSyncNode->vgId);
  }

  return ret;
}

S
Shengliang Guan 已提交
783
// Disarm the heartbeat timer of one peer and drop its timer-data record.
// Always returns 0.
static int32_t syncHbTimerStop(SSyncNode* pSyncNode, SSyncTimer* pSyncTimer) {
  // advance the logic clock (presumably so callbacks armed with the old value
  // can be recognized as stale -- confirm against the timer callback)
  atomic_add_fetch_64(&pSyncTimer->logicClock, 1);

  taosTmrStop(pSyncTimer->pTimer);
  pSyncTimer->pTimer = NULL;

  syncHbTimerDataRemove(pSyncTimer->hbDataRid);
  pSyncTimer->hbDataRid = -1;

  return 0;
}

793
// Restore the log store from the FSM snapshot when the log does not connect
// to the snapshot's last applied index.
// A restore is triggered when the last log index is behind the snapshot, or
// when the first log index leaves a gap after it.
// Returns 0 on success (including when no restore is needed), -1 on failure.
int32_t syncNodeLogStoreRestoreOnNeed(SSyncNode* pNode) {
  ASSERTS(pNode->pLogStore != NULL, "log store not created");
  ASSERTS(pNode->pFsm != NULL, "pFsm not registered");
  ASSERTS(pNode->pFsm->FpGetSnapshotInfo != NULL, "FpGetSnapshotInfo not registered");

  SSnapshot snapshot;
  int32_t   snapRet = pNode->pFsm->FpGetSnapshotInfo(pNode->pFsm, &snapshot);
  if (snapRet < 0) {
    sError("vgId:%d, failed to get snapshot info since %s", pNode->vgId, terrstr());
    return -1;
  }

  SyncIndex commitIndex = snapshot.lastApplyIndex;
  SyncIndex firstVer = pNode->pLogStore->syncLogBeginIndex(pNode->pLogStore);
  SyncIndex lastVer = pNode->pLogStore->syncLogLastIndex(pNode->pLogStore);

  // log already covers the snapshot point without a gap: nothing to do
  if (lastVer >= commitIndex && firstVer <= commitIndex + 1) {
    return 0;
  }

  if (pNode->pLogStore->syncLogRestoreFromSnapshot(pNode->pLogStore, commitIndex)) {
    sError("vgId:%d, failed to restore log store from snapshot since %s. lastVer: %" PRId64 ", snapshotVer: %" PRId64,
           pNode->vgId, terrstr(), lastVer, commitIndex);
    return -1;
  }

  return 0;
}

M
Minghao Li 已提交
815
// open/close --------------
S
Shengliang Guan 已提交
816 817
// Allocate and initialize a sync node from the caller-supplied SSyncInfo.
//
// Steps, in order:
//   1. make sure the sync directory and raft_config.json exist; reconcile the
//      input config with the persisted one (pSyncInfo->syncCfg may be
//      overwritten with the persisted config)
//   2. copy paths / callbacks from pSyncInfo, create the log ring buffer
//   3. open the raft config and derive my id, the peers and the replica ids
//   4. take ownership of pSyncInfo->pFsm (pSyncInfo->pFsm is set to NULL)
//   5. open the raft store, vote managers, index managers and log store
//   6. seed commitIndex from the FSM snapshot and restore the log store if needed
//   7. initialize timers, response manager, snapshot senders/receiver,
//      replication managers, peer state and the log buffer
//
// Returns the new node, or NULL on failure. On failure everything allocated so
// far is released through syncNodeClose(), and pSyncInfo->pFsm is freed if the
// node had not taken ownership of it yet.
SSyncNode* syncNodeOpen(SSyncInfo* pSyncInfo) {
  SSyncNode* pSyncNode = taosMemoryCalloc(1, sizeof(SSyncNode));
  if (pSyncNode == NULL) {
    terrno = TSDB_CODE_OUT_OF_MEMORY;
    goto _error;
  }

  // set vgId first so every log line below reports the real vgroup id
  // (it used to be assigned only after the config-file handling, so the early
  // error logs always printed vgId:0)
  pSyncNode->vgId = pSyncInfo->vgId;

  if (!taosDirExist((char*)(pSyncInfo->path))) {
    if (taosMkDir(pSyncInfo->path) != 0) {
      terrno = TAOS_SYSTEM_ERROR(errno);
      sError("failed to create dir:%s since %s", pSyncInfo->path, terrstr());
      goto _error;
    }
  }

  snprintf(pSyncNode->configPath, sizeof(pSyncNode->configPath), "%s%sraft_config.json", pSyncInfo->path, TD_DIRSEP);
  if (!taosCheckExistFile(pSyncNode->configPath)) {
    // No persisted config exists, so the input config is the only source.
    // The old code tried to fall back to pSyncNode->pRaftCfg->cfg here, but
    // pRaftCfg is still NULL in this branch, which was a NULL dereference.
    // Fail the open instead, before writing an empty config file.
    if (pSyncInfo->syncCfg.replicaNum == 0) {
      terrno = TSDB_CODE_SYN_INTERNAL_ERROR;
      sError("vgId:%d, sync config not input and no raft cfg file at %s", pSyncNode->vgId, pSyncNode->configPath);
      goto _error;
    }

    // create a new raft config file
    SRaftCfgMeta meta = {0};
    meta.isStandBy = pSyncInfo->isStandBy;
    meta.snapshotStrategy = pSyncInfo->snapshotStrategy;
    meta.lastConfigIndex = SYNC_INDEX_INVALID;
    meta.batchSize = pSyncInfo->batchSize;
    if (raftCfgCreateFile(&pSyncInfo->syncCfg, meta, pSyncNode->configPath) != 0) {
      sError("vgId:%d, failed to create raft cfg file at %s", pSyncNode->vgId, pSyncNode->configPath);
      goto _error;
    }
  } else {
    // update syncCfg by raft_config.json
    pSyncNode->pRaftCfg = raftCfgOpen(pSyncNode->configPath);
    if (pSyncNode->pRaftCfg == NULL) {
      sError("vgId:%d, failed to open raft cfg file at %s", pSyncNode->vgId, pSyncNode->configPath);
      goto _error;
    }

    if (pSyncInfo->syncCfg.replicaNum > 0 && syncIsConfigChanged(&pSyncNode->pRaftCfg->cfg, &pSyncInfo->syncCfg)) {
      // the input config wins and is written back to the file
      sInfo("vgId:%d, use sync config from input options and write to cfg file", pSyncNode->vgId);
      pSyncNode->pRaftCfg->cfg = pSyncInfo->syncCfg;
      if (raftCfgPersist(pSyncNode->pRaftCfg) != 0) {
        sError("vgId:%d, failed to persist raft cfg file at %s", pSyncNode->vgId, pSyncNode->configPath);
        goto _error;
      }
    } else {
      // no (or identical) input config: the persisted config wins
      sInfo("vgId:%d, use sync config from raft cfg file", pSyncNode->vgId);
      pSyncInfo->syncCfg = pSyncNode->pRaftCfg->cfg;
    }

    // the config is re-opened below once the paths are set up
    raftCfgClose(pSyncNode->pRaftCfg);
    pSyncNode->pRaftCfg = NULL;
  }

  // init by SSyncInfo
  SSyncCfg* pCfg = &pSyncInfo->syncCfg;
  sDebug("vgId:%d, replica:%d selfIndex:%d", pSyncNode->vgId, pCfg->replicaNum, pCfg->myIndex);
  for (int32_t i = 0; i < pCfg->replicaNum; ++i) {
    SNodeInfo* pNode = &pCfg->nodeInfo[i];
    sDebug("vgId:%d, index:%d ep:%s:%u", pSyncNode->vgId, i, pNode->nodeFqdn, pNode->nodePort);
  }

  memcpy(pSyncNode->path, pSyncInfo->path, sizeof(pSyncNode->path));
  snprintf(pSyncNode->raftStorePath, sizeof(pSyncNode->raftStorePath), "%s%sraft_store.json", pSyncInfo->path,
           TD_DIRSEP);
  snprintf(pSyncNode->configPath, sizeof(pSyncNode->configPath), "%s%sraft_config.json", pSyncInfo->path, TD_DIRSEP);

  pSyncNode->pWal = pSyncInfo->pWal;
  pSyncNode->msgcb = pSyncInfo->msgcb;
  pSyncNode->syncSendMSg = pSyncInfo->syncSendMSg;
  pSyncNode->syncEqMsg = pSyncInfo->syncEqMsg;
  pSyncNode->syncEqCtrlMsg = pSyncInfo->syncEqCtrlMsg;

  // create raft log ring buffer
  pSyncNode->pLogBuf = syncLogBufferCreate();
  if (pSyncNode->pLogBuf == NULL) {
    sError("failed to init sync log buffer since %s. vgId:%d", terrstr(), pSyncNode->vgId);
    goto _error;
  }

  // init raft config
  pSyncNode->pRaftCfg = raftCfgOpen(pSyncNode->configPath);
  if (pSyncNode->pRaftCfg == NULL) {
    sError("vgId:%d, failed to open raft cfg file at %s", pSyncNode->vgId, pSyncNode->configPath);
    goto _error;
  }

  // init internal
  pSyncNode->myNodeInfo = pSyncNode->pRaftCfg->cfg.nodeInfo[pSyncNode->pRaftCfg->cfg.myIndex];
  if (!syncUtilNodeInfo2RaftId(&pSyncNode->myNodeInfo, pSyncNode->vgId, &pSyncNode->myRaftId)) {
    sError("vgId:%d, failed to determine my raft member id", pSyncNode->vgId);
    goto _error;
  }

  // init peersNum, peers, peersId: every replica except myself
  pSyncNode->peersNum = pSyncNode->pRaftCfg->cfg.replicaNum - 1;
  int32_t j = 0;
  for (int32_t i = 0; i < pSyncNode->pRaftCfg->cfg.replicaNum; ++i) {
    if (i != pSyncNode->pRaftCfg->cfg.myIndex) {
      pSyncNode->peersNodeInfo[j] = pSyncNode->pRaftCfg->cfg.nodeInfo[i];
      j++;
    }
  }
  for (int32_t i = 0; i < pSyncNode->peersNum; ++i) {
    if (!syncUtilNodeInfo2RaftId(&pSyncNode->peersNodeInfo[i], pSyncNode->vgId, &pSyncNode->peersId[i])) {
      sError("vgId:%d, failed to determine raft member id, peer:%d", pSyncNode->vgId, i);
      goto _error;
    }
  }

  // init replicaNum, replicasId
  pSyncNode->replicaNum = pSyncNode->pRaftCfg->cfg.replicaNum;
  for (int32_t i = 0; i < pSyncNode->pRaftCfg->cfg.replicaNum; ++i) {
    if (!syncUtilNodeInfo2RaftId(&pSyncNode->pRaftCfg->cfg.nodeInfo[i], pSyncNode->vgId, &pSyncNode->replicasId[i])) {
      sError("vgId:%d, failed to determine raft member id, replica:%d", pSyncNode->vgId, i);
      goto _error;
    }
  }

  // init raft algorithm
  // ownership of the FSM moves to the node; from here on the error path must
  // not free it through pSyncInfo, syncNodeClose() does it instead
  pSyncNode->pFsm = pSyncInfo->pFsm;
  pSyncInfo->pFsm = NULL;
  pSyncNode->quorum = syncUtilQuorum(pSyncNode->pRaftCfg->cfg.replicaNum);
  pSyncNode->leaderCache = EMPTY_RAFT_ID;

  // init life cycle outside

  // TLA+ Spec
  // InitHistoryVars == /\ elections = {}
  //                    /\ allLogs   = {}
  //                    /\ voterLog  = [i \in Server |-> [j \in {} |-> <<>>]]
  // InitServerVars == /\ currentTerm = [i \in Server |-> 1]
  //                   /\ state       = [i \in Server |-> Follower]
  //                   /\ votedFor    = [i \in Server |-> Nil]
  // InitCandidateVars == /\ votesResponded = [i \in Server |-> {}]
  //                      /\ votesGranted   = [i \in Server |-> {}]
  // \* The values nextIndex[i][i] and matchIndex[i][i] are never read, since the
  // \* leader does not send itself messages. It's still easier to include these
  // \* in the functions.
  // InitLeaderVars == /\ nextIndex  = [i \in Server |-> [j \in Server |-> 1]]
  //                   /\ matchIndex = [i \in Server |-> [j \in Server |-> 0]]
  // InitLogVars == /\ log          = [i \in Server |-> << >>]
  //                /\ commitIndex  = [i \in Server |-> 0]
  // Init == /\ messages = [m \in {} |-> 0]
  //         /\ InitHistoryVars
  //         /\ InitServerVars
  //         /\ InitCandidateVars
  //         /\ InitLeaderVars
  //         /\ InitLogVars
  //

  // init TLA+ server vars
  pSyncNode->state = TAOS_SYNC_STATE_FOLLOWER;
  pSyncNode->pRaftStore = raftStoreOpen(pSyncNode->raftStorePath);
  if (pSyncNode->pRaftStore == NULL) {
    sError("vgId:%d, failed to open raft store at path %s", pSyncNode->vgId, pSyncNode->raftStorePath);
    goto _error;
  }

  // init TLA+ candidate vars
  pSyncNode->pVotesGranted = voteGrantedCreate(pSyncNode);
  if (pSyncNode->pVotesGranted == NULL) {
    sError("vgId:%d, failed to create VotesGranted", pSyncNode->vgId);
    goto _error;
  }
  pSyncNode->pVotesRespond = votesRespondCreate(pSyncNode);
  if (pSyncNode->pVotesRespond == NULL) {
    sError("vgId:%d, failed to create VotesRespond", pSyncNode->vgId);
    goto _error;
  }

  // init TLA+ leader vars
  pSyncNode->pNextIndex = syncIndexMgrCreate(pSyncNode);
  if (pSyncNode->pNextIndex == NULL) {
    sError("vgId:%d, failed to create SyncIndexMgr", pSyncNode->vgId);
    goto _error;
  }
  pSyncNode->pMatchIndex = syncIndexMgrCreate(pSyncNode);
  if (pSyncNode->pMatchIndex == NULL) {
    sError("vgId:%d, failed to create SyncIndexMgr", pSyncNode->vgId);
    goto _error;
  }

  // init TLA+ log vars
  pSyncNode->pLogStore = logStoreCreate(pSyncNode);
  if (pSyncNode->pLogStore == NULL) {
    sError("vgId:%d, failed to create SyncLogStore", pSyncNode->vgId);
    goto _error;
  }

  // seed the commit index from the FSM snapshot, if the FSM can report one
  SyncIndex commitIndex = SYNC_INDEX_INVALID;
  if (pSyncNode->pFsm != NULL && pSyncNode->pFsm->FpGetSnapshotInfo != NULL) {
    SSnapshot snapshot = {0};
    int32_t   code = pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot);
    if (code != 0) {
      sError("vgId:%d, failed to get snapshot info, code:%d", pSyncNode->vgId, code);
      goto _error;
    }
    if (snapshot.lastApplyIndex > commitIndex) {
      commitIndex = snapshot.lastApplyIndex;
      sNTrace(pSyncNode, "reset commit index by snapshot");
    }
  }
  pSyncNode->commitIndex = commitIndex;

  if (syncNodeLogStoreRestoreOnNeed(pSyncNode) < 0) {
    goto _error;
  }

  // timer ms init
  pSyncNode->pingBaseLine = PING_TIMER_MS;
  pSyncNode->electBaseLine = tsElectInterval;
  pSyncNode->hbBaseLine = tsHeartbeatInterval;

  // init ping timer
  pSyncNode->pPingTimer = NULL;
  pSyncNode->pingTimerMS = pSyncNode->pingBaseLine;
  atomic_store_64(&pSyncNode->pingTimerLogicClock, 0);
  atomic_store_64(&pSyncNode->pingTimerLogicClockUser, 0);
  pSyncNode->FpPingTimerCB = syncNodeEqPingTimer;
  pSyncNode->pingTimerCounter = 0;

  // init elect timer
  pSyncNode->pElectTimer = NULL;
  pSyncNode->electTimerMS = syncUtilElectRandomMS(pSyncNode->electBaseLine, 2 * pSyncNode->electBaseLine);
  atomic_store_64(&pSyncNode->electTimerLogicClock, 0);
  pSyncNode->FpElectTimerCB = syncNodeEqElectTimer;
  pSyncNode->electTimerCounter = 0;

  // init heartbeat timer
  pSyncNode->pHeartbeatTimer = NULL;
  pSyncNode->heartbeatTimerMS = pSyncNode->hbBaseLine;
  atomic_store_64(&pSyncNode->heartbeatTimerLogicClock, 0);
  atomic_store_64(&pSyncNode->heartbeatTimerLogicClockUser, 0);
  pSyncNode->FpHeartbeatTimerCB = syncNodeEqHeartbeatTimer;
  pSyncNode->heartbeatTimerCounter = 0;

  // init peer heartbeat timer
  for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
    syncHbTimerInit(pSyncNode, &(pSyncNode->peerHeartbeatTimerArr[i]), (pSyncNode->replicasId)[i]);
  }

  // tools
  pSyncNode->pSyncRespMgr = syncRespMgrCreate(pSyncNode, SYNC_RESP_TTL_MS);
  if (pSyncNode->pSyncRespMgr == NULL) {
    sError("vgId:%d, failed to create SyncRespMgr", pSyncNode->vgId);
    goto _error;
  }

  // restore state
  pSyncNode->restoreFinish = false;

  // snapshot senders
  // NOTE(review): the result of snapshotSenderCreate() is stored unchecked;
  // confirm whether it can return NULL and whether sSTrace tolerates that.
  for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
    SSyncSnapshotSender* pSender = snapshotSenderCreate(pSyncNode, i);
    (pSyncNode->senders)[i] = pSender;
    sSTrace(pSender, "snapshot sender create new while open, data:%p", pSender);
  }

  // snapshot receivers
  pSyncNode->pNewNodeReceiver = snapshotReceiverCreate(pSyncNode, EMPTY_RAFT_ID);

  // is config changing
  pSyncNode->changing = false;

  // replication mgr
  syncNodeLogReplMgrInit(pSyncNode);

  // peer state
  syncNodePeerStateInit(pSyncNode);

  // min match index
  pSyncNode->minMatchIndex = SYNC_INDEX_INVALID;

  // raft itself is started later, in syncNodeStart

  int64_t timeNow = taosGetTimestampMs();
  pSyncNode->startTime = timeNow;
  pSyncNode->leaderTime = timeNow;
  pSyncNode->lastReplicateTime = timeNow;

  // snapshotting
  atomic_store_64(&pSyncNode->snapshottingIndex, SYNC_INDEX_INVALID);

  // init log buffer
  if (syncLogBufferInit(pSyncNode->pLogBuf, pSyncNode) < 0) {
    sError("vgId:%d, failed to init sync log buffer since %s", pSyncNode->vgId, terrstr());
    goto _error;
  }

  // statistics counters
  pSyncNode->isStart = true;
  pSyncNode->electNum = 0;
  pSyncNode->becomeLeaderNum = 0;
  pSyncNode->configChangeNum = 0;
  pSyncNode->hbSlowNum = 0;
  pSyncNode->hbrSlowNum = 0;
  pSyncNode->tmrRoutineNum = 0;

  sNInfo(pSyncNode, "sync open, node:%p", pSyncNode);
  sTrace("vgId:%d, tsElectInterval:%d, tsHeartbeatInterval:%d, tsHeartbeatTimeout:%d", pSyncNode->vgId, tsElectInterval,
         tsHeartbeatInterval, tsHeartbeatTimeout);

  return pSyncNode;

_error:
  // the FSM is still owned by pSyncInfo only if the node never took it over
  if (pSyncInfo->pFsm) {
    taosMemoryFree(pSyncInfo->pFsm);
    pSyncInfo->pFsm = NULL;
  }
  // syncNodeClose() returns immediately when pSyncNode is NULL
  syncNodeClose(pSyncNode);
  pSyncNode = NULL;
  return NULL;
}

M
Minghao Li 已提交
1135 1136 1137 1138
// Raise the node's commitIndex to the FSM snapshot's lastApplyIndex when the
// snapshot is ahead. Does nothing if the FSM or its FpGetSnapshotInfo callback
// is not registered.
void syncNodeMaybeUpdateCommitBySnapshot(SSyncNode* pSyncNode) {
  if (pSyncNode->pFsm == NULL || pSyncNode->pFsm->FpGetSnapshotInfo == NULL) {
    return;
  }

  SSnapshot snapshot;
  int32_t   code = pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot);
  ASSERT(code == 0);

  // commitIndex only ever moves forward here
  if (pSyncNode->commitIndex < snapshot.lastApplyIndex) {
    pSyncNode->commitIndex = snapshot.lastApplyIndex;
  }
}

B
Benguang Zhao 已提交
1146
// Check that the log buffer lines up with the log store, then commit the log
// buffer up to the larger of the node's and the log store's commit index.
// Returns 0 on success; -1 if the buffer end does not follow the last log
// index (terrno = TSDB_CODE_WAL_LOG_INCOMPLETE) or if the commit fails.
int32_t syncNodeRestore(SSyncNode* pSyncNode) {
  ASSERTS(pSyncNode->pLogStore != NULL, "log store not created");
  ASSERTS(pSyncNode->pLogBuf != NULL, "ring log buffer not created");

  SyncIndex lastVer = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);
  SyncIndex commitIndex = pSyncNode->pLogStore->syncLogCommitIndex(pSyncNode->pLogStore);
  SyncIndex endIndex = pSyncNode->pLogBuf->endIndex;

  // a non-empty log must end exactly one entry before the buffer's end index
  bool logHasEntries = (lastVer != -1);
  if (logHasEntries && endIndex != lastVer + 1) {
    terrno = TSDB_CODE_WAL_LOG_INCOMPLETE;
    sError("vgId:%d, failed to restore sync node since %s. expected lastLogIndex: %" PRId64 ", lastVer: %" PRId64 "",
           pSyncNode->vgId, terrstr(), endIndex - 1, lastVer);
    return -1;
  }

  ASSERT(endIndex == lastVer + 1);
  commitIndex = TMAX(pSyncNode->commitIndex, commitIndex);

  return (syncLogBufferCommit(pSyncNode->pLogBuf, pSyncNode, commitIndex) < 0) ? -1 : 0;
}

// Start raft on an opened node and arm the ping timer.
// With exactly one replica the node advances the term, becomes leader and
// appends a noop entry; otherwise it starts as follower.
// Returns the result of syncNodeStartPingTimer().
int32_t syncNodeStart(SSyncNode* pSyncNode) {
  if (pSyncNode->replicaNum != 1) {
    syncNodeBecomeFollower(pSyncNode, "first start");
  } else {
    raftStoreNextTerm(pSyncNode->pRaftStore);
    syncNodeBecomeLeader(pSyncNode, "one replica start");

    // Raft 3.6.2 Committing entries from previous terms
    syncNodeAppendNoop(pSyncNode);
  }

  int32_t ret = syncNodeStartPingTimer(pSyncNode);
  ASSERT(ret == 0);
  return ret;
}

// Older start routine: same as syncNodeStart, but in the single-replica case
// it additionally calls syncMaybeAdvanceCommitIndex() after appending the
// noop, and it returns nothing.
void syncNodeStartOld(SSyncNode* pSyncNode) {
  if (pSyncNode->replicaNum != 1) {
    syncNodeBecomeFollower(pSyncNode, "first start");
  } else {
    raftStoreNextTerm(pSyncNode->pRaftStore);
    syncNodeBecomeLeader(pSyncNode, "one replica start");

    // Raft 3.6.2 Committing entries from previous terms
    syncNodeAppendNoop(pSyncNode);
    syncMaybeAdvanceCommitIndex(pSyncNode);
  }

  int32_t ret = syncNodeStartPingTimer(pSyncNode);
  ASSERT(ret == 0);
}

B
Benguang Zhao 已提交
1207
// Start a node in stand-by: follower state, no heartbeats, election timer set
// to TIMER_MAX_MS, ping timer running.
// Returns the result of syncNodeStartPingTimer().
int32_t syncNodeStartStandBy(SSyncNode* pSyncNode) {
  // state change
  pSyncNode->state = TAOS_SYNC_STATE_FOLLOWER;
  syncNodeStopHeartbeatTimer(pSyncNode);

  // reset elect timer, long enough
  int32_t ret = syncNodeRestartElectTimer(pSyncNode, TIMER_MAX_MS);
  ASSERT(ret == 0);

  ret = syncNodeStartPingTimer(pSyncNode);
  ASSERT(ret == 0);
  return ret;
}

M
Minghao Li 已提交
1223
// Quiesce a node before it is closed:
//   1. if the FSM exposes FpApplyQueueItems, poll it every 20 ms until it
//      reports 0 or -1
//   2. force-stop (if started) and destroy the new-node snapshot receiver
//   3. stop the election and heartbeat timers
// A NULL node is ignored. The first step already tolerated NULL, but the
// remaining steps dereferenced the pointer unconditionally.
void syncNodePreClose(SSyncNode* pSyncNode) {
  if (pSyncNode == NULL) {
    return;
  }

  if (pSyncNode->pFsm != NULL && pSyncNode->pFsm->FpApplyQueueItems != NULL) {
    while (1) {
      int32_t aqItems = pSyncNode->pFsm->FpApplyQueueItems(pSyncNode->pFsm);
      sTrace("vgId:%d, pre close, %d items in apply queue", pSyncNode->vgId, aqItems);
      if (aqItems == 0 || aqItems == -1) {
        break;
      }
      taosMsleep(20);
    }
  }

  if (pSyncNode->pNewNodeReceiver != NULL) {
    if (snapshotReceiverIsStart(pSyncNode->pNewNodeReceiver)) {
      snapshotReceiverForceStop(pSyncNode->pNewNodeReceiver);
    }

    snapshotReceiverDestroy(pSyncNode->pNewNodeReceiver);
    pSyncNode->pNewNodeReceiver = NULL;
  }

  // stop elect timer
  syncNodeStopElectTimer(pSyncNode);

  // stop heartbeat timer
  syncNodeStopHeartbeatTimer(pSyncNode);
}

1251
// Release one heartbeat timer-data record.
void syncHbTimerDataFree(SSyncHbTimerData* pData) {
  taosMemoryFree(pData);
}
M
Minghao Li 已提交
1252

M
Minghao Li 已提交
1253
// Tear down a sync node and free it. A NULL node is ignored.
// Releases, in this order: raft store, replication managers, response manager,
// vote managers, index managers, log store, log buffer and raft config. It
// then stops the ping/elect/heartbeat timers, frees the FSM, destroys the
// snapshot senders and receiver, and finally frees the node itself.
void syncNodeClose(SSyncNode* pSyncNode) {
  if (pSyncNode == NULL) {
    return;
  }
  sNInfo(pSyncNode, "sync close, node:%p", pSyncNode);

  // persistent raft state
  int32_t ret = raftStoreClose(pSyncNode->pRaftStore);
  ASSERT(ret == 0);
  pSyncNode->pRaftStore = NULL;

  // replication / response bookkeeping
  syncNodeLogReplMgrDestroy(pSyncNode);
  syncRespMgrDestroy(pSyncNode->pSyncRespMgr);
  pSyncNode->pSyncRespMgr = NULL;

  // election bookkeeping
  voteGrantedDestroy(pSyncNode->pVotesGranted);
  pSyncNode->pVotesGranted = NULL;
  votesRespondDestory(pSyncNode->pVotesRespond);
  pSyncNode->pVotesRespond = NULL;

  // leader bookkeeping
  syncIndexMgrDestroy(pSyncNode->pNextIndex);
  pSyncNode->pNextIndex = NULL;
  syncIndexMgrDestroy(pSyncNode->pMatchIndex);
  pSyncNode->pMatchIndex = NULL;

  // log store, log buffer, config
  logStoreDestory(pSyncNode->pLogStore);
  pSyncNode->pLogStore = NULL;
  syncLogBufferDestroy(pSyncNode->pLogBuf);
  pSyncNode->pLogBuf = NULL;
  raftCfgClose(pSyncNode->pRaftCfg);
  pSyncNode->pRaftCfg = NULL;

  // timers
  syncNodeStopPingTimer(pSyncNode);
  syncNodeStopElectTimer(pSyncNode);
  syncNodeStopHeartbeatTimer(pSyncNode);

  if (pSyncNode->pFsm != NULL) {
    taosMemoryFree(pSyncNode->pFsm);
  }

  // snapshot senders: stop the ones still running, then destroy
  for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
    if ((pSyncNode->senders)[i] == NULL) {
      continue;
    }
    sSTrace((pSyncNode->senders)[i], "snapshot sender destroy while close, data:%p", (pSyncNode->senders)[i]);

    if (snapshotSenderIsStart((pSyncNode->senders)[i])) {
      snapshotSenderStop((pSyncNode->senders)[i], false);
    }

    snapshotSenderDestroy((pSyncNode->senders)[i]);
    (pSyncNode->senders)[i] = NULL;
  }

  // snapshot receiver
  if (pSyncNode->pNewNodeReceiver != NULL) {
    if (snapshotReceiverIsStart(pSyncNode->pNewNodeReceiver)) {
      snapshotReceiverForceStop(pSyncNode->pNewNodeReceiver);
    }

    snapshotReceiverDestroy(pSyncNode->pNewNodeReceiver);
    pSyncNode->pNewNodeReceiver = NULL;
  }

  taosMemoryFree(pSyncNode);
}

M
Minghao Li 已提交
1312
// Return the snapshot strategy recorded in the node's raft config.
ESyncStrategy syncNodeStrategy(SSyncNode* pSyncNode) {
  return pSyncNode->pRaftCfg->snapshotStrategy;
}
M
Minghao Li 已提交
1313

M
Minghao Li 已提交
1314 1315 1316
// timer control --------------
// Arm the ping timer and copy the user-side logic clock into the timer-side
// clock. If the sync env is not initialized only an error is logged.
// Always returns 0.
int32_t syncNodeStartPingTimer(SSyncNode* pSyncNode) {
  if (!syncIsInit()) {
    sError("vgId:%d, start ping timer error, sync env is stop", pSyncNode->vgId);
    return 0;
  }

  taosTmrReset(pSyncNode->FpPingTimerCB, pSyncNode->pingTimerMS, pSyncNode, syncEnv()->pTimerManager,
               &pSyncNode->pPingTimer);
  atomic_store_64(&pSyncNode->pingTimerLogicClock, pSyncNode->pingTimerLogicClockUser);
  return 0;
}

// Disarm the ping timer: advance the user-side logic clock, stop the timer
// and clear its handle. Always returns 0.
int32_t syncNodeStopPingTimer(SSyncNode* pSyncNode) {
  atomic_add_fetch_64(&pSyncNode->pingTimerLogicClockUser, 1);

  taosTmrStop(pSyncNode->pPingTimer);
  pSyncNode->pPingTimer = NULL;

  return 0;
}

// Arm the election timer to fire after `ms` milliseconds.
// Records the expected execution time and the current logic clock in
// electTimerParam; the timer callback receives the node's rid.
// If the sync env is not initialized only an error is logged.
// Always returns 0.
int32_t syncNodeStartElectTimer(SSyncNode* pSyncNode, int32_t ms) {
  if (!syncIsInit()) {
    sError("vgId:%d, start elect timer error, sync env is stop", pSyncNode->vgId);
    return 0;
  }

  pSyncNode->electTimerMS = ms;

  int64_t execTime = taosGetTimestampMs() + ms;
  atomic_store_64(&(pSyncNode->electTimerParam.executeTime), execTime);
  atomic_store_64(&(pSyncNode->electTimerParam.logicClock), pSyncNode->electTimerLogicClock);
  pSyncNode->electTimerParam.pSyncNode = pSyncNode;
  pSyncNode->electTimerParam.pData = NULL;

  taosTmrReset(pSyncNode->FpElectTimerCB, pSyncNode->electTimerMS, (void*)(pSyncNode->rid), syncEnv()->pTimerManager,
               &pSyncNode->pElectTimer);
  return 0;
}

// Disarm the election timer: advance its logic clock, stop the timer and
// clear its handle. Always returns 0.
int32_t syncNodeStopElectTimer(SSyncNode* pSyncNode) {
  atomic_add_fetch_64(&pSyncNode->electTimerLogicClock, 1);

  taosTmrStop(pSyncNode->pElectTimer);
  pSyncNode->pElectTimer = NULL;

  return 0;
}

// Stop the election timer and arm it again with a timeout of `ms`
// milliseconds. The results of both steps are ignored; always returns 0.
int32_t syncNodeRestartElectTimer(SSyncNode* pSyncNode, int32_t ms) {
  syncNodeStopElectTimer(pSyncNode);
  syncNodeStartElectTimer(pSyncNode, ms);
  return 0;
}

M
Minghao Li 已提交
1371 1372
// Restart the election timer with a fresh timeout: TIMER_MAX_MS for a
// stand-by node, otherwise a value from syncUtilElectRandomMS() over
// [electBaseLine, 2 * electBaseLine].
// Returns the result of syncNodeRestartElectTimer().
int32_t syncNodeResetElectTimer(SSyncNode* pSyncNode) {
  int32_t electMS = pSyncNode->pRaftCfg->isStandBy
                        ? TIMER_MAX_MS
                        : syncUtilElectRandomMS(pSyncNode->electBaseLine, 2 * pSyncNode->electBaseLine);

  int32_t ret = syncNodeRestartElectTimer(pSyncNode, electMS);

  sNTrace(pSyncNode, "reset elect timer, min:%d, max:%d, ms:%d", pSyncNode->electBaseLine, 2 * pSyncNode->electBaseLine,
          electMS);
  return ret;
}

M
Minghao Li 已提交
1387
// Arm the node-wide heartbeat timer and copy the user-side logic clock into
// the timer-side clock. If the sync env is not initialized only an error is
// logged. The trace line is emitted in both cases. Always returns 0.
static int32_t syncNodeDoStartHeartbeatTimer(SSyncNode* pSyncNode) {
  if (!syncIsInit()) {
    sError("vgId:%d, start heartbeat timer error, sync env is stop", pSyncNode->vgId);
  } else {
    taosTmrReset(pSyncNode->FpHeartbeatTimerCB, pSyncNode->heartbeatTimerMS, pSyncNode, syncEnv()->pTimerManager,
                 &pSyncNode->pHeartbeatTimer);
    atomic_store_64(&pSyncNode->heartbeatTimerLogicClock, pSyncNode->heartbeatTimerLogicClockUser);
  }

  sNTrace(pSyncNode, "start heartbeat timer, ms:%d", pSyncNode->heartbeatTimerMS);
  return 0;
}

M
Minghao Li 已提交
1401
// Start the per-peer heartbeat timers of every peer that has one.
// The node-wide heartbeat timer path is compiled out (#if 0).
// Results of the individual timer starts are ignored; always returns 0.
int32_t syncNodeStartHeartbeatTimer(SSyncNode* pSyncNode) {
  int32_t ret = 0;

#if 0
  pSyncNode->heartbeatTimerMS = pSyncNode->hbBaseLine;
  ret = syncNodeDoStartHeartbeatTimer(pSyncNode);
#endif

  for (int32_t peer = 0; peer < pSyncNode->peersNum; ++peer) {
    SSyncTimer* pSyncTimer = syncNodeGetHbTimer(pSyncNode, &(pSyncNode->peersId[peer]));
    if (pSyncTimer == NULL) {
      continue;
    }
    syncHbTimerStart(pSyncNode, pSyncTimer);
  }

  return ret;
}

M
Minghao Li 已提交
1419 1420
// Stop the per-peer heartbeat timers of every peer that has one.
// The node-wide heartbeat timer path is compiled out (#if 0).
// Always returns 0.
int32_t syncNodeStopHeartbeatTimer(SSyncNode* pSyncNode) {
  int32_t ret = 0;

#if 0
  atomic_add_fetch_64(&pSyncNode->heartbeatTimerLogicClockUser, 1);
  taosTmrStop(pSyncNode->pHeartbeatTimer);
  pSyncNode->pHeartbeatTimer = NULL;
#endif

  for (int32_t peer = 0; peer < pSyncNode->peersNum; ++peer) {
    SSyncTimer* pSyncTimer = syncNodeGetHbTimer(pSyncNode, &(pSyncNode->peersId[peer]));
    if (pSyncTimer == NULL) {
      continue;
    }
    syncHbTimerStop(pSyncNode, pSyncTimer);
  }

  return ret;
}

1438 1439 1440 1441 1442 1443
// Stop and then start the peer heartbeat timers. The results of both steps
// are ignored; always returns 0.
int32_t syncNodeRestartHeartbeatTimer(SSyncNode* pSyncNode) {
  (void)syncNodeStopHeartbeatTimer(pSyncNode);
  (void)syncNodeStartHeartbeatTimer(pSyncNode);
  return 0;
}

M
Minghao Li 已提交
1444 1445 1446
// utils --------------
// Send a sync message to the peer identified by a raft id.
// The message content is converted with syncUtilMsgHtoN() and marked
// noResp before being handed to the node's send callback.
// Returns 0 after handing the message over; if no send callback is
// registered the message content is freed and -1 is returned.
int32_t syncNodeSendMsgById(const SRaftId* destRaftId, SSyncNode* pSyncNode, SRpcMsg* pMsg) {
  SEpSet epSet;
  syncUtilRaftId2EpSet(destRaftId, &epSet);

  if (pSyncNode->syncSendMSg == NULL) {
    sError("vgId:%d, sync send msg by id error, fp-send-msg is null", pSyncNode->vgId);
    rpcFreeCont(pMsg->pCont);
    return -1;
  }

  // htonl
  syncUtilMsgHtoN(pMsg->pCont);

  pMsg->info.noResp = 1;
  pSyncNode->syncSendMSg(&epSet, pMsg);
  return 0;
}

// Send a sync message to the peer described by a node info (fqdn/port).
// The message content is converted with syncUtilMsgHtoN() and marked
// noResp before being handed to the node's send callback.
// Returns 0 after handing the message over. If no send callback is
// registered, the message content is freed and -1 is returned, matching
// syncNodeSendMsgById(). Previously this case leaked pMsg->pCont and
// reported success.
int32_t syncNodeSendMsgByInfo(const SNodeInfo* nodeInfo, SSyncNode* pSyncNode, SRpcMsg* pMsg) {
  SEpSet epSet;
  syncUtilNodeInfo2EpSet(nodeInfo, &epSet);

  if (pSyncNode->syncSendMSg == NULL) {
    sError("vgId:%d, sync send msg by info error, fp-send-msg is null", pSyncNode->vgId);
    rpcFreeCont(pMsg->pCont);
    return -1;
  }

  // htonl
  syncUtilMsgHtoN(pMsg->pCont);

  pMsg->info.noResp = 1;
  pSyncNode->syncSendMSg(&epSet, pMsg);
  return 0;
}

1478
// Tell whether this node is a member of `config`.
// Membership is determined twice -- by comparing fqdn/port with myNodeInfo,
// and by comparing the derived raft id with myRaftId -- and the two answers
// are asserted to agree. Returns the fqdn/port answer.
inline bool syncNodeInConfig(SSyncNode* pSyncNode, const SSyncCfg* config) {
  bool b1 = false;
  bool b2 = false;

  // match by endpoint (fqdn + port)
  for (int32_t i = 0; i < config->replicaNum && !b1; ++i) {
    if (strcmp((config->nodeInfo)[i].nodeFqdn, pSyncNode->myNodeInfo.nodeFqdn) != 0) {
      continue;
    }
    if ((config->nodeInfo)[i].nodePort == pSyncNode->myNodeInfo.nodePort) {
      b1 = true;
    }
  }

  // match by raft id
  for (int32_t i = 0; i < config->replicaNum && !b2; ++i) {
    SRaftId raftId;
    raftId.addr = syncUtilAddr2U64((config->nodeInfo)[i].nodeFqdn, (config->nodeInfo)[i].nodePort);
    raftId.vgId = pSyncNode->vgId;

    if (syncUtilSameId(&raftId, &(pSyncNode->myRaftId))) {
      b2 = true;
    }
  }

  ASSERT(b1 == b2);
  return b1;
}

1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517
// Compare two replica configurations.
// Returns true when replicaNum, myIndex, or the fqdn/port of any replica slot
// (up to the old replicaNum) differs; false when all of them match.
static bool syncIsConfigChanged(const SSyncCfg* pOldCfg, const SSyncCfg* pNewCfg) {
  if (pOldCfg->replicaNum != pNewCfg->replicaNum || pOldCfg->myIndex != pNewCfg->myIndex) {
    return true;
  }

  for (int32_t idx = 0; idx < pOldCfg->replicaNum; ++idx) {
    const SNodeInfo* pBefore = &pOldCfg->nodeInfo[idx];
    const SNodeInfo* pAfter = &pNewCfg->nodeInfo[idx];

    bool sameEndpoint =
        (strcmp(pBefore->nodeFqdn, pAfter->nodeFqdn) == 0) && (pBefore->nodePort == pAfter->nodePort);
    if (!sameEndpoint) {
      return true;
    }
  }

  return false;
}

M
Minghao Li 已提交
1518
// Apply a new replica configuration to the sync node.
//
// pSyncNode             node whose raft config is replaced
// pNewConfig            configuration to install (copied into pRaftCfg->cfg)
// lastConfigChangeIndex log index recorded as the last config change
//
// Nothing happens when the new configuration equals the current one.
// Otherwise the config is stored, the standby flag is updated, and:
//   - if this node is part of the new config, peers / replicas / quorum /
//     index and vote managers are rebuilt, snapshot senders belonging to
//     replicas that survive the change are carried over to their new slot,
//     the config is persisted and the node re-enters its current role
//     (leader -> become leader + append noop, otherwise become follower);
//   - if this node is not part of the new config, only the config is persisted.
void syncNodeDoConfigChange(SSyncNode* pSyncNode, SSyncCfg* pNewConfig, SyncIndex lastConfigChangeIndex) {
  SSyncCfg oldConfig = pSyncNode->pRaftCfg->cfg;
  if (!syncIsConfigChanged(&oldConfig, pNewConfig)) {
    sInfo("vgId:%d, sync not reconfig since not changed", pSyncNode->vgId);
    return;
  }

  pSyncNode->pRaftCfg->cfg = *pNewConfig;
  pSyncNode->pRaftCfg->lastConfigIndex = lastConfigChangeIndex;

  pSyncNode->configChangeNum++;

  bool IamInOld = syncNodeInConfig(pSyncNode, &oldConfig);
  bool IamInNew = syncNodeInConfig(pSyncNode, pNewConfig);

  // this node is being removed from the replica set
  bool isDrop = IamInOld && !IamInNew;

  // log begin config change
  char oldCfgStr[1024] = {0};
  char newCfgStr[1024] = {0};
  syncCfg2SimpleStr(&oldConfig, oldCfgStr, sizeof(oldCfgStr));
  syncCfg2SimpleStr(pNewConfig, newCfgStr, sizeof(newCfgStr));
  sNInfo(pSyncNode, "begin do config change, from %s to %s", oldCfgStr, newCfgStr);

  if (IamInNew) {
    pSyncNode->pRaftCfg->isStandBy = 0;  // change isStandBy to normal
  }
  if (isDrop) {
    pSyncNode->pRaftCfg->isStandBy = 1;  // set standby
  }

  // add last config index
  raftCfgAddConfigIndex(pSyncNode->pRaftCfg, lastConfigChangeIndex);

  if (IamInNew) {
    //-----------------------------------------
    // save snapshot senders together with the replica ids they belonged to,
    // so that they can be matched against the new replica layout below
    SRaftId oldReplicasId[TSDB_MAX_REPLICA];
    memcpy(oldReplicasId, pSyncNode->replicasId, sizeof(oldReplicasId));
    SSyncSnapshotSender* oldSenders[TSDB_MAX_REPLICA];
    for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
      oldSenders[i] = (pSyncNode->senders)[i];
      sSTrace(oldSenders[i], "snapshot sender save old");
    }

    // init internal
    pSyncNode->myNodeInfo = pSyncNode->pRaftCfg->cfg.nodeInfo[pSyncNode->pRaftCfg->cfg.myIndex];
    syncUtilNodeInfo2RaftId(&pSyncNode->myNodeInfo, pSyncNode->vgId, &pSyncNode->myRaftId);

    // init peersNum, peers, peersId
    pSyncNode->peersNum = pSyncNode->pRaftCfg->cfg.replicaNum - 1;
    int32_t peerIdx = 0;
    for (int32_t i = 0; i < pSyncNode->pRaftCfg->cfg.replicaNum; ++i) {
      if (i != pSyncNode->pRaftCfg->cfg.myIndex) {
        pSyncNode->peersNodeInfo[peerIdx] = pSyncNode->pRaftCfg->cfg.nodeInfo[i];
        peerIdx++;
      }
    }
    for (int32_t i = 0; i < pSyncNode->peersNum; ++i) {
      syncUtilNodeInfo2RaftId(&pSyncNode->peersNodeInfo[i], pSyncNode->vgId, &pSyncNode->peersId[i]);
    }

    // init replicaNum, replicasId
    pSyncNode->replicaNum = pSyncNode->pRaftCfg->cfg.replicaNum;
    for (int32_t i = 0; i < pSyncNode->pRaftCfg->cfg.replicaNum; ++i) {
      syncUtilNodeInfo2RaftId(&pSyncNode->pRaftCfg->cfg.nodeInfo[i], pSyncNode->vgId, &pSyncNode->replicasId[i]);
    }

    // update quorum first
    pSyncNode->quorum = syncUtilQuorum(pSyncNode->pRaftCfg->cfg.replicaNum);

    syncIndexMgrUpdate(pSyncNode->pNextIndex, pSyncNode);
    syncIndexMgrUpdate(pSyncNode->pMatchIndex, pSyncNode);
    voteGrantedUpdate(pSyncNode->pVotesGranted, pSyncNode);
    votesRespondUpdate(pSyncNode->pVotesRespond, pSyncNode);

    // reset snapshot senders

    // clear new
    for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
      (pSyncNode->senders)[i] = NULL;
    }

    // reset new: move each surviving sender to the slot of its replica in the new layout
    for (int32_t i = 0; i < pSyncNode->replicaNum; ++i) {
      // reset sender
      bool reset = false;
      for (int32_t j = 0; j < TSDB_MAX_REPLICA; ++j) {
        if (syncUtilSameId(&(pSyncNode->replicasId)[i], &oldReplicasId[j]) && oldSenders[j] != NULL) {
          char     host[128];
          uint16_t port;
          syncUtilU642Addr((pSyncNode->replicasId)[i].addr, host, sizeof(host), &port);
          sNTrace(pSyncNode, "snapshot sender reset for: %" PRId64 ", newIndex:%d, %s:%d, %p",
                  (pSyncNode->replicasId)[i].addr, i, host, port, oldSenders[j]);

          (pSyncNode->senders)[i] = oldSenders[j];
          oldSenders[j] = NULL;  // ownership moved, must not be destroyed below
          reset = true;

          // reset replicaIndex
          int32_t oldreplicaIndex = (pSyncNode->senders)[i]->replicaIndex;
          (pSyncNode->senders)[i]->replicaIndex = i;

          sNTrace(pSyncNode, "snapshot sender udpate replicaIndex from %d to %d, %s:%d, %p, reset:%d", oldreplicaIndex,
                  i, host, port, (pSyncNode->senders)[i], reset);

          break;
        }
      }
    }

    // create new
    for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
      if ((pSyncNode->senders)[i] == NULL) {
        (pSyncNode->senders)[i] = snapshotSenderCreate(pSyncNode, i);
        sSTrace((pSyncNode->senders)[i], "snapshot sender create new while reconfig, data:%p", (pSyncNode->senders)[i]);
      } else {
        sSTrace((pSyncNode->senders)[i], "snapshot sender already exist, data:%p", (pSyncNode->senders)[i]);
      }
    }

    // free old: whatever was not carried over belongs to a removed replica
    for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
      if (oldSenders[i] != NULL) {
        sNTrace(pSyncNode, "snapshot sender destroy old, data:%p replica-index:%d", oldSenders[i], i);
        snapshotSenderDestroy(oldSenders[i]);
        oldSenders[i] = NULL;
      }
    }

    // persist cfg
    raftCfgPersist(pSyncNode->pRaftCfg);

    char tmpbuf[1024] = {0};
    snprintf(tmpbuf, sizeof(tmpbuf), "config change from %d to %d, index:%" PRId64 ", %s  -->  %s",
             oldConfig.replicaNum, pNewConfig->replicaNum, lastConfigChangeIndex, oldCfgStr, newCfgStr);

    // change isStandBy to normal (election timeout)
    if (pSyncNode->state == TAOS_SYNC_STATE_LEADER) {
      syncNodeBecomeLeader(pSyncNode, tmpbuf);

      // Raft 3.6.2 Committing entries from previous terms
      syncNodeAppendNoop(pSyncNode);
      // syncMaybeAdvanceCommitIndex(pSyncNode);

    } else {
      syncNodeBecomeFollower(pSyncNode, tmpbuf);
    }
  } else {
    // persist cfg
    raftCfgPersist(pSyncNode->pRaftCfg);
    sNInfo(pSyncNode, "do not config change from %d to %d, index:%" PRId64 ", %s  -->  %s", oldConfig.replicaNum,
           pNewConfig->replicaNum, lastConfigChangeIndex, oldCfgStr, newCfgStr);
  }

  // log end config change
  sNInfo(pSyncNode, "end do config change, from %s to %s", oldCfgStr, newCfgStr);
}

M
Minghao Li 已提交
1694 1695 1696 1697
// raft state change --------------
// Adopt `term` when it is newer than the stored current term.
// On adoption the term is stored, the node becomes follower, and the recorded
// vote is cleared. Terms that are not newer are ignored.
void syncNodeUpdateTerm(SSyncNode* pSyncNode, SyncTerm term) {
  if (term <= pSyncNode->pRaftStore->currentTerm) {
    return;
  }

  raftStoreSetTerm(pSyncNode->pRaftStore, term);

  char reason[64];
  snprintf(reason, sizeof(reason), "update term to %" PRId64, term);
  syncNodeBecomeFollower(pSyncNode, reason);

  raftStoreClearVote(pSyncNode->pRaftStore);
}

1705 1706 1707 1708 1709 1710
// Store `term` as the current term when it is newer than the stored one.
// Unlike syncNodeUpdateTerm, the node's role and vote are left untouched.
void syncNodeUpdateTermWithoutStepDown(SSyncNode* pSyncNode, SyncTerm term) {
  if (term <= pSyncNode->pRaftStore->currentTerm) {
    return;
  }
  raftStoreSetTerm(pSyncNode->pRaftStore, term);
}

M
Minghao Li 已提交
1711
// Step down in response to `newTerm`.
//   newTerm <  current term : traced and ignored
//   newTerm >  current term : store the term, become follower, clear the vote
//   newTerm == current term : become follower unless already a follower
void syncNodeStepDown(SSyncNode* pSyncNode, SyncTerm newTerm) {
  if (newTerm < pSyncNode->pRaftStore->currentTerm) {
    sNTrace(pSyncNode, "step down, ignore, new-term:%" PRId64 ", current-term:%" PRId64, newTerm,
            pSyncNode->pRaftStore->currentTerm);
    return;
  }

  sNTrace(pSyncNode, "step down, new-term:%" PRId64 ", current-term:%" PRId64, newTerm,
          pSyncNode->pRaftStore->currentTerm);

  if (newTerm > pSyncNode->pRaftStore->currentTerm) {
    raftStoreSetTerm(pSyncNode->pRaftStore, newTerm);

    char reason[64];
    snprintf(reason, sizeof(reason), "step down, update term to %" PRId64, newTerm);
    syncNodeBecomeFollower(pSyncNode, reason);

    raftStoreClearVote(pSyncNode->pRaftStore);
    return;
  }

  // same term: only a role change may be needed
  if (pSyncNode->state != TAOS_SYNC_STATE_FOLLOWER) {
    syncNodeBecomeFollower(pSyncNode, "step down");
  }
}

1737 1738
// Pass the node's response manager to syncRespCleanRsp.
// syncNodeBecomeFollower calls this as its "send rsp to client" step.
void syncNodeLeaderChangeRsp(SSyncNode* pSyncNode) {
  syncRespCleanRsp(pSyncNode->pSyncRespMgr);
}

1739
// Switch the node into the follower role.
// `debugStr` is only used in the trailing trace line.
void syncNodeBecomeFollower(SSyncNode* pSyncNode, const char* debugStr) {
  // a leader that steps down no longer knows who the leader is
  const bool wasLeader = (pSyncNode->state == TAOS_SYNC_STATE_LEADER);
  if (wasLeader) {
    pSyncNode->leaderCache = EMPTY_RAFT_ID;
  }

  pSyncNode->hbSlowNum = 0;

  // role switch: followers do not send heartbeats
  pSyncNode->state = TAOS_SYNC_STATE_FOLLOWER;
  syncNodeStopHeartbeatTimer(pSyncNode);

  // restart the election timeout
  syncNodeResetElectTimer(pSyncNode);

  // send rsp to client
  syncNodeLeaderChangeRsp(pSyncNode);

  // notify the state machine, if it registered a callback
  if (pSyncNode->pFsm != NULL && pSyncNode->pFsm->FpBecomeFollowerCb != NULL) {
    pSyncNode->pFsm->FpBecomeFollowerCb(pSyncNode->pFsm);
  }

  // min match index
  pSyncNode->minMatchIndex = SYNC_INDEX_INVALID;

  // reset log buffer
  syncLogBufferReset(pSyncNode->pLogBuf, pSyncNode);

  sNTrace(pSyncNode, "become follower %s", debugStr);
}

// TLA+ Spec
// \* Candidate i transitions to leader.
// BecomeLeader(i) ==
//     /\ state[i] = Candidate
//     /\ votesGranted[i] \in Quorum
//     /\ state'      = [state EXCEPT ![i] = Leader]
//     /\ nextIndex'  = [nextIndex EXCEPT ![i] =
//                          [j \in Server |-> Len(log[i]) + 1]]
//     /\ matchIndex' = [matchIndex EXCEPT ![i] =
//                          [j \in Server |-> 0]]
//     /\ elections'  = elections \cup
//                          {[eterm     |-> currentTerm[i],
//                            eleader   |-> i,
//                            elog      |-> log[i],
//                            evotes    |-> votesGranted[i],
//                            evoterLog |-> voterLog[i]]}
//     /\ UNCHANGED <<messages, currentTerm, votedFor, candidateVars, logVars>>
//
1790
// Switch the node into the leader role.
//
// Records the leader timestamp and counters, marks the node as leader and as
// its own cached leader, re-initialises nextIndex (last index + 1) and
// matchIndex (invalid) for every replica, resets per-peer send state, stops a
// running new-node snapshot receiver, swaps the election timer for the
// heartbeat timer, sends a heartbeat immediately, notifies the state machine
// and resets the log buffer. `debugStr` is only used in the final log line.
void syncNodeBecomeLeader(SSyncNode* pSyncNode, const char* debugStr) {
  pSyncNode->leaderTime = taosGetTimestampMs();

  pSyncNode->becomeLeaderNum++;
  pSyncNode->hbrSlowNum = 0;

  // reset restoreFinish
  pSyncNode->restoreFinish = false;

  // state change
  pSyncNode->state = TAOS_SYNC_STATE_LEADER;

  // set leader cache
  pSyncNode->leaderCache = pSyncNode->myRaftId;

  // The last index is the same for every replica slot, so it is looked up once
  // instead of once per iteration. It goes through syncNodeGetLastIndexTerm
  // rather than the log store alone because the wal may have been deleted.
  SyncIndex lastIndex;
  SyncTerm  lastTerm;
  int32_t   code = syncNodeGetLastIndexTerm(pSyncNode, &lastIndex, &lastTerm);
  ASSERT(code == 0);

  for (int32_t i = 0; i < pSyncNode->pNextIndex->replicaNum; ++i) {
    // maybe overwrite myself, no harm
    // just do it!
    pSyncNode->pNextIndex->index[i] = lastIndex + 1;
  }

  for (int32_t i = 0; i < pSyncNode->pMatchIndex->replicaNum; ++i) {
    // maybe overwrite myself, no harm
    // just do it!
    pSyncNode->pMatchIndex->index[i] = SYNC_INDEX_INVALID;
  }

  // init peer mgr
  syncNodePeerStateInit(pSyncNode);

#if 0
  // update sender private term
  SSyncSnapshotSender* pMySender = syncNodeGetSnapshotSender(pSyncNode, &(pSyncNode->myRaftId));
  if (pMySender != NULL) {
    for (int32_t i = 0; i < pSyncNode->pMatchIndex->replicaNum; ++i) {
      if ((pSyncNode->senders)[i]->privateTerm > pMySender->privateTerm) {
        pMySender->privateTerm = (pSyncNode->senders)[i]->privateTerm;
      }
    }
    (pMySender->privateTerm) += 100;
  }
#endif

  // close receiver
  // (pSyncNode itself has already been dereferenced above, so only the
  // receiver pointer needs checking)
  if (pSyncNode->pNewNodeReceiver != NULL && snapshotReceiverIsStart(pSyncNode->pNewNodeReceiver)) {
    snapshotReceiverForceStop(pSyncNode->pNewNodeReceiver);
  }

  // stop elect timer
  syncNodeStopElectTimer(pSyncNode);

  // start heartbeat timer
  syncNodeStartHeartbeatTimer(pSyncNode);

  // send heartbeat right now
  syncNodeHeartbeatPeers(pSyncNode);

  // call back
  if (pSyncNode->pFsm != NULL && pSyncNode->pFsm->FpBecomeLeaderCb != NULL) {
    pSyncNode->pFsm->FpBecomeLeaderCb(pSyncNode->pFsm);
  }

  // min match index
  pSyncNode->minMatchIndex = SYNC_INDEX_INVALID;

  // reset log buffer
  syncLogBufferReset(pSyncNode->pLogBuf, pSyncNode);

  // trace log
  sNInfo(pSyncNode, "become leader %s", debugStr);
}

// Candidate -> leader transition.
// Asserts the node is a candidate holding a vote majority, becomes leader,
// appends a noop entry (a failure is only logged) and logs the resulting
// term / commit index / last index.
void syncNodeCandidate2Leader(SSyncNode* pSyncNode) {
  ASSERT(pSyncNode->state == TAOS_SYNC_STATE_CANDIDATE);
  ASSERT(voteGrantedMajority(pSyncNode->pVotesGranted));

  syncNodeBecomeLeader(pSyncNode, "candidate to leader");
  sNTrace(pSyncNode, "state change syncNodeCandidate2Leader");

  if (syncNodeAppendNoop(pSyncNode) < 0) {
    sError("vgId:%d, failed to append noop entry since %s", pSyncNode->vgId, terrstr());
  }

  SyncIndex logLastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);
  ASSERT(logLastIndex >= 0);

  sInfo("vgId:%d, become leader. term: %" PRId64 ", commit index: %" PRId64 ", last index: %" PRId64,
        pSyncNode->vgId, pSyncNode->pRaftStore->currentTerm, pSyncNode->commitIndex, logLastIndex);
}

// Older candidate -> leader transition.
// Becomes leader, appends a noop entry, tries to advance the commit index and,
// when there is more than one replica, starts replication.
void syncNodeCandidate2LeaderOld(SSyncNode* pSyncNode) {
  ASSERT(pSyncNode->state == TAOS_SYNC_STATE_CANDIDATE);
  ASSERT(voteGrantedMajority(pSyncNode->pVotesGranted));

  syncNodeBecomeLeader(pSyncNode, "candidate to leader");

  // Raft 3.6.2 Committing entries from previous terms
  syncNodeAppendNoop(pSyncNode);
  syncMaybeAdvanceCommitIndex(pSyncNode);

  // a single replica has nobody to replicate to
  if (pSyncNode->replicaNum <= 1) {
    return;
  }
  syncNodeReplicate(pSyncNode);
}

M
Minghao Li 已提交
1903 1904
// True when the node's vgId is 1.
bool syncNodeIsMnode(SSyncNode* pSyncNode) {
  return pSyncNode->vgId == 1;
}

M
Minghao Li 已提交
1905
// Reset the send bookkeeping of every peer slot: lastSendIndex becomes
// SYNC_INDEX_INVALID and lastSendTime becomes 0. Always returns 0.
int32_t syncNodePeerStateInit(SSyncNode* pSyncNode) {
  int32_t slot = 0;
  while (slot < TSDB_MAX_REPLICA) {
    pSyncNode->peerStates[slot].lastSendIndex = SYNC_INDEX_INVALID;
    pSyncNode->peerStates[slot].lastSendTime = 0;
    ++slot;
  }

  return 0;
}

// Follower -> candidate transition: asserts the current role, flips the state
// and logs term / commit index / last log index.
void syncNodeFollower2Candidate(SSyncNode* pSyncNode) {
  ASSERT(pSyncNode->state == TAOS_SYNC_STATE_FOLLOWER);
  pSyncNode->state = TAOS_SYNC_STATE_CANDIDATE;

  SyncIndex logLastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);
  sInfo("vgId:%d, become candidate from follower. term: %" PRId64 ", commit index: %" PRId64 ", last index: %" PRId64,
        pSyncNode->vgId, pSyncNode->pRaftStore->currentTerm, pSyncNode->commitIndex, logLastIndex);

  sNTrace(pSyncNode, "follower to candidate");
}

// Leader -> follower transition: asserts the current role, becomes follower
// and logs term / commit index / last log index.
void syncNodeLeader2Follower(SSyncNode* pSyncNode) {
  ASSERT(pSyncNode->state == TAOS_SYNC_STATE_LEADER);

  syncNodeBecomeFollower(pSyncNode, "leader to follower");

  SyncIndex logLastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);
  sInfo("vgId:%d, become follower from leader. term: %" PRId64 ", commit index: %" PRId64 ", last index: %" PRId64,
        pSyncNode->vgId, pSyncNode->pRaftStore->currentTerm, pSyncNode->commitIndex, logLastIndex);

  sNTrace(pSyncNode, "leader to follower");
}

// Candidate -> follower transition: asserts the current role, becomes follower
// and logs term / commit index / last log index.
void syncNodeCandidate2Follower(SSyncNode* pSyncNode) {
  ASSERT(pSyncNode->state == TAOS_SYNC_STATE_CANDIDATE);

  syncNodeBecomeFollower(pSyncNode, "candidate to follower");

  SyncIndex logLastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);
  sInfo("vgId:%d, become follower from candidate. term: %" PRId64 ", commit index: %" PRId64 ", last index: %" PRId64,
        pSyncNode->vgId, pSyncNode->pRaftStore->currentTerm, pSyncNode->commitIndex, logLastIndex);

  sNTrace(pSyncNode, "candidate to follower");
}

M
Minghao Li 已提交
1944 1945
// just called by syncNodeVoteForSelf
// need assert
M
Minghao Li 已提交
1946
// Record a vote for pRaftId in the raft store.
// Preconditions (asserted): `term` is the stored current term and no vote has
// been cast yet.
void syncNodeVoteForTerm(SSyncNode* pSyncNode, SyncTerm term, SRaftId* pRaftId) {
  ASSERT(pSyncNode->pRaftStore->currentTerm == term);
  ASSERT(!raftStoreHasVoted(pSyncNode->pRaftStore));

  raftStoreVote(pSyncNode->pRaftStore, pRaftId);
}

M
Minghao Li 已提交
1953
// simulate get vote from outside
M
Minghao Li 已提交
1954
// Vote for ourselves in the current term, then feed a locally built
// "vote granted" reply (source and destination both this node) into the
// granted / responded vote managers. The temporary reply is freed afterwards.
// If building the reply fails, only the vote itself has been recorded.
void syncNodeVoteForSelf(SSyncNode* pSyncNode) {
  syncNodeVoteForTerm(pSyncNode, pSyncNode->pRaftStore->currentTerm, &pSyncNode->myRaftId);

  SRpcMsg rpcMsg = {0};
  if (syncBuildRequestVoteReply(&rpcMsg, pSyncNode->vgId) != 0) {
    return;
  }

  SyncRequestVoteReply* pReply = rpcMsg.pCont;
  pReply->srcId = pSyncNode->myRaftId;
  pReply->destId = pSyncNode->myRaftId;
  pReply->term = pSyncNode->pRaftStore->currentTerm;
  pReply->voteGranted = true;

  voteGrantedVote(pSyncNode->pVotesGranted, pReply);
  votesRespondAdd(pSyncNode->pVotesRespond, pReply);

  rpcFreeCont(rpcMsg.pCont);
}

M
Minghao Li 已提交
1972
// return if has a snapshot
M
Minghao Li 已提交
1973 1974
// True when the state machine provides FpGetSnapshotInfo and the reported
// lastApplyIndex is at least SYNC_INDEX_BEGIN; false otherwise.
bool syncNodeHasSnapshot(SSyncNode* pSyncNode) {
  if (pSyncNode->pFsm->FpGetSnapshotInfo == NULL) {
    return false;
  }

  SSnapshot snapshot = {.data = NULL, .lastApplyIndex = -1, .lastApplyTerm = 0, .lastConfigIndex = -1};
  pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot);

  return snapshot.lastApplyIndex >= SYNC_INDEX_BEGIN;
}

M
Minghao Li 已提交
1985 1986
// return max(logLastIndex, snapshotLastIndex)
// if no snapshot and log, return -1
1987
SyncIndex syncNodeGetLastIndex(const SSyncNode* pSyncNode) {
M
Minghao Li 已提交
1988
  SSnapshot snapshot = {.data = NULL, .lastApplyIndex = -1, .lastApplyTerm = 0, .lastConfigIndex = -1};
1989 1990
  if (pSyncNode->pFsm->FpGetSnapshotInfo != NULL) {
    pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot);
M
Minghao Li 已提交
1991 1992 1993 1994 1995 1996 1997
  }
  SyncIndex logLastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);

  SyncIndex lastIndex = logLastIndex > snapshot.lastApplyIndex ? logLastIndex : snapshot.lastApplyIndex;
  return lastIndex;
}

M
Minghao Li 已提交
1998 1999
// return the last term of snapshot and log
// if error, return SYNC_TERM_INVALID (by syncLogLastTerm)
M
Minghao Li 已提交
2000 2001
// Term of the newest entry known to the node.
// Without a snapshot this is the log store's last term. With a snapshot it is
// the log store's last term when the log reaches past the snapshot, otherwise
// the snapshot's lastApplyTerm.
SyncTerm syncNodeGetLastTerm(SSyncNode* pSyncNode) {
  if (!syncNodeHasSnapshot(pSyncNode)) {
    // no snapshot
    return pSyncNode->pLogStore->syncLogLastTerm(pSyncNode->pLogStore);
  }

  // has snapshot
  SSnapshot snapshot = {.data = NULL, .lastApplyIndex = -1, .lastApplyTerm = 0, .lastConfigIndex = -1};
  if (pSyncNode->pFsm->FpGetSnapshotInfo != NULL) {
    pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot);
  }

  SyncIndex logLastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);
  if (logLastIndex > snapshot.lastApplyIndex) {
    return pSyncNode->pLogStore->syncLogLastTerm(pSyncNode->pLogStore);
  }

  SyncTerm snapshotTerm = snapshot.lastApplyTerm;
  return snapshotTerm;
}

// get last index and term along with snapshot
// Fill *pLastIndex and *pLastTerm from syncNodeGetLastIndex and
// syncNodeGetLastTerm. Always returns 0.
int32_t syncNodeGetLastIndexTerm(SSyncNode* pSyncNode, SyncIndex* pLastIndex, SyncTerm* pLastTerm) {
  SyncIndex idx = syncNodeGetLastIndex(pSyncNode);
  *pLastIndex = idx;

  SyncTerm term = syncNodeGetLastTerm(pSyncNode);
  *pLastTerm = term;

  return 0;
}
M
Minghao Li 已提交
2030

M
Minghao Li 已提交
2031
// return append-entries first try index
M
Minghao Li 已提交
2032 2033 2034 2035 2036
// One past the node's last index (see syncNodeGetLastIndex).
SyncIndex syncNodeSyncStartIndex(SSyncNode* pSyncNode) {
  return syncNodeGetLastIndex(pSyncNode) + 1;
}

M
Minghao Li 已提交
2037 2038
// if index > 0, return index - 1
// else, return -1
2039 2040 2041 2042 2043 2044 2045 2046 2047
// index - 1, clamped from below to SYNC_INDEX_INVALID. pSyncNode is not read.
SyncIndex syncNodeGetPreIndex(SSyncNode* pSyncNode, SyncIndex index) {
  SyncIndex candidate = index - 1;
  return (candidate < SYNC_INDEX_INVALID) ? SYNC_INDEX_INVALID : candidate;
}

M
Minghao Li 已提交
2048 2049 2050 2051
// if index < 0, return SYNC_TERM_INVALID
// if index == 0, return 0
// if index > 0, return preTerm
// if error, return SYNC_TERM_INVALID
2052 2053 2054 2055 2056 2057 2058 2059 2060
// Term of the entry that precedes `index`.
// The entry at index - 1 is looked up in the log store's LRU cache first and
// fetched through syncLogGetEntry on a miss. If the entry cannot be obtained,
// the snapshot's lastApplyTerm is used when its lastApplyIndex equals
// index - 1; otherwise an error is logged and SYNC_TERM_INVALID is returned.
SyncTerm syncNodeGetPreTerm(SSyncNode* pSyncNode, SyncIndex index) {
  if (index < SYNC_INDEX_BEGIN) {
    return SYNC_TERM_INVALID;
  }
  if (index == SYNC_INDEX_BEGIN) {
    return 0;
  }

  SyncIndex       preIndex = index - 1;
  SSyncRaftEntry* pPreEntry = NULL;
  int32_t         code = 0;

  SLRUCache* pCache = pSyncNode->pLogStore->pCache;
  LRUHandle* h = taosLRUCacheLookup(pCache, &preIndex, sizeof(preIndex));
  if (h != NULL) {
    pPreEntry = (SSyncRaftEntry*)taosLRUCacheValue(pCache, h);
    code = 0;

    pSyncNode->pLogStore->cacheHit++;
    sNTrace(pSyncNode, "hit cache index:%" PRId64 ", bytes:%u, %p", preIndex, pPreEntry->bytes, pPreEntry);
  } else {
    pSyncNode->pLogStore->cacheMiss++;
    sNTrace(pSyncNode, "miss cache index:%" PRId64, preIndex);

    code = pSyncNode->pLogStore->syncLogGetEntry(pSyncNode->pLogStore, preIndex, &pPreEntry);
  }

  if (code == 0) {
    ASSERT(pPreEntry != NULL);
    SyncTerm preTerm = pPreEntry->term;

    // a cached entry is released back to the cache, a fetched one is destroyed
    if (h != NULL) {
      taosLRUCacheRelease(pCache, h, false);
    } else {
      syncEntryDestroy(pPreEntry);
    }

    return preTerm;
  }

  // entry unavailable: fall back to the snapshot boundary
  SSnapshot snapshot = {.data = NULL,
                        .lastApplyIndex = SYNC_INDEX_INVALID,
                        .lastApplyTerm = SYNC_TERM_INVALID,
                        .lastConfigIndex = SYNC_INDEX_INVALID};

  if (pSyncNode->pFsm->FpGetSnapshotInfo != NULL) {
    pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot);
    if (snapshot.lastApplyIndex == preIndex) {
      return snapshot.lastApplyTerm;
    }
  }

  sNError(pSyncNode, "sync node get pre term error, index:%" PRId64 ", snap-index:%" PRId64 ", snap-term:%" PRId64,
          index, snapshot.lastApplyIndex, snapshot.lastApplyTerm);
  return SYNC_TERM_INVALID;
}
M
Minghao Li 已提交
2111 2112 2113 2114

// get pre index and term of "index"
// Fill *pPreIndex and *pPreTerm from syncNodeGetPreIndex and
// syncNodeGetPreTerm for `index`. Always returns 0.
int32_t syncNodeGetPreIndexTerm(SSyncNode* pSyncNode, SyncIndex index, SyncIndex* pPreIndex, SyncTerm* pPreTerm) {
  SyncIndex prevIdx = syncNodeGetPreIndex(pSyncNode, index);
  *pPreIndex = prevIdx;

  SyncTerm prevTerm = syncNodeGetPreTerm(pSyncNode, index);
  *pPreTerm = prevTerm;

  return 0;
}

M
Minghao Li 已提交
2119
static void syncNodeEqPingTimer(void* param, void* tmrId) {
S
Shengliang Guan 已提交
2120
  if (!syncIsInit()) return;
M
Minghao Li 已提交
2121

S
Shengliang Guan 已提交
2122 2123 2124
  SSyncNode* pNode = param;
  if (atomic_load_64(&pNode->pingTimerLogicClockUser) <= atomic_load_64(&pNode->pingTimerLogicClock)) {
    SRpcMsg rpcMsg = {0};
S
Shengliang Guan 已提交
2125
    int32_t code = syncBuildTimeout(&rpcMsg, SYNC_TIMEOUT_PING, atomic_load_64(&pNode->pingTimerLogicClock),
S
Shengliang Guan 已提交
2126 2127
                                    pNode->pingTimerMS, pNode);
    if (code != 0) {
M
Minghao Li 已提交
2128
      sError("failed to build ping msg");
S
Shengliang Guan 已提交
2129 2130
      rpcFreeCont(rpcMsg.pCont);
      return;
M
Minghao Li 已提交
2131
    }
M
Minghao Li 已提交
2132

M
Minghao Li 已提交
2133
    // sTrace("enqueue ping msg");
S
Shengliang Guan 已提交
2134 2135
    code = pNode->syncEqMsg(pNode->msgcb, &rpcMsg);
    if (code != 0) {
M
Minghao Li 已提交
2136
      sError("failed to sync enqueue ping msg since %s", terrstr());
S
Shengliang Guan 已提交
2137 2138
      rpcFreeCont(rpcMsg.pCont);
      return;
2139
    }
M
Minghao Li 已提交
2140

S
Shengliang Guan 已提交
2141
    taosTmrReset(syncNodeEqPingTimer, pNode->pingTimerMS, pNode, syncEnv()->pTimerManager, &pNode->pPingTimer);
2142
  }
M
Minghao Li 已提交
2143 2144
}

M
Minghao Li 已提交
2145
static void syncNodeEqElectTimer(void* param, void* tmrId) {
S
Shengliang Guan 已提交
2146
  if (!syncIsInit()) return;
M
Minghao Li 已提交
2147

M
Minghao Li 已提交
2148 2149
  int64_t    rid = (int64_t)param;
  SSyncNode* pNode = syncNodeAcquire(rid);
M
Minghao Li 已提交
2150

2151
  if (pNode == NULL) return;
M
Minghao Li 已提交
2152 2153 2154 2155 2156

  if (pNode->syncEqMsg == NULL) {
    syncNodeRelease(pNode);
    return;
  }
2157

2158
  int64_t tsNow = taosGetTimestampMs();
M
Minghao Li 已提交
2159 2160 2161 2162
  if (tsNow < pNode->electTimerParam.executeTime) {
    syncNodeRelease(pNode);
    return;
  }
M
Minghao Li 已提交
2163

S
Shengliang Guan 已提交
2164
  SRpcMsg rpcMsg = {0};
2165 2166
  int32_t code =
      syncBuildTimeout(&rpcMsg, SYNC_TIMEOUT_ELECTION, pNode->electTimerParam.logicClock, pNode->electTimerMS, pNode);
S
Shengliang Guan 已提交
2167

S
Shengliang Guan 已提交
2168
  if (code != 0) {
M
Minghao Li 已提交
2169
    sError("failed to build elect msg");
M
Minghao Li 已提交
2170
    syncNodeRelease(pNode);
S
Shengliang Guan 已提交
2171
    return;
M
Minghao Li 已提交
2172 2173
  }

S
Shengliang Guan 已提交
2174
  SyncTimeout* pTimeout = rpcMsg.pCont;
S
Shengliang Guan 已提交
2175
  sNTrace(pNode, "enqueue elect msg lc:%" PRId64, pTimeout->logicClock);
S
Shengliang Guan 已提交
2176 2177 2178

  code = pNode->syncEqMsg(pNode->msgcb, &rpcMsg);
  if (code != 0) {
M
Minghao Li 已提交
2179
    sError("failed to sync enqueue elect msg since %s", terrstr());
S
Shengliang Guan 已提交
2180
    rpcFreeCont(rpcMsg.pCont);
M
Minghao Li 已提交
2181
    syncNodeRelease(pNode);
2182
    return;
M
Minghao Li 已提交
2183
  }
M
Minghao Li 已提交
2184 2185

  syncNodeRelease(pNode);
M
Minghao Li 已提交
2186 2187
}

M
Minghao Li 已提交
2188
static void syncNodeEqHeartbeatTimer(void* param, void* tmrId) {
S
Shengliang Guan 已提交
2189
  if (!syncIsInit()) return;
2190

S
Shengliang Guan 已提交
2191 2192 2193 2194
  SSyncNode* pNode = param;
  if (pNode->replicaNum > 1) {
    if (atomic_load_64(&pNode->heartbeatTimerLogicClockUser) <= atomic_load_64(&pNode->heartbeatTimerLogicClock)) {
      SRpcMsg rpcMsg = {0};
S
Shengliang Guan 已提交
2195
      int32_t code = syncBuildTimeout(&rpcMsg, SYNC_TIMEOUT_HEARTBEAT, atomic_load_64(&pNode->heartbeatTimerLogicClock),
S
Shengliang Guan 已提交
2196 2197 2198
                                      pNode->heartbeatTimerMS, pNode);

      if (code != 0) {
M
Minghao Li 已提交
2199
        sError("failed to build heartbeat msg");
S
Shengliang Guan 已提交
2200
        return;
2201
      }
M
Minghao Li 已提交
2202

2203
      sTrace("vgId:%d, enqueue heartbeat timer", pNode->vgId);
S
Shengliang Guan 已提交
2204 2205
      code = pNode->syncEqMsg(pNode->msgcb, &rpcMsg);
      if (code != 0) {
M
Minghao Li 已提交
2206
        sError("failed to enqueue heartbeat msg since %s", terrstr());
S
Shengliang Guan 已提交
2207 2208
        rpcFreeCont(rpcMsg.pCont);
        return;
2209
      }
S
Shengliang Guan 已提交
2210 2211 2212 2213

      taosTmrReset(syncNodeEqHeartbeatTimer, pNode->heartbeatTimerMS, pNode, syncEnv()->pTimerManager,
                   &pNode->pHeartbeatTimer);

2214
    } else {
S
Shengliang Guan 已提交
2215 2216
      sTrace("==syncNodeEqHeartbeatTimer== heartbeatTimerLogicClock:%" PRId64 ", heartbeatTimerLogicClockUser:%" PRId64,
             pNode->heartbeatTimerLogicClock, pNode->heartbeatTimerLogicClockUser);
2217
    }
M
Minghao Li 已提交
2218 2219 2220
  }
}

2221
static void syncNodeEqPeerHeartbeatTimer(void* param, void* tmrId) {
2222
  int64_t hbDataRid = (int64_t)param;
2223
  int64_t tsNow = taosGetTimestampMs();
2224

2225 2226
  SSyncHbTimerData* pData = syncHbTimerDataAcquire(hbDataRid);
  if (pData == NULL) {
M
Minghao Li 已提交
2227
    sError("hb timer get pData NULL, %" PRId64, hbDataRid);
2228 2229
    return;
  }
2230

2231
  SSyncNode* pSyncNode = syncNodeAcquire(pData->syncNodeRid);
M
Minghao Li 已提交
2232
  if (pSyncNode == NULL) {
2233
    syncHbTimerDataRelease(pData);
M
Minghao Li 已提交
2234
    sError("hb timer get pSyncNode NULL");
2235 2236 2237 2238 2239 2240 2241 2242
    return;
  }

  SSyncTimer* pSyncTimer = pData->pTimer;

  if (!pSyncNode->isStart) {
    syncNodeRelease(pSyncNode);
    syncHbTimerDataRelease(pData);
M
Minghao Li 已提交
2243
    sError("vgId:%d, hb timer sync node already stop", pSyncNode->vgId);
M
Minghao Li 已提交
2244 2245 2246
    return;
  }

M
Minghao Li 已提交
2247
  if (pSyncNode->state != TAOS_SYNC_STATE_LEADER) {
2248 2249
    syncNodeRelease(pSyncNode);
    syncHbTimerDataRelease(pData);
M
Minghao Li 已提交
2250
    sError("vgId:%d, hb timer sync node not leader", pSyncNode->vgId);
M
Minghao Li 已提交
2251 2252 2253
    return;
  }

M
Minghao Li 已提交
2254
  if (pSyncNode->pRaftStore == NULL) {
2255 2256
    syncNodeRelease(pSyncNode);
    syncHbTimerDataRelease(pData);
M
Minghao Li 已提交
2257
    sError("vgId:%d, hb timer raft store already stop", pSyncNode->vgId);
M
Minghao Li 已提交
2258 2259 2260
    return;
  }

M
Minghao Li 已提交
2261
  // sTrace("vgId:%d, eq peer hb timer", pSyncNode->vgId);
2262 2263

  if (pSyncNode->replicaNum > 1) {
M
Minghao Li 已提交
2264 2265 2266
    int64_t timerLogicClock = atomic_load_64(&pSyncTimer->logicClock);
    int64_t msgLogicClock = atomic_load_64(&pData->logicClock);

2267
    if (timerLogicClock == msgLogicClock) {
2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287
      if (tsNow > pData->execTime) {
#if 0        
        sTrace(
            "vgId:%d, hbDataRid:%ld,  EXECUTE this step-------- heartbeat tsNow:%ld, exec:%ld, tsNow-exec:%ld, "
            "---------",
            pSyncNode->vgId, hbDataRid, tsNow, pData->execTime, tsNow - pData->execTime);
#endif

        pData->execTime += pSyncTimer->timerMS;

        SRpcMsg rpcMsg = {0};
        (void)syncBuildHeartbeat(&rpcMsg, pSyncNode->vgId);

        SyncHeartbeat* pSyncMsg = rpcMsg.pCont;
        pSyncMsg->srcId = pSyncNode->myRaftId;
        pSyncMsg->destId = pData->destId;
        pSyncMsg->term = pSyncNode->pRaftStore->currentTerm;
        pSyncMsg->commitIndex = pSyncNode->commitIndex;
        pSyncMsg->minMatchIndex = syncMinMatchIndex(pSyncNode);
        pSyncMsg->privateTerm = 0;
2288
        pSyncMsg->timeStamp = tsNow;
2289 2290 2291 2292 2293 2294

        // update reset time
        int64_t timerElapsed = tsNow - pSyncTimer->timeStamp;
        pSyncTimer->timeStamp = tsNow;

        // send msg
2295 2296
        syncLogSendHeartbeat(pSyncNode, pSyncMsg, false, timerElapsed, pData->execTime);
        syncNodeSendHeartbeat(pSyncNode, &pSyncMsg->destId, &rpcMsg);
2297 2298 2299 2300 2301 2302 2303 2304
      } else {
#if 0        
        sTrace(
            "vgId:%d, hbDataRid:%ld,  pass this step-------- heartbeat tsNow:%ld, exec:%ld, tsNow-exec:%ld, ---------",
            pSyncNode->vgId, hbDataRid, tsNow, pData->execTime, tsNow - pData->execTime);
#endif
      }

M
Minghao Li 已提交
2305 2306
      if (syncIsInit()) {
        // sTrace("vgId:%d, reset peer hb timer", pSyncNode->vgId);
2307 2308
        taosTmrReset(syncNodeEqPeerHeartbeatTimer, pSyncTimer->timerMS / HEARTBEAT_TICK_NUM, (void*)hbDataRid,
                     syncEnv()->pTimerManager, &pSyncTimer->pTimer);
M
Minghao Li 已提交
2309 2310 2311 2312
      } else {
        sError("sync env is stop, reset peer hb timer error");
      }

2313
    } else {
M
Minghao Li 已提交
2314 2315
      sTrace("vgId:%d, do not send hb, timerLogicClock:%" PRId64 ", msgLogicClock:%" PRId64 "", pSyncNode->vgId,
             timerLogicClock, msgLogicClock);
2316 2317
    }
  }
2318 2319 2320

  syncHbTimerDataRelease(pData);
  syncNodeRelease(pSyncNode);
2321 2322
}

2323 2324 2325 2326 2327
/*
 * Builds a noop raft entry at the log store's current write index, wraps it in
 * a client-request message and enqueues it through the node's syncEqMsg
 * callback.
 *
 * Returns 0 on success. Returns non-zero when the node is not the leader
 * (terrno = TSDB_CODE_SYN_NOT_LEADER), when the entry or the message cannot be
 * built, or when enqueueing fails.
 */
static int32_t syncNodeEqNoop(SSyncNode* pNode) {
  // Only a leader may propose; the rejection carries the "not leader" error.
  if (pNode->state != TAOS_SYNC_STATE_LEADER) {
    terrno = TSDB_CODE_SYN_NOT_LEADER;
    return -1;
  }

  SyncIndex       index = pNode->pLogStore->syncLogWriteIndex(pNode->pLogStore);
  SyncTerm        term = pNode->pRaftStore->currentTerm;
  SSyncRaftEntry* pEntry = syncEntryBuildNoop(term, index, pNode->vgId);
  if (pEntry == NULL) return -1;

  SRpcMsg rpcMsg = {0};
  int32_t code = syncBuildClientRequestFromNoopEntry(&rpcMsg, pEntry, pNode->vgId);
  syncEntryDestroy(pEntry);  // the entry is no longer needed once the message has been built from it
  if (code != 0) {
    sError("failed to propose noop msg while build since %s", terrstr());
    return code;
  }

  sNTrace(pNode, "propose msg, type:noop");
  code = (*pNode->syncEqMsg)(pNode->msgcb, &rpcMsg);
  if (code != 0) {
    sError("failed to propose noop msg while enqueue since %s", terrstr());
    rpcFreeCont(rpcMsg.pCont);  // not accepted by the queue, so release it here like the other enqueue sites
  }

  return code;
}

2347 2348
// Deleter handed to taosLRUCacheInsert(): frees the cached value. The key is not touched.
static void deleteCacheEntry(const void* key, size_t keyLen, void* value) {
  (void)key;
  (void)keyLen;
  taosMemoryFree(value);
}

2349 2350 2351 2352
/*
 * Inserts `pEntry` into the log store's LRU cache, keyed by the entry index,
 * with low priority and deleteCacheEntry as the deleter. The handle of the
 * inserted item is reported through `h`.
 *
 * Returns 0 when the cache reports TAOS_LRU_STATUS_OK, -1 otherwise.
 */
int32_t syncCacheEntry(SSyncLogStore* pLogStore, SSyncRaftEntry* pEntry, LRUHandle** h) {
  SSyncLogStoreData* pStoreData = pLogStore->data;
  sNTrace(pStoreData->pSyncNode, "in cache index:%" PRId64 ", bytes:%u, %p", pEntry->index, pEntry->bytes, pEntry);

  // Charge accounted to the cache: entry header plus its payload.
  int32_t charge = sizeof(*pEntry) + pEntry->dataLen;

  LRUStatus status = taosLRUCacheInsert(pLogStore->pCache, &pEntry->index, sizeof(pEntry->index), pEntry, charge,
                                        deleteCacheEntry, h, TAOS_LRU_PRIORITY_LOW);

  return (status == TAOS_LRU_STATUS_OK) ? 0 : -1;
}

B
Benguang Zhao 已提交
2364 2365 2366
/*
 * Appends `pEntry` to the node's log buffer and advances the buffer's match
 * index. With a single replica the commit index is additionally updated to the
 * new match index and the buffer is committed up to it.
 *
 * Returns 0 on success, -1 when the buffer append or the single-replica commit
 * fails.
 */
int32_t syncNodeAppend(SSyncNode* ths, SSyncRaftEntry* pEntry) {
  // append to log buffer
  int32_t appendRet = syncLogBufferAppend(ths->pLogBuf, ths, pEntry);
  if (appendRet < 0) {
    sError("vgId:%d, failed to enqueue sync log buffer. index:%" PRId64 "", ths->vgId, pEntry->index);
    return -1;
  }

  // proceed match index, with replicating on needed
  SyncIndex newMatchIndex = syncLogBufferProceed(ths->pLogBuf, ths, NULL);

  sTrace("vgId:%d, append raft entry. index: %" PRId64 ", term: %" PRId64 " pBuf: [%" PRId64 " %" PRId64 " %" PRId64
         ", %" PRId64 ")",
         ths->vgId, pEntry->index, pEntry->term, ths->pLogBuf->startIndex, ths->pLogBuf->commitIndex,
         ths->pLogBuf->matchIndex, ths->pLogBuf->endIndex);

  // Only the single-replica case commits right here; with more replicas there is nothing more to do.
  if (ths->replicaNum <= 1) {
    (void)syncNodeUpdateCommitIndex(ths, newMatchIndex);

    if (syncLogBufferCommit(ths->pLogBuf, ths, ths->commitIndex) < 0) {
      sError("vgId:%d, failed to commit until commitIndex:%" PRId64 "", ths->vgId, ths->commitIndex);
      return -1;
    }
  }

  return 0;
}

2395
/*
 * Reports whether at least `quorum` peers have a recorded heartbeat-reply time
 * that is older than tsHeartbeatTimeout. Peers whose recorded time is 0 or -1
 * are not counted. Always false for a single-replica node.
 */
bool syncNodeHeartbeatReplyTimeout(SSyncNode* pSyncNode) {
  if (pSyncNode->replicaNum == 1) {
    return false;
  }

  int32_t timedOutPeers = 0;
  int64_t now = taosGetTimestampMs();

  for (int32_t peer = 0; peer < pSyncNode->peersNum; ++peer) {
    int64_t lastRecv = syncIndexMgrGetRecvTime(pSyncNode->pMatchIndex, &(pSyncNode->peersId[peer]));
    bool    hasRecvTime = (lastRecv != 0 && lastRecv != -1);

    if (hasRecvTime && now - lastRecv > tsHeartbeatTimeout) {
      timedOutPeers++;
    }
  }

  return timedOutPeers >= pSyncNode->quorum;
}

2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436
// Returns true when any of the node's snapshot senders exists and has its `start` flag set.
// A NULL node yields false.
bool syncNodeSnapshotSending(SSyncNode* pSyncNode) {
  if (pSyncNode == NULL) {
    return false;
  }

  for (int32_t idx = 0; idx < pSyncNode->replicaNum; ++idx) {
    if (pSyncNode->senders[idx] == NULL) {
      continue;
    }
    if (pSyncNode->senders[idx]->start) {
      return true;
    }
  }

  return false;
}

// Returns true when the node has a snapshot receiver whose `start` flag is set.
// A NULL node or a missing receiver yields false.
bool syncNodeSnapshotRecving(SSyncNode* pSyncNode) {
  if (pSyncNode != NULL && pSyncNode->pNewNodeReceiver != NULL) {
    if (pSyncNode->pNewNodeReceiver->start) {
      return true;
    }
  }
  return false;
}

M
Minghao Li 已提交
2437
/*
 * Builds a noop entry at the log buffer's end index with the current term and
 * hands it to syncNodeAppend().
 *
 * Returns -1 (terrno = TSDB_CODE_OUT_OF_MEMORY) when the entry cannot be built,
 * otherwise 0.
 */
static int32_t syncNodeAppendNoop(SSyncNode* ths) {
  SyncIndex endIndex = syncLogBufferGetEndIndex(ths->pLogBuf);
  SyncTerm  curTerm = ths->pRaftStore->currentTerm;

  SSyncRaftEntry* pNoop = syncEntryBuildNoop(curTerm, endIndex, ths->vgId);
  if (pNoop == NULL) {
    terrno = TSDB_CODE_OUT_OF_MEMORY;
    return -1;
  }

  // NOTE(review): the result of syncNodeAppend() is discarded and 0 is returned
  // even when the append fails -- confirm this is intended.
  (void)syncNodeAppend(ths, pNoop);
  return 0;
}

/*
 * Legacy noop append: builds a noop entry at the log store's write index and,
 * when the node is the leader, appends it to the log store and caches it.
 *
 * The entry is either released through its cache handle (when caching produced
 * one) or destroyed directly; this now also holds for the append-failure path.
 *
 * Returns 0 on success (including the non-leader case, where nothing is
 * appended), -1 when the log-store append fails.
 */
static int32_t syncNodeAppendNoopOld(SSyncNode* ths) {
  int32_t ret = 0;

  SyncIndex       index = ths->pLogStore->syncLogWriteIndex(ths->pLogStore);
  SyncTerm        term = ths->pRaftStore->currentTerm;
  SSyncRaftEntry* pEntry = syncEntryBuildNoop(term, index, ths->vgId);
  ASSERT(pEntry != NULL);

  LRUHandle* h = NULL;

  if (ths->state == TAOS_SYNC_STATE_LEADER) {
    int32_t code = ths->pLogStore->syncLogAppendEntry(ths->pLogStore, pEntry);
    if (code != 0) {
      sError("append noop error");
      // The entry was never cached, so it must be destroyed here or it leaks.
      syncEntryDestroy(pEntry);
      return -1;
    }

    syncCacheEntry(ths->pLogStore, pEntry, &h);
  }

  if (h) {
    taosLRUCacheRelease(ths->pLogStore->pCache, h, false);
  } else {
    syncEntryDestroy(pEntry);
  }

  return ret;
}

S
Shengliang Guan 已提交
2480 2481
/*
 * Handles an incoming heartbeat message.
 *
 * - Always answers the sender with a heartbeat reply carrying this node's
 *   current term, start time and the receive timestamp.
 * - When the heartbeat's term equals the current term and this node is not the
 *   leader: records the receive time for the sender, resets the election timer
 *   and stores the sender's minMatchIndex; a follower additionally enqueues a
 *   SYNC_LOCAL_CMD_FOLLOWER_CMT local command with the sender's commit index.
 * - When the heartbeat's term is not lower than the current term and this node
 *   is not a follower: enqueues a SYNC_LOCAL_CMD_STEP_DOWN local command.
 *
 * Local commands are only enqueued when both syncEqMsg and msgcb are set; a
 * command that is not enqueued, or whose enqueue fails, is freed here.
 *
 * Returns 0, or -1 when the reply message cannot be built.
 */
int32_t syncNodeOnHeartbeat(SSyncNode* ths, const SRpcMsg* pRpcMsg) {
  SyncHeartbeat* pMsg = pRpcMsg->pCont;

  const STraceId* trace = &pRpcMsg->info.traceId;
  char            tbuf[40] = {0};
  TRACE_TO_STR(trace, tbuf);

  int64_t tsMs = taosGetTimestampMs();
  int64_t timeDiff = tsMs - pMsg->timeStamp;
  syncLogRecvHeartbeat(ths, pMsg, timeDiff, tbuf);

  SRpcMsg rpcMsg = {0};
  (void)syncBuildHeartbeatReply(&rpcMsg, ths->vgId);
  if (rpcMsg.pCont == NULL) {
    sError("vgId:%d, failed to build heartbeat reply msg", ths->vgId);
    return -1;
  }

  SyncHeartbeatReply* pMsgReply = rpcMsg.pCont;
  pMsgReply->destId = pMsg->srcId;
  pMsgReply->srcId = ths->myRaftId;
  pMsgReply->term = ths->pRaftStore->currentTerm;
  pMsgReply->privateTerm = 8864;  // magic number
  pMsgReply->startTime = ths->startTime;
  pMsgReply->timeStamp = tsMs;

  if (pMsg->term == ths->pRaftStore->currentTerm && ths->state != TAOS_SYNC_STATE_LEADER) {
    syncIndexMgrSetRecvTime(ths->pNextIndex, &(pMsg->srcId), tsMs);

    syncNodeResetElectTimer(ths);
    ths->minMatchIndex = pMsg->minMatchIndex;

    if (ths->state == TAOS_SYNC_STATE_FOLLOWER) {
      SRpcMsg rpcMsgLocalCmd = {0};
      (void)syncBuildLocalCmd(&rpcMsgLocalCmd, ths->vgId);

      if (rpcMsgLocalCmd.pCont == NULL) {
        sError("vgId:%d, failed to build fc-commit msg", ths->vgId);
      } else {
        SyncLocalCmd* pSyncMsg = rpcMsgLocalCmd.pCont;
        pSyncMsg->cmd = SYNC_LOCAL_CMD_FOLLOWER_CMT;
        pSyncMsg->fcIndex = pMsg->commitIndex;
        // Copied out before enqueueing: the message must not be read after the queue has taken it.
        SyncIndex fcIndex = pSyncMsg->fcIndex;

        if (ths->syncEqMsg != NULL && ths->msgcb != NULL) {
          int32_t code = ths->syncEqMsg(ths->msgcb, &rpcMsgLocalCmd);
          if (code != 0) {
            sError("vgId:%d, sync enqueue fc-commit msg error, code:%d", ths->vgId, code);
            rpcFreeCont(rpcMsgLocalCmd.pCont);
          } else {
            sTrace("vgId:%d, sync enqueue fc-commit msg, fc-index:%" PRId64, ths->vgId, fcIndex);
          }
        } else {
          // Nobody to hand the command to; free it instead of leaking it.
          rpcFreeCont(rpcMsgLocalCmd.pCont);
        }
      }
    }
  }

  if (pMsg->term >= ths->pRaftStore->currentTerm && ths->state != TAOS_SYNC_STATE_FOLLOWER) {
    SRpcMsg rpcMsgLocalCmd = {0};
    (void)syncBuildLocalCmd(&rpcMsgLocalCmd, ths->vgId);

    if (rpcMsgLocalCmd.pCont == NULL) {
      sError("vgId:%d, failed to build step-down msg", ths->vgId);
    } else {
      SyncLocalCmd* pSyncMsg = rpcMsgLocalCmd.pCont;
      pSyncMsg->cmd = SYNC_LOCAL_CMD_STEP_DOWN;
      pSyncMsg->sdNewTerm = pMsg->term;
      // Same rule as fcIndex above: keep a local copy for logging after the enqueue.
      SyncTerm sdNewTerm = pMsg->term;

      if (ths->syncEqMsg != NULL && ths->msgcb != NULL) {
        int32_t code = ths->syncEqMsg(ths->msgcb, &rpcMsgLocalCmd);
        if (code != 0) {
          sError("vgId:%d, sync enqueue step-down msg error, code:%d", ths->vgId, code);
          rpcFreeCont(rpcMsgLocalCmd.pCont);
        } else {
          sTrace("vgId:%d, sync enqueue step-down msg, new-term: %" PRId64, ths->vgId, sdNewTerm);
        }
      } else {
        // Nobody to hand the command to; free it instead of leaking it.
        rpcFreeCont(rpcMsgLocalCmd.pCont);
      }
    }
  }

  // reply
  syncNodeSendMsgById(&pMsgReply->destId, ths, &rpcMsg);
  return 0;
}

2562
/*
 * Handles a heartbeat reply: looks up the log replication manager for the
 * replying peer, records the receive time for that peer in pMatchIndex and
 * forwards the reply to the replication manager.
 *
 * Returns -1 when no replication manager exists for the sender, otherwise the
 * result of syncLogReplMgrProcessHeartbeatReply().
 */
int32_t syncNodeOnHeartbeatReply(SSyncNode* ths, const SRpcMsg* pRpcMsg) {
  const STraceId* trace = &pRpcMsg->info.traceId;
  char            tbuf[40] = {0};
  TRACE_TO_STR(trace, tbuf);

  SyncHeartbeatReply* pMsg = pRpcMsg->pCont;
  SSyncLogReplMgr*    pMgr = syncNodeGetLogReplMgr(ths, &pMsg->srcId);
  if (pMgr == NULL) {
    // Address printed as a zero-padded 16-digit hex value.
    sError("vgId:%d, failed to get log repl mgr for the peer at addr 0x%016" PRIx64 "", ths->vgId, pMsg->srcId.addr);
    return -1;
  }

  int64_t tsMs = taosGetTimestampMs();
  syncLogRecvHeartbeatReply(ths, pMsg, tsMs - pMsg->timeStamp, tbuf);

  syncIndexMgrSetRecvTime(ths->pMatchIndex, &pMsg->srcId, tsMs);

  return syncLogReplMgrProcessHeartbeatReply(pMgr, ths, pMsg);
}

2582
/*
 * Legacy heartbeat-reply handler: logs the reply and records its receive time
 * for the sending peer in pMatchIndex. Always returns 0.
 */
int32_t syncNodeOnHeartbeatReplyOld(SSyncNode* ths, const SRpcMsg* pRpcMsg) {
  SyncHeartbeatReply* pReply = pRpcMsg->pCont;

  char            traceStr[40] = {0};
  const STraceId* pTraceId = &pRpcMsg->info.traceId;
  TRACE_TO_STR(pTraceId, traceStr);

  int64_t recvMs = taosGetTimestampMs();
  syncLogRecvHeartbeatReply(ths, pReply, recvMs - pReply->timeStamp, traceStr);

  // update last reply time, make decision whether the other node is alive or not
  syncIndexMgrSetRecvTime(ths->pMatchIndex, &pReply->srcId, recvMs);

  return 0;
}

S
Shengliang Guan 已提交
2598 2599
/*
 * Executes a local command message.
 *   SYNC_LOCAL_CMD_STEP_DOWN    -> step down to the term carried in the message
 *   SYNC_LOCAL_CMD_FOLLOWER_CMT -> update the commit index to the carried index
 *                                  and commit the log buffer up to commitIndex
 * Any other command is logged as an error. Always returns 0.
 */
int32_t syncNodeOnLocalCmd(SSyncNode* ths, const SRpcMsg* pRpcMsg) {
  SyncLocalCmd* pCmd = pRpcMsg->pCont;
  syncLogRecvLocalCmd(ths, pCmd, "");

  if (pCmd->cmd == SYNC_LOCAL_CMD_STEP_DOWN) {
    syncNodeStepDown(ths, pCmd->sdNewTerm);
    return 0;
  }

  if (pCmd->cmd != SYNC_LOCAL_CMD_FOLLOWER_CMT) {
    sError("error local cmd");
    return 0;
  }

  (void)syncNodeUpdateCommitIndex(ths, pCmd->fcIndex);
  if (syncLogBufferCommit(ths->pLogBuf, ths, ths->commitIndex) < 0) {
    sError("vgId:%d, failed to commit raft log since %s. commit index: %" PRId64 "", ths->vgId, terrstr(),
           ths->commitIndex);
  }

  return 0;
}

/*
 * Legacy local-command handler.
 *   SYNC_LOCAL_CMD_STEP_DOWN    -> syncNodeStepDown() with the carried term
 *   SYNC_LOCAL_CMD_FOLLOWER_CMT -> syncNodeFollowerCommit() with the carried index
 * Any other command is logged as an error. Always returns 0.
 */
int32_t syncNodeOnLocalCmdOld(SSyncNode* ths, const SRpcMsg* pRpcMsg) {
  SyncLocalCmd* pCmd = pRpcMsg->pCont;
  syncLogRecvLocalCmd(ths, pCmd, "");

  if (pCmd->cmd == SYNC_LOCAL_CMD_STEP_DOWN) {
    syncNodeStepDown(ths, pCmd->sdNewTerm);
    return 0;
  }

  if (pCmd->cmd == SYNC_LOCAL_CMD_FOLLOWER_CMT) {
    syncNodeFollowerCommit(ths, pCmd->fcIndex);
    return 0;
  }

  sError("error local cmd");
  return 0;
}

M
Minghao Li 已提交
2635 2636 2637 2638 2639 2640 2641 2642 2643 2644
// TLA+ Spec
// ClientRequest(i, v) ==
//     /\ state[i] = Leader
//     /\ LET entry == [term  |-> currentTerm[i],
//                      value |-> v]
//            newLog == Append(log[i], entry)
//        IN  log' = [log EXCEPT ![i] = newLog]
//     /\ UNCHANGED <<messages, serverVars, candidateVars,
//                    leaderVars, commitIndex>>
//
M
Minghao Li 已提交
2645

2646
/*
 * Turns a client request into a raft entry at the log buffer's end index and,
 * when this node is the leader, appends it via syncNodeAppend().
 *
 * pRetIndex (optional) receives the index assigned to the entry; it is only
 * written on the leader path.
 *
 * Returns the result of syncNodeAppend() on the leader path; -1 when the entry
 * cannot be built (terrno = TSDB_CODE_OUT_OF_MEMORY) or when this node is not
 * the leader, in which case the freshly built entry is destroyed.
 */
int32_t syncNodeOnClientRequest(SSyncNode* ths, SRpcMsg* pMsg, SyncIndex* pRetIndex) {
  sNTrace(ths, "on client request");

  SyncIndex       index = syncLogBufferGetEndIndex(ths->pLogBuf);
  SyncTerm        term = ths->pRaftStore->currentTerm;
  SSyncRaftEntry* pEntry = NULL;
  if (pMsg->msgType == TDMT_SYNC_CLIENT_REQUEST) {
    pEntry = syncEntryBuildFromClientRequest(pMsg->pCont, term, index);
  } else {
    pEntry = syncEntryBuildFromRpcMsg(pMsg, term, index);
  }

  // Everything below dereferences the entry, so a failed build must stop here.
  if (pEntry == NULL) {
    terrno = TSDB_CODE_OUT_OF_MEMORY;
    sError("vgId:%d, failed to build raft entry for client request since %s", ths->vgId, terrstr());
    return -1;
  }

  if (ths->state != TAOS_SYNC_STATE_LEADER) {
    // The entry was never handed to the log buffer; destroy it instead of leaking it.
    syncEntryDestroy(pEntry);
    return -1;
  }

  if (pRetIndex) {
    (*pRetIndex) = index;
  }

  int32_t code = syncNodeAppend(ths, pEntry);
  if (code < 0 && ths->vgId != 1 && vnodeIsMsgBlock(pEntry->originalRpcType)) {
    ASSERTS(false, "failed to append blocking msg");
  }
  return code;
}

2675 2676
/*
 * Legacy client-request handler.
 *
 * Builds a raft entry at the log store's write index. On the leader the entry
 * is appended to the log store, cached, and then either replicated (more than
 * one replica) or committed locally (single replica). On a non-leader nothing
 * is appended.
 *
 * When the log-store append fails the entry is destroyed and -1 is returned;
 * with more than one replica FpCommitCb is first invoked with code -1.
 *
 * pRetIndex (optional) receives the entry's index, or SYNC_INDEX_INVALID when
 * no entry was built. Returns 0 on every path except the append failure.
 */
int32_t syncNodeOnClientRequestOld(SSyncNode* ths, SRpcMsg* pMsg, SyncIndex* pRetIndex) {
  sNTrace(ths, "on client request");

  SyncIndex       writeIndex = ths->pLogStore->syncLogWriteIndex(ths->pLogStore);
  SyncTerm        curTerm = ths->pRaftStore->currentTerm;
  SSyncRaftEntry* pEntry = (pMsg->msgType == TDMT_SYNC_CLIENT_REQUEST)
                               ? syncEntryBuildFromClientRequest(pMsg->pCont, curTerm, writeIndex)
                               : syncEntryBuildFromRpcMsg(pMsg, curTerm, writeIndex);

  LRUHandle* hCache = NULL;

  if (ths->state == TAOS_SYNC_STATE_LEADER) {
    // append entry
    int32_t appendCode = ths->pLogStore->syncLogAppendEntry(ths->pLogStore, pEntry);
    if (appendCode != 0) {
      if (ths->replicaNum != 1) {
        // del resp mgr, call FpCommitCb
        SFsmCbMeta cbMeta = {
            .index = pEntry->index,
            .lastConfigIndex = SYNC_INDEX_INVALID,
            .isWeak = pEntry->isWeak,
            .code = -1,
            .state = ths->state,
            .seqNum = pEntry->seqNum,
            .term = pEntry->term,
            .currentTerm = ths->pRaftStore->currentTerm,
            .flag = 0,
        };
        ths->pFsm->FpCommitCb(ths->pFsm, pMsg, &cbMeta);
      }

      // The entry has not been cached yet at this point, so it is destroyed directly.
      syncEntryDestroy(pEntry);
      return -1;
    }

    syncCacheEntry(ths->pLogStore, pEntry, &hCache);

    // if mulit replica, start replicate right now
    if (ths->replicaNum > 1) {
      syncNodeReplicate(ths);
    }

    // if only myself, maybe commit right now
    if (ths->replicaNum == 1) {
      if (syncNodeIsMnode(ths)) {
        syncMaybeAdvanceCommitIndex(ths);
      } else {
        syncOneReplicaAdvance(ths);
      }
    }
  }

  if (pRetIndex != NULL) {
    *pRetIndex = (pEntry != NULL) ? pEntry->index : SYNC_INDEX_INVALID;
  }

  // A cached entry is released through its handle; an uncached one is destroyed.
  if (hCache != NULL) {
    taosLRUCacheRelease(ths->pLogStore->pCache, hCache, false);
  } else {
    syncEntryDestroy(pEntry);
  }

  return 0;
}
M
Minghao Li 已提交
2764

S
Shengliang Guan 已提交
2765 2766 2767
// Maps a sync state to its lower-case display name; unrecognised values map to "unknown".
const char* syncStr(ESyncState state) {
  const char* name = "unknown";

  switch (state) {
    case TAOS_SYNC_STATE_FOLLOWER:
      name = "follower";
      break;
    case TAOS_SYNC_STATE_CANDIDATE:
      name = "candidate";
      break;
    case TAOS_SYNC_STATE_LEADER:
      name = "leader";
      break;
    case TAOS_SYNC_STATE_ERROR:
      name = "error";
      break;
    case TAOS_SYNC_STATE_OFFLINE:
      name = "offline";
      break;
    default:
      break;
  }

  return name;
}
2781

2782
#if 0
2783
/*
 * (Compiled out by the surrounding #if 0.)
 *
 * Applies a leader-transfer entry on a follower. Nothing is done unless the
 * node is a follower, has finished restoring, and the entry is not behind the
 * current term or the last log index. When the transfer target is this node
 * (matched by raft id or by fqdn and port) the election timer is restarted
 * with a 1 ms timeout. FpLeaderTransferCb is invoked when it is set.
 * Always returns 0.
 */
int32_t syncDoLeaderTransfer(SSyncNode* ths, SRpcMsg* pRpcMsg, SSyncRaftEntry* pEntry) {
  if (ths->state != TAOS_SYNC_STATE_FOLLOWER) {
    sNTrace(ths, "I am not follower, can not do leader transfer");
    return 0;
  }

  if (!ths->restoreFinish) {
    sNTrace(ths, "restore not finish, can not do leader transfer");
    return 0;
  }

  if (pEntry->term < ths->pRaftStore->currentTerm) {
    sNTrace(ths, "little term:%" PRId64 ", can not do leader transfer", pEntry->term);
    return 0;
  }

  if (pEntry->index < syncNodeGetLastIndex(ths)) {
    sNTrace(ths, "little index:%" PRId64 ", can not do leader transfer", pEntry->index);
    return 0;
  }

  SyncLeaderTransfer* pTransfer = pRpcMsg->pCont;
  sNTrace(ths, "do leader transfer, index:%" PRId64, pEntry->index);

  bool sameId = syncUtilSameId(&(pTransfer->newLeaderId), &(ths->myRaftId));
  bool sameNodeInfo = strcmp(pTransfer->newNodeInfo.nodeFqdn, ths->myNodeInfo.nodeFqdn) == 0 &&
                      pTransfer->newNodeInfo.nodePort == ths->myNodeInfo.nodePort;

  if (sameId || sameNodeInfo) {
    // reset elect timer now!
    int32_t electMS = 1;
    int32_t ret = syncNodeRestartElectTimer(ths, electMS);
    ASSERT(ret == 0);

    sNTrace(ths, "maybe leader transfer to %s:%d %" PRId64, pTransfer->newNodeInfo.nodeFqdn,
            pTransfer->newNodeInfo.nodePort, pTransfer->newLeaderId.addr);
  }

  if (ths->pFsm->FpLeaderTransferCb != NULL) {
    SFsmCbMeta cbMeta = {
        .code = 0,
        .currentTerm = ths->pRaftStore->currentTerm,
        .flag = 0,
        .index = pEntry->index,
        .lastConfigIndex = syncNodeGetSnapshotConfigIndex(ths, pEntry->index),
        .isWeak = pEntry->isWeak,
        .seqNum = pEntry->seqNum,
        .state = ths->state,
        .term = pEntry->term,
    };
    ths->pFsm->FpLeaderTransferCb(ths->pFsm, pRpcMsg, &cbMeta);
  }

  return 0;
}

2847 2848
#endif

2849
/*
 * Finds this node inside `pNewCfg` by comparing raft ids built from each
 * configured fqdn/port with the node's own raft id. On a match stores the
 * position in pNewCfg->myIndex and returns 0; returns -1 when the node is not
 * part of the new configuration.
 */
int32_t syncNodeUpdateNewConfigIndex(SSyncNode* ths, SSyncCfg* pNewCfg) {
  int32_t pos = 0;

  while (pos < pNewCfg->replicaNum) {
    SRaftId candidate;
    candidate.addr = syncUtilAddr2U64(pNewCfg->nodeInfo[pos].nodeFqdn, pNewCfg->nodeInfo[pos].nodePort);
    candidate.vgId = ths->vgId;

    if (syncUtilSameId(&ths->myRaftId, &candidate)) {
      pNewCfg->myIndex = pos;
      return 0;
    }

    ++pos;
  }

  return -1;
}

2864 2865 2866 2867
// True when the node has exactly one replica, the message type is a user commit
// (per syncUtilUserCommit) and the vgId is not 1.
bool syncNodeIsOptimizedOneReplica(SSyncNode* ths, SRpcMsg* pMsg) {
  if (ths->replicaNum != 1) {
    return false;
  }
  if (!syncUtilUserCommit(pMsg->msgType)) {
    return false;
  }
  return ths->vgId != 1;
}

M
Minghao Li 已提交
2868
/*
 * Applies the log entries in [beginIndex, endIndex] to the state machine.
 *
 * The function starts with ASSERT(false). The remaining body works as follows:
 *  - an empty range returns 0, a NULL node returns -1;
 *  - when the FSM reports a snapshot whose lastApplyIndex covers beginIndex,
 *    the range start is moved just past the snapshot;
 *  - each entry is taken from the log store's LRU cache, or read from the log
 *    store on a miss (an unreadable entry is logged and skipped);
 *  - for user-commit entries FpCommitCb is invoked, except on a single-replica
 *    node that has finished restoring and whose vgId is not 1;
 *  - when the entry is the last one in the log store and restore has not
 *    finished yet, FpRestoreFinishCb is invoked and restoreFinish is set.
 *
 * `flag` is passed through to the commit callback metadata.
 */
int32_t syncNodeDoCommit(SSyncNode* ths, SyncIndex beginIndex, SyncIndex endIndex, uint64_t flag) {
  ASSERT(false);
  if (beginIndex > endIndex) {
    return 0;
  }

  if (ths == NULL) {
    return -1;
  }

  if (ths->pFsm != NULL && ths->pFsm->FpGetSnapshotInfo != NULL) {
    // advance commit index to sanpshot first
    SSnapshot snapshot = {0};
    ths->pFsm->FpGetSnapshotInfo(ths->pFsm, &snapshot);
    if (snapshot.lastApplyIndex >= 0 && snapshot.lastApplyIndex >= beginIndex) {
      sNTrace(ths, "commit by snapshot from index:%" PRId64 " to index:%" PRId64, beginIndex, snapshot.lastApplyIndex);

      // update begin index
      beginIndex = snapshot.lastApplyIndex + 1;
    }
  }

  int32_t code = 0;

  sNTrace(ths, "commit by wal from index:%" PRId64 " to index:%" PRId64, beginIndex, endIndex);

  // execute fsm
  if (ths->pFsm == NULL) {
    return 0;
  }

  for (SyncIndex i = beginIndex; i <= endIndex; ++i) {
    if (i == SYNC_INDEX_INVALID) {
      continue;
    }

    SSyncRaftEntry* pEntry = NULL;
    SLRUCache*      pCache = ths->pLogStore->pCache;
    LRUHandle*      h = taosLRUCacheLookup(pCache, &i, sizeof(i));
    if (h != NULL) {
      pEntry = (SSyncRaftEntry*)taosLRUCacheValue(pCache, h);

      ths->pLogStore->cacheHit++;
      sNTrace(ths, "hit cache index:%" PRId64 ", bytes:%u, %p", i, pEntry->bytes, pEntry);

    } else {
      ths->pLogStore->cacheMiss++;
      sNTrace(ths, "miss cache index:%" PRId64, i);

      code = ths->pLogStore->syncLogGetEntry(ths->pLogStore, i, &pEntry);
      if (code != 0 || pEntry == NULL) {
        sNError(ths, "get log entry error");
        sFatal("vgId:%d, get log entry %" PRId64 " error when commit since %s", ths->vgId, i, terrstr());
        continue;
      }
    }

    SRpcMsg rpcMsg = {0};
    syncEntry2OriginalRpc(pEntry, &rpcMsg);

    sTrace("do commit index:%" PRId64 ", type:%s", i, TMSG_INFO(pEntry->msgType));

    // user commit
    if ((ths->pFsm->FpCommitCb != NULL) && syncUtilUserCommit(pEntry->originalRpcType)) {
      // Skipped here for a restored single-replica node whose vgId is not 1.
      bool internalExecute = !((ths->replicaNum == 1) && ths->restoreFinish && ths->vgId != 1);

      sNTrace(ths, "user commit index:%" PRId64 ", internal:%d, type:%s", i, internalExecute,
              TMSG_INFO(pEntry->msgType));

      // execute fsm in apply thread, or execute outside syncPropose
      if (internalExecute) {
        SFsmCbMeta cbMeta = {
            .index = pEntry->index,
            .lastConfigIndex = syncNodeGetSnapshotConfigIndex(ths, pEntry->index),
            .isWeak = pEntry->isWeak,
            .code = 0,
            .state = ths->state,
            .seqNum = pEntry->seqNum,
            .term = pEntry->term,
            .currentTerm = ths->pRaftStore->currentTerm,
            .flag = flag,
        };

        syncRespMgrGetAndDel(ths->pSyncRespMgr, cbMeta.seqNum, &rpcMsg.info);
        ths->pFsm->FpCommitCb(ths->pFsm, &rpcMsg, &cbMeta);
      }
    }

#if 0
    // execute in pre-commit
    // leader transfer
    if (pEntry->originalRpcType == TDMT_SYNC_LEADER_TRANSFER) {
      code = syncDoLeaderTransfer(ths, &rpcMsg, pEntry);
      ASSERT(code == 0);
    }
#endif

    // restore finish
    // if only snapshot, a noop entry will be append, so syncLogLastIndex is always ok
    if (pEntry->index == ths->pLogStore->syncLogLastIndex(ths->pLogStore) && ths->restoreFinish == false) {
      if (ths->pFsm->FpRestoreFinishCb != NULL) {
        ths->pFsm->FpRestoreFinishCb(ths->pFsm);
      }
      ths->restoreFinish = true;

      int64_t restoreDelay = taosGetTimestampMs() - ths->leaderTime;
      sNTrace(ths, "restore finish, index:%" PRId64 ", elapsed:%" PRId64 " ms", pEntry->index, restoreDelay);
    }

    rpcFreeCont(rpcMsg.pCont);

    // A cached entry is released through its handle; one read from the log store is destroyed.
    if (h != NULL) {
      taosLRUCacheRelease(pCache, h, false);
    } else {
      syncEntryDestroy(pEntry);
    }
  }

  return 0;
}

// Returns true when `pRaftId` equals one of the node's replica ids.
bool syncNodeInRaftGroup(SSyncNode* ths, SRaftId* pRaftId) {
  int32_t idx = 0;
  while (idx < ths->replicaNum) {
    if (syncUtilSameId(&ths->replicasId[idx], pRaftId)) {
      return true;
    }
    ++idx;
  }
  return false;
}

// Returns the snapshot sender stored at the replica slot whose id equals
// `pDestId`, or NULL when no slot matches. The scan does not stop at the first
// match, so the last matching slot wins.
SSyncSnapshotSender* syncNodeGetSnapshotSender(SSyncNode* ths, SRaftId* pDestId) {
  SSyncSnapshotSender* pFound = NULL;

  int32_t slot = 0;
  while (slot < ths->replicaNum) {
    if (syncUtilSameId(pDestId, &ths->replicasId[slot])) {
      pFound = ths->senders[slot];
    }
    ++slot;
  }

  return pFound;
}
M
Minghao Li 已提交
3009

3010 3011
// Returns the address of the peer heartbeat timer at the replica slot whose id
// equals `pDestId`, or NULL when no slot matches. The scan does not stop at the
// first match, so the last matching slot wins.
SSyncTimer* syncNodeGetHbTimer(SSyncNode* ths, SRaftId* pDestId) {
  SSyncTimer* pFound = NULL;

  int32_t slot = 0;
  while (slot < ths->replicaNum) {
    if (syncUtilSameId(pDestId, &ths->replicasId[slot])) {
      pFound = &ths->peerHeartbeatTimerArr[slot];
    }
    ++slot;
  }

  return pFound;
}

M
Minghao Li 已提交
3020 3021
// Returns the address of the peer state at the replica slot whose id equals
// `pDestId`, or NULL when no slot matches. The scan does not stop at the first
// match, so the last matching slot wins.
SPeerState* syncNodeGetPeerState(SSyncNode* ths, const SRaftId* pDestId) {
  SPeerState* pFound = NULL;

  int32_t slot = 0;
  while (slot < ths->replicaNum) {
    if (syncUtilSameId(pDestId, &ths->replicasId[slot])) {
      pFound = &ths->peerStates[slot];
    }
    ++slot;
  }

  return pFound;
}

/*
 * Decides whether `pMsg` should be sent to `pDestId`.
 *
 * Returns false when no peer state exists for the destination, or when the
 * same index (prevLogIndex + 1) was already sent to that peer less than
 * SYNC_APPEND_ENTRIES_TIMEOUT_MS ago; otherwise true.
 */
bool syncNodeNeedSendAppendEntries(SSyncNode* ths, const SRaftId* pDestId, const SyncAppendEntries* pMsg) {
  SPeerState* pPeer = syncNodeGetPeerState(ths, pDestId);
  if (pPeer == NULL) {
    sError("vgId:%d, replica maybe dropped", ths->vgId);
    return false;
  }

  SyncIndex nextSendIndex = pMsg->prevLogIndex + 1;
  int64_t   now = taosGetTimestampMs();

  bool sameIndex = (pPeer->lastSendIndex == nextSendIndex);
  bool sentRecently = (now - pPeer->lastSendTime < SYNC_APPEND_ENTRIES_TIMEOUT_MS);

  return !(sameIndex && sentRecently);
}

M
Minghao Li 已提交
3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060
/*
 * Reports whether the node may currently change.
 *
 * Returns false when a change is already in progress (`changing`), when the
 * commit index is at least SYNC_INDEX_BEGIN but differs from the last log
 * index, or when a snapshot sender for any peer has its `start` flag set.
 */
bool syncNodeCanChange(SSyncNode* pSyncNode) {
  if (pSyncNode->changing) {
    sError("sync cannot change");
    return false;
  }

  if (pSyncNode->commitIndex >= SYNC_INDEX_BEGIN && pSyncNode->commitIndex != syncNodeGetLastIndex(pSyncNode)) {
    sError("sync cannot change2");
    return false;
  }

  int32_t peer = 0;
  while (peer < pSyncNode->peersNum) {
    SSyncSnapshotSender* pPeerSender = syncNodeGetSnapshotSender(pSyncNode, &pSyncNode->peersId[peer]);
    if (pPeerSender != NULL && pPeerSender->start) {
      sError("sync cannot change3");
      return false;
    }
    ++peer;
  }

  return true;
}