syncMain.c 87.8 KB
Newer Older
M
Minghao Li 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/*
 * Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
 *
 * This program is free software: you can use, redistribute, and/or modify
 * it under the terms of the GNU Affero General Public License, version 3
 * or later ("AGPL"), as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

S
Shengliang Guan 已提交
16
#define _DEFAULT_SOURCE
M
Minghao Li 已提交
17
#include "sync.h"
M
Minghao Li 已提交
18 19
#include "syncAppendEntries.h"
#include "syncAppendEntriesReply.h"
M
Minghao Li 已提交
20
#include "syncCommit.h"
M
Minghao Li 已提交
21
#include "syncElection.h"
M
Minghao Li 已提交
22
#include "syncEnv.h"
M
Minghao Li 已提交
23
#include "syncIndexMgr.h"
M
Minghao Li 已提交
24
#include "syncInt.h"
25
#include "syncLogBuffer.h"
M
Minghao Li 已提交
26
#include "syncMessage.h"
M
Minghao Li 已提交
27
#include "syncRaftCfg.h"
M
Minghao Li 已提交
28
#include "syncRaftLog.h"
M
Minghao Li 已提交
29
#include "syncRaftStore.h"
M
Minghao Li 已提交
30
#include "syncReplication.h"
M
Minghao Li 已提交
31 32
#include "syncRequestVote.h"
#include "syncRequestVoteReply.h"
M
Minghao Li 已提交
33
#include "syncRespMgr.h"
M
Minghao Li 已提交
34
#include "syncSnapshot.h"
M
Minghao Li 已提交
35
#include "syncTimeout.h"
M
Minghao Li 已提交
36
#include "syncUtil.h"
M
Minghao Li 已提交
37
#include "syncVoteMgr.h"
M
Minghao Li 已提交
38
#include "tref.h"
M
Minghao Li 已提交
39

M
Minghao Li 已提交
40 41 42 43 44
static void    syncNodeEqPingTimer(void* param, void* tmrId);
static void    syncNodeEqElectTimer(void* param, void* tmrId);
static void    syncNodeEqHeartbeatTimer(void* param, void* tmrId);
static int32_t syncNodeEqNoop(SSyncNode* ths);
static int32_t syncNodeAppendNoop(SSyncNode* ths);
45
static void    syncNodeEqPeerHeartbeatTimer(void* param, void* tmrId);
S
Shengliang Guan 已提交
46
static bool    syncIsConfigChanged(const SSyncCfg* pOldCfg, const SSyncCfg* pNewCfg);
S
Shengliang Guan 已提交
47 48 49
static int32_t syncHbTimerInit(SSyncNode* pSyncNode, SSyncTimer* pSyncTimer, SRaftId destId);
static int32_t syncHbTimerStart(SSyncNode* pSyncNode, SSyncTimer* pSyncTimer);
static int32_t syncHbTimerStop(SSyncNode* pSyncNode, SSyncTimer* pSyncTimer);
S
Shengliang Guan 已提交
50 51 52 53 54 55 56 57 58 59 60
static int32_t syncNodeUpdateNewConfigIndex(SSyncNode* ths, SSyncCfg* pNewCfg);
static bool    syncNodeInConfig(SSyncNode* pSyncNode, const SSyncCfg* config);
static void    syncNodeDoConfigChange(SSyncNode* pSyncNode, SSyncCfg* newConfig, SyncIndex lastConfigChangeIndex);
static bool    syncNodeIsOptimizedOneReplica(SSyncNode* ths, SRpcMsg* pMsg);

static bool    syncNodeCanChange(SSyncNode* pSyncNode);
static int32_t syncNodeLeaderTransfer(SSyncNode* pSyncNode);
static int32_t syncNodeLeaderTransferTo(SSyncNode* pSyncNode, SNodeInfo newLeader);
static int32_t syncDoLeaderTransfer(SSyncNode* ths, SRpcMsg* pRpcMsg, SSyncRaftEntry* pEntry);

static ESyncStrategy syncNodeStrategy(SSyncNode* pSyncNode);
M
Minghao Li 已提交
61

62
int64_t syncOpen(SSyncInfo* pSyncInfo) {
M
Minghao Li 已提交
63
  SSyncNode* pSyncNode = syncNodeOpen(pSyncInfo);
64
  if (pSyncNode == NULL) {
S
Shengliang Guan 已提交
65
    sError("vgId:%d, failed to open sync node", pSyncInfo->vgId);
66 67
    return -1;
  }
M
Minghao Li 已提交
68

S
Shengliang Guan 已提交
69
  pSyncNode->rid = syncNodeAdd(pSyncNode);
M
Minghao Li 已提交
70
  if (pSyncNode->rid < 0) {
71
    syncNodeClose(pSyncNode);
M
Minghao Li 已提交
72 73 74
    return -1;
  }

S
Shengliang Guan 已提交
75 76 77 78 79 80
  pSyncNode->pingBaseLine = pSyncInfo->pingMs;
  pSyncNode->pingTimerMS = pSyncInfo->pingMs;
  pSyncNode->electBaseLine = pSyncInfo->electMs;
  pSyncNode->hbBaseLine = pSyncInfo->heartbeatMs;
  pSyncNode->heartbeatTimerMS = pSyncInfo->heartbeatMs;
  pSyncNode->msgcb = pSyncInfo->msgcb;
M
Minghao Li 已提交
81
  return pSyncNode->rid;
M
Minghao Li 已提交
82
}
M
Minghao Li 已提交
83

B
Benguang Zhao 已提交
84
int32_t syncStart(int64_t rid) {
S
Shengliang Guan 已提交
85
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
M
Minghao Li 已提交
86
  if (pSyncNode == NULL) {
B
Benguang Zhao 已提交
87 88 89 90 91 92
    sError("failed to acquire rid: %" PRId64 " of tsNodeReftId for pSyncNode", rid);
    return -1;
  }

  if (syncNodeRestore(pSyncNode) < 0) {
    sError("vgId:%d, failed to restore raft log buffer since %s", pSyncNode->vgId, terrstr());
93
    goto _err;
M
Minghao Li 已提交
94
  }
M
Minghao Li 已提交
95

B
Benguang Zhao 已提交
96 97 98 99
  if (syncNodeStart(pSyncNode) < 0) {
    sError("vgId:%d, failed to start sync node since %s", pSyncNode->vgId, terrstr());
    goto _err;
  }
M
Minghao Li 已提交
100

B
Benguang Zhao 已提交
101 102
  syncNodeRelease(pSyncNode);
  return 0;
M
Minghao Li 已提交
103

104 105 106
_err:
  syncNodeRelease(pSyncNode);
  return -1;
M
Minghao Li 已提交
107 108
}

M
Minghao Li 已提交
109
void syncStop(int64_t rid) {
S
Shengliang Guan 已提交
110
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
S
Shengliang Guan 已提交
111
  if (pSyncNode != NULL) {
S
Shengliang Guan 已提交
112
    syncNodeRelease(pSyncNode);
S
Shengliang Guan 已提交
113
    syncNodeRemove(rid);
M
Minghao Li 已提交
114 115 116
  }
}

M
Minghao Li 已提交
117 118
void syncPreStop(int64_t rid) {
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
119 120 121
  if (pSyncNode != NULL) {
    syncNodePreClose(pSyncNode);
    syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
122 123 124
  }
}

S
Shengliang Guan 已提交
125 126 127
static bool syncNodeCheckNewConfig(SSyncNode* pSyncNode, const SSyncCfg* pCfg) {
  if (!syncNodeInConfig(pSyncNode, pCfg)) return false;
  return abs(pCfg->replicaNum - pSyncNode->replicaNum) <= 1;
M
Minghao Li 已提交
128 129
}

S
Shengliang Guan 已提交
130
int32_t syncReconfig(int64_t rid, SSyncCfg* pNewCfg) {
S
Shengliang Guan 已提交
131
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
S
Shengliang Guan 已提交
132
  if (pSyncNode == NULL) return -1;
M
Minghao Li 已提交
133

M
Minghao Li 已提交
134
  if (!syncNodeCheckNewConfig(pSyncNode, pNewCfg)) {
S
Shengliang Guan 已提交
135
    syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
136
    terrno = TSDB_CODE_SYN_NEW_CONFIG_ERROR;
S
Shengliang Guan 已提交
137
    sError("vgId:%d, failed to reconfig since invalid new config", pSyncNode->vgId);
M
Minghao Li 已提交
138
    return -1;
M
Minghao Li 已提交
139
  }
140

S
Shengliang Guan 已提交
141 142
  syncNodeUpdateNewConfigIndex(pSyncNode, pNewCfg);
  syncNodeDoConfigChange(pSyncNode, pNewCfg, SYNC_INDEX_INVALID);
S
Shengliang Guan 已提交
143

M
Minghao Li 已提交
144 145 146 147
  if (pSyncNode->state == TAOS_SYNC_STATE_LEADER) {
    syncNodeStopHeartbeatTimer(pSyncNode);

    for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
S
Shengliang Guan 已提交
148
      syncHbTimerInit(pSyncNode, &pSyncNode->peerHeartbeatTimerArr[i], pSyncNode->replicasId[i]);
M
Minghao Li 已提交
149 150 151 152 153
    }

    syncNodeStartHeartbeatTimer(pSyncNode);
    syncNodeReplicate(pSyncNode);
  }
S
Shengliang Guan 已提交
154

S
Shengliang Guan 已提交
155
  syncNodeRelease(pSyncNode);
S
Shengliang Guan 已提交
156
  return 0;
M
Minghao Li 已提交
157
}
M
Minghao Li 已提交
158

S
Shengliang Guan 已提交
159 160 161 162
int32_t syncProcessMsg(int64_t rid, SRpcMsg* pMsg) {
  int32_t code = -1;
  if (!syncIsInit()) return code;

S
Shengliang Guan 已提交
163
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
S
Shengliang Guan 已提交
164 165
  if (pSyncNode == NULL) return code;

S
Shengliang Guan 已提交
166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203
  switch (pMsg->msgType) {
    case TDMT_SYNC_HEARTBEAT:
      code = syncNodeOnHeartbeat(pSyncNode, pMsg);
      break;
    case TDMT_SYNC_HEARTBEAT_REPLY:
      code = syncNodeOnHeartbeatReply(pSyncNode, pMsg);
      break;
    case TDMT_SYNC_TIMEOUT:
      code = syncNodeOnTimeout(pSyncNode, pMsg);
      break;
    case TDMT_SYNC_CLIENT_REQUEST:
      code = syncNodeOnClientRequest(pSyncNode, pMsg, NULL);
      break;
    case TDMT_SYNC_REQUEST_VOTE:
      code = syncNodeOnRequestVote(pSyncNode, pMsg);
      break;
    case TDMT_SYNC_REQUEST_VOTE_REPLY:
      code = syncNodeOnRequestVoteReply(pSyncNode, pMsg);
      break;
    case TDMT_SYNC_APPEND_ENTRIES:
      code = syncNodeOnAppendEntries(pSyncNode, pMsg);
      break;
    case TDMT_SYNC_APPEND_ENTRIES_REPLY:
      code = syncNodeOnAppendEntriesReply(pSyncNode, pMsg);
      break;
    case TDMT_SYNC_SNAPSHOT_SEND:
      code = syncNodeOnSnapshot(pSyncNode, pMsg);
      break;
    case TDMT_SYNC_SNAPSHOT_RSP:
      code = syncNodeOnSnapshotReply(pSyncNode, pMsg);
      break;
    case TDMT_SYNC_LOCAL_CMD:
      code = syncNodeOnLocalCmd(pSyncNode, pMsg);
      break;
    default:
      sError("vgId:%d, failed to process msg:%p since invalid type:%s", pSyncNode->vgId, pMsg,
             TMSG_INFO(pMsg->msgType));
      code = -1;
M
Minghao Li 已提交
204 205
  }

S
Shengliang Guan 已提交
206
  syncNodeRelease(pSyncNode);
S
Shengliang Guan 已提交
207
  return code;
208 209
}

S
Shengliang Guan 已提交
210
int32_t syncLeaderTransfer(int64_t rid) {
S
Shengliang Guan 已提交
211
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
S
Shengliang Guan 已提交
212
  if (pSyncNode == NULL) return -1;
213

S
Shengliang Guan 已提交
214
  int32_t ret = syncNodeLeaderTransfer(pSyncNode);
S
Shengliang Guan 已提交
215
  syncNodeRelease(pSyncNode);
216 217 218
  return ret;
}

M
Minghao Li 已提交
219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234
SyncIndex syncMinMatchIndex(SSyncNode* pSyncNode) {
  SyncIndex minMatchIndex = SYNC_INDEX_INVALID;

  if (pSyncNode->peersNum > 0) {
    minMatchIndex = syncIndexMgrGetIndex(pSyncNode->pMatchIndex, &(pSyncNode->peersId[0]));
  }

  for (int32_t i = 1; i < pSyncNode->peersNum; ++i) {
    SyncIndex matchIndex = syncIndexMgrGetIndex(pSyncNode->pMatchIndex, &(pSyncNode->peersId[i]));
    if (matchIndex < minMatchIndex) {
      minMatchIndex = matchIndex;
    }
  }
  return minMatchIndex;
}

235
int32_t syncBeginSnapshot(int64_t rid, int64_t lastApplyIndex) {
S
Shengliang Guan 已提交
236
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
237
  if (pSyncNode == NULL) {
238
    sError("sync begin snapshot error");
239 240
    return -1;
  }
241

242 243
  int32_t code = 0;

M
Minghao Li 已提交
244
  if (syncNodeIsMnode(pSyncNode)) {
M
Minghao Li 已提交
245 246 247
    // mnode
    int64_t logRetention = SYNC_MNODE_LOG_RETENTION;

M
Minghao Li 已提交
248 249 250
    SyncIndex beginIndex = pSyncNode->pLogStore->syncLogBeginIndex(pSyncNode->pLogStore);
    SyncIndex endIndex = pSyncNode->pLogStore->syncLogEndIndex(pSyncNode->pLogStore);
    int64_t   logNum = endIndex - beginIndex;
M
Minghao Li 已提交
251 252 253
    bool      isEmpty = pSyncNode->pLogStore->syncLogIsEmpty(pSyncNode->pLogStore);

    if (isEmpty || (!isEmpty && logNum < logRetention)) {
S
Shengliang Guan 已提交
254 255
      sNTrace(pSyncNode, "new-snapshot-index:%" PRId64 ", log-num:%" PRId64 ", empty:%d, do not delete wal",
              lastApplyIndex, logNum, isEmpty);
S
Shengliang Guan 已提交
256
      syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
257 258 259
      return 0;
    }

M
Minghao Li 已提交
260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276
    goto _DEL_WAL;

  } else {
    // vnode
    if (pSyncNode->replicaNum > 1) {
      // multi replicas

      if (pSyncNode->state == TAOS_SYNC_STATE_LEADER) {
        pSyncNode->minMatchIndex = syncMinMatchIndex(pSyncNode);

        for (int32_t i = 0; i < pSyncNode->peersNum; ++i) {
          int64_t matchIndex = syncIndexMgrGetIndex(pSyncNode->pMatchIndex, &(pSyncNode->peersId[i]));
          if (lastApplyIndex > matchIndex) {
            do {
              char     host[64];
              uint16_t port;
              syncUtilU642Addr(pSyncNode->peersId[i].addr, host, sizeof(host), &port);
S
Shengliang Guan 已提交
277 278 279 280
              sNTrace(pSyncNode,
                      "new-snapshot-index:%" PRId64 " is greater than match-index:%" PRId64
                      " of %s:%d, do not delete wal",
                      lastApplyIndex, matchIndex, host, port);
M
Minghao Li 已提交
281 282
            } while (0);

S
Shengliang Guan 已提交
283
            syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
284 285 286 287 288 289
            return 0;
          }
        }

      } else if (pSyncNode->state == TAOS_SYNC_STATE_FOLLOWER) {
        if (lastApplyIndex > pSyncNode->minMatchIndex) {
S
Shengliang Guan 已提交
290 291 292
          sNTrace(pSyncNode,
                  "new-snapshot-index:%" PRId64 " is greater than min-match-index:%" PRId64 ", do not delete wal",
                  lastApplyIndex, pSyncNode->minMatchIndex);
S
Shengliang Guan 已提交
293
          syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
294 295 296 297
          return 0;
        }

      } else if (pSyncNode->state == TAOS_SYNC_STATE_CANDIDATE) {
S
Shengliang Guan 已提交
298
        sNTrace(pSyncNode, "new-snapshot-index:%" PRId64 " candidate, do not delete wal", lastApplyIndex);
S
Shengliang Guan 已提交
299
        syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
300 301 302
        return 0;

      } else {
S
Shengliang Guan 已提交
303
        sNTrace(pSyncNode, "new-snapshot-index:%" PRId64 " unknown state, do not delete wal", lastApplyIndex);
S
Shengliang Guan 已提交
304
        syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
305 306 307 308 309 310 311 312 313
        return 0;
      }

      goto _DEL_WAL;

    } else {
      // one replica

      goto _DEL_WAL;
314 315 316
    }
  }

M
Minghao Li 已提交
317
_DEL_WAL:
318

M
Minghao Li 已提交
319
  do {
320 321 322 323
    SyncIndex snapshottingIndex = atomic_load_64(&pSyncNode->snapshottingIndex);

    if (snapshottingIndex == SYNC_INDEX_INVALID) {
      atomic_store_64(&pSyncNode->snapshottingIndex, lastApplyIndex);
M
Minghao Li 已提交
324
      pSyncNode->snapshottingTime = taosGetTimestampMs();
325

M
Minghao Li 已提交
326 327 328
      SSyncLogStoreData* pData = pSyncNode->pLogStore->data;
      code = walBeginSnapshot(pData->pWal, lastApplyIndex);
      if (code == 0) {
S
Shengliang Guan 已提交
329 330
        sNTrace(pSyncNode, "wal snapshot begin, index:%" PRId64 ", last apply index:%" PRId64,
                pSyncNode->snapshottingIndex, lastApplyIndex);
M
Minghao Li 已提交
331
      } else {
S
Shengliang Guan 已提交
332
        sNError(pSyncNode, "wal snapshot begin error since:%s, index:%" PRId64 ", last apply index:%" PRId64,
S
Shengliang Guan 已提交
333
                terrstr(terrno), pSyncNode->snapshottingIndex, lastApplyIndex);
M
Minghao Li 已提交
334 335
        atomic_store_64(&pSyncNode->snapshottingIndex, SYNC_INDEX_INVALID);
      }
336 337

    } else {
S
Shengliang Guan 已提交
338 339
      sNTrace(pSyncNode, "snapshotting for %" PRId64 ", do not delete wal for new-snapshot-index:%" PRId64,
              snapshottingIndex, lastApplyIndex);
340
    }
M
Minghao Li 已提交
341
  } while (0);
342

S
Shengliang Guan 已提交
343
  syncNodeRelease(pSyncNode);
344 345 346 347
  return code;
}

int32_t syncEndSnapshot(int64_t rid) {
S
Shengliang Guan 已提交
348
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
349
  if (pSyncNode == NULL) {
350
    sError("sync end snapshot error");
351 352 353
    return -1;
  }

354 355 356 357
  int32_t code = 0;
  if (atomic_load_64(&pSyncNode->snapshottingIndex) != SYNC_INDEX_INVALID) {
    SSyncLogStoreData* pData = pSyncNode->pLogStore->data;
    code = walEndSnapshot(pData->pWal);
M
Minghao Li 已提交
358
    if (code != 0) {
359
      sNError(pSyncNode, "wal snapshot end error since:%s", terrstr());
S
Shengliang Guan 已提交
360
      syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
361 362
      return -1;
    } else {
S
Shengliang Guan 已提交
363
      sNTrace(pSyncNode, "wal snapshot end, index:%" PRId64, atomic_load_64(&pSyncNode->snapshottingIndex));
M
Minghao Li 已提交
364 365
      atomic_store_64(&pSyncNode->snapshottingIndex, SYNC_INDEX_INVALID);
    }
366
  }
367

S
Shengliang Guan 已提交
368
  syncNodeRelease(pSyncNode);
369 370 371
  return code;
}

M
Minghao Li 已提交
372
int32_t syncStepDown(int64_t rid, SyncTerm newTerm) {
S
Shengliang Guan 已提交
373
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
M
Minghao Li 已提交
374
  if (pSyncNode == NULL) {
375
    sError("sync step down error");
M
Minghao Li 已提交
376 377 378
    return -1;
  }

M
Minghao Li 已提交
379
  syncNodeStepDown(pSyncNode, newTerm);
S
Shengliang Guan 已提交
380
  syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
381
  return 0;
M
Minghao Li 已提交
382 383
}

384 385 386
bool syncIsReadyForRead(int64_t rid) {
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
  if (pSyncNode == NULL) {
387
    sError("sync ready for read error");
388 389
    return false;
  }
M
Minghao Li 已提交
390

391 392 393
  if (pSyncNode->state == TAOS_SYNC_STATE_LEADER && pSyncNode->restoreFinish) {
    syncNodeRelease(pSyncNode);
    return true;
M
Minghao Li 已提交
394 395
  }

396 397
  bool ready = false;
  if (pSyncNode->state == TAOS_SYNC_STATE_LEADER && !pSyncNode->restoreFinish) {
398 399 400
    if (!pSyncNode->pFsm->FpApplyQueueEmptyCb(pSyncNode->pFsm)) {
      // apply queue not empty
      ready = false;
M
Minghao Li 已提交
401

402 403
    } else {
      if (!pSyncNode->pLogStore->syncLogIsEmpty(pSyncNode->pLogStore)) {
404
        SyncIndex       lastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);
405
        SSyncRaftEntry* pEntry = NULL;
406 407 408 409 410 411
        SLRUCache*      pCache = pSyncNode->pLogStore->pCache;
        LRUHandle*      h = taosLRUCacheLookup(pCache, &lastIndex, sizeof(lastIndex));
        int32_t         code = 0;
        if (h) {
          pEntry = (SSyncRaftEntry*)taosLRUCacheValue(pCache, h);
          code = 0;
M
Minghao Li 已提交
412

413
          sNTrace(pSyncNode, "hit cache index:%" PRId64 ", bytes:%u, %p", lastIndex, pEntry->bytes, pEntry);
M
Minghao Li 已提交
414

415 416
        } else {
          sNTrace(pSyncNode, "miss cache index:%" PRId64, lastIndex);
417

418 419
          code = pSyncNode->pLogStore->syncLogGetEntry(pSyncNode->pLogStore, lastIndex, &pEntry);
        }
420

421 422 423 424
        if (code == 0 && pEntry != NULL) {
          if (pEntry->originalRpcType == TDMT_SYNC_NOOP && pEntry->term == pSyncNode->pRaftStore->currentTerm) {
            ready = true;
          }
425

426 427 428
          if (h) {
            taosLRUCacheRelease(pCache, h, false);
          } else {
429
            syncEntryDestroy(pEntry);
430
          }
431
        }
432 433 434 435
      }
    }
  }

436 437 438 439 440 441 442
  if (!ready) {
    if (pSyncNode->state != TAOS_SYNC_STATE_LEADER) {
      terrno = TSDB_CODE_SYN_NOT_LEADER;
    } else {
      terrno = TSDB_CODE_APP_NOT_READY;
    }
  }
443

444 445
  syncNodeRelease(pSyncNode);
  return ready;
M
Minghao Li 已提交
446
}
M
Minghao Li 已提交
447

M
Minghao Li 已提交
448 449
int32_t syncNodeLeaderTransfer(SSyncNode* pSyncNode) {
  if (pSyncNode->peersNum == 0) {
S
Shengliang Guan 已提交
450
    sDebug("vgId:%d, only one replica, cannot leader transfer", pSyncNode->vgId);
M
Minghao Li 已提交
451 452
    terrno = TSDB_CODE_SYN_ONE_REPLICA;
    return -1;
M
Minghao Li 已提交
453
  }
M
Minghao Li 已提交
454

455 456 457 458 459 460
  int32_t ret = 0;
  if (pSyncNode->state == TAOS_SYNC_STATE_LEADER) {
    SNodeInfo newLeader = (pSyncNode->peersNodeInfo)[0];
    ret = syncNodeLeaderTransferTo(pSyncNode, newLeader);
  }

M
Minghao Li 已提交
461
  return ret;
M
Minghao Li 已提交
462 463
}

M
Minghao Li 已提交
464 465
int32_t syncNodeLeaderTransferTo(SSyncNode* pSyncNode, SNodeInfo newLeader) {
  if (pSyncNode->replicaNum == 1) {
S
Shengliang Guan 已提交
466
    sDebug("vgId:%d, only one replica, cannot leader transfer", pSyncNode->vgId);
M
Minghao Li 已提交
467 468
    terrno = TSDB_CODE_SYN_ONE_REPLICA;
    return -1;
M
Minghao Li 已提交
469
  }
470

S
Shengliang Guan 已提交
471
  sNTrace(pSyncNode, "begin leader transfer to %s:%u", newLeader.nodeFqdn, newLeader.nodePort);
M
Minghao Li 已提交
472

473 474 475 476
  SRpcMsg rpcMsg = {0};
  (void)syncBuildLeaderTransfer(&rpcMsg, pSyncNode->vgId);

  SyncLeaderTransfer* pMsg = rpcMsg.pCont;
M
Minghao Li 已提交
477 478 479 480
  pMsg->newLeaderId.addr = syncUtilAddr2U64(newLeader.nodeFqdn, newLeader.nodePort);
  pMsg->newLeaderId.vgId = pSyncNode->vgId;
  pMsg->newNodeInfo = newLeader;

S
Shengliang Guan 已提交
481 482 483
  int32_t ret = syncNodePropose(pSyncNode, &rpcMsg, false);
  rpcFreeCont(rpcMsg.pCont);
  return ret;
M
Minghao Li 已提交
484 485
}

486 487
SSyncState syncGetState(int64_t rid) {
  SSyncState state = {.state = TAOS_SYNC_STATE_ERROR};
M
Minghao Li 已提交
488

S
Shengliang Guan 已提交
489
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
490 491 492 493
  if (pSyncNode != NULL) {
    state.state = pSyncNode->state;
    state.restored = pSyncNode->restoreFinish;
    syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
494 495
  }

496
  return state;
M
Minghao Li 已提交
497 498
}

499
#if 0
500 501 502 503 504
int32_t syncGetSnapshotByIndex(int64_t rid, SyncIndex index, SSnapshot* pSnapshot) {
  if (index < SYNC_INDEX_BEGIN) {
    return -1;
  }

S
Shengliang Guan 已提交
505
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
506 507 508 509 510 511 512 513 514
  if (pSyncNode == NULL) {
    return -1;
  }
  ASSERT(rid == pSyncNode->rid);

  SSyncRaftEntry* pEntry = NULL;
  int32_t         code = pSyncNode->pLogStore->syncLogGetEntry(pSyncNode->pLogStore, index, &pEntry);
  if (code != 0) {
    if (pEntry != NULL) {
B
Benguang Zhao 已提交
515
      syncEntryDestroy(pEntry);
516
    }
S
Shengliang Guan 已提交
517
    syncNodeRelease(pSyncNode);
518 519 520 521 522 523 524 525 526
    return -1;
  }
  ASSERT(pEntry != NULL);

  pSnapshot->data = NULL;
  pSnapshot->lastApplyIndex = index;
  pSnapshot->lastApplyTerm = pEntry->term;
  pSnapshot->lastConfigIndex = syncNodeGetSnapshotConfigIndex(pSyncNode, index);

527
  syncEntryDestroy(pEntry);
S
Shengliang Guan 已提交
528
  syncNodeRelease(pSyncNode);
529 530 531
  return 0;
}

532
int32_t syncGetSnapshotMeta(int64_t rid, struct SSnapshotMeta* sMeta) {
S
Shengliang Guan 已提交
533
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
534 535 536
  if (pSyncNode == NULL) {
    return -1;
  }
M
Minghao Li 已提交
537
  ASSERT(rid == pSyncNode->rid);
538 539
  sMeta->lastConfigIndex = pSyncNode->pRaftCfg->lastConfigIndex;

S
Shengliang Guan 已提交
540
  sTrace("vgId:%d, get snapshot meta, lastConfigIndex:%" PRId64, pSyncNode->vgId, pSyncNode->pRaftCfg->lastConfigIndex);
541

S
Shengliang Guan 已提交
542
  syncNodeRelease(pSyncNode);
543 544 545
  return 0;
}

546
int32_t syncGetSnapshotMetaByIndex(int64_t rid, SyncIndex snapshotIndex, struct SSnapshotMeta* sMeta) {
S
Shengliang Guan 已提交
547
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
548 549 550
  if (pSyncNode == NULL) {
    return -1;
  }
M
Minghao Li 已提交
551
  ASSERT(rid == pSyncNode->rid);
552 553 554 555

  ASSERT(pSyncNode->pRaftCfg->configIndexCount >= 1);
  SyncIndex lastIndex = (pSyncNode->pRaftCfg->configIndexArr)[0];

S
Shengliang Guan 已提交
556
  for (int32_t i = 0; i < pSyncNode->pRaftCfg->configIndexCount; ++i) {
557 558 559 560 561 562
    if ((pSyncNode->pRaftCfg->configIndexArr)[i] > lastIndex &&
        (pSyncNode->pRaftCfg->configIndexArr)[i] <= snapshotIndex) {
      lastIndex = (pSyncNode->pRaftCfg->configIndexArr)[i];
    }
  }
  sMeta->lastConfigIndex = lastIndex;
563
  sTrace("vgId:%d, get snapshot meta by index:%" PRId64 " lcindex:%" PRId64, pSyncNode->vgId, snapshotIndex,
S
Shengliang Guan 已提交
564
         sMeta->lastConfigIndex);
565

S
Shengliang Guan 已提交
566
  syncNodeRelease(pSyncNode);
567 568
  return 0;
}
569
#endif
570

571 572 573 574
SyncIndex syncNodeGetSnapshotConfigIndex(SSyncNode* pSyncNode, SyncIndex snapshotLastApplyIndex) {
  ASSERT(pSyncNode->pRaftCfg->configIndexCount >= 1);
  SyncIndex lastIndex = (pSyncNode->pRaftCfg->configIndexArr)[0];

S
Shengliang Guan 已提交
575
  for (int32_t i = 0; i < pSyncNode->pRaftCfg->configIndexCount; ++i) {
576 577 578 579 580
    if ((pSyncNode->pRaftCfg->configIndexArr)[i] > lastIndex &&
        (pSyncNode->pRaftCfg->configIndexArr)[i] <= snapshotLastApplyIndex) {
      lastIndex = (pSyncNode->pRaftCfg->configIndexArr)[i];
    }
  }
S
Shengliang Guan 已提交
581
  sTrace("vgId:%d, sync get last config index, index:%" PRId64 " lcindex:%" PRId64, pSyncNode->vgId,
S
Shengliang Guan 已提交
582
         snapshotLastApplyIndex, lastIndex);
583 584 585 586

  return lastIndex;
}

587 588
void syncGetRetryEpSet(int64_t rid, SEpSet* pEpSet) {
  pEpSet->numOfEps = 0;
M
Minghao Li 已提交
589

S
Shengliang Guan 已提交
590
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
S
Shengliang Guan 已提交
591
  if (pSyncNode == NULL) return;
M
Minghao Li 已提交
592

S
Shengliang Guan 已提交
593
  for (int32_t i = 0; i < pSyncNode->pRaftCfg->cfg.replicaNum; ++i) {
S
Shengliang Guan 已提交
594 595 596 597 598
    SEp* pEp = &pEpSet->eps[i];
    tstrncpy(pEp->fqdn, pSyncNode->pRaftCfg->cfg.nodeInfo[i].nodeFqdn, TSDB_FQDN_LEN);
    pEp->port = (pSyncNode->pRaftCfg->cfg.nodeInfo)[i].nodePort;
    pEpSet->numOfEps++;
    sInfo("vgId:%d, sync get retry epset, index:%d %s:%d", pSyncNode->vgId, i, pEp->fqdn, pEp->port);
M
Minghao Li 已提交
599
  }
M
Minghao Li 已提交
600 601
  if (pEpSet->numOfEps > 0) {
    pEpSet->inUse = (pSyncNode->pRaftCfg->cfg.myIndex + 1) % pEpSet->numOfEps;
M
Minghao Li 已提交
602 603
  }

S
Shengliang Guan 已提交
604
  sInfo("vgId:%d, sync get retry epset numOfEps:%d inUse:%d", pSyncNode->vgId, pEpSet->numOfEps, pEpSet->inUse);
S
Shengliang Guan 已提交
605
  syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
606 607
}

M
Minghao Li 已提交
608
int32_t syncPropose(int64_t rid, SRpcMsg* pMsg, bool isWeak) {
S
Shengliang Guan 已提交
609
  SSyncNode* pSyncNode = syncNodeAcquire(rid);
610
  if (pSyncNode == NULL) {
611
    sError("sync propose error");
M
Minghao Li 已提交
612
    return -1;
613
  }
614

615
  int32_t ret = syncNodePropose(pSyncNode, pMsg, isWeak);
S
Shengliang Guan 已提交
616
  syncNodeRelease(pSyncNode);
M
Minghao Li 已提交
617 618
  return ret;
}
M
Minghao Li 已提交
619

620
int32_t syncNodePropose(SSyncNode* pSyncNode, SRpcMsg* pMsg, bool isWeak) {
S
Shengliang Guan 已提交
621 622 623 624 625
  if (pSyncNode->state != TAOS_SYNC_STATE_LEADER) {
    terrno = TSDB_CODE_SYN_NOT_LEADER;
    sNError(pSyncNode, "sync propose not leader, %s, type:%s", syncStr(pSyncNode->state), TMSG_INFO(pMsg->msgType));
    return -1;
  }
626

S
Shengliang Guan 已提交
627 628 629 630 631 632 633
  // not restored, vnode enable
  if (!pSyncNode->restoreFinish && pSyncNode->vgId != 1) {
    terrno = TSDB_CODE_SYN_PROPOSE_NOT_READY;
    sNError(pSyncNode, "failed to sync propose since not ready, type:%s, last:%" PRId64 ", cmt:%" PRId64,
            TMSG_INFO(pMsg->msgType), syncNodeGetLastIndex(pSyncNode), pSyncNode->commitIndex);
    return -1;
  }
634

S
Shengliang Guan 已提交
635 636 637
  // optimized one replica
  if (syncNodeIsOptimizedOneReplica(pSyncNode, pMsg)) {
    SyncIndex retIndex;
638
    int32_t   code = syncNodeOnClientRequest(pSyncNode, pMsg, &retIndex);
S
Shengliang Guan 已提交
639 640 641
    if (code == 0) {
      pMsg->info.conn.applyIndex = retIndex;
      pMsg->info.conn.applyTerm = pSyncNode->pRaftStore->currentTerm;
642 643 644
      sTrace("vgId:%d, propose optimized msg, index:%" PRId64 " type:%s", pSyncNode->vgId, retIndex,
             TMSG_INFO(pMsg->msgType));
      return 1;
M
Minghao Li 已提交
645
    } else {
S
Shengliang Guan 已提交
646
      terrno = TSDB_CODE_SYN_INTERNAL_ERROR;
647
      sError("vgId:%d, failed to propose optimized msg, index:%" PRId64 " type:%s", pSyncNode->vgId, retIndex,
S
Shengliang Guan 已提交
648
             TMSG_INFO(pMsg->msgType));
649
      return -1;
650
    }
S
Shengliang Guan 已提交
651
  } else {
S
Shengliang Guan 已提交
652 653
    SRespStub stub = {.createTime = taosGetTimestampMs(), .rpcMsg = *pMsg};
    uint64_t  seqNum = syncRespMgrAdd(pSyncNode->pSyncRespMgr, &stub);
654
    SRpcMsg   rpcMsg = {0};
S
Shengliang Guan 已提交
655
    int32_t   code = syncBuildClientRequest(&rpcMsg, pMsg, seqNum, isWeak, pSyncNode->vgId);
656 657 658 659
    if (code != 0) {
      sError("vgId:%d, failed to propose msg while serialize since %s", pSyncNode->vgId, terrstr());
      (void)syncRespMgrDel(pSyncNode->pSyncRespMgr, seqNum);
      return -1;
M
Minghao Li 已提交
660
    }
661

662 663 664 665 666
    sNTrace(pSyncNode, "propose msg, type:%s", TMSG_INFO(pMsg->msgType));
    code = (*pSyncNode->syncEqMsg)(pSyncNode->msgcb, &rpcMsg);
    if (code != 0) {
      sError("vgId:%d, failed to propose msg while enqueue since %s", pSyncNode->vgId, terrstr());
      (void)syncRespMgrDel(pSyncNode->pSyncRespMgr, seqNum);
M
Minghao Li 已提交
667
    }
M
Minghao Li 已提交
668

669
    return code;
M
Minghao Li 已提交
670
  }
M
Minghao Li 已提交
671 672
}

S
Shengliang Guan 已提交
673
static int32_t syncHbTimerInit(SSyncNode* pSyncNode, SSyncTimer* pSyncTimer, SRaftId destId) {
674 675 676 677 678 679 680 681 682
  pSyncTimer->pTimer = NULL;
  pSyncTimer->counter = 0;
  pSyncTimer->timerMS = pSyncNode->hbBaseLine;
  pSyncTimer->timerCb = syncNodeEqPeerHeartbeatTimer;
  pSyncTimer->destId = destId;
  atomic_store_64(&pSyncTimer->logicClock, 0);
  return 0;
}

S
Shengliang Guan 已提交
683
static int32_t syncHbTimerStart(SSyncNode* pSyncNode, SSyncTimer* pSyncTimer) {
684
  int32_t ret = 0;
S
Shengliang Guan 已提交
685
  if (syncIsInit()) {
M
Minghao Li 已提交
686
    SSyncHbTimerData* pData = &pSyncTimer->hbData;
687 688 689 690
    pData->pSyncNode = pSyncNode;
    pData->pTimer = pSyncTimer;
    pData->destId = pSyncTimer->destId;
    pData->logicClock = pSyncTimer->logicClock;
M
Minghao Li 已提交
691

S
Shengliang Guan 已提交
692
    taosTmrReset(pSyncTimer->timerCb, pSyncTimer->timerMS, pData, syncEnv()->pTimerManager, &pSyncTimer->pTimer);
693 694 695 696 697 698
  } else {
    sError("vgId:%d, start ctrl hb timer error, sync env is stop", pSyncNode->vgId);
  }
  return ret;
}

S
Shengliang Guan 已提交
699
static int32_t syncHbTimerStop(SSyncNode* pSyncNode, SSyncTimer* pSyncTimer) {
700 701 702 703 704 705 706
  int32_t ret = 0;
  atomic_add_fetch_64(&pSyncTimer->logicClock, 1);
  taosTmrStop(pSyncTimer->pTimer);
  pSyncTimer->pTimer = NULL;
  return ret;
}

M
Minghao Li 已提交
707
// open/close --------------
S
Shengliang Guan 已提交
708 709
SSyncNode* syncNodeOpen(SSyncInfo* pSyncInfo) {
  SSyncNode* pSyncNode = taosMemoryCalloc(1, sizeof(SSyncNode));
710 711 712 713
  if (pSyncNode == NULL) {
    terrno = TSDB_CODE_OUT_OF_MEMORY;
    goto _error;
  }
M
Minghao Li 已提交
714

M
Minghao Li 已提交
715 716 717 718
  if (!taosDirExist((char*)(pSyncInfo->path))) {
    if (taosMkDir(pSyncInfo->path) != 0) {
      terrno = TAOS_SYSTEM_ERROR(errno);
      sError("failed to create dir:%s since %s", pSyncInfo->path, terrstr());
719
      goto _error;
M
Minghao Li 已提交
720
    }
721
  }
M
Minghao Li 已提交
722

S
Shengliang Guan 已提交
723
  snprintf(pSyncNode->configPath, sizeof(pSyncNode->configPath), "%s%sraft_config.json", pSyncInfo->path, TD_DIRSEP);
724
  if (!taosCheckExistFile(pSyncNode->configPath)) {
M
Minghao Li 已提交
725
    // create a new raft config file
S
Shengliang Guan 已提交
726
    SRaftCfgMeta meta = {0};
M
Minghao Li 已提交
727
    meta.isStandBy = pSyncInfo->isStandBy;
M
Minghao Li 已提交
728
    meta.snapshotStrategy = pSyncInfo->snapshotStrategy;
729
    meta.lastConfigIndex = SYNC_INDEX_INVALID;
M
Minghao Li 已提交
730
    meta.batchSize = pSyncInfo->batchSize;
S
Shengliang Guan 已提交
731 732
    if (raftCfgCreateFile(&pSyncInfo->syncCfg, meta, pSyncNode->configPath) != 0) {
      sError("vgId:%d, failed to create raft cfg file at %s", pSyncNode->vgId, pSyncNode->configPath);
H
Hongze Cheng 已提交
733
      goto _error;
734
    }
735
    if (pSyncInfo->syncCfg.replicaNum == 0) {
S
Shengliang Guan 已提交
736
      sInfo("vgId:%d, sync config not input", pSyncNode->vgId);
737 738
      pSyncInfo->syncCfg = pSyncNode->pRaftCfg->cfg;
    }
739 740 741
  } else {
    // update syncCfg by raft_config.json
    pSyncNode->pRaftCfg = raftCfgOpen(pSyncNode->configPath);
742
    if (pSyncNode->pRaftCfg == NULL) {
S
Shengliang Guan 已提交
743
      sError("vgId:%d, failed to open raft cfg file at %s", pSyncNode->vgId, pSyncNode->configPath);
H
Hongze Cheng 已提交
744
      goto _error;
745
    }
S
Shengliang Guan 已提交
746 747

    if (pSyncInfo->syncCfg.replicaNum > 0 && syncIsConfigChanged(&pSyncNode->pRaftCfg->cfg, &pSyncInfo->syncCfg)) {
S
Shengliang Guan 已提交
748 749 750 751 752 753
      sInfo("vgId:%d, use sync config from input options and write to cfg file", pSyncNode->vgId);
      pSyncNode->pRaftCfg->cfg = pSyncInfo->syncCfg;
      if (raftCfgPersist(pSyncNode->pRaftCfg) != 0) {
        sError("vgId:%d, failed to persist raft cfg file at %s", pSyncNode->vgId, pSyncNode->configPath);
        goto _error;
      }
S
Shengliang Guan 已提交
754 755 756 757
    } else {
      sInfo("vgId:%d, use sync config from raft cfg file", pSyncNode->vgId);
      pSyncInfo->syncCfg = pSyncNode->pRaftCfg->cfg;
    }
758 759

    raftCfgClose(pSyncNode->pRaftCfg);
760
    pSyncNode->pRaftCfg = NULL;
M
Minghao Li 已提交
761 762
  }

M
Minghao Li 已提交
763
  // init by SSyncInfo
M
Minghao Li 已提交
764
  pSyncNode->vgId = pSyncInfo->vgId;
S
Shengliang Guan 已提交
765 766 767 768 769 770 771
  SSyncCfg* pCfg = &pSyncInfo->syncCfg;
  sDebug("vgId:%d, replica:%d selfIndex:%d", pSyncNode->vgId, pCfg->replicaNum, pCfg->myIndex);
  for (int32_t i = 0; i < pCfg->replicaNum; ++i) {
    SNodeInfo* pNode = &pCfg->nodeInfo[i];
    sDebug("vgId:%d, index:%d ep:%s:%u", pSyncNode->vgId, i, pNode->nodeFqdn, pNode->nodePort);
  }

M
Minghao Li 已提交
772
  memcpy(pSyncNode->path, pSyncInfo->path, sizeof(pSyncNode->path));
S
Shengliang Guan 已提交
773 774 775
  snprintf(pSyncNode->raftStorePath, sizeof(pSyncNode->raftStorePath), "%s%sraft_store.json", pSyncInfo->path,
           TD_DIRSEP);
  snprintf(pSyncNode->configPath, sizeof(pSyncNode->configPath), "%s%sraft_config.json", pSyncInfo->path, TD_DIRSEP);
M
Minghao Li 已提交
776

M
Minghao Li 已提交
777
  pSyncNode->pWal = pSyncInfo->pWal;
S
Shengliang Guan 已提交
778
  pSyncNode->msgcb = pSyncInfo->msgcb;
S
Shengliang Guan 已提交
779 780 781
  pSyncNode->syncSendMSg = pSyncInfo->syncSendMSg;
  pSyncNode->syncEqMsg = pSyncInfo->syncEqMsg;
  pSyncNode->syncEqCtrlMsg = pSyncInfo->syncEqCtrlMsg;
M
Minghao Li 已提交
782

B
Benguang Zhao 已提交
783 784 785 786 787 788 789
  // create raft log ring buffer
  pSyncNode->pLogBuf = syncLogBufferCreate();
  if (pSyncNode->pLogBuf == NULL) {
    sError("failed to init log buffer since %s. vgId:%d", terrstr(), pSyncNode->vgId);
    goto _error;
  }

M
Minghao Li 已提交
790 791
  // init raft config
  pSyncNode->pRaftCfg = raftCfgOpen(pSyncNode->configPath);
792
  if (pSyncNode->pRaftCfg == NULL) {
S
Shengliang Guan 已提交
793
    sError("vgId:%d, failed to open raft cfg file at %s", pSyncNode->vgId, pSyncNode->configPath);
794 795
    goto _error;
  }
M
Minghao Li 已提交
796

M
Minghao Li 已提交
797
  // init internal
M
Minghao Li 已提交
798
  pSyncNode->myNodeInfo = pSyncNode->pRaftCfg->cfg.nodeInfo[pSyncNode->pRaftCfg->cfg.myIndex];
799
  if (!syncUtilNodeInfo2RaftId(&pSyncNode->myNodeInfo, pSyncNode->vgId, &pSyncNode->myRaftId)) {
S
Shengliang Guan 已提交
800
    sError("vgId:%d, failed to determine my raft member id", pSyncNode->vgId);
H
Hongze Cheng 已提交
801
    goto _error;
802
  }
M
Minghao Li 已提交
803

M
Minghao Li 已提交
804
  // init peersNum, peers, peersId
M
Minghao Li 已提交
805
  pSyncNode->peersNum = pSyncNode->pRaftCfg->cfg.replicaNum - 1;
S
Shengliang Guan 已提交
806 807
  int32_t j = 0;
  for (int32_t i = 0; i < pSyncNode->pRaftCfg->cfg.replicaNum; ++i) {
M
Minghao Li 已提交
808 809
    if (i != pSyncNode->pRaftCfg->cfg.myIndex) {
      pSyncNode->peersNodeInfo[j] = pSyncNode->pRaftCfg->cfg.nodeInfo[i];
M
Minghao Li 已提交
810 811 812
      j++;
    }
  }
S
Shengliang Guan 已提交
813
  for (int32_t i = 0; i < pSyncNode->peersNum; ++i) {
814
    if (!syncUtilNodeInfo2RaftId(&pSyncNode->peersNodeInfo[i], pSyncNode->vgId, &pSyncNode->peersId[i])) {
S
Shengliang Guan 已提交
815
      sError("vgId:%d, failed to determine raft member id, peer:%d", pSyncNode->vgId, i);
H
Hongze Cheng 已提交
816
      goto _error;
817
    }
M
Minghao Li 已提交
818
  }
M
Minghao Li 已提交
819

M
Minghao Li 已提交
820
  // init replicaNum, replicasId
M
Minghao Li 已提交
821
  pSyncNode->replicaNum = pSyncNode->pRaftCfg->cfg.replicaNum;
S
Shengliang Guan 已提交
822
  for (int32_t i = 0; i < pSyncNode->pRaftCfg->cfg.replicaNum; ++i) {
823
    if (!syncUtilNodeInfo2RaftId(&pSyncNode->pRaftCfg->cfg.nodeInfo[i], pSyncNode->vgId, &pSyncNode->replicasId[i])) {
S
Shengliang Guan 已提交
824
      sError("vgId:%d, failed to determine raft member id, replica:%d", pSyncNode->vgId, i);
H
Hongze Cheng 已提交
825
      goto _error;
826
    }
M
Minghao Li 已提交
827 828
  }

M
Minghao Li 已提交
829
  // init raft algorithm
M
Minghao Li 已提交
830
  pSyncNode->pFsm = pSyncInfo->pFsm;
831
  pSyncInfo->pFsm = NULL;
M
Minghao Li 已提交
832
  pSyncNode->quorum = syncUtilQuorum(pSyncNode->pRaftCfg->cfg.replicaNum);
M
Minghao Li 已提交
833 834
  pSyncNode->leaderCache = EMPTY_RAFT_ID;

M
Minghao Li 已提交
835
  // init life cycle outside
M
Minghao Li 已提交
836

M
Minghao Li 已提交
837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860
  // TLA+ Spec
  // InitHistoryVars == /\ elections = {}
  //                    /\ allLogs   = {}
  //                    /\ voterLog  = [i \in Server |-> [j \in {} |-> <<>>]]
  // InitServerVars == /\ currentTerm = [i \in Server |-> 1]
  //                   /\ state       = [i \in Server |-> Follower]
  //                   /\ votedFor    = [i \in Server |-> Nil]
  // InitCandidateVars == /\ votesResponded = [i \in Server |-> {}]
  //                      /\ votesGranted   = [i \in Server |-> {}]
  // \* The values nextIndex[i][i] and matchIndex[i][i] are never read, since the
  // \* leader does not send itself messages. It's still easier to include these
  // \* in the functions.
  // InitLeaderVars == /\ nextIndex  = [i \in Server |-> [j \in Server |-> 1]]
  //                   /\ matchIndex = [i \in Server |-> [j \in Server |-> 0]]
  // InitLogVars == /\ log          = [i \in Server |-> << >>]
  //                /\ commitIndex  = [i \in Server |-> 0]
  // Init == /\ messages = [m \in {} |-> 0]
  //         /\ InitHistoryVars
  //         /\ InitServerVars
  //         /\ InitCandidateVars
  //         /\ InitLeaderVars
  //         /\ InitLogVars
  //

M
Minghao Li 已提交
861
  // init TLA+ server vars
M
syncInt  
Minghao Li 已提交
862
  pSyncNode->state = TAOS_SYNC_STATE_FOLLOWER;
M
Minghao Li 已提交
863
  pSyncNode->pRaftStore = raftStoreOpen(pSyncNode->raftStorePath);
864
  if (pSyncNode->pRaftStore == NULL) {
S
Shengliang Guan 已提交
865
    sError("vgId:%d, failed to open raft store at path %s", pSyncNode->vgId, pSyncNode->raftStorePath);
866 867
    goto _error;
  }
M
Minghao Li 已提交
868

M
Minghao Li 已提交
869
  // init TLA+ candidate vars
M
Minghao Li 已提交
870
  pSyncNode->pVotesGranted = voteGrantedCreate(pSyncNode);
871
  if (pSyncNode->pVotesGranted == NULL) {
S
Shengliang Guan 已提交
872
    sError("vgId:%d, failed to create VotesGranted", pSyncNode->vgId);
873 874
    goto _error;
  }
M
Minghao Li 已提交
875
  pSyncNode->pVotesRespond = votesRespondCreate(pSyncNode);
876
  if (pSyncNode->pVotesRespond == NULL) {
S
Shengliang Guan 已提交
877
    sError("vgId:%d, failed to create VotesRespond", pSyncNode->vgId);
878 879
    goto _error;
  }
M
Minghao Li 已提交
880

M
Minghao Li 已提交
881 882
  // init TLA+ leader vars
  pSyncNode->pNextIndex = syncIndexMgrCreate(pSyncNode);
883
  if (pSyncNode->pNextIndex == NULL) {
S
Shengliang Guan 已提交
884
    sError("vgId:%d, failed to create SyncIndexMgr", pSyncNode->vgId);
885 886
    goto _error;
  }
M
Minghao Li 已提交
887
  pSyncNode->pMatchIndex = syncIndexMgrCreate(pSyncNode);
888
  if (pSyncNode->pMatchIndex == NULL) {
S
Shengliang Guan 已提交
889
    sError("vgId:%d, failed to create SyncIndexMgr", pSyncNode->vgId);
890 891
    goto _error;
  }
M
Minghao Li 已提交
892 893 894

  // init TLA+ log vars
  pSyncNode->pLogStore = logStoreCreate(pSyncNode);
895
  if (pSyncNode->pLogStore == NULL) {
S
Shengliang Guan 已提交
896
    sError("vgId:%d, failed to create SyncLogStore", pSyncNode->vgId);
897 898
    goto _error;
  }
899 900 901 902 903

  SyncIndex commitIndex = SYNC_INDEX_INVALID;
  if (pSyncNode->pFsm != NULL && pSyncNode->pFsm->FpGetSnapshotInfo != NULL) {
    SSnapshot snapshot = {0};
    int32_t   code = pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot);
904
    if (code != 0) {
S
Shengliang Guan 已提交
905
      sError("vgId:%d, failed to get snapshot info, code:%d", pSyncNode->vgId, code);
H
Hongze Cheng 已提交
906
      goto _error;
907
    }
908 909
    if (snapshot.lastApplyIndex > commitIndex) {
      commitIndex = snapshot.lastApplyIndex;
S
Shengliang Guan 已提交
910
      sNTrace(pSyncNode, "reset commit index by snapshot");
911 912 913
    }
  }
  pSyncNode->commitIndex = commitIndex;
M
Minghao Li 已提交
914

M
Minghao Li 已提交
915 916 917 918 919
  // timer ms init
  pSyncNode->pingBaseLine = PING_TIMER_MS;
  pSyncNode->electBaseLine = ELECT_TIMER_MS_MIN;
  pSyncNode->hbBaseLine = HEARTBEAT_TIMER_MS;

M
Minghao Li 已提交
920
  // init ping timer
M
Minghao Li 已提交
921
  pSyncNode->pPingTimer = NULL;
M
Minghao Li 已提交
922
  pSyncNode->pingTimerMS = pSyncNode->pingBaseLine;
M
Minghao Li 已提交
923 924
  atomic_store_64(&pSyncNode->pingTimerLogicClock, 0);
  atomic_store_64(&pSyncNode->pingTimerLogicClockUser, 0);
M
Minghao Li 已提交
925
  pSyncNode->FpPingTimerCB = syncNodeEqPingTimer;
M
Minghao Li 已提交
926
  pSyncNode->pingTimerCounter = 0;
M
Minghao Li 已提交
927

M
Minghao Li 已提交
928 929
  // init elect timer
  pSyncNode->pElectTimer = NULL;
M
Minghao Li 已提交
930
  pSyncNode->electTimerMS = syncUtilElectRandomMS(pSyncNode->electBaseLine, 2 * pSyncNode->electBaseLine);
M
Minghao Li 已提交
931
  atomic_store_64(&pSyncNode->electTimerLogicClock, 0);
M
Minghao Li 已提交
932
  pSyncNode->FpElectTimerCB = syncNodeEqElectTimer;
M
Minghao Li 已提交
933 934 935 936
  pSyncNode->electTimerCounter = 0;

  // init heartbeat timer
  pSyncNode->pHeartbeatTimer = NULL;
M
Minghao Li 已提交
937
  pSyncNode->heartbeatTimerMS = pSyncNode->hbBaseLine;
M
Minghao Li 已提交
938 939
  atomic_store_64(&pSyncNode->heartbeatTimerLogicClock, 0);
  atomic_store_64(&pSyncNode->heartbeatTimerLogicClockUser, 0);
M
Minghao Li 已提交
940
  pSyncNode->FpHeartbeatTimerCB = syncNodeEqHeartbeatTimer;
M
Minghao Li 已提交
941 942
  pSyncNode->heartbeatTimerCounter = 0;

943 944 945 946 947
  // init peer heartbeat timer
  for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
    syncHbTimerInit(pSyncNode, &(pSyncNode->peerHeartbeatTimerArr[i]), (pSyncNode->replicasId)[i]);
  }

M
Minghao Li 已提交
948
  // tools
M
Minghao Li 已提交
949
  pSyncNode->pSyncRespMgr = syncRespMgrCreate(pSyncNode, SYNC_RESP_TTL_MS);
950
  if (pSyncNode->pSyncRespMgr == NULL) {
S
Shengliang Guan 已提交
951
    sError("vgId:%d, failed to create SyncRespMgr", pSyncNode->vgId);
952 953
    goto _error;
  }
M
Minghao Li 已提交
954

955 956
  // restore state
  pSyncNode->restoreFinish = false;
957

M
Minghao Li 已提交
958
  // snapshot senders
S
Shengliang Guan 已提交
959
  for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
M
Minghao Li 已提交
960 961 962 963 964 965
    SSyncSnapshotSender* pSender = snapshotSenderCreate(pSyncNode, i);
    // ASSERT(pSender != NULL);
    (pSyncNode->senders)[i] = pSender;
  }

  // snapshot receivers
966
  pSyncNode->pNewNodeReceiver = snapshotReceiverCreate(pSyncNode, EMPTY_RAFT_ID);
M
Minghao Li 已提交
967

M
Minghao Li 已提交
968 969 970
  // is config changing
  pSyncNode->changing = false;

B
Benguang Zhao 已提交
971 972 973
  // replication mgr
  syncNodeLogReplMgrInit(pSyncNode);

M
Minghao Li 已提交
974 975 976
  // peer state
  syncNodePeerStateInit(pSyncNode);

B
Benguang Zhao 已提交
977
  //
M
Minghao Li 已提交
978 979 980
  // min match index
  pSyncNode->minMatchIndex = SYNC_INDEX_INVALID;

M
Minghao Li 已提交
981
  // start in syncNodeStart
M
Minghao Li 已提交
982
  // start raft
M
Minghao Li 已提交
983
  // syncNodeBecomeFollower(pSyncNode);
M
Minghao Li 已提交
984

M
Minghao Li 已提交
985 986
  int64_t timeNow = taosGetTimestampMs();
  pSyncNode->startTime = timeNow;
987
  pSyncNode->leaderTime = timeNow;
M
Minghao Li 已提交
988 989
  pSyncNode->lastReplicateTime = timeNow;

990 991 992
  // snapshotting
  atomic_store_64(&pSyncNode->snapshottingIndex, SYNC_INDEX_INVALID);

B
Benguang Zhao 已提交
993 994 995
  // init log buffer
  if (syncLogBufferInit(pSyncNode->pLogBuf, pSyncNode) < 0) {
    sError("vgId:%d, failed to init raft log buffer since %s", pSyncNode->vgId, terrstr());
996
    goto _error;
B
Benguang Zhao 已提交
997 998
  }

S
Shengliang Guan 已提交
999
  sNTrace(pSyncNode, "sync open");
1000

M
Minghao Li 已提交
1001
  return pSyncNode;
1002 1003 1004

_error:
  if (pSyncInfo->pFsm) {
H
Hongze Cheng 已提交
1005 1006
    taosMemoryFree(pSyncInfo->pFsm);
    pSyncInfo->pFsm = NULL;
1007 1008 1009 1010
  }
  syncNodeClose(pSyncNode);
  pSyncNode = NULL;
  return NULL;
M
Minghao Li 已提交
1011 1012
}

M
Minghao Li 已提交
1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023
void syncNodeMaybeUpdateCommitBySnapshot(SSyncNode* pSyncNode) {
  if (pSyncNode->pFsm != NULL && pSyncNode->pFsm->FpGetSnapshotInfo != NULL) {
    SSnapshot snapshot;
    int32_t   code = pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot);
    ASSERT(code == 0);
    if (snapshot.lastApplyIndex > pSyncNode->commitIndex) {
      pSyncNode->commitIndex = snapshot.lastApplyIndex;
    }
  }
}

B
Benguang Zhao 已提交
1024 1025 1026 1027 1028 1029 1030 1031
int32_t syncNodeRestore(SSyncNode* pSyncNode) {
  ASSERT(pSyncNode->pLogStore != NULL && "log store not created");
  ASSERT(pSyncNode->pLogBuf != NULL && "ring log buffer not created");

  SyncIndex lastVer = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);
  SyncIndex commitIndex = pSyncNode->pLogStore->syncLogCommitIndex(pSyncNode->pLogStore);
  SyncIndex endIndex = pSyncNode->pLogBuf->endIndex;

1032
  ASSERT(endIndex == lastVer + 1);
B
Benguang Zhao 已提交
1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060
  commitIndex = TMAX(pSyncNode->commitIndex, commitIndex);

  if (syncLogBufferCommit(pSyncNode->pLogBuf, pSyncNode, commitIndex) < 0) {
    return -1;
  }

  return 0;
}

int32_t syncNodeStart(SSyncNode* pSyncNode) {
  // start raft
  if (pSyncNode->replicaNum == 1) {
    raftStoreNextTerm(pSyncNode->pRaftStore);
    syncNodeBecomeLeader(pSyncNode, "one replica start");

    // Raft 3.6.2 Committing entries from previous terms
    syncNodeAppendNoop(pSyncNode);
  } else {
    syncNodeBecomeFollower(pSyncNode, "first start");
  }

  int32_t ret = 0;
  ret = syncNodeStartPingTimer(pSyncNode);
  ASSERT(ret == 0);
  return ret;
}

void syncNodeStartOld(SSyncNode* pSyncNode) {
M
Minghao Li 已提交
1061
  // start raft
1062
  if (pSyncNode->replicaNum == 1) {
M
Minghao Li 已提交
1063
    raftStoreNextTerm(pSyncNode->pRaftStore);
1064
    syncNodeBecomeLeader(pSyncNode, "one replica start");
M
format  
Minghao Li 已提交
1065

1066
    // Raft 3.6.2 Committing entries from previous terms
1067 1068
    syncNodeAppendNoop(pSyncNode);
    syncMaybeAdvanceCommitIndex(pSyncNode);
M
Minghao Li 已提交
1069

M
Minghao Li 已提交
1070 1071
  } else {
    syncNodeBecomeFollower(pSyncNode, "first start");
1072 1073
  }

1074 1075 1076
  int32_t ret = 0;
  ret = syncNodeStartPingTimer(pSyncNode);
  ASSERT(ret == 0);
M
Minghao Li 已提交
1077 1078
}

B
Benguang Zhao 已提交
1079
int32_t syncNodeStartStandBy(SSyncNode* pSyncNode) {
M
Minghao Li 已提交
1080 1081 1082 1083 1084 1085 1086 1087
  // state change
  pSyncNode->state = TAOS_SYNC_STATE_FOLLOWER;
  syncNodeStopHeartbeatTimer(pSyncNode);

  // reset elect timer, long enough
  int32_t electMS = TIMER_MAX_MS;
  int32_t ret = syncNodeRestartElectTimer(pSyncNode, electMS);
  ASSERT(ret == 0);
1088

1089 1090 1091
  ret = 0;
  ret = syncNodeStartPingTimer(pSyncNode);
  ASSERT(ret == 0);
B
Benguang Zhao 已提交
1092
  return ret;
M
Minghao Li 已提交
1093 1094
}

M
Minghao Li 已提交
1095 1096 1097 1098 1099 1100 1101 1102
void syncNodePreClose(SSyncNode* pSyncNode) {
  // stop elect timer
  syncNodeStopElectTimer(pSyncNode);

  // stop heartbeat timer
  syncNodeStopHeartbeatTimer(pSyncNode);
}

M
Minghao Li 已提交
1103
void syncNodeClose(SSyncNode* pSyncNode) {
1104 1105 1106
  if (pSyncNode == NULL) {
    return;
  }
M
Minghao Li 已提交
1107 1108
  int32_t ret;

S
Shengliang Guan 已提交
1109
  sNTrace(pSyncNode, "sync close");
M
Minghao Li 已提交
1110

M
Minghao Li 已提交
1111
  ret = raftStoreClose(pSyncNode->pRaftStore);
M
Minghao Li 已提交
1112
  ASSERT(ret == 0);
M
Minghao Li 已提交
1113
  pSyncNode->pRaftStore = NULL;
M
Minghao Li 已提交
1114

B
Benguang Zhao 已提交
1115
  syncNodeLogReplMgrDestroy(pSyncNode);
M
Minghao Li 已提交
1116
  syncRespMgrDestroy(pSyncNode->pSyncRespMgr);
1117
  pSyncNode->pSyncRespMgr = NULL;
M
Minghao Li 已提交
1118
  voteGrantedDestroy(pSyncNode->pVotesGranted);
1119
  pSyncNode->pVotesGranted = NULL;
M
Minghao Li 已提交
1120
  votesRespondDestory(pSyncNode->pVotesRespond);
1121
  pSyncNode->pVotesRespond = NULL;
M
Minghao Li 已提交
1122
  syncIndexMgrDestroy(pSyncNode->pNextIndex);
1123
  pSyncNode->pNextIndex = NULL;
M
Minghao Li 已提交
1124
  syncIndexMgrDestroy(pSyncNode->pMatchIndex);
1125
  pSyncNode->pMatchIndex = NULL;
M
Minghao Li 已提交
1126
  logStoreDestory(pSyncNode->pLogStore);
1127
  pSyncNode->pLogStore = NULL;
B
Benguang Zhao 已提交
1128 1129
  syncLogBufferDestroy(pSyncNode->pLogBuf);
  pSyncNode->pLogBuf = NULL;
M
Minghao Li 已提交
1130
  raftCfgClose(pSyncNode->pRaftCfg);
1131
  pSyncNode->pRaftCfg = NULL;
M
Minghao Li 已提交
1132 1133 1134 1135 1136

  syncNodeStopPingTimer(pSyncNode);
  syncNodeStopElectTimer(pSyncNode);
  syncNodeStopHeartbeatTimer(pSyncNode);

M
Minghao Li 已提交
1137 1138 1139 1140
  if (pSyncNode->pFsm != NULL) {
    taosMemoryFree(pSyncNode->pFsm);
  }

S
Shengliang Guan 已提交
1141
  for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
M
Minghao Li 已提交
1142 1143 1144 1145 1146 1147
    if ((pSyncNode->senders)[i] != NULL) {
      snapshotSenderDestroy((pSyncNode->senders)[i]);
      (pSyncNode->senders)[i] = NULL;
    }
  }

M
Minghao Li 已提交
1148 1149 1150 1151 1152
  if (pSyncNode->pNewNodeReceiver != NULL) {
    snapshotReceiverDestroy(pSyncNode->pNewNodeReceiver);
    pSyncNode->pNewNodeReceiver = NULL;
  }

1153
  taosMemoryFree(pSyncNode);
M
Minghao Li 已提交
1154 1155
}

M
Minghao Li 已提交
1156
ESyncStrategy syncNodeStrategy(SSyncNode* pSyncNode) { return pSyncNode->pRaftCfg->snapshotStrategy; }
M
Minghao Li 已提交
1157

M
Minghao Li 已提交
1158 1159 1160
// timer control --------------
int32_t syncNodeStartPingTimer(SSyncNode* pSyncNode) {
  int32_t ret = 0;
S
Shengliang Guan 已提交
1161 1162
  if (syncIsInit()) {
    taosTmrReset(pSyncNode->FpPingTimerCB, pSyncNode->pingTimerMS, pSyncNode, syncEnv()->pTimerManager,
1163 1164 1165
                 &pSyncNode->pPingTimer);
    atomic_store_64(&pSyncNode->pingTimerLogicClock, pSyncNode->pingTimerLogicClockUser);
  } else {
M
Minghao Li 已提交
1166
    sError("vgId:%d, start ping timer error, sync env is stop", pSyncNode->vgId);
1167
  }
M
Minghao Li 已提交
1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180
  return ret;
}

int32_t syncNodeStopPingTimer(SSyncNode* pSyncNode) {
  int32_t ret = 0;
  atomic_add_fetch_64(&pSyncNode->pingTimerLogicClockUser, 1);
  taosTmrStop(pSyncNode->pPingTimer);
  pSyncNode->pPingTimer = NULL;
  return ret;
}

int32_t syncNodeStartElectTimer(SSyncNode* pSyncNode, int32_t ms) {
  int32_t ret = 0;
S
Shengliang Guan 已提交
1181
  if (syncIsInit()) {
1182
    pSyncNode->electTimerMS = ms;
S
Shengliang Guan 已提交
1183

1184 1185 1186 1187 1188
    int64_t execTime = taosGetTimestampMs() + ms;
    atomic_store_64(&(pSyncNode->electTimerParam.executeTime), execTime);
    atomic_store_64(&(pSyncNode->electTimerParam.logicClock), pSyncNode->electTimerLogicClock);
    pSyncNode->electTimerParam.pSyncNode = pSyncNode;
    pSyncNode->electTimerParam.pData = NULL;
S
Shengliang Guan 已提交
1189

1190
    taosTmrReset(pSyncNode->FpElectTimerCB, pSyncNode->electTimerMS, pSyncNode, syncEnv()->pTimerManager,
1191
                 &pSyncNode->pElectTimer);
1192

1193
  } else {
M
Minghao Li 已提交
1194
    sError("vgId:%d, start elect timer error, sync env is stop", pSyncNode->vgId);
1195
  }
M
Minghao Li 已提交
1196 1197 1198 1199 1200
  return ret;
}

int32_t syncNodeStopElectTimer(SSyncNode* pSyncNode) {
  int32_t ret = 0;
M
Minghao Li 已提交
1201
  atomic_add_fetch_64(&pSyncNode->electTimerLogicClock, 1);
M
Minghao Li 已提交
1202 1203
  taosTmrStop(pSyncNode->pElectTimer);
  pSyncNode->pElectTimer = NULL;
1204

M
Minghao Li 已提交
1205 1206 1207 1208 1209 1210 1211 1212 1213 1214
  return ret;
}

int32_t syncNodeRestartElectTimer(SSyncNode* pSyncNode, int32_t ms) {
  int32_t ret = 0;
  syncNodeStopElectTimer(pSyncNode);
  syncNodeStartElectTimer(pSyncNode, ms);
  return ret;
}

M
Minghao Li 已提交
1215 1216
int32_t syncNodeResetElectTimer(SSyncNode* pSyncNode) {
  int32_t ret = 0;
M
Minghao Li 已提交
1217 1218 1219 1220 1221 1222 1223
  int32_t electMS;

  if (pSyncNode->pRaftCfg->isStandBy) {
    electMS = TIMER_MAX_MS;
  } else {
    electMS = syncUtilElectRandomMS(pSyncNode->electBaseLine, 2 * pSyncNode->electBaseLine);
  }
M
Minghao Li 已提交
1224
  ret = syncNodeRestartElectTimer(pSyncNode, electMS);
1225

S
Shengliang Guan 已提交
1226 1227
  sNTrace(pSyncNode, "reset elect timer, min:%d, max:%d, ms:%d", pSyncNode->electBaseLine, 2 * pSyncNode->electBaseLine,
          electMS);
M
Minghao Li 已提交
1228 1229 1230
  return ret;
}

M
Minghao Li 已提交
1231
static int32_t syncNodeDoStartHeartbeatTimer(SSyncNode* pSyncNode) {
M
Minghao Li 已提交
1232
  int32_t ret = 0;
S
Shengliang Guan 已提交
1233 1234
  if (syncIsInit()) {
    taosTmrReset(pSyncNode->FpHeartbeatTimerCB, pSyncNode->heartbeatTimerMS, pSyncNode, syncEnv()->pTimerManager,
1235 1236 1237
                 &pSyncNode->pHeartbeatTimer);
    atomic_store_64(&pSyncNode->heartbeatTimerLogicClock, pSyncNode->heartbeatTimerLogicClockUser);
  } else {
M
Minghao Li 已提交
1238
    sError("vgId:%d, start heartbeat timer error, sync env is stop", pSyncNode->vgId);
1239
  }
1240

S
Shengliang Guan 已提交
1241
  sNTrace(pSyncNode, "start heartbeat timer, ms:%d", pSyncNode->heartbeatTimerMS);
M
Minghao Li 已提交
1242 1243 1244
  return ret;
}

M
Minghao Li 已提交
1245
int32_t syncNodeStartHeartbeatTimer(SSyncNode* pSyncNode) {
1246
  int32_t ret = 0;
M
Minghao Li 已提交
1247

1248
#if 0
M
Minghao Li 已提交
1249
  pSyncNode->heartbeatTimerMS = pSyncNode->hbBaseLine;
1250 1251
  ret = syncNodeDoStartHeartbeatTimer(pSyncNode);
#endif
1252

S
Shengliang Guan 已提交
1253
  for (int32_t i = 0; i < pSyncNode->peersNum; ++i) {
1254
    SSyncTimer* pSyncTimer = syncNodeGetHbTimer(pSyncNode, &(pSyncNode->peersId[i]));
M
Minghao Li 已提交
1255 1256 1257
    if (pSyncTimer != NULL) {
      syncHbTimerStart(pSyncNode, pSyncTimer);
    }
1258
  }
1259

M
Minghao Li 已提交
1260 1261 1262
  return ret;
}

M
Minghao Li 已提交
1263 1264
int32_t syncNodeStopHeartbeatTimer(SSyncNode* pSyncNode) {
  int32_t ret = 0;
1265 1266

#if 0
M
Minghao Li 已提交
1267 1268 1269
  atomic_add_fetch_64(&pSyncNode->heartbeatTimerLogicClockUser, 1);
  taosTmrStop(pSyncNode->pHeartbeatTimer);
  pSyncNode->pHeartbeatTimer = NULL;
1270
#endif
1271

S
Shengliang Guan 已提交
1272
  for (int32_t i = 0; i < pSyncNode->peersNum; ++i) {
1273
    SSyncTimer* pSyncTimer = syncNodeGetHbTimer(pSyncNode, &(pSyncNode->peersId[i]));
M
Minghao Li 已提交
1274 1275 1276
    if (pSyncTimer != NULL) {
      syncHbTimerStop(pSyncNode, pSyncTimer);
    }
1277
  }
1278

M
Minghao Li 已提交
1279 1280 1281
  return ret;
}

1282 1283 1284 1285 1286 1287
int32_t syncNodeRestartHeartbeatTimer(SSyncNode* pSyncNode) {
  syncNodeStopHeartbeatTimer(pSyncNode);
  syncNodeStartHeartbeatTimer(pSyncNode);
  return 0;
}

M
Minghao Li 已提交
1288 1289 1290
// utils --------------
int32_t syncNodeSendMsgById(const SRaftId* destRaftId, SSyncNode* pSyncNode, SRpcMsg* pMsg) {
  SEpSet epSet;
1291
  syncUtilRaftId2EpSet(destRaftId, &epSet);
S
Shengliang Guan 已提交
1292
  if (pSyncNode->syncSendMSg != NULL) {
M
Minghao Li 已提交
1293 1294 1295
    // htonl
    syncUtilMsgHtoN(pMsg->pCont);

1296
    pMsg->info.noResp = 1;
S
Shengliang Guan 已提交
1297
    pSyncNode->syncSendMSg(&epSet, pMsg);
M
Minghao Li 已提交
1298
  } else {
M
Minghao Li 已提交
1299
    sError("vgId:%d, sync send msg by id error, fp-send-msg is null", pSyncNode->vgId);
S
Shengliang Guan 已提交
1300
    rpcFreeCont(pMsg->pCont);
M
Minghao Li 已提交
1301
    return -1;
M
Minghao Li 已提交
1302
  }
M
Minghao Li 已提交
1303

M
Minghao Li 已提交
1304 1305 1306 1307 1308
  return 0;
}

int32_t syncNodeSendMsgByInfo(const SNodeInfo* nodeInfo, SSyncNode* pSyncNode, SRpcMsg* pMsg) {
  SEpSet epSet;
1309
  syncUtilNodeInfo2EpSet(nodeInfo, &epSet);
S
Shengliang Guan 已提交
1310
  if (pSyncNode->syncSendMSg != NULL) {
M
Minghao Li 已提交
1311 1312 1313
    // htonl
    syncUtilMsgHtoN(pMsg->pCont);

1314
    pMsg->info.noResp = 1;
S
Shengliang Guan 已提交
1315
    pSyncNode->syncSendMSg(&epSet, pMsg);
M
Minghao Li 已提交
1316
  } else {
M
Minghao Li 已提交
1317
    sError("vgId:%d, sync send msg by info error, fp-send-msg is null", pSyncNode->vgId);
M
Minghao Li 已提交
1318
  }
M
Minghao Li 已提交
1319 1320 1321
  return 0;
}

1322
inline bool syncNodeInConfig(SSyncNode* pSyncNode, const SSyncCfg* config) {
1323 1324 1325
  bool b1 = false;
  bool b2 = false;

S
Shengliang Guan 已提交
1326
  for (int32_t i = 0; i < config->replicaNum; ++i) {
1327 1328 1329 1330 1331 1332 1333
    if (strcmp((config->nodeInfo)[i].nodeFqdn, pSyncNode->myNodeInfo.nodeFqdn) == 0 &&
        (config->nodeInfo)[i].nodePort == pSyncNode->myNodeInfo.nodePort) {
      b1 = true;
      break;
    }
  }

S
Shengliang Guan 已提交
1334
  for (int32_t i = 0; i < config->replicaNum; ++i) {
1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348
    SRaftId raftId;
    raftId.addr = syncUtilAddr2U64((config->nodeInfo)[i].nodeFqdn, (config->nodeInfo)[i].nodePort);
    raftId.vgId = pSyncNode->vgId;

    if (syncUtilSameId(&raftId, &(pSyncNode->myRaftId))) {
      b2 = true;
      break;
    }
  }

  ASSERT(b1 == b2);
  return b1;
}

1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361
static bool syncIsConfigChanged(const SSyncCfg* pOldCfg, const SSyncCfg* pNewCfg) {
  if (pOldCfg->replicaNum != pNewCfg->replicaNum) return true;
  if (pOldCfg->myIndex != pNewCfg->myIndex) return true;
  for (int32_t i = 0; i < pOldCfg->replicaNum; ++i) {
    const SNodeInfo* pOldInfo = &pOldCfg->nodeInfo[i];
    const SNodeInfo* pNewInfo = &pNewCfg->nodeInfo[i];
    if (strcmp(pOldInfo->nodeFqdn, pNewInfo->nodeFqdn) != 0) return true;
    if (pOldInfo->nodePort != pNewInfo->nodePort) return true;
  }

  return false;
}

M
Minghao Li 已提交
1362
void syncNodeDoConfigChange(SSyncNode* pSyncNode, SSyncCfg* pNewConfig, SyncIndex lastConfigChangeIndex) {
1363
  SSyncCfg oldConfig = pSyncNode->pRaftCfg->cfg;
1364 1365 1366 1367
  if (!syncIsConfigChanged(&oldConfig, pNewConfig)) {
    sInfo("vgId:1, sync not reconfig since not changed");
    return;
  }
S
Shengliang Guan 已提交
1368

1369
  pSyncNode->pRaftCfg->cfg = *pNewConfig;
1370 1371
  pSyncNode->pRaftCfg->lastConfigIndex = lastConfigChangeIndex;

M
Minghao Li 已提交
1372 1373
  bool IamInOld = syncNodeInConfig(pSyncNode, &oldConfig);
  bool IamInNew = syncNodeInConfig(pSyncNode, pNewConfig);
M
Minghao Li 已提交
1374

M
Minghao Li 已提交
1375 1376
  bool isDrop = false;
  bool isAdd = false;
M
Minghao Li 已提交
1377

M
Minghao Li 已提交
1378 1379 1380 1381
  if (IamInOld && !IamInNew) {
    isDrop = true;
  } else {
    isDrop = false;
1382
  }
1383

M
Minghao Li 已提交
1384 1385 1386 1387 1388
  if (!IamInOld && IamInNew) {
    isAdd = true;
  } else {
    isAdd = false;
  }
M
Minghao Li 已提交
1389

M
Minghao Li 已提交
1390
  // log begin config change
S
Shengliang Guan 已提交
1391 1392 1393 1394 1395
  char oldCfgStr[1024] = {0};
  char newCfgStr[1024] = {0};
  syncCfg2SimpleStr(&oldConfig, oldCfgStr, sizeof(oldCfgStr));
  syncCfg2SimpleStr(pNewConfig, oldCfgStr, sizeof(oldCfgStr));
  sNTrace(pSyncNode, "begin do config change, from %s to %s", oldCfgStr, oldCfgStr);
M
Minghao Li 已提交
1396

M
Minghao Li 已提交
1397 1398
  if (IamInNew) {
    pSyncNode->pRaftCfg->isStandBy = 0;  // change isStandBy to normal
M
Minghao Li 已提交
1399
  }
M
Minghao Li 已提交
1400 1401
  if (isDrop) {
    pSyncNode->pRaftCfg->isStandBy = 1;  // set standby
M
Minghao Li 已提交
1402 1403
  }

M
Minghao Li 已提交
1404
  // add last config index
M
Minghao Li 已提交
1405
  raftCfgAddConfigIndex(pSyncNode->pRaftCfg, lastConfigChangeIndex);
M
Minghao Li 已提交
1406

M
Minghao Li 已提交
1407 1408 1409 1410 1411 1412 1413 1414 1415
  if (IamInNew) {
    //-----------------------------------------
    int32_t ret = 0;

    // save snapshot senders
    int32_t oldReplicaNum = pSyncNode->replicaNum;
    SRaftId oldReplicasId[TSDB_MAX_REPLICA];
    memcpy(oldReplicasId, pSyncNode->replicasId, sizeof(oldReplicasId));
    SSyncSnapshotSender* oldSenders[TSDB_MAX_REPLICA];
S
Shengliang Guan 已提交
1416
    for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
M
Minghao Li 已提交
1417
      oldSenders[i] = (pSyncNode->senders)[i];
S
Shengliang Guan 已提交
1418
      sSTrace(oldSenders[i], "snapshot sender save old");
M
Minghao Li 已提交
1419
    }
1420

M
Minghao Li 已提交
1421 1422
    // init internal
    pSyncNode->myNodeInfo = pSyncNode->pRaftCfg->cfg.nodeInfo[pSyncNode->pRaftCfg->cfg.myIndex];
1423
    syncUtilNodeInfo2RaftId(&pSyncNode->myNodeInfo, pSyncNode->vgId, &pSyncNode->myRaftId);
M
Minghao Li 已提交
1424 1425 1426

    // init peersNum, peers, peersId
    pSyncNode->peersNum = pSyncNode->pRaftCfg->cfg.replicaNum - 1;
S
Shengliang Guan 已提交
1427 1428
    int32_t j = 0;
    for (int32_t i = 0; i < pSyncNode->pRaftCfg->cfg.replicaNum; ++i) {
M
Minghao Li 已提交
1429 1430 1431 1432 1433
      if (i != pSyncNode->pRaftCfg->cfg.myIndex) {
        pSyncNode->peersNodeInfo[j] = pSyncNode->pRaftCfg->cfg.nodeInfo[i];
        j++;
      }
    }
S
Shengliang Guan 已提交
1434
    for (int32_t i = 0; i < pSyncNode->peersNum; ++i) {
1435
      syncUtilNodeInfo2RaftId(&pSyncNode->peersNodeInfo[i], pSyncNode->vgId, &pSyncNode->peersId[i]);
M
Minghao Li 已提交
1436
    }
1437

M
Minghao Li 已提交
1438 1439
    // init replicaNum, replicasId
    pSyncNode->replicaNum = pSyncNode->pRaftCfg->cfg.replicaNum;
S
Shengliang Guan 已提交
1440
    for (int32_t i = 0; i < pSyncNode->pRaftCfg->cfg.replicaNum; ++i) {
1441
      syncUtilNodeInfo2RaftId(&pSyncNode->pRaftCfg->cfg.nodeInfo[i], pSyncNode->vgId, &pSyncNode->replicasId[i]);
M
Minghao Li 已提交
1442
    }
1443

1444 1445 1446
    // update quorum first
    pSyncNode->quorum = syncUtilQuorum(pSyncNode->pRaftCfg->cfg.replicaNum);

M
Minghao Li 已提交
1447 1448 1449 1450
    syncIndexMgrUpdate(pSyncNode->pNextIndex, pSyncNode);
    syncIndexMgrUpdate(pSyncNode->pMatchIndex, pSyncNode);
    voteGrantedUpdate(pSyncNode->pVotesGranted, pSyncNode);
    votesRespondUpdate(pSyncNode->pVotesRespond, pSyncNode);
M
Minghao Li 已提交
1451

M
Minghao Li 已提交
1452
    // reset snapshot senders
1453

M
Minghao Li 已提交
1454
    // clear new
S
Shengliang Guan 已提交
1455
    for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
M
Minghao Li 已提交
1456 1457
      (pSyncNode->senders)[i] = NULL;
    }
M
Minghao Li 已提交
1458

M
Minghao Li 已提交
1459
    // reset new
S
Shengliang Guan 已提交
1460
    for (int32_t i = 0; i < pSyncNode->replicaNum; ++i) {
M
Minghao Li 已提交
1461 1462
      // reset sender
      bool reset = false;
S
Shengliang Guan 已提交
1463
      for (int32_t j = 0; j < TSDB_MAX_REPLICA; ++j) {
M
Minghao Li 已提交
1464 1465 1466 1467
        if (syncUtilSameId(&(pSyncNode->replicasId)[i], &oldReplicasId[j])) {
          char     host[128];
          uint16_t port;
          syncUtilU642Addr((pSyncNode->replicasId)[i].addr, host, sizeof(host), &port);
1468
          sNTrace(pSyncNode, "snapshot sender reset for: %" PRId64 ", newIndex:%d, %s:%d, %p",
S
Shengliang Guan 已提交
1469
                  (pSyncNode->replicasId)[i].addr, i, host, port, oldSenders[j]);
M
Minghao Li 已提交
1470 1471 1472 1473 1474 1475 1476 1477 1478

          (pSyncNode->senders)[i] = oldSenders[j];
          oldSenders[j] = NULL;
          reset = true;

          // reset replicaIndex
          int32_t oldreplicaIndex = (pSyncNode->senders)[i]->replicaIndex;
          (pSyncNode->senders)[i]->replicaIndex = i;

S
Shengliang Guan 已提交
1479 1480
          sNTrace(pSyncNode, "snapshot sender udpate replicaIndex from %d to %d, %s:%d, %p, reset:%d", oldreplicaIndex,
                  i, host, port, (pSyncNode->senders)[i], reset);
M
Minghao Li 已提交
1481
        }
1482 1483
      }
    }
1484

M
Minghao Li 已提交
1485
    // create new
S
Shengliang Guan 已提交
1486
    for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
M
Minghao Li 已提交
1487 1488
      if ((pSyncNode->senders)[i] == NULL) {
        (pSyncNode->senders)[i] = snapshotSenderCreate(pSyncNode, i);
S
Shengliang Guan 已提交
1489
        sSTrace((pSyncNode->senders)[i], "snapshot sender create new");
M
Minghao Li 已提交
1490
      }
1491 1492
    }

M
Minghao Li 已提交
1493
    // free old
S
Shengliang Guan 已提交
1494
    for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
M
Minghao Li 已提交
1495 1496
      if (oldSenders[i] != NULL) {
        snapshotSenderDestroy(oldSenders[i]);
S
Shengliang Guan 已提交
1497
        sNTrace(pSyncNode, "snapshot sender delete old %p replica-index:%d", oldSenders[i], i);
M
Minghao Li 已提交
1498 1499
        oldSenders[i] = NULL;
      }
1500 1501
    }

1502
    // persist cfg
M
Minghao Li 已提交
1503
    raftCfgPersist(pSyncNode->pRaftCfg);
1504

S
Shengliang Guan 已提交
1505
    char tmpbuf[1024] = {0};
1506
    snprintf(tmpbuf, sizeof(tmpbuf), "config change from %d to %d, index:%" PRId64 ", %s  -->  %s",
S
Shengliang Guan 已提交
1507
             oldConfig.replicaNum, pNewConfig->replicaNum, lastConfigChangeIndex, oldCfgStr, newCfgStr);
M
Minghao Li 已提交
1508

M
Minghao Li 已提交
1509 1510 1511
    // change isStandBy to normal (election timeout)
    if (pSyncNode->state == TAOS_SYNC_STATE_LEADER) {
      syncNodeBecomeLeader(pSyncNode, tmpbuf);
1512 1513 1514 1515 1516

      // Raft 3.6.2 Committing entries from previous terms
      syncNodeAppendNoop(pSyncNode);
      syncMaybeAdvanceCommitIndex(pSyncNode);

M
Minghao Li 已提交
1517 1518 1519 1520
    } else {
      syncNodeBecomeFollower(pSyncNode, tmpbuf);
    }
  } else {
1521
    // persist cfg
M
Minghao Li 已提交
1522
    raftCfgPersist(pSyncNode->pRaftCfg);
S
Shengliang Guan 已提交
1523 1524
    sNTrace(pSyncNode, "do not config change from %d to %d, index:%" PRId64 ", %s  -->  %s", oldConfig.replicaNum,
            pNewConfig->replicaNum, lastConfigChangeIndex, oldCfgStr, newCfgStr);
1525
  }
1526

M
Minghao Li 已提交
1527
_END:
M
Minghao Li 已提交
1528
  // log end config change
S
Shengliang Guan 已提交
1529
  sNTrace(pSyncNode, "end do config change, from %s to %s", oldCfgStr, newCfgStr);
M
Minghao Li 已提交
1530 1531
}

M
Minghao Li 已提交
1532 1533 1534 1535
// raft state change --------------
void syncNodeUpdateTerm(SSyncNode* pSyncNode, SyncTerm term) {
  if (term > pSyncNode->pRaftStore->currentTerm) {
    raftStoreSetTerm(pSyncNode->pRaftStore, term);
1536
    char tmpBuf[64];
1537
    snprintf(tmpBuf, sizeof(tmpBuf), "update term to %" PRId64, term);
1538
    syncNodeBecomeFollower(pSyncNode, tmpBuf);
M
Minghao Li 已提交
1539 1540 1541 1542
    raftStoreClearVote(pSyncNode->pRaftStore);
  }
}

1543 1544 1545 1546 1547 1548
void syncNodeUpdateTermWithoutStepDown(SSyncNode* pSyncNode, SyncTerm term) {
  if (term > pSyncNode->pRaftStore->currentTerm) {
    raftStoreSetTerm(pSyncNode->pRaftStore, term);
  }
}

M
Minghao Li 已提交
1549
void syncNodeStepDown(SSyncNode* pSyncNode, SyncTerm newTerm) {
M
Minghao Li 已提交
1550
  if (pSyncNode->pRaftStore->currentTerm > newTerm) {
1551
    sNTrace(pSyncNode, "step down, ignore, new-term:%" PRId64 ", current-term:%" PRId64, newTerm,
S
Shengliang Guan 已提交
1552
            pSyncNode->pRaftStore->currentTerm);
M
Minghao Li 已提交
1553 1554
    return;
  }
M
Minghao Li 已提交
1555 1556

  do {
1557
    sNTrace(pSyncNode, "step down, new-term:%" PRId64 ", current-term:%" PRId64, newTerm,
S
Shengliang Guan 已提交
1558
            pSyncNode->pRaftStore->currentTerm);
M
Minghao Li 已提交
1559 1560 1561 1562 1563
  } while (0);

  if (pSyncNode->pRaftStore->currentTerm < newTerm) {
    raftStoreSetTerm(pSyncNode->pRaftStore, newTerm);
    char tmpBuf[64];
1564
    snprintf(tmpBuf, sizeof(tmpBuf), "step down, update term to %" PRId64, newTerm);
M
Minghao Li 已提交
1565 1566 1567 1568 1569 1570 1571 1572 1573 1574
    syncNodeBecomeFollower(pSyncNode, tmpBuf);
    raftStoreClearVote(pSyncNode->pRaftStore);

  } else {
    if (pSyncNode->state != TAOS_SYNC_STATE_FOLLOWER) {
      syncNodeBecomeFollower(pSyncNode, "step down");
    }
  }
}

1575 1576
void syncNodeLeaderChangeRsp(SSyncNode* pSyncNode) { syncRespCleanRsp(pSyncNode->pSyncRespMgr); }

1577
void syncNodeBecomeFollower(SSyncNode* pSyncNode, const char* debugStr) {
M
Minghao Li 已提交
1578
  // maybe clear leader cache
M
Minghao Li 已提交
1579 1580 1581 1582
  if (pSyncNode->state == TAOS_SYNC_STATE_LEADER) {
    pSyncNode->leaderCache = EMPTY_RAFT_ID;
  }

M
Minghao Li 已提交
1583
  // state change
M
Minghao Li 已提交
1584 1585 1586
  pSyncNode->state = TAOS_SYNC_STATE_FOLLOWER;
  syncNodeStopHeartbeatTimer(pSyncNode);

M
Minghao Li 已提交
1587 1588
  // reset elect timer
  syncNodeResetElectTimer(pSyncNode);
M
Minghao Li 已提交
1589

1590 1591 1592
  // send rsp to client
  syncNodeLeaderChangeRsp(pSyncNode);

1593 1594 1595 1596 1597
  // call back
  if (pSyncNode->pFsm != NULL && pSyncNode->pFsm->FpBecomeFollowerCb != NULL) {
    pSyncNode->pFsm->FpBecomeFollowerCb(pSyncNode->pFsm);
  }

M
Minghao Li 已提交
1598 1599 1600
  // min match index
  pSyncNode->minMatchIndex = SYNC_INDEX_INVALID;

B
Benguang Zhao 已提交
1601 1602 1603
  // reset log buffer
  syncLogBufferReset(pSyncNode->pLogBuf, pSyncNode);

M
Minghao Li 已提交
1604
  // trace log
S
Shengliang Guan 已提交
1605
  sNTrace(pSyncNode, "become follower %s", debugStr);
M
Minghao Li 已提交
1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625
}

// TLA+ Spec
// \* Candidate i transitions to leader.
// BecomeLeader(i) ==
//     /\ state[i] = Candidate
//     /\ votesGranted[i] \in Quorum
//     /\ state'      = [state EXCEPT ![i] = Leader]
//     /\ nextIndex'  = [nextIndex EXCEPT ![i] =
//                          [j \in Server |-> Len(log[i]) + 1]]
//     /\ matchIndex' = [matchIndex EXCEPT ![i] =
//                          [j \in Server |-> 0]]
//     /\ elections'  = elections \cup
//                          {[eterm     |-> currentTerm[i],
//                            eleader   |-> i,
//                            elog      |-> log[i],
//                            evotes    |-> votesGranted[i],
//                            evoterLog |-> voterLog[i]]}
//     /\ UNCHANGED <<messages, currentTerm, votedFor, candidateVars, logVars>>
//
1626
void syncNodeBecomeLeader(SSyncNode* pSyncNode, const char* debugStr) {
1627 1628
  pSyncNode->leaderTime = taosGetTimestampMs();

1629 1630 1631
  // reset restoreFinish
  pSyncNode->restoreFinish = false;

M
Minghao Li 已提交
1632
  // state change
M
Minghao Li 已提交
1633
  pSyncNode->state = TAOS_SYNC_STATE_LEADER;
M
Minghao Li 已提交
1634 1635

  // set leader cache
M
Minghao Li 已提交
1636 1637
  pSyncNode->leaderCache = pSyncNode->myRaftId;

S
Shengliang Guan 已提交
1638
  for (int32_t i = 0; i < pSyncNode->pNextIndex->replicaNum; ++i) {
M
Minghao Li 已提交
1639 1640
    // maybe overwrite myself, no harm
    // just do it!
1641 1642 1643 1644 1645 1646 1647 1648 1649

    // pSyncNode->pNextIndex->index[i] = pSyncNode->pLogStore->getLastIndex(pSyncNode->pLogStore) + 1;

    // maybe wal is deleted
    SyncIndex lastIndex;
    SyncTerm  lastTerm;
    int32_t   code = syncNodeGetLastIndexTerm(pSyncNode, &lastIndex, &lastTerm);
    ASSERT(code == 0);
    pSyncNode->pNextIndex->index[i] = lastIndex + 1;
M
Minghao Li 已提交
1650 1651
  }

S
Shengliang Guan 已提交
1652
  for (int32_t i = 0; i < pSyncNode->pMatchIndex->replicaNum; ++i) {
M
Minghao Li 已提交
1653 1654
    // maybe overwrite myself, no harm
    // just do it!
M
Minghao Li 已提交
1655 1656 1657
    pSyncNode->pMatchIndex->index[i] = SYNC_INDEX_INVALID;
  }

M
Minghao Li 已提交
1658 1659 1660
  // init peer mgr
  syncNodePeerStateInit(pSyncNode);

M
Minghao Li 已提交
1661
#if 0
1662 1663
  // update sender private term
  SSyncSnapshotSender* pMySender = syncNodeGetSnapshotSender(pSyncNode, &(pSyncNode->myRaftId));
1664
  if (pMySender != NULL) {
S
Shengliang Guan 已提交
1665
    for (int32_t i = 0; i < pSyncNode->pMatchIndex->replicaNum; ++i) {
1666 1667 1668
      if ((pSyncNode->senders)[i]->privateTerm > pMySender->privateTerm) {
        pMySender->privateTerm = (pSyncNode->senders)[i]->privateTerm;
      }
1669
    }
1670
    (pMySender->privateTerm) += 100;
1671
  }
M
Minghao Li 已提交
1672
#endif
1673

1674 1675 1676 1677 1678
  // close receiver
  if (snapshotReceiverIsStart(pSyncNode->pNewNodeReceiver)) {
    snapshotReceiverForceStop(pSyncNode->pNewNodeReceiver);
  }

M
Minghao Li 已提交
1679
  // stop elect timer
M
Minghao Li 已提交
1680
  syncNodeStopElectTimer(pSyncNode);
M
Minghao Li 已提交
1681

M
Minghao Li 已提交
1682 1683
  // start heartbeat timer
  syncNodeStartHeartbeatTimer(pSyncNode);
M
Minghao Li 已提交
1684

M
Minghao Li 已提交
1685 1686
  // send heartbeat right now
  syncNodeHeartbeatPeers(pSyncNode);
M
Minghao Li 已提交
1687

1688 1689 1690 1691 1692
  // call back
  if (pSyncNode->pFsm != NULL && pSyncNode->pFsm->FpBecomeLeaderCb != NULL) {
    pSyncNode->pFsm->FpBecomeLeaderCb(pSyncNode->pFsm);
  }

M
Minghao Li 已提交
1693 1694 1695
  // min match index
  pSyncNode->minMatchIndex = SYNC_INDEX_INVALID;

B
Benguang Zhao 已提交
1696 1697 1698
  // reset log buffer
  syncLogBufferReset(pSyncNode->pLogBuf, pSyncNode);

M
Minghao Li 已提交
1699
  // trace log
S
Shengliang Guan 已提交
1700
  sNTrace(pSyncNode, "become leader %s", debugStr);
M
Minghao Li 已提交
1701 1702 1703
}

void syncNodeCandidate2Leader(SSyncNode* pSyncNode) {
M
Minghao Li 已提交
1704 1705
  ASSERT(pSyncNode->state == TAOS_SYNC_STATE_CANDIDATE);
  ASSERT(voteGrantedMajority(pSyncNode->pVotesGranted));
1706
  syncNodeBecomeLeader(pSyncNode, "candidate to leader");
M
Minghao Li 已提交
1707

S
Shengliang Guan 已提交
1708
  sNTrace(pSyncNode, "state change syncNodeCandidate2Leader");
M
Minghao Li 已提交
1709

B
Benguang Zhao 已提交
1710
  int32_t ret = syncNodeAppendNoop(pSyncNode);
1711 1712 1713 1714
  if (ret < 0) {
    sError("vgId:%d, failed to append noop entry since %s", pSyncNode->vgId, terrstr());
  }

B
Benguang Zhao 已提交
1715 1716 1717 1718
  SyncIndex lastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);
  ASSERT(lastIndex >= 0);
  sInfo("vgId:%d, become leader. term: %" PRId64 ", commit index: %" PRId64 ", last index: %" PRId64 "",
        pSyncNode->vgId, pSyncNode->pRaftStore->currentTerm, pSyncNode->commitIndex, lastIndex);
B
Benguang Zhao 已提交
1719 1720 1721 1722 1723 1724 1725
}

void syncNodeCandidate2LeaderOld(SSyncNode* pSyncNode) {
  ASSERT(pSyncNode->state == TAOS_SYNC_STATE_CANDIDATE);
  ASSERT(voteGrantedMajority(pSyncNode->pVotesGranted));
  syncNodeBecomeLeader(pSyncNode, "candidate to leader");

M
Minghao Li 已提交
1726
  // Raft 3.6.2 Committing entries from previous terms
1727 1728
  syncNodeAppendNoop(pSyncNode);
  syncMaybeAdvanceCommitIndex(pSyncNode);
1729 1730

  if (pSyncNode->replicaNum > 1) {
M
Minghao Li 已提交
1731
    syncNodeReplicate(pSyncNode);
1732
  }
M
Minghao Li 已提交
1733 1734
}

M
Minghao Li 已提交
1735 1736
bool syncNodeIsMnode(SSyncNode* pSyncNode) { return (pSyncNode->vgId == 1); }

M
Minghao Li 已提交
1737
int32_t syncNodePeerStateInit(SSyncNode* pSyncNode) {
S
Shengliang Guan 已提交
1738
  for (int32_t i = 0; i < TSDB_MAX_REPLICA; ++i) {
M
Minghao Li 已提交
1739 1740 1741 1742 1743
    pSyncNode->peerStates[i].lastSendIndex = SYNC_INDEX_INVALID;
    pSyncNode->peerStates[i].lastSendTime = 0;
  }

  return 0;
M
Minghao Li 已提交
1744 1745 1746
}

void syncNodeFollower2Candidate(SSyncNode* pSyncNode) {
M
Minghao Li 已提交
1747
  ASSERT(pSyncNode->state == TAOS_SYNC_STATE_FOLLOWER);
M
Minghao Li 已提交
1748
  pSyncNode->state = TAOS_SYNC_STATE_CANDIDATE;
B
Benguang Zhao 已提交
1749 1750 1751
  SyncIndex lastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);
  sInfo("vgId:%d, become candidate from follower. term: %" PRId64 ", commit index: %" PRId64 ", last index: %" PRId64,
        pSyncNode->vgId, pSyncNode->pRaftStore->currentTerm, pSyncNode->commitIndex, lastIndex);
M
Minghao Li 已提交
1752

S
Shengliang Guan 已提交
1753
  sNTrace(pSyncNode, "follower to candidate");
M
Minghao Li 已提交
1754 1755 1756
}

void syncNodeLeader2Follower(SSyncNode* pSyncNode) {
M
Minghao Li 已提交
1757
  ASSERT(pSyncNode->state == TAOS_SYNC_STATE_LEADER);
1758
  syncNodeBecomeFollower(pSyncNode, "leader to follower");
B
Benguang Zhao 已提交
1759 1760 1761 1762
  SyncIndex lastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);
  sInfo("vgId:%d, become follower from leader. term: %" PRId64 ", commit index: %" PRId64 ", last index: %" PRId64,
        pSyncNode->vgId, pSyncNode->pRaftStore->currentTerm, pSyncNode->commitIndex, lastIndex);

S
Shengliang Guan 已提交
1763
  sNTrace(pSyncNode, "leader to follower");
M
Minghao Li 已提交
1764 1765 1766
}

void syncNodeCandidate2Follower(SSyncNode* pSyncNode) {
M
Minghao Li 已提交
1767
  ASSERT(pSyncNode->state == TAOS_SYNC_STATE_CANDIDATE);
1768
  syncNodeBecomeFollower(pSyncNode, "candidate to follower");
B
Benguang Zhao 已提交
1769 1770 1771 1772
  SyncIndex lastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);
  sInfo("vgId:%d, become follower from candidate. term: %" PRId64 ", commit index: %" PRId64 ", last index: %" PRId64,
        pSyncNode->vgId, pSyncNode->pRaftStore->currentTerm, pSyncNode->commitIndex, lastIndex);

S
Shengliang Guan 已提交
1773
  sNTrace(pSyncNode, "candidate to follower");
M
Minghao Li 已提交
1774 1775
}

M
Minghao Li 已提交
1776 1777
// just called by syncNodeVoteForSelf
// need assert
M
Minghao Li 已提交
1778
void syncNodeVoteForTerm(SSyncNode* pSyncNode, SyncTerm term, SRaftId* pRaftId) {
M
Minghao Li 已提交
1779 1780
  ASSERT(term == pSyncNode->pRaftStore->currentTerm);
  ASSERT(!raftStoreHasVoted(pSyncNode->pRaftStore));
M
Minghao Li 已提交
1781 1782 1783 1784

  raftStoreVote(pSyncNode->pRaftStore, pRaftId);
}

M
Minghao Li 已提交
1785
// simulate get vote from outside
M
Minghao Li 已提交
1786
void syncNodeVoteForSelf(SSyncNode* pSyncNode) {
S
Shengliang Guan 已提交
1787
  syncNodeVoteForTerm(pSyncNode, pSyncNode->pRaftStore->currentTerm, &pSyncNode->myRaftId);
M
Minghao Li 已提交
1788

S
Shengliang Guan 已提交
1789 1790
  SRpcMsg rpcMsg = {0};
  int32_t ret = syncBuildRequestVoteReply(&rpcMsg, pSyncNode->vgId);
S
Shengliang Guan 已提交
1791
  if (ret != 0) return;
M
Minghao Li 已提交
1792

S
Shengliang Guan 已提交
1793
  SyncRequestVoteReply* pMsg = rpcMsg.pCont;
M
Minghao Li 已提交
1794 1795 1796 1797 1798 1799 1800
  pMsg->srcId = pSyncNode->myRaftId;
  pMsg->destId = pSyncNode->myRaftId;
  pMsg->term = pSyncNode->pRaftStore->currentTerm;
  pMsg->voteGranted = true;

  voteGrantedVote(pSyncNode->pVotesGranted, pMsg);
  votesRespondAdd(pSyncNode->pVotesRespond, pMsg);
S
Shengliang Guan 已提交
1801
  rpcFreeCont(rpcMsg.pCont);
M
Minghao Li 已提交
1802 1803
}

M
Minghao Li 已提交
1804
// return if has a snapshot
M
Minghao Li 已提交
1805 1806
bool syncNodeHasSnapshot(SSyncNode* pSyncNode) {
  bool      ret = false;
1807
  SSnapshot snapshot = {.data = NULL, .lastApplyIndex = -1, .lastApplyTerm = 0, .lastConfigIndex = -1};
1808 1809
  if (pSyncNode->pFsm->FpGetSnapshotInfo != NULL) {
    pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot);
M
Minghao Li 已提交
1810 1811 1812 1813 1814 1815 1816
    if (snapshot.lastApplyIndex >= SYNC_INDEX_BEGIN) {
      ret = true;
    }
  }
  return ret;
}

M
Minghao Li 已提交
1817 1818
// return max(logLastIndex, snapshotLastIndex)
// if no snapshot and log, return -1
1819
SyncIndex syncNodeGetLastIndex(const SSyncNode* pSyncNode) {
M
Minghao Li 已提交
1820
  SSnapshot snapshot = {.data = NULL, .lastApplyIndex = -1, .lastApplyTerm = 0, .lastConfigIndex = -1};
1821 1822
  if (pSyncNode->pFsm->FpGetSnapshotInfo != NULL) {
    pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot);
M
Minghao Li 已提交
1823 1824 1825 1826 1827 1828 1829
  }
  SyncIndex logLastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);

  SyncIndex lastIndex = logLastIndex > snapshot.lastApplyIndex ? logLastIndex : snapshot.lastApplyIndex;
  return lastIndex;
}

M
Minghao Li 已提交
1830 1831
// return the last term of snapshot and log
// if error, return SYNC_TERM_INVALID (by syncLogLastTerm)
M
Minghao Li 已提交
1832 1833
SyncTerm syncNodeGetLastTerm(SSyncNode* pSyncNode) {
  SyncTerm lastTerm = 0;
M
Minghao Li 已提交
1834 1835
  if (syncNodeHasSnapshot(pSyncNode)) {
    // has snapshot
1836
    SSnapshot snapshot = {.data = NULL, .lastApplyIndex = -1, .lastApplyTerm = 0, .lastConfigIndex = -1};
1837 1838
    if (pSyncNode->pFsm->FpGetSnapshotInfo != NULL) {
      pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot);
M
Minghao Li 已提交
1839 1840
    }

M
Minghao Li 已提交
1841 1842 1843
    SyncIndex logLastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore);
    if (logLastIndex > snapshot.lastApplyIndex) {
      lastTerm = pSyncNode->pLogStore->syncLogLastTerm(pSyncNode->pLogStore);
M
Minghao Li 已提交
1844 1845 1846 1847
    } else {
      lastTerm = snapshot.lastApplyTerm;
    }

M
Minghao Li 已提交
1848
  } else {
M
Minghao Li 已提交
1849 1850
    // no snapshot
    lastTerm = pSyncNode->pLogStore->syncLogLastTerm(pSyncNode->pLogStore);
1851
  }
M
Minghao Li 已提交
1852

M
Minghao Li 已提交
1853 1854 1855 1856 1857 1858 1859
  return lastTerm;
}

// get last index and term along with snapshot
int32_t syncNodeGetLastIndexTerm(SSyncNode* pSyncNode, SyncIndex* pLastIndex, SyncTerm* pLastTerm) {
  *pLastIndex = syncNodeGetLastIndex(pSyncNode);
  *pLastTerm = syncNodeGetLastTerm(pSyncNode);
1860 1861
  return 0;
}
M
Minghao Li 已提交
1862

M
Minghao Li 已提交
1863
// return append-entries first try index
M
Minghao Li 已提交
1864 1865 1866 1867 1868
SyncIndex syncNodeSyncStartIndex(SSyncNode* pSyncNode) {
  SyncIndex syncStartIndex = syncNodeGetLastIndex(pSyncNode) + 1;
  return syncStartIndex;
}

M
Minghao Li 已提交
1869 1870
// if index > 0, return index - 1
// else, return -1
1871 1872 1873 1874 1875 1876 1877 1878 1879
SyncIndex syncNodeGetPreIndex(SSyncNode* pSyncNode, SyncIndex index) {
  SyncIndex preIndex = index - 1;
  if (preIndex < SYNC_INDEX_INVALID) {
    preIndex = SYNC_INDEX_INVALID;
  }

  return preIndex;
}

M
Minghao Li 已提交
1880 1881 1882 1883
// if index < 0, return SYNC_TERM_INVALID
// if index == 0, return 0
// if index > 0, return preTerm
// if error, return SYNC_TERM_INVALID
1884 1885 1886 1887 1888 1889 1890 1891 1892
SyncTerm syncNodeGetPreTerm(SSyncNode* pSyncNode, SyncIndex index) {
  if (index < SYNC_INDEX_BEGIN) {
    return SYNC_TERM_INVALID;
  }

  if (index == SYNC_INDEX_BEGIN) {
    return 0;
  }

1893 1894 1895
  SyncTerm  preTerm = 0;
  SyncIndex preIndex = index - 1;

1896
  SSyncRaftEntry* pPreEntry = NULL;
1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910
  SLRUCache*      pCache = pSyncNode->pLogStore->pCache;
  LRUHandle*      h = taosLRUCacheLookup(pCache, &preIndex, sizeof(preIndex));
  int32_t         code = 0;
  if (h) {
    pPreEntry = (SSyncRaftEntry*)taosLRUCacheValue(pCache, h);
    code = 0;

    sNTrace(pSyncNode, "hit cache index:%" PRId64 ", bytes:%u, %p", preIndex, pPreEntry->bytes, pPreEntry);

  } else {
    sNTrace(pSyncNode, "miss cache index:%" PRId64, preIndex);

    code = pSyncNode->pLogStore->syncLogGetEntry(pSyncNode->pLogStore, preIndex, &pPreEntry);
  }
M
Minghao Li 已提交
1911 1912 1913 1914 1915 1916

  SSnapshot snapshot = {.data = NULL,
                        .lastApplyIndex = SYNC_INDEX_INVALID,
                        .lastApplyTerm = SYNC_TERM_INVALID,
                        .lastConfigIndex = SYNC_INDEX_INVALID};

1917 1918 1919
  if (code == 0) {
    ASSERT(pPreEntry != NULL);
    preTerm = pPreEntry->term;
1920 1921 1922 1923

    if (h) {
      taosLRUCacheRelease(pCache, h, false);
    } else {
1924
      syncEntryDestroy(pPreEntry);
1925 1926
    }

1927 1928
    return preTerm;
  } else {
1929 1930 1931 1932
    if (pSyncNode->pFsm->FpGetSnapshotInfo != NULL) {
      pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot);
      if (snapshot.lastApplyIndex == preIndex) {
        return snapshot.lastApplyTerm;
1933 1934 1935 1936
      }
    }
  }

1937
  sNError(pSyncNode, "sync node get pre term error, index:%" PRId64 ", snap-index:%" PRId64 ", snap-term:%" PRId64,
S
Shengliang Guan 已提交
1938
          index, snapshot.lastApplyIndex, snapshot.lastApplyTerm);
1939 1940
  return SYNC_TERM_INVALID;
}
M
Minghao Li 已提交
1941 1942 1943 1944

// get pre index and term of "index"
int32_t syncNodeGetPreIndexTerm(SSyncNode* pSyncNode, SyncIndex index, SyncIndex* pPreIndex, SyncTerm* pPreTerm) {
  *pPreIndex = syncNodeGetPreIndex(pSyncNode, index);
M
Minghao Li 已提交
1945
  *pPreTerm = syncNodeGetPreTerm(pSyncNode, index);
M
Minghao Li 已提交
1946 1947 1948
  return 0;
}

M
Minghao Li 已提交
1949
static void syncNodeEqPingTimer(void* param, void* tmrId) {
S
Shengliang Guan 已提交
1950
  if (!syncIsInit()) return;
M
Minghao Li 已提交
1951

S
Shengliang Guan 已提交
1952 1953 1954
  SSyncNode* pNode = param;
  if (atomic_load_64(&pNode->pingTimerLogicClockUser) <= atomic_load_64(&pNode->pingTimerLogicClock)) {
    SRpcMsg rpcMsg = {0};
S
Shengliang Guan 已提交
1955
    int32_t code = syncBuildTimeout(&rpcMsg, SYNC_TIMEOUT_PING, atomic_load_64(&pNode->pingTimerLogicClock),
S
Shengliang Guan 已提交
1956 1957
                                    pNode->pingTimerMS, pNode);
    if (code != 0) {
M
Minghao Li 已提交
1958
      sError("failed to build ping msg");
S
Shengliang Guan 已提交
1959 1960
      rpcFreeCont(rpcMsg.pCont);
      return;
M
Minghao Li 已提交
1961
    }
M
Minghao Li 已提交
1962

1963
    sTrace("enqueue ping msg");
S
Shengliang Guan 已提交
1964 1965
    code = pNode->syncEqMsg(pNode->msgcb, &rpcMsg);
    if (code != 0) {
M
Minghao Li 已提交
1966
      sError("failed to sync enqueue ping msg since %s", terrstr());
S
Shengliang Guan 已提交
1967 1968
      rpcFreeCont(rpcMsg.pCont);
      return;
1969
    }
M
Minghao Li 已提交
1970

S
Shengliang Guan 已提交
1971
    taosTmrReset(syncNodeEqPingTimer, pNode->pingTimerMS, pNode, syncEnv()->pTimerManager, &pNode->pPingTimer);
1972
  }
M
Minghao Li 已提交
1973 1974
}

M
Minghao Li 已提交
1975
static void syncNodeEqElectTimer(void* param, void* tmrId) {
S
Shengliang Guan 已提交
1976
  if (!syncIsInit()) return;
M
Minghao Li 已提交
1977

1978
  SSyncNode* pNode = (SSyncNode*)param;
M
Minghao Li 已提交
1979

1980 1981
  if (pNode == NULL) return;
  if (pNode->syncEqMsg == NULL) return;
1982

1983 1984
  int64_t tsNow = taosGetTimestampMs();
  if (tsNow < pNode->electTimerParam.executeTime) return;
M
Minghao Li 已提交
1985

S
Shengliang Guan 已提交
1986
  SRpcMsg rpcMsg = {0};
1987 1988
  int32_t code =
      syncBuildTimeout(&rpcMsg, SYNC_TIMEOUT_ELECTION, pNode->electTimerParam.logicClock, pNode->electTimerMS, pNode);
S
Shengliang Guan 已提交
1989

S
Shengliang Guan 已提交
1990
  if (code != 0) {
M
Minghao Li 已提交
1991
    sError("failed to build elect msg");
1992

S
Shengliang Guan 已提交
1993
    return;
M
Minghao Li 已提交
1994 1995
  }

S
Shengliang Guan 已提交
1996
  SyncTimeout* pTimeout = rpcMsg.pCont;
S
Shengliang Guan 已提交
1997
  sNTrace(pNode, "enqueue elect msg lc:%" PRId64, pTimeout->logicClock);
S
Shengliang Guan 已提交
1998 1999 2000

  code = pNode->syncEqMsg(pNode->msgcb, &rpcMsg);
  if (code != 0) {
M
Minghao Li 已提交
2001
    sError("failed to sync enqueue elect msg since %s", terrstr());
S
Shengliang Guan 已提交
2002
    rpcFreeCont(rpcMsg.pCont);
S
Shengliang Guan 已提交
2003

2004
    return;
M
Minghao Li 已提交
2005 2006 2007
  }
}

M
Minghao Li 已提交
2008
static void syncNodeEqHeartbeatTimer(void* param, void* tmrId) {
S
Shengliang Guan 已提交
2009
  if (!syncIsInit()) return;
2010

S
Shengliang Guan 已提交
2011 2012 2013 2014
  SSyncNode* pNode = param;
  if (pNode->replicaNum > 1) {
    if (atomic_load_64(&pNode->heartbeatTimerLogicClockUser) <= atomic_load_64(&pNode->heartbeatTimerLogicClock)) {
      SRpcMsg rpcMsg = {0};
S
Shengliang Guan 已提交
2015
      int32_t code = syncBuildTimeout(&rpcMsg, SYNC_TIMEOUT_HEARTBEAT, atomic_load_64(&pNode->heartbeatTimerLogicClock),
S
Shengliang Guan 已提交
2016 2017 2018
                                      pNode->heartbeatTimerMS, pNode);

      if (code != 0) {
M
Minghao Li 已提交
2019
        sError("failed to build heartbeat msg");
S
Shengliang Guan 已提交
2020
        return;
2021
      }
M
Minghao Li 已提交
2022

2023
      sTrace("enqueue heartbeat timer");
S
Shengliang Guan 已提交
2024 2025
      code = pNode->syncEqMsg(pNode->msgcb, &rpcMsg);
      if (code != 0) {
M
Minghao Li 已提交
2026
        sError("failed to enqueue heartbeat msg since %s", terrstr());
S
Shengliang Guan 已提交
2027 2028
        rpcFreeCont(rpcMsg.pCont);
        return;
2029
      }
S
Shengliang Guan 已提交
2030 2031 2032 2033

      taosTmrReset(syncNodeEqHeartbeatTimer, pNode->heartbeatTimerMS, pNode, syncEnv()->pTimerManager,
                   &pNode->pHeartbeatTimer);

2034
    } else {
S
Shengliang Guan 已提交
2035 2036
      sTrace("==syncNodeEqHeartbeatTimer== heartbeatTimerLogicClock:%" PRId64 ", heartbeatTimerLogicClockUser:%" PRId64,
             pNode->heartbeatTimerLogicClock, pNode->heartbeatTimerLogicClockUser);
2037
    }
M
Minghao Li 已提交
2038 2039 2040
  }
}

2041 2042 2043 2044 2045
static void syncNodeEqPeerHeartbeatTimer(void* param, void* tmrId) {
  SSyncHbTimerData* pData = (SSyncHbTimerData*)param;
  SSyncNode*        pSyncNode = pData->pSyncNode;
  SSyncTimer*       pSyncTimer = pData->pTimer;

M
Minghao Li 已提交
2046 2047 2048 2049
  if (pSyncNode == NULL) {
    return;
  }

M
Minghao Li 已提交
2050 2051 2052 2053
  if (pSyncNode->state != TAOS_SYNC_STATE_LEADER) {
    return;
  }

M
Minghao Li 已提交
2054 2055 2056 2057
  if (pSyncNode->pRaftStore == NULL) {
    return;
  }

S
Shengliang Guan 已提交
2058
  // sNTrace(pSyncNode, "eq peer hb timer");
2059 2060 2061 2062 2063 2064

  int64_t timerLogicClock = atomic_load_64(&pSyncTimer->logicClock);
  int64_t msgLogicClock = atomic_load_64(&pData->logicClock);

  if (pSyncNode->replicaNum > 1) {
    if (timerLogicClock == msgLogicClock) {
2065
      SRpcMsg rpcMsg = {0};
S
Shengliang Guan 已提交
2066 2067 2068
      (void)syncBuildHeartbeat(&rpcMsg, pSyncNode->vgId);

      SyncHeartbeat* pSyncMsg = rpcMsg.pCont;
2069 2070 2071 2072
      pSyncMsg->srcId = pSyncNode->myRaftId;
      pSyncMsg->destId = pData->destId;
      pSyncMsg->term = pSyncNode->pRaftStore->currentTerm;
      pSyncMsg->commitIndex = pSyncNode->commitIndex;
M
Minghao Li 已提交
2073
      pSyncMsg->minMatchIndex = syncMinMatchIndex(pSyncNode);
2074 2075 2076
      pSyncMsg->privateTerm = 0;

      // send msg
S
Shengliang Guan 已提交
2077
      syncNodeSendHeartbeat(pSyncNode, &pSyncMsg->destId, &rpcMsg);
2078

S
Shengliang Guan 已提交
2079 2080
      if (syncIsInit()) {
        taosTmrReset(syncNodeEqPeerHeartbeatTimer, pSyncTimer->timerMS, pData, syncEnv()->pTimerManager,
2081 2082 2083 2084 2085 2086
                     &pSyncTimer->pTimer);
      } else {
        sError("sync env is stop, syncNodeEqHeartbeatTimer");
      }

    } else {
2087
      sTrace("==syncNodeEqPeerHeartbeatTimer== timerLogicClock:%" PRId64 ", msgLogicClock:%" PRId64 "", timerLogicClock,
2088 2089 2090 2091 2092
             msgLogicClock);
    }
  }
}

2093 2094 2095 2096 2097
static int32_t syncNodeEqNoop(SSyncNode* pNode) {
  if (pNode->state == TAOS_SYNC_STATE_LEADER) {
    terrno = TSDB_CODE_SYN_NOT_LEADER;
    return -1;
  }
M
Minghao Li 已提交
2098

2099 2100 2101 2102
  SyncIndex       index = pNode->pLogStore->syncLogWriteIndex(pNode->pLogStore);
  SyncTerm        term = pNode->pRaftStore->currentTerm;
  SSyncRaftEntry* pEntry = syncEntryBuildNoop(term, index, pNode->vgId);
  if (pEntry == NULL) return -1;
M
Minghao Li 已提交
2103

S
Shengliang Guan 已提交
2104
  SRpcMsg rpcMsg = {0};
S
Shengliang Guan 已提交
2105
  int32_t code = syncBuildClientRequestFromNoopEntry(&rpcMsg, pEntry, pNode->vgId);
2106
  syncEntryDestroy(pEntry);
M
Minghao Li 已提交
2107

2108 2109 2110
  sNTrace(pNode, "propose msg, type:noop");
  code = (*pNode->syncEqMsg)(pNode->msgcb, &rpcMsg);
  if (code != 0) {
M
Minghao Li 已提交
2111
    sError("failed to propose noop msg while enqueue since %s", terrstr());
2112
  }
M
Minghao Li 已提交
2113

2114
  return code;
M
Minghao Li 已提交
2115 2116
}

2117 2118
static void deleteCacheEntry(const void* key, size_t keyLen, void* value) { taosMemoryFree(value); }

2119 2120 2121 2122
int32_t syncCacheEntry(SSyncLogStore* pLogStore, SSyncRaftEntry* pEntry, LRUHandle** h) {
  SSyncLogStoreData* pData = pLogStore->data;
  sNTrace(pData->pSyncNode, "in cache index:%" PRId64 ", bytes:%u, %p", pEntry->index, pEntry->bytes, pEntry);

S
Shengliang Guan 已提交
2123 2124
  int32_t   code = 0;
  int32_t   entryLen = sizeof(*pEntry) + pEntry->dataLen;
2125 2126 2127 2128 2129 2130 2131 2132 2133
  LRUStatus status = taosLRUCacheInsert(pLogStore->pCache, &pEntry->index, sizeof(pEntry->index), pEntry, entryLen,
                                        deleteCacheEntry, h, TAOS_LRU_PRIORITY_LOW);
  if (status != TAOS_LRU_STATUS_OK) {
    code = -1;
  }

  return code;
}

B
Benguang Zhao 已提交
2134 2135 2136 2137 2138 2139 2140 2141 2142 2143
int32_t syncNodeAppend(SSyncNode* ths, SSyncRaftEntry* pEntry) {
  // append to log buffer
  if (syncLogBufferAppend(ths->pLogBuf, ths, pEntry) < 0) {
    sError("vgId:%d, failed to enqueue log buffer. index:%" PRId64 "", ths->vgId, pEntry->index);
    return -1;
  }

  // proceed match index, with replicating on needed
  SyncIndex matchIndex = syncLogBufferProceed(ths->pLogBuf, ths);

B
Benguang Zhao 已提交
2144 2145 2146 2147
  sInfo("vgId:%d, append raft entry. index: %" PRId64 ", term: %" PRId64 " pBuf: [%" PRId64 " %" PRId64 " %" PRId64
        ", %" PRId64 ")",
        ths->vgId, pEntry->index, pEntry->term, ths->pLogBuf->startIndex, ths->pLogBuf->commitIndex,
        ths->pLogBuf->matchIndex, ths->pLogBuf->endIndex);
B
Benguang Zhao 已提交
2148

B
Benguang Zhao 已提交
2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164
  // multi replica
  if (ths->replicaNum > 1) {
    return 0;
  }

  // single replica
  (void)syncNodeUpdateCommitIndex(ths, matchIndex);

  if (syncLogBufferCommit(ths->pLogBuf, ths, ths->commitIndex) < 0) {
    sError("vgId:%d, failed to commit until commitIndex:%" PRId64 "", ths->vgId, ths->commitIndex);
    return -1;
  }

  return 0;
}

M
Minghao Li 已提交
2165
static int32_t syncNodeAppendNoop(SSyncNode* ths) {
B
Benguang Zhao 已提交
2166 2167 2168 2169 2170 2171 2172 2173 2174
  SyncIndex index = syncLogBufferGetEndIndex(ths->pLogBuf);
  SyncTerm  term = ths->pRaftStore->currentTerm;

  SSyncRaftEntry* pEntry = syncEntryBuildNoop(term, index, ths->vgId);
  if (pEntry == NULL) {
    terrno = TSDB_CODE_OUT_OF_MEMORY;
    return -1;
  }

B
Benguang Zhao 已提交
2175 2176
  int32_t ret = syncNodeAppend(ths, pEntry);
  return 0;
B
Benguang Zhao 已提交
2177 2178 2179
}

static int32_t syncNodeAppendNoopOld(SSyncNode* ths) {
M
Minghao Li 已提交
2180 2181
  int32_t ret = 0;

2182
  SyncIndex       index = ths->pLogStore->syncLogWriteIndex(ths->pLogStore);
M
Minghao Li 已提交
2183
  SyncTerm        term = ths->pRaftStore->currentTerm;
M
Minghao Li 已提交
2184
  SSyncRaftEntry* pEntry = syncEntryBuildNoop(term, index, ths->vgId);
M
Minghao Li 已提交
2185
  ASSERT(pEntry != NULL);
M
Minghao Li 已提交
2186

2187 2188
  LRUHandle* h = NULL;

M
Minghao Li 已提交
2189
  if (ths->state == TAOS_SYNC_STATE_LEADER) {
2190
    int32_t code = ths->pLogStore->syncLogAppendEntry(ths->pLogStore, pEntry);
2191
    if (code != 0) {
M
Minghao Li 已提交
2192
      sError("append noop error");
2193 2194
      return -1;
    }
2195 2196

    syncCacheEntry(ths->pLogStore, pEntry, &h);
M
Minghao Li 已提交
2197 2198
  }

2199 2200 2201
  if (h) {
    taosLRUCacheRelease(ths->pLogStore->pCache, h, false);
  } else {
B
Benguang Zhao 已提交
2202
    syncEntryDestroy(pEntry);
2203 2204
  }

M
Minghao Li 已提交
2205 2206 2207
  return ret;
}

S
Shengliang Guan 已提交
2208 2209
int32_t syncNodeOnHeartbeat(SSyncNode* ths, const SRpcMsg* pRpcMsg) {
  SyncHeartbeat* pMsg = pRpcMsg->pCont;
2210 2211
  syncLogRecvHeartbeat(ths, pMsg, "");

2212 2213 2214 2215
  SRpcMsg rpcMsg = {0};
  (void)syncBuildHeartbeatReply(&rpcMsg, ths->vgId);

  SyncHeartbeatReply* pMsgReply = rpcMsg.pCont;
2216 2217 2218 2219
  pMsgReply->destId = pMsg->srcId;
  pMsgReply->srcId = ths->myRaftId;
  pMsgReply->term = ths->pRaftStore->currentTerm;
  pMsgReply->privateTerm = 8864;  // magic number
2220
  pMsgReply->startTime = ths->startTime;
2221

M
Minghao Li 已提交
2222
  if (pMsg->term == ths->pRaftStore->currentTerm && ths->state != TAOS_SYNC_STATE_LEADER) {
2223
    syncNodeResetElectTimer(ths);
M
Minghao Li 已提交
2224
    ths->minMatchIndex = pMsg->minMatchIndex;
2225 2226

    if (ths->state == TAOS_SYNC_STATE_FOLLOWER) {
2227
      // syncNodeFollowerCommit(ths, pMsg->commitIndex);
S
Shengliang Guan 已提交
2228 2229 2230 2231
      SRpcMsg rpcMsgLocalCmd = {0};
      (void)syncBuildLocalCmd(&rpcMsgLocalCmd, ths->vgId);

      SyncLocalCmd* pSyncMsg = rpcMsgLocalCmd.pCont;
2232 2233
      pSyncMsg->cmd = SYNC_LOCAL_CMD_FOLLOWER_CMT;
      pSyncMsg->fcIndex = pMsg->commitIndex;
2234
      SyncIndex fcIndex = pSyncMsg->fcIndex;
2235 2236 2237 2238 2239 2240 2241

      if (ths->syncEqMsg != NULL && ths->msgcb != NULL) {
        int32_t code = ths->syncEqMsg(ths->msgcb, &rpcMsgLocalCmd);
        if (code != 0) {
          sError("vgId:%d, sync enqueue fc-commit msg error, code:%d", ths->vgId, code);
          rpcFreeCont(rpcMsgLocalCmd.pCont);
        } else {
2242
          sTrace("vgId:%d, sync enqueue fc-commit msg, fc-index:%" PRId64, ths->vgId, fcIndex);
2243 2244
        }
      }
2245 2246 2247
    }
  }

M
Minghao Li 已提交
2248
  if (pMsg->term >= ths->pRaftStore->currentTerm && ths->state != TAOS_SYNC_STATE_FOLLOWER) {
2249
    // syncNodeStepDown(ths, pMsg->term);
S
Shengliang Guan 已提交
2250 2251 2252 2253
    SRpcMsg rpcMsgLocalCmd = {0};
    (void)syncBuildLocalCmd(&rpcMsgLocalCmd, ths->vgId);

    SyncLocalCmd* pSyncMsg = rpcMsgLocalCmd.pCont;
2254 2255 2256
    pSyncMsg->cmd = SYNC_LOCAL_CMD_STEP_DOWN;
    pSyncMsg->sdNewTerm = pMsg->term;

S
Shengliang Guan 已提交
2257 2258
    if (ths->syncEqMsg != NULL && ths->msgcb != NULL) {
      int32_t code = ths->syncEqMsg(ths->msgcb, &rpcMsgLocalCmd);
2259 2260 2261 2262
      if (code != 0) {
        sError("vgId:%d, sync enqueue step-down msg error, code:%d", ths->vgId, code);
        rpcFreeCont(rpcMsgLocalCmd.pCont);
      } else {
2263
        sTrace("vgId:%d, sync enqueue step-down msg, new-term: %" PRId64, ths->vgId, pSyncMsg->sdNewTerm);
2264
      }
2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279
    }
  }

  /*
    // htonl
    SMsgHead* pHead = rpcMsg.pCont;
    pHead->contLen = htonl(pHead->contLen);
    pHead->vgId = htonl(pHead->vgId);
  */

  // reply
  syncNodeSendMsgById(&pMsgReply->destId, ths, &rpcMsg);
  return 0;
}

2280
int32_t syncNodeOnHeartbeatReply(SSyncNode* ths, const SRpcMsg* pRpcMsg) {
2281
  SyncHeartbeatReply* pMsg = pRpcMsg->pCont;
B
Benguang Zhao 已提交
2282
  SSyncLogReplMgr*    pMgr = syncNodeGetLogReplMgr(ths, &pMsg->srcId);
2283 2284 2285 2286 2287 2288 2289
  if (pMgr == NULL) {
    sError("vgId:%d, failed to get log repl mgr for the peer at addr 0x016%" PRIx64 "", ths->vgId, pMsg->srcId.addr);
    return -1;
  }
  return syncLogReplMgrProcessHeartbeatReply(pMgr, ths, pMsg);
}

2290
int32_t syncNodeOnHeartbeatReplyOld(SSyncNode* ths, const SRpcMsg* pRpcMsg) {
2291
  SyncHeartbeatReply* pMsg = pRpcMsg->pCont;
2292 2293 2294
  syncLogRecvHeartbeatReply(ths, pMsg, "");

  // update last reply time, make decision whether the other node is alive or not
2295
  syncIndexMgrSetRecvTime(ths->pMatchIndex, &pMsg->destId, pMsg->startTime);
2296 2297 2298
  return 0;
}

S
Shengliang Guan 已提交
2299 2300
int32_t syncNodeOnLocalCmd(SSyncNode* ths, const SRpcMsg* pRpcMsg) {
  SyncLocalCmd* pMsg = pRpcMsg->pCont;
2301 2302
  syncLogRecvLocalCmd(ths, pMsg, "");

M
Minghao Li 已提交
2303 2304 2305
  if (pMsg->cmd == SYNC_LOCAL_CMD_STEP_DOWN) {
    syncNodeStepDown(ths, pMsg->sdNewTerm);

2306 2307 2308
  } else if (pMsg->cmd == SYNC_LOCAL_CMD_FOLLOWER_CMT) {
    syncNodeFollowerCommit(ths, pMsg->fcIndex);

M
Minghao Li 已提交
2309
  } else {
M
Minghao Li 已提交
2310
    sError("error local cmd");
M
Minghao Li 已提交
2311
  }
2312 2313 2314 2315

  return 0;
}

2316 2317 2318 2319 2320 2321
int32_t syncLogToAppendEntries(SSyncNode* pNode, SSyncRaftEntry* pEntry, SyncTerm prevLogTerm, SRpcMsg* pRpcMsg) {
  uint32_t dataLen = pEntry->bytes;
  uint32_t bytes = sizeof(SyncAppendEntries) + dataLen;
  pRpcMsg->contLen = bytes;
  pRpcMsg->pCont = rpcMallocCont(pRpcMsg->contLen);
  if (pRpcMsg->pCont == NULL) {
B
Benguang Zhao 已提交
2322
    terrno = TSDB_CODE_OUT_OF_MEMORY;
2323
    return -1;
B
Benguang Zhao 已提交
2324 2325
  }

2326 2327 2328 2329 2330 2331
  SyncAppendEntries* pMsg = pRpcMsg->pCont;
  pMsg->bytes = pRpcMsg->contLen;
  pMsg->msgType = pRpcMsg->msgType = TDMT_SYNC_APPEND_ENTRIES;
  pMsg->dataLen = dataLen;

  (void)memcpy(pMsg->data, pEntry, dataLen);
B
Benguang Zhao 已提交
2332

B
Benguang Zhao 已提交
2333 2334
  pMsg->prevLogIndex = pEntry->index - 1;
  pMsg->prevLogTerm = prevLogTerm;
B
Benguang Zhao 已提交
2335 2336 2337 2338 2339
  pMsg->vgId = pNode->vgId;
  pMsg->srcId = pNode->myRaftId;
  pMsg->term = pNode->pRaftStore->currentTerm;
  pMsg->commitIndex = pNode->commitIndex;
  pMsg->privateTerm = 0;
2340
  return 0;
B
Benguang Zhao 已提交
2341 2342
}

M
Minghao Li 已提交
2343 2344 2345 2346 2347 2348 2349 2350 2351 2352
// TLA+ Spec
// ClientRequest(i, v) ==
//     /\ state[i] = Leader
//     /\ LET entry == [term  |-> currentTerm[i],
//                      value |-> v]
//            newLog == Append(log[i], entry)
//        IN  log' = [log EXCEPT ![i] = newLog]
//     /\ UNCHANGED <<messages, serverVars, candidateVars,
//                    leaderVars, commitIndex>>
//
M
Minghao Li 已提交
2353

2354
int32_t syncNodeOnClientRequest(SSyncNode* ths, SRpcMsg* pMsg, SyncIndex* pRetIndex) {
S
Shengliang Guan 已提交
2355
  sNTrace(ths, "on client request");
2356

B
Benguang Zhao 已提交
2357 2358
  int32_t code = 0;

B
Benguang Zhao 已提交
2359 2360 2361
  SyncIndex       index = syncLogBufferGetEndIndex(ths->pLogBuf);
  SyncTerm        term = ths->pRaftStore->currentTerm;
  SSyncRaftEntry* pEntry = NULL;
2362 2363 2364 2365
  if (pMsg->msgType == TDMT_SYNC_CLIENT_REQUEST) {
    pEntry = syncEntryBuildFromClientRequest(pMsg->pCont, term, index);
  } else {
    pEntry = syncEntryBuildFromRpcMsg(pMsg, term, index);
B
Benguang Zhao 已提交
2366 2367 2368 2369 2370 2371 2372 2373 2374 2375
  }

  if (ths->state == TAOS_SYNC_STATE_LEADER) {
    if (pRetIndex) {
      (*pRetIndex) = index;
    }

    return syncNodeAppend(ths, pEntry);
  }

B
Benguang Zhao 已提交
2376
  return -1;
B
Benguang Zhao 已提交
2377 2378
}

2379 2380
int32_t syncNodeOnClientRequestOld(SSyncNode* ths, SRpcMsg* pMsg, SyncIndex* pRetIndex) {
  sNTrace(ths, "on client request");
B
Benguang Zhao 已提交
2381

M
Minghao Li 已提交
2382
  int32_t ret = 0;
2383
  int32_t code = 0;
M
Minghao Li 已提交
2384

M
Minghao Li 已提交
2385
  SyncIndex       index = ths->pLogStore->syncLogWriteIndex(ths->pLogStore);
M
Minghao Li 已提交
2386
  SyncTerm        term = ths->pRaftStore->currentTerm;
2387 2388 2389 2390 2391 2392 2393
  SSyncRaftEntry* pEntry;

  if (pMsg->msgType == TDMT_SYNC_CLIENT_REQUEST) {
    pEntry = syncEntryBuildFromClientRequest(pMsg->pCont, term, index);
  } else {
    pEntry = syncEntryBuildFromRpcMsg(pMsg, term, index);
  }
M
Minghao Li 已提交
2394

2395 2396
  LRUHandle* h = NULL;

M
Minghao Li 已提交
2397
  if (ths->state == TAOS_SYNC_STATE_LEADER) {
2398 2399 2400
    // append entry
    code = ths->pLogStore->syncLogAppendEntry(ths->pLogStore, pEntry);
    if (code != 0) {
2401 2402 2403 2404
      if (ths->replicaNum == 1) {
        if (h) {
          taosLRUCacheRelease(ths->pLogStore->pCache, h, false);
        } else {
2405
          syncEntryDestroy(pEntry);
2406
        }
2407

2408 2409 2410 2411
        return -1;

      } else {
        // del resp mgr, call FpCommitCb
2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422
        SFsmCbMeta cbMeta = {
            .index = pEntry->index,
            .lastConfigIndex = SYNC_INDEX_INVALID,
            .isWeak = pEntry->isWeak,
            .code = -1,
            .state = ths->state,
            .seqNum = pEntry->seqNum,
            .term = pEntry->term,
            .currentTerm = ths->pRaftStore->currentTerm,
            .flag = 0,
        };
2423
        ths->pFsm->FpCommitCb(ths->pFsm, pMsg, &cbMeta);
2424 2425 2426 2427

        if (h) {
          taosLRUCacheRelease(ths->pLogStore->pCache, h, false);
        } else {
2428
          syncEntryDestroy(pEntry);
2429 2430
        }

2431 2432
        return -1;
      }
2433
    }
M
Minghao Li 已提交
2434

2435 2436
    syncCacheEntry(ths->pLogStore, pEntry, &h);

2437 2438
    // if mulit replica, start replicate right now
    if (ths->replicaNum > 1) {
M
Minghao Li 已提交
2439
      syncNodeReplicate(ths);
2440
    }
2441

2442 2443
    // if only myself, maybe commit right now
    if (ths->replicaNum == 1) {
2444 2445 2446 2447 2448
      if (syncNodeIsMnode(ths)) {
        syncMaybeAdvanceCommitIndex(ths);
      } else {
        syncOneReplicaAdvance(ths);
      }
2449
    }
M
Minghao Li 已提交
2450 2451
  }

2452 2453 2454 2455 2456 2457 2458 2459
  if (pRetIndex != NULL) {
    if (ret == 0 && pEntry != NULL) {
      *pRetIndex = pEntry->index;
    } else {
      *pRetIndex = SYNC_INDEX_INVALID;
    }
  }

2460 2461 2462
  if (h) {
    taosLRUCacheRelease(ths->pLogStore->pCache, h, false);
  } else {
B
Benguang Zhao 已提交
2463
    syncEntryDestroy(pEntry);
2464 2465
  }

M
Minghao Li 已提交
2466
  return ret;
2467
}
M
Minghao Li 已提交
2468

S
Shengliang Guan 已提交
2469 2470 2471
const char* syncStr(ESyncState state) {
  switch (state) {
    case TAOS_SYNC_STATE_FOLLOWER:
2472
      return "follower";
S
Shengliang Guan 已提交
2473
    case TAOS_SYNC_STATE_CANDIDATE:
2474
      return "candidate";
S
Shengliang Guan 已提交
2475
    case TAOS_SYNC_STATE_LEADER:
2476
      return "leader";
S
Shengliang Guan 已提交
2477
    default:
2478
      return "error";
S
Shengliang Guan 已提交
2479
  }
M
Minghao Li 已提交
2480
}
2481

2482
#if 0
2483
int32_t syncDoLeaderTransfer(SSyncNode* ths, SRpcMsg* pRpcMsg, SSyncRaftEntry* pEntry) {
2484
  if (ths->state != TAOS_SYNC_STATE_FOLLOWER) {
S
Shengliang Guan 已提交
2485
    sNTrace(ths, "I am not follower, can not do leader transfer");
2486 2487
    return 0;
  }
2488 2489

  if (!ths->restoreFinish) {
S
Shengliang Guan 已提交
2490
    sNTrace(ths, "restore not finish, can not do leader transfer");
2491 2492 2493
    return 0;
  }

2494
  if (pEntry->term < ths->pRaftStore->currentTerm) {
2495
    sNTrace(ths, "little term:%" PRId64 ", can not do leader transfer", pEntry->term);
2496 2497 2498 2499
    return 0;
  }

  if (pEntry->index < syncNodeGetLastIndex(ths)) {
S
Shengliang Guan 已提交
2500
    sNTrace(ths, "little index:%" PRId64 ", can not do leader transfer", pEntry->index);
2501 2502 2503
    return 0;
  }

2504 2505
  /*
    if (ths->vgId > 1) {
S
Shengliang Guan 已提交
2506
      sNTrace(ths, "I am vnode, can not do leader transfer");
2507 2508 2509 2510
      return 0;
    }
  */

2511
  SyncLeaderTransfer* pSyncLeaderTransfer = pRpcMsg->pCont;
S
Shengliang Guan 已提交
2512
  sNTrace(ths, "do leader transfer, index:%" PRId64, pEntry->index);
M
Minghao Li 已提交
2513

M
Minghao Li 已提交
2514 2515 2516
  bool sameId = syncUtilSameId(&(pSyncLeaderTransfer->newLeaderId), &(ths->myRaftId));
  bool sameNodeInfo = strcmp(pSyncLeaderTransfer->newNodeInfo.nodeFqdn, ths->myNodeInfo.nodeFqdn) == 0 &&
                      pSyncLeaderTransfer->newNodeInfo.nodePort == ths->myNodeInfo.nodePort;
M
Minghao Li 已提交
2517

M
Minghao Li 已提交
2518 2519
  bool same = sameId || sameNodeInfo;
  if (same) {
M
Minghao Li 已提交
2520 2521 2522 2523
    // reset elect timer now!
    int32_t electMS = 1;
    int32_t ret = syncNodeRestartElectTimer(ths, electMS);
    ASSERT(ret == 0);
M
Minghao Li 已提交
2524

2525
    sNTrace(ths, "maybe leader transfer to %s:%d %" PRId64, pSyncLeaderTransfer->newNodeInfo.nodeFqdn,
S
Shengliang Guan 已提交
2526
            pSyncLeaderTransfer->newNodeInfo.nodePort, pSyncLeaderTransfer->newLeaderId.addr);
2527 2528
  }

M
Minghao Li 已提交
2529
  if (ths->pFsm->FpLeaderTransferCb != NULL) {
S
Shengliang Guan 已提交
2530
    SFsmCbMeta cbMeta = {
S
Shengliang Guan 已提交
2531 2532 2533 2534 2535 2536 2537 2538 2539
        .code = 0,
        .currentTerm = ths->pRaftStore->currentTerm,
        .flag = 0,
        .index = pEntry->index,
        .lastConfigIndex = syncNodeGetSnapshotConfigIndex(ths, pEntry->index),
        .isWeak = pEntry->isWeak,
        .seqNum = pEntry->seqNum,
        .state = ths->state,
        .term = pEntry->term,
S
Shengliang Guan 已提交
2540 2541
    };
    ths->pFsm->FpLeaderTransferCb(ths->pFsm, pRpcMsg, &cbMeta);
2542 2543
  }

2544 2545 2546
  return 0;
}

2547 2548
#endif

2549
int32_t syncNodeUpdateNewConfigIndex(SSyncNode* ths, SSyncCfg* pNewCfg) {
S
Shengliang Guan 已提交
2550
  for (int32_t i = 0; i < pNewCfg->replicaNum; ++i) {
2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563
    SRaftId raftId;
    raftId.addr = syncUtilAddr2U64((pNewCfg->nodeInfo)[i].nodeFqdn, (pNewCfg->nodeInfo)[i].nodePort);
    raftId.vgId = ths->vgId;

    if (syncUtilSameId(&(ths->myRaftId), &raftId)) {
      pNewCfg->myIndex = i;
      return 0;
    }
  }

  return -1;
}

2564
bool syncNodeIsOptimizedOneReplica(SSyncNode* ths, SRpcMsg* pMsg) {
2565
  // return false;
2566 2567 2568
  return (ths->replicaNum == 1 && syncUtilUserCommit(pMsg->msgType) && ths->vgId != 1);
}

M
Minghao Li 已提交
2569
int32_t syncNodeDoCommit(SSyncNode* ths, SyncIndex beginIndex, SyncIndex endIndex, uint64_t flag) {
2570 2571 2572 2573
  if (beginIndex > endIndex) {
    return 0;
  }

M
Minghao Li 已提交
2574 2575 2576 2577 2578 2579 2580 2581 2582
  if (ths == NULL) {
    return -1;
  }

  if (ths->pFsm != NULL && ths->pFsm->FpGetSnapshotInfo != NULL) {
    // advance commit index to sanpshot first
    SSnapshot snapshot = {0};
    ths->pFsm->FpGetSnapshotInfo(ths->pFsm, &snapshot);
    if (snapshot.lastApplyIndex >= 0 && snapshot.lastApplyIndex >= beginIndex) {
S
Shengliang Guan 已提交
2583
      sNTrace(ths, "commit by snapshot from index:%" PRId64 " to index:%" PRId64, beginIndex, snapshot.lastApplyIndex);
2584

M
Minghao Li 已提交
2585 2586 2587
      // update begin index
      beginIndex = snapshot.lastApplyIndex + 1;
    }
2588 2589
  }

2590 2591
  int32_t    code = 0;
  ESyncState state = flag;
M
Minghao Li 已提交
2592

S
Shengliang Guan 已提交
2593
  sNTrace(ths, "commit by wal from index:%" PRId64 " to index:%" PRId64, beginIndex, endIndex);
2594 2595 2596 2597 2598 2599

  // execute fsm
  if (ths->pFsm != NULL) {
    for (SyncIndex i = beginIndex; i <= endIndex; ++i) {
      if (i != SYNC_INDEX_INVALID) {
        SSyncRaftEntry* pEntry;
2600 2601 2602 2603
        SLRUCache*      pCache = ths->pLogStore->pCache;
        LRUHandle*      h = taosLRUCacheLookup(pCache, &i, sizeof(i));
        if (h) {
          pEntry = (SSyncRaftEntry*)taosLRUCacheValue(pCache, h);
2604 2605 2606

          sNTrace(ths, "hit cache index:%" PRId64 ", bytes:%u, %p", i, pEntry->bytes, pEntry);

2607
        } else {
2608 2609
          sNTrace(ths, "miss cache index:%" PRId64, i);

2610
          code = ths->pLogStore->syncLogGetEntry(ths->pLogStore, i, &pEntry);
M
Minghao Li 已提交
2611 2612 2613
          // ASSERT(code == 0);
          // ASSERT(pEntry != NULL);
          if (code != 0 || pEntry == NULL) {
S
Shengliang Guan 已提交
2614
            sNError(ths, "get log entry error");
2615
            sFatal("vgId:%d, get log entry %" PRId64 " error when commit since %s", ths->vgId, i, terrstr());
M
Minghao Li 已提交
2616 2617
            continue;
          }
2618
        }
2619

2620
        SRpcMsg rpcMsg = {0};
2621 2622
        syncEntry2OriginalRpc(pEntry, &rpcMsg);

2623
        // user commit
2624 2625
        if ((ths->pFsm->FpCommitCb != NULL) && syncUtilUserCommit(pEntry->originalRpcType)) {
          bool internalExecute = true;
S
Shengliang Guan 已提交
2626
          if ((ths->replicaNum == 1) && ths->restoreFinish && ths->vgId != 1) {
2627 2628 2629
            internalExecute = false;
          }

S
Shengliang Guan 已提交
2630
          sNTrace(ths, "commit index:%" PRId64 ", internal:%d", i, internalExecute);
2631

2632 2633
          // execute fsm in apply thread, or execute outside syncPropose
          if (internalExecute) {
S
Shengliang Guan 已提交
2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645
            SFsmCbMeta cbMeta = {
                .index = pEntry->index,
                .lastConfigIndex = syncNodeGetSnapshotConfigIndex(ths, pEntry->index),
                .isWeak = pEntry->isWeak,
                .code = 0,
                .state = ths->state,
                .seqNum = pEntry->seqNum,
                .term = pEntry->term,
                .currentTerm = ths->pRaftStore->currentTerm,
                .flag = flag,
            };

S
Shengliang Guan 已提交
2646
            syncRespMgrGetAndDel(ths->pSyncRespMgr, cbMeta.seqNum, &rpcMsg.info);
S
Shengliang Guan 已提交
2647
            ths->pFsm->FpCommitCb(ths->pFsm, &rpcMsg, &cbMeta);
M
Minghao Li 已提交
2648
          }
2649
        }
2650

2651 2652
#if 0
        // execute in pre-commit
M
Minghao Li 已提交
2653
        // leader transfer
2654 2655 2656
        if (pEntry->originalRpcType == TDMT_SYNC_LEADER_TRANSFER) {
          code = syncDoLeaderTransfer(ths, &rpcMsg, pEntry);
          ASSERT(code == 0);
2657
        }
2658
#endif
2659 2660

        // restore finish
2661
        // if only snapshot, a noop entry will be append, so syncLogLastIndex is always ok
2662 2663 2664 2665 2666 2667
        if (pEntry->index == ths->pLogStore->syncLogLastIndex(ths->pLogStore)) {
          if (ths->restoreFinish == false) {
            if (ths->pFsm->FpRestoreFinishCb != NULL) {
              ths->pFsm->FpRestoreFinishCb(ths->pFsm);
            }
            ths->restoreFinish = true;
M
Minghao Li 已提交
2668

2669
            int64_t restoreDelay = taosGetTimestampMs() - ths->leaderTime;
S
Shengliang Guan 已提交
2670
            sNTrace(ths, "restore finish, index:%" PRId64 ", elapsed:%" PRId64 " ms", pEntry->index, restoreDelay);
2671 2672 2673 2674
          }
        }

        rpcFreeCont(rpcMsg.pCont);
2675 2676 2677
        if (h) {
          taosLRUCacheRelease(pCache, h, false);
        } else {
B
Benguang Zhao 已提交
2678
          syncEntryDestroy(pEntry);
2679
        }
2680 2681 2682 2683
      }
    }
  }
  return 0;
2684 2685 2686
}

bool syncNodeInRaftGroup(SSyncNode* ths, SRaftId* pRaftId) {
S
Shengliang Guan 已提交
2687
  for (int32_t i = 0; i < ths->replicaNum; ++i) {
2688 2689 2690 2691 2692
    if (syncUtilSameId(&((ths->replicasId)[i]), pRaftId)) {
      return true;
    }
  }
  return false;
M
Minghao Li 已提交
2693 2694 2695 2696
}

SSyncSnapshotSender* syncNodeGetSnapshotSender(SSyncNode* ths, SRaftId* pDestId) {
  SSyncSnapshotSender* pSender = NULL;
S
Shengliang Guan 已提交
2697
  for (int32_t i = 0; i < ths->replicaNum; ++i) {
M
Minghao Li 已提交
2698 2699 2700 2701 2702
    if (syncUtilSameId(pDestId, &((ths->replicasId)[i]))) {
      pSender = (ths->senders)[i];
    }
  }
  return pSender;
M
Minghao Li 已提交
2703
}
M
Minghao Li 已提交
2704

2705 2706
SSyncTimer* syncNodeGetHbTimer(SSyncNode* ths, SRaftId* pDestId) {
  SSyncTimer* pTimer = NULL;
S
Shengliang Guan 已提交
2707
  for (int32_t i = 0; i < ths->replicaNum; ++i) {
2708 2709 2710 2711 2712 2713 2714
    if (syncUtilSameId(pDestId, &((ths->replicasId)[i]))) {
      pTimer = &((ths->peerHeartbeatTimerArr)[i]);
    }
  }
  return pTimer;
}

M
Minghao Li 已提交
2715 2716
SPeerState* syncNodeGetPeerState(SSyncNode* ths, const SRaftId* pDestId) {
  SPeerState* pState = NULL;
S
Shengliang Guan 已提交
2717
  for (int32_t i = 0; i < ths->replicaNum; ++i) {
M
Minghao Li 已提交
2718 2719 2720 2721 2722 2723 2724 2725 2726
    if (syncUtilSameId(pDestId, &((ths->replicasId)[i]))) {
      pState = &((ths->peerStates)[i]);
    }
  }
  return pState;
}

bool syncNodeNeedSendAppendEntries(SSyncNode* ths, const SRaftId* pDestId, const SyncAppendEntries* pMsg) {
  SPeerState* pState = syncNodeGetPeerState(ths, pDestId);
M
Minghao Li 已提交
2727
  if (pState == NULL) {
2728
    sError("vgId:%d, replica maybe dropped", ths->vgId);
M
Minghao Li 已提交
2729 2730
    return false;
  }
M
Minghao Li 已提交
2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741

  SyncIndex sendIndex = pMsg->prevLogIndex + 1;
  int64_t   tsNow = taosGetTimestampMs();

  if (pState->lastSendIndex == sendIndex && tsNow - pState->lastSendTime < SYNC_APPEND_ENTRIES_TIMEOUT_MS) {
    return false;
  }

  return true;
}

M
Minghao Li 已提交
2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755
bool syncNodeCanChange(SSyncNode* pSyncNode) {
  if (pSyncNode->changing) {
    sError("sync cannot change");
    return false;
  }

  if ((pSyncNode->commitIndex >= SYNC_INDEX_BEGIN)) {
    SyncIndex lastIndex = syncNodeGetLastIndex(pSyncNode);
    if (pSyncNode->commitIndex != lastIndex) {
      sError("sync cannot change2");
      return false;
    }
  }

S
Shengliang Guan 已提交
2756
  for (int32_t i = 0; i < pSyncNode->peersNum; ++i) {
M
Minghao Li 已提交
2757
    SSyncSnapshotSender* pSender = syncNodeGetSnapshotSender(pSyncNode, &(pSyncNode->peersId)[i]);
M
Minghao Li 已提交
2758
    if (pSender != NULL && pSender->start) {
M
Minghao Li 已提交
2759 2760 2761 2762 2763 2764
      sError("sync cannot change3");
      return false;
    }
  }

  return true;
M
Minghao Li 已提交
2765
}