syncReplication.c 8.9 KB
Newer Older
M
Minghao Li 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/*
 * Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
 *
 * This program is free software: you can use, redistribute, and/or modify
 * it under the terms of the GNU Affero General Public License, version 3
 * or later ("AGPL"), as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

S
Shengliang Guan 已提交
16
#define _DEFAULT_SOURCE
M
Minghao Li 已提交
17
#include "syncReplication.h"
M
Minghao Li 已提交
18
#include "syncIndexMgr.h"
19
#include "syncPipeline.h"
M
Minghao Li 已提交
20
#include "syncRaftEntry.h"
M
Minghao Li 已提交
21
#include "syncRaftStore.h"
M
Minghao Li 已提交
22
#include "syncUtil.h"
M
Minghao Li 已提交
23

M
Minghao Li 已提交
24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
// TLA+ Spec
// AppendEntries(i, j) ==
//    /\ i /= j
//    /\ state[i] = Leader
//    /\ LET prevLogIndex == nextIndex[i][j] - 1
//           prevLogTerm == IF prevLogIndex > 0 THEN
//                              log[i][prevLogIndex].term
//                          ELSE
//                              0
//           \* Send up to 1 entry, constrained by the end of the log.
//           lastEntry == Min({Len(log[i]), nextIndex[i][j]})
//           entries == SubSeq(log[i], nextIndex[i][j], lastEntry)
//       IN Send([mtype          |-> AppendEntriesRequest,
//                mterm          |-> currentTerm[i],
//                mprevLogIndex  |-> prevLogIndex,
//                mprevLogTerm   |-> prevLogTerm,
//                mentries       |-> entries,
//                \* mlog is used as a history variable for the proof.
//                \* It would not exist in a real implementation.
//                mlog           |-> log[i],
//                mcommitIndex   |-> Min({commitIndex[i], lastEntry}),
//                msource        |-> i,
//                mdest          |-> j])
//    /\ UNCHANGED <<serverVars, candidateVars, leaderVars, logVars>>

49 50
// Forward declaration: the definition appears later in this file, but
// syncNodeReplicateOne() below already calls it.
int32_t syncNodeMaybeSendAppendEntries(SSyncNode* pSyncNode, const SRaftId* destRaftId, SRpcMsg* pRpcMsg);

M
Minghao Li 已提交
51
int32_t syncNodeReplicateOne(SSyncNode* pSyncNode, SRaftId* pDestId, bool snapshot) {
52
  ASSERT(false && "deplicated");
M
Minghao Li 已提交
53 54 55
  // next index
  SyncIndex nextIndex = syncIndexMgrGetIndex(pSyncNode->pNextIndex, pDestId);

M
Minghao Li 已提交
56 57 58 59 60 61 62 63 64 65
  if (snapshot) {
    // maybe start snapshot
    SyncIndex logStartIndex = pSyncNode->pLogStore->syncLogBeginIndex(pSyncNode->pLogStore);
    SyncIndex logEndIndex = pSyncNode->pLogStore->syncLogEndIndex(pSyncNode->pLogStore);
    if (nextIndex < logStartIndex || nextIndex - 1 > logEndIndex) {
      sNTrace(pSyncNode, "maybe start snapshot for next-index:%" PRId64 ", start:%" PRId64 ", end:%" PRId64, nextIndex,
              logStartIndex, logEndIndex);
      // start snapshot
      int32_t code = syncNodeStartSnapshot(pSyncNode, pDestId);
    }
M
Minghao Li 已提交
66 67 68 69 70 71 72
  }

  // pre index, pre term
  SyncIndex preLogIndex = syncNodeGetPreIndex(pSyncNode, nextIndex);
  SyncTerm  preLogTerm = syncNodeGetPreTerm(pSyncNode, nextIndex);

  // prepare entry
73
  SRpcMsg            rpcMsg = {0};
M
Minghao Li 已提交
74 75
  SyncAppendEntries* pMsg = NULL;

M
Minghao Li 已提交
76
  SSyncRaftEntry* pEntry = NULL;
77 78 79 80 81 82
  SLRUCache*      pCache = pSyncNode->pLogStore->pCache;
  LRUHandle*      h = taosLRUCacheLookup(pCache, &nextIndex, sizeof(nextIndex));
  int32_t         code = 0;
  if (h) {
    pEntry = (SSyncRaftEntry*)taosLRUCacheValue(pCache, h);
    code = 0;
M
Minghao Li 已提交
83

84
    pSyncNode->pLogStore->cacheHit++;
85
    sNTrace(pSyncNode, "hit cache index:%" PRId64 ", bytes:%u, %p", nextIndex, pEntry->bytes, pEntry);
M
Minghao Li 已提交
86

87
  } else {
88
    pSyncNode->pLogStore->cacheMiss++;
89
    sNTrace(pSyncNode, "miss cache index:%" PRId64, nextIndex);
M
Minghao Li 已提交
90

91 92
    code = pSyncNode->pLogStore->syncLogGetEntry(pSyncNode->pLogStore, nextIndex, &pEntry);
  }
M
Minghao Li 已提交
93 94 95 96

  if (code == 0) {
    ASSERT(pEntry != NULL);

97
    code = syncBuildAppendEntries(&rpcMsg, (int32_t)(pEntry->bytes), pSyncNode->vgId);
98
    ASSERT(code == 0);
M
Minghao Li 已提交
99

100 101
    pMsg = rpcMsg.pCont;
    memcpy(pMsg->data, pEntry, pEntry->bytes);
M
Minghao Li 已提交
102 103 104
  } else {
    if (terrno == TSDB_CODE_WAL_LOG_NOT_EXIST) {
      // no entry in log
105 106
      code = syncBuildAppendEntries(&rpcMsg, 0, pSyncNode->vgId);
      ASSERT(code == 0);
M
Minghao Li 已提交
107

108
      pMsg = rpcMsg.pCont;
M
Minghao Li 已提交
109
    } else {
110 111 112 113
      char     host[64];
      uint16_t port;
      syncUtilU642Addr(pDestId->addr, host, sizeof(host), &port);
      sNError(pSyncNode, "replicate to %s:%d error, next-index:%" PRId64, host, port, nextIndex);
M
Minghao Li 已提交
114
      return -1;
M
Minghao Li 已提交
115 116 117
    }
  }

118 119 120
  if (h) {
    taosLRUCacheRelease(pCache, h, false);
  } else {
121
    syncEntryDestroy(pEntry);
122
  }
M
Minghao Li 已提交
123

M
Minghao Li 已提交
124 125 126 127 128 129 130 131 132 133 134 135
  // prepare msg
  ASSERT(pMsg != NULL);
  pMsg->srcId = pSyncNode->myRaftId;
  pMsg->destId = *pDestId;
  pMsg->term = pSyncNode->pRaftStore->currentTerm;
  pMsg->prevLogIndex = preLogIndex;
  pMsg->prevLogTerm = preLogTerm;
  pMsg->commitIndex = pSyncNode->commitIndex;
  pMsg->privateTerm = 0;
  // pMsg->privateTerm = syncIndexMgrGetTerm(pSyncNode->pNextIndex, pDestId);

  // send msg
136
  syncNodeMaybeSendAppendEntries(pSyncNode, pDestId, &rpcMsg);
M
Minghao Li 已提交
137 138 139
  return 0;
}

B
Benguang Zhao 已提交
140
// Locked entry point for replication: runs syncNodeReplicateWithoutLock()
// while holding the mutex of the node's log buffer and returns its result.
int32_t syncNodeReplicate(SSyncNode* pNode) {
  SSyncLogBuffer* pLogBuf = pNode->pLogBuf;
  int32_t         code;

  taosThreadMutexLock(&pLogBuf->mutex);
  code = syncNodeReplicateWithoutLock(pNode);
  taosThreadMutexUnlock(&pLogBuf->mutex);

  return code;
}

// Trigger one replication attempt towards every replica except this node.
// The caller is expected to hold the log buffer mutex (see syncNodeReplicate).
//
// Returns -1 when this node is not the leader or is the only replica,
// otherwise 0; the per-replica results are deliberately ignored.
int32_t syncNodeReplicateWithoutLock(SSyncNode* pNode) {
  if (pNode->state != TAOS_SYNC_STATE_LEADER) {
    return -1;
  }
  if (pNode->replicaNum == 1) {
    return -1;
  }

  for (int32_t idx = 0; idx < pNode->replicaNum; idx++) {
    // skip ourselves, only remote replicas are replicated to
    if (!syncUtilSameId(&pNode->replicasId[idx], &pNode->myRaftId)) {
      (void)syncLogReplMgrReplicateOnce(pNode->logReplMgrs[idx], pNode);
    }
  }

  return 0;
}

// Old replication path: calls syncNodeReplicateOne() (with snapshot enabled)
// for every peer of the node.
//
// Returns -1 when this node is not the leader. Otherwise returns 0, even if
// replication to individual peers failed; such failures are only logged.
int32_t syncNodeReplicateOld(SSyncNode* pSyncNode) {
  if (pSyncNode->state != TAOS_SYNC_STATE_LEADER) {
    return -1;
  }

  sNTrace(pSyncNode, "do replicate");

  for (int i = 0; i < pSyncNode->peersNum; ++i) {
    SRaftId* pDestId = &(pSyncNode->peersId[i]);
    int32_t  ret = syncNodeReplicateOne(pSyncNode, pDestId, true);
    if (ret != 0) {
      char     host[64];
      // unsigned, matching the uint16_t* taken by syncUtilU642Addr; a signed
      // port is an incompatible pointer and prints ports > 32767 as negative
      uint16_t port;
      syncUtilU642Addr(pDestId->addr, host, sizeof(host), &port);
      sError("vgId:%d, do append entries error for %s:%d", pSyncNode->vgId, host, port);
    }
  }

  return 0;
}

184
// Stamp the destination id into the AppendEntries message carried by pRpcMsg
// and send it to destRaftId. The result of the send is not inspected; the
// function always returns 0.
int32_t syncNodeSendAppendEntries(SSyncNode* pSyncNode, const SRaftId* destRaftId, SRpcMsg* pRpcMsg) {
  SyncAppendEntries* pAppend = pRpcMsg->pCont;

  pAppend->destId = *destRaftId;
  (void)syncNodeSendMsgById(destRaftId, pSyncNode, pRpcMsg);

  return 0;
}

192
// Old send path: logs and sends the AppendEntries message in pRpcMsg and, when
// the message carries data, records the sent index and the send time in the
// peer state of destRaftId.
//
// Always returns 0; a missing message body or a missing peer state is logged
// and nothing is sent.
int32_t syncNodeSendAppendEntriesOld(SSyncNode* pSyncNode, const SRaftId* destRaftId, SRpcMsg* pRpcMsg) {
  SyncAppendEntries* pAppend = pRpcMsg->pCont;
  if (pAppend == NULL) {
    sError("vgId:%d, sync-append-entries msg is NULL", pSyncNode->vgId);
    return 0;
  }

  SPeerState* pPeer = syncNodeGetPeerState(pSyncNode, destRaftId);
  if (pPeer == NULL) {
    sError("vgId:%d, replica maybe dropped", pSyncNode->vgId);
    return 0;
  }

  // everything needed from the message is read before sending, because the
  // message may be freed by rpc once it has been sent
  const bool carriesData = (pAppend->dataLen > 0);
  SyncIndex  sentIndex = pPeer->lastSendIndex;
  if (carriesData) {
    sentIndex = pAppend->prevLogIndex + 1;
  }

  syncLogSendAppendEntries(pSyncNode, pAppend, "");
  syncNodeSendMsgById(destRaftId, pSyncNode, pRpcMsg);

  if (carriesData) {
    pPeer->lastSendIndex = sentIndex;
    pPeer->lastSendTime = taosGetTimestampMs();
  }

  return 0;
}

225 226 227
// Send the AppendEntries message in pRpcMsg to destRaftId only when
// syncNodeNeedSendAppendEntries() says so; otherwise the message body is freed
// here and nothing is sent.
//
// Returns the result of syncNodeSendAppendEntries() when the message is sent,
// 0 when it is dropped.
int32_t syncNodeMaybeSendAppendEntries(SSyncNode* pSyncNode, const SRaftId* destRaftId, SRpcMsg* pRpcMsg) {
  int32_t            ret = 0;
  SyncAppendEntries* pMsg = pRpcMsg->pCont;

  if (syncNodeNeedSendAppendEntries(pSyncNode, destRaftId, pMsg)) {
    ret = syncNodeSendAppendEntries(pSyncNode, destRaftId, pRpcMsg);
  } else {
    char     host[64];
    // unsigned, matching the uint16_t* taken by syncUtilU642Addr; a signed
    // port is an incompatible pointer and prints ports > 32767 as negative
    uint16_t port;
    syncUtilU642Addr(destRaftId->addr, host, sizeof(host), &port);
    sNTrace(pSyncNode, "do not repcate to %s:%d for index:%" PRId64, host, port, pMsg->prevLogIndex + 1);
    rpcFreeCont(pRpcMsg->pCont);
    // the body is gone; do not leave a dangling pointer in the caller's message
    pRpcMsg->pCont = NULL;
  }

  return ret;
}

S
Shengliang Guan 已提交
243 244
// Send the heartbeat message pMsg to destId and return the result of
// syncNodeSendMsgById().
int32_t syncNodeSendHeartbeat(SSyncNode* pSyncNode, const SRaftId* destId, SRpcMsg* pMsg) {
  int32_t code = syncNodeSendMsgById(destId, pSyncNode, pMsg);
  return code;
}

// Build and send one heartbeat to every peer of the node. All heartbeats of a
// round carry the same timestamp, taken once before the loop. A peer whose
// heartbeat cannot be built is logged and skipped. Always returns 0.
int32_t syncNodeHeartbeatPeers(SSyncNode* pSyncNode) {
  const int64_t roundTs = taosGetTimestampMs();

  for (int32_t peer = 0; peer < pSyncNode->peersNum; ++peer) {
    SRpcMsg hbRpc = {0};

    if (syncBuildHeartbeat(&hbRpc, pSyncNode->vgId) == 0) {
      SyncHeartbeat* pHeartbeat = hbRpc.pCont;

      pHeartbeat->srcId = pSyncNode->myRaftId;
      pHeartbeat->destId = pSyncNode->peersId[peer];
      pHeartbeat->term = pSyncNode->pRaftStore->currentTerm;
      pHeartbeat->commitIndex = pSyncNode->commitIndex;
      pHeartbeat->minMatchIndex = syncMinMatchIndex(pSyncNode);
      pHeartbeat->privateTerm = 0;
      pHeartbeat->timeStamp = roundTs;

      // send msg
      syncLogSendHeartbeat(pSyncNode, pHeartbeat, true, 0, 0);
      syncNodeSendHeartbeat(pSyncNode, &pHeartbeat->destId, &hbRpc);
    } else {
      sError("vgId:%d, build sync-heartbeat error", pSyncNode->vgId);
    }
  }

  return 0;
}