syncReplication.c 8.5 KB
Newer Older
M
Minghao Li 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/*
 * Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
 *
 * This program is free software: you can use, redistribute, and/or modify
 * it under the terms of the GNU Affero General Public License, version 3
 * or later ("AGPL"), as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

M
Minghao Li 已提交
16
#include "syncReplication.h"
M
Minghao Li 已提交
17
#include "syncIndexMgr.h"
M
Minghao Li 已提交
18
#include "syncMessage.h"
M
Minghao Li 已提交
19
#include "syncRaftCfg.h"
M
Minghao Li 已提交
20
#include "syncRaftEntry.h"
M
Minghao Li 已提交
21
#include "syncRaftLog.h"
M
Minghao Li 已提交
22
#include "syncRaftStore.h"
23
#include "syncSnapshot.h"
M
Minghao Li 已提交
24
#include "syncUtil.h"
M
Minghao Li 已提交
25

M
Minghao Li 已提交
26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
// TLA+ Spec
// AppendEntries(i, j) ==
//    /\ i /= j
//    /\ state[i] = Leader
//    /\ LET prevLogIndex == nextIndex[i][j] - 1
//           prevLogTerm == IF prevLogIndex > 0 THEN
//                              log[i][prevLogIndex].term
//                          ELSE
//                              0
//           \* Send up to 1 entry, constrained by the end of the log.
//           lastEntry == Min({Len(log[i]), nextIndex[i][j]})
//           entries == SubSeq(log[i], nextIndex[i][j], lastEntry)
//       IN Send([mtype          |-> AppendEntriesRequest,
//                mterm          |-> currentTerm[i],
//                mprevLogIndex  |-> prevLogIndex,
//                mprevLogTerm   |-> prevLogTerm,
//                mentries       |-> entries,
//                \* mlog is used as a history variable for the proof.
//                \* It would not exist in a real implementation.
//                mlog           |-> log[i],
//                mcommitIndex   |-> Min({commitIndex[i], lastEntry}),
//                msource        |-> i,
//                mdest          |-> j])
//    /\ UNCHANGED <<serverVars, candidateVars, leaderVars, logVars>>

M
Minghao Li 已提交
51
// Replicate the log entry at a peer's next index to that peer.
//
// The next index recorded for pDestId decides what happens:
//   - next index > value of syncLogEndIndex: nothing is sent, returns 0;
//   - next index < value of syncLogBeginIndex: a snapshot is started through
//     syncNodeStartSnapshot and the function returns 0;
//   - otherwise an AppendEntries message is built. It carries the serialized
//     entry at next index, or no data when the log store fails with
//     terrno == TSDB_CODE_WAL_LOG_NOT_EXIST. The message is handed to
//     syncNodeMaybeSendAppendEntries and destroyed afterwards.
//
// Returns 0 on the paths above and -1 when the entry cannot be read (any other
// terrno) or cannot be serialized.
int32_t syncNodeReplicateOne(SSyncNode* pSyncNode, SRaftId* pDestId) {
  // next index
  SyncIndex nextIndex = syncIndexMgrGetIndex(pSyncNode->pNextIndex, pDestId);

  // maybe start snapshot
  SyncIndex logStartIndex = pSyncNode->pLogStore->syncLogBeginIndex(pSyncNode->pLogStore);
  SyncIndex logEndIndex = pSyncNode->pLogStore->syncLogEndIndex(pSyncNode->pLogStore);
  if (nextIndex > logEndIndex) {
    return 0;
  }

  if (nextIndex < logStartIndex) {
    char logBuf[128];
    snprintf(logBuf, sizeof(logBuf), "start snapshot for next-index:%" PRId64 ", start:%" PRId64 ", end:%" PRId64,
             nextIndex, logStartIndex, logEndIndex);
    syncNodeEventLog(pSyncNode, logBuf);

    // start snapshot
    int32_t code = syncNodeStartSnapshot(pSyncNode, pDestId);
    ASSERT(code == 0);
    return 0;
  }

  // pre index, pre term
  SyncIndex preLogIndex = syncNodeGetPreIndex(pSyncNode, nextIndex);
  SyncTerm  preLogTerm = syncNodeGetPreTerm(pSyncNode, nextIndex);

  // prepare entry
  SyncAppendEntries* pMsg = NULL;

  SSyncRaftEntry* pEntry = NULL;
  int32_t         code = pSyncNode->pLogStore->syncLogGetEntry(pSyncNode->pLogStore, nextIndex, &pEntry);

  if (code == 0) {
    ASSERT(pEntry != NULL);

    pMsg = syncAppendEntriesBuild(pEntry->bytes, pSyncNode->vgId);
    ASSERT(pMsg != NULL);

    // add pEntry into msg
    uint32_t len = 0;
    char*    serialized = syncEntrySerialize(pEntry, &len);
    if (serialized == NULL) {
      // Never copy from a NULL buffer: release what was acquired and report failure.
      syncEntryDestroy(pEntry);
      syncAppendEntriesDestroy(pMsg);
      return -1;
    }
    ASSERT(len == pEntry->bytes);
    memcpy(pMsg->data, serialized, len);

    taosMemoryFree(serialized);
    syncEntryDestroy(pEntry);

  } else if (terrno == TSDB_CODE_WAL_LOG_NOT_EXIST) {
    // no entry in log
    pMsg = syncAppendEntriesBuild(0, pSyncNode->vgId);
    ASSERT(pMsg != NULL);

  } else {
    char     host[64];
    uint16_t port;
    syncUtilU642Addr(pDestId->addr, host, sizeof(host), &port);

    char logBuf[128];
    snprintf(logBuf, sizeof(logBuf), "replicate to %s:%d error, next-index:%" PRId64, host, port, nextIndex);
    syncNodeErrorLog(pSyncNode, logBuf);

    // pMsg has not been built on this path, so there is nothing to destroy.
    return -1;
  }

  // prepare msg
  ASSERT(pMsg != NULL);
  pMsg->srcId = pSyncNode->myRaftId;
  pMsg->destId = *pDestId;
  pMsg->term = pSyncNode->pRaftStore->currentTerm;
  pMsg->prevLogIndex = preLogIndex;
  pMsg->prevLogTerm = preLogTerm;
  pMsg->commitIndex = pSyncNode->commitIndex;
  pMsg->privateTerm = 0;

  // send msg
  syncNodeMaybeSendAppendEntries(pSyncNode, pDestId, pMsg);
  syncAppendEntriesDestroy(pMsg);

  return 0;
}

B
Benguang Zhao 已提交
139 140 141 142 143 144 145 146 147 148 149 150 151 152 153
// Run one replication round from this node towards every other replica.
//
// Returns -1 without doing anything when the node is not in the leader state
// or when replicaNum is 1. Otherwise calls syncLogBufferReplicateOnce for each
// replica whose id differs from this node's own id and returns 0; the results
// of those calls are intentionally discarded.
int32_t syncNodeReplicate(SSyncNode* pNode) {
  if (pNode->state != TAOS_SYNC_STATE_LEADER) {
    return -1;
  }
  if (pNode->replicaNum == 1) {
    return -1;
  }

  for (int32_t idx = 0; idx < pNode->replicaNum; idx++) {
    // skip the entry that refers to this node itself
    if (!syncUtilSameId(&pNode->replicasId[idx], &pNode->myRaftId)) {
      SSyncLogReplMgr* pReplMgr = pNode->logReplMgrs[idx];
      (void)syncLogBufferReplicateOnce(pReplMgr, pNode);
    }
  }

  return 0;
}

// Replicate to every peer by calling syncNodeReplicateOne once per peer.
//
// Returns -1 when the node is not in the leader state. Otherwise logs a
// "do replicate" event, tries each entry of peersId in turn and returns 0.
// A failure for one peer is logged and does not stop the loop or change the
// return value.
int32_t syncNodeReplicateOld(SSyncNode* pSyncNode) {
  if (pSyncNode->state != TAOS_SYNC_STATE_LEADER) {
    return -1;
  }

  syncNodeEventLog(pSyncNode, "do replicate");

  for (int i = 0; i < pSyncNode->peersNum; ++i) {
    SRaftId* pDestId = &(pSyncNode->peersId[i]);
    if (syncNodeReplicateOne(pSyncNode, pDestId) != 0) {
      char     host[64];
      // unsigned, matching syncNodeReplicateOne: a signed 16-bit port would
      // print as a negative number for ports >= 32768
      uint16_t port = 0;
      syncUtilU642Addr(pDestId->addr, host, sizeof(host), &port);
      sError("vgId:%d, do append entries error for %s:%d", pSyncNode->vgId, host, port);
    }
  }

  return 0;
}

B
Benguang Zhao 已提交
175
// Send an AppendEntries message to destRaftId.
//
// Overwrites pMsg->destId with *destRaftId, converts the message with
// syncAppendEntries2RpcMsg and passes it to syncNodeSendMsgById. The result of
// the send call is not inspected; the function always returns 0.
int32_t syncNodeSendAppendEntries(SSyncNode* pSyncNode, SRaftId* destRaftId, SyncAppendEntries* pMsg) {
  // The destination address follows a "0x" prefix, so it is printed in hex
  // (PRIx64), like the peer-state error log in syncNodeSendAppendEntriesOld.
  sTrace("vgId:%d, send append entries msg index: %" PRId64 " to dest: 0x%016" PRIx64, pSyncNode->vgId,
         pMsg->prevLogIndex + 1, destRaftId->addr);

  pMsg->destId = *destRaftId;

  SRpcMsg rpcMsg;
  syncAppendEntries2RpcMsg(pMsg, &rpcMsg);
  syncNodeSendMsgById(destRaftId, pSyncNode, &rpcMsg);

  return 0;
}

// Send an AppendEntries message to destRaftId and record the send in the
// peer state.
//
// The message is logged, converted and sent first; only afterwards is the
// peer state looked up. Returns -1 (after logging) when no peer state is
// found, even though the message has already been handed to
// syncNodeSendMsgById. When the message carries data (dataLen > 0), the peer's
// lastSendIndex and lastSendTime are updated. Returns 0 otherwise.
int32_t syncNodeSendAppendEntriesOld(SSyncNode* pSyncNode, SRaftId* destRaftId, SyncAppendEntries* pMsg) {
  pMsg->destId = *destRaftId;
  syncLogSendAppendEntries(pSyncNode, pMsg, "");

  SRpcMsg outMsg;
  syncAppendEntries2RpcMsg(pMsg, &outMsg);
  syncNodeSendMsgById(destRaftId, pSyncNode, &outMsg);

  SPeerState* pPeer = syncNodeGetPeerState(pSyncNode, destRaftId);
  if (pPeer == NULL) {
    sError("vgId:%d, failed to get peer state for addr:0x%016" PRIx64 "", pSyncNode->vgId, destRaftId->addr);
    return -1;
  }

  // messages without data do not move the send bookkeeping
  if (!(pMsg->dataLen > 0)) {
    return 0;
  }

  pPeer->lastSendIndex = pMsg->prevLogIndex + 1;
  pPeer->lastSendTime = taosGetTimestampMs();
  return 0;
}

B
Benguang Zhao 已提交
210
// Send pMsg to destRaftId only if syncNodeNeedSendAppendEntries approves.
//
// When the check passes, the result of syncNodeSendAppendEntries is returned.
// Otherwise an event naming the destination host:port and the entry index
// (prevLogIndex + 1) is logged and 0 is returned.
int32_t syncNodeMaybeSendAppendEntries(SSyncNode* pSyncNode, SRaftId* destRaftId, SyncAppendEntries* pMsg) {
  if (syncNodeNeedSendAppendEntries(pSyncNode, destRaftId, pMsg)) {
    return syncNodeSendAppendEntries(pSyncNode, destRaftId, pMsg);
  }

  char     logBuf[128];
  char     host[64];
  // unsigned, matching syncNodeReplicateOne: a signed 16-bit port would print
  // as a negative number for ports >= 32768
  uint16_t port = 0;
  syncUtilU642Addr(destRaftId->addr, host, sizeof(host), &port);

  snprintf(logBuf, sizeof(logBuf), "do not repcate to %s:%d for index:%" PRId64, host, port, pMsg->prevLogIndex + 1);
  syncNodeEventLog(pSyncNode, logBuf);

  return 0;
}

M
Minghao Li 已提交
228 229
// Log an AppendEntries message, convert it with syncAppendEntries2RpcMsg and
// pass it to syncNodeSendMsgById for destRaftId. The message itself is not
// modified. Always returns 0; the result of the send call is not inspected.
int32_t syncNodeAppendEntries(SSyncNode* pSyncNode, const SRaftId* destRaftId, const SyncAppendEntries* pMsg) {
  syncLogSendAppendEntries(pSyncNode, pMsg, "");

  SRpcMsg outMsg;
  syncAppendEntries2RpcMsg(pMsg, &outMsg);
  syncNodeSendMsgById(destRaftId, pSyncNode, &outMsg);

  return 0;
}

M
Minghao Li 已提交
238
// Log a heartbeat message, convert it with syncHeartbeat2RpcMsg and pass it to
// syncNodeSendMsgById. Always returns 0.
//
// NOTE(review): the destination is taken from pMsg->destId; the destRaftId
// parameter is not read by this function.
int32_t syncNodeSendHeartbeat(SSyncNode* pSyncNode, const SRaftId* destRaftId, const SyncHeartbeat* pMsg) {
  syncLogSendHeartbeat(pSyncNode, pMsg, "");

  SRpcMsg outMsg;
  syncHeartbeat2RpcMsg(pMsg, &outMsg);
  syncNodeSendMsgById(&(pMsg->destId), pSyncNode, &outMsg);

  return 0;
}

// Send one heartbeat to every peer listed in peersId.
//
// For each peer a heartbeat message is built, filled with this node's id, the
// peer's id, the current term, the commit index and the value of
// syncMinMatchIndex, sent through syncNodeSendHeartbeat and then destroyed.
// Always returns 0.
int32_t syncNodeHeartbeatPeers(SSyncNode* pSyncNode) {
  for (int32_t i = 0; i < pSyncNode->peersNum; ++i) {
    SyncHeartbeat* pSyncMsg = syncHeartbeatBuild(pSyncNode->vgId);
    pSyncMsg->srcId = pSyncNode->myRaftId;
    pSyncMsg->destId = pSyncNode->peersId[i];
    pSyncMsg->term = pSyncNode->pRaftStore->currentTerm;
    pSyncMsg->commitIndex = pSyncNode->commitIndex;
    pSyncMsg->minMatchIndex = syncMinMatchIndex(pSyncNode);
    pSyncMsg->privateTerm = 0;

    // send msg
    // syncNodeSendHeartbeat does the SRpcMsg conversion itself, so no
    // conversion is done here: a second one would produce a value nobody
    // reads (and, presumably, an RPC buffer nobody releases).
    syncNodeSendHeartbeat(pSyncNode, &(pSyncMsg->destId), pSyncMsg);

    syncHeartbeatDestroy(pSyncMsg);
  }

  return 0;
}