syncAppendEntries.c 7.8 KB
Newer Older
M
Minghao Li 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/*
 * Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
 *
 * This program is free software: you can use, redistribute, and/or modify
 * it under the terms of the GNU Affero General Public License, version 3
 * or later ("AGPL"), as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

16
#define _DEFAULT_SOURCE
M
Minghao Li 已提交
17
#include "syncAppendEntries.h"
18
#include "syncPipeline.h"
19
#include "syncMessage.h"
M
Minghao Li 已提交
20 21
#include "syncRaftLog.h"
#include "syncRaftStore.h"
B
Benguang Zhao 已提交
22
#include "syncReplication.h"
M
Minghao Li 已提交
23
#include "syncUtil.h"
24
#include "syncCommit.h"
M
Minghao Li 已提交
25

M
Minghao Li 已提交
26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
// TLA+ Spec
// HandleAppendEntriesRequest(i, j, m) ==
//    LET logOk == \/ m.mprevLogIndex = 0
//                 \/ /\ m.mprevLogIndex > 0
//                    /\ m.mprevLogIndex <= Len(log[i])
//                    /\ m.mprevLogTerm = log[i][m.mprevLogIndex].term
//    IN /\ m.mterm <= currentTerm[i]
//       /\ \/ /\ \* reject request
//                \/ m.mterm < currentTerm[i]
//                \/ /\ m.mterm = currentTerm[i]
//                   /\ state[i] = Follower
//                   /\ \lnot logOk
//             /\ Reply([mtype           |-> AppendEntriesResponse,
//                       mterm           |-> currentTerm[i],
//                       msuccess        |-> FALSE,
//                       mmatchIndex     |-> 0,
//                       msource         |-> i,
//                       mdest           |-> j],
//                       m)
//             /\ UNCHANGED <<serverVars, logVars>>
//          \/ \* return to follower state
//             /\ m.mterm = currentTerm[i]
//             /\ state[i] = Candidate
//             /\ state' = [state EXCEPT ![i] = Follower]
//             /\ UNCHANGED <<currentTerm, votedFor, logVars, messages>>
//          \/ \* accept request
//             /\ m.mterm = currentTerm[i]
//             /\ state[i] = Follower
//             /\ logOk
//             /\ LET index == m.mprevLogIndex + 1
//                IN \/ \* already done with request
//                       /\ \/ m.mentries = << >>
//                          \/ /\ m.mentries /= << >>
//                             /\ Len(log[i]) >= index
//                             /\ log[i][index].term = m.mentries[1].term
//                          \* This could make our commitIndex decrease (for
//                          \* example if we process an old, duplicated request),
//                          \* but that doesn't really affect anything.
//                       /\ commitIndex' = [commitIndex EXCEPT ![i] =
//                                              m.mcommitIndex]
//                       /\ Reply([mtype           |-> AppendEntriesResponse,
//                                 mterm           |-> currentTerm[i],
//                                 msuccess        |-> TRUE,
//                                 mmatchIndex     |-> m.mprevLogIndex +
//                                                     Len(m.mentries),
//                                 msource         |-> i,
//                                 mdest           |-> j],
//                                 m)
//                       /\ UNCHANGED <<serverVars, log>>
//                   \/ \* conflict: remove 1 entry
//                       /\ m.mentries /= << >>
//                       /\ Len(log[i]) >= index
//                       /\ log[i][index].term /= m.mentries[1].term
//                       /\ LET new == [index2 \in 1..(Len(log[i]) - 1) |->
//                                          log[i][index2]]
//                          IN log' = [log EXCEPT ![i] = new]
//                       /\ UNCHANGED <<serverVars, commitIndex, messages>>
//                   \/ \* no conflict: append entry
//                       /\ m.mentries /= << >>
//                       /\ Len(log[i]) = m.mprevLogIndex
//                       /\ log' = [log EXCEPT ![i] =
//                                      Append(log[i], m.mentries[1])]
//                       /\ UNCHANGED <<serverVars, commitIndex, messages>>
//       /\ UNCHANGED <<candidateVars, leaderVars>>
//
91

92
SSyncRaftEntry* syncBuildRaftEntryFromAppendEntries(const SyncAppendEntries* pMsg) {
B
Benguang Zhao 已提交
93 94 95 96 97 98 99 100 101 102
  SSyncRaftEntry* pEntry = taosMemoryMalloc(pMsg->dataLen);
  if (pEntry == NULL) {
    terrno = TSDB_CODE_OUT_OF_MEMORY;
    return NULL;
  }
  (void)memcpy(pEntry, pMsg->data, pMsg->dataLen);
  ASSERT(pEntry->bytes == pMsg->dataLen);
  return pEntry;
}

103 104
int32_t syncNodeOnAppendEntries(SSyncNode* ths, const SRpcMsg* pRpcMsg) {
  SyncAppendEntries* pMsg = pRpcMsg->pCont;
105
  SRpcMsg            rpcRsp = {0};
106
  bool               accepted = false;
107
  SSyncRaftEntry*    pEntry = NULL;
108
  bool               resetElect = false;
109

B
Benguang Zhao 已提交
110 111 112 113 114 115
  // if already drop replica, do not process
  if (!syncNodeInRaftGroup(ths, &(pMsg->srcId))) {
    syncLogRecvAppendEntries(ths, pMsg, "not in my config");
    goto _IGNORE;
  }

116 117 118 119 120 121 122
  int32_t code = syncBuildAppendEntriesReply(&rpcRsp, ths->vgId);
  if (code != 0) {
    syncLogRecvAppendEntries(ths, pMsg, "build rsp error");
    goto _IGNORE;
  }

  SyncAppendEntriesReply* pReply = rpcRsp.pCont;
B
Benguang Zhao 已提交
123 124 125
  // prepare response msg
  pReply->srcId = ths->myRaftId;
  pReply->destId = pMsg->srcId;
126
  pReply->term = raftStoreGetTerm(ths);
B
Benguang Zhao 已提交
127 128 129 130 131
  pReply->success = false;
  pReply->matchIndex = SYNC_INDEX_INVALID;
  pReply->lastSendIndex = pMsg->prevLogIndex + 1;
  pReply->startTime = ths->startTime;

132
  if (pMsg->term < raftStoreGetTerm(ths)) {
B
Benguang Zhao 已提交
133 134 135
    goto _SEND_RESPONSE;
  }

136
  if (pMsg->term > raftStoreGetTerm(ths)) {
B
Benguang Zhao 已提交
137 138 139
    pReply->term = pMsg->term;
  }

C
cadem 已提交
140 141 142 143
  if(ths->raftCfg.cfg.nodeInfo[ths->raftCfg.cfg.myIndex].nodeRole != TAOS_SYNC_ROLE_LEARNER){
    syncNodeStepDown(ths, pMsg->term);
    resetElect = true;
  }
B
Benguang Zhao 已提交
144

145
  if (pMsg->dataLen < sizeof(SSyncRaftEntry)) {
B
Benguang Zhao 已提交
146 147 148 149 150
    sError("vgId:%d, incomplete append entries received. prev index:%" PRId64 ", term:%" PRId64 ", datalen:%d",
           ths->vgId, pMsg->prevLogIndex, pMsg->prevLogTerm, pMsg->dataLen);
    goto _IGNORE;
  }

151
  pEntry = syncBuildRaftEntryFromAppendEntries(pMsg);
B
Benguang Zhao 已提交
152 153 154 155 156
  if (pEntry == NULL) {
    sError("vgId:%d, failed to get raft entry from append entries since %s", ths->vgId, terrstr());
    goto _IGNORE;
  }

157
  if (pMsg->prevLogIndex + 1 != pEntry->index || pEntry->term < 0) {
B
Benguang Zhao 已提交
158 159 160 161 162 163
    sError("vgId:%d, invalid previous log index in msg. index:%" PRId64 ",  term:%" PRId64 ", prevLogIndex:%" PRId64
           ", prevLogTerm:%" PRId64,
           ths->vgId, pEntry->index, pEntry->term, pMsg->prevLogIndex, pMsg->prevLogTerm);
    goto _IGNORE;
  }

164 165 166
  sTrace("vgId:%d, recv append entries msg. index:%" PRId64 ", term:%" PRId64 ", preLogIndex:%" PRId64
         ", prevLogTerm:%" PRId64 " commitIndex:%" PRId64 "",
         pMsg->vgId, pMsg->prevLogIndex + 1, pMsg->term, pMsg->prevLogIndex, pMsg->prevLogTerm, pMsg->commitIndex);
B
Benguang Zhao 已提交
167 168 169 170 171

  // accept
  if (syncLogBufferAccept(ths->pLogBuf, ths, pEntry, pMsg->prevLogTerm) < 0) {
    goto _SEND_RESPONSE;
  }
172
  accepted = true;
B
Benguang Zhao 已提交
173 174

_SEND_RESPONSE:
175
  pEntry = NULL;
176
  pReply->matchIndex = syncLogBufferProceed(ths->pLogBuf, ths, &pReply->lastMatchTerm);
B
Benguang Zhao 已提交
177
  bool matched = (pReply->matchIndex >= pReply->lastSendIndex);
178 179
  if (accepted && matched) {
    pReply->success = true;
B
Benguang Zhao 已提交
180
    // update commit index only after matching
181
    (void)syncNodeUpdateCommitIndex(ths, TMIN(pMsg->commitIndex, pReply->lastSendIndex));
B
Benguang Zhao 已提交
182
  }
B
Benguang Zhao 已提交
183 184

  // ack, i.e. send response
185
  (void)syncNodeSendMsgById(&pReply->destId, ths, &rpcRsp);
B
Benguang Zhao 已提交
186 187

  // commit index, i.e. leader notice me
B
Benguang Zhao 已提交
188
  if (syncLogBufferCommit(ths->pLogBuf, ths, ths->commitIndex) < 0) {
B
Benguang Zhao 已提交
189 190 191
    sError("vgId:%d, failed to commit raft fsm log since %s.", ths->vgId, terrstr());
  }

192
  if (resetElect) syncNodeResetElectTimer(ths);
193 194
  return 0;

B
Benguang Zhao 已提交
195
_IGNORE:
196
  rpcFreeCont(rpcRsp.pCont);
197
  syncEntryDestroy(pEntry);
B
Benguang Zhao 已提交
198 199
  return 0;
}