syncCommit.c 10.7 KB
Newer Older
M
Minghao Li 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/*
 * Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
 *
 * This program is free software: you can use, redistribute, and/or modify
 * it under the terms of the GNU Affero General Public License, version 3
 * or later ("AGPL"), as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

16
#define _DEFAULT_SOURCE
M
Minghao Li 已提交
17
#include "syncCommit.h"
M
Minghao Li 已提交
18
#include "syncIndexMgr.h"
M
Minghao Li 已提交
19
#include "syncRaftLog.h"
M
Minghao Li 已提交
20
#include "syncRaftStore.h"
M
Minghao Li 已提交
21
#include "syncUtil.h"
M
Minghao Li 已提交
22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45

// \* Leader i advances its commitIndex.
// \* This is done as a separate step from handling AppendEntries responses,
// \* in part to minimize atomic regions, and in part so that leaders of
// \* single-server clusters are able to mark entries committed.
// AdvanceCommitIndex(i) ==
//     /\ state[i] = Leader
//     /\ LET \* The set of servers that agree up through index.
//            Agree(index) == {i} \cup {k \in Server :
//                                          matchIndex[i][k] >= index}
//            \* The maximum indexes for which a quorum agrees
//            agreeIndexes == {index \in 1..Len(log[i]) :
//                                 Agree(index) \in Quorum}
//            \* New value for commitIndex'[i]
//            newCommitIndex ==
//               IF /\ agreeIndexes /= {}
//                  /\ log[i][Max(agreeIndexes)].term = currentTerm[i]
//               THEN
//                   Max(agreeIndexes)
//               ELSE
//                   commitIndex[i]
//        IN commitIndex' = [commitIndex EXCEPT ![i] = newCommitIndex]
//     /\ UNCHANGED <<messages, serverVars, candidateVars, leaderVars, log>>
//
46
/**
 * Advance the commit index of a single-replica leader (deprecated).
 *
 * The function asserts on entry because it is deprecated; the remaining body
 * only runs if the ASSERT macro does not stop execution.
 *
 * Steps, in order:
 *   1. Bail out unless the node is a leader with exactly one replica.
 *   2. Raise commitIndex to the snapshot's lastApplyIndex if that is larger.
 *   3. Raise commitIndex to the last log index if that is larger.
 *   4. Push the new commitIndex to the log store when it is ahead of the
 *      log store's own commit version.
 *
 * @param pSyncNode  sync node to operate on; NULL is logged and ignored.
 */
void syncOneReplicaAdvance(SSyncNode* pSyncNode) {
  ASSERT(false && "deprecated");
  if (pSyncNode == NULL) {
    sError("pSyncNode is NULL");
    return;
  }

  if (pSyncNode->state != TAOS_SYNC_STATE_LEADER) {
    sNError(pSyncNode, "not leader, can not advance commit index");
    return;
  }

  if (pSyncNode->replicaNum != 1) {
    sNError(pSyncNode, "not one replica, can not advance commit index");
    return;
  }

  // advance commit index to snapshot first
  // Zero-initialize so lastApplyIndex is well-defined even when the callback
  // is missing or leaves the field untouched.
  SSnapshot snapshot = {0};
  if (pSyncNode->pFsm != NULL && pSyncNode->pFsm->FpGetSnapshotInfo != NULL) {
    pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot);
  }
  if (snapshot.lastApplyIndex > 0 && snapshot.lastApplyIndex > pSyncNode->commitIndex) {
    SyncIndex commitBegin = pSyncNode->commitIndex;
    SyncIndex commitEnd = snapshot.lastApplyIndex;
    pSyncNode->commitIndex = snapshot.lastApplyIndex;
    sNTrace(pSyncNode, "commit by snapshot from index:%" PRId64 " to index:%" PRId64, commitBegin, commitEnd);
  }

  // advance commit index as large as possible
  SyncIndex lastIndex = syncNodeGetLastIndex(pSyncNode);
  if (lastIndex > pSyncNode->commitIndex) {
    sNTrace(pSyncNode, "commit by wal from index:%" PRId64 " to index:%" PRId64, pSyncNode->commitIndex + 1, lastIndex);
    pSyncNode->commitIndex = lastIndex;
  }

  // call back Wal
  SyncIndex walCommitVer = logStoreWalCommitVer(pSyncNode->pLogStore);
  if (pSyncNode->commitIndex > walCommitVer) {
    pSyncNode->pLogStore->syncLogUpdateCommitIndex(pSyncNode->pLogStore, pSyncNode->commitIndex);
  }
}
B
Benguang Zhao 已提交
86

M
Minghao Li 已提交
87
/**
 * Try to advance the leader's commit index (deprecated).
 *
 * The function asserts on entry because it is deprecated; the remaining body
 * only runs if the ASSERTS macro does not stop execution.
 *
 * Steps, in order:
 *   1. Bail out unless the node is a leader.
 *   2. Raise commitIndex to the snapshot's lastApplyIndex if that is larger.
 *   3. Scan from the last log index down to commitIndex + 1 and pick the
 *      highest index that a quorum agrees on (syncAgree) and whose entry term
 *      passes the term check.
 *   4. Raise the candidate to the log store's commit version if that is larger.
 *   5. If the candidate is ahead of commitIndex, store it, notify the log
 *      store and run syncNodeDoCommit over the newly committed range.
 *
 * @param pSyncNode  sync node to operate on; NULL is logged and ignored.
 */
void syncMaybeAdvanceCommitIndex(SSyncNode* pSyncNode) {
  ASSERTS(false, "deprecated");
  if (pSyncNode == NULL) {
    sError("pSyncNode is NULL");
    return;
  }

  if (pSyncNode->state != TAOS_SYNC_STATE_LEADER) {
    sNError(pSyncNode, "not leader, can not advance commit index");
    return;
  }

  // advance commit index to snapshot first
  // Zero-initialize so lastApplyIndex is well-defined even when the callback
  // is missing or leaves the field untouched.
  SSnapshot snapshot = {0};
  if (pSyncNode->pFsm != NULL && pSyncNode->pFsm->FpGetSnapshotInfo != NULL) {
    pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot);
  }
  if (snapshot.lastApplyIndex > 0 && snapshot.lastApplyIndex > pSyncNode->commitIndex) {
    SyncIndex commitBegin = pSyncNode->commitIndex;
    SyncIndex commitEnd = snapshot.lastApplyIndex;
    pSyncNode->commitIndex = snapshot.lastApplyIndex;
    sNTrace(pSyncNode, "commit by snapshot from index:%" PRId64 " to index:%" PRId64, commitBegin, commitEnd);
  }

  // update commit index: search downwards for the highest committable index
  SyncIndex newCommitIndex = pSyncNode->commitIndex;
  for (SyncIndex index = syncNodeGetLastIndex(pSyncNode); index > pSyncNode->commitIndex; --index) {
    if (!syncAgree(pSyncNode, index)) {
      continue;
    }

    // Fetch the entry to inspect its term: cache first, then the log store.
    SSyncRaftEntry* pEntry = NULL;
    SLRUCache*      pCache = pSyncNode->pLogStore->pCache;
    LRUHandle*      h = taosLRUCacheLookup(pCache, &index, sizeof(index));
    if (h) {
      pEntry = (SSyncRaftEntry*)taosLRUCacheValue(pCache, h);

      pSyncNode->pLogStore->cacheHit++;
      sNTrace(pSyncNode, "hit cache index:%" PRId64 ", bytes:%u, %p", index, pEntry->bytes, pEntry);

    } else {
      pSyncNode->pLogStore->cacheMiss++;
      sNTrace(pSyncNode, "miss cache index:%" PRId64, index);

      int32_t code = pSyncNode->pLogStore->syncLogGetEntry(pSyncNode->pLogStore, index, &pEntry);
      // Also treat a missing entry as a read failure so it is never dereferenced.
      if (code != 0 || pEntry == NULL) {
        sNError(pSyncNode, "advance commit index error, read wal index:%" PRId64, index);
        return;
      }
    }

    // cannot commit, even if quorum agree. need check term!
    // NOTE(review): the TLA+ spec quoted at the top of this file requires
    // term == currentTerm, while this check accepts term <= currentTerm -
    // confirm this is intended.
    bool committable = (pEntry->term <= pSyncNode->pRaftStore->currentTerm);
    if (committable) {
      newCommitIndex = index;
    } else {
      sNTrace(pSyncNode, "can not commit due to term not equal, index:%" PRId64 ", term:%" PRIu64, pEntry->index,
              pEntry->term);
    }

    // Single release point: cached entries go back to the cache, entries read
    // from the log store are destroyed here.
    if (h) {
      taosLRUCacheRelease(pCache, h, false);
    } else {
      syncEntryDestroy(pEntry);
    }

    if (committable) {
      break;
    }
  }

  // advance commit index as large as possible
  SyncIndex walCommitVer = logStoreWalCommitVer(pSyncNode->pLogStore);
  if (walCommitVer > newCommitIndex) {
    newCommitIndex = walCommitVer;
  }

  // maybe execute fsm
  if (newCommitIndex > pSyncNode->commitIndex) {
    SyncIndex beginIndex = pSyncNode->commitIndex + 1;
    SyncIndex endIndex = newCommitIndex;

    // update commit index
    pSyncNode->commitIndex = newCommitIndex;

    // call back Wal
    pSyncNode->pLogStore->syncLogUpdateCommitIndex(pSyncNode->pLogStore, pSyncNode->commitIndex);

    // execute fsm
    if (pSyncNode->pFsm != NULL) {
      int32_t code = syncNodeDoCommit(pSyncNode, beginIndex, endIndex, pSyncNode->state);
      if (code != 0) {
        sNError(pSyncNode, "advance commit index error, do commit begin:%" PRId64 ", end:%" PRId64, beginIndex,
                endIndex);
        return;
      }
    }
  }
}

/**
 * Tell whether one replica agrees on a log index.
 *
 * @param pSyncNode  local sync node.
 * @param pRaftId    id of the replica being asked.
 * @param index      log index to check.
 * @return true if pRaftId is this node and this node is leader, or if the
 *         match index recorded for pRaftId is at least index; false otherwise.
 */
bool syncAgreeIndex(SSyncNode* pSyncNode, SRaftId* pRaftId, SyncIndex index) {
  // I am leader, I agree
  if (syncUtilSameId(pRaftId, &(pSyncNode->myRaftId)) && pSyncNode->state == TAOS_SYNC_STATE_LEADER) {
    return true;
  }

  // follower agree: its recorded match index has reached the queried index
  SyncIndex matchIndex = syncIndexMgrGetIndex(pSyncNode->pMatchIndex, pRaftId);
  return matchIndex >= index;
}

205
/**
 * Absolute difference |a - b| of two non-negative 64-bit values.
 *
 * Both operands are asserted to be non-negative, so the subtraction of the
 * smaller from the larger cannot overflow.
 */
static inline int64_t syncNodeAbs64(int64_t a, int64_t b) {
  ASSERT(a >= 0);
  ASSERT(b >= 0);

  return (a > b) ? (a - b) : (b - a);
}

int32_t syncNodeDynamicQuorum(const SSyncNode* pSyncNode) {
M
Minghao Li 已提交
214 215
  return pSyncNode->quorum;

M
Minghao Li 已提交
216
#if 0
217 218 219 220
  int32_t quorum = 1;  // self

  int64_t timeNow = taosGetTimestampMs();
  for (int i = 0; i < pSyncNode->peersNum; ++i) {
221 222 223
    int64_t   peerStartTime = syncIndexMgrGetStartTime(pSyncNode->pNextIndex, &(pSyncNode->peersId)[i]);
    int64_t   peerRecvTime = syncIndexMgrGetRecvTime(pSyncNode->pNextIndex, &(pSyncNode->peersId)[i]);
    SyncIndex peerMatchIndex = syncIndexMgrGetIndex(pSyncNode->pMatchIndex, &(pSyncNode->peersId)[i]);
224

225 226 227 228 229 230 231 232 233
    int64_t recvTimeDiff = TABS(peerRecvTime - timeNow);
    int64_t startTimeDiff = TABS(peerStartTime - pSyncNode->startTime);
    int64_t logDiff = TABS(peerMatchIndex - syncNodeGetLastIndex(pSyncNode));

    /*
        int64_t recvTimeDiff = syncNodeAbs64(peerRecvTime, timeNow);
        int64_t startTimeDiff = syncNodeAbs64(peerStartTime, pSyncNode->startTime);
        int64_t logDiff = syncNodeAbs64(peerMatchIndex, syncNodeGetLastIndex(pSyncNode));
    */
234 235 236 237

    int32_t addQuorum = 0;

    if (recvTimeDiff < SYNC_MAX_RECV_TIME_RANGE_MS) {
238 239 240 241 242 243 244 245 246
      if (startTimeDiff < SYNC_MAX_START_TIME_RANGE_MS) {
        addQuorum = 1;
      } else {
        if (logDiff < SYNC_ADD_QUORUM_COUNT) {
          addQuorum = 1;
        } else {
          addQuorum = 0;
        }
      }
247 248 249 250
    } else {
      addQuorum = 0;
    }

251 252 253 254 255 256 257 258 259 260 261
    /*
        if (recvTimeDiff < SYNC_MAX_RECV_TIME_RANGE_MS) {
          addQuorum = 1;
        } else {
          addQuorum = 0;
        }

        if (startTimeDiff > SYNC_MAX_START_TIME_RANGE_MS) {
          addQuorum = 0;
        }
    */
262 263 264 265

    quorum += addQuorum;
  }

266
  ASSERT(quorum <= pSyncNode->replicaNum);
267 268 269 270 271 272

  if (quorum < pSyncNode->quorum) {
    quorum = pSyncNode->quorum;
  }

  return quorum;
M
Minghao Li 已提交
273
#endif
274 275
}

M
Minghao Li 已提交
276
/*
 * Superseded variant of syncAgree() that used syncNodeDynamicQuorum() as the
 * threshold. Kept for reference only; the live syncAgree() is defined below.
 *
bool syncAgree(SSyncNode* pSyncNode, SyncIndex index) {
  int agreeCount = 0;
  for (int i = 0; i < pSyncNode->replicaNum; ++i) {
    if (syncAgreeIndex(pSyncNode, &(pSyncNode->replicasId[i]), index)) {
      ++agreeCount;
    }
    if (agreeCount >= syncNodeDynamicQuorum(pSyncNode)) {
      return true;
    }
  }
  return false;
}
*/
290

B
Benguang Zhao 已提交
291 292 293
/**
 * Tell whether a quorum of replicas has matched a log index.
 *
 * Counts the slots of pNode->pMatchIndex whose recorded index is at least
 * `index` and compares that count with pNode->quorum.
 *
 * @param pNode  sync node; its pMatchIndex must track the same number of
 *               replicas as the node itself (asserted).
 * @param index  log index to check.
 * @return true if at least pNode->quorum slots have reached index.
 */
bool syncNodeAgreedUpon(SSyncNode* pNode, SyncIndex index) {
  int            count = 0;
  SSyncIndexMgr* pMatches = pNode->pMatchIndex;
  ASSERT(pNode->replicaNum == pMatches->replicaNum);

  for (int i = 0; i < pNode->replicaNum; i++) {
    if (pMatches->index[i] >= index) {
      count++;
    }
  }

  return count >= pNode->quorum;
}

/**
 * Tell whether a quorum of replicas agrees on a log index.
 *
 * Asks syncAgreeIndex() for each replica in pNode->replicasId and returns as
 * soon as the number of agreeing replicas reaches pNode->quorum.
 *
 * @param pNode  sync node.
 * @param index  log index to check.
 * @return true once pNode->quorum replicas agree; false if the loop ends first.
 */
bool syncAgree(SSyncNode* pNode, SyncIndex index) {
  int agreeCount = 0;
  for (int i = 0; i < pNode->replicaNum; ++i) {
    if (syncAgreeIndex(pNode, &(pNode->replicasId[i]), index)) {
      ++agreeCount;
    }
    // Checked on every iteration, exactly as before, so the early exit does
    // not depend on whether this particular replica agreed.
    if (agreeCount >= pNode->quorum) {
      return true;
    }
  }
  return false;
}
318 319 320 321 322 323 324 325 326 327 328 329 330

/**
 * Set the node's commit index from a requested value and notify the log store.
 *
 * The stored value is the larger of the requested index and the current
 * commit index, capped at the log store's last index.
 *
 * @param ths          sync node to update.
 * @param commitIndex  requested commit index.
 * @return the node's commit index after the update.
 */
int64_t syncNodeUpdateCommitIndex(SSyncNode* ths, SyncIndex commitIndex) {
  const SyncIndex logLastIndex = ths->pLogStore->syncLogLastIndex(ths->pLogStore);

  // Start from the larger of the request and the current commit index.
  SyncIndex target = commitIndex;
  if (!(target > ths->commitIndex)) {
    target = ths->commitIndex;
  }

  // Cap at the last index present in the log store.
  if (!(target < logLastIndex)) {
    target = logLastIndex;
  }

  ths->commitIndex = target;
  ths->pLogStore->syncLogUpdateCommitIndex(ths->pLogStore, ths->commitIndex);
  return ths->commitIndex;
}

/**
 * Commit up to a candidate index if a quorum has already matched it.
 *
 * When indexLikely is ahead of the current commit index and
 * syncNodeAgreedUpon() confirms it, the commit index is updated through
 * syncNodeUpdateCommitIndex() and a trace line is emitted.
 *
 * @param ths          sync node.
 * @param indexLikely  candidate commit index.
 * @return the node's commit index after the check (unchanged if the
 *         candidate was not accepted).
 */
int64_t syncNodeCheckCommitIndex(SSyncNode* ths, SyncIndex indexLikely) {
  if (indexLikely <= ths->commitIndex || !syncNodeAgreedUpon(ths, indexLikely)) {
    return ths->commitIndex;
  }

  SyncIndex commitIndex = indexLikely;
  syncNodeUpdateCommitIndex(ths, commitIndex);
  // The trace reports the requested index, as in the original code.
  sTrace("vgId:%d, agreed upon. role:%d, term:%" PRId64 ", index: %" PRId64, ths->vgId, ths->state,
         ths->pRaftStore->currentTerm, commitIndex);

  return ths->commitIndex;
}