/*
 * Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
 *
 * This program is free software: you can use, redistribute, and/or modify
 * it under the terms of the GNU Affero General Public License, version 3
 * or later ("AGPL"), as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#include "syncCommit.h"
#include "syncIndexMgr.h"
#include "syncInt.h"
#include "syncRaftCfg.h"
#include "syncRaftLog.h"
#include "syncRaftStore.h"
#include "syncUtil.h"

// \* Leader i advances its commitIndex.
// \* This is done as a separate step from handling AppendEntries responses,
// \* in part to minimize atomic regions, and in part so that leaders of
// \* single-server clusters are able to mark entries committed.
// AdvanceCommitIndex(i) ==
//     /\ state[i] = Leader
//     /\ LET \* The set of servers that agree up through index.
//            Agree(index) == {i} \cup {k \in Server :
//                                          matchIndex[i][k] >= index}
//            \* The maximum indexes for which a quorum agrees
//            agreeIndexes == {index \in 1..Len(log[i]) :
//                                 Agree(index) \in Quorum}
//            \* New value for commitIndex'[i]
//            newCommitIndex ==
//               IF /\ agreeIndexes /= {}
//                  /\ log[i][Max(agreeIndexes)].term = currentTerm[i]
//               THEN
//                   Max(agreeIndexes)
//               ELSE
//                   commitIndex[i]
//        IN commitIndex' = [commitIndex EXCEPT ![i] = newCommitIndex]
//     /\ UNCHANGED <<messages, serverVars, candidateVars, leaderVars, log>>
//
/**
 * Try to move the node's commitIndex forward.
 *
 * Walks the log from the last stored index down to commitIndex + 1 and selects
 * the first index for which syncAgree() reports quorum agreement AND whose
 * entry carries the current term. When such an index is found:
 *   - pSyncNode->commitIndex is set to it,
 *   - the log store is told through updateCommitIndex(),
 *   - if an FSM is attached, syncNodeCommit() is called for the range
 *     [old commitIndex + 1, new commitIndex].
 *
 * Entries obtained from getEntry() are released here with syncEntryDestory().
 *
 * @param pSyncNode  node whose commit index may be advanced; must not be NULL.
 */
void syncMaybeAdvanceCommitIndex(SSyncNode* pSyncNode) {
  syncIndexMgrLog2("==syncNodeMaybeAdvanceCommitIndex== pNextIndex", pSyncNode->pNextIndex);
  syncIndexMgrLog2("==syncNodeMaybeAdvanceCommitIndex== pMatchIndex", pSyncNode->pMatchIndex);

  // Search for the highest index that is both quorum-agreed and from the current term.
  SyncIndex newCommitIndex = pSyncNode->commitIndex;
  for (SyncIndex index = pSyncNode->pLogStore->getLastIndex(pSyncNode->pLogStore); index > pSyncNode->commitIndex;
       --index) {
    bool agree = syncAgree(pSyncNode, index);
    // Index/term values are cast to (unsigned) long long so the format specifier
    // matches the argument on every platform, whatever the underlying typedef is.
    sTrace("syncMaybeAdvanceCommitIndex syncAgree:%d, index:%lld, pSyncNode->commitIndex:%lld", agree,
           (long long)index, (long long)pSyncNode->commitIndex);
    if (!agree) {
      continue;
    }

    SSyncRaftEntry* pEntry = pSyncNode->pLogStore->getEntry(pSyncNode->pLogStore, index);
    assert(pEntry != NULL);

    // Quorum agreement alone is not enough: the entry must also belong to the current term.
    bool termMatches = (pEntry->term == pSyncNode->pRaftStore->currentTerm);
    if (termMatches) {
      newCommitIndex = index;
      sTrace("syncMaybeAdvanceCommitIndex maybe to update, newCommitIndex:%lld commit, pSyncNode->commitIndex:%lld",
             (long long)newCommitIndex, (long long)pSyncNode->commitIndex);
    } else {
      sTrace(
          "syncMaybeAdvanceCommitIndex can not commit due to term not equal, pEntry->term:%llu, "
          "pSyncNode->pRaftStore->currentTerm:%llu",
          (unsigned long long)pEntry->term, (unsigned long long)pSyncNode->pRaftStore->currentTerm);
    }

    // Single release point for the entry, after its fields have been read.
    syncEntryDestory(pEntry);

    if (termMatches) {
      break;
    }
  }

  // Nothing newly committable.
  if (newCommitIndex <= pSyncNode->commitIndex) {
    return;
  }

  // Capture the range before commitIndex is overwritten.
  SyncIndex beginIndex = pSyncNode->commitIndex + 1;
  SyncIndex endIndex = newCommitIndex;

  sTrace("syncMaybeAdvanceCommitIndex sync commit %lld", (long long)newCommitIndex);

  // update commit index
  pSyncNode->commitIndex = newCommitIndex;

  // call back Wal
  pSyncNode->pLogStore->updateCommitIndex(pSyncNode->pLogStore, pSyncNode->commitIndex);

  // execute fsm
  if (pSyncNode->pFsm != NULL) {
    int32_t code = syncNodeCommit(pSyncNode, beginIndex, endIndex, pSyncNode->state);
    ASSERT(code == 0);
  }
}

/**
 * Report whether the replica identified by pRaftId agrees on `index`.
 *
 * The replica agrees when it is this node itself while this node is in the
 * leader state, or when the match index recorded for it in pMatchIndex is at
 * least `index`.
 *
 * @param pSyncNode  local sync node.
 * @param pRaftId    replica being checked.
 * @param index      log index under consideration.
 * @return true if the replica agrees, false otherwise.
 */
bool syncAgreeIndex(SSyncNode* pSyncNode, SRaftId* pRaftId, SyncIndex index) {
  // The leader always agrees with its own log.
  const bool selfIsLeader =
      syncUtilSameId(pRaftId, &(pSyncNode->myRaftId)) && pSyncNode->state == TAOS_SYNC_STATE_LEADER;
  if (selfIsLeader) {
    return true;
  }

  // Otherwise agreement is decided by the recorded match index.
  return syncIndexMgrGetIndex(pSyncNode->pMatchIndex, pRaftId) >= index;
}

/**
 * Report whether a quorum of replicas agrees on `index`.
 *
 * Counts replicas for which syncAgreeIndex() returns true and stops as soon as
 * the running count reaches pSyncNode->quorum.
 *
 * @param pSyncNode  local sync node (provides replicasId, replicaNum, quorum).
 * @param index      log index under consideration.
 * @return true once the count reaches the quorum, false if all replicas were
 *         visited without reaching it.
 */
bool syncAgree(SSyncNode* pSyncNode, SyncIndex index) {
  int votes = 0;
  int replica = 0;

  while (replica < pSyncNode->replicaNum) {
    votes += syncAgreeIndex(pSyncNode, &(pSyncNode->replicasId[replica]), index) ? 1 : 0;

    // Checked on every iteration so the scan ends early once quorum is met.
    if (votes >= pSyncNode->quorum) {
      return true;
    }
    ++replica;
  }

  return false;
}