mndSync.c 12.9 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
/*
 * Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
 *
 * This program is free software: you can use, redistribute, and/or modify
 * it under the terms of the GNU Affero General Public License, version 3
 * or later ("AGPL"), as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#define _DEFAULT_SOURCE
S
Shengliang Guan 已提交
17
#include "mndSync.h"
18
#include "mndTrans.h"
19

20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
/*
 * Put a sync control message onto SYNC_CTRL_QUEUE.
 *
 * The contLen and vgId fields of the message head are run through htonl()
 * before the message is queued.
 *
 * Returns -1 if pMsg or pMsg->pCont is NULL, or if msgcb has no putToQueueFp
 * callback; otherwise returns the result of tmsgPutToQueue(). On every failure
 * after the initial NULL check, pMsg->pCont is released with rpcFreeCont() and
 * reset to NULL.
 */
static int32_t mndSyncEqCtrlMsg(const SMsgCb *msgcb, SRpcMsg *pMsg) {
  if (pMsg == NULL || pMsg->pCont == NULL) return -1;

  SMsgHead *pMsgHead = pMsg->pCont;
  pMsgHead->contLen = htonl(pMsgHead->contLen);
  pMsgHead->vgId = htonl(pMsgHead->vgId);

  // Stays -1 when there is no queue callback to hand the message to.
  int32_t code = -1;
  if (msgcb != NULL && msgcb->putToQueueFp != NULL) {
    code = tmsgPutToQueue(msgcb, SYNC_CTRL_QUEUE, pMsg);
  }

  if (code != 0) {
    rpcFreeCont(pMsg->pCont);
    pMsg->pCont = NULL;
  }
  return code;
}

43
/*
 * Put a sync message onto SYNC_QUEUE.
 *
 * The contLen and vgId fields of the message head are run through htonl()
 * before the message is queued.
 *
 * Returns -1 if pMsg or pMsg->pCont is NULL, or if msgcb has no putToQueueFp
 * callback; otherwise returns the result of tmsgPutToQueue(). On every failure
 * after the initial NULL check, pMsg->pCont is released with rpcFreeCont() and
 * reset to NULL.
 */
static int32_t mndSyncEqMsg(const SMsgCb *msgcb, SRpcMsg *pMsg) {
  int32_t ret = -1;

  if (pMsg == NULL || pMsg->pCont == NULL) {
    return ret;
  }

  SMsgHead *pMsgHead = pMsg->pCont;
  pMsgHead->contLen = htonl(pMsgHead->contLen);
  pMsgHead->vgId = htonl(pMsgHead->vgId);

  if (msgcb == NULL || msgcb->putToQueueFp == NULL) {
    goto _release;
  }

  ret = tmsgPutToQueue(msgcb, SYNC_QUEUE, pMsg);
  if (ret == 0) {
    return 0;
  }

_release:
  // The message was not queued, so its content buffer is released here.
  rpcFreeCont(pMsg->pCont);
  pMsg->pCont = NULL;
  return ret;
}
M
Minghao Li 已提交
65

66 67 68 69 70 71 72 73
/*
 * Send a sync message to the given endpoint set through tmsgSendReq().
 *
 * Returns the tmsgSendReq() result. When that result is non-zero, pMsg->pCont
 * is released with rpcFreeCont() and reset to NULL.
 */
static int32_t mndSyncSendMsg(const SEpSet *pEpSet, SRpcMsg *pMsg) {
  const int32_t ret = tmsgSendReq(pEpSet, pMsg);
  if (ret == 0) return 0;

  rpcFreeCont(pMsg->pCont);
  pMsg->pCont = NULL;
  return ret;
}
M
Minghao Li 已提交
74

75
/*
 * Handle a committed write message.
 *
 * pMsg->pCont is treated as an SSdbRaw. pMeta->code is stored in
 * syncMgmt.errCode; when it is 0 the raw record is written to the sdb with
 * sdbWriteWithoutFree() and the apply info is updated from pMeta.
 *
 * Then, depending on the transaction id extracted from the raw record:
 *   - id <= 0:                   an error is logged;
 *   - id == syncMgmt.transId:    syncMgmt.transId is cleared and syncSem is
 *                                posted (both while holding syncMgmt.lock);
 *   - any other id:              the transaction is acquired, executed and
 *                                released, outside of syncMgmt.lock.
 *
 * Always returns 0. pMsg->pCont is not freed here.
 */
int32_t mndProcessWriteMsg(const SSyncFSM *pFsm, SRpcMsg *pMsg, const SFsmCbMeta *pMeta) {
  SMnode    *pMnode = pFsm->data;
  SSyncMgmt *pSync = &pMnode->syncMgmt;
  SSdbRaw   *pRaw = pMsg->pCont;

  const int32_t transId = sdbGetIdFromRaw(pMnode->pSdb, pRaw);
  pSync->errCode = pMeta->code;
  mInfo("trans:%d, is proposed, saved:%d code:0x%x, apply index:%" PRId64 " term:%" PRIu64 " config:%" PRId64
        " role:%s raw:%p",
        transId, pSync->transId, pMeta->code, pMeta->index, pMeta->term, pMeta->lastConfigIndex, syncStr(pMeta->state),
        pRaw);

  if (pSync->errCode == 0) {
    sdbWriteWithoutFree(pMnode->pSdb, pRaw);
    sdbSetApplyInfo(pMnode->pSdb, pMeta->index, pMeta->term, pMeta->lastConfigIndex);
  }

  taosWLockLatch(&pSync->lock);

  if (transId <= 0) {
    taosWUnLockLatch(&pSync->lock);
    mError("trans:%d, invalid commit msg", transId);
    return 0;
  }

  if (transId == pSync->transId) {
    // This is the transaction recorded in syncMgmt.transId: clear it and post the semaphore.
    if (pSync->errCode == 0) {
      mInfo("trans:%d, is proposed and post sem", transId);
    } else {
      mError("trans:%d, failed to propose since %s, post sem", transId, tstrerror(pSync->errCode));
    }
    pSync->transId = 0;
    tsem_post(&pSync->syncSem);
    taosWUnLockLatch(&pSync->lock);
    return 0;
  }

  // Some other transaction: release the latch first, then execute it.
  taosWUnLockLatch(&pSync->lock);

  STrans *pTrans = mndAcquireTrans(pMnode, transId);
  if (pTrans == NULL) {
    mError("trans:%d, not found while execute in mnode since %s", transId, terrstr());
    return 0;
  }

  mInfo("trans:%d, execute in mnode which not leader", transId);
  mndTransExecute(pMnode, pTrans);
  mndReleaseTrans(pMnode, pTrans);
  return 0;
}

121
/*
 * Commit callback installed on the sync FSM.
 *
 * Messages whose type passes syncUtilUserCommit() are forwarded to
 * mndProcessWriteMsg(); all others are ignored and yield 0. In both cases
 * pMsg->pCont is released with rpcFreeCont() and reset to NULL before
 * returning.
 */
int32_t mndSyncCommitMsg(const SSyncFSM *pFsm, SRpcMsg *pMsg, const SFsmCbMeta *pMeta) {
  int32_t code = 0;

  if (syncUtilUserCommit(pMsg->msgType)) {
    code = mndProcessWriteMsg(pFsm, pMsg, pMeta);
  }

  rpcFreeCont(pMsg->pCont);
  pMsg->pCont = NULL;
  return code;
}

S
Shengliang Guan 已提交
134
/*
 * Start reading an sdb snapshot and fill pSnapshot with the apply index,
 * apply term and config index reported by sdbStartRead().
 *
 * pReaderParam is not used. The reader handle is stored in *ppReader.
 * Returns the sdbStartRead() result.
 */
int32_t mndSyncGetSnapshot(const SSyncFSM *pFsm, SSnapshot *pSnapshot, void *pReaderParam, void **ppReader) {
  (void)pReaderParam;

  mInfo("start to read snapshot from sdb in atomic way");
  SMnode *pMnode = pFsm->data;
  return sdbStartRead(pMnode->pSdb, (SSdbIter **)ppReader, &pSnapshot->lastApplyIndex, &pSnapshot->lastApplyTerm,
                      &pSnapshot->lastConfigIndex);
}

S
Shengliang Guan 已提交
142
/*
 * Fill pSnapshot's lastApplyIndex, lastApplyTerm and lastConfigIndex from
 * sdbGetCommitInfo(). Always returns 0.
 */
int32_t mndSyncGetSnapshotInfo(const SSyncFSM *pFsm, SSnapshot *pSnapshot) {
  SMnode *pNode = pFsm->data;

  sdbGetCommitInfo(pNode->pSdb,
                   &pSnapshot->lastApplyIndex,
                   &pSnapshot->lastApplyTerm,
                   &pSnapshot->lastConfigIndex);

  return 0;
}

S
Shengliang Guan 已提交
148
void mndRestoreFinish(const SSyncFSM *pFsm) {
149
  SMnode *pMnode = pFsm->data;
S
Shengliang Guan 已提交
150

S
Shengliang Guan 已提交
151
  if (!pMnode->deploy) {
152 153 154 155 156 157 158
    if (pMnode->restored) {
      mInfo("vgId:1, sync restore finished, and will handle outstanding transactions");
      mndTransPullup(pMnode);
      mndSetRestored(pMnode, true);
    } else {
      mInfo("vgId:1, sync restore finished, repeat call");
    }
S
Shengliang Guan 已提交
159
  } else {
160
    mInfo("vgId:1, sync restore finished");
S
Shengliang Guan 已提交
161
  }
162 163
}

S
Shengliang Guan 已提交
164
int32_t mndSnapshotStartRead(const SSyncFSM *pFsm, void *pParam, void **ppReader) {
165
  mInfo("start to read snapshot from sdb");
S
Shengliang Guan 已提交
166
  SMnode *pMnode = pFsm->data;
167
  return sdbStartRead(pMnode->pSdb, (SSdbIter **)ppReader, NULL, NULL, NULL);
S
Shengliang Guan 已提交
168 169
}

S
Shengliang Guan 已提交
170
int32_t mndSnapshotStopRead(const SSyncFSM *pFsm, void *pReader) {
171
  mInfo("stop to read snapshot from sdb");
S
Shengliang Guan 已提交
172 173 174 175
  SMnode *pMnode = pFsm->data;
  return sdbStopRead(pMnode->pSdb, pReader);
}

S
Shengliang Guan 已提交
176
int32_t mndSnapshotDoRead(const SSyncFSM *pFsm, void *pReader, void **ppBuf, int32_t *len) {
S
Shengliang Guan 已提交
177 178 179 180
  SMnode *pMnode = pFsm->data;
  return sdbDoRead(pMnode->pSdb, pReader, ppBuf, len);
}

S
Shengliang Guan 已提交
181
int32_t mndSnapshotStartWrite(const SSyncFSM *pFsm, void *pParam, void **ppWriter) {
S
Shengliang Guan 已提交
182 183 184 185 186
  mInfo("start to apply snapshot to sdb");
  SMnode *pMnode = pFsm->data;
  return sdbStartWrite(pMnode->pSdb, (SSdbIter **)ppWriter);
}

S
Shengliang Guan 已提交
187
/*
 * Close an sdb snapshot writer, forwarding the isApply flag and the apply
 * index, apply term and config index taken from pSnapshot to sdbStopWrite().
 * Returns the sdbStopWrite() result.
 */
int32_t mndSnapshotStopWrite(const SSyncFSM *pFsm, void *pWriter, bool isApply, SSnapshot *pSnapshot) {
  SMnode *pNode = pFsm->data;

  mInfo("stop to apply snapshot to sdb, apply:%d, index:%" PRId64 " term:%" PRIu64 " config:%" PRId64, isApply,
        pSnapshot->lastApplyIndex, pSnapshot->lastApplyTerm, pSnapshot->lastConfigIndex);

  return sdbStopWrite(pNode->pSdb, pWriter, isApply,
                      pSnapshot->lastApplyIndex,
                      pSnapshot->lastApplyTerm,
                      pSnapshot->lastConfigIndex);
}

S
Shengliang Guan 已提交
195
int32_t mndSnapshotDoWrite(const SSyncFSM *pFsm, void *pWriter, void *pBuf, int32_t len) {
S
Shengliang Guan 已提交
196 197 198 199
  SMnode *pMnode = pFsm->data;
  return sdbDoWrite(pMnode->pSdb, pWriter, pBuf, len);
}

S
Shengliang Guan 已提交
200
static void mndBecomeFollower(const SSyncFSM *pFsm) {
201
  SMnode *pMnode = pFsm->data;
202
  mInfo("vgId:1, become follower");
203

204
  taosWLockLatch(&pMnode->syncMgmt.lock);
205
  if (pMnode->syncMgmt.transId != 0) {
206 207
    mInfo("vgId:1, become follower and post sem, trans:%d, failed to propose since not leader",
          pMnode->syncMgmt.transId);
208
    pMnode->syncMgmt.transId = 0;
209
    pMnode->syncMgmt.errCode = TSDB_CODE_SYN_NOT_LEADER;
210 211
    tsem_post(&pMnode->syncMgmt.syncSem);
  }
212
  taosWUnLockLatch(&pMnode->syncMgmt.lock);
213 214
}

S
Shengliang Guan 已提交
215
/*
 * Become-leader callback installed on the sync FSM. It only writes a log
 * line; the FSM handle is not used.
 */
static void mndBecomeLeader(const SSyncFSM *pFsm) {
  (void)pFsm;
  mInfo("vgId:1, become leader");
}

220 221 222
static bool mndApplyQueueEmpty(const SSyncFSM *pFsm) {
  SMnode *pMnode = pFsm->data;

223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239
  if (pMnode != NULL && pMnode->msgCb.qsizeFp != NULL) {
    int32_t itemSize = tmsgGetQueueSize(&pMnode->msgCb, 1, APPLY_QUEUE);
    return (itemSize == 0);
  } else {
    return true;
  }
}

/*
 * Return the number of items tmsgGetQueueSize() reports for APPLY_QUEUE, or
 * -1 when the mnode or its qsizeFp callback is unavailable.
 */
static int32_t mndApplyQueueItems(const SSyncFSM *pFsm) {
  SMnode *pMnode = pFsm->data;

  if (pMnode == NULL || pMnode->msgCb.qsizeFp == NULL) {
    return -1;
  }

  return tmsgGetQueueSize(&pMnode->msgCb, 1, APPLY_QUEUE);
}

242 243
/*
 * Allocate an SSyncFSM and wire it to the mnode's sync callbacks.
 *
 * pFsm->data is set to pMnode. Callbacks this module does not implement
 * (pre-commit, rollback, leader transfer, reconfig) are set to NULL.
 *
 * Returns the new FSM, or NULL if the allocation fails.
 */
SSyncFSM *mndSyncMakeFsm(SMnode *pMnode) {
  SSyncFSM *pFsm = taosMemoryCalloc(1, sizeof(SSyncFSM));
  if (pFsm == NULL) {
    // Do not dereference a failed allocation; let the caller handle it.
    return NULL;
  }

  pFsm->data = pMnode;
  pFsm->FpCommitCb = mndSyncCommitMsg;
  pFsm->FpPreCommitCb = NULL;
  pFsm->FpRollBackCb = NULL;
  pFsm->FpRestoreFinishCb = mndRestoreFinish;
  pFsm->FpLeaderTransferCb = NULL;
  pFsm->FpApplyQueueEmptyCb = mndApplyQueueEmpty;
  pFsm->FpApplyQueueItems = mndApplyQueueItems;
  pFsm->FpReConfigCb = NULL;
  pFsm->FpBecomeLeaderCb = mndBecomeLeader;
  pFsm->FpBecomeFollowerCb = mndBecomeFollower;
  pFsm->FpGetSnapshot = mndSyncGetSnapshot;
  pFsm->FpGetSnapshotInfo = mndSyncGetSnapshotInfo;
  pFsm->FpSnapshotStartRead = mndSnapshotStartRead;
  pFsm->FpSnapshotStopRead = mndSnapshotStopRead;
  pFsm->FpSnapshotDoRead = mndSnapshotDoRead;
  pFsm->FpSnapshotStartWrite = mndSnapshotStartWrite;
  pFsm->FpSnapshotStopWrite = mndSnapshotStopWrite;
  pFsm->FpSnapshotDoWrite = mndSnapshotDoWrite;
  return pFsm;
}

/*
 * Initialise the mnode's sync management state and open the sync instance.
 *
 * The latch and pending transaction id in syncMgmt are reset, an SSyncInfo is
 * built (vgId 1, sync directory "<pMnode->path><TD_DIRSEP>sync", replica list
 * copied from syncMgmt), the semaphore is initialised and syncOpen() is
 * called. The handle returned by syncOpen() is stored in syncMgmt.sync.
 *
 * Returns 0 on success, -1 if the FSM cannot be created or syncOpen() yields a
 * non-positive handle.
 */
int32_t mndInitSync(SMnode *pMnode) {
  SSyncMgmt *pMgmt = &pMnode->syncMgmt;
  taosInitRWLatch(&pMgmt->lock);
  pMgmt->transId = 0;

  SSyncInfo syncInfo = {
      .snapshotStrategy = SYNC_STRATEGY_STANDARD_SNAPSHOT,
      .batchSize = 1,
      .vgId = 1,
      .pWal = pMnode->pWal,
      .msgcb = &pMnode->msgCb,
      .syncSendMSg = mndSyncSendMsg,
      .syncEqMsg = mndSyncEqMsg,
      .syncEqCtrlMsg = mndSyncEqCtrlMsg,
      .pingMs = 5000,
      .electMs = 3000,
      .heartbeatMs = 500,
  };

  snprintf(syncInfo.path, sizeof(syncInfo.path), "%s%ssync", pMnode->path, TD_DIRSEP);
  syncInfo.pFsm = mndSyncMakeFsm(pMnode);
  if (syncInfo.pFsm == NULL) {
    // Never hand a missing FSM to syncOpen().
    mError("vgId:1, failed to create sync fsm");
    return -1;
  }

  mInfo("vgId:1, start to open sync, replica:%d selfIndex:%d", pMgmt->numOfReplicas, pMgmt->selfIndex);
  SSyncCfg *pCfg = &syncInfo.syncCfg;
  pCfg->replicaNum = pMgmt->numOfReplicas;
  pCfg->myIndex = pMgmt->selfIndex;
  for (int32_t i = 0; i < pMgmt->numOfReplicas; ++i) {
    SNodeInfo *pNode = &pCfg->nodeInfo[i];
    tstrncpy(pNode->nodeFqdn, pMgmt->replicas[i].fqdn, sizeof(pNode->nodeFqdn));
    pNode->nodePort = pMgmt->replicas[i].port;
    mInfo("vgId:1, index:%d ep:%s:%u", i, pNode->nodeFqdn, pNode->nodePort);
  }

  tsem_init(&pMgmt->syncSem, 0, 0);
  pMgmt->sync = syncOpen(&syncInfo);
  if (pMgmt->sync <= 0) {
    mError("failed to open sync since %s", terrstr());
    return -1;
  }

  mInfo("mnode-sync is opened, id:%" PRId64, pMgmt->sync);
  return 0;
}

/*
 * Stop the sync instance, destroy the semaphore and zero the whole SSyncMgmt
 * structure embedded in pMnode.
 */
void mndCleanupSync(SMnode *pMnode) {
  SSyncMgmt *pSync = &pMnode->syncMgmt;

  syncStop(pSync->sync);
  mInfo("mnode-sync is stopped, id:%" PRId64, pSync->sync);

  tsem_destroy(&pSync->syncSem);
  memset(pSync, 0, sizeof(*pSync));
}
M
Minghao Li 已提交
318

S
Shengliang Guan 已提交
319
/*
 * Propose a raw sdb record through the sync module on behalf of transaction
 * transId.
 *
 * A copy of pRaw is placed in an rpc buffer and handed to syncPropose(). Only
 * one proposal may be pending at a time: if syncMgmt.transId is already
 * non-zero the call fails with terrno = TSDB_CODE_MND_LAST_TRANS_NOT_FINISHED.
 *
 * syncPropose() result handling:
 *   == 0  the call blocks on syncSem until some other path posts it, then
 *         returns syncMgmt.errCode (also stored in terrno);
 *   >  0  the record is written to the sdb directly and 0 is the outcome;
 *   <  0  the pending transaction id is cleared and the code is returned,
 *         with terrno forced to TSDB_CODE_APP_ERROR if it was still 0.
 *
 * The rpc buffer is released on every path after it has been allocated.
 * pRaw itself is not freed here.
 */
int32_t mndSyncPropose(SMnode *pMnode, SSdbRaw *pRaw, int32_t transId) {
  SSyncMgmt *pMgmt = &pMnode->syncMgmt;
  pMgmt->errCode = 0;

  SRpcMsg req = {.msgType = TDMT_MND_APPLY_MSG, .contLen = sdbGetRawTotalSize(pRaw)};
  if (req.contLen <= 0) return -1;

  req.pCont = rpcMallocCont(req.contLen);
  if (req.pCont == NULL) return -1;
  memcpy(req.pCont, pRaw, req.contLen);

  taosWLockLatch(&pMgmt->lock);
  if (pMgmt->transId != 0) {
    mError("trans:%d, can't be proposed since trans:%d already waiting for confirm", transId, pMgmt->transId);
    taosWUnLockLatch(&pMgmt->lock);
    // The request buffer was already allocated above; release it before bailing out.
    rpcFreeCont(req.pCont);
    req.pCont = NULL;
    terrno = TSDB_CODE_MND_LAST_TRANS_NOT_FINISHED;
    return -1;
  }

  mInfo("trans:%d, will be proposed", transId);
  pMgmt->transId = transId;
  taosWUnLockLatch(&pMgmt->lock);

  int32_t code = syncPropose(pMgmt->sync, &req, false);
  if (code == 0) {
    // Log the local id: pMgmt->transId may be cleared concurrently once the proposal is in flight.
    mInfo("trans:%d, is proposing and wait sem", transId);
    tsem_wait(&pMgmt->syncSem);
  } else if (code > 0) {
    mInfo("trans:%d, confirm at once since replica is 1, continue execute", transId);
    taosWLockLatch(&pMgmt->lock);
    pMgmt->transId = 0;
    taosWUnLockLatch(&pMgmt->lock);
    sdbWriteWithoutFree(pMnode->pSdb, pRaw);
    sdbSetApplyInfo(pMnode->pSdb, req.info.conn.applyIndex, req.info.conn.applyTerm, SYNC_INDEX_INVALID);
    code = 0;
  } else {
    mError("trans:%d, failed to proposed since %s", transId, terrstr());
    taosWLockLatch(&pMgmt->lock);
    pMgmt->transId = 0;
    taosWUnLockLatch(&pMgmt->lock);
    if (terrno == 0) {
      terrno = TSDB_CODE_APP_ERROR;
    }
  }

  rpcFreeCont(req.pCont);
  req.pCont = NULL;
  if (code != 0) {
    // pMgmt->transId has been reset to 0 by now, so report the caller's id.
    mError("trans:%d, failed to propose, code:0x%x", transId, code);
    return code;
  }

  terrno = pMgmt->errCode;
  return pMgmt->errCode;
}

375
/*
 * Start the sync instance held in syncMgmt.sync and log the outcome.
 * A negative syncStart() result is logged as an error; nothing is returned to
 * the caller in either case.
 */
void mndSyncStart(SMnode *pMnode) {
  SSyncMgmt *pSync = &pMnode->syncMgmt;

  if (syncStart(pSync->sync) >= 0) {
    mInfo("vgId:1, sync started, id:%" PRId64, pSync->sync);
  } else {
    mError("vgId:1, failed to start sync, id:%" PRId64, pSync->sync);
  }
}

S
Shengliang Guan 已提交
384
/*
 * If a transaction id is currently recorded in syncMgmt.transId, clear it and
 * post syncSem. Performed while holding syncMgmt.lock.
 */
void mndSyncStop(SMnode *pMnode) {
  SSyncMgmt *pSync = &pMnode->syncMgmt;

  taosWLockLatch(&pSync->lock);
  if (pSync->transId != 0) {
    mInfo("vgId:1, is stopped and post sem, trans:%d", pSync->transId);
    pSync->transId = 0;
    tsem_post(&pSync->syncSem);
  }
  taosWUnLockLatch(&pSync->lock);
}
393

394
/*
 * Report whether this mnode is a fully restored leader.
 *
 * terrno is cleared before syncGetState() is called. The function returns
 * false when:
 *   - terrno is non-zero after syncGetState() (terrno is left as set);
 *   - the sync state is not TAOS_SYNC_STATE_LEADER
 *     (terrno = TSDB_CODE_SYN_NOT_LEADER);
 *   - either the sync state or the mnode is not restored
 *     (terrno = TSDB_CODE_SYN_RESTORING).
 * Otherwise it returns true with terrno left at 0.
 */
bool mndIsLeader(SMnode *pMnode) {
  terrno = 0;
  SSyncState syncState = syncGetState(pMnode->syncMgmt.sync);

  if (terrno != 0) {
    mDebug("vgId:1, mnode is stopping");
    return false;
  }

  if (syncState.state != TAOS_SYNC_STATE_LEADER) {
    terrno = TSDB_CODE_SYN_NOT_LEADER;
    mDebug("vgId:1, mnode not leader, state:%s", syncStr(syncState.state));
    return false;
  }

  if (!(syncState.restored && pMnode->restored)) {
    terrno = TSDB_CODE_SYN_RESTORING;
    mDebug("vgId:1, mnode not restored:%d:%d", syncState.restored, pMnode->restored);
    return false;
  }

  return true;
}