syncSnapshot.c 27.5 KB
Newer Older
M
Minghao Li 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/*
 * Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
 *
 * This program is free software: you can use, redistribute, and/or modify
 * it under the terms of the GNU Affero General Public License, version 3
 * or later ("AGPL"), as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

16
#define _DEFAULT_SOURCE
M
Minghao Li 已提交
17
#include "syncSnapshot.h"
18
#include "syncIndexMgr.h"
19
#include "syncRaftCfg.h"
M
Minghao Li 已提交
20
#include "syncRaftLog.h"
M
Minghao Li 已提交
21 22
#include "syncRaftStore.h"
#include "syncUtil.h"
M
Minghao Li 已提交
23

M
Minghao Li 已提交
24
SSyncSnapshotSender *snapshotSenderCreate(SSyncNode *pSyncNode, int32_t replicaIndex) {
M
Minghao Li 已提交
25 26
  bool condition = (pSyncNode->pFsm->FpSnapshotStartRead != NULL) && (pSyncNode->pFsm->FpSnapshotStopRead != NULL) &&
                   (pSyncNode->pFsm->FpSnapshotDoRead != NULL);
M
Minghao Li 已提交
27

M
Minghao Li 已提交
28 29
  SSyncSnapshotSender *pSender = NULL;
  if (condition) {
30
    pSender = taosMemoryCalloc(1, sizeof(SSyncSnapshotSender));
31
    if (pSender == NULL) {
32 33
      terrno = TSDB_CODE_OUT_OF_MEMORY;
      return NULL;
34
    }
M
Minghao Li 已提交
35 36 37 38 39 40 41 42 43 44 45

    pSender->start = false;
    pSender->seq = SYNC_SNAPSHOT_SEQ_INVALID;
    pSender->ack = SYNC_SNAPSHOT_SEQ_INVALID;
    pSender->pReader = NULL;
    pSender->pCurrentBlock = NULL;
    pSender->blockLen = 0;
    pSender->sendingMS = SYNC_SNAPSHOT_RETRY_MS;
    pSender->pSyncNode = pSyncNode;
    pSender->replicaIndex = replicaIndex;
    pSender->term = pSyncNode->pRaftStore->currentTerm;
M
Minghao Li 已提交
46
    pSender->startTime = 0;
47
    pSender->pSyncNode->pFsm->FpGetSnapshotInfo(pSender->pSyncNode->pFsm, &(pSender->snapshot));
48
    pSender->finish = false;
M
Minghao Li 已提交
49
  } else {
50
    sError("vgId:%d, cannot create snapshot sender", pSyncNode->vgId);
M
Minghao Li 已提交
51
  }
52

M
Minghao Li 已提交
53 54
  return pSender;
}
M
Minghao Li 已提交
55

M
Minghao Li 已提交
56 57
void snapshotSenderDestroy(SSyncSnapshotSender *pSender) {
  if (pSender != NULL) {
58
    // free current block
M
Minghao Li 已提交
59 60
    if (pSender->pCurrentBlock != NULL) {
      taosMemoryFree(pSender->pCurrentBlock);
61
      pSender->pCurrentBlock = NULL;
M
Minghao Li 已提交
62
    }
63 64 65 66

    // close reader
    if (pSender->pReader != NULL) {
      int32_t ret = pSender->pSyncNode->pFsm->FpSnapshotStopRead(pSender->pSyncNode->pFsm, pSender->pReader);
67
      if (ret != 0) {
S
Shengliang Guan 已提交
68
        sNError(pSender->pSyncNode, "stop reader error");
69
      }
70 71 72 73
      pSender->pReader = NULL;
    }

    // free sender
M
Minghao Li 已提交
74 75 76
    taosMemoryFree(pSender);
  }
}
M
Minghao Li 已提交
77

M
Minghao Li 已提交
78 79
bool snapshotSenderIsStart(SSyncSnapshotSender *pSender) { return pSender->start; }

M
Minghao Li 已提交
80
int32_t snapshotSenderStart(SSyncSnapshotSender *pSender) {
M
Minghao Li 已提交
81
  ASSERT(!snapshotSenderIsStart(pSender));
M
Minghao Li 已提交
82

83 84 85
  pSender->start = true;
  pSender->seq = SYNC_SNAPSHOT_SEQ_BEGIN;
  pSender->ack = SYNC_SNAPSHOT_SEQ_INVALID;
M
Minghao Li 已提交
86 87 88
  pSender->pReader = NULL;
  pSender->pCurrentBlock = NULL;
  pSender->blockLen = 0;
89

M
Minghao Li 已提交
90 91
  pSender->snapshotParam.start = SYNC_INDEX_INVALID;
  pSender->snapshotParam.end = SYNC_INDEX_INVALID;
92

M
Minghao Li 已提交
93 94 95 96 97
  pSender->snapshot.data = NULL;
  pSender->snapshotParam.end = SYNC_INDEX_INVALID;
  pSender->snapshot.lastApplyIndex = SYNC_INDEX_INVALID;
  pSender->snapshot.lastApplyTerm = SYNC_TERM_INVALID;
  pSender->snapshot.lastConfigIndex = SYNC_INDEX_INVALID;
98

M
Minghao Li 已提交
99 100 101 102 103
  memset(&(pSender->lastConfig), 0, sizeof(pSender->lastConfig));
  pSender->sendingMS = 0;
  pSender->term = pSender->pSyncNode->pRaftStore->currentTerm;
  pSender->startTime = taosGetTimestampMs();
  pSender->finish = false;
M
Minghao Li 已提交
104

M
Minghao Li 已提交
105
  // build begin msg
M
Minghao Li 已提交
106 107 108 109
  SyncSnapshotSend *pMsg = syncSnapshotSendBuild(0, pSender->pSyncNode->vgId);
  pMsg->srcId = pSender->pSyncNode->myRaftId;
  pMsg->destId = (pSender->pSyncNode->replicasId)[pSender->replicaIndex];
  pMsg->term = pSender->pSyncNode->pRaftStore->currentTerm;
110
  pMsg->beginIndex = pSender->snapshotParam.start;
M
Minghao Li 已提交
111 112
  pMsg->lastIndex = pSender->snapshot.lastApplyIndex;
  pMsg->lastTerm = pSender->snapshot.lastApplyTerm;
113 114
  pMsg->lastConfigIndex = pSender->snapshot.lastConfigIndex;
  pMsg->lastConfig = pSender->lastConfig;
M
Minghao Li 已提交
115 116
  pMsg->startTime = pSender->startTime;
  pMsg->seq = SYNC_SNAPSHOT_SEQ_PRE_SNAPSHOT;
M
Minghao Li 已提交
117

M
Minghao Li 已提交
118
  // send msg
M
Minghao Li 已提交
119 120 121 122
  SRpcMsg rpcMsg;
  syncSnapshotSend2RpcMsg(pMsg, &rpcMsg);
  syncNodeSendMsgById(&(pMsg->destId), pSender->pSyncNode, &rpcMsg);
  syncSnapshotSendDestroy(pMsg);
123 124

  // event log
S
Shengliang Guan 已提交
125
  sSTrace(pSender, "snapshot sender start");
126
  return 0;
M
Minghao Li 已提交
127 128
}

129
int32_t snapshotSenderStop(SSyncSnapshotSender *pSender, bool finish) {
M
Minghao Li 已提交
130 131 132 133
  // update flag
  pSender->start = false;
  pSender->finish = finish;

134
  // close reader
M
Minghao Li 已提交
135 136 137 138 139
  if (pSender->pReader != NULL) {
    int32_t ret = pSender->pSyncNode->pFsm->FpSnapshotStopRead(pSender->pSyncNode->pFsm, pSender->pReader);
    ASSERT(ret == 0);
    pSender->pReader = NULL;
  }
M
Minghao Li 已提交
140

141
  // free current block
M
Minghao Li 已提交
142 143
  if (pSender->pCurrentBlock != NULL) {
    taosMemoryFree(pSender->pCurrentBlock);
M
Minghao Li 已提交
144
    pSender->pCurrentBlock = NULL;
M
Minghao Li 已提交
145 146
    pSender->blockLen = 0;
  }
M
Minghao Li 已提交
147

148
  // event log
S
Shengliang Guan 已提交
149
  sSTrace(pSender, "snapshot sender stop");
150
  return 0;
M
Minghao Li 已提交
151 152
}

153
// when sender receive ack, call this function to send msg from seq
M
Minghao Li 已提交
154
// seq = ack + 1, already updated
M
Minghao Li 已提交
155
int32_t snapshotSend(SSyncSnapshotSender *pSender) {
156
  // free memory last time (current seq - 1)
M
Minghao Li 已提交
157 158
  if (pSender->pCurrentBlock != NULL) {
    taosMemoryFree(pSender->pCurrentBlock);
M
Minghao Li 已提交
159
    pSender->pCurrentBlock = NULL;
M
Minghao Li 已提交
160 161 162 163 164 165 166
    pSender->blockLen = 0;
  }

  // read data
  int32_t ret = pSender->pSyncNode->pFsm->FpSnapshotDoRead(pSender->pSyncNode->pFsm, pSender->pReader,
                                                           &(pSender->pCurrentBlock), &(pSender->blockLen));
  ASSERT(ret == 0);
M
Minghao Li 已提交
167 168 169
  if (pSender->blockLen > 0) {
    // has read data
  } else {
170
    // read finish, update seq to end
M
Minghao Li 已提交
171 172
    pSender->seq = SYNC_SNAPSHOT_SEQ_END;
  }
M
Minghao Li 已提交
173

M
Minghao Li 已提交
174
  // build msg
M
Minghao Li 已提交
175 176 177 178
  SyncSnapshotSend *pMsg = syncSnapshotSendBuild(pSender->blockLen, pSender->pSyncNode->vgId);
  pMsg->srcId = pSender->pSyncNode->myRaftId;
  pMsg->destId = (pSender->pSyncNode->replicasId)[pSender->replicaIndex];
  pMsg->term = pSender->pSyncNode->pRaftStore->currentTerm;
179
  pMsg->beginIndex = pSender->snapshotParam.start;
M
Minghao Li 已提交
180 181
  pMsg->lastIndex = pSender->snapshot.lastApplyIndex;
  pMsg->lastTerm = pSender->snapshot.lastApplyTerm;
182 183
  pMsg->lastConfigIndex = pSender->snapshot.lastConfigIndex;
  pMsg->lastConfig = pSender->lastConfig;
M
Minghao Li 已提交
184
  pMsg->seq = pSender->seq;
M
Minghao Li 已提交
185 186 187

  // pMsg->privateTerm = pSender->privateTerm;

M
Minghao Li 已提交
188 189
  memcpy(pMsg->data, pSender->pCurrentBlock, pSender->blockLen);

M
Minghao Li 已提交
190
  // send msg
M
Minghao Li 已提交
191 192 193
  SRpcMsg rpcMsg;
  syncSnapshotSend2RpcMsg(pMsg, &rpcMsg);
  syncNodeSendMsgById(&(pMsg->destId), pSender->pSyncNode, &rpcMsg);
194
  syncSnapshotSendDestroy(pMsg);
M
Minghao Li 已提交
195

196
  // event log
S
Shengliang Guan 已提交
197 198 199 200 201
  if (pSender->seq == SYNC_SNAPSHOT_SEQ_END) {
    sSTrace(pSender, "snapshot sender finish");
  } else {
    sSTrace(pSender, "snapshot sender sending");
  }
M
Minghao Li 已提交
202 203 204
  return 0;
}

M
Minghao Li 已提交
205 206
// send snapshot data from cache
int32_t snapshotReSend(SSyncSnapshotSender *pSender) {
207 208 209
  // send current block data
  if (pSender->pCurrentBlock != NULL && pSender->blockLen > 0) {
    // build msg
M
Minghao Li 已提交
210 211 212 213
    SyncSnapshotSend *pMsg = syncSnapshotSendBuild(pSender->blockLen, pSender->pSyncNode->vgId);
    pMsg->srcId = pSender->pSyncNode->myRaftId;
    pMsg->destId = (pSender->pSyncNode->replicasId)[pSender->replicaIndex];
    pMsg->term = pSender->pSyncNode->pRaftStore->currentTerm;
214
    pMsg->beginIndex = pSender->snapshotParam.start;
M
Minghao Li 已提交
215 216
    pMsg->lastIndex = pSender->snapshot.lastApplyIndex;
    pMsg->lastTerm = pSender->snapshot.lastApplyTerm;
217 218
    pMsg->lastConfigIndex = pSender->snapshot.lastConfigIndex;
    pMsg->lastConfig = pSender->lastConfig;
M
Minghao Li 已提交
219
    pMsg->seq = pSender->seq;
M
Minghao Li 已提交
220 221 222

    //  pMsg->privateTerm = pSender->privateTerm;

M
Minghao Li 已提交
223 224
    memcpy(pMsg->data, pSender->pCurrentBlock, pSender->blockLen);

225
    // send msg
M
Minghao Li 已提交
226 227 228 229
    SRpcMsg rpcMsg;
    syncSnapshotSend2RpcMsg(pMsg, &rpcMsg);
    syncNodeSendMsgById(&(pMsg->destId), pSender->pSyncNode, &rpcMsg);
    syncSnapshotSendDestroy(pMsg);
230 231

    // event log
S
Shengliang Guan 已提交
232
    sSTrace(pSender, "snapshot sender resend");
M
Minghao Li 已提交
233
  }
234

M
Minghao Li 已提交
235 236 237
  return 0;
}

238 239 240 241 242 243
static void snapshotSenderUpdateProgress(SSyncSnapshotSender *pSender, SyncSnapshotRsp *pMsg) {
  ASSERT(pMsg->ack == pSender->seq);
  pSender->ack = pMsg->ack;
  ++(pSender->seq);
}

244
int32_t syncNodeStartSnapshot(SSyncNode *pSyncNode, SRaftId *pDestId) {
S
Shengliang Guan 已提交
245
  sNTrace(pSyncNode, "starting snapshot ...");
M
Minghao Li 已提交
246

247 248
  SSyncSnapshotSender *pSender = syncNodeGetSnapshotSender(pSyncNode, pDestId);
  if (pSender == NULL) {
S
Shengliang Guan 已提交
249
    sNError(pSyncNode, "start snapshot error, sender is null");
M
Minghao Li 已提交
250
    return -1;
251 252
  }

M
Minghao Li 已提交
253 254 255 256 257
  int32_t code = 0;

  if (snapshotSenderIsStart(pSender)) {
    code = snapshotSenderStop(pSender, false);
    if (code != 0) {
S
Shengliang Guan 已提交
258
      sNError(pSyncNode, "snapshot sender stop error");
M
Minghao Li 已提交
259 260 261 262 263 264
      return -1;
    }
  }

  code = snapshotSenderStart(pSender);
  if (code != 0) {
S
Shengliang Guan 已提交
265
    sNError(pSyncNode, "snapshot sender start error");
M
Minghao Li 已提交
266 267
    return -1;
  }
268 269 270

  return 0;
}
271

272
SSyncSnapshotReceiver *snapshotReceiverCreate(SSyncNode *pSyncNode, SRaftId fromId) {
M
Minghao Li 已提交
273 274
  bool condition = (pSyncNode->pFsm->FpSnapshotStartWrite != NULL) && (pSyncNode->pFsm->FpSnapshotStopWrite != NULL) &&
                   (pSyncNode->pFsm->FpSnapshotDoWrite != NULL);
275

276
  SSyncSnapshotReceiver *pReceiver = NULL;
M
Minghao Li 已提交
277
  if (condition) {
278 279 280 281 282
    pReceiver = taosMemoryCalloc(1, sizeof(SSyncSnapshotReceiver));
    if (pReceiver == NULL) {
      terrno = TSDB_CODE_OUT_OF_MEMORY;
      return NULL;
    }
283

M
Minghao Li 已提交
284 285 286 287
    pReceiver->start = false;
    pReceiver->ack = SYNC_SNAPSHOT_SEQ_BEGIN;
    pReceiver->pWriter = NULL;
    pReceiver->pSyncNode = pSyncNode;
288
    pReceiver->fromId = fromId;
M
Minghao Li 已提交
289
    pReceiver->term = pSyncNode->pRaftStore->currentTerm;
M
Minghao Li 已提交
290
    pReceiver->snapshot.data = NULL;
291
    pReceiver->snapshot.lastApplyIndex = SYNC_INDEX_INVALID;
M
Minghao Li 已提交
292
    pReceiver->snapshot.lastApplyTerm = 0;
293
    pReceiver->snapshot.lastConfigIndex = SYNC_INDEX_INVALID;
M
Minghao Li 已提交
294

M
Minghao Li 已提交
295
  } else {
296
    sError("vgId:%d, cannot create snapshot receiver", pSyncNode->vgId);
M
Minghao Li 已提交
297
  }
298 299 300

  return pReceiver;
}
M
Minghao Li 已提交
301

302 303
void snapshotReceiverDestroy(SSyncSnapshotReceiver *pReceiver) {
  if (pReceiver != NULL) {
304 305
    // close writer
    if (pReceiver->pWriter != NULL) {
306 307
      int32_t ret = pReceiver->pSyncNode->pFsm->FpSnapshotStopWrite(pReceiver->pSyncNode->pFsm, pReceiver->pWriter,
                                                                    false, &(pReceiver->snapshot));
308 309 310 311 312
      ASSERT(ret == 0);
      pReceiver->pWriter = NULL;
    }

    // free receiver
313 314 315
    taosMemoryFree(pReceiver);
  }
}
M
Minghao Li 已提交
316

317 318
bool snapshotReceiverIsStart(SSyncSnapshotReceiver *pReceiver) { return pReceiver->start; }

319
// static do start by privateTerm, pBeginMsg
320
// receive first snapshot data
321
// write first block data
322
static void snapshotReceiverDoStart(SSyncSnapshotReceiver *pReceiver, SyncSnapshotSend *pBeginMsg) {
M
Minghao Li 已提交
323
  pReceiver->start = true;
M
Minghao Li 已提交
324
  pReceiver->ack = SYNC_SNAPSHOT_SEQ_BEGIN;
M
Minghao Li 已提交
325 326 327 328 329 330 331 332 333 334 335

  // start writer
  ASSERT(pReceiver->pWriter == NULL);
  int32_t ret = pReceiver->pSyncNode->pFsm->FpSnapshotStartWrite(pReceiver->pSyncNode->pFsm,
                                                                 &(pReceiver->snapshotParam), &(pReceiver->pWriter));
  ASSERT(ret == 0);

  pReceiver->term = pReceiver->pSyncNode->pRaftStore->currentTerm;
  pReceiver->snapshotParam.start = pBeginMsg->beginIndex;
  pReceiver->snapshotParam.end = pBeginMsg->lastIndex;

M
Minghao Li 已提交
336 337
  pReceiver->fromId = pBeginMsg->srcId;

338
  // update snapshot
M
Minghao Li 已提交
339 340 341
  pReceiver->snapshot.lastApplyIndex = pBeginMsg->lastIndex;
  pReceiver->snapshot.lastApplyTerm = pBeginMsg->lastTerm;
  pReceiver->snapshot.lastConfigIndex = pBeginMsg->lastConfigIndex;
M
Minghao Li 已提交
342

M
Minghao Li 已提交
343
  pReceiver->startTime = pBeginMsg->startTime;
344 345

  // event log
S
Shengliang Guan 已提交
346
  sRTrace(pReceiver, "snapshot receiver start");
M
Minghao Li 已提交
347 348
}

349
// force stop
350
void snapshotReceiverForceStop(SSyncSnapshotReceiver *pReceiver) {
351 352
  // force close, abandon incomplete data
  if (pReceiver->pWriter != NULL) {
353 354
    int32_t ret = pReceiver->pSyncNode->pFsm->FpSnapshotStopWrite(pReceiver->pSyncNode->pFsm, pReceiver->pWriter, false,
                                                                  &(pReceiver->snapshot));
355 356 357 358 359
    ASSERT(ret == 0);
    pReceiver->pWriter = NULL;
  }

  pReceiver->start = false;
360 361

  // event log
S
Shengliang Guan 已提交
362
  sRTrace(pReceiver, "snapshot receiver force stop");
363 364
}

M
Minghao Li 已提交
365
// if receiver receive msg from seq = SYNC_SNAPSHOT_SEQ_BEGIN, start receiver
366
int32_t snapshotReceiverStart(SSyncSnapshotReceiver *pReceiver, SyncSnapshotSend *pBeginMsg) {
M
Minghao Li 已提交
367 368
  ASSERT(!snapshotReceiverIsStart(pReceiver));
  snapshotReceiverDoStart(pReceiver, pBeginMsg);
369 370

  return 0;
371
}
M
Minghao Li 已提交
372

373 374
// just set start = false
// FpSnapshotStopWrite should not be called, assert writer == NULL
375
int32_t snapshotReceiverStop(SSyncSnapshotReceiver *pReceiver) {
M
Minghao Li 已提交
376
  if (pReceiver->pWriter != NULL) {
377 378
    int32_t ret = pReceiver->pSyncNode->pFsm->FpSnapshotStopWrite(pReceiver->pSyncNode->pFsm, pReceiver->pWriter, false,
                                                                  &(pReceiver->snapshot));
M
Minghao Li 已提交
379 380
    ASSERT(ret == 0);
    pReceiver->pWriter = NULL;
381
  }
M
Minghao Li 已提交
382 383

  pReceiver->start = false;
384

385
  // event log
S
Shengliang Guan 已提交
386
  sRTrace(pReceiver, "snapshot receiver stop");
387
  return 0;
388 389
}

390
// when recv last snapshot block, apply data into snapshot
391
static int32_t snapshotReceiverFinish(SSyncSnapshotReceiver *pReceiver, SyncSnapshotSend *pMsg) {
392 393
  ASSERT(pMsg->seq == SYNC_SNAPSHOT_SEQ_END);

394
  int32_t code = 0;
395
  if (pReceiver->pWriter != NULL) {
396
    // write data
397 398 399
    if (pMsg->dataLen > 0) {
      code = pReceiver->pSyncNode->pFsm->FpSnapshotDoWrite(pReceiver->pSyncNode->pFsm, pReceiver->pWriter, pMsg->data,
                                                           pMsg->dataLen);
400
      if (code != 0) {
S
Shengliang Guan 已提交
401
        sNError(pReceiver->pSyncNode, "snapshot write error");
402 403 404 405 406 407 408 409
        return -1;
      }
    }

    // reset wal
    code =
        pReceiver->pSyncNode->pLogStore->syncLogRestoreFromSnapshot(pReceiver->pSyncNode->pLogStore, pMsg->lastIndex);
    if (code != 0) {
S
Shengliang Guan 已提交
410
      sNError(pReceiver->pSyncNode, "wal restore from snapshot error");
411
      return -1;
412 413
    }

414 415 416 417 418
    // update commit index
    if (pReceiver->snapshot.lastApplyIndex > pReceiver->pSyncNode->commitIndex) {
      pReceiver->pSyncNode->commitIndex = pReceiver->snapshot.lastApplyIndex;
    }

M
Minghao Li 已提交
419 420 421 422 423 424
    // maybe update term
    if (pReceiver->snapshot.lastApplyTerm > pReceiver->pSyncNode->pRaftStore->currentTerm) {
      pReceiver->pSyncNode->pRaftStore->currentTerm = pReceiver->snapshot.lastApplyTerm;
      raftStorePersist(pReceiver->pSyncNode->pRaftStore);
    }

425
    // stop writer, apply data
426 427
    code = pReceiver->pSyncNode->pFsm->FpSnapshotStopWrite(pReceiver->pSyncNode->pFsm, pReceiver->pWriter, true,
                                                           &(pReceiver->snapshot));
428
    if (code != 0) {
S
Shengliang Guan 已提交
429
      sNError(pReceiver->pSyncNode, "snapshot stop writer true error");
430 431
      return -1;
    }
432 433
    pReceiver->pWriter = NULL;

434 435
    // update progress
    pReceiver->ack = SYNC_SNAPSHOT_SEQ_END;
436

437
  } else {
S
Shengliang Guan 已提交
438
    sNError(pReceiver->pSyncNode, "snapshot stop writer true error");
439
    return -1;
440 441 442
  }

  // event log
S
Shengliang Guan 已提交
443
  sRTrace(pReceiver, "snapshot receiver got last data, finish, apply snapshot");
444
  return 0;
445 446
}

447 448
// apply data block
// update progress
449 450 451 452 453
static void snapshotReceiverGotData(SSyncSnapshotReceiver *pReceiver, SyncSnapshotSend *pMsg) {
  ASSERT(pMsg->seq == pReceiver->ack + 1);

  if (pReceiver->pWriter != NULL) {
    if (pMsg->dataLen > 0) {
454
      // apply data block
455 456 457 458
      int32_t code = pReceiver->pSyncNode->pFsm->FpSnapshotDoWrite(pReceiver->pSyncNode->pFsm, pReceiver->pWriter,
                                                                   pMsg->data, pMsg->dataLen);
      ASSERT(code == 0);
    }
459 460

    // update progress
461 462 463
    pReceiver->ack = pMsg->seq;

    // event log
S
Shengliang Guan 已提交
464
    sRTrace(pReceiver, "snapshot receiver receiving");
465
  }
466 467
}

M
Minghao Li 已提交
468 469 470 471 472 473 474 475 476 477 478 479 480 481
SyncIndex syncNodeGetSnapBeginIndex(SSyncNode *ths) {
  SyncIndex snapStart = SYNC_INDEX_INVALID;

  if (syncNodeIsMnode(ths)) {
    snapStart = SYNC_INDEX_BEGIN;

  } else {
    SSyncLogStoreData *pData = ths->pLogStore->data;
    SWal              *pWal = pData->pWal;

    bool    isEmpty = ths->pLogStore->syncLogIsEmpty(ths->pLogStore);
    int64_t walCommitVer = walGetCommittedVer(pWal);

    if (!isEmpty && ths->commitIndex != walCommitVer) {
S
Shengliang Guan 已提交
482 483
      sNError(ths, "commit not same, wal-commit:%" PRId64 ", commit:%" PRId64 ", ignore", walCommitVer,
              ths->commitIndex);
M
Minghao Li 已提交
484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516
      snapStart = walCommitVer + 1;
    } else {
      snapStart = ths->commitIndex + 1;
    }
  }

  return snapStart;
}

static int32_t syncNodeOnSnapshotPre(SSyncNode *pSyncNode, SyncSnapshotSend *pMsg) {
  SSyncSnapshotReceiver *pReceiver = pSyncNode->pNewNodeReceiver;

  if (snapshotReceiverIsStart(pReceiver)) {
    // already start

    if (pMsg->startTime > pReceiver->startTime) {
      goto _START_RECEIVER;

    } else if (pMsg->startTime == pReceiver->startTime) {
      goto _SEND_REPLY;

    } else {
      // ignore
      return 0;
    }

  } else {
    // start new
    goto _START_RECEIVER;
  }

_START_RECEIVER:
  if (taosGetTimestampMs() - pMsg->startTime > SNAPSHOT_MAX_CLOCK_SKEW_MS) {
S
Shengliang Guan 已提交
517
    sNError(pSyncNode, "snapshot receiver time skew too much");
M
Minghao Li 已提交
518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562
    return -1;
  } else {
    // waiting for clock match
    while (taosGetTimestampMs() > pMsg->startTime) {
      taosMsleep(10);
    }

    snapshotReceiverStart(pReceiver, pMsg);  // set start-time same with sender
  }

_SEND_REPLY:
    // build msg
    ;  // make complier happy
  SyncSnapshotRsp *pRspMsg = syncSnapshotRspBuild(pSyncNode->vgId);
  pRspMsg->srcId = pSyncNode->myRaftId;
  pRspMsg->destId = pMsg->srcId;
  pRspMsg->term = pSyncNode->pRaftStore->currentTerm;
  pRspMsg->lastIndex = pMsg->lastIndex;
  pRspMsg->lastTerm = pMsg->lastTerm;
  pRspMsg->startTime = pReceiver->startTime;
  pRspMsg->ack = pMsg->seq;  // receiver maybe already closed
  pRspMsg->code = 0;
  pRspMsg->snapBeginIndex = syncNodeGetSnapBeginIndex(pSyncNode);

  // send msg
  SRpcMsg rpcMsg;
  syncSnapshotRsp2RpcMsg(pRspMsg, &rpcMsg);
  syncNodeSendMsgById(&(pRspMsg->destId), pSyncNode, &rpcMsg);
  syncSnapshotRspDestroy(pRspMsg);

  return 0;
}

static int32_t syncNodeOnSnapshotBegin(SSyncNode *pSyncNode, SyncSnapshotSend *pMsg) {
  // condition 1
  SSyncSnapshotReceiver *pReceiver = pSyncNode->pNewNodeReceiver;

  if (snapshotReceiverIsStart(pReceiver)) {
    if (pMsg->startTime > pReceiver->startTime) {
      snapshotReceiverStop(pReceiver);

    } else if (pMsg->startTime == pReceiver->startTime) {
      return 0;
    } else {
      // ignore
S
Shengliang Guan 已提交
563
      sNTrace(pSyncNode, "msg ignore");
M
Minghao Li 已提交
564 565 566 567 568 569
      return 0;
    }
  }

_START_RECEIVER:
  if (taosGetTimestampMs() - pMsg->startTime > SNAPSHOT_MAX_CLOCK_SKEW_MS) {
S
Shengliang Guan 已提交
570
    sNError(pSyncNode, "snapshot receiver time skew too much");
M
Minghao Li 已提交
571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603
    return -1;
  } else {
    // waiting for clock match
    while (taosGetTimestampMs() > pMsg->startTime) {
      taosMsleep(10);
    }

    snapshotReceiverStart(pReceiver, pMsg);

    // build msg
    SyncSnapshotRsp *pRspMsg = syncSnapshotRspBuild(pSyncNode->vgId);
    pRspMsg->srcId = pSyncNode->myRaftId;
    pRspMsg->destId = pMsg->srcId;
    pRspMsg->term = pSyncNode->pRaftStore->currentTerm;
    pRspMsg->lastIndex = pMsg->lastIndex;
    pRspMsg->lastTerm = pMsg->lastTerm;
    pRspMsg->ack = pReceiver->ack;  // receiver maybe already closed
    pRspMsg->code = 0;

    // send msg
    SRpcMsg rpcMsg;
    syncSnapshotRsp2RpcMsg(pRspMsg, &rpcMsg);
    syncNodeSendMsgById(&(pRspMsg->destId), pSyncNode, &rpcMsg);
    syncSnapshotRspDestroy(pRspMsg);
  }

  return 0;
}

static int32_t syncNodeOnSnapshotTransfer(SSyncNode *pSyncNode, SyncSnapshotSend *pMsg) { return 0; }

static int32_t syncNodeOnSnapshotEnd(SSyncNode *pSyncNode, SyncSnapshotSend *pMsg) { return 0; }

604 605
// receiver on message
//
M
Minghao Li 已提交
606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622
// condition 1, recv SYNC_SNAPSHOT_SEQ_PRE_SNAPSHOT
//              if receiver already start
//                    if sender.start-time > receiver.start-time, restart receiver(reply snapshot start)
//                    if sender.start-time = receiver.start-time, maybe duplicate msg
//                    if sender.start-time < receiver.start-time, ignore
//              else
//                    waiting for clock match
//                    start receiver(reply snapshot start)
//
// condition 2, recv SYNC_SNAPSHOT_SEQ_BEGIN
//              a. create writer with <begin, end>
//
// condition 3, recv SYNC_SNAPSHOT_SEQ_END, finish receiver(apply snapshot data, update commit index, maybe reconfig)
//
// condition 4, recv SYNC_SNAPSHOT_SEQ_FORCE_CLOSE, force close
//
// condition 5, got data, update ack
M
Minghao Li 已提交
623
//
M
Minghao Li 已提交
624
int32_t syncNodeOnSnapshot(SSyncNode *pSyncNode, SyncSnapshotSend *pMsg) {
M
Minghao Li 已提交
625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640
  // if already drop replica, do not process
  if (!syncNodeInRaftGroup(pSyncNode, &(pMsg->srcId))) {
    syncLogRecvSyncSnapshotSend(pSyncNode, pMsg, "not in my config");
    return 0;
  }

  if (pMsg->term < pSyncNode->pRaftStore->currentTerm) {
    syncLogRecvSyncSnapshotSend(pSyncNode, pMsg, "reject, small term");
    return 0;
  }

  if (pMsg->term > pSyncNode->pRaftStore->currentTerm) {
    syncNodeStepDown(pSyncNode, pMsg->term);
  }
  syncNodeResetElectTimer(pSyncNode);

641
  int32_t                code = 0;
M
Minghao Li 已提交
642
  SSyncSnapshotReceiver *pReceiver = pSyncNode->pNewNodeReceiver;
M
Minghao Li 已提交
643

644 645
  // state, term, seq/ack
  if (pSyncNode->state == TAOS_SYNC_STATE_FOLLOWER) {
M
Minghao Li 已提交
646
    if (pMsg->term == pSyncNode->pRaftStore->currentTerm) {
M
Minghao Li 已提交
647 648 649 650 651
      if (pMsg->seq == SYNC_SNAPSHOT_SEQ_PRE_SNAPSHOT) {
        syncNodeOnSnapshotPre(pSyncNode, pMsg);

      } else if (pMsg->seq == SYNC_SNAPSHOT_SEQ_BEGIN) {
        syncNodeOnSnapshotBegin(pSyncNode, pMsg);
M
Minghao Li 已提交
652 653

      } else if (pMsg->seq == SYNC_SNAPSHOT_SEQ_END) {
654
        // condition 2
M
Minghao Li 已提交
655
        // end, finish FSM
656 657 658 659
        code = snapshotReceiverFinish(pReceiver, pMsg);
        if (code == 0) {
          snapshotReceiverStop(pReceiver);
        }
M
Minghao Li 已提交
660
        bool needRsp = true;
661

662 663
        // maybe update lastconfig
        if (pMsg->lastConfigIndex >= SYNC_INDEX_BEGIN) {
M
Minghao Li 已提交
664
          SSyncCfg oldSyncCfg = pSyncNode->pRaftCfg->cfg;
665

666 667 668
          // update new config myIndex
          SSyncCfg newSyncCfg = pMsg->lastConfig;
          syncNodeUpdateNewConfigIndex(pSyncNode, &newSyncCfg);
M
Minghao Li 已提交
669 670 671

          // do config change
          syncNodeDoConfigChange(pSyncNode, &newSyncCfg, pMsg->lastConfigIndex);
672 673
        }

M
Minghao Li 已提交
674
      } else if (pMsg->seq == SYNC_SNAPSHOT_SEQ_FORCE_CLOSE) {
675
        // condition 3
676 677
        // force close
        snapshotReceiverForceStop(pReceiver);
M
Minghao Li 已提交
678
        bool needRsp = false;
M
Minghao Li 已提交
679 680

      } else if (pMsg->seq > SYNC_SNAPSHOT_SEQ_BEGIN && pMsg->seq < SYNC_SNAPSHOT_SEQ_END) {
681
        // condition 4
M
Minghao Li 已提交
682 683
        // transfering
        if (pMsg->seq == pReceiver->ack + 1) {
684
          snapshotReceiverGotData(pReceiver, pMsg);
M
Minghao Li 已提交
685
        }
M
Minghao Li 已提交
686
        bool needRsp = true;
M
Minghao Li 已提交
687 688

      } else {
M
Minghao Li 已提交
689
        // error log
S
Shengliang Guan 已提交
690
        sRTrace(pReceiver, "snapshot receiver recv error seq:%d, my ack:%d", pMsg->seq, pReceiver->ack);
M
Minghao Li 已提交
691
        return -1;
692
      }
M
Minghao Li 已提交
693

694 695
    } else {
      // error log
S
Shengliang Guan 已提交
696
      sRTrace(pReceiver, "snapshot receiver term not equal");
697
      return -1;
M
Minghao Li 已提交
698
    }
M
Minghao Li 已提交
699
  } else {
700
    // error log
S
Shengliang Guan 已提交
701
    sRTrace(pReceiver, "snapshot receiver not follower");
702
    return -1;
M
Minghao Li 已提交
703
  }
M
Minghao Li 已提交
704

M
Minghao Li 已提交
705 706 707
  return 0;
}

M
Minghao Li 已提交
708 709 710 711 712 713 714 715 716 717 718 719 720
int32_t syncNodeOnSnapshotReplyPre(SSyncNode *pSyncNode, SyncSnapshotRsp *pMsg) {
  // get sender
  SSyncSnapshotSender *pSender = syncNodeGetSnapshotSender(pSyncNode, &(pMsg->srcId));
  ASSERT(pSender != NULL);

  SSnapshot snapshot;
  pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot);

  // prepare <begin, end>
  pSender->snapshotParam.start = pMsg->snapBeginIndex;
  pSender->snapshotParam.end = snapshot.lastApplyIndex;

  if (pMsg->snapBeginIndex > snapshot.lastApplyIndex) {
S
Shengliang Guan 已提交
721
    sNError(pSyncNode, "snapshot last index too small");
M
Minghao Li 已提交
722 723 724 725 726 727
    return -1;
  }

  // start reader
  int32_t code = pSyncNode->pFsm->FpSnapshotStartRead(pSyncNode->pFsm, &(pSender->snapshotParam), &(pSender->pReader));
  if (code != 0) {
S
Shengliang Guan 已提交
728
    sNError(pSyncNode, "create snapshot reader error");
M
Minghao Li 已提交
729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753
    return -1;
  }

  // build begin msg
  SyncSnapshotSend *pSendMsg = syncSnapshotSendBuild(0, pSender->pSyncNode->vgId);
  pSendMsg->srcId = pSender->pSyncNode->myRaftId;
  pSendMsg->destId = (pSender->pSyncNode->replicasId)[pSender->replicaIndex];
  pSendMsg->term = pSender->pSyncNode->pRaftStore->currentTerm;
  pSendMsg->beginIndex = pSender->snapshotParam.start;
  pSendMsg->lastIndex = pSender->snapshot.lastApplyIndex;
  pSendMsg->lastTerm = pSender->snapshot.lastApplyTerm;
  pSendMsg->lastConfigIndex = pSender->snapshot.lastConfigIndex;
  pSendMsg->lastConfig = pSender->lastConfig;
  pSendMsg->startTime = pSender->startTime;
  pSendMsg->seq = SYNC_SNAPSHOT_SEQ_BEGIN;

  // send msg
  SRpcMsg rpcMsg;
  syncSnapshotSend2RpcMsg(pSendMsg, &rpcMsg);
  syncNodeSendMsgById(&(pSendMsg->destId), pSender->pSyncNode, &rpcMsg);
  syncSnapshotSendDestroy(pSendMsg);

  return 0;
}

754 755 756 757 758 759
// sender on message
//
// condition 1 sender receives SYNC_SNAPSHOT_SEQ_END, close sender
// condition 2 sender receives ack, set seq = ack + 1, send msg from seq
// condition 3 sender receives error msg, just print error log
//
M
Minghao Li 已提交
760
int32_t syncNodeOnSnapshotReply(SSyncNode *pSyncNode, SyncSnapshotRsp *pMsg) {
761
  // if already drop replica, do not process
M
Minghao Li 已提交
762 763
  if (!syncNodeInRaftGroup(pSyncNode, &(pMsg->srcId))) {
    syncLogRecvSyncSnapshotRsp(pSyncNode, pMsg, "maybe replica already dropped");
764
    return -1;
765 766
  }

M
Minghao Li 已提交
767
  // get sender
768
  SSyncSnapshotSender *pSender = syncNodeGetSnapshotSender(pSyncNode, &(pMsg->srcId));
769 770
  ASSERT(pSender != NULL);

M
Minghao Li 已提交
771 772 773 774 775
  if (pMsg->startTime != pSender->startTime) {
    syncLogRecvSyncSnapshotRsp(pSyncNode, pMsg, "sender/receiver start time not match");
    return -1;
  }

776 777 778
  // state, term, seq/ack
  if (pSyncNode->state == TAOS_SYNC_STATE_LEADER) {
    if (pMsg->term == pSyncNode->pRaftStore->currentTerm) {
M
Minghao Li 已提交
779 780 781 782 783 784
      // prepare <begin, end>, send begin msg
      if (pMsg->ack == SYNC_SNAPSHOT_SEQ_PRE_SNAPSHOT) {
        syncNodeOnSnapshotReplyPre(pSyncNode, pMsg);
        return 0;
      }

785
      // receive ack is finish, close sender
M
Minghao Li 已提交
786
      if (pMsg->ack == SYNC_SNAPSHOT_SEQ_END) {
787
        snapshotSenderStop(pSender, true);
M
Minghao Li 已提交
788 789 790 791
        return 0;
      }

      // send next msg
792
      if (pMsg->ack == pSender->seq) {
M
Minghao Li 已提交
793
        // update sender ack
794
        snapshotSenderUpdateProgress(pSender, pMsg);
M
Minghao Li 已提交
795
        snapshotSend(pSender);
796

M
Minghao Li 已提交
797
      } else if (pMsg->ack == pSender->seq - 1) {
798
        // maybe resend
M
Minghao Li 已提交
799
        snapshotReSend(pSender);
800

M
Minghao Li 已提交
801
      } else {
802
        // error log
S
Shengliang Guan 已提交
803
        sSError(pSender, "snapshot sender recv error ack:%d, my seq:%d", pMsg->ack, pSender->seq);
804
        return -1;
805
      }
806 807
    } else {
      // error log
S
Shengliang Guan 已提交
808
      sSError(pSender, "snapshot sender term not equal");
809
      return -1;
810
    }
M
Minghao Li 已提交
811
  } else {
812
    // error log
S
Shengliang Guan 已提交
813
    sSError(pSender, "snapshot sender not leader");
814
    return -1;
815 816 817
  }

  return 0;
818
}
819 820 821

int32_t syncNodeOnPreSnapshot(SSyncNode *ths, SyncPreSnapshot *pMsg) {
  syncLogRecvSyncPreSnapshot(ths, pMsg, "");
822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838

  SyncPreSnapshotReply *pMsgReply = syncPreSnapshotReplyBuild(ths->vgId);
  pMsgReply->srcId = ths->myRaftId;
  pMsgReply->destId = pMsg->srcId;
  pMsgReply->term = ths->pRaftStore->currentTerm;

  SSyncLogStoreData *pData = ths->pLogStore->data;
  SWal              *pWal = pData->pWal;

  if (syncNodeIsMnode(ths)) {
    pMsgReply->snapStart = SYNC_INDEX_BEGIN;

  } else {
    bool    isEmpty = ths->pLogStore->syncLogIsEmpty(ths->pLogStore);
    int64_t walCommitVer = walGetCommittedVer(pWal);

    if (!isEmpty && ths->commitIndex != walCommitVer) {
S
Shengliang Guan 已提交
839 840
      sNError(ths, "commit not same, wal-commit:%" PRId64 ", commit:%" PRId64 ", ignore", walCommitVer,
              ths->commitIndex);
841 842 843 844 845 846 847 848
      goto _IGNORE;
    }

    pMsgReply->snapStart = ths->commitIndex + 1;

    // make local log clean
    int32_t code = ths->pLogStore->syncLogTruncate(ths->pLogStore, pMsgReply->snapStart);
    if (code != 0) {
S
Shengliang Guan 已提交
849
      sNError(ths, "truncate wal error");
850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865
      goto _IGNORE;
    }
  }

  // can not write behind _RESPONSE
  SRpcMsg rpcMsg;

_RESPONSE:
  syncPreSnapshotReply2RpcMsg(pMsgReply, &rpcMsg);
  syncNodeSendMsgById(&pMsgReply->destId, ths, &rpcMsg);

  syncPreSnapshotReplyDestroy(pMsgReply);
  return 0;

_IGNORE:
  syncPreSnapshotReplyDestroy(pMsgReply);
866 867 868 869 870
  return 0;
}

int32_t syncNodeOnPreSnapshotReply(SSyncNode *ths, SyncPreSnapshotReply *pMsg) {
  syncLogRecvSyncPreSnapshotReply(ths, pMsg, "");
871 872 873

  // start snapshot

874 875
  return 0;
}