提交 5159d60f 编写于 作者: B Benguang Zhao

enh: vnodeCommit on consensus only

上级 a09870a9
......@@ -49,10 +49,13 @@ extern "C" {
#define SYNC_HEARTBEAT_REPLY_SLOW_MS 1500
#define SYNC_SNAP_RESEND_MS 1000 * 60
#define SYNC_VND_COMMIT_MIN_MS 200
#define SYNC_VND_COMMIT_MAX_MS 60000
#define SYNC_MAX_BATCH_SIZE 1
#define SYNC_INDEX_BEGIN 0
#define SYNC_INDEX_INVALID -1
#define SYNC_TERM_INVALID -1 // 0xFFFFFFFFFFFFFFFF
#define SYNC_TERM_INVALID -1
typedef enum {
SYNC_STRATEGY_NO_SNAPSHOT = 0,
......
......@@ -79,6 +79,8 @@ int32_t vmOpenVnode(SVnodeMgmt *pMgmt, SWrapperCfg *pCfg, SVnode *pImpl) {
void vmCloseVnode(SVnodeMgmt *pMgmt, SVnodeObj *pVnode) {
char path[TSDB_FILENAME_LEN] = {0};
vnodeProposeCommitOnNeed(pVnode->pImpl);
taosThreadRwlockWrlock(&pMgmt->lock);
taosHashRemove(pMgmt->hash, &pVnode->vgId, sizeof(int32_t));
taosThreadRwlockUnlock(&pMgmt->lock);
......
......@@ -88,6 +88,7 @@ int32_t vnodeProcessQueryMsg(SVnode *pVnode, SRpcMsg *pMsg);
int32_t vnodeProcessFetchMsg(SVnode *pVnode, SRpcMsg *pMsg, SQueueInfo *pInfo);
void vnodeProposeWriteMsg(SQueueInfo *pInfo, STaosQall *qall, int32_t numOfMsgs);
void vnodeApplyWriteMsg(SQueueInfo *pInfo, STaosQall *qall, int32_t numOfMsgs);
void vnodeProposeCommitOnNeed(SVnode *pVnode);
// meta
typedef struct SMeta SMeta; // todo: remove
......
......@@ -102,6 +102,7 @@ void vnodeSyncClose(SVnode* pVnode);
void vnodeRedirectRpcMsg(SVnode* pVnode, SRpcMsg* pMsg, int32_t code);
bool vnodeIsLeader(SVnode* pVnode);
bool vnodeIsRoleLeader(SVnode* pVnode);
int vnodeShouldCommit(SVnode* pVnode);
#ifdef __cplusplus
}
......
......@@ -348,6 +348,7 @@ struct SVnode {
STQ* pTq;
SSink* pSink;
tsem_t canCommit;
int64_t commitMs;
int64_t sync;
TdThreadMutex lock;
bool blocked;
......
......@@ -203,6 +203,7 @@ _err:
int metaClose(SMeta *pMeta) {
if (pMeta) {
if (pMeta->txn) tdbTxnClose(pMeta->txn);
if (pMeta->pCache) metaCacheClose(pMeta);
if (pMeta->pIdx) metaCloseIdx(pMeta);
if (pMeta->pStreamDb) tdbTbClose(pMeta->pStreamDb);
......
......@@ -59,6 +59,17 @@ int vnodeBegin(SVnode *pVnode) {
}
int vnodeShouldCommit(SVnode *pVnode) {
if (!pVnode->inUse || !osDataSpaceAvailable()) {
return false;
}
int64_t nowMs = taosGetMonoTimestampMs();
return (((pVnode->inUse->size > pVnode->inUse->node.size) && (pVnode->commitMs + SYNC_VND_COMMIT_MIN_MS < nowMs)) ||
(pVnode->inUse->size > 0 && pVnode->commitMs + SYNC_VND_COMMIT_MAX_MS < nowMs));
}
int vnodeShouldCommitOld(SVnode *pVnode) {
if (pVnode->inUse) {
return osDataSpaceAvailable() && (pVnode->inUse->size > pVnode->inUse->node.size);
}
......@@ -194,6 +205,7 @@ static void vnodePrepareCommit(SVnode *pVnode) {
vnodeBufPoolUnRef(pVnode->inUse);
pVnode->inUse = NULL;
}
static int32_t vnodeCommitTask(void *arg) {
int32_t code = 0;
......@@ -210,6 +222,7 @@ _exit:
taosMemoryFree(pInfo);
return code;
}
int vnodeAsyncCommit(SVnode *pVnode) {
int32_t code = 0;
......@@ -257,7 +270,9 @@ static int vnodeCommitImpl(SCommitInfo *pInfo) {
SVnode *pVnode = pInfo->pVnode;
vInfo("vgId:%d, start to commit, commit ID:%" PRId64 " version:%" PRId64 " term: %" PRId64, TD_VID(pVnode),
pVnode->state.commitID, pVnode->state.applied, pVnode->state.applyTerm);
pInfo->info.state.commitID, pInfo->info.state.committed, pInfo->info.state.commitTerm);
pVnode->commitMs = taosGetMonoTimestampMs();
// persist wal before starting
if (walPersist(pVnode->pWal) < 0) {
......
......@@ -249,15 +249,18 @@ void vnodePreClose(SVnode *pVnode) {
void vnodeClose(SVnode *pVnode) {
if (pVnode) {
vnodeSyncCommit(pVnode);
vnodeSyncClose(pVnode);
vnodeQueryClose(pVnode);
tsem_wait(&pVnode->canCommit);
walClose(pVnode->pWal);
tqClose(pVnode->pTq);
if (pVnode->pTsdb) tsdbClose(&pVnode->pTsdb);
smaClose(pVnode->pSma);
metaClose(pVnode->pMeta);
vnodeCloseBufPool(pVnode);
tsem_post(&pVnode->canCommit);
// destroy handle
tsem_destroy(&(pVnode->canCommit));
tsem_destroy(&pVnode->syncSem);
......
......@@ -200,6 +200,7 @@ int32_t vnodeProcessWriteMsg(SVnode *pVnode, SRpcMsg *pMsg, int64_t version, SRp
// skip header
pReq = POINTER_SHIFT(pMsg->pCont, sizeof(SMsgHead));
len = pMsg->contLen - sizeof(SMsgHead);
bool needCommit = false;
switch (pMsg->msgType) {
/* META */
......@@ -296,9 +297,8 @@ int32_t vnodeProcessWriteMsg(SVnode *pVnode, SRpcMsg *pMsg, int64_t version, SRp
vnodeProcessAlterConfigReq(pVnode, version, pReq, len, pRsp);
break;
case TDMT_VND_COMMIT:
vnodeSyncCommit(pVnode);
vnodeBegin(pVnode);
goto _exit;
needCommit = true;
break;
default:
vError("vgId:%d, unprocessed msg, %d", TD_VID(pVnode), pMsg->msgType);
return -1;
......@@ -315,7 +315,7 @@ int32_t vnodeProcessWriteMsg(SVnode *pVnode, SRpcMsg *pMsg, int64_t version, SRp
}
// commit if need
if (vnodeShouldCommit(pVnode)) {
if (needCommit) {
vInfo("vgId:%d, commit at version %" PRId64, TD_VID(pVnode), version);
vnodeAsyncCommit(pVnode);
......
......@@ -101,6 +101,64 @@ static void vnodeHandleProposeError(SVnode *pVnode, SRpcMsg *pMsg, int32_t code)
}
}
static int32_t inline vnodeProposeMsg(SVnode *pVnode, SRpcMsg *pMsg, bool isWeak) {
int64_t seq = 0;
taosThreadMutexLock(&pVnode->lock);
int32_t code = syncPropose(pVnode->sync, pMsg, isWeak, &seq);
bool wait = (code == 0 && vnodeIsMsgBlock(pMsg->msgType));
if (wait) {
ASSERT(!pVnode->blocked);
pVnode->blocked = true;
pVnode->blockSec = taosGetTimestampSec();
pVnode->blockSeq = seq;
#if 0
pVnode->blockInfo = pMsg->info;
#endif
}
taosThreadMutexUnlock(&pVnode->lock);
if (code > 0) {
vnodeHandleWriteMsg(pVnode, pMsg);
} else if (code < 0) {
if (terrno != 0) code = terrno;
vnodeHandleProposeError(pVnode, pMsg, code);
}
if (wait) vnodeWaitBlockMsg(pVnode, pMsg);
return code;
}
void vnodeProposeCommitOnNeed(SVnode *pVnode) {
if (!vnodeShouldCommit(pVnode)) {
return;
}
int32_t contLen = sizeof(SMsgHead);
SMsgHead *pHead = rpcMallocCont(contLen);
pHead->contLen = contLen;
pHead->vgId = pVnode->config.vgId;
SRpcMsg rpcMsg = {0};
rpcMsg.msgType = TDMT_VND_COMMIT;
rpcMsg.contLen = contLen;
rpcMsg.pCont = pHead;
rpcMsg.info.noResp = 1;
bool isWeak = false;
if (vnodeProposeMsg(pVnode, &rpcMsg, isWeak) < 0) {
vTrace("vgId:%d, failed to propose vnode commit since %s", pVnode->config.vgId, terrstr());
goto _out;
}
vInfo("vgId:%d, proposed vnode commit", pVnode->config.vgId);
_out:
pVnode->commitMs = taosGetMonoTimestampMs();
rpcFreeCont(rpcMsg.pCont);
rpcMsg.pCont = NULL;
}
#if BATCH_ENABLE
static void inline vnodeProposeBatchMsg(SVnode *pVnode, SRpcMsg **pMsgArr, bool *pIsWeakArr, int32_t *arrSize) {
......@@ -178,6 +236,8 @@ void vnodeProposeWriteMsg(SQueueInfo *pInfo, STaosQall *qall, int32_t numOfMsgs)
continue;
}
vnodeProposeCommitOnNeed(pVnode);
code = vnodePreProcessWriteMsg(pVnode, pMsg);
if (code != 0) {
vGError("vgId:%d, msg:%p failed to pre-process since %s", vgId, pMsg, terrstr());
......@@ -205,34 +265,6 @@ void vnodeProposeWriteMsg(SQueueInfo *pInfo, STaosQall *qall, int32_t numOfMsgs)
#else
static int32_t inline vnodeProposeMsg(SVnode *pVnode, SRpcMsg *pMsg, bool isWeak) {
int64_t seq = 0;
taosThreadMutexLock(&pVnode->lock);
int32_t code = syncPropose(pVnode->sync, pMsg, isWeak, &seq);
bool wait = (code == 0 && vnodeIsMsgBlock(pMsg->msgType));
if (wait) {
ASSERT(!pVnode->blocked);
pVnode->blocked = true;
pVnode->blockSec = taosGetTimestampSec();
pVnode->blockSeq = seq;
#if 0
pVnode->blockInfo = pMsg->info;
#endif
}
taosThreadMutexUnlock(&pVnode->lock);
if (code > 0) {
vnodeHandleWriteMsg(pVnode, pMsg);
} else if (code < 0) {
if (terrno != 0) code = terrno;
vnodeHandleProposeError(pVnode, pMsg, code);
}
if (wait) vnodeWaitBlockMsg(pVnode, pMsg);
return code;
}
void vnodeProposeWriteMsg(SQueueInfo *pInfo, STaosQall *qall, int32_t numOfMsgs) {
SVnode *pVnode = pInfo->ahandle;
int32_t vgId = pVnode->config.vgId;
......@@ -256,6 +288,8 @@ void vnodeProposeWriteMsg(SQueueInfo *pInfo, STaosQall *qall, int32_t numOfMsgs)
continue;
}
vnodeProposeCommitOnNeed(pVnode);
code = vnodePreProcessWriteMsg(pVnode, pMsg);
if (code != 0) {
vGError("vgId:%d, msg:%p failed to pre-process since %s", vgId, pMsg, terrstr());
......
......@@ -326,6 +326,8 @@ int32_t syncLogBufferAccept(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEnt
}
// update
ASSERT(pBuf->startIndex < index);
ASSERT(index - pBuf->startIndex < pBuf->size);
ASSERT(pBuf->entries[index % pBuf->size].pItem == NULL);
SSyncLogBufEntry tmp = {.pItem = pEntry, .prevLogIndex = prevIndex, .prevLogTerm = prevTerm};
pEntry = NULL;
......@@ -454,6 +456,11 @@ int32_t syncLogFsmExecute(SSyncNode* pNode, SSyncFSM* pFsm, ESyncState role, Syn
pEntry->index, pEntry->term, TMSG_INFO(pEntry->originalRpcType));
}
if (pEntry->originalRpcType == TDMT_VND_COMMIT) {
sInfo("vgId:%d, fsm execute vnode commit. index: %" PRId64 ", term: %" PRId64 "", pNode->vgId, pEntry->index,
pEntry->term);
}
SRpcMsg rpcMsg = {0};
syncEntry2OriginalRpc(pEntry, &rpcMsg);
......
......@@ -219,6 +219,10 @@ static int32_t raftLogAppendEntry(struct SSyncLogStore* pLogStore, SSyncRaftEntr
ASSERT(pEntry->index == index);
if (pEntry->originalRpcType == TDMT_VND_COMMIT) {
walFsync(pWal, true);
}
sNTrace(pData->pSyncNode, "write index:%" PRId64 ", type:%s, origin type:%s, elapsed:%" PRId64, pEntry->index,
TMSG_INFO(pEntry->msgType), TMSG_INFO(pEntry->originalRpcType), tsElapsed);
return 0;
......
......@@ -107,7 +107,7 @@ int32_t syncRespMgrGetAndDel(SSyncRespMgr *pObj, uint64_t seq, SRpcHandleInfo *p
taosThreadMutexUnlock(&pObj->mutex);
return 1; // get one object
} else {
sNError(pObj->data, "get-and-del message handle, no object of seq:%" PRIu64, seq);
sNTrace(pObj->data, "get-and-del message handle, no object of seq:%" PRIu64, seq);
}
taosThreadMutexUnlock(&pObj->mutex);
......
......@@ -74,7 +74,12 @@ int32_t tdbTbcUpsert(TBC *pTbc, const void *pKey, int nKey, const void *pData, i
int32_t tdbTxnOpen(TXN *pTxn, int64_t txnid, void *(*xMalloc)(void *, size_t), void (*xFree)(void *, void *),
void *xArg, int flags);
int32_t tdbTxnClose(TXN *pTxn);
int32_t tdbTxnCloseImpl(TXN *pTxn);
#define tdbTxnClose(pTxn) \
do { \
tdbTxnCloseImpl(pTxn); \
(pTxn) = NULL; \
} while (0)
// other
void tdbFree(void *);
......
......@@ -77,7 +77,7 @@ int tdbPageDestroy(SPage *pPage, void (*xFree)(void *arg, void *ptr), void *arg)
u8 *ptr;
tdbTrace("page/destroy: %p/%d %p", pPage, pPage->id, xFree);
ASSERT(!pPage->isDirty);
// ASSERT(!pPage->isDirty);
ASSERT(xFree);
for (int iOvfl = 0; iOvfl < pPage->nOverflow; iOvfl++) {
......
......@@ -28,13 +28,18 @@ int tdbTxnOpen(TXN *pTxn, int64_t txnid, void *(*xMalloc)(void *, size_t), void
return 0;
}
int tdbTxnClose(TXN *pTxn) {
int tdbTxnCloseImpl(TXN *pTxn) {
if (pTxn) {
if (pTxn->jPageSet) {
hashset_destroy(pTxn->jPageSet);
pTxn->jPageSet = NULL;
}
if (pTxn->jfd) {
tdbOsClose(pTxn->jfd);
ASSERT(pTxn->jfd == NULL);
}
tdbOsFree(pTxn);
}
......
......@@ -1124,7 +1124,7 @@ void cliHandleReq(SCliMsg* pMsg, SCliThrd* pThrd) {
int ret = uv_tcp_connect(&conn->connReq, (uv_tcp_t*)(conn->stream), (const struct sockaddr*)&addr, cliConnCb);
if (ret != 0) {
tGTrace("%s conn %p failed to connect to %s:%d, reason:%s", pTransInst->label, conn, conn->ip, conn->port,
tGError("%s conn %p failed to connect to %s:%d, reason:%s", pTransInst->label, conn, conn->ip, conn->port,
uv_err_name(ret));
uv_timer_stop(conn->timer);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册