diff --git a/src/inc/taosdef.h b/src/inc/taosdef.h index fa8af2c67e5dff0cbc0a66a4fca3bfa9f84cbe26..875092b88da1aa03b7eb017e14fff3148f3644a8 100644 --- a/src/inc/taosdef.h +++ b/src/inc/taosdef.h @@ -431,6 +431,8 @@ void tsDataSwap(void *pLeft, void *pRight, int32_t type, int32_t size, void* buf #define TSDB_PORT_HTTP 11 #define TSDB_PORT_ARBITRATOR 12 +#define TSDB_MAX_WAL_SIZE (1024*1024) + typedef enum { TAOS_QTYPE_RPC = 0, TAOS_QTYPE_FWD = 1, diff --git a/src/inc/taoserror.h b/src/inc/taoserror.h index 01891234efeae1972a3884c57b99d204a335e7d5..1919747a0b019239e5d6b717921c6a957728d455 100644 --- a/src/inc/taoserror.h +++ b/src/inc/taoserror.h @@ -237,7 +237,7 @@ TAOS_DEFINE_ERROR(TSDB_CODE_QRY_NOT_READY, 0, 0x0707, "Query not TAOS_DEFINE_ERROR(TSDB_CODE_QRY_HAS_RSP, 0, 0x0708, "Query should response") TAOS_DEFINE_ERROR(TSDB_CODE_QRY_IN_EXEC, 0, 0x0709, "Multiple retrieval of this query") TAOS_DEFINE_ERROR(TSDB_CODE_QRY_TOO_MANY_TIMEWINDOW, 0, 0x070A, "Too many time window in query") -TAOS_DEFINE_ERROR(TSDB_CODE_QRY_NOT_ENOUGH_BUFFER, 0, 0x070B, "Query buffer limit has reached") +TAOS_DEFINE_ERROR(TSDB_CODE_QRY_NOT_ENOUGH_BUFFER, 0, 0x070B, "Query buffer limit has reached") // grant TAOS_DEFINE_ERROR(TSDB_CODE_GRANT_EXPIRED, 0, 0x0800, "License expired") @@ -261,6 +261,7 @@ TAOS_DEFINE_ERROR(TSDB_CODE_SYN_INVALID_VERSION, 0, 0x0902, "Invalid Sy // wal TAOS_DEFINE_ERROR(TSDB_CODE_WAL_APP_ERROR, 0, 0x1000, "Unexpected generic error in wal") TAOS_DEFINE_ERROR(TSDB_CODE_WAL_FILE_CORRUPTED, 0, 0x1001, "WAL file is corrupted") +TAOS_DEFINE_ERROR(TSDB_CODE_WAL_SIZE_LIMIT, 0, 0x1002, "WAL size exceeds limit") // http TAOS_DEFINE_ERROR(TSDB_CODE_HTTP_SERVER_OFFLINE, 0, 0x1100, "http server is not onlin") diff --git a/src/inc/twal.h b/src/inc/twal.h index c32bb870213fbcb183af599b0ac94b37f54e8906..b85377d8d43521f4927b958d1839a9e565431c6a 100644 --- a/src/inc/twal.h +++ b/src/inc/twal.h @@ -59,6 +59,7 @@ int32_t walAlter(twalh pWal, SWalCfg *pCfg); void walStop(twalh); void walClose(twalh); int32_t walRenew(twalh); +void walRemoveOldFiles(twalh); int32_t walWrite(twalh, SWalHead *); void walFsync(twalh, bool forceFsync); int32_t walRestore(twalh, void *pVnode, FWalWrite writeFp); diff --git a/src/sync/inc/syncInt.h b/src/sync/inc/syncInt.h index 240b401bdaef252e36109490658b39ede749a8fe..93c6efc20a8d79a7d6436680f01cee87dadda853 100644 --- a/src/sync/inc/syncInt.h +++ b/src/sync/inc/syncInt.h @@ -35,6 +35,8 @@ extern "C" { #define TAOS_SMSG_SYNC_MUST 6 #define TAOS_SMSG_STATUS 7 +#define SYNC_MAX_SIZE (TSDB_MAX_WAL_SIZE + sizeof(SWalHead) + sizeof(SSyncHead) + 16) + #define nodeRole pNode->peerInfo[pNode->selfIndex]->role #define nodeVersion pNode->peerInfo[pNode->selfIndex]->version #define nodeSStatus pNode->peerInfo[pNode->selfIndex]->sstatus diff --git a/src/sync/src/syncMain.c b/src/sync/src/syncMain.c index 9dcd0fd632396baf5f8bf68c3df24dae72d45a02..ef1ada4c2efd4967bdfee871596a61806db46a15 100644 --- a/src/sync/src/syncMain.c +++ b/src/sync/src/syncMain.c @@ -79,7 +79,7 @@ int32_t syncInit() { info.numOfThreads = tsSyncTcpThreads; info.serverIp = 0; info.port = tsSyncPort; - info.bufferSize = 640000; + info.bufferSize = SYNC_MAX_SIZE; info.processBrokenLink = syncProcessBrokenLink; info.processIncomingMsg = syncProcessPeerMsg; info.processIncomingConn = syncProcessIncommingConnection; @@ -850,7 +850,7 @@ static void syncProcessForwardFromPeer(char *cont, SSyncPeer *pPeer) { SSyncNode *pNode = pPeer->pSyncNode; SWalHead * pHead = (SWalHead *)cont; - sDebug("%s, forward is received, ver:%" PRIu64, pPeer->id, pHead->version); + sDebug("%s, forward is received, hver:%" PRIu64 ", len:%d", pPeer->id, pHead->version, pHead->len); if (nodeRole == TAOS_SYNC_ROLE_SLAVE) { // nodeVersion = pHead->version; @@ -859,7 +859,7 @@ static void syncProcessForwardFromPeer(char *cont, SSyncPeer *pPeer) { if (nodeSStatus != TAOS_SYNC_STATUS_INIT) { syncSaveIntoBuffer(pPeer, pHead); } else { - sError("%s, forward discarded, ver:%" PRIu64, pPeer->id, pHead->version); + sError("%s, forward discarded, hver:%" PRIu64, pPeer->id, pHead->version); } } } @@ -890,10 +890,11 @@ static int32_t syncReadPeerMsg(SSyncPeer *pPeer, SSyncHead *pHead, char *cont) { // head.len = htonl(head.len); if (pHead->len < 0) { - sError("%s, invalid pkt length, len:%d", pPeer->id, pHead->len); + sError("%s, invalid pkt length, hlen:%d", pPeer->id, pHead->len); return -1; } + assert(pHead->len <= TSDB_MAX_WAL_SIZE); int32_t bytes = taosReadMsg(pPeer->peerFd, cont, pHead->len); if (bytes != pHead->len) { sError("%s, failed to read, bytes:%d len:%d", pPeer->id, bytes, pHead->len); diff --git a/src/sync/src/syncRetrieve.c b/src/sync/src/syncRetrieve.c index 82a4627ea55a9f0caf041b9ee41b4b3e50c229e4..968f5becaddc7b06c06171a4d91cc0e0ffffc0da 100644 --- a/src/sync/src/syncRetrieve.c +++ b/src/sync/src/syncRetrieve.c @@ -244,7 +244,7 @@ static int32_t syncCheckLastWalChanges(SSyncPeer *pPeer, uint32_t *pEvent) { } static int32_t syncRetrieveLastWal(SSyncPeer *pPeer, char *name, uint64_t fversion, int64_t offset, uint32_t *pEvent) { - SWalHead *pHead = malloc(640000); + SWalHead *pHead = malloc(SYNC_MAX_SIZE); int32_t code = -1; int32_t bytes = 0; int32_t sfd; diff --git a/src/sync/src/tarbitrator.c b/src/sync/src/tarbitrator.c index 496bf074357487ab1c87072948b2e26be056dc58..ae8f2ea518204d57891aa71ac7b4f3ee9e863ed7 100644 --- a/src/sync/src/tarbitrator.c +++ b/src/sync/src/tarbitrator.c @@ -86,7 +86,7 @@ int32_t main(int32_t argc, char *argv[]) { info.numOfThreads = 1; info.serverIp = 0; info.port = tsArbitratorPort; - info.bufferSize = 640000; + info.bufferSize = SYNC_MAX_SIZE; info.processBrokenLink = arbProcessBrokenLink; info.processIncomingMsg = arbProcessPeerMsg; info.processIncomingConn = arbProcessIncommingConnection; diff --git a/src/vnode/src/vnodeMain.c b/src/vnode/src/vnodeMain.c index bd44ce8e1cd65761a084686e4281cd237720d762..7592b4edc4c6d62c2f404242426d625d7b3518c7 100644 --- a/src/vnode/src/vnodeMain.c +++ b/src/vnode/src/vnodeMain.c @@ -583,6 +583,7 @@ static int vnodeProcessTsdbStatus(void *arg, int status) { if (status == TSDB_STATUS_COMMIT_OVER) { vDebug("vgId:%d, commit over, fver:%" PRIu64 " vver:%" PRIu64, pVnode->vgId, pVnode->fversion, pVnode->version); + walRemoveOldFiles(pVnode->wal); return vnodeSaveVersion(pVnode); } diff --git a/src/vnode/src/vnodeWrite.c b/src/vnode/src/vnodeWrite.c index 99594607ac0a19e31f438e0a02320ee189d55253..e67c544fb23456d8775077e8a86f1745508bd0fc 100644 --- a/src/vnode/src/vnodeWrite.c +++ b/src/vnode/src/vnodeWrite.c @@ -217,6 +217,11 @@ int32_t vnodeWriteToWQueue(void *vparam, void *wparam, int32_t qtype, void *rpar if (code != TSDB_CODE_SUCCESS) return code; } + if (pHead->len > TSDB_MAX_WAL_SIZE) { + vError("vgId:%d, wal len:%d exceeds limit, hver:%" PRIu64, pVnode->vgId, pHead->len, pHead->version); + return TSDB_CODE_WAL_SIZE_LIMIT; + } + int32_t size = sizeof(SVWriteMsg) + sizeof(SWalHead) + pHead->len; SVWriteMsg *pWrite = taosAllocateQitem(size); if (pWrite == NULL) { diff --git a/src/wal/inc/walInt.h b/src/wal/inc/walInt.h index d1e977225929d8e34b42fbe6aec53c3f41bb7273..36311c8f5d92fe0ba97793224df59d4dd33b5da6 100644 --- a/src/wal/inc/walInt.h +++ b/src/wal/inc/walInt.h @@ -34,7 +34,7 @@ extern int32_t wDebugFlag; #define WAL_PREFIX "wal" #define WAL_PREFIX_LEN 3 #define WAL_REFRESH_MS 1000 -#define WAL_MAX_SIZE (1024 * 1024) +#define WAL_MAX_SIZE (TSDB_MAX_WAL_SIZE + sizeof(SWalHead) + 16) #define WAL_SIGNATURE ((uint32_t)(0xFAFBFDFE)) #define WAL_PATH_LEN (TSDB_FILENAME_LEN + 12) #define WAL_FILE_LEN (TSDB_FILENAME_LEN + 32) diff --git a/src/wal/src/walMgmt.c b/src/wal/src/walMgmt.c index de666c85e832aa52ce33d442f543c0512d91adb8..9ba0dfd1240e56eec22c2f8a2eb42ec0326fc32c 100644 --- a/src/wal/src/walMgmt.c +++ b/src/wal/src/walMgmt.c @@ -135,7 +135,7 @@ void walClose(void *handle) { if (remove(pWal->name) < 0) { wError("vgId:%d, wal:%p file:%s, failed to remove", pWal->vgId, pWal, pWal->name); } else { - wDebug("vgId:%d, wal:%p file:%s, it is removed", pWal->vgId, pWal, pWal->name); + wInfo("vgId:%d, wal:%p file:%s, it is removed", pWal->vgId, pWal, pWal->name); } } } else { diff --git a/src/wal/src/walWrite.c b/src/wal/src/walWrite.c index 9681f4b898d0661d25650067362be429854e8c94..d3a41ec6b22883468625ef4a858f4d2f7d88a773 100644 --- a/src/wal/src/walWrite.c +++ b/src/wal/src/walWrite.c @@ -58,24 +58,32 @@ int32_t walRenew(void *handle) { wDebug("vgId:%d, file:%s, it is created", pWal->vgId, pWal->name); } - if (pWal->keep != TAOS_WAL_KEEP) { - // remove the oldest wal file - int64_t oldFileId = -1; - if (walGetOldFile(pWal, pWal->fileId, WAL_FILE_NUM, &oldFileId) == 0) { - char walName[WAL_FILE_LEN] = {0}; - snprintf(walName, sizeof(walName), "%s/%s%" PRId64, pWal->path, WAL_PREFIX, oldFileId); - - if (remove(walName) < 0) { - wError("vgId:%d, file:%s, failed to remove since %s", pWal->vgId, walName, strerror(errno)); - } else { - wDebug("vgId:%d, file:%s, it is removed", pWal->vgId, walName); - } + pthread_mutex_unlock(&pWal->mutex); + + return code; +} + +void walRemoveOldFiles(void *handle) { + SWal *pWal = handle; + if (pWal == NULL) return; + if (pWal->keep == TAOS_WAL_KEEP) return; + + pthread_mutex_lock(&pWal->mutex); + + // remove the oldest wal file + int64_t oldFileId = -1; + if (walGetOldFile(pWal, pWal->fileId, WAL_FILE_NUM, &oldFileId) == 0) { + char walName[WAL_FILE_LEN] = {0}; + snprintf(walName, sizeof(walName), "%s/%s%" PRId64, pWal->path, WAL_PREFIX, oldFileId); + + if (remove(walName) < 0) { + wError("vgId:%d, file:%s, failed to remove since %s", pWal->vgId, walName, strerror(errno)); + } else { + wInfo("vgId:%d, file:%s, it is removed", pWal->vgId, walName); } } pthread_mutex_unlock(&pWal->mutex); - - return code; } int32_t walWrite(void *handle, SWalHead *pHead) { diff --git a/tests/script/general/wal/kill.sim b/tests/script/general/wal/kill.sim new file mode 100644 index 0000000000000000000000000000000000000000..7f103874a561c2bb2534996ab30d30ab0e8907a3 --- /dev/null +++ b/tests/script/general/wal/kill.sim @@ -0,0 +1,77 @@ +system sh/stop_dnodes.sh +system sh/deploy.sh -n dnode1 -i 1 + +print ============== deploy +system sh/exec.sh -n dnode1 -s start +sleep 3001 +sql connect + +sql create database d1 +sql use d1 + +sql create table t1 (ts timestamp, i int) +sql insert into t1 values(now, 1); + +print =============== step3 +sleep 3000 +sql select * from t1; +print rows: $rows +if $rows != 1 then + return -1 +endi +system sh/exec.sh -n dnode1 -s stop -x SIGKILL +sleep 3000 + +print =============== step4 +system sh/exec.sh -n dnode1 -s start -x SIGKILL +sleep 3000 +sql select * from t1; +print rows: $rows +if $rows != 1 then + return -1 +endi +system sh/exec.sh -n dnode1 -s stop -x SIGKILL +sleep 3000 + +print =============== step5 +system sh/exec.sh -n dnode1 -s start -x SIGKILL +sleep 3000 +sql select * from t1; +print rows: $rows +if $rows != 1 then + return -1 +endi +system sh/exec.sh -n dnode1 -s stop -x SIGKILL +sleep 3000 + +print =============== step6 +system sh/exec.sh -n dnode1 -s start -x SIGKILL +sleep 3000 +sql select * from t1; +print rows: $rows +if $rows != 1 then + return -1 +endi +system sh/exec.sh -n dnode1 -s stop -x SIGKILL +sleep 3000 + +print =============== step7 +system sh/exec.sh -n dnode1 -s start -x SIGKILL +sleep 3000 +sql select * from t1; +print rows: $rows +if $rows != 1 then + return -1 +endi +system sh/exec.sh -n dnode1 -s stop -x SIGKILL +sleep 3000 + +print =============== step8 +system sh/exec.sh -n dnode1 -s start -x SIGKILL +sleep 3000 +sql select * from t1; +print rows: $rows +if $rows != 1 then + return -1 +endi +system sh/exec.sh -n dnode1 -s stop -x SIGKILL diff --git a/tests/script/general/wal/sync.sim b/tests/script/general/wal/sync.sim new file mode 100644 index 0000000000000000000000000000000000000000..abaf22f91921e5bcc8effbe6fc1c66766ff92a3f --- /dev/null +++ b/tests/script/general/wal/sync.sim @@ -0,0 +1,124 @@ +system sh/stop_dnodes.sh + +system sh/deploy.sh -n dnode1 -i 1 +system sh/deploy.sh -n dnode2 -i 2 +system sh/deploy.sh -n dnode3 -i 3 + +system sh/cfg.sh -n dnode1 -c numOfMnodes -v 3 +system sh/cfg.sh -n dnode2 -c numOfMnodes -v 3 +system sh/cfg.sh -n dnode3 -c numOfMnodes -v 3 + +system sh/cfg.sh -n dnode1 -c mnodeEqualVnodeNum -v 4 +system sh/cfg.sh -n dnode2 -c mnodeEqualVnodeNum -v 4 +system sh/cfg.sh -n dnode3 -c mnodeEqualVnodeNum -v 4 + +system sh/cfg.sh -n dnode1 -c http -v 1 +system sh/cfg.sh -n dnode2 -c http -v 1 +system sh/cfg.sh -n dnode3 -c http -v 1 + +system sh/cfg.sh -n dnode1 -c maxTablesPerVnode -v 20000 +system sh/cfg.sh -n dnode2 -c maxTablesPerVnode -v 20000 +system sh/cfg.sh -n dnode3 -c maxTablesPerVnode -v 20000 + +system sh/cfg.sh -n dnode1 -c replica -v 3 +system sh/cfg.sh -n dnode2 -c replica -v 3 +system sh/cfg.sh -n dnode3 -c replica -v 3 + +system sh/cfg.sh -n dnode1 -c maxSQLLength -v 940032 +system sh/cfg.sh -n dnode2 -c maxSQLLength -v 940032 +system sh/cfg.sh -n dnode3 -c maxSQLLength -v 940032 + +print ============== deploy + +system sh/exec.sh -n dnode1 -s start +sleep 5001 +sql connect + +sql create dnode $hostname2 +sql create dnode $hostname3 +system sh/exec.sh -n dnode2 -s start +system sh/exec.sh -n dnode3 -s start + +print =============== step1 +$x = 0 +show1: + $x = $x + 1 + sleep 2000 + if $x == 5 then + return -1 + endi +sql show mnodes -x show1 +$mnode1Role = $data2_1 +print mnode1Role $mnode1Role +$mnode2Role = $data2_2 +print mnode2Role $mnode2Role +$mnode3Role = $data2_3 +print mnode3Role $mnode3Role + +if $mnode1Role != master then + goto show1 +endi +if $mnode2Role != slave then + goto show1 +endi +if $mnode3Role != slave then + goto show1 +endi + +print =============== step2 +sql create database d1 replica 3 +sql use d1 + +sql create table table_rest (ts timestamp, i int) +print sql length is 870KB +restful d1 table_rest 1591072800 30000 +restful d1 table_rest 1591172800 30000 +restful d1 table_rest 1591272800 30000 +restful d1 table_rest 1591372800 30000 +restful d1 table_rest 1591472800 30000 +restful d1 table_rest 1591572800 30000 +restful d1 table_rest 1591672800 30000 +restful d1 table_rest 1591772800 30000 +restful d1 table_rest 1591872800 30000 +restful d1 table_rest 1591972800 30000 + +sql select * from table_rest; +print rows: $rows +if $rows != 300000 then + return -1 +endi + +print =============== step3 +system sh/exec.sh -n dnode1 -s stop -x SIGINT +sleep 5000 +sql select * from table_rest; +print rows: $rows +if $rows != 300000 then + return -1 +endi +system sh/exec.sh -n dnode1 -s start -x SIGINT +sleep 5000 + +print =============== step4 +system sh/exec.sh -n dnode2 -s stop -x SIGINT +sleep 5000 +sql select * from table_rest; +print rows: $rows +if $rows != 300000 then + return -1 +endi +system sh/exec.sh -n dnode2 -s start -x SIGINT +sleep 5000 + +print =============== step5 +system sh/exec.sh -n dnode3 -s stop -x SIGINT +sleep 5000 +sql select * from table_rest; +print rows: $rows +if $rows != 300000 then + return -1 +endi + +system sh/exec.sh -n dnode1 -s stop -x SIGINT +system sh/exec.sh -n dnode2 -s stop -x SIGINT +system sh/exec.sh -n dnode3 -s stop -x SIGINT \ No newline at end of file diff --git a/tests/script/jenkins/basic.txt b/tests/script/jenkins/basic.txt index 1b2fe37c71f26486ba847006ecf3aa373b5f55c1..2a84172da9cbd1f72f3a1dc0de3672a82085101b 100644 --- a/tests/script/jenkins/basic.txt +++ b/tests/script/jenkins/basic.txt @@ -236,6 +236,9 @@ cd ../../../debug; make ./test.sh -f general/vector/table_query.sim ./test.sh -f general/vector/table_time.sim +./test.sh -f general/wal/sync.sim +./test.sh -f general/wal/kill.sim + ./test.sh -f unique/account/account_create.sim ./test.sh -f unique/account/account_delete.sim ./test.sh -f unique/account/account_len.sim