diff --git a/include/common/tmsg.h b/include/common/tmsg.h index 202aeaf69e037b073e301bc2971c68eb539099d5..39dc5361e627096b3a61df4846e03cf91d1888da 100644 --- a/include/common/tmsg.h +++ b/include/common/tmsg.h @@ -1982,6 +1982,7 @@ typedef struct { typedef struct { SClientHbKey connKey; + int64_t clusterId; SQueryHbReqBasic* query; SHashObj* info; // hash } SClientHbReq; diff --git a/include/libs/catalog/catalog.h b/include/libs/catalog/catalog.h index f0e642bc9af8060d0b6bc0380f2c85284f307642..ee237741c37134fb80dbae3623168804e9f4d18e 100644 --- a/include/libs/catalog/catalog.h +++ b/include/libs/catalog/catalog.h @@ -68,6 +68,7 @@ typedef struct SCatalogReq { SArray *pIndex; // element is index name SArray *pUser; // element is SUserAuthInfo bool qNodeRequired; // valid qnode + bool forceUpdate; } SCatalogReq; typedef struct SMetaData { @@ -280,7 +281,7 @@ int32_t catalogUpdateUserAuthInfo(SCatalog* pCtg, SGetUserAuthRsp* pAuth); int32_t catalogUpdateVgEpSet(SCatalog* pCtg, const char* dbFName, int32_t vgId, SEpSet *epSet); -int32_t ctgdLaunchAsyncCall(SCatalog* pCtg, void *pTrans, const SEpSet* pMgmtEps, uint64_t reqId); +int32_t ctgdLaunchAsyncCall(SCatalog* pCtg, void *pTrans, const SEpSet* pMgmtEps, uint64_t reqId, bool forceUpdate); /** diff --git a/include/libs/qcom/query.h b/include/libs/qcom/query.h index 45a7e9a29f3457a68e9998659237a9e0d70d39ab..a17be4484604206a133d95966b7c9d5e812a4956 100644 --- a/include/libs/qcom/query.h +++ b/include/libs/qcom/query.h @@ -222,7 +222,7 @@ extern int32_t (*queryProcessMsgRsp[TDMT_MAX])(void* output, char* msg, int32_t || (_type) == TDMT_VND_DROP_TABLE || (_type) == TDMT_VND_DROP_STB) #define NEED_SCHEDULER_RETRY_ERROR(_code) \ - ((_code) == TSDB_CODE_RPC_REDIRECT || (_code) == TSDB_CODE_RPC_NETWORK_UNAVAIL) + ((_code) == TSDB_CODE_RPC_REDIRECT || (_code) == TSDB_CODE_RPC_NETWORK_UNAVAIL || (_code) == TSDB_CODE_SCH_TIMEOUT_ERROR) #define REQUEST_MAX_TRY_TIMES 1 diff --git a/include/libs/scheduler/scheduler.h b/include/libs/scheduler/scheduler.h index 5cb6dbd3bcb55f7ad2411fb8eb72754475b59eab..22706f0953298a95b40552d068189fffef24f116 100644 --- a/include/libs/scheduler/scheduler.h +++ b/include/libs/scheduler/scheduler.h @@ -104,6 +104,8 @@ void schedulerAsyncFetchRows(int64_t job, schedulerFetchCallback fp, void* param int32_t schedulerGetTasksStatus(int64_t job, SArray *pSub); +void schedulerStopQueryHb(void *pTrans); + /** * Cancel query job diff --git a/source/client/inc/clientInt.h b/source/client/inc/clientInt.h index 09116a6bd18999a293203955877b57f6018e635b..eaf18884c62caa0892a8aad6eeb22b4072567d4c 100644 --- a/source/client/inc/clientInt.h +++ b/source/client/inc/clientInt.h @@ -57,11 +57,6 @@ enum { typedef struct SAppInstInfo SAppInstInfo; -typedef struct { - void* param; - SClientHbReq* req; -} SHbConnInfo; - typedef struct { char* key; // statistics @@ -71,11 +66,8 @@ typedef struct { int64_t startTime; // ctl SRWLatch lock; // lock is used in serialization - // connection SAppInstInfo* pAppInstInfo; - // info SHashObj* activeInfo; // hash - SHashObj* connInfo; // hash } SAppHbMgr; typedef int32_t (*FHbRspHandle)(SAppHbMgr* pAppHbMgr, SClientHbRsp* pRsp); @@ -325,8 +317,6 @@ void appHbMgrCleanup(void); int hbRegisterConn(SAppHbMgr* pAppHbMgr, int64_t tscRefId, int64_t clusterId, int8_t connType); void hbDeregisterConn(SAppHbMgr* pAppHbMgr, SClientHbKey connKey); -int hbAddConnInfo(SAppHbMgr* pAppHbMgr, SClientHbKey connKey, void* key, void* value, int32_t keyLen, int32_t valueLen); - // --- mq void hbMgrInitMqHbRspHandle(); diff --git a/source/client/src/clientEnv.c b/source/client/src/clientEnv.c index 4b39a515847e77bd9f2df26f446c5da69f9de75c..be0a41ba40e417d2c7596ae985d056827c3f46f6 100644 --- a/source/client/src/clientEnv.c +++ b/source/client/src/clientEnv.c @@ -130,8 +130,13 @@ void destroyTscObj(void *pObj) { SClientHbKey connKey = {.tscRid = pTscObj->id, .connType = pTscObj->connType}; hbDeregisterConn(pTscObj->pAppInfo->pAppHbMgr, connKey); - atomic_sub_fetch_64(&pTscObj->pAppInfo->numOfConns, 1); + int64_t connNum = atomic_sub_fetch_64(&pTscObj->pAppInfo->numOfConns, 1); closeAllRequests(pTscObj->pRequests); + schedulerStopQueryHb(pTscObj->pAppInfo->pTransporter); + if (0 == connNum) { + // TODO + //closeTransporter(pTscObj); + } tscDebug("connObj 0x%" PRIx64 " destroyed, totalConn:%" PRId64, pTscObj->id, pTscObj->pAppInfo->numOfConns); taosThreadMutexDestroy(&pTscObj->mutex); taosMemoryFreeClear(pTscObj); @@ -223,6 +228,10 @@ static void doDestroyRequest(void *p) { taosHashRemove(pRequest->pTscObj->pRequests, &pRequest->self, sizeof(pRequest->self)); + if (pRequest->body.queryJob != 0) { + schedulerFreeJob(pRequest->body.queryJob); + } + taosMemoryFreeClear(pRequest->msgBuf); taosMemoryFreeClear(pRequest->sqlstr); taosMemoryFreeClear(pRequest->pDb); @@ -230,10 +239,6 @@ static void doDestroyRequest(void *p) { doFreeReqResultInfo(&pRequest->body.resInfo); qDestroyQueryPlan(pRequest->body.pDag); - if (pRequest->body.queryJob != 0) { - schedulerFreeJob(pRequest->body.queryJob); - } - taosArrayDestroy(pRequest->tableList); taosArrayDestroy(pRequest->dbList); diff --git a/source/client/src/clientHb.c b/source/client/src/clientHb.c index 09c3d269c703d6e2dc78cbef49a7790c98f34245..a4c109ee17b584a41a322a2cb868e42c446fdf58 100644 --- a/source/client/src/clientHb.c +++ b/source/client/src/clientHb.c @@ -129,9 +129,9 @@ static int32_t hbProcessStbInfoRsp(void *value, int32_t valueLen, struct SCatalo } static int32_t hbQueryHbRspHandle(SAppHbMgr *pAppHbMgr, SClientHbRsp *pRsp) { - SHbConnInfo *info = taosHashGet(pAppHbMgr->connInfo, &pRsp->connKey, sizeof(SClientHbKey)); - if (NULL == info) { - tscWarn("fail to get connInfo, may be dropped, refId:%" PRIx64 ", type:%d", pRsp->connKey.tscRid, + SClientHbReq *pReq = taosHashGet(pAppHbMgr->activeInfo, &pRsp->connKey, sizeof(SClientHbKey)); + if (NULL == pReq) { + tscWarn("pReq to get activeInfo, may be dropped, refId:%" PRIx64 ", type:%d", pRsp->connKey.tscRid, pRsp->connKey.connType); return TSDB_CODE_SUCCESS; } @@ -181,12 +181,11 @@ static int32_t hbQueryHbRspHandle(SAppHbMgr *pAppHbMgr, SClientHbRsp *pRsp) { break; } - int64_t *clusterId = (int64_t *)info->param; struct SCatalog *pCatalog = NULL; - int32_t code = catalogGetHandle(*clusterId, &pCatalog); + int32_t code = catalogGetHandle(pReq->clusterId, &pCatalog); if (code != TSDB_CODE_SUCCESS) { - tscWarn("catalogGetHandle failed, clusterId:%" PRIx64 ", error:%s", *clusterId, tstrerror(code)); + tscWarn("catalogGetHandle failed, clusterId:%" PRIx64 ", error:%s", pReq->clusterId, tstrerror(code)); break; } @@ -199,12 +198,11 @@ static int32_t hbQueryHbRspHandle(SAppHbMgr *pAppHbMgr, SClientHbRsp *pRsp) { break; } - int64_t *clusterId = (int64_t *)info->param; struct SCatalog *pCatalog = NULL; - int32_t code = catalogGetHandle(*clusterId, &pCatalog); + int32_t code = catalogGetHandle(pReq->clusterId, &pCatalog); if (code != TSDB_CODE_SUCCESS) { - tscWarn("catalogGetHandle failed, clusterId:%" PRIx64 ", error:%s", *clusterId, tstrerror(code)); + tscWarn("catalogGetHandle failed, clusterId:%" PRIx64 ", error:%s", pReq->clusterId, tstrerror(code)); break; } @@ -217,12 +215,11 @@ static int32_t hbQueryHbRspHandle(SAppHbMgr *pAppHbMgr, SClientHbRsp *pRsp) { break; } - int64_t *clusterId = (int64_t *)info->param; struct SCatalog *pCatalog = NULL; - int32_t code = catalogGetHandle(*clusterId, &pCatalog); + int32_t code = catalogGetHandle(pReq->clusterId, &pCatalog); if (code != TSDB_CODE_SUCCESS) { - tscWarn("catalogGetHandle failed, clusterId:%" PRIx64 ", error:%s", *clusterId, tstrerror(code)); + tscWarn("catalogGetHandle failed, clusterId:%" PRIx64 ", error:%s", pReq->clusterId, tstrerror(code)); break; } @@ -547,13 +544,10 @@ SClientHbBatchReq *hbGatherAllInfo(SAppHbMgr *pAppHbMgr) { pOneReq = taosArrayPush(pBatchReq->reqs, pOneReq); - SHbConnInfo *info = taosHashGet(pAppHbMgr->connInfo, &pOneReq->connKey, sizeof(SClientHbKey)); - if (info) { - code = (*clientHbMgr.reqHandle[pOneReq->connKey.connType])(&pOneReq->connKey, info->param, pOneReq); - if (code) { - pIter = taosHashIterate(pAppHbMgr->activeInfo, pIter); - continue; - } + code = (*clientHbMgr.reqHandle[pOneReq->connKey.connType])(&pOneReq->connKey, &pOneReq->clusterId, pOneReq); + if (code) { + pIter = taosHashIterate(pAppHbMgr->activeInfo, pIter); + continue; } //hbClearClientHbReq(pOneReq); @@ -569,23 +563,6 @@ SClientHbBatchReq *hbGatherAllInfo(SAppHbMgr *pAppHbMgr) { return pBatchReq; } -void hbClearReqInfo(SAppHbMgr *pAppHbMgr) { - void *pIter = taosHashIterate(pAppHbMgr->activeInfo, NULL); - while (pIter != NULL) { - SClientHbReq *pOneReq = pIter; - - tFreeReqKvHash(pOneReq->info); - taosHashClear(pOneReq->info); - - if (pOneReq->query) { - taosArrayDestroy(pOneReq->query->queryDesc); - taosMemoryFreeClear(pOneReq->query); - } - - pIter = taosHashIterate(pAppHbMgr->activeInfo, pIter); - } -} - void hbThreadFuncUnexpectedStopped(void) { atomic_store_8(&clientHbMgr.threadStop, 2); } @@ -715,14 +692,6 @@ SAppHbMgr *appHbMgrInit(SAppInstInfo *pAppInstInfo, char *key) { } taosHashSetFreeFp(pAppHbMgr->activeInfo, tFreeClientHbReq); - // init getInfoFunc - pAppHbMgr->connInfo = taosHashInit(64, hbKeyHashFunc, 1, HASH_ENTRY_LOCK); - - if (pAppHbMgr->connInfo == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; - taosMemoryFree(pAppHbMgr); - return NULL; - } taosThreadMutexLock(&clientHbMgr.lock); taosArrayPush(clientHbMgr.appHbMgrs, &pAppHbMgr); @@ -745,15 +714,6 @@ void appHbMgrCleanup(void) { taosHashCleanup(pTarget->activeInfo); pTarget->activeInfo = NULL; - pIter = taosHashIterate(pTarget->connInfo, NULL); - while (pIter != NULL) { - SHbConnInfo *info = pIter; - taosMemoryFree(info->param); - pIter = taosHashIterate(pTarget->connInfo, pIter); - } - taosHashCleanup(pTarget->connInfo); - pTarget->connInfo = NULL; - taosMemoryFree(pTarget->key); taosMemoryFree(pTarget); } @@ -791,7 +751,7 @@ void hbMgrCleanUp() { clientHbMgr.appHbMgrs = NULL; } -int hbRegisterConnImpl(SAppHbMgr *pAppHbMgr, SClientHbKey connKey, SHbConnInfo *info) { +int hbRegisterConnImpl(SAppHbMgr *pAppHbMgr, SClientHbKey connKey, int64_t clusterId) { // init hash in activeinfo void *data = taosHashGet(pAppHbMgr->activeInfo, &connKey, sizeof(SClientHbKey)); if (data != NULL) { @@ -799,17 +759,11 @@ int hbRegisterConnImpl(SAppHbMgr *pAppHbMgr, SClientHbKey connKey, SHbConnInfo * } SClientHbReq hbReq = {0}; hbReq.connKey = connKey; + hbReq.clusterId = clusterId; //hbReq.info = taosHashInit(64, hbKeyHashFunc, 1, HASH_ENTRY_LOCK); taosHashPut(pAppHbMgr->activeInfo, &connKey, sizeof(SClientHbKey), &hbReq, sizeof(SClientHbReq)); - // init hash - if (info != NULL) { - SClientHbReq *pReq = taosHashGet(pAppHbMgr->activeInfo, &connKey, sizeof(SClientHbKey)); - info->req = pReq; - taosHashPut(pAppHbMgr->connInfo, &connKey, sizeof(SClientHbKey), info, sizeof(SHbConnInfo)); - } - atomic_add_fetch_32(&pAppHbMgr->connKeyCnt, 1); return 0; } @@ -819,15 +773,10 @@ int hbRegisterConn(SAppHbMgr *pAppHbMgr, int64_t tscRefId, int64_t clusterId, in .tscRid = tscRefId, .connType = connType, }; - SHbConnInfo info = {0}; switch (connType) { case CONN_TYPE__QUERY: { - int64_t *pClusterId = taosMemoryMalloc(sizeof(int64_t)); - *pClusterId = clusterId; - - info.param = pClusterId; - return hbRegisterConnImpl(pAppHbMgr, connKey, &info); + return hbRegisterConnImpl(pAppHbMgr, connKey, clusterId); } case CONN_TYPE__TMQ: { return 0; @@ -844,26 +793,10 @@ void hbDeregisterConn(SAppHbMgr *pAppHbMgr, SClientHbKey connKey) { taosHashRemove(pAppHbMgr->activeInfo, &connKey, sizeof(SClientHbKey)); } - SHbConnInfo *info = taosHashGet(pAppHbMgr->connInfo, &connKey, sizeof(SClientHbKey)); - if (info) { - taosMemoryFree(info->param); - taosHashRemove(pAppHbMgr->connInfo, &connKey, sizeof(SClientHbKey)); - } - - if (NULL == pReq || NULL == info) { + if (NULL == pReq) { return; } atomic_sub_fetch_32(&pAppHbMgr->connKeyCnt, 1); } -int hbAddConnInfo(SAppHbMgr *pAppHbMgr, SClientHbKey connKey, void *key, void *value, int32_t keyLen, - int32_t valueLen) { - // find req by connection id - SClientHbReq *pReq = taosHashGet(pAppHbMgr->activeInfo, &connKey, sizeof(SClientHbKey)); - ASSERT(pReq != NULL); - - taosHashPut(pReq->info, key, keyLen, value, valueLen); - - return 0; -} diff --git a/source/client/src/clientMsgHandler.c b/source/client/src/clientMsgHandler.c index 1039d36362f72f643978b964ccd874db186fbc88..2d5199f18171364d8b8a80b0a0b3a76caace9d40 100644 --- a/source/client/src/clientMsgHandler.c +++ b/source/client/src/clientMsgHandler.c @@ -180,7 +180,7 @@ int32_t processUseDbRsp(void* param, const SDataBuf* pMsg, int32_t code) { taosMemoryFreeClear(output.dbVgroup); tscError("0x%" PRIx64 " failed to build use db output since %s", pRequest->requestId, terrstr()); - } else if (output.dbVgroup) { + } else if (output.dbVgroup && output.dbVgroup->vgHash) { struct SCatalog* pCatalog = NULL; int32_t code1 = catalogGetHandle(pRequest->pTscObj->pAppInfo->clusterId, &pCatalog); diff --git a/source/client/src/clientSml.c b/source/client/src/clientSml.c index c559ac58f8ae9888bbf9822b210ea414e0d0f869..5643af746dd29cb2c3e479266ae62bf452bf2971 100644 --- a/source/client/src/clientSml.c +++ b/source/client/src/clientSml.c @@ -2332,6 +2332,8 @@ static int32_t isSchemalessDb(SSmlHandle* info){ smlBuildInvalidDataMsg(&info->msgBuf, "catalogGetDBCfg error, code:", tstrerror(code)); return code; } + taosArrayDestroy(pInfo.pRetensions); + if (!pInfo.schemaless){ info->pRequest->code = TSDB_CODE_SML_INVALID_DB_CONF; smlBuildInvalidDataMsg(&info->msgBuf, "can not insert into schemaless db:", dbFname); diff --git a/source/libs/catalog/inc/catalogInt.h b/source/libs/catalog/inc/catalogInt.h index cebe69639045bb164da2e07121c0fe9dc9c5477c..9219a382e4865312fd8b546a0a9c3ea106fe76a2 100644 --- a/source/libs/catalog/inc/catalogInt.h +++ b/source/libs/catalog/inc/catalogInt.h @@ -52,6 +52,7 @@ enum { CTG_OP_UPDATE_VGROUP = 0, CTG_OP_UPDATE_TB_META, CTG_OP_DROP_DB_CACHE, + CTG_OP_DROP_DB_VGROUP, CTG_OP_DROP_STB_META, CTG_OP_DROP_TB_META, CTG_OP_UPDATE_USER, @@ -266,26 +267,32 @@ typedef struct SCtgUpdateTblMsg { STableMetaOutput* output; } SCtgUpdateTblMsg; -typedef struct SCtgRemoveDBMsg { +typedef struct SCtgDropDBMsg { SCatalog* pCtg; char dbFName[TSDB_DB_FNAME_LEN]; uint64_t dbId; -} SCtgRemoveDBMsg; +} SCtgDropDBMsg; -typedef struct SCtgRemoveStbMsg { +typedef struct SCtgDropDbVgroupMsg { + SCatalog* pCtg; + char dbFName[TSDB_DB_FNAME_LEN]; +} SCtgDropDbVgroupMsg; + + +typedef struct SCtgDropStbMetaMsg { SCatalog* pCtg; char dbFName[TSDB_DB_FNAME_LEN]; char stbName[TSDB_TABLE_NAME_LEN]; uint64_t dbId; uint64_t suid; -} SCtgRemoveStbMsg; +} SCtgDropStbMetaMsg; -typedef struct SCtgRemoveTblMsg { +typedef struct SCtgDropTblMetaMsg { SCatalog* pCtg; char dbFName[TSDB_DB_FNAME_LEN]; char tbName[TSDB_TABLE_NAME_LEN]; uint64_t dbId; -} SCtgRemoveTblMsg; +} SCtgDropTblMetaMsg; typedef struct SCtgUpdateUserMsg { SCatalog* pCtg; @@ -451,6 +458,7 @@ int32_t ctgGetTbMetaFromCache(CTG_PARAMS, SCtgTbMetaCtx* ctx, STableMeta** pTabl int32_t ctgOpUpdateVgroup(SCtgCacheOperation *action); int32_t ctgOpUpdateTbMeta(SCtgCacheOperation *action); int32_t ctgOpDropDbCache(SCtgCacheOperation *action); +int32_t ctgOpDropDbVgroup(SCtgCacheOperation *action); int32_t ctgOpDropStbMeta(SCtgCacheOperation *action); int32_t ctgOpDropTbMeta(SCtgCacheOperation *action); int32_t ctgOpUpdateUser(SCtgCacheOperation *action); @@ -464,6 +472,7 @@ int32_t ctgReadTbMetaFromCache(SCatalog* pCtg, SCtgTbMetaCtx* ctx, STableMeta** int32_t ctgReadTbVerFromCache(SCatalog *pCtg, const SName *pTableName, int32_t *sver, int32_t *tver, int32_t *tbType, uint64_t *suid, char *stbName); int32_t ctgChkAuthFromCache(SCatalog* pCtg, const char* user, const char* dbFName, AUTH_TYPE type, bool *inCache, bool *pass); int32_t ctgDropDbCacheEnqueue(SCatalog* pCtg, const char *dbFName, int64_t dbId); +int32_t ctgDropDbVgroupEnqueue(SCatalog* pCtg, const char *dbFName, bool syncReq); int32_t ctgDropStbMetaEnqueue(SCatalog* pCtg, const char *dbFName, int64_t dbId, const char *stbName, uint64_t suid, bool syncReq); int32_t ctgDropTbMetaEnqueue(SCatalog* pCtg, const char *dbFName, int64_t dbId, const char *tbName, bool syncReq); int32_t ctgUpdateVgroupEnqueue(SCatalog* pCtg, const char *dbFName, int64_t dbId, SDBVgInfo* dbInfo, bool syncReq); diff --git a/source/libs/catalog/src/ctgAsync.c b/source/libs/catalog/src/ctgAsync.c index ce94eace660ba3dedb2f8f8bebb30bd2fe982351..6998d8c778ad16af9bd8e30671315cb876109812 100644 --- a/source/libs/catalog/src/ctgAsync.c +++ b/source/libs/catalog/src/ctgAsync.c @@ -286,6 +286,9 @@ int32_t ctgInitJob(CTG_PARAMS, SCtgJob** job, uint64_t reqId, const SCatalogReq* int32_t taskIdx = 0; for (int32_t i = 0; i < dbVgNum; ++i) { char* dbFName = taosArrayGet(pReq->pDbVgroup, i); + if (pReq->forceUpdate) { + ctgDropDbVgroupEnqueue(pCtg, dbFName, true); + } CTG_ERR_JRET(ctgInitGetDbVgTask(pJob, taskIdx++, dbFName)); } @@ -301,6 +304,9 @@ int32_t ctgInitJob(CTG_PARAMS, SCtgJob** job, uint64_t reqId, const SCatalogReq* for (int32_t i = 0; i < tbMetaNum; ++i) { SName* name = taosArrayGet(pReq->pTableMeta, i); + if (pReq->forceUpdate) { + catalogRemoveTableMeta(pCtg, name); + } CTG_ERR_JRET(ctgInitGetTbMetaTask(pJob, taskIdx++, name)); } diff --git a/source/libs/catalog/src/ctgCache.c b/source/libs/catalog/src/ctgCache.c index 0f1344c3432b2540c2daa33de33c2f8c570658f0..8332c7b068225fe63469973063e3ff703d9ccfe0 100644 --- a/source/libs/catalog/src/ctgCache.c +++ b/source/libs/catalog/src/ctgCache.c @@ -35,6 +35,11 @@ SCtgOperation gCtgCacheOperation[CTG_OP_MAX] = { "drop DB", ctgOpDropDbCache }, + { + CTG_OP_DROP_DB_VGROUP, + "drop DBVgroup", + ctgOpDropDbVgroup + }, { CTG_OP_DROP_STB_META, "drop stbMeta", @@ -563,9 +568,9 @@ int32_t ctgEnqueue(SCatalog* pCtg, SCtgCacheOperation *operation) { int32_t ctgDropDbCacheEnqueue(SCatalog* pCtg, const char *dbFName, int64_t dbId) { int32_t code = 0; SCtgCacheOperation action= {.opId = CTG_OP_DROP_DB_CACHE}; - SCtgRemoveDBMsg *msg = taosMemoryMalloc(sizeof(SCtgRemoveDBMsg)); + SCtgDropDBMsg *msg = taosMemoryMalloc(sizeof(SCtgDropDBMsg)); if (NULL == msg) { - ctgError("malloc %d failed", (int32_t)sizeof(SCtgRemoveDBMsg)); + ctgError("malloc %d failed", (int32_t)sizeof(SCtgDropDBMsg)); CTG_ERR_RET(TSDB_CODE_CTG_MEM_ERROR); } @@ -590,13 +595,43 @@ _return: CTG_RET(code); } +int32_t ctgDropDbVgroupEnqueue(SCatalog* pCtg, const char *dbFName, bool syncOp) { + int32_t code = 0; + SCtgCacheOperation action= {.opId = CTG_OP_DROP_DB_VGROUP, .syncOp = syncOp}; + SCtgDropDbVgroupMsg *msg = taosMemoryMalloc(sizeof(SCtgDropDbVgroupMsg)); + if (NULL == msg) { + ctgError("malloc %d failed", (int32_t)sizeof(SCtgDropDbVgroupMsg)); + CTG_ERR_RET(TSDB_CODE_CTG_MEM_ERROR); + } + + char *p = strchr(dbFName, '.'); + if (p && CTG_IS_SYS_DBNAME(p + 1)) { + dbFName = p + 1; + } + + msg->pCtg = pCtg; + strncpy(msg->dbFName, dbFName, sizeof(msg->dbFName)); + + action.data = msg; + + CTG_ERR_JRET(ctgEnqueue(pCtg, &action)); + + return TSDB_CODE_SUCCESS; + +_return: + + taosMemoryFreeClear(action.data); + CTG_RET(code); +} + + int32_t ctgDropStbMetaEnqueue(SCatalog* pCtg, const char *dbFName, int64_t dbId, const char *stbName, uint64_t suid, bool syncOp) { int32_t code = 0; SCtgCacheOperation action= {.opId = CTG_OP_DROP_STB_META, .syncOp = syncOp}; - SCtgRemoveStbMsg *msg = taosMemoryMalloc(sizeof(SCtgRemoveStbMsg)); + SCtgDropStbMetaMsg *msg = taosMemoryMalloc(sizeof(SCtgDropStbMetaMsg)); if (NULL == msg) { - ctgError("malloc %d failed", (int32_t)sizeof(SCtgRemoveStbMsg)); + ctgError("malloc %d failed", (int32_t)sizeof(SCtgDropStbMetaMsg)); CTG_ERR_RET(TSDB_CODE_CTG_MEM_ERROR); } @@ -623,9 +658,9 @@ _return: int32_t ctgDropTbMetaEnqueue(SCatalog* pCtg, const char *dbFName, int64_t dbId, const char *tbName, bool syncOp) { int32_t code = 0; SCtgCacheOperation action= {.opId = CTG_OP_DROP_TB_META, .syncOp = syncOp}; - SCtgRemoveTblMsg *msg = taosMemoryMalloc(sizeof(SCtgRemoveTblMsg)); + SCtgDropTblMetaMsg *msg = taosMemoryMalloc(sizeof(SCtgDropTblMetaMsg)); if (NULL == msg) { - ctgError("malloc %d failed", (int32_t)sizeof(SCtgRemoveTblMsg)); + ctgError("malloc %d failed", (int32_t)sizeof(SCtgDropTblMetaMsg)); CTG_ERR_RET(TSDB_CODE_CTG_MEM_ERROR); } @@ -1281,7 +1316,7 @@ _return: int32_t ctgOpDropDbCache(SCtgCacheOperation *operation) { int32_t code = 0; - SCtgRemoveDBMsg *msg = operation->data; + SCtgDropDBMsg *msg = operation->data; SCatalog* pCtg = msg->pCtg; SCtgDBCache *dbCache = NULL; @@ -1304,6 +1339,33 @@ _return: CTG_RET(code); } +int32_t ctgOpDropDbVgroup(SCtgCacheOperation *operation) { + int32_t code = 0; + SCtgDropDbVgroupMsg *msg = operation->data; + SCatalog* pCtg = msg->pCtg; + + SCtgDBCache *dbCache = NULL; + ctgGetDBCache(msg->pCtg, msg->dbFName, &dbCache); + if (NULL == dbCache) { + goto _return; + } + + CTG_ERR_RET(ctgWAcquireVgInfo(pCtg, dbCache)); + + ctgFreeVgInfo(dbCache->vgInfo); + dbCache->vgInfo = NULL; + + ctgDebug("db vgInfo removed, dbFName:%s", msg->dbFName); + + ctgWReleaseVgInfo(dbCache); + +_return: + + taosMemoryFreeClear(msg); + + CTG_RET(code); +} + int32_t ctgOpUpdateTbMeta(SCtgCacheOperation *operation) { int32_t code = 0; @@ -1353,7 +1415,7 @@ _return: int32_t ctgOpDropStbMeta(SCtgCacheOperation *operation) { int32_t code = 0; - SCtgRemoveStbMsg *msg = operation->data; + SCtgDropStbMetaMsg *msg = operation->data; SCatalog* pCtg = msg->pCtg; SCtgDBCache *dbCache = NULL; @@ -1399,7 +1461,7 @@ _return: int32_t ctgOpDropTbMeta(SCtgCacheOperation *operation) { int32_t code = 0; - SCtgRemoveTblMsg *msg = operation->data; + SCtgDropTblMetaMsg *msg = operation->data; SCatalog* pCtg = msg->pCtg; SCtgDBCache *dbCache = NULL; diff --git a/source/libs/catalog/src/ctgDbg.c b/source/libs/catalog/src/ctgDbg.c index fdab50db0f65fd67d16d6f5b134f847dc0f882bc..42baa530ce69a7b5c37ee48a8b51cd4969f23b58 100644 --- a/source/libs/catalog/src/ctgDbg.c +++ b/source/libs/catalog/src/ctgDbg.c @@ -132,7 +132,22 @@ void ctgdUserCallback(SMetaData* pResult, void* param, int32_t code) { } } -int32_t ctgdLaunchAsyncCall(SCatalog* pCtg, void *pTrans, const SEpSet* pMgmtEps, uint64_t reqId) { + +/* +prepare SQL: +create database db1; +use db1; +create stable st1 (ts timestamp, f1 int) tags(t1 int); +create table tb1 using st1 tags(1); +insert into tb1 values (now, 1); +create qnode on dnode 1; +create user user1 pass "abc"; +create database db2; +grant write on db2.* to user1; +create function udf1 as '/tmp/libudf1.so' outputtype int; +create aggregate function udf2 as '/tmp/libudf2.so' outputtype int; +*/ +int32_t ctgdLaunchAsyncCall(SCatalog* pCtg, void *pTrans, const SEpSet* pMgmtEps, uint64_t reqId, bool forceUpdate) { int32_t code = 0; SCatalogReq req = {0}; req.pTableMeta = taosArrayInit(2, sizeof(SName)); @@ -144,6 +159,7 @@ int32_t ctgdLaunchAsyncCall(SCatalog* pCtg, void *pTrans, const SEpSet* pMgmtEps req.pIndex = NULL;//taosArrayInit(2, TSDB_INDEX_FNAME_LEN); req.pUser = taosArrayInit(2, sizeof(SUserAuthInfo)); req.qNodeRequired = true; + req.forceUpdate = forceUpdate; SName name = {0}; char dbFName[TSDB_DB_FNAME_LEN] = {0}; diff --git a/source/libs/qworker/inc/qwInt.h b/source/libs/qworker/inc/qwInt.h index 3804c114cb39f53fd5f2206f36537394073e587d..082db6428f79575cf29a635a1965f214893d22a5 100644 --- a/source/libs/qworker/inc/qwInt.h +++ b/source/libs/qworker/inc/qwInt.h @@ -34,6 +34,7 @@ extern "C" { #define QW_DEFAULT_SCH_TASK_NUMBER 10000 #define QW_DEFAULT_SHORT_RUN_TIMES 2 #define QW_DEFAULT_HEARTBEAT_MSEC 5000 +#define QW_SCH_TIMEOUT_MSEC 180000 enum { QW_PHASE_PRE_QUERY = 1, @@ -137,7 +138,7 @@ typedef struct SQWTaskCtx { } SQWTaskCtx; typedef struct SQWSchStatus { - int32_t lastAccessTs; // timestamp in second + int64_t hbBrokenTs; // timestamp in msecond SRWLatch hbConnLock; SRpcHandleInfo hbConnInfo; SQueryNodeEpId hbEpId; @@ -354,6 +355,8 @@ int32_t qwOpenRef(void); void qwSetHbParam(int64_t refId, SQWHbParam **pParam); int32_t qwUpdateTimeInQueue(SQWorker *mgmt, int64_t ts, EQueueType type); int64_t qwGetTimeInQueue(SQWorker *mgmt, EQueueType type); +void qwClearExpiredSch(SArray* pExpiredSch); +int32_t qwAcquireScheduler(SQWorker *mgmt, uint64_t sId, int32_t rwType, SQWSchStatus **sch); void qwDbgDumpMgmtInfo(SQWorker *mgmt); int32_t qwDbgValidateStatus(QW_FPARAMS_DEF, int8_t oriStatus, int8_t newStatus, bool *ignore); diff --git a/source/libs/qworker/src/qwUtil.c b/source/libs/qworker/src/qwUtil.c index 3d0204e355bd228836a2729cc9e52e74981c4e0f..8bfb80f0616e223f96eafcd34e8d89768ffb98ca 100644 --- a/source/libs/qworker/src/qwUtil.c +++ b/source/libs/qworker/src/qwUtil.c @@ -535,3 +535,9 @@ int64_t qwGetTimeInQueue(SQWorker *mgmt, EQueueType type) { return -1; } + +void qwClearExpiredSch(SArray* pExpiredSch) { + +} + + diff --git a/source/libs/qworker/src/qworker.c b/source/libs/qworker/src/qworker.c index 8c0366fa0be1150b595fb9bec71035e7f2fe58ed..44a8fdf7f4e29c6ad8ee0e8dc5b26612bc1e665f 100644 --- a/source/libs/qworker/src/qworker.c +++ b/source/libs/qworker/src/qworker.c @@ -21,10 +21,12 @@ int32_t qwProcessHbLinkBroken(SQWorker *mgmt, SQWMsg *qwMsg, SSchedulerHbReq *re SSchedulerHbRsp rsp = {0}; SQWSchStatus *sch = NULL; - QW_ERR_RET(qwAcquireAddScheduler(mgmt, req->sId, QW_READ, &sch)); + QW_ERR_RET(qwAcquireScheduler(mgmt, req->sId, QW_READ, &sch)); QW_LOCK(QW_WRITE, &sch->hbConnLock); + sch->hbBrokenTs = taosGetTimestampMs(); + if (qwMsg->connInfo.handle == sch->hbConnInfo.handle) { tmsgReleaseHandle(&sch->hbConnInfo, TAOS_CONN_SERVER); sch->hbConnInfo.handle = NULL; @@ -794,6 +796,7 @@ void qwProcessHbTimerEvent(void *param, void *tmrId) { SQWSchStatus *sch = NULL; int32_t taskNum = 0; SQWHbInfo *rspList = NULL; + SArray *pExpiredSch = NULL; int32_t code = 0; qwDbgDumpMgmtInfo(mgmt); @@ -809,8 +812,11 @@ void qwProcessHbTimerEvent(void *param, void *tmrId) { } rspList = taosMemoryCalloc(schNum, sizeof(SQWHbInfo)); - if (NULL == rspList) { + pExpiredSch = taosArrayInit(schNum, sizeof(uint64_t)); + if (NULL == rspList || NULL == pExpiredSch) { QW_UNLOCK(QW_READ, &mgmt->schLock); + taosMemoryFree(rspList); + taosArrayDestroy(pExpiredSch); QW_ELOG("calloc %d SQWHbInfo failed", schNum); taosTmrReset(qwProcessHbTimerEvent, QW_DEFAULT_HEARTBEAT_MSEC, param, mgmt->timer, &mgmt->hbTimer); qwRelease(refId); @@ -820,6 +826,7 @@ void qwProcessHbTimerEvent(void *param, void *tmrId) { void *key = NULL; size_t keyLen = 0; int32_t i = 0; + int64_t currentMs = taosGetTimestampMs(); void *pIter = taosHashIterate(mgmt->schHash, NULL); while (pIter) { @@ -827,6 +834,11 @@ void qwProcessHbTimerEvent(void *param, void *tmrId) { if (NULL == sch->hbConnInfo.handle) { uint64_t *sId = taosHashGetKey(pIter, NULL); QW_TLOG("cancel send hb to sch %" PRIx64 " cause of no connection handle", *sId); + + if (sch->hbBrokenTs > 0 && ((currentMs - sch->hbBrokenTs) > QW_SCH_TIMEOUT_MSEC) && taosHashGetSize(sch->tasksHash) <= 0) { + taosArrayPush(pExpiredSch, sId); + } + pIter = taosHashIterate(mgmt->schHash, pIter); continue; } @@ -852,7 +864,12 @@ _return: tFreeSSchedulerHbRsp(&rspList[j].rsp); } + if (taosArrayGetSize(pExpiredSch) > 0) { + qwClearExpiredSch(pExpiredSch); + } + taosMemoryFreeClear(rspList); + taosArrayDestroy(pExpiredSch); taosTmrReset(qwProcessHbTimerEvent, QW_DEFAULT_HEARTBEAT_MSEC, param, mgmt->timer, &mgmt->hbTimer); qwRelease(refId); diff --git a/source/libs/scheduler/inc/schedulerInt.h b/source/libs/scheduler/inc/schedulerInt.h index c8fee1e5323e9b55cf1f28e1605ab49797813886..d961780ae515c7c1aefbe4879957ce7d7607b3fa 100644 --- a/source/libs/scheduler/inc/schedulerInt.h +++ b/source/libs/scheduler/inc/schedulerInt.h @@ -102,11 +102,11 @@ typedef struct SSchedulerMgmt { uint64_t taskId; // sequential taksId uint64_t sId; // schedulerId SSchedulerCfg cfg; - SRWLatch lock; bool exit; int32_t jobRef; int32_t jobNum; SSchStat stat; + SRWLatch hbLock; SHashObj *hbConnections; } SSchedulerMgmt; @@ -320,6 +320,8 @@ extern SSchedulerMgmt schMgmt; #define SCH_UNLOCK(type, _lock) (SCH_READ == (type) ? taosRUnLockLatch(_lock) : taosWUnLockLatch(_lock)) +void schDeregisterTaskHb(SSchJob *pJob, SSchTask *pTask); +void schCleanClusterHb(void* pTrans); int32_t schLaunchTask(SSchJob *job, SSchTask *task); int32_t schBuildAndSendMsg(SSchJob *job, SSchTask *task, SQueryNodeAddr *addr, int32_t msgType); SSchJob *schAcquireJob(int64_t refId); diff --git a/source/libs/scheduler/src/schJob.c b/source/libs/scheduler/src/schJob.c index 3e23c395c9b416e1b753fbb7305d47b0b59afd55..ca90d2fe34a74a61652d5fe61054b34b5b4412b2 100644 --- a/source/libs/scheduler/src/schJob.c +++ b/source/libs/scheduler/src/schJob.c @@ -126,30 +126,6 @@ _return: SCH_RET(code); } -void schDeregisterTaskHb(SSchJob *pJob, SSchTask *pTask) { - if (!pTask->registerdHb) { - return; - } - - SQueryNodeAddr *addr = taosArrayGet(pTask->candidateAddrs, pTask->candidateIdx); - SQueryNodeEpId epId = {0}; - - epId.nodeId = addr->nodeId; - - SEp* pEp = SCH_GET_CUR_EP(addr); - strcpy(epId.ep.fqdn, pEp->fqdn); - epId.ep.port = pEp->port; - - SSchHbTrans *hb = taosHashGet(schMgmt.hbConnections, &epId, sizeof(SQueryNodeEpId)); - if (NULL == hb) { - SCH_TASK_ELOG("nodeId %d fqdn %s port %d not in hb connections", epId.nodeId, epId.ep.fqdn, epId.ep.port); - return; - } - - atomic_sub_fetch_64(&hb->taskNum, 1); - - pTask->registerdHb = false; -} void schFreeTask(SSchJob *pJob, SSchTask *pTask) { schDeregisterTaskHb(pJob, pTask); @@ -377,15 +353,21 @@ int32_t schDropTaskExecNode(SSchJob *pJob, SSchTask *pTask, void *handle, int32_ return TSDB_CODE_SUCCESS; } - taosHashRemove(pTask->execNodes, &execIdx, sizeof(execIdx)); + if (taosHashRemove(pTask->execNodes, &execIdx, sizeof(execIdx))) { + SCH_TASK_ELOG("fail to remove execIdx %d from execNodeList", execIdx); + } else { + SCH_TASK_DLOG("execIdx %d removed from execNodeList", execIdx); + } + if (execIdx != pTask->execIdx) { // ignore it + SCH_TASK_DLOG("execIdx %d is not current execIdx %d", execIdx, pTask->execIdx); SCH_RET(TSDB_CODE_SCH_IGNORE_ERROR); } return TSDB_CODE_SUCCESS; } -int32_t schUpdateTaskExecNode(SSchTask *pTask, void *handle, int32_t execIdx) { +int32_t schUpdateTaskExecNode(SSchJob *pJob, SSchTask *pTask, void *handle, int32_t execIdx) { if (taosHashGetSize(pTask->execNodes) <= 0) { return TSDB_CODE_SUCCESS; } @@ -393,6 +375,8 @@ int32_t schUpdateTaskExecNode(SSchTask *pTask, void *handle, int32_t execIdx) { SSchNodeInfo *nodeInfo = taosHashGet(pTask->execNodes, &execIdx, sizeof(execIdx)); nodeInfo->handle = handle; + SCH_TASK_DLOG("handle updated to %p for execIdx %d", handle, execIdx); + return TSDB_CODE_SUCCESS; } @@ -403,7 +387,7 @@ int32_t schUpdateTaskHandle(SSchJob *pJob, SSchTask *pTask, bool dropExecNode, v SCH_SET_TASK_HANDLE(pTask, handle); - schUpdateTaskExecNode(pTask, handle, execIdx); + schUpdateTaskExecNode(pJob, pTask, handle, execIdx); return TSDB_CODE_SUCCESS; } @@ -551,6 +535,8 @@ int32_t schSetAddrsFromNodeList(SSchJob *pJob, SSchTask *pTask) { SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); } + SCH_TASK_DLOG("set %dth condidate addr, id %d, fqdn:%s, port:%d", i, naddr->nodeId, SCH_GET_CUR_EP(naddr)->fqdn, SCH_GET_CUR_EP(naddr)->port); + ++addNum; } } @@ -1110,6 +1096,7 @@ int32_t schProcessOnTaskSuccess(SSchJob *pJob, SSchTask *pTask) { SCH_UNLOCK(SCH_WRITE, &parent->lock); if (SCH_TASK_READY_FOR_LAUNCH(readyNum, parent)) { + SCH_TASK_DLOG("all %d children task done, start to launch parent task %" PRIx64, readyNum, parent->taskId); SCH_ERR_RET(schLaunchTask(pJob, parent)); } } @@ -1186,7 +1173,7 @@ void schDropTaskOnExecNode(SSchJob *pJob, SSchTask *pTask) { nodeInfo = taosHashIterate(pTask->execNodes, nodeInfo); } - SCH_TASK_DLOG("task has %d exec address", size); + SCH_TASK_DLOG("task has been dropped on %d exec nodes", size); } @@ -1196,7 +1183,8 @@ int32_t schRescheduleTask(SSchJob *pJob, SSchTask *pTask) { } SCH_LOCK_TASK(pTask); - if (JOB_TASK_STATUS_EXECUTING == pTask->status && pJob->fetchTask != pTask) { + if (JOB_TASK_STATUS_EXECUTING == pTask->status && pJob->fetchTask != pTask && taosArrayGetSize(pTask->candidateAddrs) > 1) { + SCH_TASK_DLOG("task execIdx %d will be rescheduled now", pTask->execIdx); schDropTaskOnExecNode(pJob, pTask); taosHashClear(pTask->execNodes); schProcessOnTaskFailure(pJob, pTask, TSDB_CODE_SCH_TIMEOUT_ERROR); @@ -1306,9 +1294,10 @@ int32_t schLaunchTaskImpl(SSchJob *pJob, SSchTask *pTask) { int32_t code = 0; atomic_add_fetch_32(&pTask->level->taskLaunchedNum, 1); - pTask->execIdx++; + SCH_TASK_DLOG("start to launch task's %dth exec", pTask->execIdx); + SCH_LOG_TASK_START_TS(pTask); if (schJobNeedToStop(pJob, &status)) { @@ -1471,9 +1460,10 @@ void schFreeJobImpl(void *job) { qDebug("QID:0x%" PRIx64 " job freed, refId:%" PRIx64 ", pointer:%p", queryId, refId, pJob); - atomic_sub_fetch_32(&schMgmt.jobNum, 1); - - schCloseJobRef(); + int32_t jobNum = atomic_sub_fetch_32(&schMgmt.jobNum, 1); + if (jobNum == 0) { + schCloseJobRef(); + } } int32_t schExecJobImpl(void *pTrans, SArray *pNodeList, SQueryPlan *pDag, int64_t *job, const char *sql, diff --git a/source/libs/scheduler/src/schRemote.c b/source/libs/scheduler/src/schRemote.c index b336ce8c763632a576d3e8879739fd7e1c96f092..bf51d8d631bfff7501ba40fea396b570e14c5c90 100644 --- a/source/libs/scheduler/src/schRemote.c +++ b/source/libs/scheduler/src/schRemote.c @@ -648,31 +648,6 @@ _return: SCH_RET(code); } -int32_t schRegisterHbConnection(SSchJob *pJob, SSchTask *pTask, SQueryNodeEpId *epId, bool *exist) { - int32_t code = 0; - SSchHbTrans hb = {0}; - - hb.trans.pTrans = pJob->pTrans; - - SCH_ERR_RET(schMakeHbRpcCtx(pJob, pTask, &hb.rpcCtx)); - - code = taosHashPut(schMgmt.hbConnections, epId, sizeof(SQueryNodeEpId), &hb, sizeof(SSchHbTrans)); - if (code) { - schFreeRpcCtx(&hb.rpcCtx); - - if (HASH_NODE_EXIST(code)) { - *exist = true; - return TSDB_CODE_SUCCESS; - } - - qError("taosHashPut hb trans failed, nodeId:%d, fqdn:%s, port:%d", epId->nodeId, epId->ep.fqdn, epId->ep.port); - SCH_ERR_RET(code); - } - - return TSDB_CODE_SUCCESS; -} - - int32_t schBuildAndSendHbMsg(SQueryNodeEpId *nodeEpId, SArray* taskAction) { SSchedulerHbReq req = {0}; int32_t code = 0; @@ -684,17 +659,20 @@ int32_t schBuildAndSendHbMsg(SQueryNodeEpId *nodeEpId, SArray* taskAction) { req.sId = schMgmt.sId; memcpy(&req.epId, nodeEpId, sizeof(SQueryNodeEpId)); + SCH_LOCK(SCH_READ, &schMgmt.hbLock); SSchHbTrans *hb = taosHashGet(schMgmt.hbConnections, nodeEpId, sizeof(SQueryNodeEpId)); if (NULL == hb) { - qError("taosHashGet hb connection failed, nodeId:%d, fqdn:%s, port:%d", nodeEpId->nodeId, nodeEpId->ep.fqdn, + SCH_UNLOCK(SCH_READ, &schMgmt.hbLock); + qError("hb connection no longer exist, nodeId:%d, fqdn:%s, port:%d", nodeEpId->nodeId, nodeEpId->ep.fqdn, nodeEpId->ep.port); - SCH_ERR_RET(code); + return TSDB_CODE_SUCCESS; } SCH_LOCK(SCH_WRITE, &hb->lock); code = schCloneHbRpcCtx(&hb->rpcCtx, &rpcCtx); memcpy(&trans, &hb->trans, sizeof(trans)); SCH_UNLOCK(SCH_WRITE, &hb->lock); + SCH_UNLOCK(SCH_READ, &schMgmt.hbLock); SCH_ERR_RET(code); @@ -764,60 +742,6 @@ _return: SCH_RET(code); } - -int32_t schEnsureHbConnection(SSchJob *pJob, SSchTask *pTask) { - SQueryNodeAddr *addr = taosArrayGet(pTask->candidateAddrs, pTask->candidateIdx); - SQueryNodeEpId epId = {0}; - - epId.nodeId = addr->nodeId; - - SEp* pEp = SCH_GET_CUR_EP(addr); - strcpy(epId.ep.fqdn, pEp->fqdn); - epId.ep.port = pEp->port; - - SSchHbTrans *hb = NULL; - while (true) { - hb = taosHashGet(schMgmt.hbConnections, &epId, sizeof(SQueryNodeEpId)); - if (NULL == hb) { - bool exist = false; - SCH_ERR_RET(schRegisterHbConnection(pJob, pTask, &epId, &exist)); - if (!exist) { - SCH_ERR_RET(schBuildAndSendHbMsg(&epId, NULL)); - } - - continue; - } - - break; - } - - atomic_add_fetch_64(&hb->taskNum, 1); - - pTask->registerdHb = true; - - return TSDB_CODE_SUCCESS; -} - -int32_t schUpdateHbConnection(SQueryNodeEpId *epId, SSchTrans *trans) { - int32_t code = 0; - SSchHbTrans *hb = NULL; - - hb = taosHashGet(schMgmt.hbConnections, epId, sizeof(SQueryNodeEpId)); - if (NULL == hb) { - qError("taosHashGet hb connection failed, nodeId:%d, fqdn:%s, port:%d", epId->nodeId, epId->ep.fqdn, epId->ep.port); - SCH_ERR_RET(TSDB_CODE_QRY_APP_ERROR); - } - - SCH_LOCK(SCH_WRITE, &hb->lock); - memcpy(&hb->trans, trans, sizeof(*trans)); - SCH_UNLOCK(SCH_WRITE, &hb->lock); - - qDebug("hb connection updated, sId:%" PRIx64 ", nodeId:%d, fqdn:%s, port:%d, pTrans:%p, pHandle:%p", schMgmt.sId, - epId->nodeId, epId->ep.fqdn, epId->ep.port, trans->pTrans, trans->pHandle); - - return TSDB_CODE_SUCCESS; -} - int32_t schHandleHbCallback(void *param, const SDataBuf *pMsg, int32_t code) { SSchedulerHbRsp rsp = {0}; SSchTaskCallbackParam *pParam = (SSchTaskCallbackParam *)param; @@ -1037,6 +961,7 @@ int32_t schBuildAndSendMsg(SSchJob *pJob, SSchTask *pTask, SQueryNodeAddr *addr, if (NULL == addr) { addr = taosArrayGet(pTask->candidateAddrs, pTask->candidateIdx); isCandidateAddr = true; + SCH_TASK_DLOG("target candidateIdx %d", pTask->candidateIdx); } SEpSet epSet = addr->epSet; diff --git a/source/libs/scheduler/src/schUtil.c b/source/libs/scheduler/src/schUtil.c index 81c95ea976e0c685fa1585df6dbb42bed75fd0c8..18398802dbd2ed0c251bd93a1fcafc56363d797e 100644 --- a/source/libs/scheduler/src/schUtil.c +++ b/source/libs/scheduler/src/schUtil.c @@ -21,17 +21,189 @@ #include "tref.h" #include "trpc.h" + +void schCleanClusterHb(void* pTrans) { + SCH_LOCK(SCH_WRITE, &schMgmt.hbLock); + + SSchHbTrans *hb = taosHashIterate(schMgmt.hbConnections, NULL); + while (hb) { + if (hb->trans.pTrans == pTrans) { + SQueryNodeEpId* pEpId = taosHashGetKey(hb, NULL); + rpcReleaseHandle(hb->trans.pHandle, TAOS_CONN_CLIENT); + taosHashRemove(schMgmt.hbConnections, pEpId, sizeof(SQueryNodeEpId)); + } + + hb = taosHashIterate(schMgmt.hbConnections, hb); + } + + SCH_UNLOCK(SCH_WRITE, &schMgmt.hbLock); +} + +int32_t schRemoveHbConnection(SSchJob *pJob, SSchTask *pTask, SQueryNodeEpId *epId) { + return TSDB_CODE_SUCCESS; // TODO ENABLE IT WHEN RPC IS READY + + int32_t code = 0; + + SCH_LOCK(SCH_WRITE, &schMgmt.hbLock); + SSchHbTrans *hb = taosHashGet(schMgmt.hbConnections, epId, sizeof(SQueryNodeEpId)); + if (NULL == hb) { + SCH_UNLOCK(SCH_WRITE, &schMgmt.hbLock); + SCH_TASK_ELOG("nodeId %d fqdn %s port %d not in hb connections", epId->nodeId, epId->ep.fqdn, epId->ep.port); + return TSDB_CODE_SUCCESS; + } + + int64_t taskNum = atomic_load_64(&hb->taskNum); + if (taskNum <= 0) { + rpcReleaseHandle(hb->trans.pHandle, TAOS_CONN_CLIENT); + taosHashRemove(schMgmt.hbConnections, epId, sizeof(SQueryNodeEpId)); + } + SCH_UNLOCK(SCH_WRITE, &schMgmt.hbLock); + + return TSDB_CODE_SUCCESS; +} + + +int32_t schAddHbConnection(SSchJob *pJob, SSchTask *pTask, SQueryNodeEpId *epId, bool *exist) { + int32_t code = 0; + SSchHbTrans hb = {0}; + + hb.trans.pTrans = pJob->pTrans; + hb.taskNum = 1; + + SCH_ERR_RET(schMakeHbRpcCtx(pJob, pTask, &hb.rpcCtx)); + + SCH_LOCK(SCH_WRITE, &schMgmt.hbLock); + code = taosHashPut(schMgmt.hbConnections, epId, sizeof(SQueryNodeEpId), &hb, sizeof(SSchHbTrans)); + if (code) { + SCH_UNLOCK(SCH_WRITE, &schMgmt.hbLock); + schFreeRpcCtx(&hb.rpcCtx); + + if (HASH_NODE_EXIST(code)) { + *exist = true; + return TSDB_CODE_SUCCESS; + } + + qError("taosHashPut hb trans failed, nodeId:%d, fqdn:%s, port:%d", epId->nodeId, epId->ep.fqdn, epId->ep.port); + SCH_ERR_RET(code); + } + + SCH_UNLOCK(SCH_WRITE, &schMgmt.hbLock); + + return TSDB_CODE_SUCCESS; +} + +int32_t schRegisterHbConnection(SSchJob *pJob, SSchTask *pTask, SQueryNodeEpId *pEpId) { + SSchHbTrans *hb = NULL; + + while (true) { + SCH_LOCK(SCH_READ, &schMgmt.hbLock); + hb = taosHashGet(schMgmt.hbConnections, pEpId, sizeof(SQueryNodeEpId)); + if (NULL == hb) { + bool exist = false; + SCH_UNLOCK(SCH_READ, &schMgmt.hbLock); + SCH_ERR_RET(schAddHbConnection(pJob, pTask, pEpId, &exist)); + if (!exist) { + SCH_RET(schBuildAndSendHbMsg(pEpId, NULL)); + } + + continue; + } + + break; + } + + atomic_add_fetch_64(&hb->taskNum, 1); + + SCH_UNLOCK(SCH_READ, &schMgmt.hbLock); + + return TSDB_CODE_SUCCESS; +} + +void schDeregisterTaskHb(SSchJob *pJob, SSchTask *pTask) { + if (!pTask->registerdHb) { + return; + } + + SQueryNodeAddr *addr = taosArrayGet(pTask->candidateAddrs, pTask->candidateIdx); + SQueryNodeEpId epId = {0}; + + epId.nodeId = addr->nodeId; + + SEp* pEp = SCH_GET_CUR_EP(addr); + strcpy(epId.ep.fqdn, pEp->fqdn); + epId.ep.port = pEp->port; + + SCH_LOCK(SCH_READ, &schMgmt.hbLock); + SSchHbTrans *hb = taosHashGet(schMgmt.hbConnections, &epId, sizeof(SQueryNodeEpId)); + if (NULL == hb) { + SCH_UNLOCK(SCH_READ, &schMgmt.hbLock); + SCH_TASK_WLOG("nodeId %d fqdn %s port %d not in hb connections", epId.nodeId, epId.ep.fqdn, epId.ep.port); + return; + } + + int64_t taskNum = atomic_sub_fetch_64(&hb->taskNum, 1); + if (0 == taskNum) { + SCH_UNLOCK(SCH_READ, &schMgmt.hbLock); + schRemoveHbConnection(pJob, pTask, &epId); + } else { + SCH_UNLOCK(SCH_READ, &schMgmt.hbLock); + } + + pTask->registerdHb = false; +} + + + +int32_t schEnsureHbConnection(SSchJob *pJob, SSchTask *pTask) { + SQueryNodeAddr *addr = taosArrayGet(pTask->candidateAddrs, pTask->candidateIdx); + SQueryNodeEpId epId = {0}; + + epId.nodeId = addr->nodeId; + + SEp* pEp = SCH_GET_CUR_EP(addr); + strcpy(epId.ep.fqdn, pEp->fqdn); + epId.ep.port = pEp->port; + + SCH_ERR_RET(schRegisterHbConnection(pJob, pTask, &epId)); + + pTask->registerdHb = true; + + return TSDB_CODE_SUCCESS; +} + +int32_t schUpdateHbConnection(SQueryNodeEpId *epId, SSchTrans *trans) { + int32_t code = 0; + SSchHbTrans *hb = NULL; + + SCH_LOCK(SCH_READ, &schMgmt.hbLock); + hb = taosHashGet(schMgmt.hbConnections, epId, sizeof(SQueryNodeEpId)); + if (NULL == hb) { + SCH_UNLOCK(SCH_READ, &schMgmt.hbLock); + qError("taosHashGet hb connection failed, nodeId:%d, fqdn:%s, port:%d", epId->nodeId, epId->ep.fqdn, epId->ep.port); + SCH_ERR_RET(TSDB_CODE_QRY_APP_ERROR); + } + + SCH_LOCK(SCH_WRITE, &hb->lock); + memcpy(&hb->trans, trans, sizeof(*trans)); + SCH_UNLOCK(SCH_WRITE, &hb->lock); + SCH_UNLOCK(SCH_READ, &schMgmt.hbLock); + + qDebug("hb connection updated, sId:%" PRIx64 ", nodeId:%d, fqdn:%s, port:%d, pTrans:%p, pHandle:%p", schMgmt.sId, + epId->nodeId, epId->ep.fqdn, epId->ep.port, trans->pTrans, trans->pHandle); + + return TSDB_CODE_SUCCESS; +} + + void schCloseJobRef(void) { if (!atomic_load_8((int8_t *)&schMgmt.exit)) { return; } - SCH_LOCK(SCH_WRITE, &schMgmt.lock); - if (atomic_load_32(&schMgmt.jobNum) <= 0 && schMgmt.jobRef >= 0) { + if (schMgmt.jobRef >= 0) { taosCloseRef(schMgmt.jobRef); schMgmt.jobRef = -1; } - SCH_UNLOCK(SCH_WRITE, &schMgmt.lock); } uint64_t schGenTaskId(void) { return atomic_add_fetch_64(&schMgmt.taskId, 1); } @@ -88,4 +260,3 @@ void schFreeRpcCtx(SRpcCtx *pCtx) { (*pCtx->freeFunc)(pCtx->brokenVal.val); } - diff --git a/source/libs/scheduler/src/scheduler.c b/source/libs/scheduler/src/scheduler.c index 32e00c1b700a3eadfa9e15580d76950798791927..b4dc067f4c08b7eb765cfa3365b551f0053671a2 100644 --- a/source/libs/scheduler/src/scheduler.c +++ b/source/libs/scheduler/src/scheduler.c @@ -182,6 +182,14 @@ int32_t scheduleCancelJob(int64_t job) { SCH_RET(code); } +void schedulerStopQueryHb(void *pTrans) { + if (NULL == pTrans) { + return; + } + + schCleanClusterHb(pTrans); +} + void schedulerFreeJob(int64_t job) { SSchJob *pJob = schAcquireJob(job); if (NULL == pJob) { @@ -220,6 +228,7 @@ void schedulerDestroy(void) { } } + SCH_LOCK(SCH_WRITE, &schMgmt.hbLock); if (schMgmt.hbConnections) { void *pIter = taosHashIterate(schMgmt.hbConnections, NULL); while (pIter != NULL) { @@ -230,4 +239,5 @@ void schedulerDestroy(void) { taosHashCleanup(schMgmt.hbConnections); schMgmt.hbConnections = NULL; } + SCH_UNLOCK(SCH_WRITE, &schMgmt.hbLock); } diff --git a/source/libs/transport/src/transSvr.c b/source/libs/transport/src/transSvr.c index 608fd00b2cda7c9508275cd4487496295b9e0711..50f99128b2fea235aaee1ef1d337149d98efaac6 100644 --- a/source/libs/transport/src/transSvr.c +++ b/source/libs/transport/src/transSvr.c @@ -180,6 +180,12 @@ static bool addHandleToAcceptloop(void* arg); if (!transQueuePush(&conn->srvMsgs, srvMsg)) { \ return; \ } \ + if (conn->regArg.init) { \ + tTrace("server conn %p release, notify server app", conn); \ + STrans* pTransInst = conn->pTransInst; \ + (*pTransInst->cfp)(pTransInst->parent, &(conn->regArg.msg), NULL); \ + memset(&conn->regArg, 0, sizeof(conn->regArg)); \ + } \ uvStartSendRespInternal(srvMsg); \ return; \ } \ @@ -1154,6 +1160,10 @@ int transGetConnInfo(void* thandle, STransHandleInfo* pInfo) { } SExHandle* ex = thandle; SSvrConn* pConn = ex->handle; + if (pConn == NULL) { + tTrace("invalid handle %p, failed to Get Conn info", thandle); + return -1; + } struct sockaddr_in addr = pConn->addr; pInfo->clientIp = (uint32_t)(addr.sin_addr.s_addr);