未验证 提交 4097b5ae 编写于 作者: D dapan1121 提交者: GitHub

Merge pull request #14986 from taosdata/fix/fixmsgorder

fix: fix msg disorder and other issues
...@@ -135,7 +135,7 @@ void rpcSendRequestWithCtx(void *thandle, const SEpSet *pEpSet, SRpcMsg *pMsg ...@@ -135,7 +135,7 @@ void rpcSendRequestWithCtx(void *thandle, const SEpSet *pEpSet, SRpcMsg *pMsg
int32_t rpcGetConnInfo(void *thandle, SRpcConnInfo *pInfo); int32_t rpcGetConnInfo(void *thandle, SRpcConnInfo *pInfo);
void rpcSendRecv(void *shandle, SEpSet *pEpSet, SRpcMsg *pReq, SRpcMsg *pRsp); void rpcSendRecv(void *shandle, SEpSet *pEpSet, SRpcMsg *pReq, SRpcMsg *pRsp);
void rpcSetDefaultAddr(void *thandle, const char *ip, const char *fqdn); void rpcSetDefaultAddr(void *thandle, const char *ip, const char *fqdn);
int64_t rpcAllocHandle(); void* rpcAllocHandle();
#ifdef __cplusplus #ifdef __cplusplus
} }
......
...@@ -171,6 +171,7 @@ static int32_t hbQueryHbRspHandle(SAppHbMgr *pAppHbMgr, SClientHbRsp *pRsp) { ...@@ -171,6 +171,7 @@ static int32_t hbQueryHbRspHandle(SAppHbMgr *pAppHbMgr, SClientHbRsp *pRsp) {
pTscObj->pAppInfo->totalDnodes = pRsp->query->totalDnodes; pTscObj->pAppInfo->totalDnodes = pRsp->query->totalDnodes;
pTscObj->pAppInfo->onlineDnodes = pRsp->query->onlineDnodes; pTscObj->pAppInfo->onlineDnodes = pRsp->query->onlineDnodes;
pTscObj->connId = pRsp->query->connId; pTscObj->connId = pRsp->query->connId;
tscTrace("conn %p hb rsp, dnodes %d/%d", pTscObj->connId, pTscObj->pAppInfo->onlineDnodes, pTscObj->pAppInfo->totalDnodes);
if (pRsp->query->killRid) { if (pRsp->query->killRid) {
tscDebug("request rid %" PRIx64 " need to be killed now", pRsp->query->killRid); tscDebug("request rid %" PRIx64 " need to be killed now", pRsp->query->killRid);
...@@ -294,6 +295,7 @@ static int32_t hbAsyncCallBack(void *param, SDataBuf *pMsg, int32_t code) { ...@@ -294,6 +295,7 @@ static int32_t hbAsyncCallBack(void *param, SDataBuf *pMsg, int32_t code) {
if (code != 0) { if (code != 0) {
(*pInst)->onlineDnodes = ((*pInst)->totalDnodes ? 0 : -1); (*pInst)->onlineDnodes = ((*pInst)->totalDnodes ? 0 : -1);
tscDebug("hb rsp error %s, update server status %d/%d", tstrerror(code), (*pInst)->onlineDnodes, (*pInst)->totalDnodes);
} }
if (rspNum) { if (rspNum) {
......
...@@ -1276,7 +1276,12 @@ int32_t doProcessMsgFromServer(void* param) { ...@@ -1276,7 +1276,12 @@ int32_t doProcessMsgFromServer(void* param) {
assert(pMsg->info.ahandle != NULL); assert(pMsg->info.ahandle != NULL);
STscObj* pTscObj = NULL; STscObj* pTscObj = NULL;
tscDebug("processMsgFromServer message: %s, code: %s", TMSG_INFO(pMsg->msgType), tstrerror(pMsg->code)); STraceId* trace = &pMsg->info.traceId;
char tbuf[40] = {0};
TRACE_TO_STR(trace, tbuf);
tscDebug("processMsgFromServer handle %p, message: %s, code: %s, gtid: %s", pMsg->info.handle, TMSG_INFO(pMsg->msgType), tstrerror(pMsg->code),
tbuf);
if (pSendInfo->requestObjRefId != 0) { if (pSendInfo->requestObjRefId != 0) {
SRequestObj* pRequest = (SRequestObj*)taosAcquireRef(clientReqRefPool, pSendInfo->requestObjRefId); SRequestObj* pRequest = (SRequestObj*)taosAcquireRef(clientReqRefPool, pSendInfo->requestObjRefId);
......
...@@ -81,6 +81,7 @@ int32_t qmPutRpcMsgToQueue(SQnodeMgmt *pMgmt, EQueueType qtype, SRpcMsg *pRpc) { ...@@ -81,6 +81,7 @@ int32_t qmPutRpcMsgToQueue(SQnodeMgmt *pMgmt, EQueueType qtype, SRpcMsg *pRpc) {
taosWriteQitem(pMgmt->queryWorker.queue, pMsg); taosWriteQitem(pMgmt->queryWorker.queue, pMsg);
return 0; return 0;
case READ_QUEUE: case READ_QUEUE:
case FETCH_QUEUE:
dTrace("msg:%p, is created and will put into qnode-fetch queue", pMsg); dTrace("msg:%p, is created and will put into qnode-fetch queue", pMsg);
taosWriteQitem(pMgmt->fetchWorker.queue, pMsg); taosWriteQitem(pMgmt->fetchWorker.queue, pMsg);
return 0; return 0;
......
...@@ -163,7 +163,7 @@ int32_t ctgInitGetQnodeTask(SCtgJob *pJob, int32_t taskIdx, void* param) { ...@@ -163,7 +163,7 @@ int32_t ctgInitGetQnodeTask(SCtgJob *pJob, int32_t taskIdx, void* param) {
taosArrayPush(pJob->pTasks, &task); taosArrayPush(pJob->pTasks, &task);
qDebug("QID:0x%" PRIx64 " the %d task type %s initialized", pJob->queryId, taskIdx, ctgTaskTypeStr(task.type)); qDebug("QID:0x%" PRIx64 " the %dth task type %s initialized", pJob->queryId, taskIdx, ctgTaskTypeStr(task.type));
return TSDB_CODE_SUCCESS; return TSDB_CODE_SUCCESS;
} }
...@@ -178,7 +178,7 @@ int32_t ctgInitGetDnodeTask(SCtgJob *pJob, int32_t taskIdx, void* param) { ...@@ -178,7 +178,7 @@ int32_t ctgInitGetDnodeTask(SCtgJob *pJob, int32_t taskIdx, void* param) {
taosArrayPush(pJob->pTasks, &task); taosArrayPush(pJob->pTasks, &task);
qDebug("QID:0x%" PRIx64 " the %d task type %s initialized", pJob->queryId, taskIdx, ctgTaskTypeStr(task.type)); qDebug("QID:0x%" PRIx64 " the %dth task type %s initialized", pJob->queryId, taskIdx, ctgTaskTypeStr(task.type));
return TSDB_CODE_SUCCESS; return TSDB_CODE_SUCCESS;
} }
...@@ -264,7 +264,7 @@ int32_t ctgInitGetSvrVerTask(SCtgJob *pJob, int32_t taskIdx, void* param) { ...@@ -264,7 +264,7 @@ int32_t ctgInitGetSvrVerTask(SCtgJob *pJob, int32_t taskIdx, void* param) {
taosArrayPush(pJob->pTasks, &task); taosArrayPush(pJob->pTasks, &task);
qDebug("QID:0x%" PRIx64 " [%dth] task type %s initialized", pJob->queryId, taskIdx, ctgTaskTypeStr(task.type)); qDebug("QID:0x%" PRIx64 " the %dth task type %s initialized", pJob->queryId, taskIdx, ctgTaskTypeStr(task.type));
return TSDB_CODE_SUCCESS; return TSDB_CODE_SUCCESS;
} }
......
...@@ -1980,6 +1980,7 @@ int32_t loadRemoteDataCallback(void* param, SDataBuf* pMsg, int32_t code) { ...@@ -1980,6 +1980,7 @@ int32_t loadRemoteDataCallback(void* param, SDataBuf* pMsg, int32_t code) {
qDebug("%s fetch rsp received, index:%d, rows:%d", pSourceDataInfo->taskId, index, pRsp->numOfRows); qDebug("%s fetch rsp received, index:%d, rows:%d", pSourceDataInfo->taskId, index, pRsp->numOfRows);
} else { } else {
pSourceDataInfo->code = code; pSourceDataInfo->code = code;
qDebug("%s fetch rsp received, index:%d, error:%d", pSourceDataInfo->taskId, index, tstrerror(code));
} }
pSourceDataInfo->status = EX_SOURCE_DATA_READY; pSourceDataInfo->status = EX_SOURCE_DATA_READY;
......
...@@ -75,7 +75,9 @@ typedef struct SQWDebug { ...@@ -75,7 +75,9 @@ typedef struct SQWDebug {
bool lockEnable; bool lockEnable;
bool statusEnable; bool statusEnable;
bool dumpEnable; bool dumpEnable;
bool tmp; bool sleepSimulate;
bool deadSimulate;
bool redirectSimulate;
} SQWDebug; } SQWDebug;
extern SQWDebug gQWDebug; extern SQWDebug gQWDebug;
...@@ -130,12 +132,11 @@ typedef struct SQWTaskCtx { ...@@ -130,12 +132,11 @@ typedef struct SQWTaskCtx {
int8_t taskType; int8_t taskType;
int8_t explain; int8_t explain;
int8_t needFetch; int8_t needFetch;
int32_t queryType; int32_t msgType;
int32_t fetchType; int32_t fetchType;
int32_t execId; int32_t execId;
bool queryRsped; bool queryRsped;
bool queryFetched;
bool queryEnd; bool queryEnd;
bool queryContinue; bool queryContinue;
bool queryInQueue; bool queryInQueue;
...@@ -228,6 +229,7 @@ typedef struct SQWorkerMgmt { ...@@ -228,6 +229,7 @@ typedef struct SQWorkerMgmt {
#define QW_SET_EVENT_PROCESSED(ctx, event) atomic_store_8(&(ctx)->events[event], QW_EVENT_PROCESSED) #define QW_SET_EVENT_PROCESSED(ctx, event) atomic_store_8(&(ctx)->events[event], QW_EVENT_PROCESSED)
#define QW_GET_PHASE(ctx) atomic_load_8(&(ctx)->phase) #define QW_GET_PHASE(ctx) atomic_load_8(&(ctx)->phase)
#define QW_SET_PHASE(ctx, _value) do { if ((_value) != QW_PHASE_PRE_FETCH && (_value) != QW_PHASE_POST_FETCH) { atomic_store_8(&(ctx)->phase, _value); } } while (0)
#define QW_SET_RSP_CODE(ctx, code) atomic_store_32(&(ctx)->rspCode, code) #define QW_SET_RSP_CODE(ctx, code) atomic_store_32(&(ctx)->rspCode, code)
#define QW_UPDATE_RSP_CODE(ctx, code) atomic_val_compare_exchange_32(&(ctx)->rspCode, 0, code) #define QW_UPDATE_RSP_CODE(ctx, code) atomic_val_compare_exchange_32(&(ctx)->rspCode, 0, code)
...@@ -362,7 +364,7 @@ int32_t qwAcquireTaskCtx(QW_FPARAMS_DEF, SQWTaskCtx **ctx); ...@@ -362,7 +364,7 @@ int32_t qwAcquireTaskCtx(QW_FPARAMS_DEF, SQWTaskCtx **ctx);
int32_t qwGetTaskCtx(QW_FPARAMS_DEF, SQWTaskCtx **ctx); int32_t qwGetTaskCtx(QW_FPARAMS_DEF, SQWTaskCtx **ctx);
int32_t qwAddAcquireTaskCtx(QW_FPARAMS_DEF, SQWTaskCtx **ctx); int32_t qwAddAcquireTaskCtx(QW_FPARAMS_DEF, SQWTaskCtx **ctx);
void qwReleaseTaskCtx(SQWorker *mgmt, void *ctx); void qwReleaseTaskCtx(SQWorker *mgmt, void *ctx);
int32_t qwKillTaskHandle(QW_FPARAMS_DEF, SQWTaskCtx *ctx); int32_t qwKillTaskHandle(SQWTaskCtx *ctx);
int32_t qwUpdateTaskStatus(QW_FPARAMS_DEF, int8_t status); int32_t qwUpdateTaskStatus(QW_FPARAMS_DEF, int8_t status);
int32_t qwDropTask(QW_FPARAMS_DEF); int32_t qwDropTask(QW_FPARAMS_DEF);
void qwSaveTbVersionInfo(qTaskInfo_t pTaskInfo, SQWTaskCtx *ctx); void qwSaveTbVersionInfo(qTaskInfo_t pTaskInfo, SQWTaskCtx *ctx);
...@@ -372,13 +374,15 @@ int32_t qwUpdateTimeInQueue(SQWorker *mgmt, int64_t ts, EQueueType type); ...@@ -372,13 +374,15 @@ int32_t qwUpdateTimeInQueue(SQWorker *mgmt, int64_t ts, EQueueType type);
int64_t qwGetTimeInQueue(SQWorker *mgmt, EQueueType type); int64_t qwGetTimeInQueue(SQWorker *mgmt, EQueueType type);
void qwClearExpiredSch(SQWorker *mgmt, SArray* pExpiredSch); void qwClearExpiredSch(SQWorker *mgmt, SArray* pExpiredSch);
int32_t qwAcquireScheduler(SQWorker *mgmt, uint64_t sId, int32_t rwType, SQWSchStatus **sch); int32_t qwAcquireScheduler(SQWorker *mgmt, uint64_t sId, int32_t rwType, SQWSchStatus **sch);
void qwFreeTaskCtx(QW_FPARAMS_DEF, SQWTaskCtx *ctx); void qwFreeTaskCtx(SQWTaskCtx *ctx);
void qwDbgDumpMgmtInfo(SQWorker *mgmt); void qwDbgDumpMgmtInfo(SQWorker *mgmt);
int32_t qwDbgValidateStatus(QW_FPARAMS_DEF, int8_t oriStatus, int8_t newStatus, bool *ignore); int32_t qwDbgValidateStatus(QW_FPARAMS_DEF, int8_t oriStatus, int8_t newStatus, bool *ignore);
int32_t qwDbgBuildAndSendRedirectRsp(int32_t rspType, SRpcHandleInfo *pConn, int32_t code, SEpSet *pEpSet); int32_t qwDbgBuildAndSendRedirectRsp(int32_t rspType, SRpcHandleInfo *pConn, int32_t code, SEpSet *pEpSet);
int32_t qwAddTaskCtx(QW_FPARAMS_DEF); int32_t qwAddTaskCtx(QW_FPARAMS_DEF);
int32_t qwDbgResponseRedirect(SQWMsg *qwMsg, SQWTaskCtx *ctx); void qwDbgSimulateRedirect(SQWMsg *qwMsg, SQWTaskCtx *ctx, bool *rsped);
void qwDbgSimulateSleep(void);
void qwDbgSimulateDead(QW_FPARAMS_DEF, SQWTaskCtx *ctx, bool *rsped);
#ifdef __cplusplus #ifdef __cplusplus
......
...@@ -40,11 +40,13 @@ void qwBuildFetchRsp(void *msg, SOutputData *input, int32_t len, bool qComple ...@@ -40,11 +40,13 @@ void qwBuildFetchRsp(void *msg, SOutputData *input, int32_t len, bool qComple
int32_t qwBuildAndSendCQueryMsg(QW_FPARAMS_DEF, SRpcHandleInfo *pConn); int32_t qwBuildAndSendCQueryMsg(QW_FPARAMS_DEF, SRpcHandleInfo *pConn);
int32_t qwBuildAndSendQueryRsp(int32_t rspType, SRpcHandleInfo *pConn, int32_t code, SQWTaskCtx *ctx); int32_t qwBuildAndSendQueryRsp(int32_t rspType, SRpcHandleInfo *pConn, int32_t code, SQWTaskCtx *ctx);
int32_t qwBuildAndSendExplainRsp(SRpcHandleInfo *pConn, SExplainExecInfo *execInfo, int32_t num); int32_t qwBuildAndSendExplainRsp(SRpcHandleInfo *pConn, SExplainExecInfo *execInfo, int32_t num);
int32_t qwBuildAndSendErrorRsp(int32_t rspType, SRpcHandleInfo *pConn, int32_t code);
void qwFreeFetchRsp(void *msg); void qwFreeFetchRsp(void *msg);
int32_t qwMallocFetchRsp(int32_t length, SRetrieveTableRsp **rsp); int32_t qwMallocFetchRsp(int32_t length, SRetrieveTableRsp **rsp);
int32_t qwBuildAndSendHbRsp(SRpcHandleInfo *pConn, SSchedulerHbRsp *rsp, int32_t code); int32_t qwBuildAndSendHbRsp(SRpcHandleInfo *pConn, SSchedulerHbRsp *rsp, int32_t code);
int32_t qwRegisterQueryBrokenLinkArg(QW_FPARAMS_DEF, SRpcHandleInfo *pConn); int32_t qwRegisterQueryBrokenLinkArg(QW_FPARAMS_DEF, SRpcHandleInfo *pConn);
int32_t qwRegisterHbBrokenLinkArg(SQWorker *mgmt, uint64_t sId, SRpcHandleInfo *pConn); int32_t qwRegisterHbBrokenLinkArg(SQWorker *mgmt, uint64_t sId, SRpcHandleInfo *pConn);
int32_t qwBuildAndSendDropMsg(QW_FPARAMS_DEF, SRpcHandleInfo *pConn);
#ifdef __cplusplus #ifdef __cplusplus
} }
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
#include "tmsg.h" #include "tmsg.h"
#include "tname.h" #include "tname.h"
SQWDebug gQWDebug = {.statusEnable = true, .dumpEnable = false, .tmp = false}; SQWDebug gQWDebug = {.statusEnable = true, .dumpEnable = false, .redirectSimulate = false, .deadSimulate = false, .sleepSimulate = false};
int32_t qwDbgValidateStatus(QW_FPARAMS_DEF, int8_t oriStatus, int8_t newStatus, bool *ignore) { int32_t qwDbgValidateStatus(QW_FPARAMS_DEF, int8_t oriStatus, int8_t newStatus, bool *ignore) {
if (!gQWDebug.statusEnable) { if (!gQWDebug.statusEnable) {
...@@ -147,8 +147,17 @@ int32_t qwDbgBuildAndSendRedirectRsp(int32_t rspType, SRpcHandleInfo *pConn, int ...@@ -147,8 +147,17 @@ int32_t qwDbgBuildAndSendRedirectRsp(int32_t rspType, SRpcHandleInfo *pConn, int
return TSDB_CODE_SUCCESS; return TSDB_CODE_SUCCESS;
} }
int32_t qwDbgResponseRedirect(SQWMsg *qwMsg, SQWTaskCtx *ctx) { void qwDbgSimulateRedirect(SQWMsg *qwMsg, SQWTaskCtx *ctx, bool *rsped) {
if (gQWDebug.tmp) { static int32_t ignoreTime = 0;
if (*rsped) {
return;
}
if (gQWDebug.redirectSimulate) {
if (++ignoreTime <= 10) {
return;
}
if (TDMT_SCH_QUERY == qwMsg->msgType && (0 == taosRand() % 3)) { if (TDMT_SCH_QUERY == qwMsg->msgType && (0 == taosRand() % 3)) {
SEpSet epSet = {0}; SEpSet epSet = {0};
epSet.inUse = 1; epSet.inUse = 1;
...@@ -162,42 +171,94 @@ int32_t qwDbgResponseRedirect(SQWMsg *qwMsg, SQWTaskCtx *ctx) { ...@@ -162,42 +171,94 @@ int32_t qwDbgResponseRedirect(SQWMsg *qwMsg, SQWTaskCtx *ctx) {
ctx->phase = QW_PHASE_POST_QUERY; ctx->phase = QW_PHASE_POST_QUERY;
qwDbgBuildAndSendRedirectRsp(qwMsg->msgType + 1, &qwMsg->connInfo, TSDB_CODE_RPC_REDIRECT, &epSet); qwDbgBuildAndSendRedirectRsp(qwMsg->msgType + 1, &qwMsg->connInfo, TSDB_CODE_RPC_REDIRECT, &epSet);
return TSDB_CODE_SUCCESS; *rsped = true;
return;
} }
if (TDMT_SCH_MERGE_QUERY == qwMsg->msgType && (0 == taosRand() % 3)) { if (TDMT_SCH_MERGE_QUERY == qwMsg->msgType && (0 == taosRand() % 3)) {
ctx->phase = QW_PHASE_POST_QUERY; QW_SET_PHASE(ctx, QW_PHASE_POST_QUERY);
qwDbgBuildAndSendRedirectRsp(qwMsg->msgType + 1, &qwMsg->connInfo, TSDB_CODE_RPC_REDIRECT, NULL); qwDbgBuildAndSendRedirectRsp(qwMsg->msgType + 1, &qwMsg->connInfo, TSDB_CODE_RPC_REDIRECT, NULL);
return TSDB_CODE_SUCCESS; *rsped = true;
return;
}
if ((TDMT_SCH_FETCH == qwMsg->msgType) && (0 == taosRand() % 9)) {
qwDbgBuildAndSendRedirectRsp(qwMsg->msgType + 1, &qwMsg->connInfo, TSDB_CODE_RPC_REDIRECT, NULL);
*rsped = true;
return;
} }
} }
}
return TSDB_CODE_SUCCESS; void qwDbgSimulateSleep(void) {
if (!gQWDebug.sleepSimulate) {
return;
}
static int32_t ignoreTime = 0;
if (++ignoreTime > 10) {
taosSsleep(taosRand() % 20);
}
}
void qwDbgSimulateDead(QW_FPARAMS_DEF, SQWTaskCtx *ctx, bool *rsped) {
if (!gQWDebug.deadSimulate) {
return;
}
if (*rsped) {
return;
}
static int32_t ignoreTime = 0;
if (++ignoreTime > 10 && 0 == taosRand() % 9) {
SRpcHandleInfo *pConn = ((ctx->msgType == TDMT_SCH_FETCH || ctx->msgType == TDMT_SCH_MERGE_FETCH) ? &ctx->dataConnInfo : &ctx->ctrlConnInfo);
qwBuildAndSendErrorRsp(ctx->msgType + 1, pConn, TSDB_CODE_RPC_BROKEN_LINK);
qwBuildAndSendDropMsg(QW_FPARAMS(), pConn);
*rsped = true;
return;
}
} }
int32_t qwDbgEnableDebug(char *option) { int32_t qwDbgEnableDebug(char *option) {
if (0 == strcasecmp(option, "lock")) { if (0 == strcasecmp(option, "lock")) {
gQWDebug.lockEnable = true; gQWDebug.lockEnable = true;
qDebug("qw lock debug enabled"); qError("qw lock debug enabled");
return TSDB_CODE_SUCCESS; return TSDB_CODE_SUCCESS;
} }
if (0 == strcasecmp(option, "status")) { if (0 == strcasecmp(option, "status")) {
gQWDebug.statusEnable = true; gQWDebug.statusEnable = true;
qDebug("qw status debug enabled"); qError("qw status debug enabled");
return TSDB_CODE_SUCCESS; return TSDB_CODE_SUCCESS;
} }
if (0 == strcasecmp(option, "dump")) { if (0 == strcasecmp(option, "dump")) {
gQWDebug.dumpEnable = true; gQWDebug.dumpEnable = true;
qDebug("qw dump debug enabled"); qError("qw dump debug enabled");
return TSDB_CODE_SUCCESS;
}
if (0 == strcasecmp(option, "sleep")) {
gQWDebug.sleepSimulate = true;
qError("qw sleep debug enabled");
return TSDB_CODE_SUCCESS;
}
if (0 == strcasecmp(option, "dead")) {
gQWDebug.sleepSimulate = true;
qError("qw dead debug enabled");
return TSDB_CODE_SUCCESS; return TSDB_CODE_SUCCESS;
} }
if (0 == strcasecmp(option, "tmp")) { if (0 == strcasecmp(option, "redirect")) {
gQWDebug.tmp = true; gQWDebug.redirectSimulate = true;
qDebug("qw tmp debug enabled"); qError("qw redirect debug enabled");
return TSDB_CODE_SUCCESS; return TSDB_CODE_SUCCESS;
} }
......
...@@ -43,6 +43,20 @@ void qwFreeFetchRsp(void *msg) { ...@@ -43,6 +43,20 @@ void qwFreeFetchRsp(void *msg) {
} }
} }
int32_t qwBuildAndSendErrorRsp(int32_t rspType, SRpcHandleInfo *pConn, int32_t code) {
SRpcMsg rpcRsp = {
.msgType = rspType,
.pCont = NULL,
.contLen = 0,
.code = code,
.info = *pConn,
};
tmsgSendRsp(&rpcRsp);
return TSDB_CODE_SUCCESS;
}
int32_t qwBuildAndSendQueryRsp(int32_t rspType, SRpcHandleInfo *pConn, int32_t code, SQWTaskCtx *ctx) { int32_t qwBuildAndSendQueryRsp(int32_t rspType, SRpcHandleInfo *pConn, int32_t code, SQWTaskCtx *ctx) {
STbVerInfo* tbInfo = ctx ? &ctx->tbInfo : NULL; STbVerInfo* tbInfo = ctx ? &ctx->tbInfo : NULL;
int64_t affectedRows = ctx ? ctx->affectedRows : 0; int64_t affectedRows = ctx ? ctx->affectedRows : 0;
...@@ -184,7 +198,6 @@ int32_t qwBuildAndSendDropMsg(QW_FPARAMS_DEF, SRpcHandleInfo *pConn) { ...@@ -184,7 +198,6 @@ int32_t qwBuildAndSendDropMsg(QW_FPARAMS_DEF, SRpcHandleInfo *pConn) {
int32_t code = tmsgPutToQueue(&mgmt->msgCb, FETCH_QUEUE, &pNewMsg); int32_t code = tmsgPutToQueue(&mgmt->msgCb, FETCH_QUEUE, &pNewMsg);
if (TSDB_CODE_SUCCESS != code) { if (TSDB_CODE_SUCCESS != code) {
QW_SCH_TASK_ELOG("put drop task msg to queue failed, vgId:%d, code:%s", mgmt->nodeId, tstrerror(code)); QW_SCH_TASK_ELOG("put drop task msg to queue failed, vgId:%d, code:%s", mgmt->nodeId, tstrerror(code));
rpcFreeCont(req);
QW_ERR_RET(code); QW_ERR_RET(code);
} }
...@@ -374,8 +387,7 @@ int32_t qWorkerProcessQueryMsg(void *node, void *qWorkerMgmt, SRpcMsg *pMsg, int ...@@ -374,8 +387,7 @@ int32_t qWorkerProcessQueryMsg(void *node, void *qWorkerMgmt, SRpcMsg *pMsg, int
qwMsg.msgInfo.needFetch = msg->needFetch; qwMsg.msgInfo.needFetch = msg->needFetch;
char * sql = strndup(msg->msg, msg->sqlLen); char * sql = strndup(msg->msg, msg->sqlLen);
QW_SCH_TASK_DLOG("processQuery start, node:%p, type:%s, handle:%p, sql:%s", node, TMSG_INFO(pMsg->msgType), pMsg->info.handle, sql); QW_SCH_TASK_DLOG("processQuery start, node:%p, type:%s, handle:%p, SQL:%s", node, TMSG_INFO(pMsg->msgType), pMsg->info.handle, sql);
QW_ERR_RET(qwProcessQuery(QW_FPARAMS(), &qwMsg, sql)); QW_ERR_RET(qwProcessQuery(QW_FPARAMS(), &qwMsg, sql));
QW_SCH_TASK_DLOG("processQuery end, node:%p", node); QW_SCH_TASK_DLOG("processQuery end, node:%p", node);
......
...@@ -270,7 +270,7 @@ int32_t qwAddAcquireTaskCtx(QW_FPARAMS_DEF, SQWTaskCtx **ctx) { return qwAddTask ...@@ -270,7 +270,7 @@ int32_t qwAddAcquireTaskCtx(QW_FPARAMS_DEF, SQWTaskCtx **ctx) { return qwAddTask
void qwReleaseTaskCtx(SQWorker *mgmt, void *ctx) { taosHashRelease(mgmt->ctxHash, ctx); } void qwReleaseTaskCtx(SQWorker *mgmt, void *ctx) { taosHashRelease(mgmt->ctxHash, ctx); }
void qwFreeTaskHandle(QW_FPARAMS_DEF, qTaskInfo_t *taskHandle) { void qwFreeTaskHandle(qTaskInfo_t *taskHandle) {
// Note: free/kill may in RC // Note: free/kill may in RC
qTaskInfo_t otaskHandle = atomic_load_ptr(taskHandle); qTaskInfo_t otaskHandle = atomic_load_ptr(taskHandle);
if (otaskHandle && atomic_val_compare_exchange_ptr(taskHandle, otaskHandle, NULL)) { if (otaskHandle && atomic_val_compare_exchange_ptr(taskHandle, otaskHandle, NULL)) {
...@@ -278,7 +278,7 @@ void qwFreeTaskHandle(QW_FPARAMS_DEF, qTaskInfo_t *taskHandle) { ...@@ -278,7 +278,7 @@ void qwFreeTaskHandle(QW_FPARAMS_DEF, qTaskInfo_t *taskHandle) {
} }
} }
int32_t qwKillTaskHandle(QW_FPARAMS_DEF, SQWTaskCtx *ctx) { int32_t qwKillTaskHandle(SQWTaskCtx *ctx) {
int32_t code = 0; int32_t code = 0;
// Note: free/kill may in RC // Note: free/kill may in RC
qTaskInfo_t taskHandle = atomic_load_ptr(&ctx->taskHandle); qTaskInfo_t taskHandle = atomic_load_ptr(&ctx->taskHandle);
...@@ -290,7 +290,7 @@ int32_t qwKillTaskHandle(QW_FPARAMS_DEF, SQWTaskCtx *ctx) { ...@@ -290,7 +290,7 @@ int32_t qwKillTaskHandle(QW_FPARAMS_DEF, SQWTaskCtx *ctx) {
QW_RET(code); QW_RET(code);
} }
void qwFreeTaskCtx(QW_FPARAMS_DEF, SQWTaskCtx *ctx) { void qwFreeTaskCtx(SQWTaskCtx *ctx) {
if (ctx->ctrlConnInfo.handle) { if (ctx->ctrlConnInfo.handle) {
tmsgReleaseHandle(&ctx->ctrlConnInfo, TAOS_CONN_SERVER); tmsgReleaseHandle(&ctx->ctrlConnInfo, TAOS_CONN_SERVER);
} }
...@@ -300,7 +300,7 @@ void qwFreeTaskCtx(QW_FPARAMS_DEF, SQWTaskCtx *ctx) { ...@@ -300,7 +300,7 @@ void qwFreeTaskCtx(QW_FPARAMS_DEF, SQWTaskCtx *ctx) {
// NO need to release dataConnInfo // NO need to release dataConnInfo
qwFreeTaskHandle(QW_FPARAMS(), &ctx->taskHandle); qwFreeTaskHandle(&ctx->taskHandle);
if (ctx->sinkHandle) { if (ctx->sinkHandle) {
dsDestroyDataSinker(ctx->sinkHandle); dsDestroyDataSinker(ctx->sinkHandle);
...@@ -336,7 +336,7 @@ int32_t qwDropTaskCtx(QW_FPARAMS_DEF) { ...@@ -336,7 +336,7 @@ int32_t qwDropTaskCtx(QW_FPARAMS_DEF) {
QW_ERR_RET(TSDB_CODE_QRY_TASK_CTX_NOT_EXIST); QW_ERR_RET(TSDB_CODE_QRY_TASK_CTX_NOT_EXIST);
} }
qwFreeTaskCtx(QW_FPARAMS(), &octx); qwFreeTaskCtx(&octx);
QW_TASK_DLOG_E("task ctx dropped"); QW_TASK_DLOG_E("task ctx dropped");
...@@ -463,13 +463,21 @@ void qwDestroyImpl(void *pMgmt) { ...@@ -463,13 +463,21 @@ void qwDestroyImpl(void *pMgmt) {
mgmt->hbTimer = NULL; mgmt->hbTimer = NULL;
taosTmrCleanUp(mgmt->timer); taosTmrCleanUp(mgmt->timer);
// TODO STOP ALL QUERY uint64_t qId, tId;
int32_t eId;
// TODO FREE ALL void *pIter = taosHashIterate(mgmt->ctxHash, NULL);
while (pIter) {
SQWTaskCtx *ctx = (SQWTaskCtx *)pIter;
void *key = taosHashGetKey(pIter, NULL);
QW_GET_QTID(key, qId, tId, eId);
qwFreeTaskCtx(ctx);
QW_TASK_DLOG_E("task ctx freed");
pIter = taosHashIterate(mgmt->ctxHash, pIter);
}
taosHashCleanup(mgmt->ctxHash); taosHashCleanup(mgmt->ctxHash);
void *pIter = taosHashIterate(mgmt->schHash, NULL); pIter = taosHashIterate(mgmt->schHash, NULL);
while (pIter) { while (pIter) {
SQWSchStatus *sch = (SQWSchStatus *)pIter; SQWSchStatus *sch = (SQWSchStatus *)pIter;
qwDestroySchStatus(sch); qwDestroySchStatus(sch);
......
...@@ -83,6 +83,7 @@ int32_t qwExecTask(QW_FPARAMS_DEF, SQWTaskCtx *ctx, bool *queryEnd) { ...@@ -83,6 +83,7 @@ int32_t qwExecTask(QW_FPARAMS_DEF, SQWTaskCtx *ctx, bool *queryEnd) {
// if *taskHandle is NULL, it's killed right now // if *taskHandle is NULL, it's killed right now
if (taskHandle) { if (taskHandle) {
qwDbgSimulateSleep();
code = qExecTask(taskHandle, &pRes, &useconds); code = qExecTask(taskHandle, &pRes, &useconds);
if (code) { if (code) {
if (code != TSDB_CODE_OPS_NOT_SUPPORT) { if (code != TSDB_CODE_OPS_NOT_SUPPORT) {
...@@ -293,11 +294,7 @@ int32_t qwHandlePrePhaseEvents(QW_FPARAMS_DEF, int8_t phase, SQWPhaseInput *inpu ...@@ -293,11 +294,7 @@ int32_t qwHandlePrePhaseEvents(QW_FPARAMS_DEF, int8_t phase, SQWPhaseInput *inpu
QW_LOCK(QW_WRITE, &ctx->lock); QW_LOCK(QW_WRITE, &ctx->lock);
if (QW_PHASE_PRE_FETCH == phase) { QW_SET_PHASE(ctx, phase);
atomic_store_8((int8_t *)&ctx->queryFetched, true);
} else {
atomic_store_8(&ctx->phase, phase);
}
if (atomic_load_8((int8_t *)&ctx->queryEnd)) { if (atomic_load_8((int8_t *)&ctx->queryEnd)) {
QW_TASK_ELOG_E("query already end"); QW_TASK_ELOG_E("query already end");
...@@ -370,6 +367,7 @@ int32_t qwHandlePrePhaseEvents(QW_FPARAMS_DEF, int8_t phase, SQWPhaseInput *inpu ...@@ -370,6 +367,7 @@ int32_t qwHandlePrePhaseEvents(QW_FPARAMS_DEF, int8_t phase, SQWPhaseInput *inpu
} }
_return: _return:
if (ctx) { if (ctx) {
QW_UPDATE_RSP_CODE(ctx, code); QW_UPDATE_RSP_CODE(ctx, code);
...@@ -390,7 +388,6 @@ int32_t qwHandlePostPhaseEvents(QW_FPARAMS_DEF, int8_t phase, SQWPhaseInput *inp ...@@ -390,7 +388,6 @@ int32_t qwHandlePostPhaseEvents(QW_FPARAMS_DEF, int8_t phase, SQWPhaseInput *inp
int32_t code = 0; int32_t code = 0;
SQWTaskCtx *ctx = NULL; SQWTaskCtx *ctx = NULL;
SRpcHandleInfo connInfo = {0}; SRpcHandleInfo connInfo = {0};
SRpcHandleInfo *rspConnection = NULL;
QW_TASK_DLOG("start to handle event at phase %s", qwPhaseStr(phase)); QW_TASK_DLOG("start to handle event at phase %s", qwPhaseStr(phase));
...@@ -403,13 +400,6 @@ int32_t qwHandlePostPhaseEvents(QW_FPARAMS_DEF, int8_t phase, SQWPhaseInput *inp ...@@ -403,13 +400,6 @@ int32_t qwHandlePostPhaseEvents(QW_FPARAMS_DEF, int8_t phase, SQWPhaseInput *inp
QW_ERR_JRET(TSDB_CODE_QRY_TASK_DROPPED); QW_ERR_JRET(TSDB_CODE_QRY_TASK_DROPPED);
} }
if (QW_PHASE_POST_QUERY == phase) {
connInfo = ctx->ctrlConnInfo;
rspConnection = &connInfo;
ctx->queryRsped = true;
}
if (QW_EVENT_RECEIVED(ctx, QW_EVENT_DROP)) { if (QW_EVENT_RECEIVED(ctx, QW_EVENT_DROP)) {
if (QW_PHASE_POST_FETCH == phase) { if (QW_PHASE_POST_FETCH == phase) {
QW_TASK_WLOG("drop received at wrong phase %s", qwPhaseStr(phase)); QW_TASK_WLOG("drop received at wrong phase %s", qwPhaseStr(phase));
...@@ -437,17 +427,23 @@ _return: ...@@ -437,17 +427,23 @@ _return:
qwUpdateTaskStatus(QW_FPARAMS(), JOB_TASK_STATUS_PART_SUCC); qwUpdateTaskStatus(QW_FPARAMS(), JOB_TASK_STATUS_PART_SUCC);
} }
if (rspConnection) { if (QW_PHASE_POST_QUERY == phase && ctx) {
qwBuildAndSendQueryRsp(input->msgType + 1, rspConnection, code, ctx); ctx->queryRsped = true;
QW_TASK_DLOG("query msg rsped, handle:%p, code:%x - %s", rspConnection->handle, code, tstrerror(code));
bool rsped = false;
SQWMsg qwMsg = {.msgType = ctx->msgType, .connInfo = ctx->ctrlConnInfo};
qwDbgSimulateRedirect(&qwMsg, ctx, &rsped);
qwDbgSimulateDead(QW_FPARAMS(), ctx, &rsped);
if (!rsped) {
qwBuildAndSendQueryRsp(input->msgType + 1, &ctx->ctrlConnInfo, code, ctx);
QW_TASK_DLOG("query msg rsped, handle:%p, code:%x - %s", ctx->ctrlConnInfo.handle, code, tstrerror(code));
}
} }
if (ctx) { if (ctx) {
QW_UPDATE_RSP_CODE(ctx, code); QW_UPDATE_RSP_CODE(ctx, code);
if (QW_PHASE_POST_FETCH != phase) { QW_SET_PHASE(ctx, phase);
atomic_store_8(&ctx->phase, phase);
}
QW_UNLOCK(QW_WRITE, &ctx->lock); QW_UNLOCK(QW_WRITE, &ctx->lock);
qwReleaseTaskCtx(mgmt, ctx); qwReleaseTaskCtx(mgmt, ctx);
...@@ -488,8 +484,6 @@ int32_t qwPreprocessQuery(QW_FPARAMS_DEF, SQWMsg *qwMsg) { ...@@ -488,8 +484,6 @@ int32_t qwPreprocessQuery(QW_FPARAMS_DEF, SQWMsg *qwMsg) {
QW_ERR_JRET(qwAddTaskStatus(QW_FPARAMS(), JOB_TASK_STATUS_INIT)); QW_ERR_JRET(qwAddTaskStatus(QW_FPARAMS(), JOB_TASK_STATUS_INIT));
qwDbgResponseRedirect(qwMsg, ctx);
_return: _return:
if (ctx) { if (ctx) {
...@@ -517,7 +511,7 @@ int32_t qwProcessQuery(QW_FPARAMS_DEF, SQWMsg *qwMsg, const char* sql) { ...@@ -517,7 +511,7 @@ int32_t qwProcessQuery(QW_FPARAMS_DEF, SQWMsg *qwMsg, const char* sql) {
ctx->taskType = qwMsg->msgInfo.taskType; ctx->taskType = qwMsg->msgInfo.taskType;
ctx->explain = qwMsg->msgInfo.explain; ctx->explain = qwMsg->msgInfo.explain;
ctx->needFetch = qwMsg->msgInfo.needFetch; ctx->needFetch = qwMsg->msgInfo.needFetch;
ctx->queryType = qwMsg->msgType; ctx->msgType = qwMsg->msgType;
//QW_TASK_DLOGL("subplan json string, len:%d, %s", qwMsg->msgLen, qwMsg->msg); //QW_TASK_DLOGL("subplan json string, len:%d, %s", qwMsg->msgLen, qwMsg->msg);
...@@ -636,8 +630,8 @@ int32_t qwProcessCQuery(QW_FPARAMS_DEF, SQWMsg *qwMsg) { ...@@ -636,8 +630,8 @@ int32_t qwProcessCQuery(QW_FPARAMS_DEF, SQWMsg *qwMsg) {
QW_LOCK(QW_WRITE, &ctx->lock); QW_LOCK(QW_WRITE, &ctx->lock);
if (queryEnd || code || 0 == atomic_load_8((int8_t *)&ctx->queryContinue)) { if (queryEnd || code || 0 == atomic_load_8((int8_t *)&ctx->queryContinue)) {
// Note: if necessary, fetch need to put cquery to queue again // Note: query is not running anymore
atomic_store_8(&ctx->phase, 0); QW_SET_PHASE(ctx, 0);
QW_UNLOCK(QW_WRITE, &ctx->lock); QW_UNLOCK(QW_WRITE, &ctx->lock);
break; break;
} }
...@@ -662,14 +656,13 @@ int32_t qwProcessFetch(QW_FPARAMS_DEF, SQWMsg *qwMsg) { ...@@ -662,14 +656,13 @@ int32_t qwProcessFetch(QW_FPARAMS_DEF, SQWMsg *qwMsg) {
QW_ERR_JRET(qwGetTaskCtx(QW_FPARAMS(), &ctx)); QW_ERR_JRET(qwGetTaskCtx(QW_FPARAMS(), &ctx));
ctx->queryType = qwMsg->msgType; ctx->msgType = qwMsg->msgType;
ctx->dataConnInfo = qwMsg->connInfo;
SOutputData sOutput = {0}; SOutputData sOutput = {0};
QW_ERR_JRET(qwGetQueryResFromSink(QW_FPARAMS(), ctx, &dataLen, &rsp, &sOutput)); QW_ERR_JRET(qwGetQueryResFromSink(QW_FPARAMS(), ctx, &dataLen, &rsp, &sOutput));
if (NULL == rsp) { if (NULL == rsp) {
ctx->dataConnInfo = qwMsg->connInfo;
QW_SET_EVENT_RECEIVED(ctx, QW_EVENT_FETCH); QW_SET_EVENT_RECEIVED(ctx, QW_EVENT_FETCH);
} else { } else {
bool qComplete = (DS_BUF_EMPTY == sOutput.bufStatus && sOutput.queryEnd); bool qComplete = (DS_BUF_EMPTY == sOutput.bufStatus && sOutput.queryEnd);
...@@ -714,9 +707,16 @@ _return: ...@@ -714,9 +707,16 @@ _return:
} }
if (code || rsp) { if (code || rsp) {
qwBuildAndSendFetchRsp(qwMsg->msgType + 1, &qwMsg->connInfo, rsp, dataLen, code); bool rsped = false;
QW_TASK_DLOG("%s send, handle:%p, code:%x - %s, dataLen:%d", TMSG_INFO(qwMsg->msgType + 1), qwMsg->connInfo.handle, code, tstrerror(code), if (ctx) {
dataLen); qwDbgSimulateRedirect(qwMsg, ctx, &rsped);
qwDbgSimulateDead(QW_FPARAMS(), ctx, &rsped);
}
if (!rsped) {
qwBuildAndSendFetchRsp(qwMsg->msgType + 1, &qwMsg->connInfo, rsp, dataLen, code);
QW_TASK_DLOG("%s send, handle:%p, code:%x - %s, dataLen:%d", TMSG_INFO(qwMsg->msgType + 1), qwMsg->connInfo.handle, code, tstrerror(code),
dataLen);
}
} }
QW_RET(TSDB_CODE_SUCCESS); QW_RET(TSDB_CODE_SUCCESS);
...@@ -724,7 +724,7 @@ _return: ...@@ -724,7 +724,7 @@ _return:
int32_t qwProcessDrop(QW_FPARAMS_DEF, SQWMsg *qwMsg) { int32_t qwProcessDrop(QW_FPARAMS_DEF, SQWMsg *qwMsg) {
int32_t code = 0; int32_t code = 0;
bool rsped = false; bool dropped = false;
SQWTaskCtx *ctx = NULL; SQWTaskCtx *ctx = NULL;
bool locked = false; bool locked = false;
...@@ -740,18 +740,14 @@ int32_t qwProcessDrop(QW_FPARAMS_DEF, SQWMsg *qwMsg) { ...@@ -740,18 +740,14 @@ int32_t qwProcessDrop(QW_FPARAMS_DEF, SQWMsg *qwMsg) {
} }
if (QW_QUERY_RUNNING(ctx)) { if (QW_QUERY_RUNNING(ctx)) {
QW_ERR_JRET(qwKillTaskHandle(QW_FPARAMS(), ctx)); QW_ERR_JRET(qwKillTaskHandle(ctx));
qwUpdateTaskStatus(QW_FPARAMS(), JOB_TASK_STATUS_DROP); qwUpdateTaskStatus(QW_FPARAMS(), JOB_TASK_STATUS_DROP);
} else if (ctx->phase > 0) {
QW_ERR_JRET(qwDropTask(QW_FPARAMS()));
rsped = true;
} else { } else {
// task not started QW_ERR_JRET(qwDropTask(QW_FPARAMS()));
dropped = true;
} }
if (!rsped) { if (!dropped) {
ctx->ctrlConnInfo = qwMsg->connInfo;
QW_SET_EVENT_RECEIVED(ctx, QW_EVENT_DROP); QW_SET_EVENT_RECEIVED(ctx, QW_EVENT_DROP);
} }
...@@ -954,7 +950,7 @@ int32_t qwProcessDelete(QW_FPARAMS_DEF, SQWMsg *qwMsg, SDeleteRes *pRes) { ...@@ -954,7 +950,7 @@ int32_t qwProcessDelete(QW_FPARAMS_DEF, SQWMsg *qwMsg, SDeleteRes *pRes) {
_return: _return:
qwFreeTaskCtx(QW_FPARAMS(), &ctx); qwFreeTaskCtx(&ctx);
QW_RET(TSDB_CODE_SUCCESS); QW_RET(TSDB_CODE_SUCCESS);
} }
......
...@@ -55,13 +55,11 @@ typedef enum { ...@@ -55,13 +55,11 @@ typedef enum {
#define SCHEDULE_DEFAULT_MAX_TASK_NUM 1000 #define SCHEDULE_DEFAULT_MAX_TASK_NUM 1000
#define SCHEDULE_DEFAULT_MAX_NODE_TABLE_NUM 200 // unit is TSDB_TABLE_NUM_UNIT #define SCHEDULE_DEFAULT_MAX_NODE_TABLE_NUM 200 // unit is TSDB_TABLE_NUM_UNIT
#define SCHEDULE_DEFAULT_POLICY SCH_LOAD_SEQ #define SCHEDULE_DEFAULT_POLICY SCH_LOAD_SEQ
#define SCHEDULE_DEFAULT_MAX_NODE_NUM 20
#define SCH_DEFAULT_TASK_TIMEOUT_USEC 10000000 #define SCH_DEFAULT_TASK_TIMEOUT_USEC 10000000
#define SCH_MAX_TASK_TIMEOUT_USEC 60000000 #define SCH_MAX_TASK_TIMEOUT_USEC 60000000
#define SCH_MAX_CANDIDATE_EP_NUM TSDB_MAX_REPLICA #define SCH_DEFAULT_MAX_RETRY_NUM 6
typedef struct SSchDebug { typedef struct SSchDebug {
bool lockEnable; bool lockEnable;
...@@ -211,6 +209,7 @@ typedef struct SSchTask { ...@@ -211,6 +209,7 @@ typedef struct SSchTask {
int32_t maxExecTimes; // task max exec times int32_t maxExecTimes; // task max exec times
int32_t maxRetryTimes; // task max retry times int32_t maxRetryTimes; // task max retry times
int32_t retryTimes; // task retry times int32_t retryTimes; // task retry times
bool waitRetry; // wait for retry
int32_t execId; // task current execute index int32_t execId; // task current execute index
SSchLevel *level; // level SSchLevel *level; // level
SRWLatch planLock; // task update plan lock SRWLatch planLock; // task update plan lock
...@@ -274,7 +273,8 @@ typedef struct SSchJob { ...@@ -274,7 +273,8 @@ typedef struct SSchJob {
int32_t errCode; int32_t errCode;
SRWLatch resLock; SRWLatch resLock;
SExecResult execRes; SExecResult execRes;
void *resData; //TODO free it or not void *fetchRes; //TODO free it or not
bool fetched;
int32_t resNumOfRows; int32_t resNumOfRows;
SSchResInfo userRes; SSchResInfo userRes;
const char *sql; const char *sql;
...@@ -326,7 +326,7 @@ extern SSchedulerMgmt schMgmt; ...@@ -326,7 +326,7 @@ extern SSchedulerMgmt schMgmt;
#define SCH_IS_EXPLAIN_JOB(_job) (EXPLAIN_MODE_ANALYZE == (_job)->attr.explainMode) #define SCH_IS_EXPLAIN_JOB(_job) (EXPLAIN_MODE_ANALYZE == (_job)->attr.explainMode)
#define SCH_NETWORK_ERR(_code) ((_code) == TSDB_CODE_RPC_BROKEN_LINK || (_code) == TSDB_CODE_RPC_NETWORK_UNAVAIL) #define SCH_NETWORK_ERR(_code) ((_code) == TSDB_CODE_RPC_BROKEN_LINK || (_code) == TSDB_CODE_RPC_NETWORK_UNAVAIL)
#define SCH_MERGE_TASK_NETWORK_ERR(_task, _code, _len) (SCH_NETWORK_ERR(_code) && (((_len) > 0) || (!SCH_IS_DATA_BIND_TASK(_task)))) #define SCH_MERGE_TASK_NETWORK_ERR(_task, _code, _len) (SCH_NETWORK_ERR(_code) && (((_len) > 0) || (!SCH_IS_DATA_BIND_TASK(_task))))
#define SCH_REDIRECT_MSGTYPE(_msgType) ((_msgType) == TDMT_SCH_QUERY || (_msgType) == TDMT_SCH_MERGE_QUERY || (_msgType) == TDMT_SCH_FETCH || (_msgType) == TDMT_SCH_MERGE_FETCH) #define SCH_REDIRECT_MSGTYPE(_msgType) ((_msgType) == TDMT_SCH_LINK_BROKEN || (_msgType) == TDMT_SCH_QUERY || (_msgType) == TDMT_SCH_MERGE_QUERY || (_msgType) == TDMT_SCH_FETCH || (_msgType) == TDMT_SCH_MERGE_FETCH)
#define SCH_TASK_NEED_REDIRECT(_task, _msgType, _code, _rspLen) (SCH_REDIRECT_MSGTYPE(_msgType) && (NEED_SCHEDULER_REDIRECT_ERROR(_code) || SCH_MERGE_TASK_NETWORK_ERR((_task), (_code), (_rspLen)))) #define SCH_TASK_NEED_REDIRECT(_task, _msgType, _code, _rspLen) (SCH_REDIRECT_MSGTYPE(_msgType) && (NEED_SCHEDULER_REDIRECT_ERROR(_code) || SCH_MERGE_TASK_NETWORK_ERR((_task), (_code), (_rspLen))))
#define SCH_NEED_RETRY(_msgType, _code) ((SCH_NETWORK_ERR(_code) && SCH_REDIRECT_MSGTYPE(_msgType)) || (_code) == TSDB_CODE_SCH_TIMEOUT_ERROR) #define SCH_NEED_RETRY(_msgType, _code) ((SCH_NETWORK_ERR(_code) && SCH_REDIRECT_MSGTYPE(_msgType)) || (_code) == TSDB_CODE_SCH_TIMEOUT_ERROR)
...@@ -368,6 +368,8 @@ extern SSchedulerMgmt schMgmt; ...@@ -368,6 +368,8 @@ extern SSchedulerMgmt schMgmt;
qError("QID:0x%" PRIx64 ",TID:0x%" PRIx64 ",EID:%d " param, pJob->queryId, SCH_TASK_ID(pTask), SCH_TASK_EID(pTask),__VA_ARGS__) qError("QID:0x%" PRIx64 ",TID:0x%" PRIx64 ",EID:%d " param, pJob->queryId, SCH_TASK_ID(pTask), SCH_TASK_EID(pTask),__VA_ARGS__)
#define SCH_TASK_DLOG(param, ...) \ #define SCH_TASK_DLOG(param, ...) \
qDebug("QID:0x%" PRIx64 ",TID:0x%" PRIx64 ",EID:%d " param, pJob->queryId, SCH_TASK_ID(pTask), SCH_TASK_EID(pTask),__VA_ARGS__) qDebug("QID:0x%" PRIx64 ",TID:0x%" PRIx64 ",EID:%d " param, pJob->queryId, SCH_TASK_ID(pTask), SCH_TASK_EID(pTask),__VA_ARGS__)
#define SCH_TASK_TLOG(param, ...) \
qTrace("QID:0x%" PRIx64 ",TID:0x%" PRIx64 ",EID:%d " param, pJob->queryId, SCH_TASK_ID(pTask), SCH_TASK_EID(pTask),__VA_ARGS__)
#define SCH_TASK_DLOGL(param, ...) \ #define SCH_TASK_DLOGL(param, ...) \
qDebugL("QID:0x%" PRIx64 ",TID:0x%" PRIx64 ",EID:%d " param, pJob->queryId, SCH_TASK_ID(pTask), SCH_TASK_EID(pTask),__VA_ARGS__) qDebugL("QID:0x%" PRIx64 ",TID:0x%" PRIx64 ",EID:%d " param, pJob->queryId, SCH_TASK_ID(pTask), SCH_TASK_EID(pTask),__VA_ARGS__)
#define SCH_TASK_WLOG(param, ...) \ #define SCH_TASK_WLOG(param, ...) \
...@@ -441,7 +443,7 @@ void schFreeRpcCtx(SRpcCtx *pCtx); ...@@ -441,7 +443,7 @@ void schFreeRpcCtx(SRpcCtx *pCtx);
int32_t schGetCallbackFp(int32_t msgType, __async_send_cb_fn_t *fp); int32_t schGetCallbackFp(int32_t msgType, __async_send_cb_fn_t *fp);
bool schJobNeedToStop(SSchJob *pJob, int8_t *pStatus); bool schJobNeedToStop(SSchJob *pJob, int8_t *pStatus);
int32_t schProcessOnTaskSuccess(SSchJob *pJob, SSchTask *pTask); int32_t schProcessOnTaskSuccess(SSchJob *pJob, SSchTask *pTask);
int32_t schSaveJobQueryRes(SSchJob *pJob, SQueryTableRsp *rsp); int32_t schSaveJobExecRes(SSchJob *pJob, SQueryTableRsp *rsp);
int32_t schProcessOnExplainDone(SSchJob *pJob, SSchTask *pTask, SRetrieveTableRsp *pRsp); int32_t schProcessOnExplainDone(SSchJob *pJob, SSchTask *pTask, SRetrieveTableRsp *pRsp);
void schProcessOnDataFetched(SSchJob *job); void schProcessOnDataFetched(SSchJob *job);
int32_t schGetTaskInJob(SSchJob *pJob, uint64_t taskId, SSchTask **pTask); int32_t schGetTaskInJob(SSchJob *pJob, uint64_t taskId, SSchTask **pTask);
...@@ -492,7 +494,7 @@ int32_t schSwitchTaskCandidateAddr(SSchJob *pJob, SSchTask *pTask); ...@@ -492,7 +494,7 @@ int32_t schSwitchTaskCandidateAddr(SSchJob *pJob, SSchTask *pTask);
void schDirectPostJobRes(SSchedulerReq* pReq, int32_t errCode); void schDirectPostJobRes(SSchedulerReq* pReq, int32_t errCode);
int32_t schHandleJobFailure(SSchJob *pJob, int32_t errCode); int32_t schHandleJobFailure(SSchJob *pJob, int32_t errCode);
int32_t schHandleJobDrop(SSchJob *pJob, int32_t errCode); int32_t schHandleJobDrop(SSchJob *pJob, int32_t errCode);
bool schChkCurrentOp(SSchJob *pJob, int32_t op, bool sync); bool schChkCurrentOp(SSchJob *pJob, int32_t op, int8_t sync);
extern SSchDebug gSCHDebug; extern SSchDebug gSCHDebug;
......
...@@ -110,7 +110,7 @@ int32_t schUpdateJobStatus(SSchJob *pJob, int8_t newStatus) { ...@@ -110,7 +110,7 @@ int32_t schUpdateJobStatus(SSchJob *pJob, int8_t newStatus) {
break; break;
case JOB_TASK_STATUS_PART_SUCC: case JOB_TASK_STATUS_PART_SUCC:
if (newStatus != JOB_TASK_STATUS_FAIL && newStatus != JOB_TASK_STATUS_SUCC && if (newStatus != JOB_TASK_STATUS_FAIL && newStatus != JOB_TASK_STATUS_SUCC &&
newStatus != JOB_TASK_STATUS_DROP) { newStatus != JOB_TASK_STATUS_DROP && newStatus != JOB_TASK_STATUS_EXEC) {
SCH_ERR_JRET(TSDB_CODE_QRY_APP_ERROR); SCH_ERR_JRET(TSDB_CODE_QRY_APP_ERROR);
} }
...@@ -389,13 +389,18 @@ int32_t schDumpJobExecRes(SSchJob* pJob, SExecResult* pRes) { ...@@ -389,13 +389,18 @@ int32_t schDumpJobExecRes(SSchJob* pJob, SExecResult* pRes) {
int32_t schDumpJobFetchRes(SSchJob* pJob, void** pData) { int32_t schDumpJobFetchRes(SSchJob* pJob, void** pData) {
int32_t code = 0; int32_t code = 0;
if (pJob->resData && ((SRetrieveTableRsp *)pJob->resData)->completed) {
SCH_ERR_RET(schSwitchJobStatus(pJob, JOB_TASK_STATUS_SUCC, NULL)); SCH_LOCK(SCH_WRITE, &pJob->resLock);
pJob->fetched = true;
if (pJob->fetchRes && ((SRetrieveTableRsp *)pJob->fetchRes)->completed) {
SCH_ERR_JRET(schSwitchJobStatus(pJob, JOB_TASK_STATUS_SUCC, NULL));
} }
while (true) { while (true) {
*pData = atomic_load_ptr(&pJob->resData); *pData = atomic_load_ptr(&pJob->fetchRes);
if (*pData != atomic_val_compare_exchange_ptr(&pJob->resData, *pData, NULL)) { if (*pData != atomic_val_compare_exchange_ptr(&pJob->fetchRes, *pData, NULL)) {
continue; continue;
} }
...@@ -414,7 +419,11 @@ int32_t schDumpJobFetchRes(SSchJob* pJob, void** pData) { ...@@ -414,7 +419,11 @@ int32_t schDumpJobFetchRes(SSchJob* pJob, void** pData) {
SCH_JOB_DLOG("fetch done, totalRows:%d", pJob->resNumOfRows); SCH_JOB_DLOG("fetch done, totalRows:%d", pJob->resNumOfRows);
return TSDB_CODE_SUCCESS; _return:
SCH_UNLOCK(SCH_WRITE, &pJob->resLock);
return code;
} }
int32_t schNotifyUserExecRes(SSchJob* pJob) { int32_t schNotifyUserExecRes(SSchJob* pJob) {
...@@ -512,8 +521,12 @@ int32_t schHandleJobDrop(SSchJob *pJob, int32_t errCode) { ...@@ -512,8 +521,12 @@ int32_t schHandleJobDrop(SSchJob *pJob, int32_t errCode) {
} }
int32_t schProcessOnJobPartialSuccess(SSchJob *pJob) { int32_t schProcessOnJobPartialSuccess(SSchJob *pJob) {
schPostJobRes(pJob, SCH_OP_EXEC); if (schChkCurrentOp(pJob, SCH_OP_FETCH, -1)) {
SCH_ERR_RET(schLaunchFetchTask(pJob));
} else {
schPostJobRes(pJob, 0);
}
return TSDB_CODE_SUCCESS; return TSDB_CODE_SUCCESS;
} }
...@@ -526,7 +539,7 @@ int32_t schProcessOnExplainDone(SSchJob *pJob, SSchTask *pTask, SRetrieveTableRs ...@@ -526,7 +539,7 @@ int32_t schProcessOnExplainDone(SSchJob *pJob, SSchTask *pTask, SRetrieveTableRs
SCH_TASK_DLOG("got explain rsp, rows:%d, complete:%d", htonl(pRsp->numOfRows), pRsp->completed); SCH_TASK_DLOG("got explain rsp, rows:%d, complete:%d", htonl(pRsp->numOfRows), pRsp->completed);
atomic_store_32(&pJob->resNumOfRows, htonl(pRsp->numOfRows)); atomic_store_32(&pJob->resNumOfRows, htonl(pRsp->numOfRows));
atomic_store_ptr(&pJob->resData, pRsp); atomic_store_ptr(&pJob->fetchRes, pRsp);
SCH_SET_TASK_STATUS(pTask, JOB_TASK_STATUS_SUCC); SCH_SET_TASK_STATUS(pTask, JOB_TASK_STATUS_SUCC);
...@@ -561,7 +574,7 @@ int32_t schLaunchJobLowerLevel(SSchJob *pJob, SSchTask *pTask) { ...@@ -561,7 +574,7 @@ int32_t schLaunchJobLowerLevel(SSchJob *pJob, SSchTask *pTask) {
return TSDB_CODE_SUCCESS; return TSDB_CODE_SUCCESS;
} }
int32_t schSaveJobQueryRes(SSchJob *pJob, SQueryTableRsp *rsp) { int32_t schSaveJobExecRes(SSchJob *pJob, SQueryTableRsp *rsp) {
if (rsp->tbFName[0]) { if (rsp->tbFName[0]) {
SCH_LOCK(SCH_WRITE, &pJob->resLock); SCH_LOCK(SCH_WRITE, &pJob->resLock);
...@@ -600,7 +613,7 @@ int32_t schGetTaskInJob(SSchJob *pJob, uint64_t taskId, SSchTask **pTask) { ...@@ -600,7 +613,7 @@ int32_t schGetTaskInJob(SSchJob *pJob, uint64_t taskId, SSchTask **pTask) {
int32_t schLaunchJob(SSchJob *pJob) { int32_t schLaunchJob(SSchJob *pJob) {
if (EXPLAIN_MODE_STATIC == pJob->attr.explainMode) { if (EXPLAIN_MODE_STATIC == pJob->attr.explainMode) {
SCH_ERR_RET(qExecStaticExplain(pJob->pDag, (SRetrieveTableRsp **)&pJob->resData)); SCH_ERR_RET(qExecStaticExplain(pJob->pDag, (SRetrieveTableRsp **)&pJob->fetchRes));
SCH_ERR_RET(schSwitchJobStatus(pJob, JOB_TASK_STATUS_PART_SUCC, NULL)); SCH_ERR_RET(schSwitchJobStatus(pJob, JOB_TASK_STATUS_PART_SUCC, NULL));
} else { } else {
SSchLevel *level = taosArrayGet(pJob->levels, pJob->levelIdx); SSchLevel *level = taosArrayGet(pJob->levels, pJob->levelIdx);
...@@ -661,7 +674,7 @@ void schFreeJobImpl(void *job) { ...@@ -661,7 +674,7 @@ void schFreeJobImpl(void *job) {
qDestroyQueryPlan(pJob->pDag); qDestroyQueryPlan(pJob->pDag);
taosMemoryFreeClear(pJob->userRes.execRes); taosMemoryFreeClear(pJob->userRes.execRes);
taosMemoryFreeClear(pJob->resData); taosMemoryFreeClear(pJob->fetchRes);
taosMemoryFree(pJob); taosMemoryFree(pJob);
int32_t jobNum = atomic_sub_fetch_32(&schMgmt.jobNum, 1); int32_t jobNum = atomic_sub_fetch_32(&schMgmt.jobNum, 1);
...@@ -795,9 +808,14 @@ void schDirectPostJobRes(SSchedulerReq* pReq, int32_t errCode) { ...@@ -795,9 +808,14 @@ void schDirectPostJobRes(SSchedulerReq* pReq, int32_t errCode) {
} }
} }
bool schChkCurrentOp(SSchJob *pJob, int32_t op, bool sync) { bool schChkCurrentOp(SSchJob *pJob, int32_t op, int8_t sync) {
bool r = false;
SCH_LOCK(SCH_READ, &pJob->opStatus.lock); SCH_LOCK(SCH_READ, &pJob->opStatus.lock);
bool r = (pJob->opStatus.op == op) && (pJob->opStatus.syncReq == sync); if (sync >= 0) {
r = (pJob->opStatus.op == op) && (pJob->opStatus.syncReq == sync);
} else {
r = (pJob->opStatus.op == op);
}
SCH_UNLOCK(SCH_READ, &pJob->opStatus.lock); SCH_UNLOCK(SCH_READ, &pJob->opStatus.lock);
return r; return r;
......
...@@ -256,7 +256,7 @@ int32_t schHandleResponseMsg(SSchJob *pJob, SSchTask *pTask, int32_t execId, SDa ...@@ -256,7 +256,7 @@ int32_t schHandleResponseMsg(SSchJob *pJob, SSchTask *pTask, int32_t execId, SDa
SCH_ERR_JRET(rsp->code); SCH_ERR_JRET(rsp->code);
SCH_ERR_JRET(schSaveJobQueryRes(pJob, rsp)); SCH_ERR_JRET(schSaveJobExecRes(pJob, rsp));
atomic_add_fetch_32(&pJob->resNumOfRows, rsp->affectedRows); atomic_add_fetch_32(&pJob->resNumOfRows, rsp->affectedRows);
...@@ -277,8 +277,8 @@ int32_t schHandleResponseMsg(SSchJob *pJob, SSchTask *pTask, int32_t execId, SDa ...@@ -277,8 +277,8 @@ int32_t schHandleResponseMsg(SSchJob *pJob, SSchTask *pTask, int32_t execId, SDa
SCH_ERR_JRET(TSDB_CODE_QRY_INVALID_INPUT); SCH_ERR_JRET(TSDB_CODE_QRY_INVALID_INPUT);
} }
if (pJob->resData) { if (pJob->fetchRes) {
SCH_TASK_ELOG("explain result is already generated, res:%p", pJob->resData); SCH_TASK_ELOG("explain result is already generated, res:%p", pJob->fetchRes);
SCH_ERR_JRET(TSDB_CODE_SCH_STATUS_ERROR); SCH_ERR_JRET(TSDB_CODE_SCH_STATUS_ERROR);
} }
...@@ -325,13 +325,13 @@ int32_t schHandleResponseMsg(SSchJob *pJob, SSchTask *pTask, int32_t execId, SDa ...@@ -325,13 +325,13 @@ int32_t schHandleResponseMsg(SSchJob *pJob, SSchTask *pTask, int32_t execId, SDa
return TSDB_CODE_SUCCESS; return TSDB_CODE_SUCCESS;
} }
if (pJob->resData) { if (pJob->fetchRes) {
SCH_TASK_ELOG("got fetch rsp while res already exists, res:%p", pJob->resData); SCH_TASK_ELOG("got fetch rsp while res already exists, res:%p", pJob->fetchRes);
taosMemoryFreeClear(rsp); taosMemoryFreeClear(rsp);
SCH_ERR_JRET(TSDB_CODE_SCH_STATUS_ERROR); SCH_ERR_JRET(TSDB_CODE_SCH_STATUS_ERROR);
} }
atomic_store_ptr(&pJob->resData, rsp); atomic_store_ptr(&pJob->fetchRes, rsp);
atomic_add_fetch_32(&pJob->resNumOfRows, htonl(rsp->numOfRows)); atomic_add_fetch_32(&pJob->resNumOfRows, htonl(rsp->numOfRows));
if (rsp->completed) { if (rsp->completed) {
...@@ -1010,6 +1010,7 @@ int32_t schBuildAndSendMsg(SSchJob *pJob, SSchTask *pTask, SQueryNodeAddr *addr, ...@@ -1010,6 +1010,7 @@ int32_t schBuildAndSendMsg(SSchJob *pJob, SSchTask *pTask, SQueryNodeAddr *addr,
memcpy(pMsg->msg + len, pTask->msg, pTask->msgLen); memcpy(pMsg->msg + len, pTask->msg, pTask->msgLen);
persistHandle = true; persistHandle = true;
SCH_SET_TASK_HANDLE(pTask, rpcAllocHandle());
break; break;
} }
case TDMT_SCH_FETCH: case TDMT_SCH_FETCH:
......
...@@ -47,10 +47,10 @@ void schFreeTask(SSchJob *pJob, SSchTask *pTask) { ...@@ -47,10 +47,10 @@ void schFreeTask(SSchJob *pJob, SSchTask *pTask) {
void schInitTaskRetryTimes(SSchJob *pJob, SSchTask *pTask, SSchLevel *pLevel) { void schInitTaskRetryTimes(SSchJob *pJob, SSchTask *pTask, SSchLevel *pLevel) {
if (SCH_IS_DATA_BIND_TASK(pTask) || (!SCH_IS_QUERY_JOB(pJob)) || (SCH_ALL != schMgmt.cfg.schPolicy)) { if (SCH_IS_DATA_BIND_TASK(pTask) || (!SCH_IS_QUERY_JOB(pJob)) || (SCH_ALL != schMgmt.cfg.schPolicy)) {
pTask->maxRetryTimes = SCH_MAX_CANDIDATE_EP_NUM; pTask->maxRetryTimes = SCH_DEFAULT_MAX_RETRY_NUM;
} else { } else {
int32_t nodeNum = taosArrayGetSize(pJob->nodeList); int32_t nodeNum = taosArrayGetSize(pJob->nodeList);
pTask->maxRetryTimes = TMAX(nodeNum, SCH_MAX_CANDIDATE_EP_NUM); pTask->maxRetryTimes = TMAX(nodeNum, SCH_DEFAULT_MAX_RETRY_NUM);
} }
pTask->maxExecTimes = pTask->maxRetryTimes * (pLevel->level + 1); pTask->maxExecTimes = pTask->maxRetryTimes * (pLevel->level + 1);
...@@ -64,11 +64,11 @@ int32_t schInitTask(SSchJob *pJob, SSchTask *pTask, SSubplan *pPlan, SSchLevel * ...@@ -64,11 +64,11 @@ int32_t schInitTask(SSchJob *pJob, SSchTask *pTask, SSubplan *pPlan, SSchLevel *
pTask->execId = -1; pTask->execId = -1;
pTask->timeoutUsec = SCH_DEFAULT_TASK_TIMEOUT_USEC; pTask->timeoutUsec = SCH_DEFAULT_TASK_TIMEOUT_USEC;
pTask->taskId = schGenTaskId(); pTask->taskId = schGenTaskId();
pTask->execNodes =
taosHashInit(SCH_MAX_CANDIDATE_EP_NUM, taosGetDefaultHashFunction(TSDB_DATA_TYPE_INT), true, HASH_NO_LOCK);
schInitTaskRetryTimes(pJob, pTask, pLevel); schInitTaskRetryTimes(pJob, pTask, pLevel);
pTask->execNodes =
taosHashInit(pTask->maxExecTimes, taosGetDefaultHashFunction(TSDB_DATA_TYPE_INT), true, HASH_NO_LOCK);
pTask->profile.execTime = taosArrayInit(pTask->maxExecTimes, sizeof(int64_t)); pTask->profile.execTime = taosArrayInit(pTask->maxExecTimes, sizeof(int64_t));
if (NULL == pTask->execNodes || NULL == pTask->profile.execTime) { if (NULL == pTask->execNodes || NULL == pTask->profile.execTime) {
SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY); SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY);
...@@ -125,8 +125,8 @@ int32_t schDropTaskExecNode(SSchJob *pJob, SSchTask *pTask, void *handle, int32_ ...@@ -125,8 +125,8 @@ int32_t schDropTaskExecNode(SSchJob *pJob, SSchTask *pTask, void *handle, int32_
SCH_TASK_DLOG("execId %d removed from execNodeList", execId); SCH_TASK_DLOG("execId %d removed from execNodeList", execId);
} }
if (execId != pTask->execId) { // ignore it if ((execId != pTask->execId) || pTask->waitRetry) { // ignore it
SCH_TASK_DLOG("execId %d is not current execId %d", execId, pTask->execId); SCH_TASK_DLOG("execId %d is already not current execId %d, waitRetry %d", execId, pTask->execId, pTask->waitRetry);
SCH_ERR_RET(TSDB_CODE_SCH_IGNORE_ERROR); SCH_ERR_RET(TSDB_CODE_SCH_IGNORE_ERROR);
} }
...@@ -138,7 +138,17 @@ int32_t schUpdateTaskExecNode(SSchJob *pJob, SSchTask *pTask, void *handle, int3 ...@@ -138,7 +138,17 @@ int32_t schUpdateTaskExecNode(SSchJob *pJob, SSchTask *pTask, void *handle, int3
return TSDB_CODE_SUCCESS; return TSDB_CODE_SUCCESS;
} }
if ((execId != pTask->execId) || pTask->waitRetry) { // ignore it
SCH_TASK_DLOG("handle not updated since execId %d is already not current execId %d, waitRetry %d", execId, pTask->execId, pTask->waitRetry);
return TSDB_CODE_SUCCESS;
}
SSchNodeInfo *nodeInfo = taosHashGet(pTask->execNodes, &execId, sizeof(execId)); SSchNodeInfo *nodeInfo = taosHashGet(pTask->execNodes, &execId, sizeof(execId));
if (NULL == nodeInfo) { // ignore it
SCH_TASK_DLOG("handle not updated since execId %d already not exist, current execId %d, waitRetry %d", execId, pTask->execId, pTask->waitRetry);
return TSDB_CODE_SUCCESS;
}
nodeInfo->handle = handle; nodeInfo->handle = handle;
SCH_TASK_DLOG("handle updated to %p for execId %d", handle, execId); SCH_TASK_DLOG("handle updated to %p for execId %d", handle, execId);
...@@ -335,6 +345,7 @@ int32_t schDoTaskRedirect(SSchJob *pJob, SSchTask *pTask, SDataBuf *pData, int32 ...@@ -335,6 +345,7 @@ int32_t schDoTaskRedirect(SSchJob *pJob, SSchTask *pTask, SDataBuf *pData, int32
return TSDB_CODE_SUCCESS; return TSDB_CODE_SUCCESS;
} }
pTask->waitRetry = true;
schDropTaskOnExecNode(pJob, pTask); schDropTaskOnExecNode(pJob, pTask);
taosHashClear(pTask->execNodes); taosHashClear(pTask->execNodes);
SCH_ERR_JRET(schRemoveTaskFromExecList(pJob, pTask)); SCH_ERR_JRET(schRemoveTaskFromExecList(pJob, pTask));
...@@ -394,6 +405,18 @@ _return: ...@@ -394,6 +405,18 @@ _return:
int32_t schHandleRedirect(SSchJob *pJob, SSchTask *pTask, SDataBuf *pData, int32_t rspCode) { int32_t schHandleRedirect(SSchJob *pJob, SSchTask *pTask, SDataBuf *pData, int32_t rspCode) {
int32_t code = 0; int32_t code = 0;
if (JOB_TASK_STATUS_PART_SUCC == pJob->status) {
SCH_LOCK(SCH_WRITE, &pJob->resLock);
if (pJob->fetched) {
SCH_UNLOCK(SCH_WRITE, &pJob->resLock);
SCH_TASK_ELOG("already fetched while got error %s", tstrerror(rspCode));
SCH_ERR_RET(rspCode);
}
SCH_UNLOCK(SCH_WRITE, &pJob->resLock);
schUpdateJobStatus(pJob, JOB_TASK_STATUS_EXEC);
}
if (SCH_IS_DATA_BIND_TASK(pTask)) { if (SCH_IS_DATA_BIND_TASK(pTask)) {
if (NULL == pData->pEpSet) { if (NULL == pData->pEpSet) {
SCH_TASK_ELOG("no epset updated while got error %s", tstrerror(rspCode)); SCH_TASK_ELOG("no epset updated while got error %s", tstrerror(rspCode));
...@@ -591,7 +614,7 @@ int32_t schSetAddrsFromNodeList(SSchJob *pJob, SSchTask *pTask) { ...@@ -591,7 +614,7 @@ int32_t schSetAddrsFromNodeList(SSchJob *pJob, SSchTask *pTask) {
if (pJob->nodeList) { if (pJob->nodeList) {
nodeNum = taosArrayGetSize(pJob->nodeList); nodeNum = taosArrayGetSize(pJob->nodeList);
for (int32_t i = 0; i < nodeNum && addNum < SCH_MAX_CANDIDATE_EP_NUM; ++i) { for (int32_t i = 0; i < nodeNum; ++i) {
SQueryNodeLoad *nload = taosArrayGet(pJob->nodeList, i); SQueryNodeLoad *nload = taosArrayGet(pJob->nodeList, i);
SQueryNodeAddr *naddr = &nload->addr; SQueryNodeAddr *naddr = &nload->addr;
...@@ -600,8 +623,8 @@ int32_t schSetAddrsFromNodeList(SSchJob *pJob, SSchTask *pTask) { ...@@ -600,8 +623,8 @@ int32_t schSetAddrsFromNodeList(SSchJob *pJob, SSchTask *pTask) {
SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY);
} }
SCH_TASK_DLOG("set %dth candidate addr, id %d, fqdn:%s, port:%d", i, naddr->nodeId, SCH_GET_CUR_EP(naddr)->fqdn, SCH_TASK_TLOG("set %dth candidate addr, id %d, inUse:%d/%d, fqdn:%s, port:%d", i, naddr->nodeId, naddr->epSet.inUse, naddr->epSet.numOfEps,
SCH_GET_CUR_EP(naddr)->port); SCH_GET_CUR_EP(naddr)->fqdn, SCH_GET_CUR_EP(naddr)->port);
++addNum; ++addNum;
} }
...@@ -621,9 +644,9 @@ int32_t schSetTaskCandidateAddrs(SSchJob *pJob, SSchTask *pTask) { ...@@ -621,9 +644,9 @@ int32_t schSetTaskCandidateAddrs(SSchJob *pJob, SSchTask *pTask) {
} }
pTask->candidateIdx = 0; pTask->candidateIdx = 0;
pTask->candidateAddrs = taosArrayInit(SCH_MAX_CANDIDATE_EP_NUM, sizeof(SQueryNodeAddr)); pTask->candidateAddrs = taosArrayInit(SCHEDULE_DEFAULT_MAX_NODE_NUM, sizeof(SQueryNodeAddr));
if (NULL == pTask->candidateAddrs) { if (NULL == pTask->candidateAddrs) {
SCH_TASK_ELOG("taosArrayInit %d condidate addrs failed", SCH_MAX_CANDIDATE_EP_NUM); SCH_TASK_ELOG("taosArrayInit %d condidate addrs failed", SCHEDULE_DEFAULT_MAX_NODE_NUM);
SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY); SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY);
} }
...@@ -790,6 +813,7 @@ int32_t schLaunchTaskImpl(SSchJob *pJob, SSchTask *pTask) { ...@@ -790,6 +813,7 @@ int32_t schLaunchTaskImpl(SSchJob *pJob, SSchTask *pTask) {
atomic_add_fetch_32(&pTask->level->taskLaunchedNum, 1); atomic_add_fetch_32(&pTask->level->taskLaunchedNum, 1);
pTask->execId++; pTask->execId++;
pTask->retryTimes++; pTask->retryTimes++;
pTask->waitRetry = false;
SCH_TASK_DLOG("start to launch task, execId %d, retry %d", pTask->execId, pTask->retryTimes); SCH_TASK_DLOG("start to launch task, execId %d, retry %d", pTask->execId, pTask->retryTimes);
...@@ -885,9 +909,9 @@ void schDropTaskInHashList(SSchJob *pJob, SHashObj *list) { ...@@ -885,9 +909,9 @@ void schDropTaskInHashList(SSchJob *pJob, SHashObj *list) {
int32_t schLaunchFetchTask(SSchJob *pJob) { int32_t schLaunchFetchTask(SSchJob *pJob) {
int32_t code = 0; int32_t code = 0;
void *resData = atomic_load_ptr(&pJob->resData); void *fetchRes = atomic_load_ptr(&pJob->fetchRes);
if (resData) { if (fetchRes) {
SCH_JOB_DLOG("res already fetched, res:%p", resData); SCH_JOB_DLOG("res already fetched, res:%p", fetchRes);
return TSDB_CODE_SUCCESS; return TSDB_CODE_SUCCESS;
} }
......
...@@ -170,7 +170,7 @@ void rpcSetDefaultAddr(void* thandle, const char* ip, const char* fqdn) { ...@@ -170,7 +170,7 @@ void rpcSetDefaultAddr(void* thandle, const char* ip, const char* fqdn) {
transSetDefaultAddr(thandle, ip, fqdn); transSetDefaultAddr(thandle, ip, fqdn);
} }
int64_t rpcAllocHandle() { return transAllocHandle(); } void* rpcAllocHandle() { return (void*)transAllocHandle(); }
int32_t rpcInit() { int32_t rpcInit() {
transInit(); transInit();
......
/** Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com> /** Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
* *
* This program is free software: you can use, redistribute, and/or modify * This program is free software: you can use, redistribute, and/or modify
* it under the terms of the GNU Affero General Public License, version 3 * it under the terms of the GNU Affero General Public License, version 3
...@@ -809,7 +810,7 @@ SCliConn* cliGetConn(SCliMsg* pMsg, SCliThrd* pThrd, bool* ignore) { ...@@ -809,7 +810,7 @@ SCliConn* cliGetConn(SCliMsg* pMsg, SCliThrd* pThrd, bool* ignore) {
conn = exh->handle; conn = exh->handle;
if (conn == NULL) { if (conn == NULL) {
conn = getConnFromPool(pThrd->pool, EPSET_GET_INUSE_IP(&pCtx->epSet), EPSET_GET_INUSE_PORT(&pCtx->epSet)); conn = getConnFromPool(pThrd->pool, EPSET_GET_INUSE_IP(&pCtx->epSet), EPSET_GET_INUSE_PORT(&pCtx->epSet));
*ignore = (conn && 0 == specifyConnRef(conn, true, refId)) ? false : true; if (conn != NULL) specifyConnRef(conn, true, refId);
} }
transReleaseExHandle(transGetRefMgt(), refId); transReleaseExHandle(transGetRefMgt(), refId);
} }
...@@ -849,14 +850,20 @@ void cliHandleReq(SCliMsg* pMsg, SCliThrd* pThrd) { ...@@ -849,14 +850,20 @@ void cliHandleReq(SCliMsg* pMsg, SCliThrd* pThrd) {
bool ignore = false; bool ignore = false;
SCliConn* conn = cliGetConn(pMsg, pThrd, &ignore); SCliConn* conn = cliGetConn(pMsg, pThrd, &ignore);
if (ignore == true) { if (ignore == true) {
tError("ignore msg");
return; return;
} }
if (conn != NULL) { if (conn != NULL) {
transCtxMerge(&conn->ctx, &pCtx->appCtx); transCtxMerge(&conn->ctx, &pCtx->appCtx);
transQueuePush(&conn->cliMsgs, pMsg); transQueuePush(&conn->cliMsgs, pMsg);
cliSend(conn); cliSend(conn);
} else { } else {
conn = cliCreateConn(pThrd); conn = cliCreateConn(pThrd);
int64_t refId = (int64_t)pMsg->msg.info.handle;
if (refId != 0) specifyConnRef(conn, true, refId);
transCtxMerge(&conn->ctx, &pCtx->appCtx); transCtxMerge(&conn->ctx, &pCtx->appCtx);
transQueuePush(&conn->cliMsgs, pMsg); transQueuePush(&conn->cliMsgs, pMsg);
...@@ -1206,7 +1213,13 @@ SCliThrd* transGetWorkThrd(STrans* trans, int64_t handle, bool* validHandle) { ...@@ -1206,7 +1213,13 @@ SCliThrd* transGetWorkThrd(STrans* trans, int64_t handle, bool* validHandle) {
if (idx < 0) return NULL; if (idx < 0) return NULL;
return ((SCliObj*)trans->tcphandle)->pThreadObj[idx]; return ((SCliObj*)trans->tcphandle)->pThreadObj[idx];
} }
return transGetWorkThrdFromHandle(handle, validHandle); SCliThrd* pThrd = transGetWorkThrdFromHandle(handle, validHandle);
if (*validHandle == true && pThrd == NULL) {
int idx = cliRBChoseIdx(trans);
if (idx < 0) return NULL;
pThrd = ((SCliObj*)trans->tcphandle)->pThreadObj[idx];
}
return pThrd;
} }
void transReleaseCliHandle(void* handle) { void transReleaseCliHandle(void* handle) {
int idx = -1; int idx = -1;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册