提交 e733f806 编写于 作者: D dapan1121

enh: enhance query redirect

上级 017452f4
......@@ -325,11 +325,13 @@ int32_t updateQnodeList(SAppInstInfo* pInfo, SArray* pNodeList) {
if (pInfo->pQnodeList) {
taosArrayDestroy(pInfo->pQnodeList);
pInfo->pQnodeList = NULL;
tscDebug("QnodeList cleared in cluster 0x%" PRIx64, pInfo->clusterId);
}
if (pNodeList) {
pInfo->pQnodeList = taosArrayDup(pNodeList);
taosArraySort(pInfo->pQnodeList, compareQueryNodeLoad);
tscDebug("QnodeList updated in cluster 0x%" PRIx64 ", num:%d", pInfo->clusterId, taosArrayGetSize(pInfo->pQnodeList));
}
taosThreadMutexUnlock(&pInfo->qnodeMutex);
......
......@@ -381,11 +381,13 @@ static int32_t tDeserializeSClientHbRsp(SDecoder *pDecoder, SClientHbRsp *pRsp)
if (pQnodeNum > 0) {
pRsp->query->pQnodeList = taosArrayInit(pQnodeNum, sizeof(SQueryNodeLoad));
if (NULL == pRsp->query->pQnodeList) return -1;
for (int32_t i = 0; i < pQnodeNum; ++i) {
SQueryNodeLoad load = {0};
if (tDecodeSQueryNodeLoad(pDecoder, &load) < 0) return -1;
taosArrayPush(pRsp->query->pQnodeList, &load);
}
}
}
int32_t kvNum = 0;
if (tDecodeI32(pDecoder, &kvNum) < 0) return -1;
......
......@@ -64,16 +64,11 @@ int32_t qndGetLoad(SQnode *pQnode, SQnodeLoad *pLoad) {
return 0;
}
int32_t qnodeCrash = 1;
int32_t qndPreprocessQueryMsg(SQnode *pQnode, SRpcMsg * pMsg) {
if (TDMT_SCH_QUERY != pMsg->msgType && TDMT_SCH_MERGE_QUERY != pMsg->msgType) {
return 0;
}
if (qnodeCrash) {
ASSERT(0);
}
return qWorkerPreprocessQueryMsg(pQnode->pQuery, pMsg);
}
......
......@@ -35,7 +35,6 @@ extern "C" {
#define SCH_DEFAULT_TASK_TIMEOUT_USEC 10000000
#define SCH_MAX_TASK_TIMEOUT_USEC 60000000
#define SCH_TASK_MAX_EXEC_TIMES 8
#define SCH_MAX_CANDIDATE_EP_NUM TSDB_MAX_REPLICA
enum {
......@@ -180,7 +179,7 @@ typedef struct SSchLevel {
typedef struct SSchTaskProfile {
int64_t startTs;
int64_t execUseTime[SCH_TASK_MAX_EXEC_TIMES];
int64_t* execTime;
int64_t waitTime;
int64_t endTs;
} SSchTaskProfile;
......@@ -260,33 +259,7 @@ typedef struct SSchJob {
extern SSchedulerMgmt schMgmt;
#define SCH_LOG_TASK_START_TS(_task) \
do { \
int64_t us = taosGetTimestampUs(); \
int32_t idx = (_task)->execId % SCH_TASK_MAX_EXEC_TIMES; \
(_task)->profile.execUseTime[idx] = us; \
if (0 == (_task)->execId) { \
(_task)->profile.startTs = us; \
} \
} while (0)
#define SCH_LOG_TASK_WAIT_TS(_task) \
do { \
int64_t us = taosGetTimestampUs(); \
int32_t idx = (_task)->execId % SCH_TASK_MAX_EXEC_TIMES; \
(_task)->profile.waitTime += us - (_task)->profile.execUseTime[idx]; \
} while (0)
#define SCH_LOG_TASK_END_TS(_task) \
do { \
int64_t us = taosGetTimestampUs(); \
int32_t idx = (_task)->execId % SCH_TASK_MAX_EXEC_TIMES; \
(_task)->profile.execUseTime[idx] = us - (_task)->profile.execUseTime[idx]; \
(_task)->profile.endTs = us; \
} while (0)
#define SCH_TASK_TIMEOUT(_task) ((taosGetTimestampUs() - (_task)->profile.execUseTime[(_task)->execId % SCH_TASK_MAX_EXEC_TIMES]) > (_task)->timeoutUsec)
#define SCH_TASK_TIMEOUT(_task) ((taosGetTimestampUs() - (_task)->profile.execTime[(_task)->execId % (_task)->maxExecTimes]) > (_task)->timeoutUsec)
#define SCH_TASK_READY_FOR_LAUNCH(readyNum, task) ((readyNum) >= taosArrayGetSize((task)->children))
......@@ -320,6 +293,7 @@ extern SSchedulerMgmt schMgmt;
#define SCH_TASK_NEED_FLOW_CTRL(_job, _task) (SCH_IS_DATA_BIND_QRY_TASK(_task) && SCH_JOB_NEED_FLOW_CTRL(_job) && SCH_IS_LEVEL_UNFINISHED((_task)->level))
#define SCH_FETCH_TYPE(_pSrcTask) (SCH_IS_DATA_BIND_QRY_TASK(_pSrcTask) ? TDMT_SCH_FETCH : TDMT_SCH_MERGE_FETCH)
#define SCH_TASK_NEED_FETCH(_task) ((_task)->plan->subplanType != SUBPLAN_TYPE_MODIFY)
#define SCH_TASK_MAX_EXEC_TIMES(_levelIdx, _levelNum) (SCH_MAX_CANDIDATE_EP_NUM * ((_levelNum) - (_levelIdx)))
#define SCH_SET_JOB_TYPE(_job, type) do { if ((type) != SUBPLAN_TYPE_MODIFY) { (_job)->attr.queryJob = true; } } while (0)
#define SCH_IS_QUERY_JOB(_job) ((_job)->attr.queryJob)
......@@ -328,7 +302,7 @@ extern SSchedulerMgmt schMgmt;
#define SCH_JOB_NEED_DROP(_job) (SCH_IS_QUERY_JOB(_job))
#define SCH_IS_EXPLAIN_JOB(_job) (EXPLAIN_MODE_ANALYZE == (_job)->attr.explainMode)
#define SCH_NETWORK_ERR(_code) ((_code) == TSDB_CODE_RPC_BROKEN_LINK || (_code) == TSDB_CODE_RPC_NETWORK_UNAVAIL)
#define SCH_MERGE_TASK_NETWORK_ERR(_task, _code, _len) (SCH_NETWORK_ERR(_code) && ((_len) > 0 || SCH_IS_DATA_BIND_TASK(_task)))
#define SCH_MERGE_TASK_NETWORK_ERR(_task, _code, _len) (SCH_NETWORK_ERR(_code) && (((_len) > 0) || (!SCH_IS_DATA_BIND_TASK(_task))))
#define SCH_REDIRECT_MSGTYPE(_msgType) ((_msgType) == TDMT_SCH_QUERY || (_msgType) == TDMT_SCH_MERGE_QUERY || (_msgType) == TDMT_SCH_FETCH || (_msgType) == TDMT_SCH_MERGE_FETCH)
#define SCH_TASK_NEED_REDIRECT(_task, _msgType, _code, _rspLen) (SCH_REDIRECT_MSGTYPE(_msgType) && (NEED_SCHEDULER_REDIRECT_ERROR(_code) || SCH_MERGE_TASK_NETWORK_ERR((_task), (_code), (_rspLen))))
#define SCH_NEED_RETRY(_msgType, _code) ((SCH_NETWORK_ERR(_code) && SCH_REDIRECT_MSGTYPE(_msgType)) || (_code) == TSDB_CODE_SCH_TIMEOUT_ERROR)
......@@ -338,6 +312,33 @@ extern SSchedulerMgmt schMgmt;
#define SCH_SWITCH_EPSET(_addr) ((_addr)->epSet.inUse = ((_addr)->epSet.inUse + 1) % (_addr)->epSet.numOfEps)
#define SCH_TASK_NUM_OF_EPS(_addr) ((_addr)->epSet.numOfEps)
#define SCH_LOG_TASK_START_TS(_task) \
do { \
int64_t us = taosGetTimestampUs(); \
int32_t idx = (_task)->execId % (_task)->maxExecTimes; \
(_task)->profile.execTime[idx] = us; \
if (0 == (_task)->execId) { \
(_task)->profile.startTs = us; \
} \
} while (0)
#define SCH_LOG_TASK_WAIT_TS(_task) \
do { \
int64_t us = taosGetTimestampUs(); \
int32_t idx = (_task)->execId % (_task)->maxExecTimes; \
(_task)->profile.waitTime += us - (_task)->profile.execTime[idx]; \
} while (0)
#define SCH_LOG_TASK_END_TS(_task) \
do { \
int64_t us = taosGetTimestampUs(); \
int32_t idx = (_task)->execId % (_task)->maxExecTimes; \
(_task)->profile.execTime[idx] = us - (_task)->profile.execTime[idx]; \
(_task)->profile.endTs = us; \
} while (0)
#define SCH_JOB_ELOG(param, ...) qError("QID:0x%" PRIx64 " " param, pJob->queryId, __VA_ARGS__)
#define SCH_JOB_DLOG(param, ...) qDebug("QID:0x%" PRIx64 " " param, pJob->queryId, __VA_ARGS__)
......@@ -431,7 +432,8 @@ void schFreeTask(SSchJob *pJob, SSchTask *pTask);
void schDropTaskInHashList(SSchJob *pJob, SHashObj *list);
int32_t schLaunchLevelTasks(SSchJob *pJob, SSchLevel *level);
int32_t schGetTaskFromList(SHashObj *pTaskList, uint64_t taskId, SSchTask **pTask);
int32_t schInitTask(SSchJob *pJob, SSchTask *pTask, SSubplan *pPlan, SSchLevel *pLevel);
int32_t schInitTask(SSchJob *pJob, SSchTask *pTask, SSubplan *pPlan, SSchLevel *pLevel, int32_t levelNum);
int32_t schSwitchTaskCandidateAddr(SSchJob *pJob, SSchTask *pTask);
#ifdef __cplusplus
......
......@@ -337,7 +337,7 @@ int32_t schValidateAndBuildJob(SQueryPlan *pDag, SSchJob *pJob) {
SCH_SET_JOB_TYPE(pJob, plan->subplanType);
SSchTask task = {0};
SCH_ERR_JRET(schInitTask(pJob, &task, plan, pLevel));
SCH_ERR_JRET(schInitTask(pJob, &task, plan, pLevel, levelNum));
SSchTask *pTask = taosArrayPush(pLevel->subTasks, &task);
if (NULL == pTask) {
......
......@@ -46,21 +46,31 @@ void schFreeTask(SSchJob *pJob, SSchTask *pTask) {
}
int32_t schInitTask(SSchJob *pJob, SSchTask *pTask, SSubplan *pPlan, SSchLevel *pLevel) {
int32_t schInitTask(SSchJob *pJob, SSchTask *pTask, SSubplan *pPlan, SSchLevel *pLevel, int32_t levelNum) {
int32_t code = 0;
pTask->plan = pPlan;
pTask->level = pLevel;
pTask->execId = -1;
pTask->maxExecTimes = SCH_TASK_MAX_EXEC_TIMES;
pTask->maxExecTimes = SCH_TASK_MAX_EXEC_TIMES(pLevel->level, levelNum);
pTask->timeoutUsec = SCH_DEFAULT_TASK_TIMEOUT_USEC;
SCH_SET_TASK_STATUS(pTask, JOB_TASK_STATUS_INIT);
pTask->taskId = schGenTaskId();
pTask->execNodes = taosHashInit(SCH_MAX_CANDIDATE_EP_NUM, taosGetDefaultHashFunction(TSDB_DATA_TYPE_INT), true, HASH_NO_LOCK);
if (NULL == pTask->execNodes) {
SCH_TASK_ELOG("taosHashInit %d execNodes failed", SCH_MAX_CANDIDATE_EP_NUM);
SCH_ERR_RET(TSDB_CODE_QRY_OUT_OF_MEMORY);
pTask->profile.execTime = taosMemoryCalloc(pTask->maxExecTimes, sizeof(int64_t));
if (NULL == pTask->execNodes || NULL == pTask->profile.execTime) {
SCH_ERR_JRET(TSDB_CODE_QRY_OUT_OF_MEMORY);
}
SCH_SET_TASK_STATUS(pTask, JOB_TASK_STATUS_INIT);
return TSDB_CODE_SUCCESS;
_return:
taosMemoryFreeClear(pTask->profile.execTime);
taosHashCleanup(pTask->execNodes);
SCH_RET(code);
}
int32_t schRecordTaskSucceedNode(SSchJob *pJob, SSchTask *pTask) {
......@@ -338,6 +348,11 @@ int32_t schDoTaskRedirect(SSchJob *pJob, SSchTask *pTask, SDataBuf* pData, int32
qClearSubplanExecutionNode(pTask->plan);
// Note: current error task and upper level merge task
if ((pData && 0 == pData->len) || NULL == pData) {
SCH_ERR_JRET(schSwitchTaskCandidateAddr(pJob, pTask));
}
SCH_SET_TASK_STATUS(pTask, JOB_TASK_STATUS_INIT);
int32_t childrenNum = taosArrayGetSize(pTask->children);
......@@ -531,10 +546,7 @@ int32_t schHandleTaskRetry(SSchJob *pJob, SSchTask *pTask) {
if (SCH_IS_DATA_BIND_TASK(pTask)) {
SCH_SWITCH_EPSET(&pTask->plan->execNode);
} else {
int32_t candidateNum = taosArrayGetSize(pTask->candidateAddrs);
if (++pTask->candidateIdx >= candidateNum) {
pTask->candidateIdx = 0;
}
SCH_ERR_RET(schSwitchTaskCandidateAddr(pJob, pTask));
}
SCH_ERR_RET(schLaunchTask(pJob, pTask));
......@@ -633,6 +645,16 @@ int32_t schUpdateTaskCandidateAddr(SSchJob *pJob, SSchTask *pTask, SEpSet* pEpSe
return TSDB_CODE_SUCCESS;
}
int32_t schSwitchTaskCandidateAddr(SSchJob *pJob, SSchTask *pTask) {
int32_t candidateNum = taosArrayGetSize(pTask->candidateAddrs);
if (++pTask->candidateIdx >= candidateNum) {
pTask->candidateIdx = 0;
}
SCH_TASK_DLOG("switch task candiateIdx to %d", pTask->candidateIdx);
return TSDB_CODE_SUCCESS;
}
int32_t schRemoveTaskFromExecList(SSchJob *pJob, SSchTask *pTask) {
int32_t code = taosHashRemove(pJob->execTasks, &pTask->taskId, sizeof(pTask->taskId));
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册