Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
taosdata
TDengine
提交
b6e60082
T
TDengine
项目概览
taosdata
/
TDengine
接近 2 年 前同步成功
通知
1192
Star
22018
Fork
4786
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
T
TDengine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
b6e60082
编写于
6月 03, 2022
作者:
D
dapan1121
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
rescheduler timeout task
上级
f6c6083a
变更
9
隐藏空白更改
内联
并排
Showing
9 changed files
with
101 additions
and
52 deletions
+101
-52
include/util/taoserror.h
include/util/taoserror.h
+2
-1
source/client/src/clientEnv.c
source/client/src/clientEnv.c
+0
-1
source/client/src/clientMain.c
source/client/src/clientMain.c
+2
-1
source/libs/qworker/src/qwMsg.c
source/libs/qworker/src/qwMsg.c
+1
-1
source/libs/scheduler/inc/schedulerInt.h
source/libs/scheduler/inc/schedulerInt.h
+24
-8
source/libs/scheduler/src/schJob.c
source/libs/scheduler/src/schJob.c
+62
-32
source/libs/scheduler/src/schRemote.c
source/libs/scheduler/src/schRemote.c
+9
-4
source/libs/scheduler/src/scheduler.c
source/libs/scheduler/src/scheduler.c
+0
-4
source/util/src/terror.c
source/util/src/terror.c
+1
-0
未找到文件。
include/util/taoserror.h
浏览文件 @
b6e60082
...
...
@@ -564,7 +564,8 @@ int32_t* taosGetErrno();
#define TSDB_CODE_SCH_STATUS_ERROR TAOS_DEF_ERROR_CODE(0, 0x2501)
#define TSDB_CODE_SCH_INTERNAL_ERROR TAOS_DEF_ERROR_CODE(0, 0x2502)
#define TSDB_CODE_SCH_IGNORE_ERROR TAOS_DEF_ERROR_CODE(0, 0x2503)
#define TSDB_CODE_QW_MSG_ERROR TAOS_DEF_ERROR_CODE(0, 0x2504)
#define TSDB_CODE_SCH_TIMEOUT_ERROR TAOS_DEF_ERROR_CODE(0, 0x2504)
#define TSDB_CODE_QW_MSG_ERROR TAOS_DEF_ERROR_CODE(0, 0x2550)
//parser
#define TSDB_CODE_PAR_SYNTAX_ERROR TAOS_DEF_ERROR_CODE(0, 0x2600)
...
...
source/client/src/clientEnv.c
浏览文件 @
b6e60082
...
...
@@ -131,7 +131,6 @@ void destroyTscObj(void *pObj) {
hbDeregisterConn
(
pTscObj
->
pAppInfo
->
pAppHbMgr
,
connKey
);
atomic_sub_fetch_64
(
&
pTscObj
->
pAppInfo
->
numOfConns
,
1
);
closeAllRequests
(
pTscObj
->
pRequests
);
schedulerStopTransport
(
pTscObj
->
pAppInfo
->
pTransporter
);
tscDebug
(
"connObj 0x%"
PRIx64
" destroyed, totalConn:%"
PRId64
,
pTscObj
->
id
,
pTscObj
->
pAppInfo
->
numOfConns
);
taosThreadMutexDestroy
(
&
pTscObj
->
mutex
);
taosMemoryFreeClear
(
pTscObj
);
...
...
source/client/src/clientMain.c
浏览文件 @
b6e60082
...
...
@@ -66,10 +66,11 @@ void taos_cleanup(void) {
hbMgrCleanUp
();
rpcCleanup
();
catalogDestroy
();
schedulerDestroy
();
rpcCleanup
();
tscInfo
(
"all local resources released"
);
taosCleanupCfg
();
taosCloseLog
();
...
...
source/libs/qworker/src/qwMsg.c
浏览文件 @
b6e60082
...
...
@@ -274,7 +274,7 @@ int32_t qWorkerPreprocessQueryMsg(void *qWorkerMgmt, SRpcMsg *pMsg) {
uint64_t
tId
=
msg
->
taskId
;
int64_t
rId
=
msg
->
refId
;
SQWMsg
qwMsg
=
{.
node
=
node
,
.
msg
=
msg
->
msg
+
msg
->
sqlLen
,
.
msgLen
=
msg
->
phyLen
,
.
connInfo
=
pMsg
->
info
};
SQWMsg
qwMsg
=
{.
msg
=
msg
->
msg
+
msg
->
sqlLen
,
.
msgLen
=
msg
->
phyLen
,
.
connInfo
=
pMsg
->
info
};
QW_SCH_TASK_DLOG
(
"prerocessQuery start, handle:%p"
,
pMsg
->
info
.
handle
);
QW_ERR_RET
(
qwPrerocessQuery
(
QW_FPARAMS
(),
&
qwMsg
));
...
...
source/libs/scheduler/inc/schedulerInt.h
浏览文件 @
b6e60082
...
...
@@ -163,9 +163,11 @@ typedef struct SSchTaskProfile {
typedef
struct
SSchTask
{
uint64_t
taskId
;
// task id
int32_t
execIdx
;
// task current execute try index
SRWLatch
lock
;
// task lock
int32_t
maxExecTimes
;
// task may exec times
int32_t
execIdx
;
// task current execute try index
SSchLevel
*
level
;
// level
SRWLatch
planLock
;
// task update plan lock
SSubplan
*
plan
;
// subplan
char
*
msg
;
// operator tree
int32_t
msgLen
;
// msg length
...
...
@@ -230,26 +232,39 @@ typedef struct SSchJob {
extern
SSchedulerMgmt
schMgmt
;
#define SCH_LOG_TASK_START_TS(_task) \
#define SCH_LOG_TASK_START_TS(_task) \
do { \
int64_t us = taosGetTimestampUs(); \
int32_t idx = (_task)->execIdx % SCH_TASK_MAX_EXEC_TIMES; \
(_task)->profile.execUseTime[idx] = us; \
if (0 == (_task)->execIdx) { \
(_task)->profile.startTs = us; \
} \
} while (0)
#define SCH_LOG_TASK_WAIT_TS(_task) \
do { \
int64_t us = taosGetTimestampUs(); \
(_task)->profile.tryUseTime[(_task)->execIdx] = us; \
if (0 == (_task)->execIdx) { \
(_task)->profile.startTs = us; \
} \
int32_t idx = (_task)->execIdx % SCH_TASK_MAX_EXEC_TIMES; \
(_task)->profile.waitTime += us - (_task)->profile.execUseTime[idx]; \
} while (0)
#define SCH_LOG_TASK_END_TS(_task) \
do { \
int64_t us = taosGetTimestampUs(); \
(_task)->profile.tryUseTime[(_task)->execIdx] = us - (_task)->profile.tryUseTime[(_task)->execIdx]; \
int32_t idx = (_task)->execIdx % SCH_TASK_MAX_EXEC_TIMES; \
(_task)->profile.execUseTime[idx] = us - (_task)->profile.execUseTime[idx]; \
(_task)->profile.endTs = us; \
} while (0)
#define SCH_TASK_TIMEOUT(_task) ((taosGetTimestampUs() - (_task)->profile.
tryUseTime[(_task)->execIdx]) > (_taks
)->timeoutUsec)
#define SCH_TASK_TIMEOUT(_task) ((taosGetTimestampUs() - (_task)->profile.
execUseTime[(_task)->execIdx % SCH_TASK_MAX_EXEC_TIMES]) > (_task
)->timeoutUsec)
// True once readyNum has reached the number of entries in the task's children array.
#define SCH_TASK_READY_FOR_LAUNCH(readyNum, task) ((readyNum) >= taosArrayGetSize((task)->children))
// Acquire / release the per-task lock in SCH_WRITE mode.
#define SCH_LOCK_TASK(_task) SCH_LOCK(SCH_WRITE, &(_task)->lock)
#define SCH_UNLOCK_TASK(_task) SCH_UNLOCK(SCH_WRITE, &(_task)->lock)
// Task id of _task, or -1 when _task is NULL (NULL-safe, handy for logging).
#define SCH_TASK_ID(_task) ((_task) ? (_task)->taskId : -1)
// Atomically store the last message type on the task; does nothing when _task is NULL.
#define SCH_SET_TASK_LASTMSG_TYPE(_task, _type) do { if(_task) { atomic_store_32(&(_task)->lastMsgType, _type); } } while (0)
// Atomically load the last message type of the task; evaluates to -1 when _task is NULL.
#define SCH_GET_TASK_LASTMSG_TYPE(_task) ((_task) ? atomic_load_32(&(_task)->lastMsgType) : -1)
...
...
@@ -351,6 +366,7 @@ int32_t schAsyncExecJob(void *pTrans, SArray *pNodeList, SQueryPlan *pDag, int64
int32_t
schFetchRows
(
SSchJob
*
pJob
);
int32_t
schAsyncFetchRows
(
SSchJob
*
pJob
);
int32_t
schUpdateTaskHandle
(
SSchJob
*
pJob
,
SSchTask
*
pTask
,
int32_t
msgType
,
void
*
handle
,
int32_t
execIdx
);
int32_t
schProcessOnTaskStatusRsp
(
SQueryNodeEpId
*
pEpId
,
SArray
*
pStatusList
);
#ifdef __cplusplus
...
...
source/libs/scheduler/src/schJob.c
浏览文件 @
b6e60082
...
...
@@ -29,6 +29,7 @@ int32_t schInitTask(SSchJob *pJob, SSchTask *pTask, SSubplan *pPlan, SSchLevel *
pTask
->
plan
=
pPlan
;
pTask
->
level
=
pLevel
;
pTask
->
execIdx
=
-
1
;
pTask
->
maxExecTimes
=
SCH_TASK_MAX_EXEC_TIMES
;
pTask
->
timeoutUsec
=
SCH_DEFAULT_TASK_TIMEOUT_USEC
;
SCH_SET_TASK_STATUS
(
pTask
,
JOB_TASK_STATUS_NOT_START
);
pTask
->
taskId
=
schGenTaskId
();
...
...
@@ -142,6 +143,7 @@ void schDeregisterTaskHb(SSchJob *pJob, SSchTask *pTask) {
SSchHbTrans
*
hb
=
taosHashGet
(
schMgmt
.
hbConnections
,
&
epId
,
sizeof
(
SQueryNodeEpId
));
if
(
NULL
==
hb
)
{
SCH_TASK_ELOG
(
"nodeId %d fqdn %s port %d not in hb connections"
,
epId
.
nodeId
,
epId
.
ep
.
fqdn
,
epId
.
ep
.
port
);
return
;
}
atomic_sub_fetch_64
(
&
hb
->
taskNum
,
1
);
...
...
@@ -360,7 +362,7 @@ int32_t schRecordTaskSucceedNode(SSchJob *pJob, SSchTask *pTask) {
int32_t
schAppendTaskExecNode
(
SSchJob
*
pJob
,
SSchTask
*
pTask
,
SQueryNodeAddr
*
addr
,
int32_t
execIdx
)
{
SSchNodeInfo
nodeInfo
=
{.
addr
=
*
addr
,
.
handle
=
NULL
};
if
(
NULL
==
taosHashPut
(
pTask
->
execNodes
,
&
execIdx
,
sizeof
(
execIdx
),
&
nodeInfo
,
sizeof
(
nodeInfo
)))
{
if
(
taosHashPut
(
pTask
->
execNodes
,
&
execIdx
,
sizeof
(
execIdx
),
&
nodeInfo
,
sizeof
(
nodeInfo
)))
{
SCH_TASK_ELOG
(
"taosHashPut nodeInfo to execNodes failed, errno:%d"
,
errno
);
SCH_ERR_RET
(
TSDB_CODE_QRY_OUT_OF_MEMORY
);
}
...
...
@@ -384,7 +386,7 @@ int32_t schDropTaskExecNode(SSchJob *pJob, SSchTask *pTask, void *handle, int32_
}
int32_t
schUpdateTaskExecNode
(
SSchTask
*
pTask
,
void
*
handle
,
int32_t
execIdx
)
{
if
(
taos
Array
GetSize
(
pTask
->
execNodes
)
<=
0
)
{
if
(
taos
Hash
GetSize
(
pTask
->
execNodes
)
<=
0
)
{
return
TSDB_CODE_SUCCESS
;
}
...
...
@@ -714,7 +716,17 @@ int32_t schTaskCheckSetRetry(SSchJob *pJob, SSchTask *pTask, int32_t errCode, bo
return
TSDB_CODE_SUCCESS
;
}
if
((
pTask
->
execIdx
+
1
)
>=
SCH_TASK_MAX_EXEC_TIMES
)
{
if
(
TSDB_CODE_SCH_TIMEOUT_ERROR
==
errCode
)
{
pTask
->
maxExecTimes
++
;
if
(
pTask
->
timeoutUsec
<
SCH_MAX_TASK_TIMEOUT_USEC
)
{
pTask
->
timeoutUsec
*=
2
;
if
(
pTask
->
timeoutUsec
>
SCH_MAX_TASK_TIMEOUT_USEC
)
{
pTask
->
timeoutUsec
=
SCH_MAX_TASK_TIMEOUT_USEC
;
}
}
}
if
((
pTask
->
execIdx
+
1
)
>=
pTask
->
maxExecTimes
)
{
*
needRetry
=
false
;
SCH_TASK_DLOG
(
"task no more retry since reach max try times, execIdx:%d"
,
pTask
->
execIdx
);
return
TSDB_CODE_SUCCESS
;
...
...
@@ -737,7 +749,7 @@ int32_t schTaskCheckSetRetry(SSchJob *pJob, SSchTask *pTask, int32_t errCode, bo
}
else
{
int32_t
candidateNum
=
taosArrayGetSize
(
pTask
->
candidateAddrs
);
if
((
pTask
->
candidateIdx
+
1
)
>=
candidateNum
)
{
if
((
pTask
->
candidateIdx
+
1
)
>=
candidateNum
&&
(
TSDB_CODE_SCH_TIMEOUT_ERROR
!=
errCode
)
)
{
*
needRetry
=
false
;
SCH_TASK_DLOG
(
"task no more retry since all candiates tried, candidateIdx:%d, candidateNum:%d"
,
pTask
->
candidateIdx
,
candidateNum
);
...
...
@@ -767,7 +779,10 @@ int32_t schHandleTaskRetry(SSchJob *pJob, SSchTask *pTask) {
if
(
SCH_IS_DATA_SRC_TASK
(
pTask
))
{
SCH_SWITCH_EPSET
(
&
pTask
->
plan
->
execNode
);
}
else
{
++
pTask
->
candidateIdx
;
int32_t
candidateNum
=
taosArrayGetSize
(
pTask
->
candidateAddrs
);
if
(
++
pTask
->
candidateIdx
>=
candidateNum
)
{
pTask
->
candidateIdx
=
0
;
}
}
SCH_ERR_RET
(
schLaunchTask
(
pJob
,
pTask
));
...
...
@@ -942,8 +957,12 @@ void schProcessOnDataFetched(SSchJob *job) {
int32_t
schProcessOnTaskFailure
(
SSchJob
*
pJob
,
SSchTask
*
pTask
,
int32_t
errCode
)
{
int8_t
status
=
0
;
SCH_LOG_TASK_END_TS
(
pTask
);
if
(
errCode
==
TSDB_CODE_SCH_TIMEOUT_ERROR
)
{
SCH_LOG_TASK_WAIT_TS
(
pTask
);
}
else
{
SCH_LOG_TASK_END_TS
(
pTask
);
}
if
(
schJobNeedToStop
(
pJob
,
&
status
))
{
SCH_TASK_DLOG
(
"task failed not processed cause of job status, job status:%s"
,
jobTaskStatusStr
(
status
));
SCH_RET
(
atomic_load_32
(
&
pJob
->
errCode
));
...
...
@@ -1145,12 +1164,46 @@ int32_t schProcessOnExplainDone(SSchJob *pJob, SSchTask *pTask, SRetrieveTableRs
return
TSDB_CODE_SUCCESS
;
}
/*
 * Send a TDMT_VND_DROP_TASK message to every node recorded in
 * pTask->execNodes.
 *
 * Returns early (after a debug log) when the task has no execNodes table or
 * the table is empty. The result of each schBuildAndSendMsg() call is not
 * checked, so a failed send does not stop the remaining drops.
 */
void schDropTaskOnExecNode(SSchJob *pJob, SSchTask *pTask) {
  if (NULL == pTask->execNodes) {
    SCH_TASK_DLOG("no exec address, status:%s", SCH_GET_TASK_STATUS_STR(pTask));
    return;
  }

  int32_t nodeNum = (int32_t)taosHashGetSize(pTask->execNodes);
  if (nodeNum <= 0) {
    SCH_TASK_DLOG("task has no execNodes, no need to drop it, status:%s", SCH_GET_TASK_STATUS_STR(pTask));
    return;
  }

  /* Walk the hash table; each entry carries the node address and its handle. */
  for (SSchNodeInfo *pNode = taosHashIterate(pTask->execNodes, NULL); pNode;
       pNode = taosHashIterate(pTask->execNodes, pNode)) {
    /* The drop message is sent with the handle that belongs to this node. */
    SCH_SET_TASK_HANDLE(pTask, pNode->handle);
    schBuildAndSendMsg(pJob, pTask, &pNode->addr, TDMT_VND_DROP_TASK);
  }

  SCH_TASK_DLOG("task has %d exec address", nodeNum);
}
/*
 * Force a re-run of a task by treating it as timed out.
 *
 * Data-source query tasks (SCH_IS_DATA_SRC_QRY_TASK) are left untouched.
 * For any other task, while holding the task lock: if the task is in
 * JOB_TASK_STATUS_EXECUTING and is not the job's fetch task, it is dropped
 * on its exec nodes, its execNodes table is cleared, and the failure path is
 * entered with TSDB_CODE_SCH_TIMEOUT_ERROR.
 *
 * Always returns TSDB_CODE_SUCCESS; the result of schProcessOnTaskFailure()
 * is not propagated.
 * NOTE(review): presumably the failure path decides whether to retry the
 * task -- confirm against schProcessOnTaskFailure().
 */
int32_t schRescheduleTask(SSchJob *pJob, SSchTask *pTask) {
  if (!SCH_IS_DATA_SRC_QRY_TASK(pTask)) {
    SCH_LOCK_TASK(pTask);

    if (pTask->status == JOB_TASK_STATUS_EXECUTING && pTask != pJob->fetchTask) {
      schDropTaskOnExecNode(pJob, pTask);
      taosHashClear(pTask->execNodes);
      schProcessOnTaskFailure(pJob, pTask, TSDB_CODE_SCH_TIMEOUT_ERROR);
    }

    SCH_UNLOCK_TASK(pTask);
  }

  return TSDB_CODE_SUCCESS;
}
int32_t
schProcessOnTaskStatusRsp
(
SQueryNodeEpId
*
pEpId
,
SArray
*
pStatusList
)
{
...
...
@@ -1193,6 +1246,7 @@ int32_t schProcessOnTaskStatusRsp(SQueryNodeEpId* pEpId, SArray* pStatusList) {
schReleaseJob
(
taskStatus
->
refId
);
}
return
TSDB_CODE_SUCCESS
;
}
...
...
@@ -1339,30 +1393,6 @@ int32_t schLaunchJob(SSchJob *pJob) {
return
TSDB_CODE_SUCCESS
;
}
/*
 * Send a TDMT_VND_DROP_TASK message to every node recorded in
 * pTask->execNodes.
 *
 * Returns early (after a debug log) when the task has no execNodes table or
 * the table is empty. The result of each schBuildAndSendMsg() call is not
 * checked, so a failed send does not stop the remaining drops.
 */
void schDropTaskOnExecNode(SSchJob *pJob, SSchTask *pTask) {
  if (NULL == pTask->execNodes) {
    SCH_TASK_DLOG("no exec address, status:%s", SCH_GET_TASK_STATUS_STR(pTask));
    return;
  }

  int32_t nodeNum = (int32_t)taosHashGetSize(pTask->execNodes);
  if (nodeNum <= 0) {
    SCH_TASK_DLOG("task has no execNodes, no need to drop it, status:%s", SCH_GET_TASK_STATUS_STR(pTask));
    return;
  }

  /* Walk the hash table; each entry carries the node address and its handle. */
  for (SSchNodeInfo *pNode = taosHashIterate(pTask->execNodes, NULL); pNode;
       pNode = taosHashIterate(pTask->execNodes, pNode)) {
    /* The drop message is sent with the handle that belongs to this node. */
    SCH_SET_TASK_HANDLE(pTask, pNode->handle);
    schBuildAndSendMsg(pJob, pTask, &pNode->addr, TDMT_VND_DROP_TASK);
  }

  SCH_TASK_DLOG("task has %d exec address", nodeNum);
}
void
schDropTaskInHashList
(
SSchJob
*
pJob
,
SHashObj
*
list
)
{
if
(
!
SCH_IS_NEED_DROP_JOB
(
pJob
))
{
...
...
source/libs/scheduler/src/schRemote.c
浏览文件 @
b6e60082
...
...
@@ -92,8 +92,7 @@ int32_t schHandleResponseMsg(SSchJob *pJob, SSchTask *pTask, int32_t msgType, ch
int8_t
status
=
0
;
if
(
schJobNeedToStop
(
pJob
,
&
status
))
{
SCH_TASK_ELOG
(
"rsp not processed cause of job status, job status:%s, rspCode:0x%x"
,
jobTaskStatusStr
(
status
),
rspCode
);
SCH_TASK_ELOG
(
"rsp not processed cause of job status, job status:%s, rspCode:0x%x"
,
jobTaskStatusStr
(
status
),
rspCode
);
taosMemoryFreeClear
(
msg
);
SCH_RET
(
atomic_load_32
(
&
pJob
->
errCode
));
}
...
...
@@ -344,7 +343,7 @@ int32_t schHandleResponseMsg(SSchJob *pJob, SSchTask *pTask, int32_t msgType, ch
_return:
taosMemoryFreeClear
(
msg
);
taosMemoryFreeClear
(
msg
);
SCH_RET
(
schProcessOnTaskFailure
(
pJob
,
pTask
,
code
));
}
...
...
@@ -364,6 +363,8 @@ int32_t schHandleCallback(void *param, const SDataBuf *pMsg, int32_t msgType, in
SCH_ERR_JRET
(
schGetTaskInJob
(
pJob
,
pParam
->
taskId
,
&
pTask
));
SCH_LOCK_TASK
(
pTask
);
SCH_TASK_DLOG
(
"rsp msg received, type:%s, handle:%p, code:%s"
,
TMSG_INFO
(
msgType
),
pMsg
->
handle
,
tstrerror
(
rspCode
));
if
(
pParam
->
execIdx
!=
pTask
->
execIdx
)
{
...
...
@@ -376,6 +377,10 @@ int32_t schHandleCallback(void *param, const SDataBuf *pMsg, int32_t msgType, in
SCH_ERR_JRET
(
schHandleResponseMsg
(
pJob
,
pTask
,
msgType
,
pMsg
->
pData
,
pMsg
->
len
,
rspCode
));
_return:
if
(
pTask
)
{
SCH_UNLOCK_TASK
(
pTask
);
}
if
(
pJob
)
{
schReleaseJob
(
pParam
->
refId
);
...
...
@@ -667,7 +672,7 @@ int32_t schRegisterHbConnection(SSchJob *pJob, SSchTask *pTask, SQueryNodeEpId *
}
int32_t
schBuildAndSendHbMsg
(
SQueryNodeEpId
*
nodeEpId
)
{
int32_t
schBuildAndSendHbMsg
(
SQueryNodeEpId
*
nodeEpId
,
SArray
*
taskAction
)
{
SSchedulerHbReq
req
=
{
0
};
int32_t
code
=
0
;
SRpcCtx
rpcCtx
=
{
0
};
...
...
source/libs/scheduler/src/scheduler.c
浏览文件 @
b6e60082
...
...
@@ -176,10 +176,6 @@ int32_t scheduleCancelJob(int64_t job) {
SCH_RET
(
code
);
}
void
schedulerStopTransport
(
void
*
pTrans
)
{
// CLOSE && REMOVE RELATED HB CONNECTIONS
}
void
schedulerFreeJob
(
int64_t
job
)
{
SSchJob
*
pJob
=
schAcquireJob
(
job
);
if
(
NULL
==
pJob
)
{
...
...
source/util/src/terror.c
浏览文件 @
b6e60082
...
...
@@ -444,6 +444,7 @@ TAOS_DEFINE_ERROR(TSDB_CODE_CTG_VG_META_MISMATCH, "table meta and vgroup
//scheduler
TAOS_DEFINE_ERROR
(
TSDB_CODE_SCH_STATUS_ERROR
,
"scheduler status error"
)
TAOS_DEFINE_ERROR
(
TSDB_CODE_SCH_INTERNAL_ERROR
,
"scheduler internal error"
)
TAOS_DEFINE_ERROR
(
TSDB_CODE_SCH_TIMEOUT_ERROR
,
"Task timeout"
)
TAOS_DEFINE_ERROR
(
TSDB_CODE_QW_MSG_ERROR
,
"Invalid msg order"
)
// parser
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录