Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
taosdata
TDengine
提交
19e5d637
T
TDengine
项目概览
taosdata
/
TDengine
1 年多 前同步成功
通知
1185
Star
22016
Fork
4786
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
T
TDengine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
19e5d637
编写于
3月 31, 2023
作者:
D
dapan1121
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix: add job retry
上级
01c2eb09
变更
5
隐藏空白更改
内联
并排
Showing
5 changed file
with
109 addition
and
17 deletion
+109
-17
source/libs/qworker/src/qwDbg.c
source/libs/qworker/src/qwDbg.c
+14
-3
source/libs/scheduler/inc/schInt.h
source/libs/scheduler/inc/schInt.h
+13
-3
source/libs/scheduler/src/schJob.c
source/libs/scheduler/src/schJob.c
+49
-4
source/libs/scheduler/src/schRemote.c
source/libs/scheduler/src/schRemote.c
+8
-1
source/libs/scheduler/src/schTask.c
source/libs/scheduler/src/schTask.c
+25
-6
未找到文件。
source/libs/qworker/src/qwDbg.c
浏览文件 @
19e5d637
...
...
@@ -259,15 +259,26 @@ void qwDbgSimulateDead(QW_FPARAMS_DEF, SQWTaskCtx *ctx, bool *rsped) {
static
int32_t
ignoreTime
=
0
;
if
(
++
ignoreTime
>
10
&&
0
==
taosRand
()
%
9
)
{
if
(
ctx
->
msgType
==
TDMT_SCH_FETCH
)
{
qwBuildAndSendErrorRsp
(
TDMT_SCH_LINK_BROKEN
,
&
ctx
->
ctrlConnInfo
,
TSDB_CODE_RPC_BROKEN_LINK
);
qwBuildAndSendErrorRsp
(
ctx
->
msgType
+
1
,
&
ctx
->
dataConnInfo
,
TSDB_CODE_QRY_TASK_CTX_NOT_EXIST
);
*
rsped
=
true
;
taosSsleep
(
3
);
return
;
}
#if 0
SRpcHandleInfo *pConn =
((ctx->msgType == TDMT_SCH_FETCH || ctx->msgType == TDMT_SCH_MERGE_FETCH) ? &ctx->dataConnInfo
:
&
ctx
->
ctrlConnInfo
);
: &ctx->ctrlConnInfo);
qwBuildAndSendErrorRsp(ctx->msgType + 1, pConn, TSDB_CODE_RPC_BROKEN_LINK);
qwBuildAndSendDropMsg(QW_FPARAMS(), pConn);
*rsped = true;
return;
#endif
}
}
...
...
source/libs/scheduler/inc/schInt.h
浏览文件 @
19e5d637
...
...
@@ -299,6 +299,7 @@ typedef struct SSchJob {
SExecResult
execRes
;
void
*
fetchRes
;
// TODO free it or not
bool
fetched
;
bool
noMoreRetry
;
int64_t
resNumOfRows
;
// from int32_t to int64_t
SSchResInfo
userRes
;
char
*
sql
;
...
...
@@ -333,8 +334,8 @@ extern SSchedulerMgmt schMgmt;
((_job)->attr.localExec && SCH_IS_QUERY_JOB(_job) && (!SCH_IS_INSERT_JOB(_job)) && \
(!SCH_IS_DATA_BIND_QRY_TASK(_task)))
#define SCH_UPDATE_REDICT_CODE(job, _code) atomic_val_compare_exchange_32(&((job)->redirectCode), 0, _code)
#define SCH_GET_REDICT_CODE(job, _code) (((!NO_RET_REDIRECT_ERROR(_code)) || (job)->redirectCode == 0) ? (_code) : (job)->redirectCode)
#define SCH_UPDATE_REDI
RE
CT_CODE(job, _code) atomic_val_compare_exchange_32(&((job)->redirectCode), 0, _code)
#define SCH_GET_REDI
RE
CT_CODE(job, _code) (((!NO_RET_REDIRECT_ERROR(_code)) || (job)->redirectCode == 0) ? (_code) : (job)->redirectCode)
#define SCH_SET_TASK_STATUS(task, st) atomic_store_8(&(task)->status, st)
#define SCH_GET_TASK_STATUS(task) atomic_load_8(&(task)->status)
...
...
@@ -398,7 +399,7 @@ extern SSchedulerMgmt schMgmt;
(NEED_SCHEDULER_REDIRECT_ERROR(_code) || SCH_LOW_LEVEL_NETWORK_ERR((_job), (_task), (_code)) || SCH_TASK_RETRY_NETWORK_ERR((_task), (_code))))
#define SCH_TASK_NEED_RETRY(_msgType, _code) \
((SCH_REDIRECT_MSGTYPE(_msgType) && SCH_NETWORK_ERR(_code)) || (_code) == TSDB_CODE_SCH_TIMEOUT_ERROR)
#define SCH_IS_LEVEL_UNFINISHED(_level) ((_level)->taskLaunchedNum < (_level)->taskNum)
#define SCH_GET_CUR_EP(_addr) (&(_addr)->epSet.eps[(_addr)->epSet.inUse])
...
...
@@ -522,6 +523,11 @@ extern SSchedulerMgmt schMgmt;
} \
} while (0)
#define SCH_RESET_JOB_LEVEL_IDX(_job) do { \
(_job)->levelIdx = (_job)->levelNum - 1; \
SCH_JOB_DLOG("set job levelIdx to %d", (_job)->levelIdx); \
} while (0)
void
schDeregisterTaskHb
(
SSchJob
*
pJob
,
SSchTask
*
pTask
);
void
schCleanClusterHb
(
void
*
pTrans
);
int32_t
schLaunchTask
(
SSchJob
*
job
,
SSchTask
*
task
);
...
...
@@ -603,6 +609,10 @@ int32_t schHandleJobDrop(SSchJob *pJob, int32_t errCode);
bool
schChkCurrentOp
(
SSchJob
*
pJob
,
int32_t
op
,
int8_t
sync
);
int32_t
schProcessFetchRsp
(
SSchJob
*
pJob
,
SSchTask
*
pTask
,
char
*
msg
,
int32_t
rspCode
);
int32_t
schProcessExplainRsp
(
SSchJob
*
pJob
,
SSchTask
*
pTask
,
SExplainRsp
*
rsp
);
int32_t
schHandleJobRetry
(
SSchJob
*
pJob
,
SSchTask
*
pTask
,
SDataBuf
*
pMsg
,
int32_t
rspCode
);
int32_t
schChkResetJobRetry
(
SSchJob
*
pJob
,
int32_t
rspCode
);
void
schResetTaskForRetry
(
SSchJob
*
pJob
,
SSchTask
*
pTask
);
int32_t
schChkUpdateRedirectCtx
(
SSchJob
*
pJob
,
SSchTask
*
pTask
,
SEpSet
*
pEpSet
,
int32_t
rspCode
);
extern
SSchDebug
gSCHDebug
;
...
...
source/libs/scheduler/src/schJob.c
浏览文件 @
19e5d637
...
...
@@ -296,7 +296,7 @@ int32_t schValidateAndBuildJob(SQueryPlan *pDag, SSchJob *pJob) {
}
pJob
->
levelNum
=
levelNum
;
pJob
->
levelIdx
=
levelNum
-
1
;
SCH_RESET_JOB_LEVEL_IDX
(
pJob
)
;
SSchLevel
level
=
{
0
};
SNodeListNode
*
plans
=
NULL
;
...
...
@@ -828,8 +828,9 @@ int32_t schChkResetJobRetry(SSchJob *pJob, int32_t rspCode) {
SCH_LOCK
(
SCH_WRITE
,
&
pJob
->
resLock
);
if
(
pJob
->
fetched
)
{
SCH_UNLOCK
(
SCH_WRITE
,
&
pJob
->
resLock
);
SCH_TASK_ELOG
(
"already fetched while got error %s"
,
tstrerror
(
rspCode
));
SCH_ERR_JRET
(
rspCode
);
pJob
->
noMoreRetry
=
true
;
SCH_JOB_ELOG
(
"already fetched while got error %s"
,
tstrerror
(
rspCode
));
SCH_ERR_RET
(
rspCode
);
}
SCH_UNLOCK
(
SCH_WRITE
,
&
pJob
->
resLock
);
...
...
@@ -839,12 +840,56 @@ int32_t schChkResetJobRetry(SSchJob *pJob, int32_t rspCode) {
return
TSDB_CODE_SUCCESS
;
}
int32_t
schResetJobForRetry
(
SSchJob
*
pJob
,
int32_t
rspCode
)
{
SCH_ERR_RET
(
schChkResetJobRetry
(
pJob
,
rspCode
));
int32_t
numOfLevels
=
taosArrayGetSize
(
pJob
->
levels
);
for
(
int32_t
i
=
0
;
i
<
numOfLevels
;
++
i
)
{
SSchLevel
*
pLevel
=
taosArrayGet
(
pJob
->
levels
,
i
);
pLevel
->
taskExecDoneNum
=
0
;
pLevel
->
taskLaunchedNum
=
0
;
int32_t
numOfTasks
=
taosArrayGetSize
(
pLevel
->
subTasks
);
for
(
int32_t
j
=
0
;
j
<
numOfTasks
;
++
j
)
{
SSchTask
*
pTask
=
taosArrayGet
(
pLevel
->
subTasks
,
j
);
SCH_LOCK_TASK
(
pTask
);
SCH_ERR_RET
(
schChkUpdateRedirectCtx
(
pJob
,
pTask
,
NULL
,
rspCode
));
qClearSubplanExecutionNode
(
pTask
->
plan
);
schResetTaskForRetry
(
pJob
,
pTask
);
SCH_UNLOCK_TASK
(
pTask
);
}
}
SCH_RESET_JOB_LEVEL_IDX
(
pJob
);
return
TSDB_CODE_SUCCESS
;
}
int32_t
schHandleJobRetry
(
SSchJob
*
pJob
,
SSchTask
*
pTask
,
SDataBuf
*
pMsg
,
int32_t
rspCode
)
{
int32_t
code
=
0
;
SCH_ERR_JRET
(
schChkResetJobRetry
(
pJob
,
rspCode
));
taosMemoryFreeClear
(
pMsg
->
pData
);
taosMemoryFreeClear
(
pMsg
->
pEpSet
);
SCH_UNLOCK_TASK
(
pTask
);
SCH_TASK_DLOG
(
"start to redirect all job tasks cause of error: %s"
,
tstrerror
(
rspCode
));
SCH_ERR_JRET
(
schResetJobForRetry
(
pJob
,
rspCode
));
SCH_ERR_JRET
(
schLaunchJob
(
pJob
));
SCH_LOCK_TASK
(
pTask
);
SCH_RET
(
code
);
_return:
SCH_LOCK_TASK
(
pTask
);
SCH_RET
(
schProcessOnTaskFailure
(
pJob
,
pTask
,
code
));
}
bool
schChkCurrentOp
(
SSchJob
*
pJob
,
int32_t
op
,
int8_t
sync
)
{
...
...
source/libs/scheduler/src/schRemote.c
浏览文件 @
19e5d637
...
...
@@ -416,6 +416,7 @@ _return:
int32_t
schHandleResponseMsg
(
SSchJob
*
pJob
,
SSchTask
*
pTask
,
int32_t
execId
,
SDataBuf
*
pMsg
,
int32_t
rspCode
)
{
int32_t
code
=
0
;
int32_t
msgType
=
pMsg
->
msgType
;
char
*
msg
=
pMsg
->
pData
;
bool
dropExecNode
=
(
msgType
==
TDMT_SCH_LINK_BROKEN
||
SCH_NETWORK_ERR
(
rspCode
));
if
(
SCH_IS_QUERY_JOB
(
pJob
))
{
...
...
@@ -426,7 +427,7 @@ int32_t schHandleResponseMsg(SSchJob *pJob, SSchTask *pTask, int32_t execId, SDa
int32_t
reqType
=
IsReq
(
pMsg
)
?
pMsg
->
msgType
:
(
pMsg
->
msgType
-
1
);
if
(
SCH_JOB_NEED_RETRY
(
pJob
,
pTask
,
reqType
,
rspCode
))
{
SCH_RET
(
schHandleJobRetry
());
SCH_RET
(
schHandleJobRetry
(
pJob
,
pTask
,
(
SDataBuf
*
)
pMsg
,
rspCode
));
}
else
if
(
SCH_TASKSET_NEED_RETRY
(
pJob
,
pTask
,
reqType
,
rspCode
))
{
SCH_RET
(
schHandleTaskSetRetry
(
pJob
,
pTask
,
(
SDataBuf
*
)
pMsg
,
rspCode
));
}
...
...
@@ -434,6 +435,12 @@ int32_t schHandleResponseMsg(SSchJob *pJob, SSchTask *pTask, int32_t execId, SDa
pTask
->
redirectCtx
.
inRedirect
=
false
;
SCH_RET
(
schProcessResponseMsg
(
pJob
,
pTask
,
execId
,
pMsg
,
rspCode
));
_return:
taosMemoryFreeClear
(
msg
);
SCH_RET
(
schProcessOnTaskFailure
(
pJob
,
pTask
,
code
));
}
int32_t
schHandleCallback
(
void
*
param
,
SDataBuf
*
pMsg
,
int32_t
rspCode
)
{
int32_t
code
=
0
;
...
...
source/libs/scheduler/src/schTask.c
浏览文件 @
19e5d637
...
...
@@ -378,7 +378,8 @@ int32_t schChkUpdateRedirectCtx(SSchJob *pJob, SSchTask *pTask, SEpSet *pEpSet,
if
(
lastTime
>
tsMaxRetryWaitTime
)
{
SCH_TASK_DLOG
(
"task no more redirect retry since timeout, now:%"
PRId64
", start:%"
PRId64
", max:%d, total:%d"
,
nowTs
,
pCtx
->
startTs
,
tsMaxRetryWaitTime
,
pCtx
->
totalTimes
);
SCH_ERR_RET
(
SCH_GET_REDICT_CODE
(
pJob
,
rspCode
));
pJob
->
noMoreRetry
=
true
;
SCH_ERR_RET
(
SCH_GET_REDIRECT_CODE
(
pJob
,
rspCode
));
}
pCtx
->
periodMs
*=
tsRedirectFactor
;
...
...
@@ -408,16 +409,16 @@ void schResetTaskForRetry(SSchJob *pJob, SSchTask *pTask) {
pTask
->
waitRetry
=
true
;
schDropTaskOnExecNode
(
pJob
,
pTask
);
if
(
pTask
->
delayTimer
)
{
taosTmrStopA
(
&
pTask
->
delayTimer
);
}
taosHashClear
(
pTask
->
execNodes
);
schRemoveTaskFromExecList
(
pJob
,
pTask
);
schDeregisterTaskHb
(
pJob
,
pTask
);
atomic_sub_fetch_32
(
&
pTask
->
level
->
taskLaunchedNum
,
1
);
if
(
SCH_TASK_EXEC_DONE
(
pTask
))
{
atomic_sub_fetch_32
(
&
pTask
->
level
->
taskExecDoneNum
,
1
);
}
taosMemoryFreeClear
(
pTask
->
msg
);
pTask
->
msgLen
=
0
;
pTask
->
lastMsgType
=
0
;
pTask
->
childReady
=
0
;
memset
(
&
pTask
->
succeedAddr
,
0
,
sizeof
(
pTask
->
succeedAddr
));
}
...
...
@@ -427,7 +428,7 @@ int32_t schDoTaskRedirect(SSchJob *pJob, SSchTask *pTask, SDataBuf *pData, int32
SCH_TASK_DLOG
(
"task will be redirected now, status:%s, code:%s"
,
SCH_GET_TASK_STATUS_STR
(
pTask
),
tstrerror
(
rspCode
));
if
(
!
NO_RET_REDIRECT_ERROR
(
rspCode
))
{
SCH_UPDATE_REDICT_CODE
(
pJob
,
rspCode
);
SCH_UPDATE_REDI
RE
CT_CODE
(
pJob
,
rspCode
);
}
SCH_ERR_JRET
(
schChkUpdateRedirectCtx
(
pJob
,
pTask
,
pData
?
pData
->
pEpSet
:
NULL
,
rspCode
));
...
...
@@ -496,6 +497,17 @@ int32_t schHandleTaskSetRetry(SSchJob *pJob, SSchTask *pTask, SDataBuf *pData, i
}
}
SCH_TASK_DLOG
(
"start to redirect current task set cause of error: %s"
,
tstrerror
(
rspCode
));
for
(
int32_t
i
=
0
;
i
<
pJob
->
levelNum
;
++
i
)
{
SSchLevel
*
pLevel
=
taosArrayGet
(
pJob
->
levels
,
i
);
pLevel
->
taskExecDoneNum
=
0
;
pLevel
->
taskLaunchedNum
=
0
;
}
SCH_RESET_JOB_LEVEL_IDX
(
pJob
);
code
=
schDoTaskRedirect
(
pJob
,
pTask
,
pData
,
rspCode
);
taosMemoryFreeClear
(
pData
->
pData
);
...
...
@@ -609,6 +621,13 @@ int32_t schMoveTaskToExecList(SSchJob *pJob, SSchTask *pTask, bool *moved) {
*/
int32_t
schTaskCheckSetRetry
(
SSchJob
*
pJob
,
SSchTask
*
pTask
,
int32_t
errCode
,
bool
*
needRetry
)
{
if
(
pJob
->
noMoreRetry
)
{
*
needRetry
=
false
;
SCH_TASK_DLOG
(
"task no more retry since job no more retry, retryTimes:%d/%d"
,
pTask
->
retryTimes
,
pTask
->
maxRetryTimes
);
return
TSDB_CODE_SUCCESS
;
}
if
(
TSDB_CODE_SCH_TIMEOUT_ERROR
==
errCode
)
{
pTask
->
maxExecTimes
++
;
pTask
->
maxRetryTimes
++
;
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录