Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
taosdata
TDengine
提交
a0b83d47
T
TDengine
项目概览
taosdata
/
TDengine
1 年多 前同步成功
通知
1185
Star
22016
Fork
4786
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
T
TDengine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
a0b83d47
编写于
10月 28, 2020
作者:
S
Steven Li
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Enhanced crash_gen to use SIG_KILL instead of SIG_INT when restarting services
上级
4b4e8422
变更
2
隐藏空白更改
内联
并排
Showing
2 changed file
with
43 addition
and
9 deletion
+43
-9
tests/pytest/crash_gen/crash_gen.py
tests/pytest/crash_gen/crash_gen.py
+31
-6
tests/pytest/crash_gen/service_manager.py
tests/pytest/crash_gen/service_manager.py
+12
-3
未找到文件。
tests/pytest/crash_gen/crash_gen.py
浏览文件 @
a0b83d47
...
...
@@ -1226,6 +1226,11 @@ class Task():
"To be implemeted by child classes, class name: {}"
.
format
(
self
.
__class__
.
__name__
))
def
_isServiceStable
(
self
):
if
not
gSvcMgr
:
return
True
# we don't run service, so let's assume it's stable
return
gSvcMgr
.
isStable
()
# otherwise let's examine the service
def
_isErrAcceptable
(
self
,
errno
,
msg
):
if
errno
in
[
0x05
,
# TSDB_CODE_RPC_NOT_READY
...
...
@@ -1263,7 +1268,7 @@ class Task():
return
True
elif
msg
.
find
(
"duplicated column names"
)
!=
-
1
:
# also alter table tag issues
return
True
elif
gSvcMgr
and
(
not
gSvcMgr
.
isStable
()
):
# We are managing service, and ...
elif
not
self
.
_isServiceStable
(
):
# We are managing service, and ...
Logging
.
info
(
"Ignoring error when service starting/stopping: errno = {}, msg = {}"
.
format
(
errno
,
msg
))
return
True
...
...
@@ -1641,15 +1646,35 @@ class TaskReadData(StateTransitionTask):
def
canBeginFrom
(
cls
,
state
:
AnyState
):
return
state
.
canReadData
()
# def _canRestartService(self):
# if not gSvcMgr:
# return True # always
# return gSvcMgr.isActive() # only if it's running TODO: race condition here
def
_executeInternal
(
self
,
te
:
TaskExecutor
,
wt
:
WorkerThread
):
sTable
=
self
.
_db
.
getFixedSuperTable
()
# 1 in 5 chance, simulate a broken connection.
if
random
.
randrange
(
5
)
==
0
:
# TODO: break connection in all situations
wt
.
getDbConn
().
close
()
wt
.
getDbConn
().
open
()
# 1 in 20 chance, simulate a broken connection, only if service stable (not restarting)
if
random
.
randrange
(
20
)
==
0
:
# and self._canRestartService(): # TODO: break connection in all situations
Logging
.
info
(
"Attempting to reconnect to server"
)
# TODO: change to DEBUG
try
:
wt
.
getDbConn
().
close
()
wt
.
getDbConn
().
open
()
except
ConnectionError
as
err
:
# may fail
if
not
gSvcMgr
:
Logging
.
error
(
"Failed to reconnect in client-only mode"
)
raise
# Not OK if we are running in client-only mode
if
gSvcMgr
.
isRunning
():
# may have race condition, but low prob, due to
Logging
.
error
(
"Failed to reconnect when managed server is running"
)
raise
# Not OK if we are running normally
Logging
.
info
(
"Ignoring DB reconnect error"
)
print
(
"_r"
,
end
=
""
,
flush
=
True
)
# The above might have taken a lot of time, service might be running
# by now, causing error below to be incorrectly handled due to timing issue
return
# TODO: fix server restart status race condition
dbc
=
wt
.
getDbConn
()
dbName
=
self
.
_db
.
getName
()
for
rTbName
in
sTable
.
getRegTables
(
dbc
,
dbName
):
# regular tables
...
...
tests/pytest/crash_gen/service_manager.py
浏览文件 @
a0b83d47
...
...
@@ -280,16 +280,18 @@ class TdeSubProcess:
# process still alive, let's interrupt it
print
(
"Terminate running process, send SIG_INT and wait..."
)
# sub process should end, then IPC queue should end, causing IO thread to end
self
.
subProcess
.
send_signal
(
signal
.
SIGINT
)
# sig = signal.SIGINT
sig
=
signal
.
SIGKILL
self
.
subProcess
.
send_signal
(
sig
)
# SIGINT or SIGKILL
self
.
subProcess
.
wait
(
20
)
retCode
=
self
.
subProcess
.
returncode
# should always be there
# May throw subprocess.TimeoutExpired exception above, therefore
# The process is guaranteed to have ended by now
self
.
subProcess
=
None
if
retCode
!=
0
:
# != (- signal.SIGINT):
Logging
.
error
(
"TSP.stop(): Failed to stop sub proc properly w/ SIG
_INT, retCode={}"
.
format
(
retCode
))
Logging
.
error
(
"TSP.stop(): Failed to stop sub proc properly w/ SIG
{}, retCode={}"
.
format
(
sig
,
retCode
))
else
:
Logging
.
info
(
"TSP.stop(): sub proc successfully terminated with SIG
_INT"
)
Logging
.
info
(
"TSP.stop(): sub proc successfully terminated with SIG
{}"
.
format
(
sig
)
)
return
-
retCode
class
ServiceManager
:
...
...
@@ -395,6 +397,13 @@ class ServiceManager:
return
True
return
False
def
isRunning
(
self
):
for
ti
in
self
.
_tInsts
:
if
not
ti
.
getStatus
().
isRunning
():
return
False
return
True
# def isRestarting(self):
# """
# Determine if the service/cluster is being "restarted", i.e., at least
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录