Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
taosdata
TDengine
提交
943bbe22
T
TDengine
项目概览
taosdata
/
TDengine
1 年多 前同步成功
通知
1185
Star
22016
Fork
4786
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
T
TDengine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
943bbe22
编写于
7月 22, 2020
作者:
S
Steven Li
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Enhanced crash_gen tool to report useful info when stuck
上级
3b419a1c
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
179 addition
and
47 deletion
+179
-47
tests/pytest/crash_gen.py
tests/pytest/crash_gen.py
+179
-47
未找到文件。
tests/pytest/crash_gen.py
浏览文件 @
943bbe22
...
...
@@ -42,6 +42,13 @@ import os
import
io
import
signal
import
traceback
try
:
import
psutil
except
:
print
(
"Psutil module needed, please install: sudo pip3 install psutil"
)
sys
.
exit
(
-
1
)
# Require Python 3
if
sys
.
version_info
[
0
]
<
3
:
raise
Exception
(
"Must be using Python 3"
)
...
...
@@ -69,8 +76,7 @@ class CrashGenError(Exception):
class
WorkerThread
:
def
__init__
(
self
,
pool
:
ThreadPool
,
tid
,
tc
:
ThreadCoordinator
,
def
__init__
(
self
,
pool
:
ThreadPool
,
tid
,
tc
:
ThreadCoordinator
,
# te: TaskExecutor,
):
# note: main thread context!
# self._curStep = -1
...
...
@@ -138,7 +144,12 @@ class WorkerThread:
# tc = ThreadCoordinator(None)
while
True
:
tc
=
self
.
_tc
# Thread Coordinator, the overall master
tc
.
crossStepBarrier
()
# shared barrier first, INCLUDING the last one
try
:
tc
.
crossStepBarrier
()
# shared barrier first, INCLUDING the last one
except
threading
.
BrokenBarrierError
as
err
:
# main thread timed out
logger
.
debug
(
"[TRD] Worker thread exiting due to main thread barrier time-out"
)
break
logger
.
debug
(
"[TRD] Worker thread [{}] exited barrier..."
.
format
(
self
.
_tid
))
self
.
crossStepGate
()
# then per-thread gate, after being tapped
logger
.
debug
(
"[TRD] Worker thread [{}] exited step gate..."
.
format
(
self
.
_tid
))
...
...
@@ -248,14 +259,14 @@ class ThreadCoordinator:
def
getDbManager
(
self
)
->
DbManager
:
return
self
.
_dbManager
def
crossStepBarrier
(
self
):
self
.
_stepBarrier
.
wait
(
)
def
crossStepBarrier
(
self
,
timeout
=
None
):
self
.
_stepBarrier
.
wait
(
timeout
)
def
requestToStop
(
self
):
self
.
_runStatus
=
MainExec
.
STATUS_STOPPING
self
.
_execStats
.
registerFailure
(
"User Interruption"
)
def
_runShouldEnd
(
self
,
transitionFailed
,
hasAbortedTask
):
def
_runShouldEnd
(
self
,
transitionFailed
,
hasAbortedTask
,
workerTimeout
):
maxSteps
=
gConfig
.
max_steps
# type: ignore
if
self
.
_curStep
>=
(
maxSteps
-
1
):
# maxStep==10, last curStep should be 9
return
True
...
...
@@ -265,6 +276,8 @@ class ThreadCoordinator:
return
True
if
hasAbortedTask
:
return
True
if
workerTimeout
:
return
True
return
False
def
_hasAbortedTask
(
self
):
# from execution of previous step
...
...
@@ -296,7 +309,7 @@ class ThreadCoordinator:
# let other threads go past the pool barrier, but wait at the
# thread gate
logger
.
debug
(
"[TRD] Main thread about to cross the barrier"
)
self
.
crossStepBarrier
()
self
.
crossStepBarrier
(
timeout
=
15
)
self
.
_stepBarrier
.
reset
()
# Other worker threads should now be at the "gate"
logger
.
debug
(
"[TRD] Main thread finished crossing the barrier"
)
...
...
@@ -342,11 +355,21 @@ class ThreadCoordinator:
self
.
_execStats
.
startExec
()
# start the stop watch
transitionFailed
=
False
hasAbortedTask
=
False
while
not
self
.
_runShouldEnd
(
transitionFailed
,
hasAbortedTask
):
workerTimeout
=
False
while
not
self
.
_runShouldEnd
(
transitionFailed
,
hasAbortedTask
,
workerTimeout
):
if
not
gConfig
.
debug
:
# print this only if we are not in debug mode
print
(
"."
,
end
=
""
,
flush
=
True
)
self
.
_syncAtBarrier
()
# For now just cross the barrier
try
:
self
.
_syncAtBarrier
()
# For now just cross the barrier
except
threading
.
BrokenBarrierError
as
err
:
logger
.
info
(
"Main loop aborted, caused by worker thread time-out"
)
self
.
_execStats
.
registerFailure
(
"Aborted due to worker thread timeout"
)
print
(
"
\n\n
Worker Thread time-out detected, important thread info:"
)
ts
=
ThreadStacks
()
ts
.
print
(
filterInternal
=
True
)
workerTimeout
=
True
break
# At this point, all threads should be pass the overall "barrier" and before the per-thread "gate"
# We use this period to do house keeping work, when all worker
...
...
@@ -364,6 +387,8 @@ class ThreadCoordinator:
if
hasAbortedTask
or
transitionFailed
:
# abnormal ending, workers waiting at "gate"
logger
.
debug
(
"Abnormal ending of main thraed"
)
elif
workerTimeout
:
logger
.
debug
(
"Abnormal ending of main thread, due to worker timeout"
)
else
:
# regular ending, workers waiting at "barrier"
logger
.
debug
(
"Regular ending, main thread waiting for all worker threads to stop..."
)
self
.
_syncAtBarrier
()
...
...
@@ -569,9 +594,7 @@ class DbConn:
# below implemented by child classes
self
.
openByType
()
logger
.
debug
(
"[DB] data connection opened, type = {}"
.
format
(
self
.
_type
))
logger
.
debug
(
"[DB] data connection opened, type = {}"
.
format
(
self
.
_type
))
self
.
isOpen
=
True
def
resetDb
(
self
):
# reset the whole database, etc.
...
...
@@ -786,9 +809,7 @@ class DbConnNative(DbConn):
self
.
__class__
.
_connInfoDisplayed
=
True
# updating CLASS variable
logger
.
info
(
"Initiating TAOS native connection to {}, using config at {}"
.
format
(
hostAddr
,
cfgPath
))
self
.
_conn
=
taos
.
connect
(
host
=
hostAddr
,
config
=
cfgPath
)
# TODO: make configurable
self
.
_conn
=
taos
.
connect
(
host
=
hostAddr
,
config
=
cfgPath
)
# TODO: make configurable
self
.
_cursor
=
self
.
_conn
.
cursor
()
self
.
_cursor
.
execute
(
'reset query cache'
)
...
...
@@ -1674,7 +1695,7 @@ class StateTransitionTask(Task):
@
classmethod
def
getRegTableName
(
cls
,
i
):
return
"
db.
reg_table_{}"
.
format
(
i
)
return
"reg_table_{}"
.
format
(
i
)
def
execute
(
self
,
wt
:
WorkerThread
):
super
().
execute
(
wt
)
...
...
@@ -1984,9 +2005,9 @@ class MyLoggingAdapter(logging.LoggerAdapter):
class
SvcManager
:
def
__init__
(
self
):
print
(
"Starting TDengine Service Manager"
)
signal
.
signal
(
signal
.
SIGTERM
,
self
.
sigIntHandler
)
signal
.
signal
(
signal
.
SIGINT
,
self
.
sigIntHandler
)
signal
.
signal
(
signal
.
SIGUSR1
,
self
.
sigUsrHandler
)
# different handler!
# signal.signal(signal.SIGTERM, self.sigIntHandler) # Moved to MainExec
#
signal.signal(signal.SIGINT, self.sigIntHandler)
#
signal.signal(signal.SIGUSR1, self.sigUsrHandler) # different handler!
self
.
inSigHandler
=
False
# self._status = MainExec.STATUS_RUNNING # set inside
...
...
@@ -2035,14 +2056,14 @@ class SvcManager:
self
.
inSigHandler
=
False
def
sigIntHandler
(
self
,
signalNumber
,
frame
):
print
(
"S
ig INT
Handler starting..."
)
print
(
"S
vcManager: INT Signal
Handler starting..."
)
if
self
.
inSigHandler
:
print
(
"Ignoring repeated SIG_INT..."
)
return
self
.
inSigHandler
=
True
self
.
stopTaosService
()
print
(
"
INT signal h
andler returning..."
)
print
(
"
SvcManager: INT Signal H
andler returning..."
)
self
.
inSigHandler
=
False
def
sigHandlerResume
(
self
):
...
...
@@ -2064,8 +2085,16 @@ class SvcManager:
def
startTaosService
(
self
):
if
self
.
svcMgrThread
:
raise
RuntimeError
(
"Cannot start TAOS service when one may already be running"
)
raise
RuntimeError
(
"Cannot start TAOS service when one may already be running"
)
# Find if there's already a taosd service, and then kill it
for
proc
in
psutil
.
process_iter
():
if
proc
.
name
()
==
'taosd'
:
print
(
"Killing an existing TAOSD process in 2 seconds... press CTRL-C to interrupe"
)
time
.
sleep
(
2.0
)
proc
.
kill
()
# print("Process: {}".format(proc.name()))
self
.
svcMgrThread
=
ServiceManagerThread
()
# create the object
self
.
svcMgrThread
.
start
()
print
(
"TAOS service started, printing out output..."
)
...
...
@@ -2075,9 +2104,11 @@ class SvcManager:
print
(
"TAOS service started"
)
def
stopTaosService
(
self
,
outputLines
=
20
):
if
not
self
.
isRunning
():
logger
.
warning
(
"Cannot stop TAOS service, not running"
)
return
print
(
"Terminating Service Manager Thread (SMT) execution..."
)
if
not
self
.
svcMgrThread
:
raise
RuntimeError
(
"Unexpected empty svc mgr thread"
)
self
.
svcMgrThread
.
stop
()
if
self
.
svcMgrThread
.
isStopped
():
self
.
svcMgrThread
.
procIpcBatch
(
outputLines
)
# one last time
...
...
@@ -2090,9 +2121,11 @@ class SvcManager:
def
run
(
self
):
self
.
startTaosService
()
self
.
_procIpcAll
()
# pump/process all the messages
if
self
.
svcMgrThread
:
# if sig handler hasn't destroyed it by now
if
self
.
isRunning
()
:
# if sig handler hasn't destroyed it by now
self
.
stopTaosService
()
# should have started already
def
isRunning
(
self
):
return
self
.
svcMgrThread
!=
None
class
ServiceManagerThread
:
MAX_QUEUE_SIZE
=
10000
...
...
@@ -2144,6 +2177,7 @@ class ServiceManagerThread:
logger
.
info
(
"[] TDengine service READY to process requests"
)
return
# now we've started
# TODO: handle this better?
self
.
procIpcBatch
(
20
,
True
)
# display output before cronking out, trim to last 20 msgs, force output
raise
RuntimeError
(
"TDengine service did not start successfully"
)
def
stop
(
self
):
...
...
@@ -2292,6 +2326,15 @@ class TdeSubProcess:
taosdPath
=
self
.
getBuildPath
()
+
"/build/bin/taosd"
cfgPath
=
self
.
getBuildPath
()
+
"/test/cfg"
# Delete the log files
logPath
=
self
.
getBuildPath
()
+
"/test/log"
# ref: https://stackoverflow.com/questions/1995373/deleting-all-files-in-a-directory-with-python/1995397
filelist
=
[
f
for
f
in
os
.
listdir
(
logPath
)
]
# if f.endswith(".bak") ]
for
f
in
filelist
:
filePath
=
os
.
path
.
join
(
logPath
,
f
)
print
(
"Removing log file: {}"
.
format
(
filePath
))
os
.
remove
(
filePath
)
svcCmd
=
[
taosdPath
,
'-c'
,
cfgPath
]
# svcCmd = ['vmstat', '1']
if
self
.
subProcess
:
# already there
...
...
@@ -2325,16 +2368,46 @@ class TdeSubProcess:
print
(
"TDengine service process terminated successfully from SIG_INT"
)
self
.
subProcess
=
None
class
ThreadStacks
:
# stack info for all threads
def
__init__
(
self
):
self
.
_allStacks
=
{}
allFrames
=
sys
.
_current_frames
()
for
th
in
threading
.
enumerate
():
stack
=
traceback
.
extract_stack
(
allFrames
[
th
.
ident
])
self
.
_allStacks
[
th
.
native_id
]
=
stack
def
print
(
self
,
filteredEndName
=
None
,
filterInternal
=
False
):
for
thNid
,
stack
in
self
.
_allStacks
.
items
():
# for each thread
lastFrame
=
stack
[
-
1
]
if
filteredEndName
:
# we need to filter out stacks that match this name
if
lastFrame
.
name
==
filteredEndName
:
# end did not match
continue
if
filterInternal
:
if
lastFrame
.
name
in
[
'wait'
,
'invoke_excepthook'
,
'_wait'
,
# The Barrier exception
'svcOutputReader'
,
# the svcMgr thread
'__init__'
]:
# the thread that extracted the stack
continue
# ignore
# Now print
print
(
"
\n
<----- Thread Info for ID: {}"
.
format
(
thNid
))
for
frame
in
stack
:
# print(frame)
print
(
"File {filename}, line {lineno}, in {name}"
.
format
(
filename
=
frame
.
filename
,
lineno
=
frame
.
lineno
,
name
=
frame
.
name
))
print
(
" {}"
.
format
(
frame
.
line
))
print
(
"-----> End of Thread Info
\n
"
)
class
ClientManager
:
def
__init__
(
self
):
print
(
"Starting service manager"
)
signal
.
signal
(
signal
.
SIGTERM
,
self
.
sigIntHandler
)
signal
.
signal
(
signal
.
SIGINT
,
self
.
sigIntHandler
)
#
signal.signal(signal.SIGTERM, self.sigIntHandler)
#
signal.signal(signal.SIGINT, self.sigIntHandler)
self
.
_status
=
MainExec
.
STATUS_RUNNING
self
.
tc
=
None
self
.
inSigHandler
=
False
def
sigIntHandler
(
self
,
signalNumber
,
frame
):
if
self
.
_status
!=
MainExec
.
STATUS_RUNNING
:
print
(
"Repeated SIGINT received, forced exit..."
)
...
...
@@ -2342,9 +2415,50 @@ class ClientManager:
sys
.
exit
(
-
1
)
self
.
_status
=
MainExec
.
STATUS_STOPPING
# immediately set our status
print
(
"Terminating program..."
)
print
(
"
ClientManager:
Terminating program..."
)
self
.
tc
.
requestToStop
()
def
_doMenu
(
self
):
choice
=
""
while
True
:
print
(
"
\n
Interrupting Client Program, Choose an Action: "
)
print
(
"1: Resume"
)
print
(
"2: Terminate"
)
print
(
"3: Show Threads"
)
# Remember to update the if range below
# print("Enter Choice: ", end="", flush=True)
while
choice
==
""
:
choice
=
input
(
"Enter Choice: "
)
if
choice
!=
""
:
break
# done with reading repeated input
if
choice
in
[
"1"
,
"2"
,
"3"
]:
break
# we are done with whole method
print
(
"Invalid choice, please try again."
)
choice
=
""
# reset
return
choice
def
sigUsrHandler
(
self
,
signalNumber
,
frame
):
print
(
"Interrupting main thread execution upon SIGUSR1"
)
if
self
.
inSigHandler
:
# already
print
(
"Ignoring repeated SIG_USR1..."
)
return
# do nothing if it's already not running
self
.
inSigHandler
=
True
choice
=
self
.
_doMenu
()
if
choice
==
"1"
:
print
(
"Resuming execution..."
)
time
.
sleep
(
1.0
)
elif
choice
==
"2"
:
print
(
"Not implemented yet"
)
time
.
sleep
(
1.0
)
elif
choice
==
"3"
:
ts
=
ThreadStacks
()
ts
.
print
()
else
:
raise
RuntimeError
(
"Invalid menu choice: {}"
.
format
(
choice
))
self
.
inSigHandler
=
False
def
_printLastNumbers
(
self
):
# to verify data durability
dbManager
=
DbManager
(
resetDb
=
False
)
dbc
=
dbManager
.
getDbConn
()
...
...
@@ -2377,11 +2491,7 @@ class ClientManager:
def
prepare
(
self
):
self
.
_printLastNumbers
()
def
run
(
self
):
if
gConfig
.
auto_start_service
:
svcMgr
=
SvcManager
()
svcMgr
.
startTaosService
()
def
run
(
self
,
svcMgr
):
self
.
_printLastNumbers
()
dbManager
=
DbManager
()
# Regular function
...
...
@@ -2391,7 +2501,7 @@ class ClientManager:
self
.
tc
.
run
()
# print("exec stats: {}".format(self.tc.getExecStats()))
# print("TC failed = {}".format(self.tc.isFailed()))
if
gConfig
.
auto_start_service
:
if
svcMgr
:
#
gConfig.auto_start_service:
svcMgr
.
stopTaosService
()
# Print exec status, etc., AFTER showing messages from the server
self
.
conclude
()
...
...
@@ -2410,18 +2520,39 @@ class MainExec:
STATUS_STOPPING
=
3
STATUS_STOPPED
=
4
@
classmethod
def
runClient
(
cls
):
clientManager
=
ClientManager
()
return
clientManager
.
run
()
def
__init__
(
self
):
self
.
_clientMgr
=
None
self
.
_svcMgr
=
None
@
classmethod
def
runService
(
cls
):
svcManager
=
SvcManager
()
svcManager
.
run
()
signal
.
signal
(
signal
.
SIGTERM
,
self
.
sigIntHandler
)
signal
.
signal
(
signal
.
SIGINT
,
self
.
sigIntHandler
)
signal
.
signal
(
signal
.
SIGUSR1
,
self
.
sigUsrHandler
)
# different handler!
@
classmethod
def
runTemp
(
cls
):
# for debugging purposes
def
sigUsrHandler
(
self
,
signalNumber
,
frame
):
if
self
.
_clientMgr
:
self
.
_clientMgr
.
sigUsrHandler
(
signalNumber
,
frame
)
elif
self
.
_svcMgr
:
# Only if no client mgr, we are running alone
self
.
_svcMgr
.
sigUsrHandler
(
signalNumber
,
frame
)
def
sigIntHandler
(
self
,
signalNumber
,
frame
):
if
self
.
_svcMgr
:
self
.
_svcMgr
.
sigIntHandler
(
signalNumber
,
frame
)
if
self
.
_clientMgr
:
self
.
_clientMgr
.
sigIntHandler
(
signalNumber
,
frame
)
def
runClient
(
self
):
if
gConfig
.
auto_start_service
:
self
.
_svcMgr
=
SvcManager
()
self
.
_svcMgr
.
startTaosService
()
# we start, don't run
self
.
_clientMgr
=
ClientManager
()
ret
=
self
.
_clientMgr
.
run
(
self
.
_svcMgr
)
# stop TAOS service inside
def
runService
(
self
):
self
.
_svcMgr
=
SvcManager
()
self
.
_svcMgr
.
run
()
# run to some end state
def
runTemp
(
self
):
# for debugging purposes
# # Hack to exercise reading from disk, imcreasing coverage. TODO: fix
# dbc = dbState.getDbConn()
# sTbName = dbState.getFixedSuperTableName()
...
...
@@ -2577,10 +2708,11 @@ def main():
Dice
.
seed
(
0
)
# initial seeding of dice
# Run server or client
mExec
=
MainExec
()
if
gConfig
.
run_tdengine
:
# run server
Main
Exec
.
runService
()
m
Exec
.
runService
()
else
:
return
Main
Exec
.
runClient
()
return
m
Exec
.
runClient
()
if
__name__
==
"__main__"
:
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录