未验证 提交 6f3bf5b7 编写于 作者: W wade zhang 提交者: GitHub

Merge pull request #15423 from taosdata/enh/crash_gen

Enhanced crash_gen tool to report better error for deadlocks
......@@ -371,7 +371,9 @@ class ThreadCoordinator:
if isinstance(err, CrashGenError): # our own transition failure
Logging.info("State transition error")
# TODO: saw an error here once, let's print out stack info for err?
traceback.print_stack()
traceback.print_stack() # Stack frame to here.
Logging.info("Caused by:")
traceback.print_exception(*sys.exc_info()) # Ref: https://www.geeksforgeeks.org/how-to-print-exception-stack-trace-in-python/
transitionFailed = True
self._te = None # Not running any more
self._execStats.registerFailure("State transition error: {}".format(err))
......@@ -741,7 +743,8 @@ class AnyState:
sCnt += 1
if (sCnt >= 2):
raise CrashGenError(
"Unexpected more than 1 success with task: {}, in task set: {}".format(
"Unexpected more than 1 success at state: {}, with task: {}, in task set: {}".format(
self.__class__.__name__,
cls.__name__, # verified just now that isinstance(task, cls)
[c.__class__.__name__ for c in tasks]
))
......@@ -756,8 +759,11 @@ class AnyState:
if task.isSuccess():
sCnt += 1
if (exists and sCnt <= 0):
raise CrashGenError("Unexpected zero success for task type: {}, from tasks: {}"
.format(cls, tasks))
raise CrashGenError("Unexpected zero success at state: {}, with task: {}, in task set: {}".format(
self.__class__.__name__,
cls.__name__, # verified just now that isinstance(task, cls)
[c.__class__.__name__ for c in tasks]
))
def assertNoTask(self, tasks, cls):
for task in tasks:
......@@ -809,8 +815,6 @@ class StateEmpty(AnyState):
]
def verifyTasksToState(self, tasks, newState):
if Config.getConfig().ignore_errors: # if we are asked to ignore certain errors, let's not verify CreateDB success.
return
if (self.hasSuccess(tasks, TaskCreateDb)
): # at EMPTY, if there's succes in creating DB
if (not self.hasTask(tasks, TaskDropDb)): # and no drop_db tasks
......@@ -995,16 +999,17 @@ class StateMechine:
dbc.execute("show dnodes")
# Generic Checks, first based on the start state
if self._curState.canCreateDb():
self._curState.assertIfExistThenSuccess(tasks, TaskCreateDb)
# self.assertAtMostOneSuccess(tasks, CreateDbTask) # not really, in
# case of multiple creation and drops
if self._curState.canDropDb():
if gSvcMgr == None: # only if we are running as client-only
self._curState.assertIfExistThenSuccess(tasks, TaskDropDb)
# self.assertAtMostOneSuccess(tasks, DropDbTask) # not really in
# case of drop-create-drop
if not Config.getConfig().ignore_errors: # verify state, only if we are asked not to ignore certain errors.
if self._curState.canCreateDb():
self._curState.assertIfExistThenSuccess(tasks, TaskCreateDb)
# self.assertAtMostOneSuccess(tasks, CreateDbTask) # not really, in
# case of multiple creation and drops
if self._curState.canDropDb():
if gSvcMgr == None: # only if we are running as client-only
self._curState.assertIfExistThenSuccess(tasks, TaskDropDb)
# self.assertAtMostOneSuccess(tasks, DropDbTask) # not really in
# case of drop-create-drop
# if self._state.canCreateFixedTable():
# self.assertIfExistThenSuccess(tasks, CreateFixedTableTask) # Not true, DB may be dropped
......@@ -1026,7 +1031,8 @@ class StateMechine:
newState = self._findCurrentState(dbc)
Logging.debug("[STT] New DB state determined: {}".format(newState))
# can old state move to new state through the tasks?
self._curState.verifyTasksToState(tasks, newState)
if not Config.getConfig().ignore_errors: # verify state, only if we are asked not to ignore certain errors.
self._curState.verifyTasksToState(tasks, newState)
self._curState = newState
def pickTaskType(self):
......@@ -2231,16 +2237,14 @@ class TaskAddData(StateTransitionTask):
class ThreadStacks: # stack info for all threads
def __init__(self):
self._allStacks = {}
allFrames = sys._current_frames() # All current stack frames
allFrames = sys._current_frames() # All current stack frames, keyed with "ident"
for th in threading.enumerate(): # For each thread
if th.ident is None:
continue
stack = traceback.extract_stack(allFrames[th.ident]) # Get stack for a thread
shortTid = th.ident % 10000
stack = traceback.extract_stack(allFrames[th.ident]) #type: ignore # Get stack for a thread
shortTid = th.native_id % 10000 #type: ignore
self._allStacks[shortTid] = stack # Was using th.native_id
def print(self, filteredEndName = None, filterInternal = False):
for tIdent, stack in self._allStacks.items(): # for each thread, stack frames top to bottom
for shortTid, stack in self._allStacks.items(): # for each thread, stack frames top to bottom
lastFrame = stack[-1]
if filteredEndName: # we need to filter out stacks that match this name
if lastFrame.name == filteredEndName : # end did not match
......@@ -2252,7 +2256,9 @@ class ThreadStacks: # stack info for all threads
'__init__']: # the thread that extracted the stack
continue # ignore
# Now print
print("\n<----- Thread Info for LWP/ID: {} (most recent call last) <-----".format(tIdent))
print("\n<----- Thread Info for LWP/ID: {} (most recent call last) <-----".format(shortTid))
lastSqlForThread = DbConn.fetchSqlForThread(shortTid)
print("Last SQL statement attempted from thread {} is: {}".format(shortTid, lastSqlForThread))
stackFrame = 0
for frame in stack: # was using: reversed(stack)
# print(frame)
......
......@@ -27,6 +27,26 @@ class DbConn:
TYPE_REST = "rest-api"
TYPE_INVALID = "invalid"
# class variables
lastSqlFromThreads : dict[int, str] = {} # stored by thread id, obtained from threading.current_thread().ident%10000
@classmethod
def saveSqlForCurrentThread(cls, sql: str):
'''
Let us save the last SQL statement on a per-thread basis, so that when later we
run into a dead-lock situation, we can pick out the deadlocked thread, and use
that information to find what what SQL statement is stuck.
'''
th = threading.current_thread()
shortTid = th.native_id % 10000 #type: ignore
cls.lastSqlFromThreads[shortTid] = sql # Save this for later
@classmethod
def fetchSqlForThread(cls, shortTid : int) -> str :
if shortTid not in cls.lastSqlFromThreads:
raise CrashGenError("No last-attempted-SQL found for thread id: {}".format(shortTid))
return cls.lastSqlFromThreads[shortTid]
@classmethod
def create(cls, connType, dbTarget):
if connType == cls.TYPE_NATIVE:
......@@ -163,6 +183,7 @@ class DbConnRest(DbConn):
def _doSql(self, sql):
self._lastSql = sql # remember this, last SQL attempted
self.saveSqlForCurrentThread(sql) # Save in global structure too. #TODO: combine with above
try:
r = requests.post(self._url,
data = sql,
......@@ -392,6 +413,7 @@ class DbConnNative(DbConn):
"Cannot exec SQL unless db connection is open", CrashGenError.DB_CONNECTION_NOT_OPEN)
Logging.debug("[SQL] Executing SQL: {}".format(sql))
self._lastSql = sql
self.saveSqlForCurrentThread(sql) # Save in global structure too. #TODO: combine with above
nRows = self._tdSql.execute(sql)
cls = self.__class__
cls.totalRequests += 1
......@@ -407,6 +429,7 @@ class DbConnNative(DbConn):
"Cannot query database until connection is open, restarting?", CrashGenError.DB_CONNECTION_NOT_OPEN)
Logging.debug("[SQL] Executing SQL: {}".format(sql))
self._lastSql = sql
self.saveSqlForCurrentThread(sql) # Save in global structure too. #TODO: combine with above
nRows = self._tdSql.query(sql)
cls = self.__class__
cls.totalRequests += 1
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册