未验证 提交 2469b578 编写于 作者: C Chen Weihang 提交者: GitHub

Unify Paddle error format when catching system signals (#25765)

* unified signal error format

* refine signal error message
上级 818d38f1
......@@ -229,25 +229,66 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
}
#ifndef _WIN32
// Lookup table mapping a POSIX signal name to a human-readable description.
// The descriptions are quoted from
// https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
struct SignalErrorEntry {
  const char *name;          // Signal name as it appears in the dumped text.
  const char *error_string;  // Description reported to the user.
};
const SignalErrorEntry SignalErrorStrings[] = {
    {"SIGSEGV", "Segmentation fault"},
    {"SIGILL", "Illegal instruction"},
    {"SIGFPE", "Erroneous arithmetic operation"},
    {"SIGABRT", "Process abort signal"},
    {"SIGBUS", "Access to an undefined portion of a memory object"},
    {"SIGTERM", "Termination signal"},
};
// Returns true when the C string `str` begins with the C string `prefix`.
// Both arguments are treated as NUL-terminated strings; an empty prefix
// matches every string.
bool StartsWith(const char *str, const char *prefix) {
  const size_t prefix_length = strlen(prefix);
  // strncmp stops at the first NUL, so a `str` shorter than `prefix`
  // compares unequal without needing a separate length check.
  return strncmp(str, prefix, prefix_length) == 0;
}
// Returns the description of the first entry in SignalErrorStrings whose
// signal name occurs anywhere inside `str`, or "Unknown signal" when no
// entry matches. The returned pointer refers to a string literal.
const char *ParseSignalErrorString(const std::string &str) {
  for (const auto &entry : SignalErrorStrings) {
    const bool mentions_signal = str.find(entry.name) != std::string::npos;
    if (mentions_signal) {
      return entry.error_string;
    }
  }
  return "Unknown signal";
}
// Handle SIGSEGV, SIGILL, SIGFPE, SIGABRT, SIGBUS, and SIGTERM.
//
// Accumulates the "[TimeInfo: ...]" / "[SignalInfo: ...]" lines that are
// appended to the error report printed by SignalHandle.
std::ostringstream signal_msg_dumper;

// Receives the text dumped by the glog FailureSignalHandler, one line per
// call, and turns it into a Paddle-style error report.
//
// For every line it:
//   1. logs the raw text as a WARNING (preceded, once per process, by a
//      general guide),
//   2. appends the raw text to /tmp/paddle.<pid>.dump_info,
//   3. records the "*** Aborted at" line as time info, and
//   4. on any other line starting with "***" (the signal line), prints an
//      EnforceNotMet-formatted fatal message plus the recorded info to
//      stdout.
//
// Parameters:
//   data - start of the dumped line; only the first `size` bytes are used,
//          so the buffer does not need to be NUL-terminated.
//   size - number of valid bytes in `data`; calls with no payload are
//          ignored.
//
// Never throws: every exception is swallowed so that the default signal
// handler can terminate the process.
//
// NOTE(review): this runs while a fatal signal is being handled, yet it
// logs, allocates and does file I/O. It is best-effort by design -- confirm
// this is acceptable for the targeted platforms.
void SignalHandle(const char *data, int size) {
  // Nothing to report; also keeps the std::string construction below valid.
  if (data == nullptr || size <= 0) {
    return;
  }
  try {
    // Copy exactly `size` bytes once; all later processing works on this
    // bounded copy instead of reading `data` as a C string.
    const std::string raw_line(data, static_cast<size_t>(size));
    auto file_path = string::Sprintf("/tmp/paddle.%d.dump_info", ::getpid());

    // The signal is coming line by line but we print general guide just once
    std::call_once(glog_warning_once_flag, [&]() {
      LOG(WARNING) << "Warning: PaddlePaddle catches a failure signal, it may "
                      "not work properly\n";
      LOG(WARNING) << "You could check whether you killed PaddlePaddle "
                      "thread/process accidentally or report the case to "
                      "PaddlePaddle\n";
      LOG(WARNING) << "The detail failure signal is:\n\n";
    });

    LOG(WARNING) << raw_line;

    std::ofstream dump_info;
    dump_info.open(file_path, std::ios::app);
    dump_info << raw_line;
    dump_info.close();

    // NOTE1: The glog FailureSignalHandler dumped messages
    //   are dealt with line by line
    // NOTE2: we only deal with the time info and signal info,
    //   the stack trace will be generated by paddle itself
    std::string line = raw_line;
    // Drop the trailing newline only when one is actually present.
    if (!line.empty() && line.back() == '\n') {
      line.pop_back();
    }

    if (StartsWith(line.c_str(), "*** Aborted at")) {
      signal_msg_dumper << " [TimeInfo: " << line << "]\n";
    } else if (StartsWith(line.c_str(), "***")) {
      std::string signal_info = line;
      const std::string useless_substr("; stack trace:");
      const size_t start_pos = signal_info.rfind(useless_substr);
      // Only strip the suffix when it exists; erasing at npos would throw
      // and silently suppress the whole error report.
      if (start_pos != std::string::npos) {
        signal_info.erase(start_pos, useless_substr.length());
      }
      signal_msg_dumper << " [SignalInfo: " << signal_info << "]\n";
      // NOTE3: Here does not throw an exception,
      //   otherwise it will cause "terminate called recursively"
      auto exp = platform::EnforceNotMet(
          platform::errors::Fatal(
              "A serious error (%s) is detected by the operating system.",
              ParseSignalErrorString(signal_info)),
          __FILE__, __LINE__);
      // std::endl flushes on purpose: the process is about to be killed.
      std::cout << exp.what() << signal_msg_dumper.str() << std::endl;
    }
  } catch (...) {
    // Since the program has already triggered a system error,
    // no further processing is required here, glog FailureSignalHandler
    // will Kill program by the default signal handler
  }
}
#endif
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册