未验证 提交 bf7dedcb 编写于 作者: Z Zeng Jinle 提交者: GitHub

Merge pull request #15545 from sneaxiy/fix_debug_nccl_error

Fix nccl unittest error in debug mode
......@@ -85,7 +85,7 @@ class ProtoEncodeHelper {
#define REPLACE_ENFORCE_GLOG 1
// Make sure callers didn't do operations that went over max_size promised
if (paddle::platform::is_error(p_ <= limit_)) {
paddle::platform::throw_on_error(p_ <= limit_);
paddle::platform::throw_on_error(p_ <= limit_, "");
}
#undef REPLACE_ENFORCE_GLOG
}
......
......@@ -71,9 +71,8 @@ struct EnforceNotMet : public std::exception {
}
}
template <typename... ARGS>
EnforceNotMet(const char* f, int l, ARGS... args) {
Init(string::Sprintf(args...), f, l);
EnforceNotMet(const std::string& str, const char* f, int l) {
Init(str, f, l);
}
const char* what() const noexcept override { return err_str_.c_str(); }
......@@ -142,28 +141,23 @@ struct EOFException : public std::exception {
inline bool is_error(bool stat) { return !stat; }
template <typename... Args>
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
bool stat, const Args&... args) {
inline void throw_on_error(bool stat, const std::string& msg) {
#ifndef REPLACE_ENFORCE_GLOG
throw std::runtime_error(string::Sprintf(args...));
throw std::runtime_error(msg);
#else
LOG(FATAL) << string::Sprintf(args...);
LOG(FATAL) << msg;
#endif
}
#ifdef PADDLE_WITH_CUDA
inline bool is_error(cudaError_t e) { return UNLIKELY(e); }
inline bool is_error(cudaError_t e) { return e != cudaSuccess; }
template <typename... Args>
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
cudaError_t e, const Args&... args) {
inline void throw_on_error(cudaError_t e, const std::string& msg) {
#ifndef REPLACE_ENFORCE_GLOG
throw thrust::system_error(e, thrust::cuda_category(),
string::Sprintf(args...));
throw thrust::system_error(e, thrust::cuda_category(), msg);
#else
LOG(FATAL) << string::Sprintf(args...);
LOG(FATAL) << msg;
#endif
}
......@@ -171,14 +165,12 @@ inline bool is_error(curandStatus_t stat) {
return stat != CURAND_STATUS_SUCCESS;
}
template <typename... Args>
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
curandStatus_t stat, const Args&... args) {
inline void throw_on_error(curandStatus_t stat, const std::string& msg) {
#ifndef REPLACE_ENFORCE_GLOG
throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(),
string::Sprintf(args...));
msg);
#else
LOG(FATAL) << string::Sprintf(args...);
LOG(FATAL) << msg;
#endif
}
......@@ -186,14 +178,11 @@ inline bool is_error(cudnnStatus_t stat) {
return stat != CUDNN_STATUS_SUCCESS;
}
template <typename... Args>
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
cudnnStatus_t stat, const Args&... args) {
inline void throw_on_error(cudnnStatus_t stat, const std::string& msg) {
#ifndef REPLACE_ENFORCE_GLOG
throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) +
string::Sprintf(args...));
throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) + msg);
#else
LOG(FATAL) << string::Sprintf(args...);
LOG(FATAL) << platform::dynload::cudnnGetErrorString(stat) << msg;
#endif
}
......@@ -201,9 +190,7 @@ inline bool is_error(cublasStatus_t stat) {
return stat != CUBLAS_STATUS_SUCCESS;
}
template <typename... Args>
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
cublasStatus_t stat, const Args&... args) {
inline void throw_on_error(cublasStatus_t stat, const std::string& msg) {
std::string err;
if (stat == CUBLAS_STATUS_NOT_INITIALIZED) {
err = "CUBLAS: not initialized, ";
......@@ -225,87 +212,45 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
err = "CUBLAS: license error, ";
}
#ifndef REPLACE_ENFORCE_GLOG
throw std::runtime_error(err + string::Sprintf(args...));
throw std::runtime_error(err + msg);
#else
LOG(FATAL) << err << string::Sprintf(args...);
LOG(FATAL) << err << msg;
#endif
}
#if !defined(__APPLE__) && !defined(_WIN32)
template <typename... Args>
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
ncclResult_t stat, const Args&... args) {
if (stat == ncclSuccess) {
return;
} else {
inline bool is_error(ncclResult_t nccl_result) {
return nccl_result != ncclSuccess;
}
inline void throw_on_error(ncclResult_t stat, const std::string& msg) {
#ifndef REPLACE_ENFORCE_GLOG
throw std::runtime_error(platform::dynload::ncclGetErrorString(stat) +
string::Sprintf(args...));
throw std::runtime_error(platform::dynload::ncclGetErrorString(stat) + msg);
#else
LOG(FATAL) << platform::dynload::ncclGetErrorString(stat)
<< string::Sprintf(args...);
LOG(FATAL) << platform::dynload::ncclGetErrorString(stat) << msg;
#endif
}
}
#endif // __APPLE__ and windows
#endif // PADDLE_WITH_CUDA
template <typename T>
inline void throw_on_error(T e) {
throw_on_error(e, "");
}
#define PADDLE_THROW(...) \
throw ::paddle::platform::EnforceNotMet(__FILE__, __LINE__, __VA_ARGS__)
#define __PADDLE_THROW_ERROR_I(_, _9, _8, _7, _6, _5, _4, _3, _2, X_, ...) X_;
#define __THROW_ON_ERROR_ONE_ARG(COND, ARG) \
::paddle::platform::throw_on_error(COND, ::paddle::string::Sprintf(ARG));
#ifdef _WIN32
#define __PADDLE_THROW_ON_ERROR(COND, ...) \
__THROW_ON_ERROR_ONE_ARG(COND, __VA_ARGS__)
#else // _WIN32
#define __PADDLE_THROW_ON_ERROR(COND, ...) \
__PADDLE_THROW_ERROR_I( \
__VA_ARGS__, ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \
::paddle::platform::throw_on_error(COND, __VA_ARGS__), \
::paddle::platform::throw_on_error(COND, __VA_ARGS__), \
::paddle::platform::throw_on_error(COND, __VA_ARGS__), \
::paddle::platform::throw_on_error(COND, __VA_ARGS__), \
::paddle::platform::throw_on_error(COND, __VA_ARGS__), \
::paddle::platform::throw_on_error(COND, __VA_ARGS__), \
::paddle::platform::throw_on_error(COND, __VA_ARGS__), \
__THROW_ON_ERROR_ONE_ARG(COND, __VA_ARGS__))
#endif // _WIN32
#define __PADDLE_UNARY_COMPARE(COND, ...) \
do { \
auto __cond = COND; \
if (UNLIKELY(::paddle::platform::is_error(__cond))) { \
__PADDLE_THROW_ON_ERROR(__cond, __VA_ARGS__); \
} \
#define PADDLE_THROW(...) \
throw ::paddle::platform::EnforceNotMet( \
::paddle::string::Sprintf(__VA_ARGS__), __FILE__, __LINE__)
#define PADDLE_ENFORCE(COND, ...) \
do { \
auto __cond__ = (COND); \
if (UNLIKELY(::paddle::platform::is_error(__cond__))) { \
try { \
::paddle::platform::throw_on_error( \
__cond__, ::paddle::string::Sprintf(__VA_ARGS__)); \
} catch (...) { \
throw ::paddle::platform::EnforceNotMet(std::current_exception(), \
__FILE__, __LINE__); \
} \
} \
} while (0)
#ifndef REPLACE_ENFORCE_GLOG
#define __PADDLE_ENFORCE_I(COND, ...) \
do { \
try { \
__PADDLE_UNARY_COMPARE(COND, __VA_ARGS__); \
} catch (...) { \
throw ::paddle::platform::EnforceNotMet(std::current_exception(), \
__FILE__, __LINE__); \
} \
} while (0)
#else
#define __PADDLE_ENFORCE_I(COND, ...) __PADDLE_UNARY_COMPARE(COND, __VA_ARGS__);
#endif // REPLACE_ENFORCE_GLOG
#define __PADDLE_ENFORCE(__args) __PADDLE_ENFORCE_I __args
#define PADDLE_ENFORCE(...) __PADDLE_ENFORCE((__VA_ARGS__))
#define PADDLE_THROW_EOF() \
do { \
throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \
......
......@@ -64,7 +64,7 @@ class NCCLGroupGuard {
}
inline ~NCCLGroupGuard() {
CHECK_EQ(dynload::ncclGroupEnd(), ncclSuccess);
PADDLE_ENFORCE(dynload::ncclGroupEnd());
NCCLMutex().unlock();
}
};
......
......@@ -84,6 +84,8 @@ void Fprintf(std::ostream& out, const char* fmt, const Args&... args) {
tinyformat::vformat(out, fmt, tinyformat::makeFormatList(args...));
}
inline std::string Sprintf() { return ""; }
template <typename... Args>
std::string Sprintf(const Args&... args) {
std::ostringstream oss;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册