未验证 提交 bf7dedcb 编写于 作者: Z Zeng Jinle 提交者: GitHub

Merge pull request #15545 from sneaxiy/fix_debug_nccl_error

Fix nccl unittest error in debug mode
...@@ -85,7 +85,7 @@ class ProtoEncodeHelper { ...@@ -85,7 +85,7 @@ class ProtoEncodeHelper {
#define REPLACE_ENFORCE_GLOG 1 #define REPLACE_ENFORCE_GLOG 1
// Make sure callers didn't do operations that went over max_size promised // Make sure callers didn't do operations that went over max_size promised
if (paddle::platform::is_error(p_ <= limit_)) { if (paddle::platform::is_error(p_ <= limit_)) {
paddle::platform::throw_on_error(p_ <= limit_); paddle::platform::throw_on_error(p_ <= limit_, "");
} }
#undef REPLACE_ENFORCE_GLOG #undef REPLACE_ENFORCE_GLOG
} }
......
...@@ -71,9 +71,8 @@ struct EnforceNotMet : public std::exception { ...@@ -71,9 +71,8 @@ struct EnforceNotMet : public std::exception {
} }
} }
template <typename... ARGS> EnforceNotMet(const std::string& str, const char* f, int l) {
EnforceNotMet(const char* f, int l, ARGS... args) { Init(str, f, l);
Init(string::Sprintf(args...), f, l);
} }
const char* what() const noexcept override { return err_str_.c_str(); } const char* what() const noexcept override { return err_str_.c_str(); }
...@@ -142,28 +141,23 @@ struct EOFException : public std::exception { ...@@ -142,28 +141,23 @@ struct EOFException : public std::exception {
inline bool is_error(bool stat) { return !stat; } inline bool is_error(bool stat) { return !stat; }
template <typename... Args> inline void throw_on_error(bool stat, const std::string& msg) {
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
bool stat, const Args&... args) {
#ifndef REPLACE_ENFORCE_GLOG #ifndef REPLACE_ENFORCE_GLOG
throw std::runtime_error(string::Sprintf(args...)); throw std::runtime_error(msg);
#else #else
LOG(FATAL) << string::Sprintf(args...); LOG(FATAL) << msg;
#endif #endif
} }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
inline bool is_error(cudaError_t e) { return UNLIKELY(e); } inline bool is_error(cudaError_t e) { return e != cudaSuccess; }
template <typename... Args> inline void throw_on_error(cudaError_t e, const std::string& msg) {
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
cudaError_t e, const Args&... args) {
#ifndef REPLACE_ENFORCE_GLOG #ifndef REPLACE_ENFORCE_GLOG
throw thrust::system_error(e, thrust::cuda_category(), throw thrust::system_error(e, thrust::cuda_category(), msg);
string::Sprintf(args...));
#else #else
LOG(FATAL) << string::Sprintf(args...); LOG(FATAL) << msg;
#endif #endif
} }
...@@ -171,14 +165,12 @@ inline bool is_error(curandStatus_t stat) { ...@@ -171,14 +165,12 @@ inline bool is_error(curandStatus_t stat) {
return stat != CURAND_STATUS_SUCCESS; return stat != CURAND_STATUS_SUCCESS;
} }
template <typename... Args> inline void throw_on_error(curandStatus_t stat, const std::string& msg) {
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
curandStatus_t stat, const Args&... args) {
#ifndef REPLACE_ENFORCE_GLOG #ifndef REPLACE_ENFORCE_GLOG
throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(), throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(),
string::Sprintf(args...)); msg);
#else #else
LOG(FATAL) << string::Sprintf(args...); LOG(FATAL) << msg;
#endif #endif
} }
...@@ -186,14 +178,11 @@ inline bool is_error(cudnnStatus_t stat) { ...@@ -186,14 +178,11 @@ inline bool is_error(cudnnStatus_t stat) {
return stat != CUDNN_STATUS_SUCCESS; return stat != CUDNN_STATUS_SUCCESS;
} }
template <typename... Args> inline void throw_on_error(cudnnStatus_t stat, const std::string& msg) {
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
cudnnStatus_t stat, const Args&... args) {
#ifndef REPLACE_ENFORCE_GLOG #ifndef REPLACE_ENFORCE_GLOG
throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) + throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) + msg);
string::Sprintf(args...));
#else #else
LOG(FATAL) << string::Sprintf(args...); LOG(FATAL) << platform::dynload::cudnnGetErrorString(stat) << msg;
#endif #endif
} }
...@@ -201,9 +190,7 @@ inline bool is_error(cublasStatus_t stat) { ...@@ -201,9 +190,7 @@ inline bool is_error(cublasStatus_t stat) {
return stat != CUBLAS_STATUS_SUCCESS; return stat != CUBLAS_STATUS_SUCCESS;
} }
template <typename... Args> inline void throw_on_error(cublasStatus_t stat, const std::string& msg) {
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
cublasStatus_t stat, const Args&... args) {
std::string err; std::string err;
if (stat == CUBLAS_STATUS_NOT_INITIALIZED) { if (stat == CUBLAS_STATUS_NOT_INITIALIZED) {
err = "CUBLAS: not initialized, "; err = "CUBLAS: not initialized, ";
...@@ -225,87 +212,45 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error( ...@@ -225,87 +212,45 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
err = "CUBLAS: license error, "; err = "CUBLAS: license error, ";
} }
#ifndef REPLACE_ENFORCE_GLOG #ifndef REPLACE_ENFORCE_GLOG
throw std::runtime_error(err + string::Sprintf(args...)); throw std::runtime_error(err + msg);
#else #else
LOG(FATAL) << err << string::Sprintf(args...); LOG(FATAL) << err << msg;
#endif #endif
} }
#if !defined(__APPLE__) && !defined(_WIN32) #if !defined(__APPLE__) && !defined(_WIN32)
template <typename... Args> inline bool is_error(ncclResult_t nccl_result) {
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error( return nccl_result != ncclSuccess;
ncclResult_t stat, const Args&... args) { }
if (stat == ncclSuccess) {
return; inline void throw_on_error(ncclResult_t stat, const std::string& msg) {
} else {
#ifndef REPLACE_ENFORCE_GLOG #ifndef REPLACE_ENFORCE_GLOG
throw std::runtime_error(platform::dynload::ncclGetErrorString(stat) + throw std::runtime_error(platform::dynload::ncclGetErrorString(stat) + msg);
string::Sprintf(args...));
#else #else
LOG(FATAL) << platform::dynload::ncclGetErrorString(stat) LOG(FATAL) << platform::dynload::ncclGetErrorString(stat) << msg;
<< string::Sprintf(args...);
#endif #endif
}
} }
#endif // __APPLE__ and windows #endif // __APPLE__ and windows
#endif // PADDLE_WITH_CUDA #endif // PADDLE_WITH_CUDA
template <typename T>
inline void throw_on_error(T e) {
throw_on_error(e, "");
}
#define PADDLE_THROW(...) \ #define PADDLE_THROW(...) \
throw ::paddle::platform::EnforceNotMet(__FILE__, __LINE__, __VA_ARGS__) throw ::paddle::platform::EnforceNotMet( \
::paddle::string::Sprintf(__VA_ARGS__), __FILE__, __LINE__)
#define __PADDLE_THROW_ERROR_I(_, _9, _8, _7, _6, _5, _4, _3, _2, X_, ...) X_;
#define __THROW_ON_ERROR_ONE_ARG(COND, ARG) \
::paddle::platform::throw_on_error(COND, ::paddle::string::Sprintf(ARG));
#ifdef _WIN32
#define __PADDLE_THROW_ON_ERROR(COND, ...) \
__THROW_ON_ERROR_ONE_ARG(COND, __VA_ARGS__)
#else // _WIN32
#define __PADDLE_THROW_ON_ERROR(COND, ...) \
__PADDLE_THROW_ERROR_I( \
__VA_ARGS__, ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \
::paddle::platform::throw_on_error(COND, __VA_ARGS__), \
::paddle::platform::throw_on_error(COND, __VA_ARGS__), \
::paddle::platform::throw_on_error(COND, __VA_ARGS__), \
::paddle::platform::throw_on_error(COND, __VA_ARGS__), \
::paddle::platform::throw_on_error(COND, __VA_ARGS__), \
::paddle::platform::throw_on_error(COND, __VA_ARGS__), \
::paddle::platform::throw_on_error(COND, __VA_ARGS__), \
__THROW_ON_ERROR_ONE_ARG(COND, __VA_ARGS__))
#endif // _WIN32
#define __PADDLE_UNARY_COMPARE(COND, ...) \
do { \
auto __cond = COND; \
if (UNLIKELY(::paddle::platform::is_error(__cond))) { \
__PADDLE_THROW_ON_ERROR(__cond, __VA_ARGS__); \
} \
} while (0)
#ifndef REPLACE_ENFORCE_GLOG #define PADDLE_ENFORCE(COND, ...) \
#define __PADDLE_ENFORCE_I(COND, ...) \
do { \ do { \
auto __cond__ = (COND); \
if (UNLIKELY(::paddle::platform::is_error(__cond__))) { \
try { \ try { \
__PADDLE_UNARY_COMPARE(COND, __VA_ARGS__); \ ::paddle::platform::throw_on_error( \
__cond__, ::paddle::string::Sprintf(__VA_ARGS__)); \
} catch (...) { \ } catch (...) { \
throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ throw ::paddle::platform::EnforceNotMet(std::current_exception(), \
__FILE__, __LINE__); \ __FILE__, __LINE__); \
} \ } \
} \
} while (0) } while (0)
#else
#define __PADDLE_ENFORCE_I(COND, ...) __PADDLE_UNARY_COMPARE(COND, __VA_ARGS__);
#endif // REPLACE_ENFORCE_GLOG
#define __PADDLE_ENFORCE(__args) __PADDLE_ENFORCE_I __args
#define PADDLE_ENFORCE(...) __PADDLE_ENFORCE((__VA_ARGS__))
#define PADDLE_THROW_EOF() \ #define PADDLE_THROW_EOF() \
do { \ do { \
throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \ throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \
......
...@@ -64,7 +64,7 @@ class NCCLGroupGuard { ...@@ -64,7 +64,7 @@ class NCCLGroupGuard {
} }
inline ~NCCLGroupGuard() { inline ~NCCLGroupGuard() {
CHECK_EQ(dynload::ncclGroupEnd(), ncclSuccess); PADDLE_ENFORCE(dynload::ncclGroupEnd());
NCCLMutex().unlock(); NCCLMutex().unlock();
} }
}; };
......
...@@ -84,6 +84,8 @@ void Fprintf(std::ostream& out, const char* fmt, const Args&... args) { ...@@ -84,6 +84,8 @@ void Fprintf(std::ostream& out, const char* fmt, const Args&... args) {
tinyformat::vformat(out, fmt, tinyformat::makeFormatList(args...)); tinyformat::vformat(out, fmt, tinyformat::makeFormatList(args...));
} }
inline std::string Sprintf() { return ""; }
template <typename... Args> template <typename... Args>
std::string Sprintf(const Args&... args) { std::string Sprintf(const Args&... args) {
std::ostringstream oss; std::ostringstream oss;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册