未验证 提交 a1909aff 编写于 作者: H Huihuang Zheng 提交者: GitHub

Fix Unit Test: Add Sleep Time for CUDA Retry (#29442)

Add a sleep interval between CUDA retries, similar to our GPU allocator retry logic. This is an attempt to avoid random failures of the initial GPU allocation in unit tests.
上级 e5e52249
...@@ -19,12 +19,13 @@ limitations under the License. */ ...@@ -19,12 +19,13 @@ limitations under the License. */
#endif // __GNUC__ #endif // __GNUC__
#if !defined(_WIN32) #if !defined(_WIN32)
#include <dlfcn.h> // dladdr #include <dlfcn.h> // dladdr
#else // _WIN32 #include <unistd.h> // sleep
#else // _WIN32
#ifndef NOMINMAX #ifndef NOMINMAX
#define NOMINMAX // msvc max/min macro conflict with std::min/max #define NOMINMAX // msvc max/min macro conflict with std::min/max
#endif #endif
#include <windows.h> // GetModuleFileName #include <windows.h> // GetModuleFileName, Sleep
#endif #endif
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -80,6 +81,9 @@ class ErrorSummary; ...@@ -80,6 +81,9 @@ class ErrorSummary;
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
#ifdef PADDLE_WITH_CUDA
// Flag read by PADDLE_RETRY_CUDA_SUCCESS and passed to retry_sleep() as the
// delay before each retry. Only declared for CUDA builds.
DECLARE_int64(gpu_allocator_retry_time);
#endif
DECLARE_int32(call_stack_level); DECLARE_int32(call_stack_level);
namespace paddle { namespace paddle {
...@@ -924,6 +928,14 @@ DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess); ...@@ -924,6 +928,14 @@ DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess);
} \ } \
} while (0) } while (0)
/**
 * @brief Block the calling thread for roughly `millisecond` milliseconds.
 *
 * Used as the back-off delay between CUDA call retries.
 *
 * @param millisecond Delay in milliseconds; 0 returns immediately.
 *
 * On Windows this forwards to Sleep(), which takes milliseconds. On POSIX
 * systems sleep() takes *seconds*, so passing the millisecond count to it
 * directly would wait 1000x too long. The delay is therefore split into
 * whole seconds (sleep) and a sub-second remainder (usleep).
 */
inline void retry_sleep(unsigned millisecond) {
#ifdef _WIN32
  Sleep(millisecond);
#else
  const unsigned whole_seconds = millisecond / 1000;
  const unsigned remainder_ms = millisecond % 1000;
  if (whole_seconds > 0) {
    sleep(whole_seconds);
  }
  if (remainder_ms > 0) {
    // usleep() requires its argument to be below 1,000,000 microseconds,
    // which holds here because remainder_ms < 1000.
    usleep(remainder_ms * 1000);
  }
#endif
}
#define PADDLE_RETRY_CUDA_SUCCESS(COND) \ #define PADDLE_RETRY_CUDA_SUCCESS(COND) \
do { \ do { \
auto __cond__ = (COND); \ auto __cond__ = (COND); \
...@@ -933,6 +945,7 @@ DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess); ...@@ -933,6 +945,7 @@ DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess);
::paddle::platform::details::CudaStatusType< \ ::paddle::platform::details::CudaStatusType< \
__CUDA_STATUS_TYPE__>::kSuccess; \ __CUDA_STATUS_TYPE__>::kSuccess; \
while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \ while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \
retry_sleep(FLAGS_gpu_allocator_retry_time); \
__cond__ = (COND); \ __cond__ = (COND); \
++retry_count; \ ++retry_count; \
} \ } \
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册