未验证 提交 a1909aff 编写于 作者: H Huihuang Zheng 提交者: GitHub

Fix Unit Test: Add Sleep Time for CUDA Retry (#29442)

Add a sleep between CUDA retries, similar to our GPU retry logic. This is an attempt to avoid random failures of the initial GPU allocation in unit tests.
上级 e5e52249
......@@ -19,12 +19,13 @@ limitations under the License. */
#endif // __GNUC__
#if !defined(_WIN32)
#include <dlfcn.h> // dladdr
#else // _WIN32
#include <dlfcn.h> // dladdr
#include <unistd.h> // sleep
#else // _WIN32
#ifndef NOMINMAX
#define NOMINMAX // msvc max/min macro conflict with std::min/max
#endif
#include <windows.h> // GetModuleFileName
#include <windows.h> // GetModuleFileName, Sleep
#endif
#ifdef PADDLE_WITH_CUDA
......@@ -80,6 +81,9 @@ class ErrorSummary;
} // namespace platform
} // namespace paddle
#ifdef PADDLE_WITH_CUDA
DECLARE_int64(gpu_allocator_retry_time);
#endif
DECLARE_int32(call_stack_level);
namespace paddle {
......@@ -924,6 +928,14 @@ DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess);
} \
} while (0)
// Suspends the calling thread for approximately `millisecond` milliseconds
// between retry attempts.
//
// The argument is interpreted as milliseconds on every platform:
//   * Windows: forwarded to Sleep(), which takes milliseconds.
//   * POSIX:   split into whole seconds (sleep) plus a sub-second remainder
//              (usleep). sleep() counts seconds, so passing the millisecond
//              value straight to it would stall 1000x longer than requested.
//
// On POSIX the wait may end early if a signal interrupts sleep()/usleep();
// callers use this as a best-effort back-off, so that is not retried here.
inline void retry_sleep(unsigned millisecond) {
#ifdef _WIN32
  Sleep(millisecond);
#else
  const unsigned whole_seconds = millisecond / 1000;
  const unsigned leftover_ms = millisecond % 1000;
  if (whole_seconds > 0) {
    sleep(whole_seconds);
  }
  if (leftover_ms > 0) {
    // usleep() takes microseconds and requires an argument below 1,000,000;
    // leftover_ms < 1000 keeps the product within that bound.
    usleep(leftover_ms * 1000);
  }
#endif
}
#define PADDLE_RETRY_CUDA_SUCCESS(COND) \
do { \
auto __cond__ = (COND); \
......@@ -933,6 +945,7 @@ DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess);
::paddle::platform::details::CudaStatusType< \
__CUDA_STATUS_TYPE__>::kSuccess; \
while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \
retry_sleep(FLAGS_gpu_allocator_retry_time); \
__cond__ = (COND); \
++retry_count; \
} \
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册