From 19eefef4ca8f1f006c687c0f443c3837e9f1b2f6 Mon Sep 17 00:00:00 2001
From: XiangGao
Date: Tue, 27 Apr 2021 10:00:53 +0800
Subject: [PATCH] Check for cuda errors immediately after kernel launch (#32557)

Co-authored-by: Yang Zhang
---
Review notes (text in this section is not part of the commit message):

* This copy was rebuilt from a flattened rendering of the patch in which line
  breaks and all angle-bracket text had been lost. Template parameter and
  argument lists, context-line whitespace and backslash alignment were
  re-created and should be checked against the target tree before applying
  (or apply with `git apply --ignore-whitespace`).
* E-mail addresses are missing from the From: and Co-authored-by: lines; they
  were not recoverable and have to be restored before using `git am`.
* Changes relative to the original commit 19eefef4ca:
  - dropped the stray `;` after both CheckKernelLaunch definitions;
  - PADDLE_ENFORCE_CUDA_LAUNCH_SUCCESS now spells ::paddle::platform::errors
    in full, like its build_nvidia_error_msg call, so the macro does not
    depend on the namespace it is expanded in;
  - added short comments on the new helper and on the macro.
  Hunk ranges and the diffstat reflect these changes; the `index` blob ids and
  the commit id on the first line still refer to the original commit.

 paddle/fluid/framework/op_registry.h | 26 +++++++++++++++++++++++---
 paddle/fluid/platform/enforce.h      | 12 ++++++++++++
 2 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h
index 818da7478b..9f0dc50774 100644
--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
@@ -134,6 +134,20 @@ class OpRegistry {
   static std::unique_ptr<OperatorBase> CreateOp(const OpDesc& op_desc);
 };
 
+// Hook run right after a kernel's Compute/functor call. The primary template
+// is a no-op; only the CUDAPlace specialization below performs a check.
+template <typename PlaceType>
+inline void CheckKernelLaunch(const char* op_type) {}
+
+#ifdef PADDLE_WITH_CUDA
+// CUDA kernels: raise if cudaGetLastError() reports a failure for `op_type`.
+template <>
+inline void CheckKernelLaunch<::paddle::platform::CUDAPlace>(
+    const char* op_type) {
+  PADDLE_ENFORCE_CUDA_LAUNCH_SUCCESS(op_type);
+}
+#endif
+
 template <typename PlaceType, bool at_end, size_t I, typename... KernelType>
 struct OpKernelRegistrarFunctor;
 
@@ -162,8 +176,9 @@ struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {
     RegisterKernelClass<PlaceType, T>(
         op_type, library_type, customized_type_value,
 
-        [](const framework::ExecutionContext& ctx) {
+        [op_type](const framework::ExecutionContext& ctx) {
           KERNEL_TYPE().Compute(ctx);
+          CheckKernelLaunch<PlaceType>(op_type);
         });
     constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value;
     OpKernelRegistrarFunctor<PlaceType, I + 1 == size, I + 1, KernelTypes...>
@@ -223,8 +238,13 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
 
   void operator()(const char* op_type, const char* library_type,
                   int customized_type_value) const {
-    RegisterKernelClass<PlaceType, T>(op_type, library_type,
-                                      customized_type_value, Functor());
+    RegisterKernelClass<PlaceType, T>(
+        op_type, library_type, customized_type_value,
+
+        [op_type](const framework::ExecutionContext& ctx) {
+          Functor()(ctx);
+          CheckKernelLaunch<PlaceType>(op_type);
+        });
 
     constexpr auto size =
         std::tuple_size<std::tuple<DataTypeAndKernelType...>>::value;
diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index cfca3ceadf..d42733823e 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -991,6 +991,18 @@ DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess);
     }                                                                   \
   } while (0)
 
+// Throws a Fatal error tagged with OP when cudaGetLastError() returns anything
+// other than cudaSuccess. Intended to run right after a kernel launch.
+#define PADDLE_ENFORCE_CUDA_LAUNCH_SUCCESS(OP)                                 \
+  do {                                                                         \
+    auto res = cudaGetLastError();                                             \
+    if (UNLIKELY(res != cudaSuccess)) {                                        \
+      auto msg = ::paddle::platform::build_nvidia_error_msg(res);              \
+      PADDLE_THROW(::paddle::platform::errors::Fatal(                          \
+          "CUDA error after kernel (%s): %s", OP, msg));                       \
+    }                                                                          \
+  } while (0)
+
 inline void retry_sleep(unsigned milliseconds) {
 #ifdef _WIN32
   Sleep(milliseconds);
-- 
GitLab