Add check_kernel_launch flag to gate the kernel launch check (#32692) (#32709)

09adf20f · XiangGao · GitHub · 097d5f52 · 09adf20f · 09adf20f
3 changed files
--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
@@ -25,6 +25,7 @@ limitations under the License. */
 #include <unordered_set>

 #define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
+#include "gflags/gflags.h"
 #include "glog/logging.h"  // For VLOG()
 #include "paddle/fluid/framework/attribute.h"
 #include "paddle/fluid/framework/details/op_registry.h"
@@ -67,6 +68,8 @@ class Version;
 }  // namespace framework
 }  // namespace paddle

+DECLARE_bool(check_kernel_launch);
+
 namespace paddle {
 namespace framework {

@@ -135,14 +138,16 @@ class OpRegistry {
 };

 template <typename PlaceType>
-inline void CheckKernelLaunch(const char* op_type){};
+inline void CheckKernelLaunch(const char* op_type) {}

 #ifdef PADDLE_WITH_CUDA
 template <>
 inline void CheckKernelLaunch<::paddle::platform::CUDAPlace>(
    const char* op_type) {
+  if (FLAGS_check_kernel_launch) {
    PADDLE_ENFORCE_CUDA_LAUNCH_SUCCESS(op_type);
-};
+  }
+}
 #endif

 template <typename PlaceType, bool at_end, size_t I, typename... KernelType>

--- a/paddle/fluid/platform/flags.cc
+++ b/paddle/fluid/platform/flags.cc
@@ -578,6 +578,19 @@ DEFINE_string(tracer_mkldnn_ops_on, "",
 DEFINE_string(tracer_mkldnn_ops_off, "",
              "List of OneDNN operation types to be turned off");

+/**
+ * Debug related FLAG
+ * Name: check_kernel_launch
+ * Since Version: 2.1.0
+ * Value Range: bool, default=false
+ * Example: FLAGS_check_kernel_launch=true
+ * Note: If true, check the kernel launch status after every kernel compute.
+ */
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+DEFINE_bool(check_kernel_launch, false,
+            "Check kernel launch status after every kernel compute");
+#endif
+
 /**
 * CUDNN related FLAG
 * Name: conv2d_disable_cudnn

--- a/paddle/fluid/pybind/global_value_getter_setter.cc
+++ b/paddle/fluid/pybind/global_value_getter_setter.cc
@@ -41,6 +41,7 @@ DECLARE_int32(multiple_of_cupti_buffer_size);
 DECLARE_bool(reader_queue_speed_test_mode);
 DECLARE_int32(call_stack_level);
 DECLARE_bool(sort_sum_gradient);
+DECLARE_bool(check_kernel_launch);
 // device management
 DECLARE_int32(paddle_num_threads);
 // executor
@@ -376,7 +377,7 @@ static void RegisterGlobalVarGetterSetter() {
      FLAGS_fraction_of_gpu_memory_to_use, FLAGS_initial_gpu_memory_in_mb,
      FLAGS_reallocate_gpu_memory_in_mb, FLAGS_enable_cublas_tensor_op_math,
      FLAGS_selected_gpus, FLAGS_sync_nccl_allreduce,
-      FLAGS_conv2d_disable_cudnn);
+      FLAGS_conv2d_disable_cudnn, FLAGS_check_kernel_launch);
 #endif
 #ifdef PADDLE_WITH_XPU
  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_selected_xpus);