From 7ef731e4288a057819edf25d18aaffa231f318c9 Mon Sep 17 00:00:00 2001
From: Wei Xin
Date: Wed, 23 Sep 2020 12:53:21 +0000
Subject: [PATCH] Improve the user experience of CUDA kernel error reporting

---
 paddle/fluid/operators/bernoulli_op.cu       |  5 +-
 paddle/fluid/operators/lookup_table_v2_op.cu |  7 ++
 paddle/fluid/operators/nll_loss_op.cu        | 10 +++
 paddle/fluid/platform/error_cuda_msg.cu      | 68 ++++++++++++++++++++
 4 files changed, 89 insertions(+), 1 deletion(-)
 create mode 100644 paddle/fluid/platform/error_cuda_msg.cu

diff --git a/paddle/fluid/operators/bernoulli_op.cu b/paddle/fluid/operators/bernoulli_op.cu
index 6565f5a9a21..2305ccb4c6e 100644
--- a/paddle/fluid/operators/bernoulli_op.cu
+++ b/paddle/fluid/operators/bernoulli_op.cu
@@ -20,6 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/operators/bernoulli_op.h"
 #include "paddle/fluid/platform/transform.h"
+#include "paddle/fluid/platform/error_cuda_msg.cu"
 
 namespace paddle {
 namespace operators {
@@ -32,6 +33,7 @@ struct BernoulliCudaFunctor {
   __host__ __device__ T operator()(const unsigned int n, const T p) const {
     // NOTE(zhiqiu): currently, PADDLE_ENFORCE in cuda kernel may print several
     // lines of error messages if, and it should be refined.
+    PADDLE_ENFORCE_CUDA_KERNEL(p >= 0.0 && p <= 1.0, "The probability should be >=0 and <= 1");
     PADDLE_ENFORCE(p >= 0.0 && p <= 1.0,
                    "The probability should be >=0 and <= 1, but got %f", p);
     thrust::minstd_rand rng;
@@ -61,7 +63,8 @@ class BernoulliOpKernel
         static_cast<const platform::CUDADeviceContext*>(&ctx.device_context());
     trans(*context, index_sequence_begin, index_sequence_begin + size, in_data,
           out_data, BernoulliCudaFunctor<T>(seed));
-  }
+    PADDLE_ENFORCE_CHECK_CUDA_KERNEL();
+  }
 };
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/lookup_table_v2_op.cu b/paddle/fluid/operators/lookup_table_v2_op.cu
index 551f0d3c641..2b96cdc19ad 100644
--- a/paddle/fluid/operators/lookup_table_v2_op.cu
+++ b/paddle/fluid/operators/lookup_table_v2_op.cu
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/lookup_table_v2_op.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/fluid/platform/error_cuda_msg.cu"
 
 namespace paddle {
 namespace operators {
@@ -31,6 +32,8 @@ __global__ void LookupTableV2(T *output, const T *table, const int64_t *ids,
 
   while (idy < K) {
     int64_t id = ids[idy];
+    PADDLE_ENFORCE_CUDA_KERNEL(id >= 0 && id < N, "Variable value (input) of OP(fluid.layers.embedding) "
+                               "expected >= 0 and < N.");
     PADDLE_ENFORCE(
         id >= 0,
         "Variable value (input) of OP(fluid.layers.embedding) "
@@ -66,6 +69,8 @@ __global__ void LookupTableV2Grad(T *table, const T *output, const int64_t *ids,
 
   while (idy < K) {
     int64_t id = ids[idy];
+    PADDLE_ENFORCE_CUDA_KERNEL(id >= 0 && id < N, "Variable value (input) of OP(fluid.layers.embedding) "
+                               "expected >= 0 and < N.");
     PADDLE_ENFORCE(
         id >= 0,
         "Variable value (input) of OP(fluid.layers.embedding) "
@@ -140,6 +145,7 @@ class LookupTableV2CUDAKernel : public framework::OpKernel<T> {
       LookupTableV2<
          T, 256, 4, 80,
          true><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
          output, table, ids_p, N, K, D, padding_idx);
+    PADDLE_ENFORCE_CHECK_CUDA_KERNEL();
   }
 };
@@ -233,6 +239,7 @@ class LookupTableV2GradCUDAKernel : public framework::OpKernel<T> {
 
      LookupTableV2Grad<T, 128, 8, 8><<<grids, threads, 0, dev_ctx.stream()>>>(
          d_table, d_output, ids_p, N, K, D);
+      PADDLE_ENFORCE_CHECK_CUDA_KERNEL();
     }
   }
 };
diff --git a/paddle/fluid/operators/nll_loss_op.cu b/paddle/fluid/operators/nll_loss_op.cu
index 531c175e03e..02cfcf1e00d 100644
--- a/paddle/fluid/operators/nll_loss_op.cu
+++ b/paddle/fluid/operators/nll_loss_op.cu
@@ -16,6 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/nll_loss_op.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
 #include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/fluid/platform/error_cuda_msg.cu"
 
 namespace paddle {
 namespace operators {
@@ -44,6 +45,8 @@ __global__ void GPUNLLLossForward1D_no_reduce(T* out_data, const T* x_data,
       out_data[i] = 0;
       continue;
     }
+    PADDLE_ENFORCE_CUDA_KERNEL(cur_label >= 0 && cur_label < n_classes,
+                               "label should not be out of bounds.");
     PADDLE_ENFORCE(cur_label >= 0 && cur_label < n_classes,
                    "label should not be out of bounds.");
     const T cur_weight = weight_data ? weight_data[cur_label] : (T)1;
@@ -64,6 +67,8 @@ __global__ void GPUNLLLossForward1D_with_reduce(
   for (i = threadIdx.x; i < batch_size; i += NTHREADS) {
     const auto cur_label = label_data[i];
     if (cur_label != ignore_index) {
+      PADDLE_ENFORCE_CUDA_KERNEL(cur_label >= 0 && cur_label < n_classes,
+                                 "label should not be out of bounds.");
       PADDLE_ENFORCE(cur_label >= 0 && cur_label < n_classes,
                      "label should not be out of bounds.");
       const auto cur_weight = weight_data ? weight_data[cur_label] : (T)1;
@@ -202,6 +207,8 @@ __global__ void GPUNLLLossForward2D_no_reduce(
       out_data[index] = 0;
       continue;
     }
+    PADDLE_ENFORCE_CUDA_KERNEL(cur_label >= 0 && cur_label < n_classes,
+                               "label should not be out of bounds.");
     PADDLE_ENFORCE(cur_label >= 0 && cur_label < n_classes,
                    "label should not be out of bounds.");
     const T cur_weight = weight_data ? weight_data[cur_label] : (T)1;
@@ -232,6 +239,8 @@ __global__ void GPUNLLLossForward2D_with_reduce(
        i < map_nelem; i += step) {
     const int64_t cur_label = label_data[toffset + i];
     if (cur_label != ignore_index) {
+      PADDLE_ENFORCE_CUDA_KERNEL(cur_label >= 0 && cur_label < n_classes,
+                                 "label should not be out of bounds.");
       PADDLE_ENFORCE(cur_label >= 0 && cur_label < n_classes,
                      "label should not be out of bounds.");
       const T cur_weight = weight_data ? weight_data[cur_label] : (T)1;
@@ -409,6 +418,7 @@ class NLLLossCUDAKernel : public framework::OpKernel<T> {
         }
       }
     }
+    PADDLE_ENFORCE_CHECK_CUDA_KERNEL();
   }
 };
 
diff --git a/paddle/fluid/platform/error_cuda_msg.cu b/paddle/fluid/platform/error_cuda_msg.cu
new file mode 100644
index 00000000000..a20aaa97c3a
--- /dev/null
+++ b/paddle/fluid/platform/error_cuda_msg.cu
@@ -0,0 +1,68 @@
+#ifndef PADDLE_FLUID_PLATFORM_ERR_MSG_CU
+#define PADDLE_FLUID_PLATFORM_ERR_MSG_CU
+
+#include "paddle/fluid/platform/enforce.h"
+#define __LEN_ERROR_MSG 100
+
+typedef struct _CudaKernelErrorPro {
+  char _CudaErrorMsg[__LEN_ERROR_MSG];
+  int _line;
+  char _file[__LEN_ERROR_MSG];
+} CudaKernelErrorPro;
+
+#define PADDLE_ENFORCE_CHECK_CUDA_KERNEL()                      \
+  do {                                                          \
+    char msg[__LEN_ERROR_MSG] = {0};                            \
+    int line = 0;                                               \
+    char occur = 0;                                             \
+    char file[__LEN_ERROR_MSG] = {0};                           \
+    get_msg_from_cuda(&occur, msg, file, &line);                \
+    if (occur) {                                                \
+      throw ::paddle::platform::EnforceNotMet(msg, file, line); \
+    }                                                           \
+  } while (0)
+
+#define PADDLE_ENFORCE_CUDA_KERNEL(CON, MSG)   \
+  do {                                         \
+    if (!(CON)) {                              \
+      send_error_msg(MSG, __FILE__, __LINE__); \
+      asm("exit;");                            \
+    }                                          \
+  } while (0)
+
+__device__ static char _CudaKernelErrorOccurred[1] = {0};
+__device__ static CudaKernelErrorPro _CudaKernelErrorMsg;
+
+__device__ __host__ inline void _strcpy(const char* src, char* target) {
+  int i = 0;
+  while (0 != src[i] && i < __LEN_ERROR_MSG - 1) {
+    target[i] = src[i];
+    i++;
+  }
+  target[i] = 0;
+}
+
+__device__ __host__ inline void send_error_msg(const char* msg, const char* file, int line) {
+  if (!_CudaKernelErrorOccurred[0]) {
+    _strcpy(msg, _CudaKernelErrorMsg._CudaErrorMsg);
+    _strcpy(file, _CudaKernelErrorMsg._file);
+    _CudaKernelErrorMsg._line = line;
+    _CudaKernelErrorOccurred[0] = 1;
+  }
+}
+
+extern "C" {
+inline void get_msg_from_cuda(char* con, char* msg, char* file, int* line) {
+  CudaKernelErrorPro temp;
+  char occur[1] = {0};
+  PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyFromSymbol(&temp, _CudaKernelErrorMsg, sizeof(temp)));
+  PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyFromSymbol(occur, _CudaKernelErrorOccurred, sizeof(char)));
+  _strcpy(temp._CudaErrorMsg, msg);
+  _strcpy(temp._file, file);
+  *line = temp._line;
+  *con = occur[0];
+  PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetLastError());
+}
+}
+
+#endif
-- 
GitLab
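
Note (not part of the patch): below is a minimal, self-contained sketch of the error-reporting pattern the patch introduces, for readers who want to try it outside Paddle. A failing thread records the first error message into __device__ globals and stops itself; after the launch, the host reads the message back with cudaMemcpyFromSymbol, mirroring the PADDLE_ENFORCE_CUDA_KERNEL / PADDLE_ENFORCE_CHECK_CUDA_KERNEL pair. All names here (kErrorFlag, kErrorMsg, report_error, check_labels) are illustrative and do not exist in Paddle, and the sketch uses atomicCAS so that exactly one thread records its message, a small variation on the plain flag store in error_cuda_msg.cu.

// error_report_sketch.cu -- illustrative only; none of these names exist in Paddle.
#include <cstdio>
#include <cuda_runtime.h>

#define MSG_LEN 100

// Device-side globals that hold the first recorded failure.
__device__ int kErrorFlag = 0;
__device__ char kErrorMsg[MSG_LEN] = {0};

// Record a message once; atomicCAS ensures only the first failing thread writes.
__device__ void report_error(const char* msg) {
  if (atomicCAS(&kErrorFlag, 0, 1) == 0) {
    int i = 0;
    for (; msg[i] != 0 && i < MSG_LEN - 1; ++i) kErrorMsg[i] = msg[i];
    kErrorMsg[i] = 0;
  }
}

// Device-side check, analogous to PADDLE_ENFORCE_CUDA_KERNEL.
__global__ void check_labels(const int* labels, int n, int n_classes) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n && (labels[i] < 0 || labels[i] >= n_classes)) {
    report_error("label should not be out of bounds.");
    asm("exit;");  // terminate this thread, as the patch's macro does
  }
}

int main() {
  int h_labels[4] = {0, 1, 7, 2};  // 7 is out of range for n_classes = 3
  int* d_labels = nullptr;
  cudaMalloc(&d_labels, sizeof(h_labels));
  cudaMemcpy(d_labels, h_labels, sizeof(h_labels), cudaMemcpyHostToDevice);

  check_labels<<<1, 4>>>(d_labels, 4, 3);
  cudaDeviceSynchronize();

  // Host-side readback, analogous to PADDLE_ENFORCE_CHECK_CUDA_KERNEL.
  int flag = 0;
  char msg[MSG_LEN] = {0};
  cudaMemcpyFromSymbol(&flag, kErrorFlag, sizeof(flag));
  cudaMemcpyFromSymbol(msg, kErrorMsg, sizeof(msg));
  if (flag) std::printf("CUDA kernel error: %s\n", msg);

  cudaFree(d_labels);
  return flag ? 1 : 0;
}

Since asm("exit;") is PTX inline assembly, the sketch confines it to a __global__ kernel, where only the device compilation pass ever sees it.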