Commit 7ef731e4 authored by Wei Xin

Improve the user experience of CUDA kernel error reporting

Parent 5508c787
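The mechanism, in brief: a new header (paddle/fluid/platform/error_cuda_msg.cu, shown in full at the end of this diff) defines a macro pair. PADDLE_ENFORCE_CUDA_KERNEL runs inside the kernel; on a failed check it records the message, file, and line into device globals once and terminates the offending thread, so a failure no longer prints from every thread. PADDLE_ENFORCE_CHECK_CUDA_KERNEL runs on the host after the kernel launch; it copies the recorded state back and rethrows it as a single EnforceNotMet. A minimal sketch of the intended pattern follows; check_ids_kernel and its launcher are hypothetical stand-ins for the real operators below:

#include "paddle/fluid/platform/error_cuda_msg.cu"

// Hypothetical kernel: each thread validates one index on the device.
__global__ void check_ids_kernel(const int64_t* ids, int64_t n, int64_t limit) {
  int64_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // Records the first failure and stops this thread via PTX "exit".
    PADDLE_ENFORCE_CUDA_KERNEL(ids[i] >= 0 && ids[i] < limit,
                               "id expected >= 0 and < limit.");
  }
}

// Hypothetical host-side launcher, mirroring how the operator kernels in
// this commit use the macro pair.
void check_ids(const int64_t* dev_ids, int64_t n, int64_t limit) {
  check_ids_kernel<<<(n + 255) / 256, 256>>>(dev_ids, n, limit);
  cudaDeviceSynchronize();  // make sure the kernel has finished
  PADDLE_ENFORCE_CHECK_CUDA_KERNEL();  // throws EnforceNotMet once on failure
}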
@@ -20,6 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/operators/bernoulli_op.h"
 #include "paddle/fluid/platform/transform.h"
+#include "paddle/fluid/platform/error_cuda_msg.cu"
 
 namespace paddle {
 namespace operators {
@@ -32,6 +33,7 @@ struct BernoulliCudaFunctor {
   __host__ __device__ T operator()(const unsigned int n, const T p) const {
     // NOTE(zhiqiu): currently, PADDLE_ENFORCE in cuda kernel may print several
     // lines of error messages if it fails, and it should be refined.
+    PADDLE_ENFORCE_CUDA_KERNEL(p >= 0.0 && p <= 1.0, "The probability should be >= 0 and <= 1");
     PADDLE_ENFORCE(p >= 0.0 && p <= 1.0,
                    "The probability should be >=0 and <= 1, but got %f", p);
     thrust::minstd_rand rng;
@@ -61,6 +63,7 @@ class BernoulliOpKernel<platform::CUDADeviceContext, T>
         static_cast<const platform::CUDADeviceContext*>(&ctx.device_context());
     trans(*context, index_sequence_begin, index_sequence_begin + size, in_data,
           out_data, BernoulliCudaFunctor<T>(seed));
+    PADDLE_ENFORCE_CHECK_CUDA_KERNEL();
   }
 };
......
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/lookup_table_v2_op.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/fluid/platform/error_cuda_msg.cu"
 
 namespace paddle {
 namespace operators {
@@ -31,6 +32,8 @@ __global__ void LookupTableV2(T *output, const T *table, const int64_t *ids,
   while (idy < K) {
     int64_t id = ids[idy];
+    PADDLE_ENFORCE_CUDA_KERNEL(id >= 0 && id < N, "Variable value (input) of OP(fluid.layers.embedding) "
+                               "expected >= 0 and < N.");
     PADDLE_ENFORCE(
         id >= 0,
         "Variable value (input) of OP(fluid.layers.embedding) "
@@ -66,6 +69,8 @@ __global__ void LookupTableV2Grad(T *table, const T *output, const int64_t *ids,
   while (idy < K) {
     int64_t id = ids[idy];
+    PADDLE_ENFORCE_CUDA_KERNEL(id >= 0 && id < N, "Variable value (input) of OP(fluid.layers.embedding) "
+                               "expected >= 0 and < N.");
     PADDLE_ENFORCE(
         id >= 0,
         "Variable value (input) of OP(fluid.layers.embedding) "
@@ -140,6 +145,7 @@ class LookupTableV2CUDAKernel : public framework::OpKernel<T> {
         T, 256, 4, 80,
         true><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
         output, table, ids_p, N, K, D, padding_idx);
+    PADDLE_ENFORCE_CHECK_CUDA_KERNEL();
   }
 };
@@ -233,6 +239,7 @@ class LookupTableV2GradCUDAKernel : public framework::OpKernel<T> {
       LookupTableV2Grad<T, 128, 8, 8><<<grids, threads, 0, dev_ctx.stream()>>>(
           d_table, d_output, ids_p, N, K, D);
+      PADDLE_ENFORCE_CHECK_CUDA_KERNEL();
     }
   }
 };
......
@@ -16,6 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/nll_loss_op.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
 #include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/fluid/platform/error_cuda_msg.cu"
 
 namespace paddle {
 namespace operators {
@@ -44,6 +45,8 @@ __global__ void GPUNLLLossForward1D_no_reduce(T* out_data, const T* x_data,
       out_data[i] = 0;
       continue;
     }
+    PADDLE_ENFORCE_CUDA_KERNEL(cur_label >= 0 && cur_label < n_classes,
+                               "label should not be out of bounds");
     PADDLE_ENFORCE(cur_label >= 0 && cur_label < n_classes,
                    "label should not be out of bounds.");
     const T cur_weight = weight_data ? weight_data[cur_label] : (T)1;
@@ -64,6 +67,8 @@ __global__ void GPUNLLLossForward1D_with_reduce(
   for (i = threadIdx.x; i < batch_size; i += NTHREADS) {
     const auto cur_label = label_data[i];
     if (cur_label != ignore_index) {
+      PADDLE_ENFORCE_CUDA_KERNEL(cur_label >= 0 && cur_label < n_classes,
+                                 "label should not be out of bounds");
       PADDLE_ENFORCE(cur_label >= 0 && cur_label < n_classes,
                      "label should not be out of bounds.");
       const auto cur_weight = weight_data ? weight_data[cur_label] : (T)1;
@@ -202,6 +207,8 @@ __global__ void GPUNLLLossForward2D_no_reduce(
       out_data[index] = 0;
       continue;
     }
+    PADDLE_ENFORCE_CUDA_KERNEL(cur_label >= 0 && cur_label < n_classes,
+                               "label should not be out of bounds");
     PADDLE_ENFORCE(cur_label >= 0 && cur_label < n_classes,
                    "label should not be out of bounds.");
     const T cur_weight = weight_data ? weight_data[cur_label] : (T)1;
@@ -232,6 +239,8 @@ __global__ void GPUNLLLossForward2D_with_reduce(
        i < map_nelem; i += step) {
     const int64_t cur_label = label_data[toffset + i];
     if (cur_label != ignore_index) {
+      PADDLE_ENFORCE_CUDA_KERNEL(cur_label >= 0 && cur_label < n_classes,
+                                 "label should not be out of bounds");
       PADDLE_ENFORCE(cur_label >= 0 && cur_label < n_classes,
                      "label should not be out of bounds.");
       const T cur_weight = weight_data ? weight_data[cur_label] : (T)1;
@@ -409,6 +418,7 @@ class NLLLossCUDAKernel : public framework::OpKernel<T> {
         }
       }
     }
+    PADDLE_ENFORCE_CHECK_CUDA_KERNEL();
   }
 };
......
#ifndef PADDLE_FLUID_PLATFORM_ERR_MSG_CU
#define PADDLE_FLUID_PLATFORM_ERR_MSG_CU

#include <cstring>

#include "paddle/fluid/platform/enforce.h"

#define __LEN_ERROR_MSG 100

// First error raised from inside a CUDA kernel: the message text plus the
// file and line of the failing check.
struct CudaKernelErrorPro {
  char _CudaErrorMsg[__LEN_ERROR_MSG];
  int _line;
  char _file[__LEN_ERROR_MSG];
};

// Host-side half of the pair: copy the device-side error state back and
// throw a single EnforceNotMet if any PADDLE_ENFORCE_CUDA_KERNEL failed in a
// previously launched kernel.
#define PADDLE_ENFORCE_CHECK_CUDA_KERNEL()                      \
  do {                                                          \
    char msg[__LEN_ERROR_MSG] = {0};                            \
    int line = 0;                                               \
    char occur = 0;                                             \
    char file[__LEN_ERROR_MSG] = {0};                           \
    get_msg_from_cuda(&occur, msg, file, &line);                \
    if (occur) {                                                \
      throw ::paddle::platform::EnforceNotMet(msg, file, line); \
    }                                                           \
  } while (0)

// Device-side half: on a failed condition, record the message once and stop
// the offending thread with the PTX "exit" instruction, instead of letting
// every failing thread print its own report.
#define PADDLE_ENFORCE_CUDA_KERNEL(CON, MSG)   \
  do {                                         \
    if (!(CON)) {                              \
      send_error_msg(MSG, __FILE__, __LINE__); \
      asm("exit;");                            \
    }                                          \
  } while (0)

// Note: these statics live in device memory once per translation unit, so
// the host-side check must sit in the same .cu file that launched the
// failing kernel.
__device__ static char _CudaKernelErrorOccurred[1] = {0};
__device__ static CudaKernelErrorPro _CudaKernelErrorMsg;

// Bounded strcpy usable in device code (device code cannot call ::strcpy).
// Dual-qualified so the macros can appear in __host__ __device__ functors;
// only the device path is ever exercised.
__device__ __host__ inline void _strcpy(const char* src, char* target) {
  int i = 0;
  while (src[i] != 0 && i < __LEN_ERROR_MSG - 1) {
    target[i] = src[i];
    i++;
  }
  target[i] = 0;
}

// Record the first failure. The check-then-set on the flag is not atomic,
// so concurrently failing threads may race; the report is best-effort.
__device__ __host__ inline void send_error_msg(const char* msg,
                                               const char* file, int line) {
  if (!_CudaKernelErrorOccurred[0]) {
    _strcpy(msg, _CudaKernelErrorMsg._CudaErrorMsg);
    _strcpy(file, _CudaKernelErrorMsg._file);
    _CudaKernelErrorMsg._line = line;
    _CudaKernelErrorOccurred[0] = 1;
  }
}

// Copy the device-side error state back to the host.
inline void get_msg_from_cuda(char* con, char* msg, char* file, int* line) {
  CudaKernelErrorPro temp;
  char occur[1] = {0};
  PADDLE_ENFORCE_CUDA_SUCCESS(
      cudaMemcpyFromSymbol(&temp, _CudaKernelErrorMsg, sizeof(temp)));
  PADDLE_ENFORCE_CUDA_SUCCESS(
      cudaMemcpyFromSymbol(occur, _CudaKernelErrorOccurred, sizeof(char)));
  strcpy(msg, temp._CudaErrorMsg);
  strcpy(file, temp._file);
  *line = temp._line;
  *con = occur[0];
  // Also surface any earlier asynchronous launch failure.
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetLastError());
}

#endif  // PADDLE_FLUID_PLATFORM_ERR_MSG_CU
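Two caveats in this header are worth flagging. First, the check-then-set in send_error_msg is not atomic, so threads failing at the same time can race on the flag and interleave their copies. A race-free variant is sketched below, under the assumption that the flag is widened to an int (atomicCAS operates on 32-bit words); this is a possible refinement, not part of the commit:

__device__ static int _CudaKernelErrorFlag = 0;  // hypothetical int-wide flag

__device__ inline void send_error_msg_atomic(const char* msg,
                                             const char* file, int line) {
  // Only the one thread that flips the flag 0 -> 1 records its message, so
  // the stored message and file can never be interleaved by other threads.
  if (atomicCAS(&_CudaKernelErrorFlag, 0, 1) == 0) {
    _strcpy(msg, _CudaKernelErrorMsg._CudaErrorMsg);
    _strcpy(file, _CudaKernelErrorMsg._file);
    _CudaKernelErrorMsg._line = line;
  }
}

Second, the flag is never cleared after the host reads it, so once one kernel fails, every later PADDLE_ENFORCE_CHECK_CUDA_KERNEL in the same translation unit re-reports the same error; copying a zeroed flag back with cudaMemcpyToSymbol after reading would reset the state.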