Commit 7ef731e4 authored by Wei Xin

Improve the user experience of CUDA kernel error reporting

Parent 5508c787
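The change adds a new header, paddle/fluid/platform/error_cuda_msg.cu, with two macros: PADDLE_ENFORCE_CUDA_KERNEL records the first failed condition (message, file, line) in __device__ globals and terminates the offending thread, and PADDLE_ENFORCE_CHECK_CUDA_KERNEL, called on the host after the kernel launch, copies that record back and throws EnforceNotMet. The bernoulli, lookup_table_v2, and nll_loss CUDA kernels are updated to use this pair. A minimal sketch of the intended call pattern follows; ClipKernel, its arguments, and the launch configuration are hypothetical and only illustrate where the two macros sit, they are not part of the commit.

#include "paddle/fluid/platform/error_cuda_msg.cu"

// Hypothetical kernel: validate input on the device, record the first failure.
__global__ void ClipKernel(const float* x, float* y, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    PADDLE_ENFORCE_CUDA_KERNEL(x[i] >= 0.f && x[i] <= 1.f,
                               "x should be in [0, 1]");
    y[i] = x[i];
  }
}

void LaunchClip(const float* x, float* y, int n, cudaStream_t stream) {
  ClipKernel<<<(n + 255) / 256, 256, 0, stream>>>(x, y, n);
  // Host side: rethrow any device-side failure as EnforceNotMet.
  PADDLE_ENFORCE_CHECK_CUDA_KERNEL();
}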
@@ -20,6 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/bernoulli_op.h"
#include "paddle/fluid/platform/transform.h"
#include "paddle/fluid/platform/error_cuda_msg.cu"
namespace paddle {
namespace operators {
@@ -32,6 +33,7 @@ struct BernoulliCudaFunctor {
  __host__ __device__ T operator()(const unsigned int n, const T p) const {
    // NOTE(zhiqiu): currently, PADDLE_ENFORCE in cuda kernel may print several
    // lines of error messages, and it should be refined.
    PADDLE_ENFORCE_CUDA_KERNEL(p >= 0.0 && p <= 1.0,
                               "The probability should be >=0 and <= 1");
    PADDLE_ENFORCE(p >= 0.0 && p <= 1.0,
                   "The probability should be >=0 and <= 1, but got %f", p);
    thrust::minstd_rand rng;
@@ -61,7 +63,8 @@ class BernoulliOpKernel<platform::CUDADeviceContext, T>
        static_cast<const platform::CUDADeviceContext*>(&ctx.device_context());
    trans(*context, index_sequence_begin, index_sequence_begin + size, in_data,
          out_data, BernoulliCudaFunctor<T>(seed));
    PADDLE_ENFORCE_CHECK_CUDA_KERNEL();
  }
};
}  // namespace operators
...
@@ -17,6 +17,7 @@ limitations under the License. */
#include "paddle/fluid/operators/lookup_table_v2_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/error_cuda_msg.cu"
namespace paddle {
namespace operators {
@@ -31,6 +32,8 @@ __global__ void LookupTableV2(T *output, const T *table, const int64_t *ids,
  while (idy < K) {
    int64_t id = ids[idy];
    PADDLE_ENFORCE_CUDA_KERNEL(id >= 0 && id < N,
                               "Variable value (input) of OP(fluid.layers.embedding) "
                               "expected >= 0 and < N.");
    PADDLE_ENFORCE(
        id >= 0,
        "Variable value (input) of OP(fluid.layers.embedding) "
@@ -66,6 +69,8 @@ __global__ void LookupTableV2Grad(T *table, const T *output, const int64_t *ids,
  while (idy < K) {
    int64_t id = ids[idy];
    PADDLE_ENFORCE_CUDA_KERNEL(id >= 0 && id < N,
                               "Variable value (input) of OP(fluid.layers.embedding) "
                               "expected >= 0 and < N.");
    PADDLE_ENFORCE(
        id >= 0,
        "Variable value (input) of OP(fluid.layers.embedding) "
@@ -140,6 +145,7 @@ class LookupTableV2CUDAKernel : public framework::OpKernel<T> {
        T, 256, 4, 80,
        true><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
        output, table, ids_p, N, K, D, padding_idx);
    PADDLE_ENFORCE_CHECK_CUDA_KERNEL();
  }
};
@@ -233,6 +239,7 @@ class LookupTableV2GradCUDAKernel : public framework::OpKernel<T> {
    LookupTableV2Grad<T, 128, 8, 8><<<grids, threads, 0, dev_ctx.stream()>>>(
        d_table, d_output, ids_p, N, K, D);
      PADDLE_ENFORCE_CHECK_CUDA_KERNEL();
    }
  }
};
...
@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/operators/nll_loss_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/hostdevice.h"
#include "paddle/fluid/platform/error_cuda_msg.cu"
namespace paddle {
namespace operators {
@@ -44,6 +45,8 @@ __global__ void GPUNLLLossForward1D_no_reduce(T* out_data, const T* x_data,
      out_data[i] = 0;
      continue;
    }
    PADDLE_ENFORCE_CUDA_KERNEL(cur_label >= 0 && cur_label < n_classes,
                               "label should not be out of bounds");
    PADDLE_ENFORCE(cur_label >= 0 && cur_label < n_classes,
                   "label should not be out of bounds.");
    const T cur_weight = weight_data ? weight_data[cur_label] : (T)1;
@@ -64,6 +67,8 @@ __global__ void GPUNLLLossForward1D_with_reduce(
  for (i = threadIdx.x; i < batch_size; i += NTHREADS) {
    const auto cur_label = label_data[i];
    if (cur_label != ignore_index) {
      PADDLE_ENFORCE_CUDA_KERNEL(cur_label >= 0 && cur_label < n_classes,
                                 "label should not be out of bounds");
      PADDLE_ENFORCE(cur_label >= 0 && cur_label < n_classes,
                     "label should not be out of bounds.");
      const auto cur_weight = weight_data ? weight_data[cur_label] : (T)1;
@@ -202,6 +207,8 @@ __global__ void GPUNLLLossForward2D_no_reduce(
      out_data[index] = 0;
      continue;
    }
    PADDLE_ENFORCE_CUDA_KERNEL(cur_label >= 0 && cur_label < n_classes,
                               "label should not be out of bounds");
    PADDLE_ENFORCE(cur_label >= 0 && cur_label < n_classes,
                   "label should not be out of bounds.");
    const T cur_weight = weight_data ? weight_data[cur_label] : (T)1;
@@ -232,6 +239,8 @@ __global__ void GPUNLLLossForward2D_with_reduce(
       i < map_nelem; i += step) {
    const int64_t cur_label = label_data[toffset + i];
    if (cur_label != ignore_index) {
      PADDLE_ENFORCE_CUDA_KERNEL(cur_label >= 0 && cur_label < n_classes,
                                 "label should not be out of bounds");
      PADDLE_ENFORCE(cur_label >= 0 && cur_label < n_classes,
                     "label should not be out of bounds.");
      const T cur_weight = weight_data ? weight_data[cur_label] : (T)1;
@@ -409,6 +418,7 @@ class NLLLossCUDAKernel : public framework::OpKernel<T> {
        }
      }
    }
    PADDLE_ENFORCE_CHECK_CUDA_KERNEL();
  }
};
...
// paddle/fluid/platform/error_cuda_msg.cu (new file)
// Device/host channel for reporting the first failed check inside a CUDA
// kernel back to the host, where it is rethrown as EnforceNotMet.
#ifndef PADDLE_FLUID_PLATFORM_ERR_MSG_CU
#define PADDLE_FLUID_PLATFORM_ERR_MSG_CU

#include <cstring>

#include "paddle/fluid/platform/enforce.h"

#define __LEN_ERROR_MSG 100

// Error record filled on the device and copied back to the host.
typedef struct _CudaKerenlErrorPro {
  char _CudaErrorMsg[__LEN_ERROR_MSG];
  int _line;
  char _file[__LEN_ERROR_MSG];
} CudaKerenlErrorPro;

// Host side: fetch any error recorded by a kernel and throw EnforceNotMet.
#define PADDLE_ENFORCE_CHECK_CUDA_KERNEL()                      \
  do {                                                          \
    char msg[__LEN_ERROR_MSG] = {0};                            \
    int line = 0;                                               \
    char occur = 0;                                             \
    char file[__LEN_ERROR_MSG] = {0};                           \
    get_msg_from_cuda(&occur, msg, file, &line);                \
    if (occur) {                                                \
      throw ::paddle::platform::EnforceNotMet(msg, file, line); \
    }                                                           \
  } while (0)

// Device side: record the first violated condition and terminate the
// offending thread with an inline PTX "exit" instruction.
#define PADDLE_ENFORCE_CUDA_KERNEL(CON, MSG)   \
  do {                                         \
    if (!(CON)) {                              \
      send_error_msg(MSG, __FILE__, __LINE__); \
      asm("exit;");                            \
    }                                          \
  } while (0)

// Device-global error state: a "has an error been recorded" flag plus the
// message, file and line of the first failure.
__device__ static char _CudaKernelErrorOccurred[1] = {0};
__device__ static CudaKerenlErrorPro _CudaKernelErrorMsg;

// Bounded string copy usable from both host and device code.
__device__ __host__ inline void _strcpy(const char* src, char* target) {
  int i = 0;
  while (0 != src[i] && i < __LEN_ERROR_MSG - 1) {
    target[i] = src[i];
    i++;
  }
  target[i] = 0;
}

// Record only the first failure; later failures from other threads are
// dropped (best effort, no atomics).
__device__ __host__ inline void send_error_msg(const char* msg,
                                               const char* file, int line) {
  if (!_CudaKernelErrorOccurred[0]) {
    _strcpy(msg, _CudaKernelErrorMsg._CudaErrorMsg);
    _strcpy(file, _CudaKernelErrorMsg._file);
    _CudaKernelErrorMsg._line = line;
    _CudaKernelErrorOccurred[0] = 1;
  }
}

extern "C" {
// Copy the recorded error (if any) from the device symbols to the host.
void inline get_msg_from_cuda(char* con, char* msg, char* file, int* line) {
  CudaKerenlErrorPro temp;
  char occur[1] = {0};
  PADDLE_ENFORCE_CUDA_SUCCESS(
      cudaMemcpyFromSymbol(&temp, _CudaKernelErrorMsg, sizeof(temp)));
  PADDLE_ENFORCE_CUDA_SUCCESS(
      cudaMemcpyFromSymbol(occur, _CudaKernelErrorOccurred, sizeof(char)));
  strcpy(msg, temp._CudaErrorMsg);
  strcpy(file, temp._file);
  *line = temp._line;
  *con = occur[0];
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetLastError());
}
}
#endif  // PADDLE_FLUID_PLATFORM_ERR_MSG_CU
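A note on the behaviour, as read from the header above: only the first failing thread gets to write the record (the check-and-set on _CudaKernelErrorOccurred is not atomic, so it is best effort), and the host-side check copies the flag back but never clears it, so once a failure is recorded every later PADDLE_ENFORCE_CHECK_CUDA_KERNEL() in the same translation unit will report the same error again. The host reader can also be called directly; the snippet below is a hypothetical illustration only (it assumes <cstdio> is available, and the cudaDeviceSynchronize() is an added assumption so the kernel has finished before the symbols are read).

// Hypothetical direct use of the host-side reader (not from the commit).
void ReportKernelError() {
  char msg[__LEN_ERROR_MSG] = {0};
  char file[__LEN_ERROR_MSG] = {0};
  char occurred = 0;
  int line = 0;
  cudaDeviceSynchronize();                         // assumption: wait for the kernel
  get_msg_from_cuda(&occurred, msg, file, &line);  // copy the record from the device
  if (occurred) {
    printf("CUDA kernel check failed at %s:%d: %s\n", file, line, msg);
  }
}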