diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 30e96b8e5f69eccd40b0bc3b8f97f6f4297b01aa..a52d91741aff721d1fa342a0bcd8ec5e8dd9eff3 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -135,6 +135,12 @@ copy(inference_lib_dist
         SRCS ${THREADPOOL_INCLUDE_DIR}/ThreadPool.h
         DSTS ${dst_dir})
 
+set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/cudaerror/data")
+copy(inference_lib_dist
+        SRCS ${cudaerror_INCLUDE_DIR}
+        DSTS ${dst_dir})
+
+# CMakeCache Info
 copy(inference_lib_dist
         SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
         DSTS ${FLUID_INFERENCE_INSTALL_DIR})
@@ -184,7 +190,7 @@ copy(fluid_lib_dist
 )
 
 set(module "framework")
-set(framework_lib_deps framework_proto)
+set(framework_lib_deps framework_proto data_feed_proto trainer_desc_proto)
 add_dependencies(fluid_lib_dist ${framework_lib_deps})
 copy(fluid_lib_dist
     SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/trainer_desc.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/data_feed.pb.h ${src_dir}/${module}/ir/memory_optimize_pass/*.h
@@ -204,11 +210,11 @@ copy(fluid_lib_dist
 )
 
 set(module "platform")
-set(platform_lib_deps profiler_proto)
+set(platform_lib_deps profiler_proto error_codes_proto cuda_error_proto)
 add_dependencies(fluid_lib_dist ${platform_lib_deps})
 copy(fluid_lib_dist
-    SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/platform/profiler.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/platform/error_codes.pb.h
-    DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module}
+    SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/platform/*.pb.h
+    DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details ${dst_dir}/${module}
 )
 
 set(module "string")
@@ -249,6 +255,7 @@ copy(inference_lib_dist
         SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
         DSTS ${dst_dir} ${dst_dir}/lib)
 
+# CMakeCache Info
 copy(fluid_lib_dist
      SRCS ${FLUID_INFERENCE_INSTALL_DIR}/third_party ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
      DSTS ${FLUID_INFERENCE_INSTALL_DIR}
diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake
index 3b3a43a69a9ef1e1437add16e8e562415dc1dbf3..9c8a9e0af1c126b462973cee7a078b7d66f7c1ce 100644
--- a/cmake/third_party.cmake
+++ b/cmake/third_party.cmake
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+include(ExternalProject)
 # Creat a target named "third_party", which can compile external dependencies on all platform(windows/linux/mac)
 
 set(THIRD_PARTY_PATH  "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
@@ -21,6 +22,7 @@ set(THIRD_PARTY_CACHE_PATH     "${CMAKE_SOURCE_DIR}"    CACHE STRING
     "A path cache third party source code to avoid repeated download.")
 
 set(THIRD_PARTY_BUILD_TYPE Release)
+set(third_party_deps)
 
 # cache funciton to avoid repeat download code of third_party.
 # This function has 4 parameters, URL / REPOSITOR / TAG / DIR:
@@ -100,6 +102,32 @@ MACRO(UNSET_VAR VAR_NAME)
   UNSET(${VAR_NAME})
 ENDMACRO()
 
+# Function to download the dependencies during compilation
+# This function has 2 parameters, URL / NAME:
+#    1. URL:    the download url of the third-party dependency
+#    2. NAME:   the name of the file, which determines the directory name
+#
+MACRO(file_download_and_uncompress URL NAME)
+  MESSAGE(STATUS "Download dependency[${NAME}] from ${URL}")
+  SET(EXTERNAL_PROJECT_NAME "extern_download_${NAME}")
+  SET(${NAME}_INCLUDE_DIR ${THIRD_PARTY_PATH}/${NAME}/data)
+  ExternalProject_Add(
+      ${EXTERNAL_PROJECT_NAME}
+      ${EXTERNAL_PROJECT_LOG_ARGS}
+      PREFIX                ${THIRD_PARTY_PATH}/${NAME}
+      URL                   ${URL}
+      DOWNLOAD_DIR          ${THIRD_PARTY_PATH}/${NAME}/data/
+      SOURCE_DIR            ${THIRD_PARTY_PATH}/${NAME}/data/
+      DOWNLOAD_NO_PROGRESS  1
+      CONFIGURE_COMMAND     ""
+      BUILD_COMMAND         ""
+      UPDATE_COMMAND        ""
+      INSTALL_COMMAND       ""
+    )
+  list(APPEND third_party_deps ${EXTERNAL_PROJECT_NAME})
+ENDMACRO()
+
+
 # Correction of flags on different Platform(WIN/MAC) and Print Warning Message
 if (APPLE)
     if(WITH_MKL)
@@ -178,10 +206,13 @@ include(external/dlpack)    # download dlpack
 include(external/xxhash)    # download, build, install xxhash
 include(external/warpctc)   # download, build, install warpctc
 
-set(third_party_deps)
 list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog extern_boost extern_xxhash)
 list(APPEND third_party_deps extern_zlib extern_dlpack extern_warpctc extern_threadpool)
 
+# download file
+set(CUDAERROR_URL "https://paddlepaddledeps.bj.bcebos.com/cudaErrorMessage.tar.gz" CACHE STRING "" FORCE)
+file_download_and_uncompress(${CUDAERROR_URL} "cudaerror")
+
 if(WITH_AMD_GPU)
     include(external/rocprim)   # download, build, install rocprim
     list(APPEND third_party_deps extern_rocprim)
@@ -274,4 +305,4 @@ if (WITH_LITE)
     include(external/lite)
 endif (WITH_LITE)
 
-add_custom_target(third_party DEPENDS ${third_party_deps})
+add_custom_target(third_party ALL DEPENDS ${third_party_deps})
diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cu b/paddle/fluid/framework/details/nan_inf_utils_detail.cu
index 44668e491eb29106d6863685180574a936bca35a..f9f91680e34016b4aee71c5f5796c20297c6c767 100644
--- a/paddle/fluid/framework/details/nan_inf_utils_detail.cu
+++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cu
@@ -152,9 +152,7 @@ void TensorCheckerVisitor<platform::CUDADeviceContext>::apply(
       PADDLE_ENFORCE_CUDA_SUCCESS(
           cudaMemcpyAsync(gpu_str_ptr, iter->first.c_str(), op_var.length() + 1,
-                          cudaMemcpyHostToDevice, dev_ctx->stream()),
-          platform::errors::External(
-              "Async cudaMemcpy op_var info to gpu failed."));
+                          cudaMemcpyHostToDevice, dev_ctx->stream()));
     } else {  // get
       auto iter = op_var2gpu_str.find(op_var);
       PADDLE_ENFORCE_EQ(iter != op_var2gpu_str.end(), true,
diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
index 7a032acef676bfb360d00378a3db8b21c509197a..9eefb925d2061f398db53bc7d4c99ac0e8636678 100644
--- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
@@ -124,12 +124,9 @@ int SplitPlugin::enqueue(int batchSize, const void* const* inputs,
   float const* input_ptr = reinterpret_cast<float const*>(inputs[0]);
   float* const* h_odatas = reinterpret_cast<float* const*>(outputs);
   float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs_[0]);
-  PADDLE_ENFORCE_CUDA_SUCCESS(
-      cudaMemcpyAsync(output_ptrs, h_odatas,
-                      d_output_ptrs_.size() * sizeof(float*),
-                      cudaMemcpyHostToDevice, stream),
-      platform::errors::External(
-          "CUDA Memcpy failed during split plugin run."));
+  PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(
+      output_ptrs, h_odatas, d_output_ptrs_.size() * sizeof(float*),
+      cudaMemcpyHostToDevice, stream));
 
   int outer_rows = outer_rows_ * batchSize;
 
@@ -244,12 +241,9 @@ int SplitPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
       float* const* h_odatas = reinterpret_cast<float* const*>(outputs);
       float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs[0]);
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          cudaMemcpyAsync(output_ptrs, h_odatas,
-                          d_output_ptrs.size() * sizeof(float*),
-                          cudaMemcpyHostToDevice, stream),
-          platform::errors::External(
-              "CUDA Memcpy failed during split plugin run."));
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(
+          output_ptrs, h_odatas, d_output_ptrs.size() * sizeof(float*),
+          cudaMemcpyHostToDevice, stream));
 
       split_kernel<<<grid, block, 0, stream>>>(
           d_segment_offsets.size(), d_segment_offsets_ptr, input_ptr, output_ptrs,
@@ -263,12 +257,9 @@ int SplitPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
       half* const* h_odatas = reinterpret_cast<half* const*>(outputs);
       half** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs[0]);
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          cudaMemcpyAsync(output_ptrs, h_odatas,
-                          d_output_ptrs.size() * sizeof(half*),
-                          cudaMemcpyHostToDevice, stream),
-          platform::errors::External(
-              "CUDA Memcpy failed during split plugin run."));
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(
+          output_ptrs, h_odatas, d_output_ptrs.size() * sizeof(half*),
+          cudaMemcpyHostToDevice, stream));
 
       split_kernel<<<grid, block, 0, stream>>>(
           d_segment_offsets.size(), d_segment_offsets_ptr, input_ptr, output_ptrs,
diff --git a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h
index 0997f575acc4ecafb96d93df18d3fa90890bdee4..2163562a6080bac9aca822ef7c291623616ea23e 100644
--- a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h
+++ b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h
@@ -80,17 +80,13 @@ class CUDADeviceContextAllocator : public Allocator {
       : place_(place), default_stream_(default_stream) {
     platform::CUDADeviceGuard guard(place_.device);
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        cudaEventCreate(&event_, cudaEventDisableTiming),
-        platform::errors::External(
-            "Create event failed in CUDADeviceContextAllocator"));
+        cudaEventCreate(&event_, cudaEventDisableTiming));
   }
 
   ~CUDADeviceContextAllocator() {
     if (event_) {
       platform::CUDADeviceGuard guard(place_.device);
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          cudaEventDestroy(event_),
-          "Destory event failed in CUDADeviceContextAllocator destroctor");
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event_));
     }
   }
 
@@ -103,12 +99,9 @@ class CUDADeviceContextAllocator : public Allocator {
     auto allocation =
         new CUDADeviceContextAllocation(memory::Alloc(place_, size));
     // Wait for the event on stream
+    PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, default_stream_));
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        cudaEventRecord(event_, default_stream_),
-        "Failed to record event in CUDADeviceContextAllocator");
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        cudaStreamWaitEvent(default_stream_, event_, 0),
-        "Failed to wait event in CUDADeviceContextAllocator");
+        cudaStreamWaitEvent(default_stream_, event_, 0));
     return allocation;
   }
diff --git a/paddle/fluid/operators/argsort_op.cu b/paddle/fluid/operators/argsort_op.cu
index 006bf559195aa23d08433618d36231aeda228b3d..cbd7e33bc6b7238eacb29ebab1306802d974a90b 100644
--- a/paddle/fluid/operators/argsort_op.cu
+++ b/paddle/fluid/operators/argsort_op.cu
@@ -141,12 +141,7 @@ void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input,
         num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8,
         cu_stream);
   }
-  PADDLE_ENFORCE_CUDA_SUCCESS(
-      err,
-      "ArgSortOP failed as could not launch "
-      "cub::DeviceSegmentedRadixSort::SortPairsDescending to calculate"
-      "temp_storage_bytes, status:%s.",
-      temp_storage_bytes, cudaGetErrorString(err));
+  PADDLE_ENFORCE_CUDA_SUCCESS(err);
   Tensor temp_storage;
   temp_storage.mutable_data<uint8_t>(ctx.GetPlace(), temp_storage_bytes);
 
@@ -165,12 +160,7 @@ void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input,
         cu_stream);
   }
 
-  PADDLE_ENFORCE_CUDA_SUCCESS(
-      err,
-      "ArgSortOP failed as could not launch "
-      "cub::DeviceSegmentedRadixSort::SortPairsDescending to sort input, "
-      "temp_storage_bytes:%d status:%s.",
-      temp_storage_bytes, cudaGetErrorString(err));
+  PADDLE_ENFORCE_CUDA_SUCCESS(err);
 }
 
 template <typename T, typename IndType>
diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cu b/paddle/fluid/operators/fused/conv_fusion_op.cu
index f2068d3aadf77594deb409a130e770a745b96741..1b8360a3092f5640a4de83a038dee60bdfe8b83e 100644
--- a/paddle/fluid/operators/fused/conv_fusion_op.cu
+++ b/paddle/fluid/operators/fused/conv_fusion_op.cu
@@ -167,13 +167,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
     conv_desc.descriptor<T>(padding_common, strides, dilations);
     PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnSetConvolutionGroupCount(cudnn_conv_desc,
-                                                         groups),
-        platform::errors::External(
-            "Call of cudnnSetConvolutionGroupCount(cudnn_conv_desc, groups) "
-            "failed, where cudnn_conv_desc is configured: padding = [%s], "
-            "strides = [%s], dilations = [%s]; groups = %d",
-            framework::make_ddim(padding_common), framework::make_ddim(strides),
-            framework::make_ddim(dilations), groups));
+                                                         groups));
 
     cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
         layout, framework::vectorize<int>(transformed_input.dims()));
@@ -204,15 +198,8 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
     auto handle = dev_ctx.cudnn_handle();
     auto workspace_handle = dev_ctx.cudnn_workspace_handle();
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnSetConvolutionMathType(cudnn_conv_desc,
-                                                       CUDNN_DEFAULT_MATH),
-        platform::errors::External(
-            "Call of cudnnSetConvolutionMathType(cudnn_conv_desc, "
-            "CUDNN_DEFAULT_MATH) failed, where cudnn_conv_desc is configured: "
-            "padding = %d, strides = %d, dilations = %d.",
-            framework::make_ddim(padding_common), framework::make_ddim(strides),
-            framework::make_ddim(dilations)));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
+        cudnn_conv_desc, CUDNN_DEFAULT_MATH));
 
     auto x_dims = framework::vectorize(transformed_input.dims());
     auto f_dims = framework::vectorize(filter->dims());
@@ -221,9 +208,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
           platform::dynload::cudnnGetConvolutionForwardAlgorithm(
               handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
               cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-              workspace_size_limit, &algo),
-          platform::errors::External(
-              "Call of cudnnGetConvolutionForwardAlgorithm failed."));
+              workspace_size_limit, &algo));
       VLOG(3) << "cuDNN forward algo " << algo;
     } else {
       std::function<cudnnConvolutionFwdAlgo_t()> search_func =
@@ -237,9 +222,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
                 handle, cudnn_input_desc, input_data, cudnn_filter_desc,
                 filter_data, cudnn_conv_desc, cudnn_output_desc, output_data,
                 kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
-                fwd_perf_stat.data(), cudnn_workspace, workspace_size_limit),
-            platform::errors::External(
-                "Call of cudnnFindConvolutionForwardAlgorithmEx failed."));
+                fwd_perf_stat.data(), cudnn_workspace, workspace_size_limit));
       };
       workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit);
       VLOG(3) << "Perf result: (algo: stat, time, memory)";
@@ -273,9 +256,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
     PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
             handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
-            cudnn_output_desc, algo, &workspace_size_in_bytes),
-        platform::errors::External(
-            "Call of cudnnGetConvolutionForwardWorkspaceSize failed."));
+            cudnn_output_desc, algo, &workspace_size_in_bytes));
     PADDLE_ENFORCE_LE(
         workspace_size_in_bytes, workspace_size_limit,
         platform::errors::InvalidArgument(
@@ -292,20 +273,15 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
     // ------------- cudnn conv forward and bias add ---------------------
     ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
     auto cudnn_func = [&](void* cudnn_workspace) {
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          platform::dynload::cudnnConvolutionForward(
-              handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc,
-              filter_data, cudnn_conv_desc, algo, cudnn_workspace,
-              workspace_size_in_bytes, &beta, cudnn_output_desc, output_data),
-          platform::errors::External(
-              "Call of cudnnConvolutionForward failed."));
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnConvolutionForward(
+          handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc,
+          filter_data, cudnn_conv_desc, algo, cudnn_workspace,
+          workspace_size_in_bytes, &beta, cudnn_output_desc, output_data));
     };
     workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnAddTensor(handle, &alpha, cudnn_bias_desc,
-                                          bias_data, &alpha,
-                                          cudnn_output_desc, output_data),
-        platform::errors::External("Call of cudnnAddTensor failed."));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnAddTensor(
+        handle, &alpha, cudnn_bias_desc, bias_data, &alpha, cudnn_output_desc,
+        output_data));
   } else {
     if (activation == "identity") {
       algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
@@ -320,9 +296,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
               cudnn_filter_desc, filter_data, cudnn_conv_desc, algo,
               cudnn_workspace, workspace_size_in_bytes, &alpha2,
               cudnn_output_desc, residual_data, cudnn_bias_desc, bias_data,
-              cudnn_act_desc, cudnn_output_desc, output_data),
-          platform::errors::External(
-              "Call of cudnnConvolutionBiasActivationForward failed."));
+              cudnn_act_desc, cudnn_output_desc, output_data));
       };
       workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
     }
diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_activation_op.cu
index 2e308657936c093d0fdbddd8a019be1210b7cd34..32eaf1180977a070bd14b3eb79eaaa2357bdb2b0 100644
--- a/paddle/fluid/operators/fused/fused_bn_activation_op.cu
+++ b/paddle/fluid/operators/fused/fused_bn_activation_op.cu
@@ -108,32 +108,21 @@ class FusedBatchNormActKernel<platform::CUDADeviceContext, T>
     cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
 
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnCreateTensorDescriptor(&data_desc_),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnCreateTensorDescriptor(&data_desc_)."));
+        platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnCreateTensorDescriptor(&bn_param_desc_)."));
+        platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
 
     VLOG(3) << "Setting descriptors.";
     std::vector<int> dims = {N, C, H, W, D};
     std::vector<int> strides = {H * W * D * C, 1, W * D * C, D * C, C};
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnSetTensorNdDescriptor(
-            data_desc_, CudnnDataType<T>::type,
-            x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()),
-        platform::errors::External(
-            "The error has happened when calling cudnnSetTensorNdDescriptor."));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
+        data_desc_, CudnnDataType<T>::type,
+        x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
     PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_,
-                                                         data_desc_, mode_),
-        platform::errors::External("The error has happened when calling "
-                                   "cudnnDeriveBNTensorDescriptor."));
+                                                         data_desc_, mode_));
 
     double this_factor = 1. - momentum;
     cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ACTIVATION;
@@ -166,10 +155,7 @@ class FusedBatchNormActKernel<platform::CUDADeviceContext, T>
             /*yDesc=*/data_desc_,
             /*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
             /*activationDesc=*/activation_desc_,
-            /*sizeInBytes=*/&workspace_size),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize."));
+            /*sizeInBytes=*/&workspace_size));
 
     // -------------- cudnn batchnorm reserve space --------------
     PADDLE_ENFORCE_CUDA_SUCCESS(
@@ -179,10 +165,7 @@ class FusedBatchNormActKernel<platform::CUDADeviceContext, T>
             /*bnOps=*/bnOps_,
             /*activationDesc=*/activation_desc_,
             /*xDesc=*/data_desc_,
-            /*sizeInBytes=*/&reserve_space_size),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnGetBatchNormalizationTrainingExReserveSpaceSize."));
+            /*sizeInBytes=*/&reserve_space_size));
 
     reserve_space_ptr = reserve_space->mutable_data(ctx.GetPlace(), x->type(),
                                                     reserve_space_size);
@@ -204,22 +187,13 @@ class FusedBatchNormActKernel<platform::CUDADeviceContext, T>
             saved_variance->template mutable_data<BatchNormParamType<T>>(
                 ctx.GetPlace()),
             activation_desc_, workspace_ptr, workspace_size, reserve_space_ptr,
-            reserve_space_size),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnBatchNormalizationForwardTrainingEx."));
+            reserve_space_size));
 
     // clean when exit.
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnDestroyTensorDescriptor(data_desc_),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnDestroyTensorDescriptor(data_desc_)."));
+        platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnDestroyTensorDescriptor(bn_param_desc_)."));
+        platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
   }
 };
 
@@ -298,15 +272,9 @@ class FusedBatchNormActGradKernel<platform::CUDADeviceContext, T>
     cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
 
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnCreateTensorDescriptor(&data_desc_),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnCreateTensorDescriptor(&data_desc_)."));
+        platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnCreateTensorDescriptor(&bn_param_desc_)."));
+        platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
     if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
       LOG(ERROR) << "Provided epsilon is smaller than "
                  << "CUDNN_BN_MIN_EPSILON. Setting it to "
@@ -314,17 +282,12 @@ class FusedBatchNormActGradKernel<platform::CUDADeviceContext, T>
     }
     epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
 
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnSetTensorNdDescriptor(
-            data_desc_, CudnnDataType<T>::type,
-            x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()),
-        platform::errors::External(
-            "The error has happened when calling cudnnSetTensorNdDescriptor."));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
+        data_desc_, CudnnDataType<T>::type,
+        x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
     PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_,
-                                                         data_desc_, mode_),
-        platform::errors::External("The error has happened when calling "
-                                   "cudnnDeriveBNTensorDescriptor."));
+                                                         data_desc_, mode_));
 
     const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
     const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
@@ -354,10 +317,7 @@ class FusedBatchNormActGradKernel<platform::CUDADeviceContext, T>
             /*dxDesc=*/data_desc_,
             /*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
             /*activationDesc=*/activation_desc_,
-            /*sizeInBytes=*/&workspace_size),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnGetBatchNormalizationBackwardExWorkspaceSize."));
+            /*sizeInBytes=*/&workspace_size));
 
     workspace_ptr = workspace_tensor.mutable_data(ctx.GetPlace(), x->type(),
                                                   workspace_size);
@@ -395,21 +355,13 @@ class FusedBatchNormActGradKernel<platform::CUDADeviceContext, T>
             /*workspace=*/workspace_ptr,
             /*workSpaceSizeInBytes=*/workspace_size,
             /*reserveSpace=*/const_cast<T *>(reserve_space->template data<T>()),
-            /*reserveSpaceSizeInBytes=*/reserve_space_size),
-        platform::errors::External("The error has happened when calling "
-                                   "cudnnBatchNormalizationBackwardEx."));
+            /*reserveSpaceSizeInBytes=*/reserve_space_size));
 
     // clean when exit.
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnDestroyTensorDescriptor(data_desc_),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnDestroyTensorDescriptor(data_desc_)."));
+        platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnDestroyTensorDescriptor(bn_param_desc_)."));
+        platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
   }
 };
diff --git a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc
index b61ef8e566b77607e6db9b51b270836338e06160..17cb4556d45ef3adee2adc0d2f19ea048e096982 100644
--- a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc
+++ b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc
@@ -46,13 +46,9 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel<T> {
     cudnnTensorDescriptor_t in_desc;
     cudnnTensorDescriptor_t out_desc;
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnCreateTensorDescriptor(&in_desc),
-        platform::errors::External("Create cudnn tensor descriptor failed in "
-                                   "transpose_flatten_concat_fusion op."));
+        platform::dynload::cudnnCreateTensorDescriptor(&in_desc));
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnCreateTensorDescriptor(&out_desc),
-        platform::errors::External("Create cudnn tensor descriptor failed in "
-                                   "transpose_flatten_concat_fusion op."));
+        platform::dynload::cudnnCreateTensorDescriptor(&out_desc));
     cudnnDataType_t cudnn_dtype = CudnnDataType<T>::type;
 
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
@@ -91,24 +87,15 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel<T> {
         dims_y[i] = 1;
       }
 
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          platform::dynload::cudnnSetTensorNdDescriptor(
-              in_desc, cudnn_dtype, max_dim, dims_y.data(), stride_x.data()),
-          platform::errors::External("Create cudnn tensorNd descriptor failed "
-                                     "in transpose_flatten_concat op."));
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          platform::dynload::cudnnSetTensorNdDescriptor(
-              out_desc, cudnn_dtype, max_dim, dims_y.data(), stride_y.data()),
-          platform::errors::External("Create cudnn tensorNd descriptor failed "
-                                     "in transpose_flatten_concat op."));
-
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          platform::dynload::cudnnTransformTensor(
-              handle, CudnnDataType<T>::kOne(), in_desc,
-              static_cast<const void*>(ins[k]->data<T>()),
-              CudnnDataType<T>::kZero(), out_desc, static_cast<void*>(odata)),
-          platform::errors::External("Create cudnn transform tensor failed in "
-                                     "transpose_flatten_concat op."));
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
+          in_desc, cudnn_dtype, max_dim, dims_y.data(), stride_x.data()));
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
+          out_desc, cudnn_dtype, max_dim, dims_y.data(), stride_y.data()));
+
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnTransformTensor(
+          handle, CudnnDataType<T>::kOne(), in_desc,
+          static_cast<const void*>(ins[k]->data<T>()),
+          CudnnDataType<T>::kZero(), out_desc, static_cast<void*>(odata)));
       if (concat_axis == 0) {
         odata += osize;
       } else {
@@ -117,13 +104,9 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel<T> {
       }
     }
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnDestroyTensorDescriptor(in_desc),
-        platform::errors::External(
-            "Destory cudnn descriptor failed in transpose_flatten_concat op."));
+        platform::dynload::cudnnDestroyTensorDescriptor(in_desc));
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnDestroyTensorDescriptor(out_desc),
-        platform::errors::External(
-            "Destory cudnn descriptor failed in transpose_flatten_concat op."));
+        platform::dynload::cudnnDestroyTensorDescriptor(out_desc));
   }
 };
diff --git a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc
index c266b0d32b14ad0a3b8fe1ae3b6f1c3d44ac269e..3bf34fc685ee8af39b66f444c35d606c4b5d8ffb 100644
--- a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc
@@ -60,13 +60,10 @@ class CUDNNGridSampleOpKernel : public framework::OpKernel<T> {
     cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
         DataLayout::kNCHW, framework::vectorize<int>(output->dims()));
 
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnSpatialTfSamplerForward(
-            handle, cudnn_st_desc, CudnnDataType<T>::kOne(), cudnn_input_desc,
-            input_data, grid_data, CudnnDataType<T>::kZero(), cudnn_output_desc,
-            output_data),
-        platform::errors::InvalidArgument(
-            "cudnnSpatialTfSamplerForward in Op(grid_sampler) failed"));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSpatialTfSamplerForward(
+        handle, cudnn_st_desc, CudnnDataType<T>::kOne(), cudnn_input_desc,
+        input_data, grid_data, CudnnDataType<T>::kZero(), cudnn_output_desc,
+        output_data));
   }
 };
 
@@ -122,9 +119,7 @@ class CUDNNGridSampleGradOpKernel : public framework::OpKernel<T> {
             input_data, CudnnDataType<T>::kZero(), cudnn_input_grad_desc,
             input_grad_data, CudnnDataType<T>::kOne(), cudnn_output_grad_desc,
             output_grad_data, grid_data, CudnnDataType<T>::kZero(),
-            grid_grad_data),
-        platform::errors::InvalidArgument(
-            "cudnnSpatialTfSamplerBackward in Op(grid_sampler) failed"));
+            grid_grad_data));
   }
 };
diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h
index c0ab35b0e753c6ca9357ed2a92d0e493167e5cca..8e903a4eccc745932c2de1ec88908098a0d82d34 100644
--- a/paddle/fluid/operators/math/blas_impl.cu.h
+++ b/paddle/fluid/operators/math/blas_impl.cu.h
@@ -41,16 +41,12 @@ struct CUBlas<float> {
 
   template <typename... ARGS>
   static void SCAL(ARGS... args) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cublasSscal(args...),
-        platform::errors::External("dynload cublasSscal lib failed"));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasSscal(args...));
   }
 
   template <typename... ARGS>
   static void VCOPY(ARGS... args) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cublasScopy(args...),
-        platform::errors::External("dynload cublasScopy lib failed"));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasScopy(args...));
   }
 
   template <typename... ARGS>
@@ -108,16 +104,12 @@ struct CUBlas<double> {
 
   template <typename... ARGS>
   static void SCAL(ARGS... args) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cublasDscal(args...),
-        platform::errors::External("dynload cublasDscal lib failed"));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDscal(args...));
  }
 
   template <typename... ARGS>
   static void VCOPY(ARGS... args) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cublasDcopy(args...),
-        platform::errors::External("dynload cublasDcopy lib failed"));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDcopy(args...));
   }
 
   template <typename... ARGS>
diff --git a/paddle/fluid/operators/mean_op.cu b/paddle/fluid/operators/mean_op.cu
index d2b01fafb731f64b141af529c2d60ced78e0acbe..081c077ab73c24640e324e4f1235e905cb122549 100644
--- a/paddle/fluid/operators/mean_op.cu
+++ b/paddle/fluid/operators/mean_op.cu
@@ -59,20 +59,14 @@ class MeanCUDAKernel : public framework::OpKernel<T> {
     auto err = cub::DeviceReduce::Sum(nullptr, temp_storage_bytes, trans_x,
                                       out_data, size_prob, stream);
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        err, platform::errors::External(
-                 "MeanOP failed to get reduce workspace size %s.",
-                 cudaGetErrorString(err)));
+    PADDLE_ENFORCE_CUDA_SUCCESS(err);
     framework::Tensor tmp;
     auto* temp_storage = tmp.mutable_data<uint8_t>(
         framework::make_ddim({static_cast<int64_t>(temp_storage_bytes)}),
        context.GetPlace());
     err = cub::DeviceReduce::Sum(temp_storage, temp_storage_bytes, trans_x,
                                  out_data, size_prob, stream);
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        err, platform::errors::External(
-                 "MeanOP failed to run CUDA reduce computation: %s.",
-                 cudaGetErrorString(err)));
+    PADDLE_ENFORCE_CUDA_SUCCESS(err);
   }
 };
diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc
index b237df130abcc8b8d023a771fc641b7a29d83335..e72820611d3a99a83279c811e417d05ae39faa70 100644
--- a/paddle/fluid/operators/reader/buffered_reader.cc
+++ b/paddle/fluid/operators/reader/buffered_reader.cc
@@ -104,13 +104,9 @@ void BufferedReader::ReadAsync(size_t i) {
       // gpu memory immediately without waiting gpu kernel ends
       platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
       PADDLE_ENFORCE_CUDA_SUCCESS(
-          cudaEventRecord(events_[i].get(), compute_stream_),
-          platform::errors::Fatal(
-              "cudaEventRecord raises unexpected exception"));
+          cudaEventRecord(events_[i].get(), compute_stream_));
       PADDLE_ENFORCE_CUDA_SUCCESS(
-          cudaStreamWaitEvent(stream_.get(), events_[i].get(), 0),
-          platform::errors::Fatal(
-              "cudaStreamWaitEvent raises unexpected exception"));
+          cudaStreamWaitEvent(stream_.get(), events_[i].get(), 0));
 
       platform::RecordEvent record_event("BufferedReader:MemoryCopy");
       for (size_t i = 0; i < cpu.size(); ++i) {
@@ -138,17 +134,11 @@ void BufferedReader::ReadAsync(size_t i) {
                          size);
             memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
                          cuda_pinned_place, cuda_pinned_ptr, size,
                          stream_.get());
-            PADDLE_ENFORCE_CUDA_SUCCESS(
-                cudaStreamSynchronize(stream_.get()),
-                platform::errors::Fatal(
-                    "cudaStreamSynchronize raises unexpected exception"));
+            PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_.get()));
         }
         gpu[i].set_lod(cpu[i].lod());
       }
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          cudaStreamSynchronize(stream_.get()),
-          platform::errors::Fatal(
-              "cudaStreamSynchronize raises unexpected exception"));
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_.get()));
     }
 #endif
     return i;
diff --git a/paddle/fluid/operators/sync_batch_norm_op.cu.h b/paddle/fluid/operators/sync_batch_norm_op.cu.h
index 083d22aa2a38a95225901f4f34f998d86b41e670..cfb9e16942c25f0397d00f17b0c3353a24b7acd8 100644
--- a/paddle/fluid/operators/sync_batch_norm_op.cu.h
+++ b/paddle/fluid/operators/sync_batch_norm_op.cu.h
@@ -191,12 +191,9 @@ void SyncBatchNormFunctor(const framework::ExecutionContext &ctx,
   if (comm) {
     int dtype = platform::ToNCCLDataType(mean_out->type());
     // In-place operation
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::ncclAllReduce(stats, stats, 2 * C + 1,
-                                         static_cast<ncclDataType_t>(dtype),
-                                         ncclSum, comm, stream),
-        platform::errors::InvalidArgument(
-            "ncclAllReduce in Op(sync_batch_norm) failed"));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
+        stats, stats, 2 * C + 1, static_cast<ncclDataType_t>(dtype), ncclSum,
+        comm, stream));
   }
 #endif
 
@@ -468,12 +465,9 @@ void SyncBatchNormGradFunctor(
   if (comm) {
     int dtype = platform::ToNCCLDataType(scale->type());
     // In-place operation
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::ncclAllReduce(stats, stats, 2 * C + 1,
-                                         static_cast<ncclDataType_t>(dtype),
-                                         ncclSum, comm, stream),
-        platform::errors::InvalidArgument(
-            "ncclAllReduce in Op(sync_batch_norm) failed"));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
+        stats, stats, 2 * C + 1, static_cast<ncclDataType_t>(dtype), ncclSum,
+        comm, stream));
   }
 #endif
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index b954d1b7658ae4ff9e00b55eeae1b2c1f935867d..acc5a0f172d4127cb2f05babd2e49e607658a4bd 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -1,6 +1,6 @@
 proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto simple_threadpool)
 proto_library(error_codes_proto SRCS error_codes.proto)
-
+proto_library(cuda_error_proto SRCS cuda_error.proto)
 
 if (WITH_PYTHON)
   py_proto_compile(profiler_py_proto SRCS profiler.proto)
@@ -28,7 +28,7 @@ cc_library(flags SRCS flags.cc DEPS gflags)
 cc_library(errors SRCS errors.cc DEPS error_codes_proto)
 cc_test(errors_test SRCS errors_test.cc DEPS errors enforce)
 
-cc_library(enforce INTERFACE SRCS enforce.cc DEPS flags errors)
+cc_library(enforce INTERFACE SRCS enforce.cc DEPS flags errors cuda_error_proto)
 cc_test(enforce_test SRCS enforce_test.cc DEPS stringpiece enforce)
 
 set(CPU_INFO_DEPS gflags glog enforce)
diff --git a/paddle/fluid/platform/cuda_error.proto b/paddle/fluid/platform/cuda_error.proto
new file mode 100644
index 0000000000000000000000000000000000000000..b55e0af81ee6f8fb47d558287c7f902ef0fde81b
--- /dev/null
+++ b/paddle/fluid/platform/cuda_error.proto
@@ -0,0 +1,35 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+syntax = "proto2";
+package paddle.platform.proto;
+
+message MessageDesc {
+  // Indicates the type of error
+  required int32 errorCode = 1;
+  // Indicates the message of error
+  required string errorMessage = 2;
+}
+
+message AllMessageDesc {
+  // Version of cuda API
+  required int32 version = 1;
+  // Error messages of different error types
+  repeated MessageDesc Messages = 2;
+}
+
+message cudaerrorDesc {
+  // Error messages of different cuda versions(9.0/10.0/10.2)
+  repeated AllMessageDesc AllMessages = 2;
+}
\ No newline at end of file
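
Note: the `cudaErrorMessage.pb` file that `file_download_and_uncompress` fetches must follow the layout declared in `cuda_error.proto` above, since `enforce.h` loads it with `ParseFromIstream`. A minimal sketch of how such a file could be produced with the protoc-generated classes is shown below; the generated header name matches the one included by `enforce.h` in this patch, but the sample error text and the generator program itself are illustrative assumptions, not part of the patch:

```cpp
// Sketch: build a cudaErrorMessage.pb in the layout enforce.h expects.
// Assumes cuda_error.pb.{h,cc} were generated by protoc from cuda_error.proto.
#include <fstream>

#include "paddle/fluid/platform/cuda_error.pb.h"

int main() {
  paddle::platform::proto::cudaerrorDesc all_versions;
  // One AllMessageDesc per CUDA release; 100 is the value build_nvidia_error_msg
  // in enforce.h uses for CUDA 10.x (90 for 9.x).
  paddle::platform::proto::AllMessageDesc* v10 = all_versions.add_allmessages();
  v10->set_version(100);
  // One MessageDesc per cudaError_t value, keyed by the numeric error code.
  paddle::platform::proto::MessageDesc* msg = v10->add_messages();
  msg->set_errorcode(1);  // e.g. cudaErrorInvalidValue
  msg->set_errormessage("Illustrative [Advise: ...] text for this error code.");
  // Serialize in the binary form that ParseFromIstream() reads back.
  std::ofstream fout("cudaErrorMessage.pb", std::ios::out | std::ios::binary);
  return all_versions.SerializeToOstream(&fout) ? 0 : 1;
}
```
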
diff --git a/paddle/fluid/platform/cuda_helper.h b/paddle/fluid/platform/cuda_helper.h
index 54f5e911e3d0f5377734129a9d043837d68476e8..74cf5545239f1dde1a6f81ebdf7f735a132133d9 100644
--- a/paddle/fluid/platform/cuda_helper.h
+++ b/paddle/fluid/platform/cuda_helper.h
@@ -29,14 +29,7 @@ namespace platform {
 class CublasHandleHolder {
  public:
   CublasHandleHolder(cudaStream_t stream, cublasMath_t math_type) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        dynload::cublasCreate(&handle_),
-        platform::errors::External(
-            "The cuBLAS library was not initialized. This is usually caused by "
-            "an error in the CUDA Runtime API called by the cuBLAS routine, or "
-            "an error in the hardware setup.\n"
-            "To correct: check that the hardware, an appropriate version of "
-            "the driver, and the cuBLAS library are correctly installed."));
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cublasCreate(&handle_));
     PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cublasSetStream(handle_, stream));
 #if CUDA_VERSION >= 9000
     if (math_type == CUBLAS_TENSOR_OP_MATH) {
diff --git a/paddle/fluid/platform/cuda_resource_pool.cc b/paddle/fluid/platform/cuda_resource_pool.cc
index 1828f0760a79ae19cb919ef0cd4b7de3944851d5..65c8b96028aceef09c4deff6cee92c3d970a659f 100644
--- a/paddle/fluid/platform/cuda_resource_pool.cc
+++ b/paddle/fluid/platform/cuda_resource_pool.cc
@@ -27,18 +27,13 @@ CudaStreamResourcePool::CudaStreamResourcePool() {
       platform::SetDeviceId(dev_idx);
       cudaStream_t stream;
       PADDLE_ENFORCE_CUDA_SUCCESS(
-          cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking),
-          platform::errors::Fatal(
-              "cudaStreamCreateWithFlags raises unexpected exception"));
+          cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
       return stream;
     };
 
     auto deleter = [dev_idx](cudaStream_t stream) {
       platform::SetDeviceId(dev_idx);
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          cudaStreamDestroy(stream),
-          platform::errors::Fatal(
-              "cudaStreamDestroy raises unexpected exception"));
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream));
     };
 
     pool_.emplace_back(
@@ -72,18 +67,13 @@ CudaEventResourcePool::CudaEventResourcePool() {
       platform::SetDeviceId(dev_idx);
       cudaEvent_t event;
       PADDLE_ENFORCE_CUDA_SUCCESS(
-          cudaEventCreateWithFlags(&event, cudaEventDisableTiming),
-          platform::errors::Fatal(
-              "cudaEventCreateWithFlags raises unexpected exception"));
+          cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
       return event;
     };
 
     auto deleter = [dev_idx](cudaEvent_t event) {
       platform::SetDeviceId(dev_idx);
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          cudaEventDestroy(event),
-          platform::errors::Fatal(
-              "cudaEventDestroy raises unexpected exception"));
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event));
     };
 
     pool_.emplace_back(ResourcePool<CudaEventObject>::Create(creator, deleter));
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 322a32796787e8099c5156e495ef54b1114226eb..3bffa72bb927de9366a9814cf3b674e5c60881d2 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -278,12 +278,9 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
               << "Please recompile or reinstall Paddle with compatible CUDNN "
                  "version.";
     }
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreate(&cudnn_handle_));
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        dynload::cudnnCreate(&cudnn_handle_),
-        "Failed to create Cudnn handle in DeviceContext");
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        dynload::cudnnSetStream(cudnn_handle_, stream_),
-        "Failed to set stream for Cudnn handle in DeviceContext");
+        dynload::cudnnSetStream(cudnn_handle_, stream_));
   } else {
     cudnn_handle_ = nullptr;
   }
@@ -302,8 +299,7 @@ CUDADeviceContext::~CUDADeviceContext() {
   eigen_device_.reset();
   PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream_));
   if (cudnn_handle_) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroy(cudnn_handle_),
-                                "Failed to destory Cudnn handle");
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroy(cudnn_handle_));
   }
 #if defined(PADDLE_WITH_NCCL)
   if (nccl_comm_) {
@@ -325,10 +321,7 @@ void CUDADeviceContext::Wait() const {
   }
 #endif
 
-  PADDLE_ENFORCE_CUDA_SUCCESS(
-      e_sync, platform::errors::Fatal(
-                  "cudaStreamSynchronize raises error: %s, errono: %d",
-                  cudaGetErrorString(e_sync), static_cast<int>(e_sync)));
+  PADDLE_ENFORCE_CUDA_SUCCESS(e_sync);
 }
 
 int CUDADeviceContext::GetComputeCapability() const {
diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index 99f83d9732029920d4e41533edb72634b69b0d45..f2e0c52170b6012ce8d34248923e9b8c7d82ed81 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -18,6 +18,13 @@ limitations under the License. */
 #include <cxxabi.h>  // for __cxa_demangle
 #endif  // __GNUC__
 
+#if !defined(_WIN32)
+#include <dlfcn.h>    // dladdr
+#else                 // _WIN32
+#define NOMINMAX      // msvc max/min macro conflict with std::min/max
+#include <windows.h>  // GetModuleFileName
+#endif
+
 #ifdef PADDLE_WITH_CUDA
 #include <cublas_v2.h>
 #include <cudnn.h>
@@ -38,6 +45,7 @@ limitations under the License. */
 #define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
 #include "glog/logging.h"
 
+#include "paddle/fluid/platform/cuda_error.pb.h"
 #include "paddle/fluid/platform/errors.h"
 #include "paddle/fluid/platform/macros.h"
 #include "paddle/fluid/platform/port.h"
@@ -220,10 +228,6 @@ inline std::string GetTraceBackString(StrType&& what, const char* file,
 
 inline bool is_error(bool stat) { return !stat; }
 
-inline std::string build_ex_string(bool stat, const std::string& msg) {
-  return msg;
-}
-
 inline void throw_on_error(bool stat, const std::string& msg) {
 #ifndef REPLACE_ENFORCE_GLOG
   throw std::runtime_error(msg);
@@ -284,23 +288,21 @@ struct EnforceNotMet : public std::exception {
     }                                                                     \
   } while (0)
 #else
-#define PADDLE_ENFORCE(COND, ...)                                           \
-  do {                                                                      \
-    auto __cond__ = (COND);                                                 \
-    if (UNLIKELY(::paddle::platform::is_error(__cond__))) {                 \
-      try {                                                                 \
-        ::paddle::platform::throw_on_error(                                 \
-            __cond__,                                                       \
-            ::paddle::platform::build_ex_string(                            \
-                __cond__,                                                   \
-                ::paddle::platform::ErrorSummary(__VA_ARGS__).ToString())); \
-      } catch (...) {                                                       \
-        HANDLE_THE_ERROR                                                    \
-        throw ::paddle::platform::EnforceNotMet(std::current_exception(),   \
-                                                __FILE__, __LINE__);        \
-        END_HANDLE_THE_ERROR                                                \
-      }                                                                     \
-    }                                                                       \
+#define PADDLE_ENFORCE(COND, ...)                                         \
+  do {                                                                    \
+    auto __cond__ = (COND);                                               \
+    if (UNLIKELY(::paddle::platform::is_error(__cond__))) {               \
+      try {                                                               \
+        ::paddle::platform::throw_on_error(                               \
+            __cond__,                                                     \
+            ::paddle::platform::ErrorSummary(__VA_ARGS__).ToString());    \
+      } catch (...) {                                                     \
+        HANDLE_THE_ERROR                                                  \
+        throw ::paddle::platform::EnforceNotMet(std::current_exception(), \
+                                                __FILE__, __LINE__);      \
+        END_HANDLE_THE_ERROR                                              \
+      }                                                                   \
+    }                                                                     \
   } while (0)
 #endif
@@ -464,30 +466,148 @@ struct EOFException : public std::exception {
   } while (0)
 
 /** CUDA PADDLE ENFORCE FUNCTIONS AND MACROS **/
-
 #ifdef PADDLE_WITH_CUDA
 
+/***** CUDA ERROR *****/
 inline bool is_error(cudaError_t e) { return e != cudaSuccess; }
 
-inline std::string build_ex_string(cudaError_t e, const std::string& msg) {
-  return msg;
+inline std::string GetCudaErrorWebsite(int32_t cuda_version) {
+  std::ostringstream webstr;
+  webstr << "https://docs.nvidia.com/cuda/";
+  if (cuda_version != -1) {
+    double version = cuda_version / 10;
+    webstr << "archive/" << std::fixed << std::setprecision(1) << version;
+  }
+  webstr << "/cuda-runtime-api/group__CUDART__TYPES.html"
+            "#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038";
+  return webstr.str();
+}
+
+inline std::string build_nvidia_error_msg(cudaError_t e) {
+#if CUDA_VERSION >= 10000 && CUDA_VERSION < 11000
+  int32_t cuda_version = 100;
+#elif CUDA_VERSION >= 9000
+  int32_t cuda_version = 90;
+#else
+  int32_t cuda_version = -1;
+#endif
+  std::ostringstream sout;
+  sout << " Cuda error(" << e << "), " << cudaGetErrorString(e) << ".";
+  static platform::proto::cudaerrorDesc cudaerror;
+  static bool _initSucceed = false;
+  if (cudaerror.ByteSizeLong() == 0) {
+    std::string filePath;
+#if !defined(_WIN32)
+    Dl_info info;
+    if (dladdr(reinterpret_cast<void*>(GetCudaErrorWebsite), &info)) {
+      std::string strModule(info.dli_fname);
+      const size_t last_slash_idx = strModule.find_last_of("/");
+      std::string compare_path = strModule.substr(strModule.length() - 6);
+      if (std::string::npos != last_slash_idx) {
+        strModule.erase(last_slash_idx, std::string::npos);
+      }
+      if (compare_path.compare("avx.so") == 0) {
+        filePath = strModule +
+                   "/../include/third_party/cudaerror/data/cudaErrorMessage.pb";
+      } else {
+        filePath =
+            strModule + "/../../third_party/cudaerror/data/cudaErrorMessage.pb";
+      }
+    }
+#else
+    char buf[100];
+    MEMORY_BASIC_INFORMATION mbi;
+    HMODULE h_module =
+        (::VirtualQuery(GetCudaErrorWebsite, &mbi, sizeof(mbi)) != 0)
+            ? (HMODULE)mbi.AllocationBase
+            : NULL;
+    GetModuleFileName(h_module, buf, 100);
+    std::string strModule(buf);
+    const size_t last_slash_idx = strModule.find_last_of("\\");
+    std::string compare_path = strModule.substr(strModule.length() - 7);
+    if (std::string::npos != last_slash_idx) {
+      strModule.erase(last_slash_idx, std::string::npos);
+    }
+    if (compare_path.compare("avx.pyd") == 0) {
+      filePath =
+          strModule +
+          "\\..\\include\\third_party\\cudaerror\\data\\cudaErrorMessage.pb";
+    } else {
+      filePath =
+          strModule + "\\..\\third_party\\cudaerror\\data\\cudaErrorMessage.pb";
+    }
+#endif
+    std::ifstream fin(filePath, std::ios::in | std::ios::binary);
+    _initSucceed = cudaerror.ParseFromIstream(&fin);
+  }
+  if (_initSucceed) {
+    for (int i = 0; i < cudaerror.allmessages_size(); ++i) {
+      if (cuda_version == cudaerror.allmessages(i).version()) {
+        for (int j = 0; j < cudaerror.allmessages(i).messages_size(); ++j) {
+          if (e == cudaerror.allmessages(i).messages(j).errorcode()) {
+            sout << "\n  [Advise: "
+                 << cudaerror.allmessages(i).messages(j).errormessage() << "]";
+            return sout.str();
+          }
+        }
+      }
+    }
+  }
+  sout << "\n  [Advise: Please search for the error code(" << e
+       << ") on website( " << GetCudaErrorWebsite(cuda_version)
+       << " ) to get Nvidia's official solution about CUDA Error.]";
+  return sout.str();
 }
 
 inline void throw_on_error(cudaError_t e, const std::string& msg) {
 #ifndef REPLACE_ENFORCE_GLOG
-  throw thrust::system_error(e, thrust::cuda_category(), msg);
+  throw std::runtime_error(msg);
 #else
   LOG(FATAL) << msg;
 #endif
 }
 
+/** curand ERROR **/
 inline bool is_error(curandStatus_t stat) {
   return stat != CURAND_STATUS_SUCCESS;
 }
 
-inline std::string build_ex_string(curandStatus_t stat,
-                                   const std::string& msg) {
-  return msg;
+inline const char* curandGetErrorString(curandStatus_t stat) {
+  switch (stat) {
+    case CURAND_STATUS_SUCCESS:
+      return "CURAND_STATUS_SUCCESS";
+    case CURAND_STATUS_VERSION_MISMATCH:
+      return "CURAND_STATUS_VERSION_MISMATCH";
+    case CURAND_STATUS_NOT_INITIALIZED:
+      return "CURAND_STATUS_NOT_INITIALIZED";
+    case CURAND_STATUS_ALLOCATION_FAILED:
+      return "CURAND_STATUS_ALLOCATION_FAILED";
+    case CURAND_STATUS_TYPE_ERROR:
+      return "CURAND_STATUS_TYPE_ERROR";
+    case CURAND_STATUS_OUT_OF_RANGE:
+      return "CURAND_STATUS_OUT_OF_RANGE";
+    case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
+      return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
+    case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
+      return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
+    case CURAND_STATUS_LAUNCH_FAILURE:
+      return "CURAND_STATUS_LAUNCH_FAILURE";
+    case CURAND_STATUS_PREEXISTING_FAILURE:
+      return "CURAND_STATUS_PREEXISTING_FAILURE";
+    case CURAND_STATUS_INITIALIZATION_FAILED:
+      return "CURAND_STATUS_INITIALIZATION_FAILED";
+    case CURAND_STATUS_ARCH_MISMATCH:
+      return "CURAND_STATUS_ARCH_MISMATCH";
+    case CURAND_STATUS_INTERNAL_ERROR:
+      return "CURAND_STATUS_INTERNAL_ERROR";
+    default:
+      return "Unknown curand status";
+  }
+}
+
+inline std::string build_nvidia_error_msg(curandStatus_t stat) {
+  std::string msg(" Curand error, ");
+  return msg + curandGetErrorString(stat) + " ";
 }
 
 inline void throw_on_error(curandStatus_t stat, const std::string& msg) {
@@ -499,13 +619,14 @@ inline void throw_on_error(curandStatus_t stat, const std::string& msg) {
 #endif
 }
 
+/***** CUDNN ERROR *****/
 inline bool is_error(cudnnStatus_t stat) {
   return stat != CUDNN_STATUS_SUCCESS;
 }
 
-inline std::string build_ex_string(cudnnStatus_t stat, const std::string& msg) {
-  return msg + "\n  [Hint: " + platform::dynload::cudnnGetErrorString(stat) +
-         "]";
+inline std::string build_nvidia_error_msg(cudnnStatus_t stat) {
+  std::string msg(" Cudnn error, ");
+  return msg + platform::dynload::cudnnGetErrorString(stat) + " ";
 }
 
 inline void throw_on_error(cudnnStatus_t stat, const std::string& msg) {
@@ -516,33 +637,39 @@ inline void throw_on_error(cudnnStatus_t stat, const std::string& msg) {
 #endif
 }
 
+/***** CUBLAS ERROR *****/
 inline bool is_error(cublasStatus_t stat) {
   return stat != CUBLAS_STATUS_SUCCESS;
 }
 
-inline std::string build_ex_string(cublasStatus_t stat,
-                                   const std::string& msg) {
-  std::string err;
-  if (stat == CUBLAS_STATUS_NOT_INITIALIZED) {
-    err = "CUBLAS_STATUS_NOT_INITIALIZED";
-  } else if (stat == CUBLAS_STATUS_ALLOC_FAILED) {
-    err = "CUBLAS_STATUS_ALLOC_FAILED";
-  } else if (stat == CUBLAS_STATUS_INVALID_VALUE) {
-    err = "CUBLAS_STATUS_INVALID_VALUE";
-  } else if (stat == CUBLAS_STATUS_ARCH_MISMATCH) {
-    err = "CUBLAS_STATUS_ARCH_MISMATCH";
-  } else if (stat == CUBLAS_STATUS_MAPPING_ERROR) {
-    err = "CUBLAS_STATUS_MAPPING_ERROR";
-  } else if (stat == CUBLAS_STATUS_EXECUTION_FAILED) {
-    err = "CUBLAS_STATUS_EXECUTION_FAILED";
-  } else if (stat == CUBLAS_STATUS_INTERNAL_ERROR) {
-    err = "CUBLAS_STATUS_INTERNAL_ERROR";
-  } else if (stat == CUBLAS_STATUS_NOT_SUPPORTED) {
-    err = "CUBLAS_STATUS_NOT_SUPPORTED";
-  } else if (stat == CUBLAS_STATUS_LICENSE_ERROR) {
-    err = "CUBLAS_STATUS_LICENSE_ERROR";
+inline const char* cublasGetErrorString(cublasStatus_t stat) {
+  switch (stat) {
+    case CUBLAS_STATUS_NOT_INITIALIZED:
+      return "CUBLAS_STATUS_NOT_INITIALIZED";
+    case CUBLAS_STATUS_ALLOC_FAILED:
+      return "CUBLAS_STATUS_ALLOC_FAILED";
+    case CUBLAS_STATUS_INVALID_VALUE:
+      return "CUBLAS_STATUS_INVALID_VALUE";
+    case CUBLAS_STATUS_ARCH_MISMATCH:
+      return "CUBLAS_STATUS_ARCH_MISMATCH";
+    case CUBLAS_STATUS_MAPPING_ERROR:
+      return "CUBLAS_STATUS_MAPPING_ERROR";
+    case CUBLAS_STATUS_EXECUTION_FAILED:
+      return "CUBLAS_STATUS_EXECUTION_FAILED";
+    case CUBLAS_STATUS_INTERNAL_ERROR:
+      return "CUBLAS_STATUS_INTERNAL_ERROR";
+    case CUBLAS_STATUS_NOT_SUPPORTED:
+      return "CUBLAS_STATUS_NOT_SUPPORTED";
+    case CUBLAS_STATUS_LICENSE_ERROR:
+      return "CUBLAS_STATUS_LICENSE_ERROR";
+    default:
+      return "Unknown cublas status";
   }
-  return msg + "\n  [Hint: " + err + "]";
+}
+
+inline std::string build_nvidia_error_msg(cublasStatus_t stat) {
+  std::string msg(" Cublas error, ");
+  return msg + cublasGetErrorString(stat) + " ";
 }
 
 inline void throw_on_error(cublasStatus_t stat, const std::string& msg) {
@@ -553,15 +680,15 @@ inline void throw_on_error(cublasStatus_t stat, const std::string& msg) {
 #endif
 }
 
+/****** NCCL ERROR ******/
 #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
 inline bool is_error(ncclResult_t nccl_result) {
   return nccl_result != ncclSuccess;
 }
 
-inline std::string build_ex_string(ncclResult_t nccl_result,
-                                   const std::string& msg) {
-  return msg + "\n  [" + platform::dynload::ncclGetErrorString(nccl_result) +
-         "]";
+inline std::string build_nvidia_error_msg(ncclResult_t nccl_result) {
+  std::string msg(" Nccl error, ");
+  return msg + platform::dynload::ncclGetErrorString(nccl_result) + " ";
 }
 
 inline void throw_on_error(ncclResult_t nccl_result, const std::string& msg) {
@@ -571,11 +698,8 @@ inline void throw_on_error(ncclResult_t nccl_result, const std::string& msg) {
   LOG(FATAL) << msg;
 #endif
 }
-#endif  // __APPLE__ and windows
-
-#endif  // PADDLE_WITH_CUDA
+#endif  // not(__APPLE__) and PADDLE_WITH_NCCL
 
-#ifdef PADDLE_WITH_CUDA
 namespace details {
 
 template <typename T>
@@ -598,30 +722,28 @@ DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess);
 #endif
 
 }  // namespace details
-#endif  // PADDLE_WITH_CUDA
 
-#ifdef PADDLE_WITH_CUDA
-#define PADDLE_ENFORCE_CUDA_SUCCESS(COND, ...)                              \
-  do {                                                                      \
-    auto __cond__ = (COND);                                                 \
-    using __CUDA_STATUS_TYPE__ = decltype(__cond__);                        \
-    constexpr auto __success_type__ =                                       \
-        ::paddle::platform::details::CudaStatusType<                        \
-            __CUDA_STATUS_TYPE__>::kSuccess;                                \
-    if (UNLIKELY(__cond__ != __success_type__)) {                           \
-      try {                                                                 \
-        ::paddle::platform::throw_on_error(                                 \
-            __cond__,                                                       \
-            ::paddle::platform::build_ex_string(                            \
-                __cond__,                                                   \
-                ::paddle::platform::ErrorSummary(__VA_ARGS__).ToString())); \
-      } catch (...) {                                                       \
-        HANDLE_THE_ERROR                                                    \
-        throw ::paddle::platform::EnforceNotMet(std::current_exception(),   \
-                                                __FILE__, __LINE__);        \
-        END_HANDLE_THE_ERROR                                                \
-      }                                                                     \
-    }                                                                       \
+#define PADDLE_ENFORCE_CUDA_SUCCESS(COND)                                 \
+  do {                                                                    \
+    auto __cond__ = (COND);                                               \
+    using __CUDA_STATUS_TYPE__ = decltype(__cond__);                      \
+    constexpr auto __success_type__ =                                     \
+        ::paddle::platform::details::CudaStatusType<                      \
+            __CUDA_STATUS_TYPE__>::kSuccess;                              \
+    if (UNLIKELY(__cond__ != __success_type__)) {                         \
+      try {                                                               \
+        ::paddle::platform::throw_on_error(                               \
+            __cond__,                                                     \
+            ::paddle::platform::errors::External(                         \
+                ::paddle::platform::build_nvidia_error_msg(__cond__))     \
+                .ToString());                                             \
+      } catch (...) {                                                     \
+        HANDLE_THE_ERROR                                                  \
+        throw ::paddle::platform::EnforceNotMet(std::current_exception(), \
+                                                __FILE__, __LINE__);      \
+        END_HANDLE_THE_ERROR                                              \
+      }                                                                   \
+    }                                                                     \
   } while (0)
 
 #undef DEFINE_CUDA_STATUS_TYPE
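
Note: after this change `PADDLE_ENFORCE_CUDA_SUCCESS` takes a single argument and builds its own message, dispatching to the right `is_error()` / `build_nvidia_error_msg()` overload via `decltype` on the status value. A self-contained toy of that mechanism is sketched below; it compiles with a plain C++ compiler and needs no CUDA. `FakeCudaError` and `ENFORCE_SUCCESS` are illustrative stand-ins, not Paddle's actual types:

```cpp
// Toy sketch of the overload-based dispatch behind the one-argument macro.
#include <iostream>
#include <stdexcept>
#include <string>

enum FakeCudaError { FakeCudaSuccess = 0, FakeCudaErrorInvalidValue = 1 };

// One pair of overloads per status type; Paddle defines these for
// cudaError_t, curandStatus_t, cudnnStatus_t, cublasStatus_t and ncclResult_t.
inline bool is_error(FakeCudaError e) { return e != FakeCudaSuccess; }

inline std::string build_nvidia_error_msg(FakeCudaError e) {
  return " Cuda error(" + std::to_string(e) + "), invalid value.";
}

// The macro never names a concrete status type: overload resolution on the
// value of COND picks the matching is_error()/build_nvidia_error_msg().
#define ENFORCE_SUCCESS(COND)                                     \
  do {                                                            \
    auto __cond__ = (COND);                                       \
    if (is_error(__cond__)) {                                     \
      throw std::runtime_error(build_nvidia_error_msg(__cond__)); \
    }                                                             \
  } while (0)

int main() {
  try {
    ENFORCE_SUCCESS(FakeCudaErrorInvalidValue);  // single argument, no message
  } catch (const std::exception& ex) {
    std::cout << ex.what() << std::endl;  // prints the auto-generated message
  }
  return 0;
}
```
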
diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc
index 0057c784528c2654fbe58aa7e48c91e27f9843de..237e09573abe21ce51d933e315eab6d56df84262 100644
--- a/paddle/fluid/platform/enforce_test.cc
+++ b/paddle/fluid/platform/enforce_test.cc
@@ -261,15 +261,14 @@ TEST(EOF_EXCEPTION, THROW_EOF) {
 #ifdef PADDLE_WITH_CUDA
 template <typename T>
 bool CheckCudaStatusSuccess(T value, const std::string& msg = "success") {
-  PADDLE_ENFORCE_CUDA_SUCCESS(value, msg);
+  PADDLE_ENFORCE_CUDA_SUCCESS(value);
   return true;
 }
 
 template <typename T>
-bool CheckCudaStatusFailure(
-    T value, const std::string& msg = "self-defined cuda status failed") {
+bool CheckCudaStatusFailure(T value, const std::string& msg) {
   try {
-    PADDLE_ENFORCE_CUDA_SUCCESS(value, msg);
+    PADDLE_ENFORCE_CUDA_SUCCESS(value);
     return false;
   } catch (paddle::platform::EnforceNotMet& error) {
     std::string ex_msg = error.what();
@@ -279,24 +278,31 @@ bool CheckCudaStatusFailure(
 
 TEST(enforce, cuda_success) {
   EXPECT_TRUE(CheckCudaStatusSuccess(cudaSuccess));
-  EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorInvalidValue));
-  EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorMemoryAllocation));
+  EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorInvalidValue, "Cuda error"));
+  EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorMemoryAllocation, "Cuda error"));
+  int count;
+  PADDLE_ENFORCE(cudaGetDeviceCount(&count));
 
   EXPECT_TRUE(CheckCudaStatusSuccess(CURAND_STATUS_SUCCESS));
-  EXPECT_TRUE(CheckCudaStatusFailure(CURAND_STATUS_VERSION_MISMATCH));
-  EXPECT_TRUE(CheckCudaStatusFailure(CURAND_STATUS_NOT_INITIALIZED));
+  EXPECT_TRUE(
+      CheckCudaStatusFailure(CURAND_STATUS_VERSION_MISMATCH, "Curand error"));
+  EXPECT_TRUE(
+      CheckCudaStatusFailure(CURAND_STATUS_NOT_INITIALIZED, "Curand error"));
 
   EXPECT_TRUE(CheckCudaStatusSuccess(CUDNN_STATUS_SUCCESS));
-  EXPECT_TRUE(CheckCudaStatusFailure(CUDNN_STATUS_NOT_INITIALIZED));
-  EXPECT_TRUE(CheckCudaStatusFailure(CUDNN_STATUS_ALLOC_FAILED));
+  EXPECT_TRUE(
+      CheckCudaStatusFailure(CUDNN_STATUS_NOT_INITIALIZED, "Cudnn error"));
+  EXPECT_TRUE(CheckCudaStatusFailure(CUDNN_STATUS_ALLOC_FAILED, "Cudnn error"));
 
   EXPECT_TRUE(CheckCudaStatusSuccess(CUBLAS_STATUS_SUCCESS));
-  EXPECT_TRUE(CheckCudaStatusFailure(CUBLAS_STATUS_NOT_INITIALIZED));
-  EXPECT_TRUE(CheckCudaStatusFailure(CUBLAS_STATUS_INVALID_VALUE));
+  EXPECT_TRUE(
+      CheckCudaStatusFailure(CUBLAS_STATUS_NOT_INITIALIZED, "Cublas error"));
+  EXPECT_TRUE(
+      CheckCudaStatusFailure(CUBLAS_STATUS_INVALID_VALUE, "Cublas error"));
 
 #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
   EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess));
-  EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError));
-  EXPECT_TRUE(CheckCudaStatusFailure(ncclSystemError));
+  EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError, "Nccl error"));
+  EXPECT_TRUE(CheckCudaStatusFailure(ncclSystemError, "Nccl error"));
 #endif
 }
 #endif
diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc
index 40d6bc54ccf928845095cbc6561a0447657b0a25..c07abba9e8ef95362d0a1e780a35736121f14099 100644
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
@@ -16,7 +16,6 @@ limitations under the License. */
 #include
 #include
 #include
-#include
 
 #include "gflags/gflags.h"
 #include "paddle/fluid/platform/cuda_device_guard.h"
@@ -42,18 +41,13 @@ faster way to query device properties. You can see details in
 https://devblogs.nvidia.com/cuda-pro-tip-the-fast-way-to-query-device-properties/
 */
 
-inline std::string CudaErrorWebsite() {
-  return "Please see detail in https://docs.nvidia.com/cuda/cuda-runtime-api"
-         "/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c217824"
-         "6db0a94a430e0038";
-}
-
 static int GetCUDADeviceCountImpl() {
   int driverVersion = 0;
   cudaError_t status = cudaDriverGetVersion(&driverVersion);
 
   if (!(status == cudaSuccess && driverVersion != 0)) {
     // No GPU driver
+    VLOG(2) << "GPU Driver Version can't be detected. No GPU driver!";
     return 0;
   }
 
@@ -67,14 +61,8 @@ static int GetCUDADeviceCountImpl() {
       return 0;
     }
   }
-
   int count;
-  auto error_code = cudaGetDeviceCount(&count);
-  PADDLE_ENFORCE(
-      error_code,
-      "cudaGetDeviceCount failed in "
-      "paddle::platform::GetCUDADeviceCountImpl, error code : %d, %s",
-      error_code, CudaErrorWebsite());
+  PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetDeviceCount(&count));
   return count;
 }
 
@@ -84,72 +72,63 @@ int GetCUDADeviceCount() {
 }
 
 int GetCUDAComputeCapability(int id) {
-  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
+  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
+                    platform::errors::InvalidArgument(
+                        "Device id must be less than GPU count, "
+                        "but received id is: %d. GPU count is: %d.",
+                        id, GetCUDADeviceCount()));
   int major, minor;
   auto major_error_code =
       cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, id);
   auto minor_error_code =
       cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, id);
-  PADDLE_ENFORCE_EQ(
-      major_error_code, 0,
-      "cudaDevAttrComputeCapabilityMajor failed in "
-      "paddle::platform::GetCUDAComputeCapability, error code : %d, %s",
-      major_error_code, CudaErrorWebsite());
-  PADDLE_ENFORCE_EQ(
-      minor_error_code, 0,
-      "cudaDevAttrComputeCapabilityMinor failed in "
-      "paddle::platform::GetCUDAComputeCapability, error code : %d, %s",
-      minor_error_code, CudaErrorWebsite());
+  PADDLE_ENFORCE_CUDA_SUCCESS(major_error_code);
+  PADDLE_ENFORCE_CUDA_SUCCESS(minor_error_code);
   return major * 10 + minor;
 }
 
 dim3 GetGpuMaxGridDimSize(int id) {
-  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
+  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
+                    platform::errors::InvalidArgument(
+                        "Device id must be less than GPU count, "
+                        "but received id is: %d. GPU count is: %d.",
+                        id, GetCUDADeviceCount()));
   dim3 ret;
   int size;
   auto error_code_x = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimX, id);
-  PADDLE_ENFORCE_EQ(error_code_x, 0,
-                    "cudaDevAttrMaxGridDimX failed in "
-                    "paddle::platform::GpuMaxGridDimSize, error code : %d, %s",
-                    error_code_x, CudaErrorWebsite());
+  PADDLE_ENFORCE_CUDA_SUCCESS(error_code_x);
   ret.x = size;
 
   auto error_code_y = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimY, id);
-  PADDLE_ENFORCE_EQ(error_code_y, 0,
-                    "cudaDevAttrMaxGridDimY failed in "
-                    "paddle::platform::GpuMaxGridDimSize, error code : %d, %s",
-                    error_code_y, CudaErrorWebsite());
+  PADDLE_ENFORCE_CUDA_SUCCESS(error_code_y);
   ret.y = size;
 
   auto error_code_z = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimZ, id);
-  PADDLE_ENFORCE_EQ(error_code_z, 0,
-                    "cudaDevAttrMaxGridDimZ failed in "
-                    "paddle::platform::GpuMaxGridDimSize, error code : %d, %s",
-                    error_code_z, CudaErrorWebsite());
+  PADDLE_ENFORCE_CUDA_SUCCESS(error_code_z);
   ret.z = size;
   return ret;
 }
 
 int GetCUDARuntimeVersion(int id) {
-  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
+  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
+                    platform::errors::InvalidArgument(
+                        "Device id must be less than GPU count, "
+                        "but received id is: %d. GPU count is: %d.",
+                        id, GetCUDADeviceCount()));
   int runtime_version = 0;
-  auto error_code = cudaRuntimeGetVersion(&runtime_version);
-  PADDLE_ENFORCE(error_code,
-                 "cudaRuntimeGetVersion failed in "
-                 "paddle::platform::GetCUDARuntimeVersion, error code : %d, %s",
-                 error_code, CudaErrorWebsite());
+  PADDLE_ENFORCE_CUDA_SUCCESS(cudaRuntimeGetVersion(&runtime_version));
   return runtime_version;
 }
 
 int GetCUDADriverVersion(int id) {
-  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
+  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
+                    platform::errors::InvalidArgument(
+                        "Device id must be less than GPU count, "
+                        "but received id is: %d. GPU count is: %d.",
+                        id, GetCUDADeviceCount()));
   int driver_version = 0;
-  auto error_code = cudaDriverGetVersion(&driver_version);
-  PADDLE_ENFORCE(error_code,
-                 "cudaDriverGetVersion failed in "
-                 "paddle::platform::GetCUDADriverVersion, error code : %d, %s",
-                 error_code, CudaErrorWebsite());
+  PADDLE_ENFORCE_CUDA_SUCCESS(cudaDriverGetVersion(&driver_version));
   return driver_version;
 }
 
@@ -164,56 +143,44 @@ bool TensorCoreAvailable() {
 }
 
 int GetCUDAMultiProcessors(int id) {
-  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
+  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
+                    platform::errors::InvalidArgument(
+                        "Device id must be less than GPU count, "
+                        "but received id is: %d. GPU count is: %d.",
+                        id, GetCUDADeviceCount()));
   int count;
-  auto error_code =
-      cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id);
-  PADDLE_ENFORCE(error_code,
-                 "cudaDeviceGetAttribute failed in "
-                 "paddle::platform::GetCUDAMultiProcess, error code : %d, %s",
-                 error_code, CudaErrorWebsite());
+  PADDLE_ENFORCE_CUDA_SUCCESS(
+      cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id));
   return count;
 }
 
 int GetCUDAMaxThreadsPerMultiProcessor(int id) {
-  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
+  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
+                    platform::errors::InvalidArgument(
+                        "Device id must be less than GPU count, "
+                        "but received id is: %d. GPU count is: %d.",
+                        id, GetCUDADeviceCount()));
   int count;
-  auto error_code = cudaDeviceGetAttribute(
-      &count, cudaDevAttrMaxThreadsPerMultiProcessor, id);
-  PADDLE_ENFORCE(
-      error_code,
-      "cudaDeviceGetAttribute failed in paddle::"
-      "platform::GetCUDAMaxThreadsPerMultiProcessor, error code : %d, %s",
-      error_code, CudaErrorWebsite());
+  PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceGetAttribute(
+      &count, cudaDevAttrMaxThreadsPerMultiProcessor, id));
   return count;
 }
 
 int GetCUDAMaxThreadsPerBlock(int id) {
-  PADDLE_ENFORCE_LT(
-      id, GetCUDADeviceCount(),
-      platform::errors::InvalidArgument(
-          "Device id must less than GPU count, but received id is:%d, "
-          "GPU count is: %d.",
-          id, GetCUDADeviceCount()));
+  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
+                    platform::errors::InvalidArgument(
+                        "Device id must be less than GPU count, "
+                        "but received id is: %d. 
GPU count is: %d.", + id, GetCUDADeviceCount())); int count; - auto error_code = - cudaDeviceGetAttribute(&count, cudaDevAttrMaxThreadsPerBlock, id); - PADDLE_ENFORCE_EQ( - error_code, 0, - platform::errors::InvalidArgument( - "cudaDeviceGetAttribute returned error code should be 0, " - "but received error code is: %d, %s", - error_code, CudaErrorWebsite())); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaDeviceGetAttribute(&count, cudaDevAttrMaxThreadsPerBlock, id)); return count; } int GetCurrentDeviceId() { int device_id; - auto error_code = cudaGetDevice(&device_id); - PADDLE_ENFORCE(error_code, - "cudaGetDevice failed in " - "paddle::platform::GetCurrentDeviceId, error code : %d, %s", - error_code, CudaErrorWebsite()); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetDevice(&device_id)); return device_id; } @@ -237,12 +204,12 @@ std::vector GetSelectedDevices() { void SetDeviceId(int id) { // TODO(qijun): find a better way to cache the cuda device count - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); - auto error_code = cudaSetDevice(id); - PADDLE_ENFORCE(error_code, - "cudaSetDevice failed in " - "paddle::platform::SetDeviced, error code : %d, %s", - error_code, CudaErrorWebsite()); + PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetCUDADeviceCount())); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(id)); } void GpuMemoryUsage(size_t *available, size_t *total) { @@ -306,74 +273,44 @@ size_t GpuMaxChunkSize() { void GpuMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream) { - auto error_code = cudaMemcpyAsync(dst, src, count, kind, stream); - PADDLE_ENFORCE(error_code, - "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync " - "(%p -> %p, length: %d) error code : %d, %s", - src, dst, static_cast(count), error_code, - CudaErrorWebsite()); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(dst, src, count, kind, stream)); } void GpuMemcpySync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind) { - auto error_code = cudaMemcpy(dst, src, count, kind); - PADDLE_ENFORCE(error_code, - "cudaMemcpy failed in paddle::platform::GpuMemcpySync " - "(%p -> %p, length: %d) error code : %d, %s", - src, dst, static_cast(count), error_code, - CudaErrorWebsite()); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpy(dst, src, count, kind)); } void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, int src_device, size_t count, cudaStream_t stream) { - auto error_code = - cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream); - PADDLE_ENFORCE( - error_code, - "cudaMemcpyPeerAsync failed in paddle::platform::GpuMemcpyPeerAsync " - "error code : %d, %s", - error_code, CudaErrorWebsite()); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream)); } void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src, int src_device, size_t count) { - auto error_code = cudaMemcpyPeer(dst, dst_device, src, src_device, count); - PADDLE_ENFORCE(error_code, - "cudaMemcpyPeer failed in paddle::platform::GpuMemcpyPeerSync " - "error code : %d, %s", - error_code, CudaErrorWebsite()); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaMemcpyPeer(dst, dst_device, src, src_device, count)); } void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) { - auto error_code = cudaMemsetAsync(dst, value, count, stream); - PADDLE_ENFORCE(error_code, - 
"cudaMemsetAsync failed in paddle::platform::GpuMemsetAsync " - "error code : %d, %s", - error_code, CudaErrorWebsite()); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemsetAsync(dst, value, count, stream)); } void GpuStreamSync(cudaStream_t stream) { - auto error_code = cudaStreamSynchronize(stream); - PADDLE_ENFORCE_CUDA_SUCCESS( - error_code, - platform::errors::External( - "cudaStreamSynchronize failed in paddle::platform::GpuStreamSync " - "error code : %d, %s", - error_code, CudaErrorWebsite())); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); } static void RaiseNonOutOfMemoryError(cudaError_t *status) { if (*status == cudaErrorMemoryAllocation) { *status = cudaSuccess; } - PADDLE_ENFORCE_CUDA_SUCCESS(*status); *status = cudaGetLastError(); if (*status == cudaErrorMemoryAllocation) { *status = cudaSuccess; } - PADDLE_ENFORCE_CUDA_SUCCESS(*status); } @@ -450,8 +387,7 @@ class RecordedCudaMallocHelper { CUDADeviceGuard guard(dev_id_); auto err = cudaFree(ptr); if (err != cudaErrorCudartUnloading) { - PADDLE_ENFORCE_CUDA_SUCCESS( - err, platform::errors::External("cudaFree raises unexpected error")); + PADDLE_ENFORCE_CUDA_SUCCESS(err); if (NeedRecord()) { std::lock_guard guard(*mtx_); cur_size_ -= size; diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h index 41d5180ffaf5fbd61fc08f7a9e2724fc63f9542b..af27564b99f79f73a91680071d35b07ebe8aa634 100644 --- a/paddle/fluid/platform/profiler_helper.h +++ b/paddle/fluid/platform/profiler_helper.h @@ -117,10 +117,7 @@ void SynchronizeAllDevice() { int count = GetCUDADeviceCount(); for (int i = 0; i < count; i++) { SetDeviceId(i); - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaDeviceSynchronize(), - platform::errors::External( - "Device synchronize failed in cudaDeviceSynchronize()")); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceSynchronize()); } #endif } diff --git a/python/setup.py.in b/python/setup.py.in index 36851adde19054cb9f42b7ea492f8f547e30f121..ed77787d4cdd2711bd203fddc3e38199668d91af 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -213,12 +213,14 @@ if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']: # the prefix is sys.prefix which should always be usr paddle_bins = '' + if not '${WIN32}': paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/scripts/paddle'] package_data={'paddle.fluid': ['${FLUID_CORE_NAME}' + ('.so' if os.name != 'nt' else '.pyd')]} if '${HAS_NOAVX_CORE}' == 'ON': package_data['paddle.fluid'] += ['core_noavx' + ('.so' if os.name != 'nt' else '.pyd')] + package_dir={ '': '${PADDLE_BINARY_DIR}/python', # The paddle.fluid.proto will be generated while compiling. 
@@ -329,6 +331,7 @@ headers = (
     list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/string')) +
     list(find_files('*.pb.h', '${PADDLE_BINARY_DIR}/paddle/fluid/platform')) +
     list(find_files('*.pb.h', '${PADDLE_BINARY_DIR}/paddle/fluid/framework')) +
+    list(find_files('*.pb', '${cudaerror_INCLUDE_DIR}')) +  # cudaErrorMessage.pb for CUDA error messages
     ['${EIGEN_INCLUDE_DIR}/Eigen/Core'] +  # eigen
     list(find_files('*', '${EIGEN_INCLUDE_DIR}/Eigen/src')) +  # eigen
     list(find_files('*', '${EIGEN_INCLUDE_DIR}/unsupported/Eigen')) +  # eigen
@@ -400,7 +403,9 @@ class InstallHeaders(Command):
         return self.copy_file(header, install_dir)
 
     def run(self):
+        # on Mac or Windows, only third_party/cudaErrorMessage.pb needs to be copied
         if os.name == 'nt' or sys.platform == 'darwin':
+            self.mkdir_and_copy_file('${cudaerror_INCLUDE_DIR}/cudaErrorMessage.pb')
             return
         hdrs = self.distribution.headers
         if not hdrs:
diff --git a/tools/check_api_approvals.sh b/tools/check_api_approvals.sh
index 51330bea8ea6272031d04aad426c89d369fdcd1b..3e079d0433f87c962e6d545aec2c4a1cb47fb335 100644
--- a/tools/check_api_approvals.sh
+++ b/tools/check_api_approvals.sh
@@ -172,8 +172,8 @@ if [ "${ALL_PADDLE_ENFORCE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
   check_approval 1 6836917 47554610 22561442
 fi
 
-ALL_PADDLE_CHECK=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "(PADDLE_ENFORCE[A-Z_]*|PADDLE_THROW)\(.[^,\);]*.[^;]*\);\s" || true`
-VALID_PADDLE_CHECK=`echo "$ALL_PADDLE_CHECK" | grep -zoE '(PADDLE_ENFORCE[A-Z_]*|PADDLE_THROW)\((.[^,;]+,)*.[^";]*(errors::).[^"]*".[^";]{20,}.[^;]*\);\s' || true`
+ALL_PADDLE_CHECK=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\(.[^,\);]*.[^;]*\);\s" || true`
+VALID_PADDLE_CHECK=`echo "$ALL_PADDLE_CHECK" | grep -zoE '(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\((.[^,;]+,)*.[^";]*(errors::).[^"]*".[^";]{20,}.[^;]*\);\s' || true`
 INVALID_PADDLE_CHECK=`echo "$ALL_PADDLE_CHECK" |grep -vxF "$VALID_PADDLE_CHECK" || true`
 if [ "${INVALID_PADDLE_CHECK}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
  echo_line="The error message you wrote in PADDLE_ENFORCE{_**} or PADDLE_THROW does not meet our error message writing specification. Possible errors include 1. the error message is empty / 2. the error message is too short / 3. the error type is not specified. Please read the specification [ https://github.com/PaddlePaddle/Paddle/wiki/Paddle-Error-Message-Writing-Specification ], then refine the error message. If it is a mismatch, please ask chenwhql (recommended), luotao1 or lanxianghit to review and approve.\nThe PADDLE_ENFORCE{_**} or PADDLE_THROW entries that do not meet the specification are as follows:\n${INVALID_PADDLE_CHECK}\n"
diff --git a/tools/count_invalid_enforce.sh b/tools/count_invalid_enforce.sh
index 77d264370fdb02ccb67b7fca91b2912e49d332e1..46feadd56672fbfc91a281b039681c3ecde5fd5e 100644
--- a/tools/count_invalid_enforce.sh
+++ b/tools/count_invalid_enforce.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
-ALL_PADDLE_CHECK=`grep -r -zoE "(PADDLE_ENFORCE[A-Z_]*|PADDLE_THROW)\(.[^,\);]*.[^;]*\);\s" ../paddle/fluid || true`
+ALL_PADDLE_CHECK=`grep -r -zoE "(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\(.[^,\);]*.[^;]*\);\s" ../paddle/fluid || true`
 ALL_PADDLE_CHECK_CNT=`echo "$ALL_PADDLE_CHECK" | grep -cE "(PADDLE_ENFORCE|PADDLE_THROW)" || true`
-VALID_PADDLE_CHECK_CNT=`echo "$ALL_PADDLE_CHECK" | grep -zoE '(PADDLE_ENFORCE[A-Z_]*|PADDLE_THROW)\((.[^,;]+,)*.[^";]*(errors::).[^"]*".[^";]{20,}.[^;]*\);\s' | grep -cE "(PADDLE_ENFORCE|PADDLE_THROW)" || true`
+VALID_PADDLE_CHECK_CNT=`echo "$ALL_PADDLE_CHECK" | grep -zoE '(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\((.[^,;]+,)*.[^";]*(errors::).[^"]*".[^";]{20,}.[^;]*\);\s' | grep -cE "(PADDLE_ENFORCE|PADDLE_THROW)" || true`
 
 echo "----------------------------"
 echo "PADDLE ENFORCE & THROW COUNT"
diff --git a/tools/cudaError/README.md b/tools/cudaError/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..df7434c33a9fd7c6dfcf8c3cd7479169d748ca48
--- /dev/null
+++ b/tools/cudaError/README.md
@@ -0,0 +1,22 @@
+Usage:
+
+Please run:
+```
+bash start.sh
+```
+
+The error messages of CUDA 9.0, CUDA 10.0 and the latest CUDA version are crawled by default.
+
+If you want to crawl a specific CUDA version, please run:
+```
+bash start.sh <version> [<url>]
+```
+The URL can be derived from the version, so you do not have to pass one.
+
+For example:
+```
+bash start.sh 11.0
+```
+will crawl the error messages of CUDA 11.0 (once it is released).
+
+Every time Nvidia upgrades the CUDA major version, run `bash start.sh` in this directory and upload the generated cudaErrorMessage.tar.gz to https://paddlepaddledeps.bj.bcebos.com/cudaErrorMessage.tar.gz
diff --git a/tools/cudaError/spider.py b/tools/cudaError/spider.py
new file mode 100644
index 0000000000000000000000000000000000000000..c2c3dc97f422202e96d9c6ab58ce462e7dbd980e
--- /dev/null
+++ b/tools/cudaError/spider.py
@@ -0,0 +1,124 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import ssl
+import re
+import urllib2
+import json
+import collections
+import sys, getopt
+import cuda_error_pb2
+
+
+def parsing(cuda_errorDesc, version, url):
+    All_Messages = cuda_errorDesc.AllMessages.add()
+    All_Messages.version = int(version)
+
+    ssl._create_default_https_context = ssl._create_unverified_context
+    html = urllib2.urlopen(url).read()
+    res_div = r'<div class="section">.*?<p>CUDA error types </p>.*?</div>.*?<div class="enum-detail">(.*?)</div>'
+    m_div = re.findall(res_div, html, re.S | re.M)
+
+    url_list = url.split('/')
+    url_prefix = '/'.join(url_list[0:url_list.index('cuda-runtime-api') + 1])
+
+    dic = collections.OrderedDict()
+    dic_message = collections.OrderedDict()
+    for line in m_div:
+        res_dt = r'<dt>(.*?)</dt>.*?<dd>(.*?)</dd>'
+        m_dt = re.findall(res_dt, line, re.S | re.M)
+        for error in m_dt:
+            # the numeric value of the enum member, e.g. "1" or "0x..."
+            res_type = r'<span class="ph ph apiData">(.*?)</span>'
+            m_type = re.findall(res_type, error[0], re.S | re.M)[0]
+            m_message = error[1]
+            m_message = m_message.replace('\n', '')
+            # replace anchor tags with their inner text
+            res_a = r'(<a class=.*?</a>)'
+            res_shape = r'<a class=.*?>(.*?)</a>'
+            list_a = re.findall(res_a, m_message, re.S | re.M)
+            list_shape = re.findall(res_shape, m_message, re.S | re.M)
+            assert len(list_a) == len(list_shape)
+            for idx in range(len(list_a)):
+                m_message = m_message.replace(list_a[idx], list_shape[idx])
+
+            m_message = m_message.replace(
+                '<h6 class="deprecated_header">Deprecated</h6>', '')
+
+            # replace span tags with their inner text
+            res_span = r'(<span class=.*?</span>)'
+            res_span_detail = r'<span class=.*?>(.*?)</span>'
+            list_span = re.findall(res_span, m_message, re.S | re.M)
+            list_span_detail = re.findall(res_span_detail, m_message, re.S |
+                                          re.M)
+            assert len(list_span) == len(list_span_detail)
+            for idx in range(len(list_span)):
+                m_message = m_message.replace(list_span[idx],
+                                              list_span_detail[idx])
+
+            # replace paragraph tags with their inner text
+            res_p = r'(<p>.*?</p>)'
+            res_p_detail = r'<p>(.*?)</p>'
+            list_p = re.findall(res_p, m_message, re.S | re.M)
+            list_p_detail = re.findall(res_p_detail, m_message, re.S | re.M)
+            assert len(list_p) == len(list_p_detail)
+            for idx in range(len(list_p)):
+                m_message = m_message.replace(list_p[idx], list_p_detail[idx])
+
+            m_message = m_message.replace('  ', '')
+            _Messages = All_Messages.Messages.add()
+            try:
+                _Messages.errorCode = int(m_type)
+            except ValueError:
+                if re.match('0x', m_type):
+                    _Messages.errorCode = int(m_type, 16)
+                else:
+                    raise ValueError
+            _Messages.errorMessage = m_message  # save for cudaErrorMessage.pb from python-protobuf interface
+
+
+def main(argv):
+    version = []
+    url = []
+    try:
+        opts, args = getopt.getopt(argv, "hv:u:", ["help", "version=", "url="])
+    except getopt.GetoptError:
+        print 'python spider.py -v <version> -u <url>'
+        sys.exit(2)
+    for opt, arg in opts:
+        if opt in ("-h", "--help"):
+            print 'python spider.py -v <version> -u <url>'
+            sys.exit()
+        elif opt in ("-v", "--version"):
+            version = arg
+        elif opt in ("-u", "--url"):
+            url = arg
+    version = version.split(',')
+    url = url.split(',')
+    assert len(version) == len(url)
+    cuda_errorDesc = cuda_error_pb2.cudaerrorDesc()
+    for idx in range(len(version)):
+        if version[idx] == "-1":
+            print("crawling errorMessage for CUDA%s from %s" %
+                  ("-latest-version", url[idx]))
+        else:
+            print("crawling errorMessage for CUDA%s from %s" %
+                  (version[idx], url[idx]))
+        parsing(cuda_errorDesc, version[idx], url[idx])
+
+    serializeToString = cuda_errorDesc.SerializeToString()
+    with open("cudaErrorMessage.pb", "wb") as f:
+        f.write(serializeToString
+                )  # save for cudaErrorMessage.pb from python-protobuf interface
+    print("crawling errorMessage for CUDA has been done!!!")
+
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
diff --git a/tools/cudaError/start.sh b/tools/cudaError/start.sh
new file mode 100644
index 0000000000000000000000000000000000000000..3c0e57ffe7ec1f0ab61c2baf25d85fbe8a0fae94
--- /dev/null
+++ b/tools/cudaError/start.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+set -ex
+SYSTEM=`uname -s`
+rm -f protoc-3.11.3-linux-x86_64.*
+if [ "$SYSTEM" == "Linux" ]; then
+  wget --no-check-certificate https://github.com/protocolbuffers/protobuf/releases/download/v3.11.3/protoc-3.11.3-linux-x86_64.zip
+  unzip -d protobuf -o protoc-3.11.3-linux-x86_64.zip
+  rm protoc-3.11.3-linux-x86_64.*
+elif [ "$SYSTEM" == "Darwin" ]; then
+  wget --no-check-certificate https://github.com/protocolbuffers/protobuf/releases/download/v3.11.3/protoc-3.11.3-osx-x86_64.zip
+  unzip -d protobuf -o protoc-3.11.3-osx-x86_64.zip
+  rm protoc-3.11.3-osx-x86_64.*
+else
+  echo "please run on Mac/Linux"
+  exit 1
+fi
+protobuf/bin/protoc -I../../paddle/fluid/platform/ --python_out . ../../paddle/fluid/platform/cuda_error.proto
+
+version=90,100,-1  # -1 represents the latest CUDA version
+url=https://docs.nvidia.com/cuda/archive/9.0/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038,https://docs.nvidia.com/cuda/archive/10.0/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038,https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038
+
+if [ "$1" != "" ]; then
+  version=$version,$(($1*10))
+  if [ "$2" != "" ]; then
+    url=$url,$2
+  else
+    url=$url,https://docs.nvidia.com/cuda/archive/$1/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038
+  fi
+fi
+
+python spider.py --version=$version --url=$url
+tar czf cudaErrorMessage.tar.gz cudaErrorMessage.pb
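
Note on migrating call sites: after this patch, `PADDLE_ENFORCE_CUDA_SUCCESS` takes only the CUDA/cuRAND/cuDNN/cuBLAS/NCCL status expression and builds the failure message itself through `platform::errors::External(build_nvidia_error_msg(...))`, backed by the crawled cudaErrorMessage.pb data. A minimal sketch of a migrated call site, assuming a CUDA build; the helper function below is hypothetical and only illustrates the pattern:

```
#include <cuda_runtime.h>
#include "paddle/fluid/platform/enforce.h"

// Hypothetical helper: async host-to-device copy of n bytes.
// On failure the macro throws EnforceNotMet; its message is derived from the
// returned cudaError_t, so no hand-written message argument is passed anymore.
void CopyHostToDevice(void* dst, const void* src, size_t n,
                      cudaStream_t stream) {
  PADDLE_ENFORCE_CUDA_SUCCESS(
      cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream));
}
```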