Unverified Commit 78170037 authored by Zhou Wei, committed by GitHub

Optimize the error messages of paddle CUDA API (#23816)

* Optimize the error messages of paddle CUDA API, test=develop

* fix the error messages of paddle CUDA API, test=develop

* Refactoring PADDLE_ENFORCE_CUDA_SUCCESS, and apply to curand/cudnn/cublas/NCCL,test=develop

* remove build_ex_string,test=develop

* merge conflict,test=develop
Parent f6dbf8e3
......@@ -135,6 +135,12 @@ copy(inference_lib_dist
SRCS ${THREADPOOL_INCLUDE_DIR}/ThreadPool.h
DSTS ${dst_dir})
set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/cudaerror/data")
copy(inference_lib_dist
SRCS ${cudaerror_INCLUDE_DIR}
DSTS ${dst_dir})
# CMakeCache Info
copy(inference_lib_dist
SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
DSTS ${FLUID_INFERENCE_INSTALL_DIR})
......@@ -184,7 +190,7 @@ copy(fluid_lib_dist
)
set(module "framework")
set(framework_lib_deps framework_proto)
set(framework_lib_deps framework_proto data_feed_proto trainer_desc_proto)
add_dependencies(fluid_lib_dist ${framework_lib_deps})
copy(fluid_lib_dist
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/trainer_desc.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/data_feed.pb.h ${src_dir}/${module}/ir/memory_optimize_pass/*.h
......@@ -204,11 +210,11 @@ copy(fluid_lib_dist
)
set(module "platform")
set(platform_lib_deps profiler_proto)
set(platform_lib_deps profiler_proto error_codes_proto cuda_error_proto)
add_dependencies(fluid_lib_dist ${platform_lib_deps})
copy(fluid_lib_dist
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/platform/profiler.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/platform/error_codes.pb.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module}
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/platform/*.pb.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details ${dst_dir}/${module}
)
set(module "string")
......@@ -249,6 +255,7 @@ copy(inference_lib_dist
SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib)
# CMakeCache Info
copy(fluid_lib_dist
SRCS ${FLUID_INFERENCE_INSTALL_DIR}/third_party ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
......
......@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
include(ExternalProject)
# Create a target named "third_party", which can compile external dependencies on all platforms (Windows/Linux/Mac)
set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
......@@ -21,6 +22,7 @@ set(THIRD_PARTY_CACHE_PATH "${CMAKE_SOURCE_DIR}" CACHE STRING
"A path cache third party source code to avoid repeated download.")
set(THIRD_PARTY_BUILD_TYPE Release)
set(third_party_deps)
# Cache function to avoid repeated downloads of third_party code.
# This function has 4 parameters, URL / REPOSITORY / TAG / DIR:
......@@ -100,6 +102,32 @@ MACRO(UNSET_VAR VAR_NAME)
UNSET(${VAR_NAME})
ENDMACRO()
# Function to download the dependencies during compilation
# This macro has 2 parameters, URL / NAME:
# 1. URL: the download URL of the third-party dependency
# 2. NAME: the name of the file, which determines the directory name
#
MACRO(file_download_and_uncompress URL NAME)
MESSAGE(STATUS "Download dependence[${NAME}] from ${URL}")
SET(EXTERNAL_PROJECT_NAME "extern_download_${NAME}")
SET(${NAME}_INCLUDE_DIR ${THIRD_PARTY_PATH}/${NAME}/data)
ExternalProject_Add(
${EXTERNAL_PROJECT_NAME}
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${THIRD_PARTY_PATH}/${NAME}
URL ${URL}
DOWNLOAD_DIR ${THIRD_PARTY_PATH}/${NAME}/data/
SOURCE_DIR ${THIRD_PARTY_PATH}/${NAME}/data/
DOWNLOAD_NO_PROGRESS 1
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
UPDATE_COMMAND ""
INSTALL_COMMAND ""
)
list(APPEND third_party_deps ${EXTERNAL_PROJECT_NAME})
ENDMACRO()
# Correct flags on different platforms (WIN/MAC) and print warning messages
if (APPLE)
if(WITH_MKL)
......@@ -178,10 +206,13 @@ include(external/dlpack) # download dlpack
include(external/xxhash) # download, build, install xxhash
include(external/warpctc) # download, build, install warpctc
set(third_party_deps)
list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog extern_boost extern_xxhash)
list(APPEND third_party_deps extern_zlib extern_dlpack extern_warpctc extern_threadpool)
# download the CUDA error message file
set(CUDAERROR_URL "https://paddlepaddledeps.bj.bcebos.com/cudaErrorMessage.tar.gz" CACHE STRING "" FORCE)
file_download_and_uncompress(${CUDAERROR_URL} "cudaerror")
if(WITH_AMD_GPU)
include(external/rocprim) # download, build, install rocprim
list(APPEND third_party_deps extern_rocprim)
......@@ -274,4 +305,4 @@ if (WITH_LITE)
include(external/lite)
endif (WITH_LITE)
add_custom_target(third_party DEPENDS ${third_party_deps})
add_custom_target(third_party ALL DEPENDS ${third_party_deps})
......@@ -152,9 +152,7 @@ void TensorCheckerVisitor<platform::CUDADeviceContext>::apply(
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMemcpyAsync(gpu_str_ptr, iter->first.c_str(), op_var.length() + 1,
cudaMemcpyHostToDevice, dev_ctx->stream()),
platform::errors::External(
"Async cudaMemcpy op_var info to gpu failed."));
cudaMemcpyHostToDevice, dev_ctx->stream()));
} else { // get
auto iter = op_var2gpu_str.find(op_var);
PADDLE_ENFORCE_EQ(iter != op_var2gpu_str.end(), true,
......
......@@ -124,12 +124,9 @@ int SplitPlugin::enqueue(int batchSize, const void* const* inputs,
float const* input_ptr = reinterpret_cast<float const*>(inputs[0]);
float* const* h_odatas = reinterpret_cast<float* const*>(outputs);
float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs_[0]);
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMemcpyAsync(output_ptrs, h_odatas,
d_output_ptrs_.size() * sizeof(float*),
cudaMemcpyHostToDevice, stream),
platform::errors::External(
"CUDA Memcpy failed during split plugin run."));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(
output_ptrs, h_odatas, d_output_ptrs_.size() * sizeof(float*),
cudaMemcpyHostToDevice, stream));
int outer_rows = outer_rows_ * batchSize;
......@@ -244,12 +241,9 @@ int SplitPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
float* const* h_odatas = reinterpret_cast<float* const*>(outputs);
float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs[0]);
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMemcpyAsync(output_ptrs, h_odatas,
d_output_ptrs.size() * sizeof(float*),
cudaMemcpyHostToDevice, stream),
platform::errors::External(
"CUDA Memcpy failed during split plugin run."));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(
output_ptrs, h_odatas, d_output_ptrs.size() * sizeof(float*),
cudaMemcpyHostToDevice, stream));
split_kernel<<<grid, block, 0, stream>>>(
d_segment_offsets.size(), d_segment_offsets_ptr, input_ptr, output_ptrs,
......@@ -263,12 +257,9 @@ int SplitPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
half* const* h_odatas = reinterpret_cast<half* const*>(outputs);
half** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs[0]);
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMemcpyAsync(output_ptrs, h_odatas,
d_output_ptrs.size() * sizeof(half*),
cudaMemcpyHostToDevice, stream),
platform::errors::External(
"CUDA Memcpy failed during split plugin run."));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(
output_ptrs, h_odatas, d_output_ptrs.size() * sizeof(half*),
cudaMemcpyHostToDevice, stream));
split_kernel<<<grid, block, 0, stream>>>(
d_segment_offsets.size(), d_segment_offsets_ptr, input_ptr, output_ptrs,
......
......@@ -80,17 +80,13 @@ class CUDADeviceContextAllocator : public Allocator {
: place_(place), default_stream_(default_stream) {
platform::CUDADeviceGuard guard(place_.device);
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventCreate(&event_, cudaEventDisableTiming),
platform::errors::External(
"Create event failed in CUDADeviceContextAllocator"));
cudaEventCreate(&event_, cudaEventDisableTiming));
}
~CUDADeviceContextAllocator() {
if (event_) {
platform::CUDADeviceGuard guard(place_.device);
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventDestroy(event_),
"Destory event failed in CUDADeviceContextAllocator destroctor");
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event_));
}
}
......@@ -103,12 +99,9 @@ class CUDADeviceContextAllocator : public Allocator {
auto allocation =
new CUDADeviceContextAllocation(memory::Alloc(place_, size));
// Wait for the event on stream
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, default_stream_));
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventRecord(event_, default_stream_),
"Failed to record event in CUDADeviceContextAllocator");
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamWaitEvent(default_stream_, event_, 0),
"Failed to wait event in CUDADeviceContextAllocator");
cudaStreamWaitEvent(default_stream_, event_, 0));
return allocation;
}
......
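The allocator above orders a new allocation against work already queued on the default stream: it records an event on the stream and then makes the stream wait on that event. A standalone sketch of this record/wait pattern with the single-argument macro (assuming a valid device is current; `stream` is illustrative):

    cudaEvent_t event;
    PADDLE_ENFORCE_CUDA_SUCCESS(
        cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
    // Capture the work submitted to `stream` so far.
    PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, stream));
    // Work submitted to `stream` after this call waits for the event.
    PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(stream, event, 0));
    PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event));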
......@@ -141,12 +141,7 @@ void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input,
num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8,
cu_stream);
}
PADDLE_ENFORCE_CUDA_SUCCESS(
err,
"ArgSortOP failed as could not launch "
"cub::DeviceSegmentedRadixSort::SortPairsDescending to calculate"
"temp_storage_bytes, status:%s.",
temp_storage_bytes, cudaGetErrorString(err));
PADDLE_ENFORCE_CUDA_SUCCESS(err);
Tensor temp_storage;
temp_storage.mutable_data<uint8_t>(ctx.GetPlace(), temp_storage_bytes);
......@@ -165,12 +160,7 @@ void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input,
cu_stream);
}
PADDLE_ENFORCE_CUDA_SUCCESS(
err,
"ArgSortOP failed as could not launch "
"cub::DeviceSegmentedRadixSort::SortPairsDescending to sort input, "
"temp_storage_bytes:%d status:%s.",
temp_storage_bytes, cudaGetErrorString(err));
PADDLE_ENFORCE_CUDA_SUCCESS(err);
}
template <typename T, typename IndType>
......
......@@ -108,32 +108,21 @@ class FusedBatchNormActKernel<platform::CUDADeviceContext, T>
cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&data_desc_),
platform::errors::External(
"The error has happened when calling "
"cudnnCreateTensorDescriptor(&data_desc_)."));
platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_),
platform::errors::External(
"The error has happened when calling "
"cudnnCreateTensorDescriptor(&bn_param_desc_)."));
platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
VLOG(3) << "Setting descriptors.";
std::vector<int> dims = {N, C, H, W, D};
std::vector<int> strides = {H * W * D * C, 1, W * D * C, D * C, C};
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnSetTensorNdDescriptor(
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
data_desc_, CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()),
platform::errors::External(
"The error has happened when calling cudnnSetTensorNdDescriptor."));
x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_,
data_desc_, mode_),
platform::errors::External("The error has happened when calling "
"cudnnDeriveBNTensorDescriptor."));
data_desc_, mode_));
double this_factor = 1. - momentum;
cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ACTIVATION;
......@@ -166,10 +155,7 @@ class FusedBatchNormActKernel<platform::CUDADeviceContext, T>
/*yDesc=*/data_desc_,
/*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
/*activationDesc=*/activation_desc_,
/*sizeInBytes=*/&workspace_size),
platform::errors::External(
"The error has happened when calling "
"cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize."));
/*sizeInBytes=*/&workspace_size));
// -------------- cudnn batchnorm reserve space --------------
PADDLE_ENFORCE_CUDA_SUCCESS(
......@@ -179,10 +165,7 @@ class FusedBatchNormActKernel<platform::CUDADeviceContext, T>
/*bnOps=*/bnOps_,
/*activationDesc=*/activation_desc_,
/*xDesc=*/data_desc_,
/*sizeInBytes=*/&reserve_space_size),
platform::errors::External(
"The error has happened when calling "
"cudnnGetBatchNormalizationTrainingExReserveSpaceSize."));
/*sizeInBytes=*/&reserve_space_size));
reserve_space_ptr = reserve_space->mutable_data(ctx.GetPlace(), x->type(),
reserve_space_size);
......@@ -204,22 +187,13 @@ class FusedBatchNormActKernel<platform::CUDADeviceContext, T>
saved_variance->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
activation_desc_, workspace_ptr, workspace_size, reserve_space_ptr,
reserve_space_size),
platform::errors::External(
"The error has happened when calling "
"cudnnBatchNormalizationForwardTrainingEx."));
reserve_space_size));
// clean when exit.
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(data_desc_),
platform::errors::External(
"The error has happened when calling "
"cudnnDestroyTensorDescriptor(data_desc_)."));
platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_),
platform::errors::External(
"The error has happened when calling "
"cudnnDestroyTensorDescriptor(bn_param_desc_)."));
platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
}
};
......@@ -298,15 +272,9 @@ class FusedBatchNormActGradKernel<platform::CUDADeviceContext, T>
cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&data_desc_),
platform::errors::External(
"The error has happened when calling "
"cudnnCreateTensorDescriptor(&data_desc_)."));
platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_),
platform::errors::External(
"The error has happened when calling "
"cudnnCreateTensorDescriptor(&bn_param_desc_)."));
platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
LOG(ERROR) << "Provided epsilon is smaller than "
<< "CUDNN_BN_MIN_EPSILON. Setting it to "
......@@ -314,17 +282,12 @@ class FusedBatchNormActGradKernel<platform::CUDADeviceContext, T>
}
epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnSetTensorNdDescriptor(
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
data_desc_, CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()),
platform::errors::External(
"The error has happened when calling cudnnSetTensorNdDescriptor."));
x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_,
data_desc_, mode_),
platform::errors::External("The error has happened when calling "
"cudnnDeriveBNTensorDescriptor."));
data_desc_, mode_));
const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
......@@ -354,10 +317,7 @@ class FusedBatchNormActGradKernel<platform::CUDADeviceContext, T>
/*dxDesc=*/data_desc_,
/*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
/*activationDesc=*/activation_desc_,
/*sizeInBytes=*/&workspace_size),
platform::errors::External(
"The error has happened when calling "
"cudnnGetBatchNormalizationBackwardExWorkspaceSize."));
/*sizeInBytes=*/&workspace_size));
workspace_ptr = workspace_tensor.mutable_data(ctx.GetPlace(), x->type(),
workspace_size);
......@@ -395,21 +355,13 @@ class FusedBatchNormActGradKernel<platform::CUDADeviceContext, T>
/*workspace=*/workspace_ptr,
/*workSpaceSizeInBytes=*/workspace_size,
/*reserveSpace=*/const_cast<T *>(reserve_space->template data<T>()),
/*reserveSpaceSizeInBytes=*/reserve_space_size),
platform::errors::External("The error has happened when calling "
"cudnnBatchNormalizationBackwardEx."));
/*reserveSpaceSizeInBytes=*/reserve_space_size));
// clean when exit.
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(data_desc_),
platform::errors::External(
"The error has happened when calling "
"cudnnDestroyTensorDescriptor(data_desc_)."));
platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_),
platform::errors::External(
"The error has happened when calling "
"cudnnDestroyTensorDescriptor(bn_param_desc_)."));
platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
}
};
......
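The fused batch-norm kernels above all follow the same cudnn descriptor lifecycle, now checked by the single-argument macro at every step. A condensed sketch of that lifecycle (the dims and strides are placeholders, not values from the kernels above):

    cudnnTensorDescriptor_t desc;
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnCreateTensorDescriptor(&desc));
    int dims[4] = {1, 3, 8, 8};        // illustrative NCHW shape
    int strides[4] = {192, 64, 8, 1};  // packed strides for the shape above
    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
        desc, CUDNN_DATA_FLOAT, 4, dims, strides));
    // ... pass `desc` to cudnn routines ...
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnDestroyTensorDescriptor(desc));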
......@@ -46,13 +46,9 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel<T> {
cudnnTensorDescriptor_t in_desc;
cudnnTensorDescriptor_t out_desc;
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&in_desc),
platform::errors::External("Create cudnn tensor descriptor failed in "
"transpose_flatten_concat_fusion op."));
platform::dynload::cudnnCreateTensorDescriptor(&in_desc));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&out_desc),
platform::errors::External("Create cudnn tensor descriptor failed in "
"transpose_flatten_concat_fusion op."));
platform::dynload::cudnnCreateTensorDescriptor(&out_desc));
cudnnDataType_t cudnn_dtype = CudnnDataType<T>::type;
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
......@@ -91,24 +87,15 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel<T> {
dims_y[i] = 1;
}
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnSetTensorNdDescriptor(
in_desc, cudnn_dtype, max_dim, dims_y.data(), stride_x.data()),
platform::errors::External("Create cudnn tensorNd descriptor failed "
"in transpose_flatten_concat op."));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnSetTensorNdDescriptor(
out_desc, cudnn_dtype, max_dim, dims_y.data(), stride_y.data()),
platform::errors::External("Create cudnn tensorNd descriptor failed "
"in transpose_flatten_concat op."));
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
in_desc, cudnn_dtype, max_dim, dims_y.data(), stride_x.data()));
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
out_desc, cudnn_dtype, max_dim, dims_y.data(), stride_y.data()));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnTransformTensor(
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnTransformTensor(
handle, CudnnDataType<T>::kOne(), in_desc,
static_cast<const void*>(ins[k]->data<T>()),
CudnnDataType<T>::kZero(), out_desc, static_cast<void*>(odata)),
platform::errors::External("Create cudnn transform tensor failed in "
"transpose_flatten_concat op."));
CudnnDataType<T>::kZero(), out_desc, static_cast<void*>(odata)));
if (concat_axis == 0) {
odata += osize;
} else {
......@@ -117,13 +104,9 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel<T> {
}
}
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(in_desc),
platform::errors::External(
"Destory cudnn descriptor failed in transpose_flatten_concat op."));
platform::dynload::cudnnDestroyTensorDescriptor(in_desc));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(out_desc),
platform::errors::External(
"Destory cudnn descriptor failed in transpose_flatten_concat op."));
platform::dynload::cudnnDestroyTensorDescriptor(out_desc));
}
};
......
......@@ -60,13 +60,10 @@ class CUDNNGridSampleOpKernel : public framework::OpKernel<T> {
cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
DataLayout::kNCHW, framework::vectorize<int>(output->dims()));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnSpatialTfSamplerForward(
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSpatialTfSamplerForward(
handle, cudnn_st_desc, CudnnDataType<T>::kOne(), cudnn_input_desc,
input_data, grid_data, CudnnDataType<T>::kZero(), cudnn_output_desc,
output_data),
platform::errors::InvalidArgument(
"cudnnSpatialTfSamplerForward in Op(grid_sampler) failed"));
output_data));
}
};
......@@ -122,9 +119,7 @@ class CUDNNGridSampleGradOpKernel : public framework::OpKernel<T> {
input_data, CudnnDataType<T>::kZero(), cudnn_input_grad_desc,
input_grad_data, CudnnDataType<T>::kOne(), cudnn_output_grad_desc,
output_grad_data, grid_data, CudnnDataType<T>::kZero(),
grid_grad_data),
platform::errors::InvalidArgument(
"cudnnSpatialTfSamplerBackward in Op(grid_sampler) failed"));
grid_grad_data));
}
};
......
......@@ -41,16 +41,12 @@ struct CUBlas<float> {
template <typename... ARGS>
static void SCAL(ARGS... args) {
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cublasSscal(args...),
platform::errors::External("dynload cublasSscal lib failed"));
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasSscal(args...));
}
template <typename... ARGS>
static void VCOPY(ARGS... args) {
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cublasScopy(args...),
platform::errors::External("dynload cublasScopy lib failed"));
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasScopy(args...));
}
template <typename... ARGS>
......@@ -108,16 +104,12 @@ struct CUBlas<double> {
template <typename... ARGS>
static void SCAL(ARGS... args) {
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cublasDscal(args...),
platform::errors::External("dynload cublasDscal lib failed"));
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDscal(args...));
}
template <typename... ARGS>
static void VCOPY(ARGS... args) {
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cublasDcopy(args...),
platform::errors::External("dynload cublasDcopy lib failed"));
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDcopy(args...));
}
template <typename... ARGS>
......
......@@ -59,20 +59,14 @@ class MeanCUDAKernel : public framework::OpKernel<T> {
auto err = cub::DeviceReduce::Sum(nullptr, temp_storage_bytes, trans_x,
out_data, size_prob, stream);
PADDLE_ENFORCE_CUDA_SUCCESS(
err, platform::errors::External(
"MeanOP failed to get reduce workspace size %s.",
cudaGetErrorString(err)));
PADDLE_ENFORCE_CUDA_SUCCESS(err);
framework::Tensor tmp;
auto* temp_storage = tmp.mutable_data<uint8_t>(
framework::make_ddim({static_cast<int64_t>(temp_storage_bytes)}),
context.GetPlace());
err = cub::DeviceReduce::Sum(temp_storage, temp_storage_bytes, trans_x,
out_data, size_prob, stream);
PADDLE_ENFORCE_CUDA_SUCCESS(
err, platform::errors::External(
"MeanOP failed to run CUDA reduce computation: %s.",
cudaGetErrorString(err)));
PADDLE_ENFORCE_CUDA_SUCCESS(err);
}
};
......
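MeanCUDAKernel above uses cub's two-phase convention: the first call with a null workspace pointer only computes `temp_storage_bytes`, and the second call performs the reduction. The same pattern in isolation (`d_in`, `d_out`, `num_items`, and `stream` are placeholders):

    size_t temp_storage_bytes = 0;
    // Phase 1: null workspace; only the required size is written.
    PADDLE_ENFORCE_CUDA_SUCCESS(cub::DeviceReduce::Sum(
        nullptr, temp_storage_bytes, d_in, d_out, num_items, stream));
    void* d_temp_storage = nullptr;
    PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc(&d_temp_storage, temp_storage_bytes));
    // Phase 2: the actual reduction.
    PADDLE_ENFORCE_CUDA_SUCCESS(cub::DeviceReduce::Sum(
        d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream));
    PADDLE_ENFORCE_CUDA_SUCCESS(cudaFree(d_temp_storage));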
......@@ -104,13 +104,9 @@ void BufferedReader::ReadAsync(size_t i) {
// gpu memory immediately without waiting gpu kernel ends
platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventRecord(events_[i].get(), compute_stream_),
platform::errors::Fatal(
"cudaEventRecord raises unexpected exception"));
cudaEventRecord(events_[i].get(), compute_stream_));
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamWaitEvent(stream_.get(), events_[i].get(), 0),
platform::errors::Fatal(
"cudaStreamWaitEvent raises unexpected exception"));
cudaStreamWaitEvent(stream_.get(), events_[i].get(), 0));
platform::RecordEvent record_event("BufferedReader:MemoryCopy");
for (size_t i = 0; i < cpu.size(); ++i) {
......@@ -138,17 +134,11 @@ void BufferedReader::ReadAsync(size_t i) {
size);
memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
cuda_pinned_place, cuda_pinned_ptr, size, stream_.get());
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamSynchronize(stream_.get()),
platform::errors::Fatal(
"cudaStreamSynchronize raises unexpected exception"));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_.get()));
}
gpu[i].set_lod(cpu[i].lod());
}
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamSynchronize(stream_.get()),
platform::errors::Fatal(
"cudaStreamSynchronize raises unexpected exception"));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_.get()));
}
#endif
return i;
......
......@@ -191,12 +191,9 @@ void SyncBatchNormFunctor(const framework::ExecutionContext &ctx,
if (comm) {
int dtype = platform::ToNCCLDataType(mean_out->type());
// In-place operation
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::ncclAllReduce(stats, stats, 2 * C + 1,
static_cast<ncclDataType_t>(dtype),
ncclSum, comm, stream),
platform::errors::InvalidArgument(
"ncclAllReduce in Op(sync_batch_norm) failed"));
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
stats, stats, 2 * C + 1, static_cast<ncclDataType_t>(dtype), ncclSum,
comm, stream));
}
#endif
......@@ -468,12 +465,9 @@ void SyncBatchNormGradFunctor(
if (comm) {
int dtype = platform::ToNCCLDataType(scale->type());
// In-place operation
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::ncclAllReduce(stats, stats, 2 * C + 1,
static_cast<ncclDataType_t>(dtype),
ncclSum, comm, stream),
platform::errors::InvalidArgument(
"ncclAllReduce in Op(sync_batch_norm) failed"));
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
stats, stats, 2 * C + 1, static_cast<ncclDataType_t>(dtype), ncclSum,
comm, stream));
}
#endif
......
proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto simple_threadpool)
proto_library(error_codes_proto SRCS error_codes.proto)
proto_library(cuda_error_proto SRCS cuda_error.proto)
if (WITH_PYTHON)
py_proto_compile(profiler_py_proto SRCS profiler.proto)
......@@ -28,7 +28,7 @@ cc_library(flags SRCS flags.cc DEPS gflags)
cc_library(errors SRCS errors.cc DEPS error_codes_proto)
cc_test(errors_test SRCS errors_test.cc DEPS errors enforce)
cc_library(enforce INTERFACE SRCS enforce.cc DEPS flags errors)
cc_library(enforce INTERFACE SRCS enforce.cc DEPS flags errors cuda_error_proto)
cc_test(enforce_test SRCS enforce_test.cc DEPS stringpiece enforce)
set(CPU_INFO_DEPS gflags glog enforce)
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
syntax = "proto2";
package paddle.platform.proto;
message MessageDesc {
// Indicates the type of error
required int32 errorCode = 1;
// Indicates the message of error
required string errorMessage = 2;
}
message AllMessageDesc {
// Version of the CUDA API
required int32 version = 1;
// Error messages for different error types
repeated MessageDesc Messages = 2;
}
message cudaerrorDesc {
// Error messages for different CUDA versions (9.0/10.0/10.2)
repeated AllMessageDesc AllMessages = 2;
}
\ No newline at end of file
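The serialized form of this message, cudaErrorMessage.pb, is what enforce.h parses at runtime. A minimal sketch of the lookup it performs (the function name and arguments are illustrative; the accessors come from the generated cuda_error.pb.h):

    #include <fstream>
    #include <string>
    #include "paddle/fluid/platform/cuda_error.pb.h"

    std::string LookupAdvice(const std::string& pb_path, int version, int code) {
      paddle::platform::proto::cudaerrorDesc all;
      std::ifstream fin(pb_path, std::ios::in | std::ios::binary);
      if (!all.ParseFromIstream(&fin)) return "";  // file missing or corrupt
      for (int i = 0; i < all.allmessages_size(); ++i) {
        if (all.allmessages(i).version() != version) continue;
        for (int j = 0; j < all.allmessages(i).messages_size(); ++j) {
          if (all.allmessages(i).messages(j).errorcode() == code) {
            return all.allmessages(i).messages(j).errormessage();
          }
        }
      }
      return "";  // no advice recorded for this version/code
    }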
......@@ -29,14 +29,7 @@ namespace platform {
class CublasHandleHolder {
public:
CublasHandleHolder(cudaStream_t stream, cublasMath_t math_type) {
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::cublasCreate(&handle_),
platform::errors::External(
"The cuBLAS library was not initialized. This is usually caused by "
"an error in the CUDA Runtime API called by the cuBLAS routine, or "
"an error in the hardware setup.\n"
"To correct: check that the hardware, an appropriate version of "
"the driver, and the cuBLAS library are correctly installed."));
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cublasCreate(&handle_));
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cublasSetStream(handle_, stream));
#if CUDA_VERSION >= 9000
if (math_type == CUBLAS_TENSOR_OP_MATH) {
......
......@@ -27,18 +27,13 @@ CudaStreamResourcePool::CudaStreamResourcePool() {
platform::SetDeviceId(dev_idx);
cudaStream_t stream;
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking),
platform::errors::Fatal(
"cudaStreamCreateWithFlags raises unexpected exception"));
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
return stream;
};
auto deleter = [dev_idx](cudaStream_t stream) {
platform::SetDeviceId(dev_idx);
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamDestroy(stream),
platform::errors::Fatal(
"cudaStreamDestroy raises unexpected exception"));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream));
};
pool_.emplace_back(
......@@ -72,18 +67,13 @@ CudaEventResourcePool::CudaEventResourcePool() {
platform::SetDeviceId(dev_idx);
cudaEvent_t event;
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventCreateWithFlags(&event, cudaEventDisableTiming),
platform::errors::Fatal(
"cudaEventCreateWithFlags raises unexpected exception"));
cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
return event;
};
auto deleter = [dev_idx](cudaEvent_t event) {
platform::SetDeviceId(dev_idx);
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventDestroy(event),
platform::errors::Fatal(
"cudaEventDestroy raises unexpected exception"));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event));
};
pool_.emplace_back(ResourcePool<CudaEventObject>::Create(creator, deleter));
......
......@@ -162,14 +162,9 @@ class CUDAContext {
<< "Please recompile or reinstall Paddle with compatible CUDNN "
"version.";
}
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreate(&cudnn_handle_));
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::cudnnCreate(&cudnn_handle_),
platform::errors::Fatal(
"Failed to create Cudnn handle in DeviceContext"));
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::cudnnSetStream(cudnn_handle_, RawStream()),
platform::errors::Fatal(
"Failed to set stream for Cudnn handle in DeviceContext"));
dynload::cudnnSetStream(cudnn_handle_, RawStream()));
} else {
cudnn_handle_ = nullptr;
}
......@@ -177,9 +172,7 @@ class CUDAContext {
void DestoryCuDNNContext() {
if (cudnn_handle_) {
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::cudnnDestroy(cudnn_handle_),
platform::errors::Fatal("Failed to destory Cudnn handle"));
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroy(cudnn_handle_));
}
cudnn_handle_ = nullptr;
}
......
......@@ -18,6 +18,13 @@ limitations under the License. */
#include <cxxabi.h> // for __cxa_demangle
#endif // __GNUC__
#if !defined(_WIN32)
#include <dlfcn.h> // dladdr
#else // _WIN32
#define NOMINMAX // msvc max/min macro conflict with std::min/max
#include <windows.h> // GetModuleFileName
#endif
#ifdef PADDLE_WITH_CUDA
#include <cublas_v2.h>
#include <cudnn.h>
......@@ -38,6 +45,7 @@ limitations under the License. */
#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
#include "glog/logging.h"
#include "paddle/fluid/platform/cuda_error.pb.h"
#include "paddle/fluid/platform/errors.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/port.h"
......@@ -220,10 +228,6 @@ inline std::string GetTraceBackString(StrType&& what, const char* file,
inline bool is_error(bool stat) { return !stat; }
inline std::string build_ex_string(bool stat, const std::string& msg) {
return msg;
}
inline void throw_on_error(bool stat, const std::string& msg) {
#ifndef REPLACE_ENFORCE_GLOG
throw std::runtime_error(msg);
......@@ -291,9 +295,7 @@ struct EnforceNotMet : public std::exception {
try { \
::paddle::platform::throw_on_error( \
__cond__, \
::paddle::platform::build_ex_string( \
__cond__, \
::paddle::platform::ErrorSummary(__VA_ARGS__).ToString())); \
::paddle::platform::ErrorSummary(__VA_ARGS__).ToString()); \
} catch (...) { \
HANDLE_THE_ERROR \
throw ::paddle::platform::EnforceNotMet(std::current_exception(), \
......@@ -464,30 +466,148 @@ struct EOFException : public std::exception {
} while (0)
/** CUDA PADDLE ENFORCE FUNCTIONS AND MACROS **/
#ifdef PADDLE_WITH_CUDA
/***** CUDA ERROR *****/
inline bool is_error(cudaError_t e) { return e != cudaSuccess; }
inline std::string build_ex_string(cudaError_t e, const std::string& msg) {
return msg;
inline std::string GetCudaErrorWebsite(int32_t cuda_version) {
std::ostringstream webstr;
webstr << "https://docs.nvidia.com/cuda/";
if (cuda_version != -1) {
double version = cuda_version / 10;
webstr << "archive/" << std::fixed << std::setprecision(1) << version;
}
webstr << "/cuda-runtime-api/group__CUDART__TYPES.html"
"#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038";
return webstr.str();
}
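// Worked example of the helper above (values follow the code): a cuda_version
// of 100 is divided by 10 and printed with one decimal place, so
// GetCudaErrorWebsite(100) returns
// "https://docs.nvidia.com/cuda/archive/10.0/cuda-runtime-api/...", while
// GetCudaErrorWebsite(-1) falls back to the unversioned docs URL.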
inline std::string build_nvidia_error_msg(cudaError_t e) {
#if CUDA_VERSION >= 10000 && CUDA_VERSION < 11000
int32_t cuda_version = 100;
#elif CUDA_VERSION >= 9000
int32_t cuda_version = 90;
#else
int32_t cuda_version = -1;
#endif
std::ostringstream sout;
sout << " Cuda error(" << e << "), " << cudaGetErrorString(e) << ".";
static platform::proto::cudaerrorDesc cudaerror;
static bool _initSucceed = false;
if (cudaerror.ByteSizeLong() == 0) {
std::string filePath;
#if !defined(_WIN32)
Dl_info info;
if (dladdr(reinterpret_cast<void*>(GetCudaErrorWebsite), &info)) {
std::string strModule(info.dli_fname);
const size_t last_slash_idx = strModule.find_last_of("/");
std::string compare_path = strModule.substr(strModule.length() - 6);
if (std::string::npos != last_slash_idx) {
strModule.erase(last_slash_idx, std::string::npos);
}
if (compare_path.compare("avx.so") == 0) {
filePath = strModule +
"/../include/third_party/cudaerror/data/cudaErrorMessage.pb";
} else {
filePath =
strModule + "/../../thirl_party/cudaerror/data/cudaErrorMessage.pb";
}
}
#else
char buf[100];
MEMORY_BASIC_INFORMATION mbi;
HMODULE h_module =
(::VirtualQuery(GetCudaErrorWebsite, &mbi, sizeof(mbi)) != 0)
? (HMODULE)mbi.AllocationBase
: NULL;
GetModuleFileName(h_module, buf, 100);
std::string strModule(buf);
const size_t last_slash_idx = strModule.find_last_of("\\");
std::string compare_path = strModule.substr(strModule.length() - 7);
if (std::string::npos != last_slash_idx) {
strModule.erase(last_slash_idx, std::string::npos);
}
if (compare_path.compare("avx.pyd") == 0) {
filePath =
strModule +
"\\..\\include\\third_party\\cudaerror\\data\\cudaErrorMessage.pb";
} else {
filePath =
strModule + "\\..\\third_party\\cudaerror\\data\\cudaErrorMessage.pb";
}
#endif
std::ifstream fin(filePath, std::ios::in | std::ios::binary);
_initSucceed = cudaerror.ParseFromIstream(&fin);
}
if (_initSucceed) {
for (int i = 0; i < cudaerror.allmessages_size(); ++i) {
if (cuda_version == cudaerror.allmessages(i).version()) {
for (int j = 0; j < cudaerror.allmessages(i).messages_size(); ++j) {
if (e == cudaerror.allmessages(i).messages(j).errorcode()) {
sout << "\n [Advise: "
<< cudaerror.allmessages(i).messages(j).errormessage() << "]";
return sout.str();
}
}
}
}
}
sout << "\n [Advise: Please search for the error code(" << e
<< ") on website( " << GetCudaErrorWebsite(cuda_version)
<< " ) to get Nvidia's official solution about CUDA Error.]";
return sout.str();
}
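// Illustrative result for cudaErrorInvalidValue (code 1), assuming
// cudaErrorMessage.pb was found and parsed:
//    Cuda error(1), invalid argument.
//    [Advise: <entry for error code 1 from cudaErrorMessage.pb>]
// If the file is missing, the [Advise: ...] line instead points to the
// versioned NVIDIA documentation URL from GetCudaErrorWebsite().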
inline void throw_on_error(cudaError_t e, const std::string& msg) {
#ifndef REPLACE_ENFORCE_GLOG
throw thrust::system_error(e, thrust::cuda_category(), msg);
throw std::runtime_error(msg);
#else
LOG(FATAL) << msg;
#endif
}
/** curand ERROR **/
inline bool is_error(curandStatus_t stat) {
return stat != CURAND_STATUS_SUCCESS;
}
inline std::string build_ex_string(curandStatus_t stat,
const std::string& msg) {
return msg;
inline const char* curandGetErrorString(curandStatus_t stat) {
switch (stat) {
case CURAND_STATUS_SUCCESS:
return "CURAND_STATUS_SUCCESS";
case CURAND_STATUS_VERSION_MISMATCH:
return "CURAND_STATUS_VERSION_MISMATCH";
case CURAND_STATUS_NOT_INITIALIZED:
return "CURAND_STATUS_NOT_INITIALIZED";
case CURAND_STATUS_ALLOCATION_FAILED:
return "CURAND_STATUS_ALLOCATION_FAILED";
case CURAND_STATUS_TYPE_ERROR:
return "CURAND_STATUS_TYPE_ERROR";
case CURAND_STATUS_OUT_OF_RANGE:
return "CURAND_STATUS_OUT_OF_RANGE";
case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
case CURAND_STATUS_LAUNCH_FAILURE:
return "CURAND_STATUS_LAUNCH_FAILURE";
case CURAND_STATUS_PREEXISTING_FAILURE:
return "CURAND_STATUS_PREEXISTING_FAILURE";
case CURAND_STATUS_INITIALIZATION_FAILED:
return "CURAND_STATUS_INITIALIZATION_FAILED";
case CURAND_STATUS_ARCH_MISMATCH:
return "CURAND_STATUS_ARCH_MISMATCH";
case CURAND_STATUS_INTERNAL_ERROR:
return "CURAND_STATUS_INTERNAL_ERROR";
default:
return "Unknown curand status";
}
}
inline std::string build_nvidia_error_msg(curandStatus_t stat) {
std::string msg(" Curand error, ");
return msg + curandGetErrorString(stat) + " ";
}
inline void throw_on_error(curandStatus_t stat, const std::string& msg) {
......@@ -499,13 +619,14 @@ inline void throw_on_error(curandStatus_t stat, const std::string& msg) {
#endif
}
/***** CUDNN ERROR *****/
inline bool is_error(cudnnStatus_t stat) {
return stat != CUDNN_STATUS_SUCCESS;
}
inline std::string build_ex_string(cudnnStatus_t stat, const std::string& msg) {
return msg + "\n [Hint: " + platform::dynload::cudnnGetErrorString(stat) +
"]";
inline std::string build_nvidia_error_msg(cudnnStatus_t stat) {
std::string msg(" Cudnn error, ");
return msg + platform::dynload::cudnnGetErrorString(stat) + " ";
}
inline void throw_on_error(cudnnStatus_t stat, const std::string& msg) {
......@@ -516,33 +637,39 @@ inline void throw_on_error(cudnnStatus_t stat, const std::string& msg) {
#endif
}
/***** CUBLAS ERROR *****/
inline bool is_error(cublasStatus_t stat) {
return stat != CUBLAS_STATUS_SUCCESS;
}
inline std::string build_ex_string(cublasStatus_t stat,
const std::string& msg) {
std::string err;
if (stat == CUBLAS_STATUS_NOT_INITIALIZED) {
err = "CUBLAS_STATUS_NOT_INITIALIZED";
} else if (stat == CUBLAS_STATUS_ALLOC_FAILED) {
err = "CUBLAS_STATUS_ALLOC_FAILED";
} else if (stat == CUBLAS_STATUS_INVALID_VALUE) {
err = "CUBLAS_STATUS_INVALID_VALUE";
} else if (stat == CUBLAS_STATUS_ARCH_MISMATCH) {
err = "CUBLAS_STATUS_ARCH_MISMATCH";
} else if (stat == CUBLAS_STATUS_MAPPING_ERROR) {
err = "CUBLAS_STATUS_MAPPING_ERROR";
} else if (stat == CUBLAS_STATUS_EXECUTION_FAILED) {
err = "CUBLAS_STATUS_EXECUTION_FAILED";
} else if (stat == CUBLAS_STATUS_INTERNAL_ERROR) {
err = "CUBLAS_STATUS_INTERNAL_ERROR";
} else if (stat == CUBLAS_STATUS_NOT_SUPPORTED) {
err = "CUBLAS_STATUS_NOT_SUPPORTED";
} else if (stat == CUBLAS_STATUS_LICENSE_ERROR) {
err = "CUBLAS_STATUS_LICENSE_ERROR";
inline const char* cublasGetErrorString(cublasStatus_t stat) {
switch (stat) {
case CUBLAS_STATUS_NOT_INITIALIZED:
return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED:
return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE:
return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH:
return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR:
return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED:
return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR:
return "CUBLAS_STATUS_INTERNAL_ERROR";
case CUBLAS_STATUS_NOT_SUPPORTED:
return "CUBLAS_STATUS_NOT_SUPPORTED";
case CUBLAS_STATUS_LICENSE_ERROR:
return "CUBLAS_STATUS_LICENSE_ERROR";
default:
return "Unknown cublas status";
}
return msg + "\n [Hint: " + err + "]";
}
inline std::string build_nvidia_error_msg(cublasStatus_t stat) {
std::string msg(" Cublas error, ");
return msg + cublasGetErrorString(stat) + " ";
}
inline void throw_on_error(cublasStatus_t stat, const std::string& msg) {
......@@ -553,15 +680,15 @@ inline void throw_on_error(cublasStatus_t stat, const std::string& msg) {
#endif
}
/****** NCCL ERROR ******/
#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
inline bool is_error(ncclResult_t nccl_result) {
return nccl_result != ncclSuccess;
}
inline std::string build_ex_string(ncclResult_t nccl_result,
const std::string& msg) {
return msg + "\n [" + platform::dynload::ncclGetErrorString(nccl_result) +
"]";
inline std::string build_nvidia_error_msg(ncclResult_t nccl_result) {
std::string msg(" Nccl error, ");
return msg + platform::dynload::ncclGetErrorString(nccl_result) + " ";
}
inline void throw_on_error(ncclResult_t nccl_result, const std::string& msg) {
......@@ -571,11 +698,8 @@ inline void throw_on_error(ncclResult_t nccl_result, const std::string& msg) {
LOG(FATAL) << msg;
#endif
}
#endif // __APPLE__ and windows
#endif // PADDLE_WITH_CUDA
#endif // not(__APPLE__) and PADDLE_WITH_NCCL
#ifdef PADDLE_WITH_CUDA
namespace details {
template <typename T>
......@@ -598,10 +722,8 @@ DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess);
#endif
} // namespace details
#endif // PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUDA
#define PADDLE_ENFORCE_CUDA_SUCCESS(COND, ...) \
#define PADDLE_ENFORCE_CUDA_SUCCESS(COND) \
do { \
auto __cond__ = (COND); \
using __CUDA_STATUS_TYPE__ = decltype(__cond__); \
......@@ -612,9 +734,9 @@ DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess);
try { \
::paddle::platform::throw_on_error( \
__cond__, \
::paddle::platform::build_ex_string( \
__cond__, \
::paddle::platform::ErrorSummary(__VA_ARGS__).ToString())); \
::paddle::platform::errors::External( \
::paddle::platform::build_nvidia_error_msg(__cond__)) \
.ToString()); \
} catch (...) { \
HANDLE_THE_ERROR \
throw ::paddle::platform::EnforceNotMet(std::current_exception(), \
......
......@@ -261,15 +261,14 @@ TEST(EOF_EXCEPTION, THROW_EOF) {
#ifdef PADDLE_WITH_CUDA
template <typename T>
bool CheckCudaStatusSuccess(T value, const std::string& msg = "success") {
PADDLE_ENFORCE_CUDA_SUCCESS(value, msg);
PADDLE_ENFORCE_CUDA_SUCCESS(value);
return true;
}
template <typename T>
bool CheckCudaStatusFailure(
T value, const std::string& msg = "self-defined cuda status failed") {
bool CheckCudaStatusFailure(T value, const std::string& msg) {
try {
PADDLE_ENFORCE_CUDA_SUCCESS(value, msg);
PADDLE_ENFORCE_CUDA_SUCCESS(value);
return false;
} catch (paddle::platform::EnforceNotMet& error) {
std::string ex_msg = error.what();
......@@ -279,24 +278,29 @@ bool CheckCudaStatusFailure(
TEST(enforce, cuda_success) {
EXPECT_TRUE(CheckCudaStatusSuccess(cudaSuccess));
EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorInvalidValue));
EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorMemoryAllocation));
EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorInvalidValue, "Cuda error"));
EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorMemoryAllocation, "Cuda error"));
EXPECT_TRUE(CheckCudaStatusSuccess(CURAND_STATUS_SUCCESS));
EXPECT_TRUE(CheckCudaStatusFailure(CURAND_STATUS_VERSION_MISMATCH));
EXPECT_TRUE(CheckCudaStatusFailure(CURAND_STATUS_NOT_INITIALIZED));
EXPECT_TRUE(
CheckCudaStatusFailure(CURAND_STATUS_VERSION_MISMATCH, "Curand error"));
EXPECT_TRUE(
CheckCudaStatusFailure(CURAND_STATUS_NOT_INITIALIZED, "Curand error"));
EXPECT_TRUE(CheckCudaStatusSuccess(CUDNN_STATUS_SUCCESS));
EXPECT_TRUE(CheckCudaStatusFailure(CUDNN_STATUS_NOT_INITIALIZED));
EXPECT_TRUE(CheckCudaStatusFailure(CUDNN_STATUS_ALLOC_FAILED));
EXPECT_TRUE(
CheckCudaStatusFailure(CUDNN_STATUS_NOT_INITIALIZED, "Cudnn error"));
EXPECT_TRUE(CheckCudaStatusFailure(CUDNN_STATUS_ALLOC_FAILED, "Cudnn error"));
EXPECT_TRUE(CheckCudaStatusSuccess(CUBLAS_STATUS_SUCCESS));
EXPECT_TRUE(CheckCudaStatusFailure(CUBLAS_STATUS_NOT_INITIALIZED));
EXPECT_TRUE(CheckCudaStatusFailure(CUBLAS_STATUS_INVALID_VALUE));
EXPECT_TRUE(
CheckCudaStatusFailure(CUBLAS_STATUS_NOT_INITIALIZED, "Cublas error"));
EXPECT_TRUE(
CheckCudaStatusFailure(CUBLAS_STATUS_INVALID_VALUE, "Cublas error"));
#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess));
EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError));
EXPECT_TRUE(CheckCudaStatusFailure(ncclSystemError));
EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError, "Nccl error"));
EXPECT_TRUE(CheckCudaStatusFailure(ncclSystemError, "Nccl error"));
#endif
}
#endif
......
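The test changes above capture the call-site migration this PR performs everywhere. A before/after sketch (cudaSetDevice is purely illustrative):

    // Before: the caller supplied its own message as a second argument.
    PADDLE_ENFORCE_CUDA_SUCCESS(
        cudaSetDevice(0),
        platform::errors::External("cudaSetDevice failed"));

    // After: status only; the NVIDIA error string and the advice from
    // cudaErrorMessage.pb are appended by build_nvidia_error_msg().
    PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(0));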
......@@ -16,7 +16,6 @@ limitations under the License. */
#include <algorithm>
#include <cstdlib>
#include <memory>
#include <string>
#include "gflags/gflags.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
......@@ -42,18 +41,13 @@ faster way to query device properties. You can see details in
https://devblogs.nvidia.com/cuda-pro-tip-the-fast-way-to-query-device-properties/
*/
inline std::string CudaErrorWebsite() {
return "Please see detail in https://docs.nvidia.com/cuda/cuda-runtime-api"
"/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c217824"
"6db0a94a430e0038";
}
static int GetCUDADeviceCountImpl() {
int driverVersion = 0;
cudaError_t status = cudaDriverGetVersion(&driverVersion);
if (!(status == cudaSuccess && driverVersion != 0)) {
// No GPU driver
VLOG(2) << "GPU Driver Version can't be detected. No GPU driver!";
return 0;
}
......@@ -67,14 +61,8 @@ static int GetCUDADeviceCountImpl() {
return 0;
}
}
int count;
auto error_code = cudaGetDeviceCount(&count);
PADDLE_ENFORCE(
error_code,
"cudaGetDeviceCount failed in "
"paddle::platform::GetCUDADeviceCountImpl, error code : %d, %s",
error_code, CudaErrorWebsite());
PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetDeviceCount(&count));
return count;
}
......@@ -84,72 +72,63 @@ int GetCUDADeviceCount() {
}
int GetCUDAComputeCapability(int id) {
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetCUDADeviceCount()));
int major, minor;
auto major_error_code =
cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, id);
auto minor_error_code =
cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, id);
PADDLE_ENFORCE_EQ(
major_error_code, 0,
"cudaDevAttrComputeCapabilityMajor failed in "
"paddle::platform::GetCUDAComputeCapability, error code : %d, %s",
major_error_code, CudaErrorWebsite());
PADDLE_ENFORCE_EQ(
minor_error_code, 0,
"cudaDevAttrComputeCapabilityMinor failed in "
"paddle::platform::GetCUDAComputeCapability, error code : %d, %s",
minor_error_code, CudaErrorWebsite());
PADDLE_ENFORCE_CUDA_SUCCESS(major_error_code);
PADDLE_ENFORCE_CUDA_SUCCESS(minor_error_code);
return major * 10 + minor;
}
dim3 GetGpuMaxGridDimSize(int id) {
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetCUDADeviceCount()));
dim3 ret;
int size;
auto error_code_x = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimX, id);
PADDLE_ENFORCE_EQ(error_code_x, 0,
"cudaDevAttrMaxGridDimX failed in "
"paddle::platform::GpuMaxGridDimSize, error code : %d, %s",
error_code_x, CudaErrorWebsite());
PADDLE_ENFORCE_CUDA_SUCCESS(error_code_x);
ret.x = size;
auto error_code_y = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimY, id);
PADDLE_ENFORCE_EQ(error_code_y, 0,
"cudaDevAttrMaxGridDimY failed in "
"paddle::platform::GpuMaxGridDimSize, error code : %d, %s",
error_code_y, CudaErrorWebsite());
PADDLE_ENFORCE_CUDA_SUCCESS(error_code_y);
ret.y = size;
auto error_code_z = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimZ, id);
PADDLE_ENFORCE_EQ(error_code_z, 0,
"cudaDevAttrMaxGridDimZ failed in "
"paddle::platform::GpuMaxGridDimSize, error code : %d, %s",
error_code_z, CudaErrorWebsite());
PADDLE_ENFORCE_CUDA_SUCCESS(error_code_z);
ret.z = size;
return ret;
}
int GetCUDARuntimeVersion(int id) {
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetCUDADeviceCount()));
int runtime_version = 0;
auto error_code = cudaRuntimeGetVersion(&runtime_version);
PADDLE_ENFORCE(error_code,
"cudaRuntimeGetVersion failed in "
"paddle::platform::GetCUDARuntimeVersion, error code : %d, %s",
error_code, CudaErrorWebsite());
PADDLE_ENFORCE_CUDA_SUCCESS(cudaRuntimeGetVersion(&runtime_version));
return runtime_version;
}
int GetCUDADriverVersion(int id) {
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetCUDADeviceCount()));
int driver_version = 0;
auto error_code = cudaDriverGetVersion(&driver_version);
PADDLE_ENFORCE(error_code,
"cudaDriverGetVersion failed in "
"paddle::platform::GetCUDADriverVersion, error code : %d, %s",
error_code, CudaErrorWebsite());
PADDLE_ENFORCE_CUDA_SUCCESS(cudaDriverGetVersion(&driver_version));
return driver_version;
}
......@@ -164,56 +143,44 @@ bool TensorCoreAvailable() {
}
int GetCUDAMultiProcessors(int id) {
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetCUDADeviceCount()));
int count;
auto error_code =
cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id);
PADDLE_ENFORCE(error_code,
"cudaDeviceGetAttribute failed in "
"paddle::platform::GetCUDAMultiProcess, error code : %d, %s",
error_code, CudaErrorWebsite());
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id));
return count;
}
int GetCUDAMaxThreadsPerMultiProcessor(int id) {
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetCUDADeviceCount()));
int count;
auto error_code = cudaDeviceGetAttribute(
&count, cudaDevAttrMaxThreadsPerMultiProcessor, id);
PADDLE_ENFORCE(
error_code,
"cudaDeviceGetAttribute failed in paddle::"
"platform::GetCUDAMaxThreadsPerMultiProcessor, error code : %d, %s",
error_code, CudaErrorWebsite());
PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceGetAttribute(
&count, cudaDevAttrMaxThreadsPerMultiProcessor, id));
return count;
}
int GetCUDAMaxThreadsPerBlock(int id) {
PADDLE_ENFORCE_LT(
id, GetCUDADeviceCount(),
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
platform::errors::InvalidArgument(
"Device id must less than GPU count, but received id is:%d, "
"GPU count is: %d.",
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetCUDADeviceCount()));
int count;
auto error_code =
cudaDeviceGetAttribute(&count, cudaDevAttrMaxThreadsPerBlock, id);
PADDLE_ENFORCE_EQ(
error_code, 0,
platform::errors::InvalidArgument(
"cudaDeviceGetAttribute returned error code should be 0, "
"but received error code is: %d, %s",
error_code, CudaErrorWebsite()));
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaDeviceGetAttribute(&count, cudaDevAttrMaxThreadsPerBlock, id));
return count;
}
int GetCurrentDeviceId() {
int device_id;
auto error_code = cudaGetDevice(&device_id);
PADDLE_ENFORCE(error_code,
"cudaGetDevice failed in "
"paddle::platform::GetCurrentDeviceId, error code : %d, %s",
error_code, CudaErrorWebsite());
PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetDevice(&device_id));
return device_id;
}
......@@ -237,12 +204,12 @@ std::vector<int> GetSelectedDevices() {
void SetDeviceId(int id) {
// TODO(qijun): find a better way to cache the cuda device count
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
auto error_code = cudaSetDevice(id);
PADDLE_ENFORCE(error_code,
"cudaSetDevice failed in "
"paddle::platform::SetDeviced, error code : %d, %s",
error_code, CudaErrorWebsite());
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetCUDADeviceCount()));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(id));
}
void GpuMemoryUsage(size_t *available, size_t *total) {
......@@ -306,74 +273,44 @@ size_t GpuMaxChunkSize() {
void GpuMemcpyAsync(void *dst, const void *src, size_t count,
enum cudaMemcpyKind kind, cudaStream_t stream) {
auto error_code = cudaMemcpyAsync(dst, src, count, kind, stream);
PADDLE_ENFORCE(error_code,
"cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync "
"(%p -> %p, length: %d) error code : %d, %s",
src, dst, static_cast<int>(count), error_code,
CudaErrorWebsite());
PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(dst, src, count, kind, stream));
}
void GpuMemcpySync(void *dst, const void *src, size_t count,
enum cudaMemcpyKind kind) {
auto error_code = cudaMemcpy(dst, src, count, kind);
PADDLE_ENFORCE(error_code,
"cudaMemcpy failed in paddle::platform::GpuMemcpySync "
"(%p -> %p, length: %d) error code : %d, %s",
src, dst, static_cast<int>(count), error_code,
CudaErrorWebsite());
PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpy(dst, src, count, kind));
}
void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
int src_device, size_t count, cudaStream_t stream) {
auto error_code =
cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream);
PADDLE_ENFORCE(
error_code,
"cudaMemcpyPeerAsync failed in paddle::platform::GpuMemcpyPeerAsync "
"error code : %d, %s",
error_code, CudaErrorWebsite());
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream));
}
void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
int src_device, size_t count) {
auto error_code = cudaMemcpyPeer(dst, dst_device, src, src_device, count);
PADDLE_ENFORCE(error_code,
"cudaMemcpyPeer failed in paddle::platform::GpuMemcpyPeerSync "
"error code : %d, %s",
error_code, CudaErrorWebsite());
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMemcpyPeer(dst, dst_device, src, src_device, count));
}
void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) {
auto error_code = cudaMemsetAsync(dst, value, count, stream);
PADDLE_ENFORCE(error_code,
"cudaMemsetAsync failed in paddle::platform::GpuMemsetAsync "
"error code : %d, %s",
error_code, CudaErrorWebsite());
PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemsetAsync(dst, value, count, stream));
}
void GpuStreamSync(cudaStream_t stream) {
auto error_code = cudaStreamSynchronize(stream);
PADDLE_ENFORCE_CUDA_SUCCESS(
error_code,
platform::errors::External(
"cudaStreamSynchronize failed in paddle::platform::GpuStreamSync "
"error code : %d, %s",
error_code, CudaErrorWebsite()));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
}
static void RaiseNonOutOfMemoryError(cudaError_t *status) {
if (*status == cudaErrorMemoryAllocation) {
*status = cudaSuccess;
}
PADDLE_ENFORCE_CUDA_SUCCESS(*status);
*status = cudaGetLastError();
if (*status == cudaErrorMemoryAllocation) {
*status = cudaSuccess;
}
PADDLE_ENFORCE_CUDA_SUCCESS(*status);
}
......@@ -450,8 +387,7 @@ class RecordedCudaMallocHelper {
CUDADeviceGuard guard(dev_id_);
auto err = cudaFree(ptr);
if (err != cudaErrorCudartUnloading) {
PADDLE_ENFORCE_CUDA_SUCCESS(
err, platform::errors::External("cudaFree raises unexpected error"));
PADDLE_ENFORCE_CUDA_SUCCESS(err);
if (NeedRecord()) {
std::lock_guard<std::mutex> guard(*mtx_);
cur_size_ -= size;
......
......@@ -117,10 +117,7 @@ void SynchronizeAllDevice() {
int count = GetCUDADeviceCount();
for (int i = 0; i < count; i++) {
SetDeviceId(i);
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaDeviceSynchronize(),
platform::errors::External(
"Device synchronize failed in cudaDeviceSynchronize()"));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceSynchronize());
}
#endif
}
......
......@@ -30,13 +30,10 @@ bool CUDAStream::Init(const Place& place, const enum Priority& priority) {
CUDADeviceGuard guard(boost::get<CUDAPlace>(place_).device);
if (priority == Priority::kHigh) {
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamCreateWithPriority(&stream_, kDefaultFlag, -1),
platform::errors::Fatal("High priority cuda stream creation failed."));
cudaStreamCreateWithPriority(&stream_, kDefaultFlag, -1));
} else if (priority == Priority::kNormal) {
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamCreateWithPriority(&stream_, kDefaultFlag, 0),
platform::errors::Fatal(
"Normal priority cuda stream creation failed."));
cudaStreamCreateWithPriority(&stream_, kDefaultFlag, 0));
}
callback_manager_.reset(new StreamCallbackManager(stream_));
VLOG(3) << "CUDAStream Init stream: " << stream_
......@@ -49,9 +46,7 @@ void CUDAStream::Destroy() {
Wait();
WaitCallback();
if (stream_) {
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamDestroy(stream_),
platform::errors::Fatal("Cuda stream destruction failed."));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream_));
}
stream_ = nullptr;
}
......@@ -67,10 +62,7 @@ void CUDAStream::Wait() const {
}
#endif
PADDLE_ENFORCE_CUDA_SUCCESS(
e_sync, platform::errors::Fatal(
"cudaStreamSynchronize raises error: %s, errono: %d",
cudaGetErrorString(e_sync), static_cast<int>(e_sync)));
PADDLE_ENFORCE_CUDA_SUCCESS(e_sync);
}
} // namespace stream
......
......@@ -53,21 +53,15 @@ class CUDAStream final {
template <typename Callback>
void RecordEvent(cudaEvent_t ev, Callback callback) const {
callback();
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventRecord(ev, stream_),
platform::errors::Fatal("CUDA event recording failed."));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(ev, stream_));
}
void RecordEvent(cudaEvent_t ev) const {
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        cudaEventRecord(ev, stream_),
-        platform::errors::Fatal("CUDA event recording failed."));
+    PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(ev, stream_));
}
void WaitEvent(cudaEvent_t ev) const {
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        cudaStreamWaitEvent(stream_, ev, 0),
-        platform::errors::Fatal("Failed to wait event."));
+    PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(stream_, ev, 0));
}
void Wait() const;
......
......@@ -213,12 +213,14 @@ if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']:
# the prefix is sys.prefix which should always be usr
paddle_bins = ''
if not '${WIN32}':
paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/scripts/paddle']
package_data={'paddle.fluid': ['${FLUID_CORE_NAME}' + ('.so' if os.name != 'nt' else '.pyd')]}
if '${HAS_NOAVX_CORE}' == 'ON':
package_data['paddle.fluid'] += ['core_noavx' + ('.so' if os.name != 'nt' else '.pyd')]
package_dir={
'': '${PADDLE_BINARY_DIR}/python',
# The paddle.fluid.proto will be generated while compiling.
......@@ -329,6 +331,7 @@ headers = (
list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/string')) +
list(find_files('*.pb.h', '${PADDLE_BINARY_DIR}/paddle/fluid/platform')) +
list(find_files('*.pb.h', '${PADDLE_BINARY_DIR}/paddle/fluid/framework')) +
+    list(find_files('*.pb', '${cudaerror_INCLUDE_DIR}')) + # errorMessage.pb for errormessage
['${EIGEN_INCLUDE_DIR}/Eigen/Core'] + # eigen
list(find_files('*', '${EIGEN_INCLUDE_DIR}/Eigen/src')) + # eigen
list(find_files('*', '${EIGEN_INCLUDE_DIR}/unsupported/Eigen')) + # eigen
......@@ -400,7 +403,9 @@ class InstallHeaders(Command):
return self.copy_file(header, install_dir)
def run(self):
# only copy third_party/cudaErrorMessage.pb for cudaErrorMessage on mac or windows
if os.name == 'nt' or sys.platform == 'darwin':
self.mkdir_and_copy_file('${cudaerror_INCLUDE_DIR}/cudaErrorMessage.pb')
return
hdrs = self.distribution.headers
if not hdrs:
......
......@@ -172,8 +172,8 @@ if [ "${ALL_PADDLE_ENFORCE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
check_approval 1 6836917 47554610 22561442
fi
-ALL_PADDLE_CHECK=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "(PADDLE_ENFORCE[A-Z_]*|PADDLE_THROW)\(.[^,\);]*.[^;]*\);\s" || true`
-VALID_PADDLE_CHECK=`echo "$ALL_PADDLE_CHECK" | grep -zoE '(PADDLE_ENFORCE[A-Z_]*|PADDLE_THROW)\((.[^,;]+,)*.[^";]*(errors::).[^"]*".[^";]{20,}.[^;]*\);\s' || true`
+ALL_PADDLE_CHECK=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\(.[^,\);]*.[^;]*\);\s" || true`
+VALID_PADDLE_CHECK=`echo "$ALL_PADDLE_CHECK" | grep -zoE '(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\((.[^,;]+,)*.[^";]*(errors::).[^"]*".[^";]{20,}.[^;]*\);\s' || true`
INVALID_PADDLE_CHECK=`echo "$ALL_PADDLE_CHECK" |grep -vxF "$VALID_PADDLE_CHECK" || true`
if [ "${INVALID_PADDLE_CHECK}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
echo_line="The error message you wrote in PADDLE_ENFORCE{_**} or PADDLE_THROW does not meet our error message writing specification. Possible errors include 1. the error message is empty / 2. the error message is too short / 3. the error type is not specified. Please read the specification [ https://github.com/PaddlePaddle/Paddle/wiki/Paddle-Error-Message-Writing-Specification ], then refine the error message. If it is a mismatch, please specify chenwhql (Recommend), luotao1 or lanxianghit review and approve.\nThe PADDLE_ENFORCE{_**} or PADDLE_THROW entries that do not meet the specification are as follows:\n${INVALID_PADDLE_CHECK}\n"
......
......@@ -30,9 +30,9 @@ ALL_PADDLE_CHECK_CNT=0
VALID_PADDLE_CHECK_CNT=0
function enforce_scan(){
-    paddle_check=`grep -r -zoE "(PADDLE_ENFORCE[A-Z_]*|PADDLE_THROW)\(.[^,\);]*.[^;]*\);\s" $1 || true`
+    paddle_check=`grep -r -zoE "(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\(.[^,\);]*.[^;]*\);\s" $1 || true`
total_check_cnt=`echo "$paddle_check" | grep -cE "(PADDLE_ENFORCE|PADDLE_THROW)" || true`
-    valid_check_cnt=`echo "$paddle_check" | grep -zoE '(PADDLE_ENFORCE[A-Z_]*|PADDLE_THROW)\((.[^,;]+,)*.[^";]*(errors::).[^"]*".[^";]{20,}.[^;]*\);\s' | grep -cE "(PADDLE_ENFORCE|PADDLE_THROW)" || true`
+    valid_check_cnt=`echo "$paddle_check" | grep -zoE '(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\((.[^,;]+,)*.[^";]*(errors::).[^"]*".[^";]{20,}.[^;]*\);\s' | grep -cE "(PADDLE_ENFORCE|PADDLE_THROW)" || true`
eval $2=$total_check_cnt
eval $3=$valid_check_cnt
}
......
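Note the tightening from [A-Z_]* to [A-Z_]{0,9}: the suffix _CUDA_SUCCESS is 13 characters, so PADDLE_ENFORCE_CUDA_SUCCESS, which no longer takes a custom message after this refactor, appears to drop out of the message-specification scan. An illustrative pair of call sites (not from this diff) showing what the VALID_PADDLE_CHECK pattern accepts and rejects:
```
// Accepted: names an errors:: type and carries a quoted message >= 20 chars.
PADDLE_ENFORCE_GT(batch_size, 0,
                  platform::errors::InvalidArgument(
                      "Expected batch_size > 0, but received %d.", batch_size));
// Rejected: no errors:: type and the message is far too short.
PADDLE_ENFORCE(ctx != nullptr, "ctx is null");
```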
Usage:
Please run:
```
bash start.sh
```
By default, the error messages of CUDA 9.0, CUDA 10.0, and the latest CUDA version are crawled.
To crawl a specific CUDA version, please run:
```
bash start.sh <version> <URL(optional)>
```
The URL is derived from the version by default, so you usually don't need to supply one.
For example:
```
bash start.sh 11.0
```
will crawl the error messages of CUDA 11.0 (once it is released).
Whenever NVIDIA releases a new major CUDA version, run `bash start.sh` in this directory and upload cudaErrorMessage.tar.gz to https://paddlepaddledeps.bj.bcebos.com/cudaErrorMessage.tar.gz
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import ssl
import re
import urllib2
import json
import collections
import sys, getopt
import cuda_error_pb2


def parsing(cuda_errorDesc, version, url):
All_Messages = cuda_errorDesc.AllMessages.add()
All_Messages.version = int(version)
ssl._create_default_https_context = ssl._create_unverified_context
html = urllib2.urlopen(url).read()
res_div = r'<div class="section">.*?<p>CUDA error types </p>.*?</div>.*?<div class="enum-members">(.*?)</div>'
m_div = re.findall(res_div, html, re.S | re.M)
url_list = url.split('/')
url_prefix = '/'.join(url_list[0:url_list.index('cuda-runtime-api') + 1])
dic = collections.OrderedDict()
dic_message = collections.OrderedDict()
for line in m_div:
res_dt = r'<dt>(.*?)</dt>.*?<dd>(.*?)</dd>'
m_dt = re.findall(res_dt, line, re.S | re.M)
for error in m_dt:
res_type = r'<span class="ph ph apiData">(.*?)</span>'
m_type = re.findall(res_type, error[0], re.S | re.M)[0]
m_message = error[1]
m_message = m_message.replace('\n', '')
res_a = r'(<a class=.*?</a>)'
res_shape = r'<a class=.*?>(.*?)</a>'
list_a = re.findall(res_a, m_message, re.S | re.M)
list_shape = re.findall(res_shape, m_message, re.S | re.M)
assert len(list_a) == len(list_shape)
for idx in range(len(list_a)):
m_message = m_message.replace(list_a[idx], list_shape[idx])
m_message = m_message.replace(
'<h6 class=\"deprecated_header\">Deprecated</h6>', '')
res_span = r'(<span class=.*?</span>)'
res_span_detail = r'<span class=.*?>(.*?)</span>'
list_span = re.findall(res_span, m_message, re.S | re.M)
list_span_detail = re.findall(res_span_detail, m_message, re.S |
re.M)
assert len(list_span) == len(list_span_detail)
for idx in range(len(list_span)):
m_message = m_message.replace(list_span[idx],
list_span_detail[idx])
res_p = r'(<p>.*?</p>)'
res_p_detail = r'<p>(.*?)</p>'
list_p = re.findall(res_p, m_message, re.S | re.M)
list_p_detail = re.findall(res_p_detail, m_message, re.S | re.M)
assert len(list_p) == len(list_p_detail)
for idx in range(len(list_p)):
m_message = m_message.replace(list_p[idx], list_p_detail[idx])
m_message = m_message.replace(' ', '')
_Messages = All_Messages.Messages.add()
try:
_Messages.errorCode = int(m_type)
except ValueError:
if re.match('0x', m_type):
_Messages.errorCode = int(m_type, 16)
else:
raise ValueError
            _Messages.errorMessage = m_message # save for cudaErrorMessage.pb from python-protobuf interface


def main(argv):
version = []
url = []
try:
opts, args = getopt.getopt(argv, "hv:u:", ["help", "version=", "url="])
except getopt.GetoptError:
print 'python spider.py -v <version1,version2,...,> -u <url1,url2,...,>'
sys.exit(2)
for opt, arg in opts:
if opt in ("-h", "--help"):
print 'python spider.py -v <version1,version2,...,> -u <url1,url2,...,>'
sys.exit()
elif opt in ("-v", "--version"):
version = arg
elif opt in ("-u", "--url"):
url = arg
version = version.split(',')
url = url.split(',')
assert len(version) == len(url)
cuda_errorDesc = cuda_error_pb2.cudaerrorDesc()
for idx in range(len(version)):
if version[idx] == "-1":
print("crawling errorMessage for CUDA%s from %s" %
("-latest-version", url[idx]))
else:
print("crawling errorMessage for CUDA%s from %s" %
(version[idx], url[idx]))
parsing(cuda_errorDesc, version[idx], url[idx])
serializeToString = cuda_errorDesc.SerializeToString()
with open("cudaErrorMessage.pb", "wb") as f:
f.write(serializeToString
) # save for cudaErrorMessage.pb from python-protobuf interface
print("crawling errorMessage for CUDA has been done!!!")
if __name__ == "__main__":
main(sys.argv[1:])
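spider.py only writes cudaErrorMessage.pb. For completeness, here is a hedged sketch of reading it back on the C++ side, assuming the classes generated from cuda_error.proto mirror the field names the script uses; the generated accessor names and the absence of a proto namespace are assumptions, not verified here.
```
#include <fstream>
#include <iostream>
#include "cuda_error.pb.h"  // generated by protoc from cuda_error.proto

int main() {
  cudaerrorDesc desc;
  std::ifstream in("cudaErrorMessage.pb", std::ios::binary);
  if (!in || !desc.ParseFromIstream(&in)) return 1;
  for (const auto& per_version : desc.allmessages()) {  // protoc lowercases
    for (const auto& msg : per_version.messages()) {    // field accessors
      if (msg.errorcode() == 2) {  // 2 == cudaErrorMemoryAllocation
        std::cout << "CUDA " << per_version.version() << ": "
                  << msg.errormessage() << "\n";
      }
    }
  }
  return 0;
}
```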
#!/usr/bin/env bash
set -ex
SYSTEM=`uname -s`
rm -f protoc-3.11.3-linux-x86_64.*
if [ "$SYSTEM" == "Linux" ]; then
wget --no-check-certificate https://github.com/protocolbuffers/protobuf/releases/download/v3.11.3/protoc-3.11.3-linux-x86_64.zip
unzip -d protobuf -o protoc-3.11.3-linux-x86_64.zip
rm protoc-3.11.3-linux-x86_64.*
elif [ "$SYSTEM" == "Darwin" ]; then
wget --no-check-certificate https://github.com/protocolbuffers/protobuf/releases/download/v3.11.3/protoc-3.11.3-osx-x86_64.zip
unzip -d protobuf -o protoc-3.11.3-osx-x86_64.zip
rm protoc-3.11.3-osx-x86_64.*
else
echo "please run on Mac/Linux"
exit 1
fi
protobuf/bin/protoc -I../../paddle/fluid/platform/ --python_out . ../../paddle/fluid/platform/cuda_error.proto
version=90,100,-1 # -1 represents the latest cuda-version
url=https://docs.nvidia.com/cuda/archive/9.0/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038,https://docs.nvidia.com/cuda/archive/10.0/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038,https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038
if [ "$1" != "" ]; then
version=$version,$(($1*10))
if [ "$2" != "" ]; then
url=$url,$2
else
url=$url,https://docs.nvidia.com/cuda/archive/$1/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038
fi
fi
python spider.py --version=$version --url=$url
tar czf cudaErrorMessage.tar.gz cudaErrorMessage.pb