From 7817003795e679faf0ba0a8d5dd610d498fac1de Mon Sep 17 00:00:00 2001 From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com> Date: Mon, 20 Apr 2020 19:06:50 +0800 Subject: [PATCH] Optimize the error messages of paddle CUDA API (#23816) * Optimize the error messages of paddle CUDA API, test=develop * fix the error messages of paddle CUDA API, test=develop * Refactoring PADDLE_ENFORCE_CUDA_SUCCESS, and apply to curand/cudnn/cublas/NCCL,test=develop * remove build_ex_string,test=develop * merge conflict,test=develop --- cmake/inference_lib.cmake | 15 +- cmake/third_party.cmake | 35 ++- .../framework/details/nan_inf_utils_detail.cu | 4 +- .../tensorrt/plugin/split_op_plugin.cu | 27 +- .../cuda_device_context_allocator.h | 15 +- paddle/fluid/operators/argsort_op.cu | 14 +- .../operators/fused/fused_bn_activation_op.cu | 90 ++---- .../fusion_transpose_flatten_concat_op.cu.cc | 43 +-- .../operators/grid_sampler_cudnn_op.cu.cc | 15 +- paddle/fluid/operators/math/blas_impl.cu.h | 16 +- paddle/fluid/operators/mean_op.cu | 10 +- .../fluid/operators/reader/buffered_reader.cc | 18 +- .../fluid/operators/sync_batch_norm_op.cu.h | 18 +- paddle/fluid/platform/CMakeLists.txt | 4 +- paddle/fluid/platform/cuda_error.proto | 35 +++ paddle/fluid/platform/cuda_helper.h | 9 +- paddle/fluid/platform/cuda_resource_pool.cc | 18 +- paddle/fluid/platform/device_context.h | 13 +- paddle/fluid/platform/enforce.h | 290 +++++++++++++----- paddle/fluid/platform/enforce_test.cc | 32 +- paddle/fluid/platform/gpu_info.cc | 196 ++++-------- paddle/fluid/platform/profiler_helper.h | 5 +- paddle/fluid/platform/stream/cuda_stream.cc | 16 +- paddle/fluid/platform/stream/cuda_stream.h | 12 +- python/setup.py.in | 5 + tools/check_api_approvals.sh | 4 +- tools/count_invalid_enforce.sh | 4 +- tools/cudaError/README.md | 22 ++ tools/cudaError/spider.py | 124 ++++++++ tools/cudaError/start.sh | 32 ++ 30 files changed, 645 insertions(+), 496 deletions(-) create mode 100644 paddle/fluid/platform/cuda_error.proto create mode 100644 tools/cudaError/README.md create mode 100644 tools/cudaError/spider.py create mode 100644 tools/cudaError/start.sh diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 30e96b8e5f..a52d91741a 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -135,6 +135,12 @@ copy(inference_lib_dist SRCS ${THREADPOOL_INCLUDE_DIR}/ThreadPool.h DSTS ${dst_dir}) +set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/cudaerror/data") +copy(inference_lib_dist + SRCS ${cudaerror_INCLUDE_DIR} + DSTS ${dst_dir}) + +# CMakeCache Info copy(inference_lib_dist SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt DSTS ${FLUID_INFERENCE_INSTALL_DIR}) @@ -184,7 +190,7 @@ copy(fluid_lib_dist ) set(module "framework") -set(framework_lib_deps framework_proto) +set(framework_lib_deps framework_proto data_feed_proto trainer_desc_proto) add_dependencies(fluid_lib_dist ${framework_lib_deps}) copy(fluid_lib_dist SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/trainer_desc.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/data_feed.pb.h ${src_dir}/${module}/ir/memory_optimize_pass/*.h @@ -204,11 +210,11 @@ copy(fluid_lib_dist ) set(module "platform") -set(platform_lib_deps profiler_proto) +set(platform_lib_deps profiler_proto error_codes_proto cuda_error_proto) add_dependencies(fluid_lib_dist ${platform_lib_deps}) copy(fluid_lib_dist - SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/platform/profiler.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/platform/error_codes.pb.h - DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module} + SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/platform/*.pb.h + DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details ${dst_dir}/${module} ) set(module "string") @@ -249,6 +255,7 @@ copy(inference_lib_dist SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES} DSTS ${dst_dir} ${dst_dir}/lib) + # CMakeCache Info copy(fluid_lib_dist SRCS ${FLUID_INFERENCE_INSTALL_DIR}/third_party ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 3b3a43a69a..9c8a9e0af1 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +include(ExternalProject) # Creat a target named "third_party", which can compile external dependencies on all platform(windows/linux/mac) set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING @@ -21,6 +22,7 @@ set(THIRD_PARTY_CACHE_PATH "${CMAKE_SOURCE_DIR}" CACHE STRING "A path cache third party source code to avoid repeated download.") set(THIRD_PARTY_BUILD_TYPE Release) +set(third_party_deps) # cache funciton to avoid repeat download code of third_party. # This function has 4 parameters, URL / REPOSITOR / TAG / DIR: @@ -100,6 +102,32 @@ MACRO(UNSET_VAR VAR_NAME) UNSET(${VAR_NAME}) ENDMACRO() +# Funciton to Download the dependencies during compilation +# This function has 2 parameters, URL / DIRNAME: +# 1. URL: The download url of 3rd dependencies +# 2. NAME: The name of file, that determin the dirname +# +MACRO(file_download_and_uncompress URL NAME) + MESSAGE(STATUS "Download dependence[${NAME}] from ${URL}") + SET(EXTERNAL_PROJECT_NAME "extern_download_${NAME}") + SET(${NAME}_INCLUDE_DIR ${THIRD_PARTY_PATH}/${NAME}/data) + ExternalProject_Add( + ${EXTERNAL_PROJECT_NAME} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${THIRD_PARTY_PATH}/${NAME} + URL ${URL} + DOWNLOAD_DIR ${THIRD_PARTY_PATH}/${NAME}/data/ + SOURCE_DIR ${THIRD_PARTY_PATH}/${NAME}/data/ + DOWNLOAD_NO_PROGRESS 1 + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + UPDATE_COMMAND "" + INSTALL_COMMAND "" + ) + list(APPEND third_party_deps ${EXTERNAL_PROJECT_NAME}) +ENDMACRO() + + # Correction of flags on different Platform(WIN/MAC) and Print Warning Message if (APPLE) if(WITH_MKL) @@ -178,10 +206,13 @@ include(external/dlpack) # download dlpack include(external/xxhash) # download, build, install xxhash include(external/warpctc) # download, build, install warpctc -set(third_party_deps) list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog extern_boost extern_xxhash) list(APPEND third_party_deps extern_zlib extern_dlpack extern_warpctc extern_threadpool) +# download file +set(CUDAERROR_URL "https://paddlepaddledeps.bj.bcebos.com/cudaErrorMessage.tar.gz" CACHE STRING "" FORCE) +file_download_and_uncompress(${CUDAERROR_URL} "cudaerror") + if(WITH_AMD_GPU) include(external/rocprim) # download, build, install rocprim list(APPEND third_party_deps extern_rocprim) @@ -274,4 +305,4 @@ if (WITH_LITE) include(external/lite) endif (WITH_LITE) -add_custom_target(third_party DEPENDS ${third_party_deps}) +add_custom_target(third_party ALL DEPENDS ${third_party_deps}) diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cu b/paddle/fluid/framework/details/nan_inf_utils_detail.cu index 44668e491e..f9f91680e3 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cu +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cu @@ -152,9 +152,7 @@ void TensorCheckerVisitor::apply( PADDLE_ENFORCE_CUDA_SUCCESS( cudaMemcpyAsync(gpu_str_ptr, iter->first.c_str(), op_var.length() + 1, - cudaMemcpyHostToDevice, dev_ctx->stream()), - platform::errors::External( - "Async cudaMemcpy op_var info to gpu failed.")); + cudaMemcpyHostToDevice, dev_ctx->stream())); } else { // get auto iter = op_var2gpu_str.find(op_var); PADDLE_ENFORCE_EQ(iter != op_var2gpu_str.end(), true, diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu index 7a032acef6..9eefb925d2 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu @@ -124,12 +124,9 @@ int SplitPlugin::enqueue(int batchSize, const void* const* inputs, float const* input_ptr = reinterpret_cast(inputs[0]); float* const* h_odatas = reinterpret_cast(outputs); float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs_[0]); - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaMemcpyAsync(output_ptrs, h_odatas, - d_output_ptrs_.size() * sizeof(float*), - cudaMemcpyHostToDevice, stream), - platform::errors::External( - "CUDA Memcpy failed during split plugin run.")); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync( + output_ptrs, h_odatas, d_output_ptrs_.size() * sizeof(float*), + cudaMemcpyHostToDevice, stream)); int outer_rows = outer_rows_ * batchSize; @@ -244,12 +241,9 @@ int SplitPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc, float* const* h_odatas = reinterpret_cast(outputs); float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs[0]); - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaMemcpyAsync(output_ptrs, h_odatas, - d_output_ptrs.size() * sizeof(float*), - cudaMemcpyHostToDevice, stream), - platform::errors::External( - "CUDA Memcpy failed during split plugin run.")); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync( + output_ptrs, h_odatas, d_output_ptrs.size() * sizeof(float*), + cudaMemcpyHostToDevice, stream)); split_kernel<<>>( d_segment_offsets.size(), d_segment_offsets_ptr, input_ptr, output_ptrs, @@ -263,12 +257,9 @@ int SplitPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc, half* const* h_odatas = reinterpret_cast(outputs); half** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs[0]); - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaMemcpyAsync(output_ptrs, h_odatas, - d_output_ptrs.size() * sizeof(half*), - cudaMemcpyHostToDevice, stream), - platform::errors::External( - "CUDA Memcpy failed during split plugin run.")); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync( + output_ptrs, h_odatas, d_output_ptrs.size() * sizeof(half*), + cudaMemcpyHostToDevice, stream)); split_kernel<<>>( d_segment_offsets.size(), d_segment_offsets_ptr, input_ptr, output_ptrs, diff --git a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h index 0997f575ac..2163562a60 100644 --- a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h @@ -80,17 +80,13 @@ class CUDADeviceContextAllocator : public Allocator { : place_(place), default_stream_(default_stream) { platform::CUDADeviceGuard guard(place_.device); PADDLE_ENFORCE_CUDA_SUCCESS( - cudaEventCreate(&event_, cudaEventDisableTiming), - platform::errors::External( - "Create event failed in CUDADeviceContextAllocator")); + cudaEventCreate(&event_, cudaEventDisableTiming)); } ~CUDADeviceContextAllocator() { if (event_) { platform::CUDADeviceGuard guard(place_.device); - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaEventDestroy(event_), - "Destory event failed in CUDADeviceContextAllocator destroctor"); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event_)); } } @@ -103,12 +99,9 @@ class CUDADeviceContextAllocator : public Allocator { auto allocation = new CUDADeviceContextAllocation(memory::Alloc(place_, size)); // Wait for the event on stream + PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, default_stream_)); PADDLE_ENFORCE_CUDA_SUCCESS( - cudaEventRecord(event_, default_stream_), - "Failed to record event in CUDADeviceContextAllocator"); - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaStreamWaitEvent(default_stream_, event_, 0), - "Failed to wait event in CUDADeviceContextAllocator"); + cudaStreamWaitEvent(default_stream_, event_, 0)); return allocation; } diff --git a/paddle/fluid/operators/argsort_op.cu b/paddle/fluid/operators/argsort_op.cu index 006bf55919..cbd7e33bc6 100644 --- a/paddle/fluid/operators/argsort_op.cu +++ b/paddle/fluid/operators/argsort_op.cu @@ -141,12 +141,7 @@ void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input, num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, cu_stream); } - PADDLE_ENFORCE_CUDA_SUCCESS( - err, - "ArgSortOP failed as could not launch " - "cub::DeviceSegmentedRadixSort::SortPairsDescending to calculate" - "temp_storage_bytes, status:%s.", - temp_storage_bytes, cudaGetErrorString(err)); + PADDLE_ENFORCE_CUDA_SUCCESS(err); Tensor temp_storage; temp_storage.mutable_data(ctx.GetPlace(), temp_storage_bytes); @@ -165,12 +160,7 @@ void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input, cu_stream); } - PADDLE_ENFORCE_CUDA_SUCCESS( - err, - "ArgSortOP failed as could not launch " - "cub::DeviceSegmentedRadixSort::SortPairsDescending to sort input, " - "temp_storage_bytes:%d status:%s.", - temp_storage_bytes, cudaGetErrorString(err)); + PADDLE_ENFORCE_CUDA_SUCCESS(err); } template diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_activation_op.cu index 2e30865793..32eaf11809 100644 --- a/paddle/fluid/operators/fused/fused_bn_activation_op.cu +++ b/paddle/fluid/operators/fused/fused_bn_activation_op.cu @@ -108,32 +108,21 @@ class FusedBatchNormActKernel cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&data_desc_), - platform::errors::External( - "The error has happened when calling " - "cudnnCreateTensorDescriptor(&data_desc_).")); + platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_), - platform::errors::External( - "The error has happened when calling " - "cudnnCreateTensorDescriptor(&bn_param_desc_).")); + platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); VLOG(3) << "Setting descriptors."; std::vector dims = {N, C, H, W, D}; std::vector strides = {H * W * D * C, 1, W * D * C, D * C, C}; - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnSetTensorNdDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()), - platform::errors::External( - "The error has happened when calling cudnnSetTensorNdDescriptor.")); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + data_desc_, CudnnDataType::type, + x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_, - data_desc_, mode_), - platform::errors::External("The error has happened when calling " - "cudnnDeriveBNTensorDescriptor.")); + data_desc_, mode_)); double this_factor = 1. - momentum; cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ACTIVATION; @@ -166,10 +155,7 @@ class FusedBatchNormActKernel /*yDesc=*/data_desc_, /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, /*activationDesc=*/activation_desc_, - /*sizeInBytes=*/&workspace_size), - platform::errors::External( - "The error has happened when calling " - "cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize.")); + /*sizeInBytes=*/&workspace_size)); // -------------- cudnn batchnorm reserve space -------------- PADDLE_ENFORCE_CUDA_SUCCESS( @@ -179,10 +165,7 @@ class FusedBatchNormActKernel /*bnOps=*/bnOps_, /*activationDesc=*/activation_desc_, /*xDesc=*/data_desc_, - /*sizeInBytes=*/&reserve_space_size), - platform::errors::External( - "The error has happened when calling " - "cudnnGetBatchNormalizationTrainingExReserveSpaceSize.")); + /*sizeInBytes=*/&reserve_space_size)); reserve_space_ptr = reserve_space->mutable_data(ctx.GetPlace(), x->type(), reserve_space_size); @@ -204,22 +187,13 @@ class FusedBatchNormActKernel saved_variance->template mutable_data>( ctx.GetPlace()), activation_desc_, workspace_ptr, workspace_size, reserve_space_ptr, - reserve_space_size), - platform::errors::External( - "The error has happened when calling " - "cudnnBatchNormalizationForwardTrainingEx.")); + reserve_space_size)); // clean when exit. PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(data_desc_), - platform::errors::External( - "The error has happened when calling " - "cudnnDestroyTensorDescriptor(data_desc_).")); + platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_), - platform::errors::External( - "The error has happened when calling " - "cudnnDestroyTensorDescriptor(bn_param_desc_).")); + platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); } }; @@ -298,15 +272,9 @@ class FusedBatchNormActGradKernel cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&data_desc_), - platform::errors::External( - "The error has happened when calling " - "cudnnCreateTensorDescriptor(&data_desc_).")); + platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_), - platform::errors::External( - "The error has happened when calling " - "cudnnCreateTensorDescriptor(&bn_param_desc_).")); + platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { LOG(ERROR) << "Provided epsilon is smaller than " << "CUDNN_BN_MIN_EPSILON. Setting it to " @@ -314,17 +282,12 @@ class FusedBatchNormActGradKernel } epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnSetTensorNdDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()), - platform::errors::External( - "The error has happened when calling cudnnSetTensorNdDescriptor.")); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + data_desc_, CudnnDataType::type, + x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_, - data_desc_, mode_), - platform::errors::External("The error has happened when calling " - "cudnnDeriveBNTensorDescriptor.")); + data_desc_, mode_)); const auto *saved_mean = ctx.Input("SavedMean"); const auto *saved_var = ctx.Input("SavedVariance"); @@ -354,10 +317,7 @@ class FusedBatchNormActGradKernel /*dxDesc=*/data_desc_, /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, /*activationDesc=*/activation_desc_, - /*sizeInBytes=*/&workspace_size), - platform::errors::External( - "The error has happened when calling " - "cudnnGetBatchNormalizationBackwardExWorkspaceSize.")); + /*sizeInBytes=*/&workspace_size)); workspace_ptr = workspace_tensor.mutable_data(ctx.GetPlace(), x->type(), workspace_size); @@ -395,21 +355,13 @@ class FusedBatchNormActGradKernel /*workspace=*/workspace_ptr, /*workSpaceSizeInBytes=*/workspace_size, /*reserveSpace=*/const_cast(reserve_space->template data()), - /*reserveSpaceSizeInBytes=*/reserve_space_size), - platform::errors::External("The error has happened when calling " - "cudnnBatchNormalizationBackwardEx.")); + /*reserveSpaceSizeInBytes=*/reserve_space_size)); // clean when exit. PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(data_desc_), - platform::errors::External( - "The error has happened when calling " - "cudnnDestroyTensorDescriptor(data_desc_).")); + platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_), - platform::errors::External( - "The error has happened when calling " - "cudnnDestroyTensorDescriptor(bn_param_desc_).")); + platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); } }; diff --git a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc index b61ef8e566..17cb4556d4 100644 --- a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc +++ b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc @@ -46,13 +46,9 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel { cudnnTensorDescriptor_t in_desc; cudnnTensorDescriptor_t out_desc; PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&in_desc), - platform::errors::External("Create cudnn tensor descriptor failed in " - "transpose_flatten_concat_fusion op.")); + platform::dynload::cudnnCreateTensorDescriptor(&in_desc)); PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&out_desc), - platform::errors::External("Create cudnn tensor descriptor failed in " - "transpose_flatten_concat_fusion op.")); + platform::dynload::cudnnCreateTensorDescriptor(&out_desc)); cudnnDataType_t cudnn_dtype = CudnnDataType::type; auto& dev_ctx = ctx.template device_context(); @@ -91,24 +87,15 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel { dims_y[i] = 1; } - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnSetTensorNdDescriptor( - in_desc, cudnn_dtype, max_dim, dims_y.data(), stride_x.data()), - platform::errors::External("Create cudnn tensorNd descriptor failed " - "in transpose_flatten_concat op.")); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnSetTensorNdDescriptor( - out_desc, cudnn_dtype, max_dim, dims_y.data(), stride_y.data()), - platform::errors::External("Create cudnn tensorNd descriptor failed " - "in transpose_flatten_concat op.")); - - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnTransformTensor( - handle, CudnnDataType::kOne(), in_desc, - static_cast(ins[k]->data()), - CudnnDataType::kZero(), out_desc, static_cast(odata)), - platform::errors::External("Create cudnn transform tensor failed in " - "transpose_flatten_concat op.")); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + in_desc, cudnn_dtype, max_dim, dims_y.data(), stride_x.data())); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + out_desc, cudnn_dtype, max_dim, dims_y.data(), stride_y.data())); + + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnTransformTensor( + handle, CudnnDataType::kOne(), in_desc, + static_cast(ins[k]->data()), + CudnnDataType::kZero(), out_desc, static_cast(odata))); if (concat_axis == 0) { odata += osize; } else { @@ -117,13 +104,9 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel { } } PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(in_desc), - platform::errors::External( - "Destory cudnn descriptor failed in transpose_flatten_concat op.")); + platform::dynload::cudnnDestroyTensorDescriptor(in_desc)); PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(out_desc), - platform::errors::External( - "Destory cudnn descriptor failed in transpose_flatten_concat op.")); + platform::dynload::cudnnDestroyTensorDescriptor(out_desc)); } }; diff --git a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc index c266b0d32b..3bf34fc685 100644 --- a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc +++ b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc @@ -60,13 +60,10 @@ class CUDNNGridSampleOpKernel : public framework::OpKernel { cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( DataLayout::kNCHW, framework::vectorize(output->dims())); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnSpatialTfSamplerForward( - handle, cudnn_st_desc, CudnnDataType::kOne(), cudnn_input_desc, - input_data, grid_data, CudnnDataType::kZero(), cudnn_output_desc, - output_data), - platform::errors::InvalidArgument( - "cudnnSpatialTfSamplerForward in Op(grid_sampler) failed")); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSpatialTfSamplerForward( + handle, cudnn_st_desc, CudnnDataType::kOne(), cudnn_input_desc, + input_data, grid_data, CudnnDataType::kZero(), cudnn_output_desc, + output_data)); } }; @@ -122,9 +119,7 @@ class CUDNNGridSampleGradOpKernel : public framework::OpKernel { input_data, CudnnDataType::kZero(), cudnn_input_grad_desc, input_grad_data, CudnnDataType::kOne(), cudnn_output_grad_desc, output_grad_data, grid_data, CudnnDataType::kZero(), - grid_grad_data), - platform::errors::InvalidArgument( - "cudnnSpatialTfSamplerBackward in Op(grid_sampler) failed")); + grid_grad_data)); } }; diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index c0ab35b0e7..8e903a4ecc 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -41,16 +41,12 @@ struct CUBlas { template static void SCAL(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cublasSscal(args...), - platform::errors::External("dynload cublasSscal lib failed")); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasSscal(args...)); } template static void VCOPY(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cublasScopy(args...), - platform::errors::External("dynload cublasScopy lib failed")); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasScopy(args...)); } template @@ -108,16 +104,12 @@ struct CUBlas { template static void SCAL(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cublasDscal(args...), - platform::errors::External("dynload cublasDscal lib failed")); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDscal(args...)); } template static void VCOPY(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cublasDcopy(args...), - platform::errors::External("dynload cublasDcopy lib failed")); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDcopy(args...)); } template diff --git a/paddle/fluid/operators/mean_op.cu b/paddle/fluid/operators/mean_op.cu index d2b01fafb7..081c077ab7 100644 --- a/paddle/fluid/operators/mean_op.cu +++ b/paddle/fluid/operators/mean_op.cu @@ -59,20 +59,14 @@ class MeanCUDAKernel : public framework::OpKernel { auto err = cub::DeviceReduce::Sum(nullptr, temp_storage_bytes, trans_x, out_data, size_prob, stream); - PADDLE_ENFORCE_CUDA_SUCCESS( - err, platform::errors::External( - "MeanOP failed to get reduce workspace size %s.", - cudaGetErrorString(err))); + PADDLE_ENFORCE_CUDA_SUCCESS(err); framework::Tensor tmp; auto* temp_storage = tmp.mutable_data( framework::make_ddim({static_cast(temp_storage_bytes)}), context.GetPlace()); err = cub::DeviceReduce::Sum(temp_storage, temp_storage_bytes, trans_x, out_data, size_prob, stream); - PADDLE_ENFORCE_CUDA_SUCCESS( - err, platform::errors::External( - "MeanOP failed to run CUDA reduce computation: %s.", - cudaGetErrorString(err))); + PADDLE_ENFORCE_CUDA_SUCCESS(err); } }; diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index b237df130a..e72820611d 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -104,13 +104,9 @@ void BufferedReader::ReadAsync(size_t i) { // gpu memory immediately without waiting gpu kernel ends platform::SetDeviceId(boost::get(place_).device); PADDLE_ENFORCE_CUDA_SUCCESS( - cudaEventRecord(events_[i].get(), compute_stream_), - platform::errors::Fatal( - "cudaEventRecord raises unexpected exception")); + cudaEventRecord(events_[i].get(), compute_stream_)); PADDLE_ENFORCE_CUDA_SUCCESS( - cudaStreamWaitEvent(stream_.get(), events_[i].get(), 0), - platform::errors::Fatal( - "cudaStreamWaitEvent raises unexpected exception")); + cudaStreamWaitEvent(stream_.get(), events_[i].get(), 0)); platform::RecordEvent record_event("BufferedReader:MemoryCopy"); for (size_t i = 0; i < cpu.size(); ++i) { @@ -138,17 +134,11 @@ void BufferedReader::ReadAsync(size_t i) { size); memory::Copy(boost::get(place_), gpu_ptr, cuda_pinned_place, cuda_pinned_ptr, size, stream_.get()); - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaStreamSynchronize(stream_.get()), - platform::errors::Fatal( - "cudaStreamSynchronize raises unexpected exception")); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_.get())); } gpu[i].set_lod(cpu[i].lod()); } - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaStreamSynchronize(stream_.get()), - platform::errors::Fatal( - "cudaStreamSynchronize raises unexpected exception")); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_.get())); } #endif return i; diff --git a/paddle/fluid/operators/sync_batch_norm_op.cu.h b/paddle/fluid/operators/sync_batch_norm_op.cu.h index 083d22aa2a..cfb9e16942 100644 --- a/paddle/fluid/operators/sync_batch_norm_op.cu.h +++ b/paddle/fluid/operators/sync_batch_norm_op.cu.h @@ -191,12 +191,9 @@ void SyncBatchNormFunctor(const framework::ExecutionContext &ctx, if (comm) { int dtype = platform::ToNCCLDataType(mean_out->type()); // In-place operation - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::ncclAllReduce(stats, stats, 2 * C + 1, - static_cast(dtype), - ncclSum, comm, stream), - platform::errors::InvalidArgument( - "ncclAllReduce in Op(sync_batch_norm) failed")); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + stats, stats, 2 * C + 1, static_cast(dtype), ncclSum, + comm, stream)); } #endif @@ -468,12 +465,9 @@ void SyncBatchNormGradFunctor( if (comm) { int dtype = platform::ToNCCLDataType(scale->type()); // In-place operation - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::ncclAllReduce(stats, stats, 2 * C + 1, - static_cast(dtype), - ncclSum, comm, stream), - platform::errors::InvalidArgument( - "ncclAllReduce in Op(sync_batch_norm) failed")); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + stats, stats, 2 * C + 1, static_cast(dtype), ncclSum, + comm, stream)); } #endif diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index ddf3035a92..d0d74f6ea8 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -1,6 +1,6 @@ proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto simple_threadpool) proto_library(error_codes_proto SRCS error_codes.proto) - +proto_library(cuda_error_proto SRCS cuda_error.proto) if (WITH_PYTHON) py_proto_compile(profiler_py_proto SRCS profiler.proto) @@ -28,7 +28,7 @@ cc_library(flags SRCS flags.cc DEPS gflags) cc_library(errors SRCS errors.cc DEPS error_codes_proto) cc_test(errors_test SRCS errors_test.cc DEPS errors enforce) -cc_library(enforce INTERFACE SRCS enforce.cc DEPS flags errors) +cc_library(enforce INTERFACE SRCS enforce.cc DEPS flags errors cuda_error_proto) cc_test(enforce_test SRCS enforce_test.cc DEPS stringpiece enforce) set(CPU_INFO_DEPS gflags glog enforce) diff --git a/paddle/fluid/platform/cuda_error.proto b/paddle/fluid/platform/cuda_error.proto new file mode 100644 index 0000000000..b55e0af81e --- /dev/null +++ b/paddle/fluid/platform/cuda_error.proto @@ -0,0 +1,35 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +syntax = "proto2"; +package paddle.platform.proto; + +message MessageDesc { + // Indicates the type of error + required int32 errorCode = 1; + // Indicates the message of error + required string errorMessage = 2; +} + +message AllMessageDesc { + // Version of cuda API + required int32 version = 1; + // Error messages of different errortype + repeated MessageDesc Messages = 2; +} + +message cudaerrorDesc { + // Error messages of different cuda versions(9.0/10.0/10.2) + repeated AllMessageDesc AllMessages = 2; +} \ No newline at end of file diff --git a/paddle/fluid/platform/cuda_helper.h b/paddle/fluid/platform/cuda_helper.h index 54f5e911e3..74cf554523 100644 --- a/paddle/fluid/platform/cuda_helper.h +++ b/paddle/fluid/platform/cuda_helper.h @@ -29,14 +29,7 @@ namespace platform { class CublasHandleHolder { public: CublasHandleHolder(cudaStream_t stream, cublasMath_t math_type) { - PADDLE_ENFORCE_CUDA_SUCCESS( - dynload::cublasCreate(&handle_), - platform::errors::External( - "The cuBLAS library was not initialized. This is usually caused by " - "an error in the CUDA Runtime API called by the cuBLAS routine, or " - "an error in the hardware setup.\n" - "To correct: check that the hardware, an appropriate version of " - "the driver, and the cuBLAS library are correctly installed.")); + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cublasCreate(&handle_)); PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cublasSetStream(handle_, stream)); #if CUDA_VERSION >= 9000 if (math_type == CUBLAS_TENSOR_OP_MATH) { diff --git a/paddle/fluid/platform/cuda_resource_pool.cc b/paddle/fluid/platform/cuda_resource_pool.cc index 1828f0760a..65c8b96028 100644 --- a/paddle/fluid/platform/cuda_resource_pool.cc +++ b/paddle/fluid/platform/cuda_resource_pool.cc @@ -27,18 +27,13 @@ CudaStreamResourcePool::CudaStreamResourcePool() { platform::SetDeviceId(dev_idx); cudaStream_t stream; PADDLE_ENFORCE_CUDA_SUCCESS( - cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), - platform::errors::Fatal( - "cudaStreamCreateWithFlags raises unexpected exception")); + cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); return stream; }; auto deleter = [dev_idx](cudaStream_t stream) { platform::SetDeviceId(dev_idx); - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaStreamDestroy(stream), - platform::errors::Fatal( - "cudaStreamDestroy raises unexpected exception")); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream)); }; pool_.emplace_back( @@ -72,18 +67,13 @@ CudaEventResourcePool::CudaEventResourcePool() { platform::SetDeviceId(dev_idx); cudaEvent_t event; PADDLE_ENFORCE_CUDA_SUCCESS( - cudaEventCreateWithFlags(&event, cudaEventDisableTiming), - platform::errors::Fatal( - "cudaEventCreateWithFlags raises unexpected exception")); + cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); return event; }; auto deleter = [dev_idx](cudaEvent_t event) { platform::SetDeviceId(dev_idx); - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaEventDestroy(event), - platform::errors::Fatal( - "cudaEventDestroy raises unexpected exception")); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event)); }; pool_.emplace_back(ResourcePool::Create(creator, deleter)); diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index e32c8d4ea6..529992b47e 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -162,14 +162,9 @@ class CUDAContext { << "Please recompile or reinstall Paddle with compatible CUDNN " "version."; } + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreate(&cudnn_handle_)); PADDLE_ENFORCE_CUDA_SUCCESS( - dynload::cudnnCreate(&cudnn_handle_), - platform::errors::Fatal( - "Failed to create Cudnn handle in DeviceContext")); - PADDLE_ENFORCE_CUDA_SUCCESS( - dynload::cudnnSetStream(cudnn_handle_, RawStream()), - platform::errors::Fatal( - "Failed to set stream for Cudnn handle in DeviceContext")); + dynload::cudnnSetStream(cudnn_handle_, RawStream())); } else { cudnn_handle_ = nullptr; } @@ -177,9 +172,7 @@ class CUDAContext { void DestoryCuDNNContext() { if (cudnn_handle_) { - PADDLE_ENFORCE_CUDA_SUCCESS( - dynload::cudnnDestroy(cudnn_handle_), - platform::errors::Fatal("Failed to destory Cudnn handle")); + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroy(cudnn_handle_)); } cudnn_handle_ = nullptr; } diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 99f83d9732..f2e0c52170 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -18,6 +18,13 @@ limitations under the License. */ #include // for __cxa_demangle #endif // __GNUC__ +#if !defined(_WIN32) +#include // dladdr +#else // _WIN32 +#define NOMINMAX // msvc max/min macro conflict with std::min/max +#include // GetModuleFileName +#endif + #ifdef PADDLE_WITH_CUDA #include #include @@ -38,6 +45,7 @@ limitations under the License. */ #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include "glog/logging.h" +#include "paddle/fluid/platform/cuda_error.pb.h" #include "paddle/fluid/platform/errors.h" #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/port.h" @@ -220,10 +228,6 @@ inline std::string GetTraceBackString(StrType&& what, const char* file, inline bool is_error(bool stat) { return !stat; } -inline std::string build_ex_string(bool stat, const std::string& msg) { - return msg; -} - inline void throw_on_error(bool stat, const std::string& msg) { #ifndef REPLACE_ENFORCE_GLOG throw std::runtime_error(msg); @@ -284,23 +288,21 @@ struct EnforceNotMet : public std::exception { } \ } while (0) #else -#define PADDLE_ENFORCE(COND, ...) \ - do { \ - auto __cond__ = (COND); \ - if (UNLIKELY(::paddle::platform::is_error(__cond__))) { \ - try { \ - ::paddle::platform::throw_on_error( \ - __cond__, \ - ::paddle::platform::build_ex_string( \ - __cond__, \ - ::paddle::platform::ErrorSummary(__VA_ARGS__).ToString())); \ - } catch (...) { \ - HANDLE_THE_ERROR \ - throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ - __FILE__, __LINE__); \ - END_HANDLE_THE_ERROR \ - } \ - } \ +#define PADDLE_ENFORCE(COND, ...) \ + do { \ + auto __cond__ = (COND); \ + if (UNLIKELY(::paddle::platform::is_error(__cond__))) { \ + try { \ + ::paddle::platform::throw_on_error( \ + __cond__, \ + ::paddle::platform::ErrorSummary(__VA_ARGS__).ToString()); \ + } catch (...) { \ + HANDLE_THE_ERROR \ + throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ + __FILE__, __LINE__); \ + END_HANDLE_THE_ERROR \ + } \ + } \ } while (0) #endif @@ -464,30 +466,148 @@ struct EOFException : public std::exception { } while (0) /** CUDA PADDLE ENFORCE FUNCTIONS AND MACROS **/ - #ifdef PADDLE_WITH_CUDA +/***** CUDA ERROR *****/ inline bool is_error(cudaError_t e) { return e != cudaSuccess; } -inline std::string build_ex_string(cudaError_t e, const std::string& msg) { - return msg; +inline std::string GetCudaErrorWebsite(int32_t cuda_version) { + std::ostringstream webstr; + webstr << "https://docs.nvidia.com/cuda/"; + if (cuda_version != -1) { + double version = cuda_version / 10; + webstr << "archive/" << std::fixed << std::setprecision(1) << version; + } + webstr << "/cuda-runtime-api/group__CUDART__TYPES.html" + "#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038"; + return webstr.str(); +} + +inline std::string build_nvidia_error_msg(cudaError_t e) { +#if CUDA_VERSION >= 10000 && CUDA_VERSION < 11000 + int32_t cuda_version = 100; +#elif CUDA_VERSION >= 9000 + int32_t cuda_version = 90; +#else + int32_t cuda_version = -1; +#endif + std::ostringstream sout; + sout << " Cuda error(" << e << "), " << cudaGetErrorString(e) << "."; + static platform::proto::cudaerrorDesc cudaerror; + static bool _initSucceed = false; + if (cudaerror.ByteSizeLong() == 0) { + std::string filePath; +#if !defined(_WIN32) + Dl_info info; + if (dladdr(reinterpret_cast(GetCudaErrorWebsite), &info)) { + std::string strModule(info.dli_fname); + const size_t last_slash_idx = strModule.find_last_of("/"); + std::string compare_path = strModule.substr(strModule.length() - 6); + if (std::string::npos != last_slash_idx) { + strModule.erase(last_slash_idx, std::string::npos); + } + if (compare_path.compare("avx.so") == 0) { + filePath = strModule + + "/../include/third_party/cudaerror/data/cudaErrorMessage.pb"; + } else { + filePath = + strModule + "/../../thirl_party/cudaerror/data/cudaErrorMessage.pb"; + } + } +#else + char buf[100]; + MEMORY_BASIC_INFORMATION mbi; + HMODULE h_module = + (::VirtualQuery(GetCudaErrorWebsite, &mbi, sizeof(mbi)) != 0) + ? (HMODULE)mbi.AllocationBase + : NULL; + GetModuleFileName(h_module, buf, 100); + std::string strModule(buf); + const size_t last_slash_idx = strModule.find_last_of("\\"); + std::string compare_path = strModule.substr(strModule.length() - 7); + if (std::string::npos != last_slash_idx) { + strModule.erase(last_slash_idx, std::string::npos); + } + if (compare_path.compare("avx.pyd") == 0) { + filePath = + strModule + + "\\..\\include\\third_party\\cudaerror\\data\\cudaErrorMessage.pb"; + } else { + filePath = + strModule + "\\..\\third_party\\cudaerror\\data\\cudaErrorMessage.pb"; + } +#endif + std::ifstream fin(filePath, std::ios::in | std::ios::binary); + _initSucceed = cudaerror.ParseFromIstream(&fin); + } + if (_initSucceed) { + for (int i = 0; i < cudaerror.allmessages_size(); ++i) { + if (cuda_version == cudaerror.allmessages(i).version()) { + for (int j = 0; j < cudaerror.allmessages(i).messages_size(); ++j) { + if (e == cudaerror.allmessages(i).messages(j).errorcode()) { + sout << "\n [Advise: " + << cudaerror.allmessages(i).messages(j).errormessage() << "]"; + return sout.str(); + } + } + } + } + } + sout << "\n [Advise: Please search for the error code(" << e + << ") on website( " << GetCudaErrorWebsite(cuda_version) + << " ) to get Nvidia's official solution about CUDA Error.]"; + return sout.str(); } inline void throw_on_error(cudaError_t e, const std::string& msg) { #ifndef REPLACE_ENFORCE_GLOG - throw thrust::system_error(e, thrust::cuda_category(), msg); + throw std::runtime_error(msg); #else LOG(FATAL) << msg; #endif } +/** curand ERROR **/ inline bool is_error(curandStatus_t stat) { return stat != CURAND_STATUS_SUCCESS; } -inline std::string build_ex_string(curandStatus_t stat, - const std::string& msg) { - return msg; +inline const char* curandGetErrorString(curandStatus_t stat) { + switch (stat) { + case CURAND_STATUS_SUCCESS: + return "CURAND_STATUS_SUCCESS"; + case CURAND_STATUS_VERSION_MISMATCH: + return "CURAND_STATUS_VERSION_MISMATCH"; + case CURAND_STATUS_NOT_INITIALIZED: + return "CURAND_STATUS_NOT_INITIALIZED"; + case CURAND_STATUS_ALLOCATION_FAILED: + return "CURAND_STATUS_ALLOCATION_FAILED"; + case CURAND_STATUS_TYPE_ERROR: + return "CURAND_STATUS_TYPE_ERROR"; + case CURAND_STATUS_OUT_OF_RANGE: + return "CURAND_STATUS_OUT_OF_RANGE"; + case CURAND_STATUS_LENGTH_NOT_MULTIPLE: + return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; + case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: + return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; + case CURAND_STATUS_LAUNCH_FAILURE: + return "CURAND_STATUS_LAUNCH_FAILURE"; + case CURAND_STATUS_PREEXISTING_FAILURE: + return "CURAND_STATUS_PREEXISTING_FAILURE"; + case CURAND_STATUS_INITIALIZATION_FAILED: + return "CURAND_STATUS_INITIALIZATION_FAILED"; + case CURAND_STATUS_ARCH_MISMATCH: + return "CURAND_STATUS_ARCH_MISMATCH"; + case CURAND_STATUS_INTERNAL_ERROR: + return "CURAND_STATUS_INTERNAL_ERROR"; + default: + return "Unknown curand status"; + } +} + +inline std::string build_nvidia_error_msg(curandStatus_t stat) { + std::string msg(" Curand error, "); + return msg + curandGetErrorString(stat) + " "; } inline void throw_on_error(curandStatus_t stat, const std::string& msg) { @@ -499,13 +619,14 @@ inline void throw_on_error(curandStatus_t stat, const std::string& msg) { #endif } +/***** CUDNN ERROR *****/ inline bool is_error(cudnnStatus_t stat) { return stat != CUDNN_STATUS_SUCCESS; } -inline std::string build_ex_string(cudnnStatus_t stat, const std::string& msg) { - return msg + "\n [Hint: " + platform::dynload::cudnnGetErrorString(stat) + - "]"; +inline std::string build_nvidia_error_msg(cudnnStatus_t stat) { + std::string msg(" Cudnn error, "); + return msg + platform::dynload::cudnnGetErrorString(stat) + " "; } inline void throw_on_error(cudnnStatus_t stat, const std::string& msg) { @@ -516,33 +637,39 @@ inline void throw_on_error(cudnnStatus_t stat, const std::string& msg) { #endif } +/***** CUBLAS ERROR *****/ inline bool is_error(cublasStatus_t stat) { return stat != CUBLAS_STATUS_SUCCESS; } -inline std::string build_ex_string(cublasStatus_t stat, - const std::string& msg) { - std::string err; - if (stat == CUBLAS_STATUS_NOT_INITIALIZED) { - err = "CUBLAS_STATUS_NOT_INITIALIZED"; - } else if (stat == CUBLAS_STATUS_ALLOC_FAILED) { - err = "CUBLAS_STATUS_ALLOC_FAILED"; - } else if (stat == CUBLAS_STATUS_INVALID_VALUE) { - err = "CUBLAS_STATUS_INVALID_VALUE"; - } else if (stat == CUBLAS_STATUS_ARCH_MISMATCH) { - err = "CUBLAS_STATUS_ARCH_MISMATCH"; - } else if (stat == CUBLAS_STATUS_MAPPING_ERROR) { - err = "CUBLAS_STATUS_MAPPING_ERROR"; - } else if (stat == CUBLAS_STATUS_EXECUTION_FAILED) { - err = "CUBLAS_STATUS_EXECUTION_FAILED"; - } else if (stat == CUBLAS_STATUS_INTERNAL_ERROR) { - err = "CUBLAS_STATUS_INTERNAL_ERROR"; - } else if (stat == CUBLAS_STATUS_NOT_SUPPORTED) { - err = "CUBLAS_STATUS_NOT_SUPPORTED"; - } else if (stat == CUBLAS_STATUS_LICENSE_ERROR) { - err = "CUBLAS_STATUS_LICENSE_ERROR"; +inline const char* cublasGetErrorString(cublasStatus_t stat) { + switch (stat) { + case CUBLAS_STATUS_NOT_INITIALIZED: + return "CUBLAS_STATUS_NOT_INITIALIZED"; + case CUBLAS_STATUS_ALLOC_FAILED: + return "CUBLAS_STATUS_ALLOC_FAILED"; + case CUBLAS_STATUS_INVALID_VALUE: + return "CUBLAS_STATUS_INVALID_VALUE"; + case CUBLAS_STATUS_ARCH_MISMATCH: + return "CUBLAS_STATUS_ARCH_MISMATCH"; + case CUBLAS_STATUS_MAPPING_ERROR: + return "CUBLAS_STATUS_MAPPING_ERROR"; + case CUBLAS_STATUS_EXECUTION_FAILED: + return "CUBLAS_STATUS_EXECUTION_FAILED"; + case CUBLAS_STATUS_INTERNAL_ERROR: + return "CUBLAS_STATUS_INTERNAL_ERROR"; + case CUBLAS_STATUS_NOT_SUPPORTED: + return "CUBLAS_STATUS_NOT_SUPPORTED"; + case CUBLAS_STATUS_LICENSE_ERROR: + return "CUBLAS_STATUS_LICENSE_ERROR"; + default: + return "Unknown cublas status"; } - return msg + "\n [Hint: " + err + "]"; +} + +inline std::string build_nvidia_error_msg(cublasStatus_t stat) { + std::string msg(" Cublas error, "); + return msg + cublasGetErrorString(stat) + " "; } inline void throw_on_error(cublasStatus_t stat, const std::string& msg) { @@ -553,15 +680,15 @@ inline void throw_on_error(cublasStatus_t stat, const std::string& msg) { #endif } +/****** NCCL ERROR ******/ #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) inline bool is_error(ncclResult_t nccl_result) { return nccl_result != ncclSuccess; } -inline std::string build_ex_string(ncclResult_t nccl_result, - const std::string& msg) { - return msg + "\n [" + platform::dynload::ncclGetErrorString(nccl_result) + - "]"; +inline std::string build_nvidia_error_msg(ncclResult_t nccl_result) { + std::string msg(" Nccl error, "); + return msg + platform::dynload::ncclGetErrorString(nccl_result) + " "; } inline void throw_on_error(ncclResult_t nccl_result, const std::string& msg) { @@ -571,11 +698,8 @@ inline void throw_on_error(ncclResult_t nccl_result, const std::string& msg) { LOG(FATAL) << msg; #endif } -#endif // __APPLE__ and windows - -#endif // PADDLE_WITH_CUDA +#endif // not(__APPLE__) and PADDLE_WITH_NCCL -#ifdef PADDLE_WITH_CUDA namespace details { template @@ -598,30 +722,28 @@ DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess); #endif } // namespace details -#endif // PADDLE_WITH_CUDA -#ifdef PADDLE_WITH_CUDA -#define PADDLE_ENFORCE_CUDA_SUCCESS(COND, ...) \ - do { \ - auto __cond__ = (COND); \ - using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ - constexpr auto __success_type__ = \ - ::paddle::platform::details::CudaStatusType< \ - __CUDA_STATUS_TYPE__>::kSuccess; \ - if (UNLIKELY(__cond__ != __success_type__)) { \ - try { \ - ::paddle::platform::throw_on_error( \ - __cond__, \ - ::paddle::platform::build_ex_string( \ - __cond__, \ - ::paddle::platform::ErrorSummary(__VA_ARGS__).ToString())); \ - } catch (...) { \ - HANDLE_THE_ERROR \ - throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ - __FILE__, __LINE__); \ - END_HANDLE_THE_ERROR \ - } \ - } \ +#define PADDLE_ENFORCE_CUDA_SUCCESS(COND) \ + do { \ + auto __cond__ = (COND); \ + using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ + constexpr auto __success_type__ = \ + ::paddle::platform::details::CudaStatusType< \ + __CUDA_STATUS_TYPE__>::kSuccess; \ + if (UNLIKELY(__cond__ != __success_type__)) { \ + try { \ + ::paddle::platform::throw_on_error( \ + __cond__, \ + ::paddle::platform::errors::External( \ + ::paddle::platform::build_nvidia_error_msg(__cond__)) \ + .ToString()); \ + } catch (...) { \ + HANDLE_THE_ERROR \ + throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ + __FILE__, __LINE__); \ + END_HANDLE_THE_ERROR \ + } \ + } \ } while (0) #undef DEFINE_CUDA_STATUS_TYPE diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc index 0057c78452..db77ba9585 100644 --- a/paddle/fluid/platform/enforce_test.cc +++ b/paddle/fluid/platform/enforce_test.cc @@ -261,15 +261,14 @@ TEST(EOF_EXCEPTION, THROW_EOF) { #ifdef PADDLE_WITH_CUDA template bool CheckCudaStatusSuccess(T value, const std::string& msg = "success") { - PADDLE_ENFORCE_CUDA_SUCCESS(value, msg); + PADDLE_ENFORCE_CUDA_SUCCESS(value); return true; } template -bool CheckCudaStatusFailure( - T value, const std::string& msg = "self-defined cuda status failed") { +bool CheckCudaStatusFailure(T value, const std::string& msg) { try { - PADDLE_ENFORCE_CUDA_SUCCESS(value, msg); + PADDLE_ENFORCE_CUDA_SUCCESS(value); return false; } catch (paddle::platform::EnforceNotMet& error) { std::string ex_msg = error.what(); @@ -279,24 +278,29 @@ bool CheckCudaStatusFailure( TEST(enforce, cuda_success) { EXPECT_TRUE(CheckCudaStatusSuccess(cudaSuccess)); - EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorInvalidValue)); - EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorMemoryAllocation)); + EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorInvalidValue, "Cuda error")); + EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorMemoryAllocation, "Cuda error")); EXPECT_TRUE(CheckCudaStatusSuccess(CURAND_STATUS_SUCCESS)); - EXPECT_TRUE(CheckCudaStatusFailure(CURAND_STATUS_VERSION_MISMATCH)); - EXPECT_TRUE(CheckCudaStatusFailure(CURAND_STATUS_NOT_INITIALIZED)); + EXPECT_TRUE( + CheckCudaStatusFailure(CURAND_STATUS_VERSION_MISMATCH, "Curand error")); + EXPECT_TRUE( + CheckCudaStatusFailure(CURAND_STATUS_NOT_INITIALIZED, "Curand error")); EXPECT_TRUE(CheckCudaStatusSuccess(CUDNN_STATUS_SUCCESS)); - EXPECT_TRUE(CheckCudaStatusFailure(CUDNN_STATUS_NOT_INITIALIZED)); - EXPECT_TRUE(CheckCudaStatusFailure(CUDNN_STATUS_ALLOC_FAILED)); + EXPECT_TRUE( + CheckCudaStatusFailure(CUDNN_STATUS_NOT_INITIALIZED, "Cudnn error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUDNN_STATUS_ALLOC_FAILED, "Cudnn error")); EXPECT_TRUE(CheckCudaStatusSuccess(CUBLAS_STATUS_SUCCESS)); - EXPECT_TRUE(CheckCudaStatusFailure(CUBLAS_STATUS_NOT_INITIALIZED)); - EXPECT_TRUE(CheckCudaStatusFailure(CUBLAS_STATUS_INVALID_VALUE)); + EXPECT_TRUE( + CheckCudaStatusFailure(CUBLAS_STATUS_NOT_INITIALIZED, "Cublas error")); + EXPECT_TRUE( + CheckCudaStatusFailure(CUBLAS_STATUS_INVALID_VALUE, "Cublas error")); #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess)); - EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError)); - EXPECT_TRUE(CheckCudaStatusFailure(ncclSystemError)); + EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError, "Nccl error")); + EXPECT_TRUE(CheckCudaStatusFailure(ncclSystemError, "Nccl error")); #endif } #endif diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index 40d6bc54cc..c07abba9e8 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -16,7 +16,6 @@ limitations under the License. */ #include #include #include -#include #include "gflags/gflags.h" #include "paddle/fluid/platform/cuda_device_guard.h" @@ -42,18 +41,13 @@ faster way to query device properties. You can see details in https://devblogs.nvidia.com/cuda-pro-tip-the-fast-way-to-query-device-properties/ */ -inline std::string CudaErrorWebsite() { - return "Please see detail in https://docs.nvidia.com/cuda/cuda-runtime-api" - "/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c217824" - "6db0a94a430e0038"; -} - static int GetCUDADeviceCountImpl() { int driverVersion = 0; cudaError_t status = cudaDriverGetVersion(&driverVersion); if (!(status == cudaSuccess && driverVersion != 0)) { // No GPU driver + VLOG(2) << "GPU Driver Version can't be detected. No GPU driver!"; return 0; } @@ -67,14 +61,8 @@ static int GetCUDADeviceCountImpl() { return 0; } } - int count; - auto error_code = cudaGetDeviceCount(&count); - PADDLE_ENFORCE( - error_code, - "cudaGetDeviceCount failed in " - "paddle::platform::GetCUDADeviceCountImpl, error code : %d, %s", - error_code, CudaErrorWebsite()); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetDeviceCount(&count)); return count; } @@ -84,72 +72,63 @@ int GetCUDADeviceCount() { } int GetCUDAComputeCapability(int id) { - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); + PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetCUDADeviceCount())); int major, minor; auto major_error_code = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, id); auto minor_error_code = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, id); - PADDLE_ENFORCE_EQ( - major_error_code, 0, - "cudaDevAttrComputeCapabilityMajor failed in " - "paddle::platform::GetCUDAComputeCapability, error code : %d, %s", - major_error_code, CudaErrorWebsite()); - PADDLE_ENFORCE_EQ( - minor_error_code, 0, - "cudaDevAttrComputeCapabilityMinor failed in " - "paddle::platform::GetCUDAComputeCapability, error code : %d, %s", - minor_error_code, CudaErrorWebsite()); + PADDLE_ENFORCE_CUDA_SUCCESS(major_error_code); + PADDLE_ENFORCE_CUDA_SUCCESS(minor_error_code); return major * 10 + minor; } dim3 GetGpuMaxGridDimSize(int id) { - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); + PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetCUDADeviceCount())); dim3 ret; int size; auto error_code_x = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimX, id); - PADDLE_ENFORCE_EQ(error_code_x, 0, - "cudaDevAttrMaxGridDimX failed in " - "paddle::platform::GpuMaxGridDimSize, error code : %d, %s", - error_code_x, CudaErrorWebsite()); + PADDLE_ENFORCE_CUDA_SUCCESS(error_code_x); ret.x = size; auto error_code_y = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimY, id); - PADDLE_ENFORCE_EQ(error_code_y, 0, - "cudaDevAttrMaxGridDimY failed in " - "paddle::platform::GpuMaxGridDimSize, error code : %d, %s", - error_code_y, CudaErrorWebsite()); + PADDLE_ENFORCE_CUDA_SUCCESS(error_code_y); ret.y = size; auto error_code_z = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimZ, id); - PADDLE_ENFORCE_EQ(error_code_z, 0, - "cudaDevAttrMaxGridDimZ failed in " - "paddle::platform::GpuMaxGridDimSize, error code : %d, %s", - error_code_z, CudaErrorWebsite()); + PADDLE_ENFORCE_CUDA_SUCCESS(error_code_z); ret.z = size; return ret; } int GetCUDARuntimeVersion(int id) { - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); + PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetCUDADeviceCount())); int runtime_version = 0; - auto error_code = cudaRuntimeGetVersion(&runtime_version); - PADDLE_ENFORCE(error_code, - "cudaRuntimeGetVersion failed in " - "paddle::platform::GetCUDARuntimeVersion, error code : %d, %s", - error_code, CudaErrorWebsite()); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaRuntimeGetVersion(&runtime_version)); return runtime_version; } int GetCUDADriverVersion(int id) { - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); + PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetCUDADeviceCount())); int driver_version = 0; - auto error_code = cudaDriverGetVersion(&driver_version); - PADDLE_ENFORCE(error_code, - "cudaDriverGetVersion failed in " - "paddle::platform::GetCUDADriverVersion, error code : %d, %s", - error_code, CudaErrorWebsite()); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaDriverGetVersion(&driver_version)); return driver_version; } @@ -164,56 +143,44 @@ bool TensorCoreAvailable() { } int GetCUDAMultiProcessors(int id) { - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); + PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetCUDADeviceCount())); int count; - auto error_code = - cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id); - PADDLE_ENFORCE(error_code, - "cudaDeviceGetAttribute failed in " - "paddle::platform::GetCUDAMultiProcess, error code : %d, %s", - error_code, CudaErrorWebsite()); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id)); return count; } int GetCUDAMaxThreadsPerMultiProcessor(int id) { - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); + PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetCUDADeviceCount())); int count; - auto error_code = cudaDeviceGetAttribute( - &count, cudaDevAttrMaxThreadsPerMultiProcessor, id); - PADDLE_ENFORCE( - error_code, - "cudaDeviceGetAttribute failed in paddle::" - "platform::GetCUDAMaxThreadsPerMultiProcessor, error code : %d, %s", - error_code, CudaErrorWebsite()); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceGetAttribute( + &count, cudaDevAttrMaxThreadsPerMultiProcessor, id)); return count; } int GetCUDAMaxThreadsPerBlock(int id) { - PADDLE_ENFORCE_LT( - id, GetCUDADeviceCount(), - platform::errors::InvalidArgument( - "Device id must less than GPU count, but received id is:%d, " - "GPU count is: %d.", - id, GetCUDADeviceCount())); + PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetCUDADeviceCount())); int count; - auto error_code = - cudaDeviceGetAttribute(&count, cudaDevAttrMaxThreadsPerBlock, id); - PADDLE_ENFORCE_EQ( - error_code, 0, - platform::errors::InvalidArgument( - "cudaDeviceGetAttribute returned error code should be 0, " - "but received error code is: %d, %s", - error_code, CudaErrorWebsite())); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaDeviceGetAttribute(&count, cudaDevAttrMaxThreadsPerBlock, id)); return count; } int GetCurrentDeviceId() { int device_id; - auto error_code = cudaGetDevice(&device_id); - PADDLE_ENFORCE(error_code, - "cudaGetDevice failed in " - "paddle::platform::GetCurrentDeviceId, error code : %d, %s", - error_code, CudaErrorWebsite()); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetDevice(&device_id)); return device_id; } @@ -237,12 +204,12 @@ std::vector GetSelectedDevices() { void SetDeviceId(int id) { // TODO(qijun): find a better way to cache the cuda device count - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); - auto error_code = cudaSetDevice(id); - PADDLE_ENFORCE(error_code, - "cudaSetDevice failed in " - "paddle::platform::SetDeviced, error code : %d, %s", - error_code, CudaErrorWebsite()); + PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetCUDADeviceCount())); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(id)); } void GpuMemoryUsage(size_t *available, size_t *total) { @@ -306,74 +273,44 @@ size_t GpuMaxChunkSize() { void GpuMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream) { - auto error_code = cudaMemcpyAsync(dst, src, count, kind, stream); - PADDLE_ENFORCE(error_code, - "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync " - "(%p -> %p, length: %d) error code : %d, %s", - src, dst, static_cast(count), error_code, - CudaErrorWebsite()); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(dst, src, count, kind, stream)); } void GpuMemcpySync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind) { - auto error_code = cudaMemcpy(dst, src, count, kind); - PADDLE_ENFORCE(error_code, - "cudaMemcpy failed in paddle::platform::GpuMemcpySync " - "(%p -> %p, length: %d) error code : %d, %s", - src, dst, static_cast(count), error_code, - CudaErrorWebsite()); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpy(dst, src, count, kind)); } void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, int src_device, size_t count, cudaStream_t stream) { - auto error_code = - cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream); - PADDLE_ENFORCE( - error_code, - "cudaMemcpyPeerAsync failed in paddle::platform::GpuMemcpyPeerAsync " - "error code : %d, %s", - error_code, CudaErrorWebsite()); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream)); } void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src, int src_device, size_t count) { - auto error_code = cudaMemcpyPeer(dst, dst_device, src, src_device, count); - PADDLE_ENFORCE(error_code, - "cudaMemcpyPeer failed in paddle::platform::GpuMemcpyPeerSync " - "error code : %d, %s", - error_code, CudaErrorWebsite()); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaMemcpyPeer(dst, dst_device, src, src_device, count)); } void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) { - auto error_code = cudaMemsetAsync(dst, value, count, stream); - PADDLE_ENFORCE(error_code, - "cudaMemsetAsync failed in paddle::platform::GpuMemsetAsync " - "error code : %d, %s", - error_code, CudaErrorWebsite()); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemsetAsync(dst, value, count, stream)); } void GpuStreamSync(cudaStream_t stream) { - auto error_code = cudaStreamSynchronize(stream); - PADDLE_ENFORCE_CUDA_SUCCESS( - error_code, - platform::errors::External( - "cudaStreamSynchronize failed in paddle::platform::GpuStreamSync " - "error code : %d, %s", - error_code, CudaErrorWebsite())); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); } static void RaiseNonOutOfMemoryError(cudaError_t *status) { if (*status == cudaErrorMemoryAllocation) { *status = cudaSuccess; } - PADDLE_ENFORCE_CUDA_SUCCESS(*status); *status = cudaGetLastError(); if (*status == cudaErrorMemoryAllocation) { *status = cudaSuccess; } - PADDLE_ENFORCE_CUDA_SUCCESS(*status); } @@ -450,8 +387,7 @@ class RecordedCudaMallocHelper { CUDADeviceGuard guard(dev_id_); auto err = cudaFree(ptr); if (err != cudaErrorCudartUnloading) { - PADDLE_ENFORCE_CUDA_SUCCESS( - err, platform::errors::External("cudaFree raises unexpected error")); + PADDLE_ENFORCE_CUDA_SUCCESS(err); if (NeedRecord()) { std::lock_guard guard(*mtx_); cur_size_ -= size; diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h index 41d5180ffa..af27564b99 100644 --- a/paddle/fluid/platform/profiler_helper.h +++ b/paddle/fluid/platform/profiler_helper.h @@ -117,10 +117,7 @@ void SynchronizeAllDevice() { int count = GetCUDADeviceCount(); for (int i = 0; i < count; i++) { SetDeviceId(i); - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaDeviceSynchronize(), - platform::errors::External( - "Device synchronize failed in cudaDeviceSynchronize()")); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceSynchronize()); } #endif } diff --git a/paddle/fluid/platform/stream/cuda_stream.cc b/paddle/fluid/platform/stream/cuda_stream.cc index 739892eafd..7a090ff7e5 100644 --- a/paddle/fluid/platform/stream/cuda_stream.cc +++ b/paddle/fluid/platform/stream/cuda_stream.cc @@ -30,13 +30,10 @@ bool CUDAStream::Init(const Place& place, const enum Priority& priority) { CUDADeviceGuard guard(boost::get(place_).device); if (priority == Priority::kHigh) { PADDLE_ENFORCE_CUDA_SUCCESS( - cudaStreamCreateWithPriority(&stream_, kDefaultFlag, -1), - platform::errors::Fatal("High priority cuda stream creation failed.")); + cudaStreamCreateWithPriority(&stream_, kDefaultFlag, -1)); } else if (priority == Priority::kNormal) { PADDLE_ENFORCE_CUDA_SUCCESS( - cudaStreamCreateWithPriority(&stream_, kDefaultFlag, 0), - platform::errors::Fatal( - "Normal priority cuda stream creation failed.")); + cudaStreamCreateWithPriority(&stream_, kDefaultFlag, 0)); } callback_manager_.reset(new StreamCallbackManager(stream_)); VLOG(3) << "CUDAStream Init stream: " << stream_ @@ -49,9 +46,7 @@ void CUDAStream::Destroy() { Wait(); WaitCallback(); if (stream_) { - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaStreamDestroy(stream_), - platform::errors::Fatal("Cuda stream destruction failed.")); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream_)); } stream_ = nullptr; } @@ -67,10 +62,7 @@ void CUDAStream::Wait() const { } #endif - PADDLE_ENFORCE_CUDA_SUCCESS( - e_sync, platform::errors::Fatal( - "cudaStreamSynchronize raises error: %s, errono: %d", - cudaGetErrorString(e_sync), static_cast(e_sync))); + PADDLE_ENFORCE_CUDA_SUCCESS(e_sync); } } // namespace stream diff --git a/paddle/fluid/platform/stream/cuda_stream.h b/paddle/fluid/platform/stream/cuda_stream.h index f7149f1e13..57e763d527 100644 --- a/paddle/fluid/platform/stream/cuda_stream.h +++ b/paddle/fluid/platform/stream/cuda_stream.h @@ -53,21 +53,15 @@ class CUDAStream final { template void RecordEvent(cudaEvent_t ev, Callback callback) const { callback(); - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaEventRecord(ev, stream_), - platform::errors::Fatal("CUDA event recording failed.")); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(ev, stream_)); } void RecordEvent(cudaEvent_t ev) const { - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaEventRecord(ev, stream_), - platform::errors::Fatal("CUDA event recording failed.")); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(ev, stream_)); } void WaitEvent(cudaEvent_t ev) const { - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaStreamWaitEvent(stream_, ev, 0), - platform::errors::Fatal("Failed to wait event.")); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(stream_, ev, 0)); } void Wait() const; diff --git a/python/setup.py.in b/python/setup.py.in index 36851adde1..ed77787d4c 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -213,12 +213,14 @@ if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']: # the prefix is sys.prefix which should always be usr paddle_bins = '' + if not '${WIN32}': paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/scripts/paddle'] package_data={'paddle.fluid': ['${FLUID_CORE_NAME}' + ('.so' if os.name != 'nt' else '.pyd')]} if '${HAS_NOAVX_CORE}' == 'ON': package_data['paddle.fluid'] += ['core_noavx' + ('.so' if os.name != 'nt' else '.pyd')] + package_dir={ '': '${PADDLE_BINARY_DIR}/python', # The paddle.fluid.proto will be generated while compiling. @@ -329,6 +331,7 @@ headers = ( list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/string')) + list(find_files('*.pb.h', '${PADDLE_BINARY_DIR}/paddle/fluid/platform')) + list(find_files('*.pb.h', '${PADDLE_BINARY_DIR}/paddle/fluid/framework')) + + list(find_files('*.pb', '${cudaerror_INCLUDE_DIR}')) + # errorMessage.pb for errormessage ['${EIGEN_INCLUDE_DIR}/Eigen/Core'] + # eigen list(find_files('*', '${EIGEN_INCLUDE_DIR}/Eigen/src')) + # eigen list(find_files('*', '${EIGEN_INCLUDE_DIR}/unsupported/Eigen')) + # eigen @@ -400,7 +403,9 @@ class InstallHeaders(Command): return self.copy_file(header, install_dir) def run(self): + # only copy third_party/cudaErrorMessage.pb for cudaErrorMessage on mac or windows if os.name == 'nt' or sys.platform == 'darwin': + self.mkdir_and_copy_file('${cudaerror_INCLUDE_DIR}/cudaErrorMessage.pb') return hdrs = self.distribution.headers if not hdrs: diff --git a/tools/check_api_approvals.sh b/tools/check_api_approvals.sh index 51330bea8e..3e079d0433 100644 --- a/tools/check_api_approvals.sh +++ b/tools/check_api_approvals.sh @@ -172,8 +172,8 @@ if [ "${ALL_PADDLE_ENFORCE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then check_approval 1 6836917 47554610 22561442 fi -ALL_PADDLE_CHECK=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "(PADDLE_ENFORCE[A-Z_]*|PADDLE_THROW)\(.[^,\);]*.[^;]*\);\s" || true` -VALID_PADDLE_CHECK=`echo "$ALL_PADDLE_CHECK" | grep -zoE '(PADDLE_ENFORCE[A-Z_]*|PADDLE_THROW)\((.[^,;]+,)*.[^";]*(errors::).[^"]*".[^";]{20,}.[^;]*\);\s' || true` +ALL_PADDLE_CHECK=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\(.[^,\);]*.[^;]*\);\s" || true` +VALID_PADDLE_CHECK=`echo "$ALL_PADDLE_CHECK" | grep -zoE '(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\((.[^,;]+,)*.[^";]*(errors::).[^"]*".[^";]{20,}.[^;]*\);\s' || true` INVALID_PADDLE_CHECK=`echo "$ALL_PADDLE_CHECK" |grep -vxF "$VALID_PADDLE_CHECK" || true` if [ "${INVALID_PADDLE_CHECK}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then echo_line="The error message you wrote in PADDLE_ENFORCE{_**} or PADDLE_THROW does not meet our error message writing specification. Possible errors include 1. the error message is empty / 2. the error message is too short / 3. the error type is not specified. Please read the specification [ https://github.com/PaddlePaddle/Paddle/wiki/Paddle-Error-Message-Writing-Specification ], then refine the error message. If it is a mismatch, please specify chenwhql (Recommend), luotao1 or lanxianghit review and approve.\nThe PADDLE_ENFORCE{_**} or PADDLE_THROW entries that do not meet the specification are as follows:\n${INVALID_PADDLE_CHECK}\n" diff --git a/tools/count_invalid_enforce.sh b/tools/count_invalid_enforce.sh index a2dbc22119..fe99674f6b 100644 --- a/tools/count_invalid_enforce.sh +++ b/tools/count_invalid_enforce.sh @@ -30,9 +30,9 @@ ALL_PADDLE_CHECK_CNT=0 VALID_PADDLE_CHECK_CNT=0 function enforce_scan(){ - paddle_check=`grep -r -zoE "(PADDLE_ENFORCE[A-Z_]*|PADDLE_THROW)\(.[^,\);]*.[^;]*\);\s" $1 || true` + paddle_check=`grep -r -zoE "(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\(.[^,\);]*.[^;]*\);\s" $1 || true` total_check_cnt=`echo "$paddle_check" | grep -cE "(PADDLE_ENFORCE|PADDLE_THROW)" || true` - valid_check_cnt=`echo "$paddle_check" | grep -zoE '(PADDLE_ENFORCE[A-Z_]*|PADDLE_THROW)\((.[^,;]+,)*.[^";]*(errors::).[^"]*".[^";]{20,}.[^;]*\);\s' | grep -cE "(PADDLE_ENFORCE|PADDLE_THROW)" || true` + valid_check_cnt=`echo "$paddle_check" | grep -zoE '(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\((.[^,;]+,)*.[^";]*(errors::).[^"]*".[^";]{20,}.[^;]*\);\s' | grep -cE "(PADDLE_ENFORCE|PADDLE_THROW)" || true` eval $2=$total_check_cnt eval $3=$valid_check_cnt } diff --git a/tools/cudaError/README.md b/tools/cudaError/README.md new file mode 100644 index 0000000000..df7434c33a --- /dev/null +++ b/tools/cudaError/README.md @@ -0,0 +1,22 @@ +Usage: + +Please run: +``` +bash start.sh +``` + +The error message of CUDA9.0 / CUDA10.0 / CUDA-latest-version will be crawled by default. + +If you want to crawl a specified version of CUDA, Please run: +``` +bash start.sh +``` +URL can be derived by default, so you don't have to enter a URL. + +for example: +``` +bash start.sh 11.0 +``` +will capture error message of CUDA11.0(in future). + +Every time when Nvidia upgrade the CUDA major version, you need to run `bash start.sh` in current directory, and upload cudaErrorMessage.tar.gz to https://paddlepaddledeps.bj.bcebos.com/cudaErrorMessage.tar.gz diff --git a/tools/cudaError/spider.py b/tools/cudaError/spider.py new file mode 100644 index 0000000000..c2c3dc97f4 --- /dev/null +++ b/tools/cudaError/spider.py @@ -0,0 +1,124 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import ssl +import re +import urllib2 +import json +import collections +import sys, getopt +import cuda_error_pb2 + + +def parsing(cuda_errorDesc, version, url): + All_Messages = cuda_errorDesc.AllMessages.add() + All_Messages.version = int(version) + + ssl._create_default_https_context = ssl._create_unverified_context + html = urllib2.urlopen(url).read() + res_div = r'
.*?

CUDA error types

.*?
.*?
(.*?)
' + m_div = re.findall(res_div, html, re.S | re.M) + + url_list = url.split('/') + url_prefix = '/'.join(url_list[0:url_list.index('cuda-runtime-api') + 1]) + + dic = collections.OrderedDict() + dic_message = collections.OrderedDict() + for line in m_div: + res_dt = r'
(.*?)
.*?
(.*?)
' + m_dt = re.findall(res_dt, line, re.S | re.M) + for error in m_dt: + res_type = r'(.*?)' + m_type = re.findall(res_type, error[0], re.S | re.M)[0] + m_message = error[1] + m_message = m_message.replace('\n', '') + res_a = r'()' + res_shape = r'(.*?)' + list_a = re.findall(res_a, m_message, re.S | re.M) + list_shape = re.findall(res_shape, m_message, re.S | re.M) + assert len(list_a) == len(list_shape) + for idx in range(len(list_a)): + m_message = m_message.replace(list_a[idx], list_shape[idx]) + + m_message = m_message.replace( + '
Deprecated
', '') + + res_span = r'()' + res_span_detail = r'(.*?)' + list_span = re.findall(res_span, m_message, re.S | re.M) + list_span_detail = re.findall(res_span_detail, m_message, re.S | + re.M) + assert len(list_span) == len(list_span_detail) + for idx in range(len(list_span)): + m_message = m_message.replace(list_span[idx], + list_span_detail[idx]) + + res_p = r'(

.*?

)' + res_p_detail = r'

(.*?)

' + list_p = re.findall(res_p, m_message, re.S | re.M) + list_p_detail = re.findall(res_p_detail, m_message, re.S | re.M) + assert len(list_p) == len(list_p_detail) + for idx in range(len(list_p)): + m_message = m_message.replace(list_p[idx], list_p_detail[idx]) + + m_message = m_message.replace(' ', '') + _Messages = All_Messages.Messages.add() + try: + _Messages.errorCode = int(m_type) + except ValueError: + if re.match('0x', m_type): + _Messages.errorCode = int(m_type, 16) + else: + raise ValueError + _Messages.errorMessage = m_message # save for cudaErrorMessage.pb from python-protobuf interface + + +def main(argv): + version = [] + url = [] + try: + opts, args = getopt.getopt(argv, "hv:u:", ["help", "version=", "url="]) + except getopt.GetoptError: + print 'python spider.py -v -u ' + sys.exit(2) + for opt, arg in opts: + if opt in ("-h", "--help"): + print 'python spider.py -v -u ' + sys.exit() + elif opt in ("-v", "--version"): + version = arg + elif opt in ("-u", "--url"): + url = arg + version = version.split(',') + url = url.split(',') + assert len(version) == len(url) + cuda_errorDesc = cuda_error_pb2.cudaerrorDesc() + for idx in range(len(version)): + if version[idx] == "-1": + print("crawling errorMessage for CUDA%s from %s" % + ("-latest-version", url[idx])) + else: + print("crawling errorMessage for CUDA%s from %s" % + (version[idx], url[idx])) + parsing(cuda_errorDesc, version[idx], url[idx]) + + serializeToString = cuda_errorDesc.SerializeToString() + with open("cudaErrorMessage.pb", "wb") as f: + f.write(serializeToString + ) # save for cudaErrorMessage.pb from python-protobuf interface + print("crawling errorMessage for CUDA has been done!!!") + + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/tools/cudaError/start.sh b/tools/cudaError/start.sh new file mode 100644 index 0000000000..3c0e57ffe7 --- /dev/null +++ b/tools/cudaError/start.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +set -ex +SYSTEM=`uname -s` +rm -f protoc-3.11.3-linux-x86_64.* +if [ "$SYSTEM" == "Linux" ]; then + wget --no-check-certificate https://github.com/protocolbuffers/protobuf/releases/download/v3.11.3/protoc-3.11.3-linux-x86_64.zip + unzip -d protobuf -o protoc-3.11.3-linux-x86_64.zip + rm protoc-3.11.3-linux-x86_64.* +elif [ "$SYSTEM" == "Darwin" ]; then + wget --no-check-certificate https://github.com/protocolbuffers/protobuf/releases/download/v3.11.3/protoc-3.11.3-osx-x86_64.zip + unzip -d protobuf -o protoc-3.11.3-osx-x86_64.zip + rm protoc-3.11.3-osx-x86_64.* +else + echo "please run on Mac/Linux" + exit 1 +fi +protobuf/bin/protoc -I../../paddle/fluid/platform/ --python_out . ../../paddle/fluid/platform/cuda_error.proto + +version=90,100,-1 # -1 represent the latest cuda-version +url=https://docs.nvidia.com/cuda/archive/9.0/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038,https://docs.nvidia.com/cuda/archive/10.0/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038,https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038 + +if [ "$1" != "" ]; then + version=$version,$(($1*10)) + if [ "$2" != "" ]; then + url=$url,$2 + else + url=$url,https://docs.nvidia.com/cuda/archive/$1/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038 + fi +fi + +python spider.py --version=$version --url=$url +tar czf cudaErrorMessage.tar.gz cudaErrorMessage.pb -- GitLab