From b425215a6d9bed0af94afd20f44454d16b4b095a Mon Sep 17 00:00:00 2001 From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com> Date: Thu, 27 May 2021 10:07:13 +0800 Subject: [PATCH] Unify all external API error message mechanism and enhance third-party API error msg (#33003) * Unify all external API error message mechanism and enhance third-party API error msg * fix some comment * fix some comment --- cmake/inference_lib.cmake | 12 +- cmake/third_party.cmake | 21 +- paddle/fluid/platform/CMakeLists.txt | 4 +- paddle/fluid/platform/cudnn_helper.h | 29 -- paddle/fluid/platform/enforce.h | 377 ++++++++---------- paddle/fluid/platform/enforce_test.cc | 89 ++++- ...{cuda_error.proto => external_error.proto} | 29 +- paddle/scripts/paddle_build.bat | 2 +- python/setup.py.in | 3 +- tools/cudaError/README.md | 22 - tools/cudaError/spider.py | 124 ------ tools/externalError/README.md | 9 + tools/externalError/spider.py | 363 +++++++++++++++++ tools/{cudaError => externalError}/start.sh | 18 +- 14 files changed, 661 insertions(+), 441 deletions(-) rename paddle/fluid/platform/{cuda_error.proto => external_error.proto} (58%) delete mode 100644 tools/cudaError/README.md delete mode 100644 tools/cudaError/spider.py create mode 100644 tools/externalError/README.md create mode 100644 tools/externalError/spider.py rename tools/{cudaError => externalError}/start.sh (59%) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 9694a7bc59c..8220680cecf 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -146,12 +146,12 @@ copy(inference_lib_dist SRCS ${THREADPOOL_INCLUDE_DIR}/ThreadPool.h DSTS ${dst_dir}) -# Only GPU need cudaErrorMessage.pb +# GPU must copy externalErrorMsg.pb IF(WITH_GPU) - set(dst_dir "${PADDLE_INFERENCE_INSTALL_DIR}/third_party/cudaerror/data") - copy(inference_lib_dist - SRCS ${cudaerror_INCLUDE_DIR} - DSTS ${dst_dir}) + set(dst_dir "${PADDLE_INFERENCE_INSTALL_DIR}/third_party/externalError/data") + copy(inference_lib_dist + SRCS ${externalError_INCLUDE_DIR} + DSTS ${dst_dir}) ENDIF() # CMakeCache Info @@ -259,7 +259,7 @@ copy(fluid_lib_dist set(module "platform") set(platform_lib_deps profiler_proto error_codes_proto) if(WITH_GPU) - set(platform_lib_deps ${platform_lib_deps} cuda_error_proto) + set(platform_lib_deps ${platform_lib_deps} external_error_proto) endif(WITH_GPU) add_dependencies(fluid_lib_dist ${platform_lib_deps}) diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 56edaff2a50..8adc7a4e396 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -111,10 +111,11 @@ FUNCTION(file_download_and_uncompress URL NAME) MESSAGE(STATUS "Download dependence[${NAME}] from ${URL}") SET(${NAME}_INCLUDE_DIR ${THIRD_PARTY_PATH}/${NAME}/data PARENT_SCOPE) ExternalProject_Add( - extern_download_${NAME} + download_${NAME} ${EXTERNAL_PROJECT_LOG_ARGS} PREFIX ${THIRD_PARTY_PATH}/${NAME} URL ${URL} + TIMEOUT 120 DOWNLOAD_DIR ${THIRD_PARTY_PATH}/${NAME}/data/ SOURCE_DIR ${THIRD_PARTY_PATH}/${NAME}/data/ DOWNLOAD_NO_PROGRESS 1 @@ -123,7 +124,7 @@ FUNCTION(file_download_and_uncompress URL NAME) UPDATE_COMMAND "" INSTALL_COMMAND "" ) - set(third_party_deps ${third_party_deps} extern_download_${NAME} PARENT_SCOPE) + set(third_party_deps ${third_party_deps} download_${NAME} PARENT_SCOPE) ENDFUNCTION() @@ -242,8 +243,20 @@ if(WITH_GPU) include(external/cub) # download cub list(APPEND third_party_deps extern_cub) endif() - set(CUDAERROR_URL "http://paddlepaddledeps.bj.bcebos.com/cudaErrorMessage.tar.gz" CACHE STRING "" FORCE) - file_download_and_uncompress(${CUDAERROR_URL} "cudaerror") # download file cudaErrorMessage + set(URL "https://paddlepaddledeps.bj.bcebos.com/externalErrorMsg.tar.gz" CACHE STRING "" FORCE) + file_download_and_uncompress(${URL} "externalError") # download file externalErrorMsg.tar.gz + if(WITH_TESTING) + # copy externalErrorMsg.pb for unittest 'enforce_test' + set(SRC_DIR ${THIRD_PARTY_PATH}/externalError/data) + if(WIN32 AND (NOT "${CMAKE_GENERATOR}" STREQUAL "Ninja")) + set(DST_DIR ${CMAKE_BINARY_DIR}/paddle/fluid/third_party/externalError/data) + else() + set(DST_DIR ${CMAKE_BINARY_DIR}/paddle/third_party/externalError/data) + endif() + add_custom_command(TARGET download_externalError POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_directory ${SRC_DIR} ${DST_DIR} + COMMENT "copy_directory from ${SRC_DIR} to ${DST_DIR}") + endif() endif(WITH_GPU) if(WITH_XPU) diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 12a54fd7e87..36a95676217 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -1,7 +1,7 @@ proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto simple_threadpool) proto_library(error_codes_proto SRCS error_codes.proto) if(WITH_GPU) - proto_library(cuda_error_proto SRCS cuda_error.proto) + proto_library(external_error_proto SRCS external_error.proto) endif(WITH_GPU) if(WITH_XPU) @@ -45,7 +45,7 @@ cc_test(errors_test SRCS errors_test.cc DEPS errors enforce) set(enforce_deps flags errors boost) if(WITH_GPU) - set(enforce_deps ${enforce_deps} cuda_error_proto) + set(enforce_deps ${enforce_deps} external_error_proto) endif() cc_library(enforce INTERFACE SRCS enforce.cc DEPS ${enforce_deps}) cc_library(monitor SRCS monitor.cc) diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h index 6c3c96b68c4..0d2a770ad82 100644 --- a/paddle/fluid/platform/cudnn_helper.h +++ b/paddle/fluid/platform/cudnn_helper.h @@ -34,35 +34,6 @@ DECLARE_bool(cudnn_deterministic); namespace paddle { namespace platform { -inline const char* cudnnGetErrorString(cudnnStatus_t status) { - switch (status) { - case CUDNN_STATUS_SUCCESS: - return "CUDNN_STATUS_SUCCESS"; - case CUDNN_STATUS_NOT_INITIALIZED: - return "CUDNN_STATUS_NOT_INITIALIZED"; - case CUDNN_STATUS_ALLOC_FAILED: - return "CUDNN_STATUS_ALLOC_FAILED"; - case CUDNN_STATUS_BAD_PARAM: - return "CUDNN_STATUS_BAD_PARAM"; - case CUDNN_STATUS_INTERNAL_ERROR: - return "CUDNN_STATUS_INTERNAL_ERROR"; - case CUDNN_STATUS_INVALID_VALUE: - return "CUDNN_STATUS_INVALID_VALUE"; - case CUDNN_STATUS_ARCH_MISMATCH: - return "CUDNN_STATUS_ARCH_MISMATCH"; - case CUDNN_STATUS_MAPPING_ERROR: - return "CUDNN_STATUS_MAPPING_ERROR"; - case CUDNN_STATUS_EXECUTION_FAILED: - return "CUDNN_STATUS_EXECUTION_FAILED"; - case CUDNN_STATUS_NOT_SUPPORTED: - return "CUDNN_STATUS_NOT_SUPPORTED"; - case CUDNN_STATUS_LICENSE_ERROR: - return "CUDNN_STATUS_LICENSE_ERROR"; - default: - return "Unknown cudnn error number"; - } -} - #define CUDNN_VERSION_MIN(major, minor, patch) \ (CUDNN_VERSION >= ((major)*1000 + (minor)*100 + (patch))) diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index d42733823e6..d3890de89a5 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -34,7 +34,7 @@ limitations under the License. */ #include #include #include -#include "paddle/fluid/platform/cuda_error.pb.h" +#include "paddle/fluid/platform/external_error.pb.h" #endif // PADDLE_WITH_CUDA #ifdef PADDLE_WITH_HIP @@ -682,41 +682,83 @@ struct EOFException : public std::exception { END_HANDLE_THE_ERROR \ } while (0) -/** CUDA PADDLE ENFORCE FUNCTIONS AND MACROS **/ +/**************************************************************************/ +/**************************** NVIDIA ERROR ********************************/ #ifdef PADDLE_WITH_CUDA -/***** CUDA ERROR *****/ -inline bool is_error(cudaError_t e) { return e != cudaSuccess; } +namespace details { -inline std::string GetCudaErrorWebsite(int32_t cuda_version) { - std::ostringstream webstr; - webstr << "https://docs.nvidia.com/cuda/"; - if (cuda_version != -1) { - double version = cuda_version / 10; - webstr << "archive/" << std::fixed << std::setprecision(1) << version; +template +struct ExternalApiType {}; + +#define DEFINE_EXTERNAL_API_TYPE(type, success_value, proto_type) \ + template <> \ + struct ExternalApiType { \ + using Type = type; \ + static constexpr Type kSuccess = success_value; \ + static constexpr const char* kTypeString = #proto_type; \ + static constexpr platform::proto::ApiType kProtoType = \ + platform::proto::ApiType::proto_type; \ } - webstr << "/cuda-runtime-api/group__CUDART__TYPES.html" - "#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038"; - return webstr.str(); -} -inline std::string build_nvidia_error_msg(cudaError_t e) { -#if CUDA_VERSION >= 10000 && CUDA_VERSION < 11000 - int32_t cuda_version = 100; -#elif CUDA_VERSION >= 9000 - int32_t cuda_version = 90; -#else - int32_t cuda_version = -1; +DEFINE_EXTERNAL_API_TYPE(cudaError_t, cudaSuccess, CUDA); +DEFINE_EXTERNAL_API_TYPE(curandStatus_t, CURAND_STATUS_SUCCESS, CURAND); +DEFINE_EXTERNAL_API_TYPE(cudnnStatus_t, CUDNN_STATUS_SUCCESS, CUDNN); +DEFINE_EXTERNAL_API_TYPE(cublasStatus_t, CUBLAS_STATUS_SUCCESS, CUBLAS); +DEFINE_EXTERNAL_API_TYPE(cusolverStatus_t, CUSOLVER_STATUS_SUCCESS, CUSOLVER); + +#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) +DEFINE_EXTERNAL_API_TYPE(ncclResult_t, ncclSuccess, NCCL); #endif + +} // namespace details + +template +inline const char* GetErrorMsgUrl(T status) { + using __CUDA_STATUS_TYPE__ = decltype(status); + platform::proto::ApiType proto_type = + details::ExternalApiType<__CUDA_STATUS_TYPE__>::kProtoType; + switch (proto_type) { + case platform::proto::ApiType::CUDA: + return "https://docs.nvidia.com/cuda/cuda-runtime-api/" + "group__CUDART__TYPES.html#group__CUDART__TYPES_" + "1g3f51e3575c2178246db0a94a430e0038"; + break; + case platform::proto::ApiType::CURAND: + return "https://docs.nvidia.com/cuda/curand/" + "group__HOST.html#group__HOST_1gb94a31d5c165858c96b6c18b70644437"; + break; + case platform::proto::ApiType::CUDNN: + return "https://docs.nvidia.com/deeplearning/cudnn/api/" + "index.html#cudnnStatus_t"; + break; + case platform::proto::ApiType::CUBLAS: + return "https://docs.nvidia.com/cuda/cublas/index.html#cublasstatus_t"; + break; + case platform::proto::ApiType::CUSOLVER: + return "https://docs.nvidia.com/cuda/cusolver/" + "index.html#cuSolverSPstatus"; + break; + case platform::proto::ApiType::NCCL: + return "https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/" + "types.html#ncclresult-t"; + break; + default: + return "Unknown type of External API, can't get error message URL!"; + break; + } +} + +template +inline std::string GetExternalErrorMsg(T status) { std::ostringstream sout; - sout << " Cuda error(" << e << "), " << cudaGetErrorString(e) << "."; - static platform::proto::cudaerrorDesc cudaerror; - static bool _initSucceed = false; - if (cudaerror.ByteSizeLong() == 0) { + bool _initSucceed = false; + platform::proto::ExternalErrorDesc externalError; + if (externalError.ByteSizeLong() == 0) { std::string filePath; #if !defined(_WIN32) Dl_info info; - if (dladdr(reinterpret_cast(GetCudaErrorWebsite), &info)) { + if (dladdr(reinterpret_cast(GetCurrentTraceBackString), &info)) { std::string strModule(info.dli_fname); const size_t last_slash_idx = strModule.find_last_of("/"); std::string compare_path = strModule.substr(strModule.length() - 6); @@ -724,18 +766,19 @@ inline std::string build_nvidia_error_msg(cudaError_t e) { strModule.erase(last_slash_idx, std::string::npos); } if (compare_path.compare("avx.so") == 0) { - filePath = strModule + - "/../include/third_party/cudaerror/data/cudaErrorMessage.pb"; - } else { filePath = - strModule + "/../../thirl_party/cudaerror/data/cudaErrorMessage.pb"; + strModule + + "/../include/third_party/externalError/data/externalErrorMsg.pb"; + } else { + filePath = strModule + + "/../../third_party/externalError/data/externalErrorMsg.pb"; } } #else char buf[100]; MEMORY_BASIC_INFORMATION mbi; HMODULE h_module = - (::VirtualQuery(GetCudaErrorWebsite, &mbi, sizeof(mbi)) != 0) + (::VirtualQuery(GetCurrentTraceBackString, &mbi, sizeof(mbi)) != 0) ? (HMODULE)mbi.AllocationBase : NULL; GetModuleFileName(h_module, buf, 100); @@ -746,198 +789,118 @@ inline std::string build_nvidia_error_msg(cudaError_t e) { strModule.erase(last_slash_idx, std::string::npos); } if (compare_path.compare("avx.pyd") == 0) { - filePath = - strModule + - "\\..\\include\\third_party\\cudaerror\\data\\cudaErrorMessage.pb"; + filePath = strModule + + "\\..\\include\\third_" + "party\\externalerror\\data\\externalErrorMsg.pb"; } else { filePath = - strModule + "\\..\\third_party\\cudaerror\\data\\cudaErrorMessage.pb"; + strModule + + "\\..\\..\\third_party\\externalerror\\data\\externalErrorMsg.pb"; } #endif std::ifstream fin(filePath, std::ios::in | std::ios::binary); - _initSucceed = cudaerror.ParseFromIstream(&fin); + _initSucceed = externalError.ParseFromIstream(&fin); } + using __CUDA_STATUS_TYPE__ = decltype(status); + platform::proto::ApiType proto_type = + details::ExternalApiType<__CUDA_STATUS_TYPE__>::kProtoType; if (_initSucceed) { - for (int i = 0; i < cudaerror.allmessages_size(); ++i) { - if (cuda_version == cudaerror.allmessages(i).version()) { - for (int j = 0; j < cudaerror.allmessages(i).messages_size(); ++j) { - if (e == cudaerror.allmessages(i).messages(j).errorcode()) { - sout << "\n [Advise: " - << cudaerror.allmessages(i).messages(j).errormessage() << "]"; + for (int i = 0; i < externalError.errors_size(); ++i) { + if (proto_type == externalError.errors(i).type()) { + for (int j = 0; j < externalError.errors(i).messages_size(); ++j) { + if (status == externalError.errors(i).messages(j).code()) { + sout << "\n [Hint: " + << externalError.errors(i).messages(j).message() << "]"; return sout.str(); } } } } } - sout << "\n [Advise: Please search for the error code(" << e - << ") on website( " << GetCudaErrorWebsite(cuda_version) - << " ) to get Nvidia's official solution about CUDA Error.]"; + + sout << "\n [Hint: Please search for the error code(" << status + << ") on website (" << GetErrorMsgUrl(status) + << ") to get Nvidia's official solution and advice about " + << details::ExternalApiType<__CUDA_STATUS_TYPE__>::kTypeString + << " Error.]"; return sout.str(); } -/** curand ERROR **/ -inline bool is_error(curandStatus_t stat) { - return stat != CURAND_STATUS_SUCCESS; +template std::string GetExternalErrorMsg(cudaError_t); +template std::string GetExternalErrorMsg(curandStatus_t); +template std::string GetExternalErrorMsg(cudnnStatus_t); +template std::string GetExternalErrorMsg(cublasStatus_t); +template std::string GetExternalErrorMsg(cusolverStatus_t); +#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) +template std::string GetExternalErrorMsg(ncclResult_t); +#endif + +/*************** CUDA ERROR ***************/ +inline bool is_error(cudaError_t e) { return e != cudaSuccess; } + +inline std::string build_nvidia_error_msg(cudaError_t e) { + std::ostringstream sout; + sout << "CUDA error(" << e << "), " << cudaGetErrorString(e) << ". " + << GetExternalErrorMsg(e); + return sout.str(); } -inline const char* curandGetErrorString(curandStatus_t stat) { - switch (stat) { - case CURAND_STATUS_SUCCESS: - return "`CURAND_STATUS_SUCCESS`. No errors."; - case CURAND_STATUS_VERSION_MISMATCH: - return "`CURAND_STATUS_VERSION_MISMATCH`. Header file and linked library " - "version do not match."; - case CURAND_STATUS_NOT_INITIALIZED: - return "`CURAND_STATUS_NOT_INITIALIZED`. Generator not initialized."; - case CURAND_STATUS_ALLOCATION_FAILED: - return "`CURAND_STATUS_ALLOCATION_FAILED`. Memory allocation failed."; - case CURAND_STATUS_TYPE_ERROR: - return "`CURAND_STATUS_TYPE_ERROR`. Generator is wrong type."; - case CURAND_STATUS_OUT_OF_RANGE: - return "`CURAND_STATUS_OUT_OF_RANGE`. Argument out of range."; - case CURAND_STATUS_LENGTH_NOT_MULTIPLE: - return "`CURAND_STATUS_LENGTH_NOT_MULTIPLE`. Length requested is not a " - "multple of dimension."; - case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: - return "`CURAND_STATUS_DOUBLE_PRECISION_REQUIRED`. GPU does not have " - "double precision required by MRG32k3a."; - case CURAND_STATUS_LAUNCH_FAILURE: - return "`CURAND_STATUS_LAUNCH_FAILURE`. Kernel launch failure."; - case CURAND_STATUS_PREEXISTING_FAILURE: - return "`CURAND_STATUS_PREEXISTING_FAILURE`. Preexisting failure on " - "library entry."; - case CURAND_STATUS_INITIALIZATION_FAILED: - return "`CURAND_STATUS_INITIALIZATION_FAILED`. Initialization of CUDA " - "failed."; - case CURAND_STATUS_ARCH_MISMATCH: - return "`CURAND_STATUS_ARCH_MISMATCH`. Architecture mismatch, GPU does " - "not support requested feature."; - case CURAND_STATUS_INTERNAL_ERROR: - return "`CURAND_STATUS_INTERNAL_ERROR`. Internal library error."; - default: - return "Unknown curand status"; - } +/*************** CURAND ERROR ***************/ +inline bool is_error(curandStatus_t stat) { + return stat != CURAND_STATUS_SUCCESS; } inline std::string build_nvidia_error_msg(curandStatus_t stat) { - std::string msg(" Curand error, "); - return msg + curandGetErrorString(stat) + " "; + std::ostringstream sout; + sout << "CURAND error(" << stat << "). " << GetExternalErrorMsg(stat); + return sout.str(); } -/***** CUDNN ERROR *****/ +/*************** CUDNN ERROR ***************/ inline bool is_error(cudnnStatus_t stat) { return stat != CUDNN_STATUS_SUCCESS; } inline std::string build_nvidia_error_msg(cudnnStatus_t stat) { - std::string msg(" Cudnn error, "); - return msg + platform::dynload::cudnnGetErrorString(stat) + " "; + std::ostringstream sout; + sout << "CUDNN error(" << stat << "), " + << platform::dynload::cudnnGetErrorString(stat) << ". " + << GetExternalErrorMsg(stat); + return sout.str(); } -/***** CUBLAS ERROR *****/ +/*************** CUBLAS ERROR ***************/ inline bool is_error(cublasStatus_t stat) { return stat != CUBLAS_STATUS_SUCCESS; } -inline const char* cublasGetErrorString(cublasStatus_t stat) { - switch (stat) { - case CUBLAS_STATUS_NOT_INITIALIZED: - return "`CUBLAS_STATUS_NOT_INITIALIZED`. The cuBLAS library was not " - "initialized."; - case CUBLAS_STATUS_ALLOC_FAILED: - return "`CUBLAS_STATUS_ALLOC_FAILED`. Resource allocation failed inside " - "the cuBLAS library."; - case CUBLAS_STATUS_INVALID_VALUE: - return "`CUBLAS_STATUS_INVALID_VALUE`. An unsupported value or parameter " - "was passed to the function (a negative vector size, for " - "example)."; - case CUBLAS_STATUS_ARCH_MISMATCH: - return "`CUBLAS_STATUS_ARCH_MISMATCH`. The function requires a feature " - "absent from the device architecture; usually caused by the lack " - "of support for double precision."; - case CUBLAS_STATUS_MAPPING_ERROR: - return "`CUBLAS_STATUS_MAPPING_ERROR`. An access to GPU memory space " - "failed, which is usually caused by a failure to bind a texture."; - case CUBLAS_STATUS_EXECUTION_FAILED: - return "`CUBLAS_STATUS_EXECUTION_FAILED`. The GPU program failed to " - "execute. This is often caused by a launch failure of the kernel " - "on the GPU, which can be caused by multiple reasons."; - case CUBLAS_STATUS_INTERNAL_ERROR: - return "`CUBLAS_STATUS_INTERNAL_ERROR`. An internal cuBLAS operation " - "failed. This error is usually caused by a cudaMemcpyAsync() " - "failure."; - case CUBLAS_STATUS_NOT_SUPPORTED: - return "`CUBLAS_STATUS_NOT_SUPPORTED`. The functionality requested is " - "not supported."; - case CUBLAS_STATUS_LICENSE_ERROR: - return "`CUBLAS_STATUS_LICENSE_ERROR`. The functionality requested " - "requires some license and an error was detected when trying to " - "check the current licensing."; - default: - return "Unknown cublas status"; - } -} - inline std::string build_nvidia_error_msg(cublasStatus_t stat) { - std::string msg(" Cublas error, "); - return msg + cublasGetErrorString(stat) + " "; + std::ostringstream sout; + sout << "CUBLAS error(" << stat << "). " << GetExternalErrorMsg(stat); + return sout.str(); } -/***** CUSOLVER ERROR *****/ +/*************** CUSOLVER ERROR ***************/ inline bool is_error(cusolverStatus_t stat) { return stat != CUSOLVER_STATUS_SUCCESS; } -inline const char* cusolverGetErrorString(cusolverStatus_t stat) { - switch (stat) { - case CUSOLVER_STATUS_NOT_INITIALIZED: - return "`CUSOLVER_STATUS_NOT_INITIALIZED`. The cuSolver library was not " - "initialized. This is usually caused by the lack of a prior call, " - "an error in the CUDA Runtime API called by the cuSolver routine, " - "or an error in the hardware setup."; - case CUSOLVER_STATUS_ALLOC_FAILED: - return "`CUSOLVER_STATUS_ALLOC_FAILED`. Resource allocation failed " - "inside the cuSolver library. This is usually caused by a " - "cudaMalloc() failure."; - case CUSOLVER_STATUS_INVALID_VALUE: - return "`CUSOLVER_STATUS_INVALID_VALUE`. An unsupported value or " - "parameter was passed to the function (a negative vector size, " - "for example)."; - case CUSOLVER_STATUS_ARCH_MISMATCH: - return "`CUSOLVER_STATUS_ARCH_MISMATCH`. The function requires a feature " - "absent from the device architecture; usually caused by the lack " - "of support for atomic operations or double precision."; - case CUSOLVER_STATUS_EXECUTION_FAILED: - return "`CUSOLVER_STATUS_EXECUTION_FAILED`. The GPU program failed to " - "execute. This is often caused by a launch failure of the kernel " - "on the GPU, which can be caused by multiple reasons."; - case CUSOLVER_STATUS_INTERNAL_ERROR: - return "`CUSOLVER_STATUS_INTERNAL_ERROR`. An internal cuSolver operation " - "failed. This error is usually caused by a cudaMemcpyAsync() " - "failure."; - case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: - return "`CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED`. The matrix type is " - "not supported by this function. This is usually caused by " - "passing an invalid matrix descriptor to the function."; - default: - return "Unknown cusolver status"; - } -} - inline std::string build_nvidia_error_msg(cusolverStatus_t stat) { - std::string msg(" Cublas error, "); - return msg + cusolverGetErrorString(stat) + " "; + std::ostringstream sout; + sout << "CUSOLVER error(" << stat << "). " << GetExternalErrorMsg(stat); + return sout.str(); } -/****** NCCL ERROR ******/ +/**************** NCCL ERROR ****************/ #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) inline bool is_error(ncclResult_t nccl_result) { return nccl_result != ncclSuccess; } inline std::string build_nvidia_error_msg(ncclResult_t nccl_result) { - std::string msg(" Nccl error, "); + std::ostringstream sout; + sout << "NCCL error(" << nccl_result << "), " + << platform::dynload::ncclGetErrorString(nccl_result) << ". "; if (errno == ENOSPC || errno == EAGAIN) { std::string detail(strerror(errno)); detail += "\nPlease try one of the following solutions:"; @@ -947,42 +910,19 @@ inline std::string build_nvidia_error_msg(ncclResult_t nccl_result) { "\n3. Increase shared memory by setting the -shm-size " "option when starting docker container, e.g., setting " " -shm-size=2g.\n"; - return msg + platform::dynload::ncclGetErrorString(nccl_result) + - ", detail: " + detail + " "; + sout << " Detail: " + detail; } - return msg + platform::dynload::ncclGetErrorString(nccl_result) + " "; + sout << GetExternalErrorMsg(nccl_result); + return sout.str(); } #endif // not(__APPLE__) and PADDLE_WITH_NCCL -namespace details { - -template -struct CudaStatusType {}; - -#define DEFINE_CUDA_STATUS_TYPE(type, success_value) \ - template <> \ - struct CudaStatusType { \ - using Type = type; \ - static constexpr Type kSuccess = success_value; \ - } - -DEFINE_CUDA_STATUS_TYPE(cudaError_t, cudaSuccess); -DEFINE_CUDA_STATUS_TYPE(curandStatus_t, CURAND_STATUS_SUCCESS); -DEFINE_CUDA_STATUS_TYPE(cudnnStatus_t, CUDNN_STATUS_SUCCESS); -DEFINE_CUDA_STATUS_TYPE(cublasStatus_t, CUBLAS_STATUS_SUCCESS); -DEFINE_CUDA_STATUS_TYPE(cusolverStatus_t, CUSOLVER_STATUS_SUCCESS); - -#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) -DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess); -#endif -} // namespace details - #define PADDLE_ENFORCE_CUDA_SUCCESS(COND) \ do { \ auto __cond__ = (COND); \ using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ constexpr auto __success_type__ = \ - ::paddle::platform::details::CudaStatusType< \ + ::paddle::platform::details::ExternalApiType< \ __CUDA_STATUS_TYPE__>::kSuccess; \ if (UNLIKELY(__cond__ != __success_type__)) { \ auto __summary__ = ::paddle::platform::errors::External( \ @@ -1023,7 +963,7 @@ inline void retry_sleep(unsigned milliseconds) { int retry_count = 1; \ using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ constexpr auto __success_type__ = \ - ::paddle::platform::details::CudaStatusType< \ + ::paddle::platform::details::ExternalApiType< \ __CUDA_STATUS_TYPE__>::kSuccess; \ while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \ retry_sleep(FLAGS_gpu_allocator_retry_time); \ @@ -1037,10 +977,11 @@ inline void retry_sleep(unsigned milliseconds) { } \ } while (0) -#undef DEFINE_CUDA_STATUS_TYPE +#undef DEFINE_EXTERNAL_API_TYPE #endif // PADDLE_WITH_CUDA -/** HIP PADDLE ENFORCE FUNCTIONS AND MACROS **/ +/**************************************************************************/ +/***************************** HIP ERROR **********************************/ #ifdef PADDLE_WITH_HIP /***** HIP ERROR *****/ @@ -1052,7 +993,7 @@ inline std::string build_rocm_error_msg(hipError_t e) { return sout.str(); } -/** HIPRAND ERROR **/ +/***** HIPRAND ERROR *****/ inline bool is_error(hiprandStatus_t stat) { return stat != HIPRAND_STATUS_SUCCESS; } @@ -1153,22 +1094,22 @@ inline std::string build_rocm_error_msg(ncclResult_t nccl_result) { namespace details { template -struct CudaStatusType {}; +struct ExternalApiType {}; -#define DEFINE_CUDA_STATUS_TYPE(type, success_value) \ - template <> \ - struct CudaStatusType { \ - using Type = type; \ - static constexpr Type kSuccess = success_value; \ +#define DEFINE_EXTERNAL_API_TYPE(type, success_value) \ + template <> \ + struct ExternalApiType { \ + using Type = type; \ + static constexpr Type kSuccess = success_value; \ } -DEFINE_CUDA_STATUS_TYPE(hipError_t, hipSuccess); -DEFINE_CUDA_STATUS_TYPE(hiprandStatus_t, HIPRAND_STATUS_SUCCESS); -DEFINE_CUDA_STATUS_TYPE(miopenStatus_t, miopenStatusSuccess); -DEFINE_CUDA_STATUS_TYPE(rocblas_status, rocblas_status_success); +DEFINE_EXTERNAL_API_TYPE(hipError_t, hipSuccess); +DEFINE_EXTERNAL_API_TYPE(hiprandStatus_t, HIPRAND_STATUS_SUCCESS); +DEFINE_EXTERNAL_API_TYPE(miopenStatus_t, miopenStatusSuccess); +DEFINE_EXTERNAL_API_TYPE(rocblas_status, rocblas_status_success); #if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) -DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess); +DEFINE_EXTERNAL_API_TYPE(ncclResult_t, ncclSuccess); #endif } // namespace details @@ -1178,7 +1119,7 @@ DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess); auto __cond__ = (COND); \ using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ constexpr auto __success_type__ = \ - ::paddle::platform::details::CudaStatusType< \ + ::paddle::platform::details::ExternalApiType< \ __CUDA_STATUS_TYPE__>::kSuccess; \ if (UNLIKELY(__cond__ != __success_type__)) { \ auto __summary__ = ::paddle::platform::errors::External( \ @@ -1201,7 +1142,7 @@ inline void retry_sleep(unsigned millisecond) { int retry_count = 1; \ using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ constexpr auto __success_type__ = \ - ::paddle::platform::details::CudaStatusType< \ + ::paddle::platform::details::ExternalApiType< \ __CUDA_STATUS_TYPE__>::kSuccess; \ while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \ retry_sleep(FLAGS_gpu_allocator_retry_time); \ @@ -1215,7 +1156,7 @@ inline void retry_sleep(unsigned millisecond) { } \ } while (0) -#undef DEFINE_CUDA_STATUS_TYPE +#undef DEFINE_EXTERNAL_API_TYPE #endif // PADDLE_WITH_HIP #ifdef PADDLE_WITH_ASCEND_CL diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc index 39f3d3f00c9..842d4cc1392 100644 --- a/paddle/fluid/platform/enforce_test.cc +++ b/paddle/fluid/platform/enforce_test.cc @@ -304,6 +304,7 @@ bool CheckCudaStatusFailure(T value, const std::string& msg) { return false; } catch (paddle::platform::EnforceNotMet& error) { std::string ex_msg = error.what(); + std::cout << ex_msg << std::endl; return ex_msg.find(msg) != std::string::npos; } } @@ -338,30 +339,98 @@ TEST(enforce, hip_success) { #else TEST(enforce, cuda_success) { EXPECT_TRUE(CheckCudaStatusSuccess(cudaSuccess)); - EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorInvalidValue, "Cuda error")); - EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorMemoryAllocation, "Cuda error")); + EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorInvalidValue, "CUDA error")); + + EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorMemoryAllocation, "CUDA error")); + + EXPECT_TRUE(CheckCudaStatusFailure( + cudaErrorInsufficientDriver, + "This indicates that the installed NVIDIA CUDA driver is older than the " + "CUDA runtime library. This is not a supported configuration.Users " + "should install an updated NVIDIA display driver to allow the " + "application to run")); + EXPECT_TRUE(CheckCudaStatusFailure( + cudaErrorContextIsDestroyed, + "This error indicates that the context current to the calling thread has " + "been destroyed using cuCtxDestroy, or is a primary context which has " + "not yet been initialized")); EXPECT_TRUE(CheckCudaStatusSuccess(CURAND_STATUS_SUCCESS)); EXPECT_TRUE( - CheckCudaStatusFailure(CURAND_STATUS_VERSION_MISMATCH, "Curand error")); + CheckCudaStatusFailure(CURAND_STATUS_VERSION_MISMATCH, "CURAND error")); EXPECT_TRUE( - CheckCudaStatusFailure(CURAND_STATUS_NOT_INITIALIZED, "Curand error")); + CheckCudaStatusFailure(CURAND_STATUS_NOT_INITIALIZED, "CURAND error")); + EXPECT_TRUE(CheckCudaStatusFailure( + CURAND_STATUS_ARCH_MISMATCH, + "Architecture mismatch, GPU does not support requested feature")); + EXPECT_TRUE( + CheckCudaStatusFailure(CURAND_STATUS_LENGTH_NOT_MULTIPLE, + "Length requested is not a multple of dimension")); EXPECT_TRUE(CheckCudaStatusSuccess(CUDNN_STATUS_SUCCESS)); EXPECT_TRUE( - CheckCudaStatusFailure(CUDNN_STATUS_NOT_INITIALIZED, "Cudnn error")); - EXPECT_TRUE(CheckCudaStatusFailure(CUDNN_STATUS_ALLOC_FAILED, "Cudnn error")); + CheckCudaStatusFailure(CUDNN_STATUS_NOT_INITIALIZED, "CUDNN error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUDNN_STATUS_ALLOC_FAILED, "CUDNN error")); + EXPECT_TRUE(CheckCudaStatusFailure( + CUDNN_STATUS_BAD_PARAM, + "An incorrect value or parameter was passed to the function. To correct, " + "ensure that all the parameters being passed have valid values")); + EXPECT_TRUE(CheckCudaStatusFailure( + CUDNN_STATUS_LICENSE_ERROR, + "The functionality requested requires some license and an error was " + "detected when trying to check the current licensing. This error can " + "happen if the license is not present or is expired or if the " + "environment variable NVIDIA_LICENSE_FILE is not set properly")); EXPECT_TRUE(CheckCudaStatusSuccess(CUBLAS_STATUS_SUCCESS)); EXPECT_TRUE( - CheckCudaStatusFailure(CUBLAS_STATUS_NOT_INITIALIZED, "Cublas error")); + CheckCudaStatusFailure(CUBLAS_STATUS_NOT_INITIALIZED, "CUBLAS error")); + EXPECT_TRUE( + CheckCudaStatusFailure(CUBLAS_STATUS_INVALID_VALUE, "CUBLAS error")); + EXPECT_TRUE(CheckCudaStatusFailure( + CUBLAS_STATUS_EXECUTION_FAILED, + "The GPU program failed to execute. This is often caused by a launch " + "failure of the kernel on the GPU, which can be caused by multiple " + "reasons. To correct: check that the hardware, an appropriate version " + "of the driver, and the cuBLAS library are correctly installed")); + EXPECT_TRUE(CheckCudaStatusFailure( + CUBLAS_STATUS_MAPPING_ERROR, + "An access to GPU memory space failed, which is usually caused by a " + "failure to bind a texture. To correct: prior to the function call, " + "unbind any previously bound textures")); + + EXPECT_TRUE(CheckCudaStatusSuccess(CUSOLVER_STATUS_SUCCESS)); + EXPECT_TRUE(CheckCudaStatusFailure(CUSOLVER_STATUS_NOT_INITIALIZED, + "CUSOLVER error")); EXPECT_TRUE( - CheckCudaStatusFailure(CUBLAS_STATUS_INVALID_VALUE, "Cublas error")); + CheckCudaStatusFailure(CUSOLVER_STATUS_ALLOC_FAILED, "CUSOLVER error")); + EXPECT_TRUE(CheckCudaStatusFailure( + CUSOLVER_STATUS_INTERNAL_ERROR, + "An internal cuSolver operation failed. This error is usually caused by " + "a cudaMemcpyAsync() failure.To correct: check that the hardware, an " + "appropriate version of the driver, and the cuSolver library are " + "correctly installed. Also, check that the memory passed as a parameter " + "to the routine is not being deallocated prior to the routine’s " + "completion")); + EXPECT_TRUE(CheckCudaStatusFailure( + CUSOLVER_STATUS_INVALID_VALUE, + "An unsupported value or parameter was passed to the function (a " + "negative vector size, for example).To correct: ensure that all the " + "parameters being passed have valid values")); + /* #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess)); - EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError, "Nccl error")); - EXPECT_TRUE(CheckCudaStatusFailure(ncclSystemError, "Nccl error")); + EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError, "NCCL error")); + EXPECT_TRUE(CheckCudaStatusFailure(ncclSystemError, "NCCL error")); + EXPECT_TRUE(CheckCudaStatusFailure(ncclInternalError, + "An internal check failed. This is either " + "a bug in NCCL or due to memory " + "corruption")); + EXPECT_TRUE(CheckCudaStatusFailure(ncclInvalidUsage, + "The call to NCCL is incorrect. This is " + "usually reflecting a programming error")); #endif +*/ } #endif #endif diff --git a/paddle/fluid/platform/cuda_error.proto b/paddle/fluid/platform/external_error.proto similarity index 58% rename from paddle/fluid/platform/cuda_error.proto rename to paddle/fluid/platform/external_error.proto index b55e0af81ee..2094de7e10f 100644 --- a/paddle/fluid/platform/cuda_error.proto +++ b/paddle/fluid/platform/external_error.proto @@ -15,21 +15,32 @@ limitations under the License. */ syntax = "proto2"; package paddle.platform.proto; +// (NOTE:zhouwei): ApiType describes which kind of external third party API +// More external third party API can be added. +enum ApiType { + CUDA = 0; + CURAND = 1; + CUDNN = 2; + CUBLAS = 3; + CUSOLVER = 4; + NCCL = 5; +} + message MessageDesc { - // Indicates the type of error - required int32 errorCode = 1; + // Indicates the code of error + required int32 code = 1; // Indicates the message of error - required string errorMessage = 2; + required string message = 2; } message AllMessageDesc { - // Version of cuda API - required int32 version = 1; + // Indicates which kind of third-party API + required ApiType type = 1; // Error messages of different errortype - repeated MessageDesc Messages = 2; + repeated MessageDesc messages = 2; } -message cudaerrorDesc { - // Error messages of different cuda versions(9.0/10.0/10.2) - repeated AllMessageDesc AllMessages = 2; +message ExternalErrorDesc { + // Error messages of different kind of external third party API + repeated AllMessageDesc errors = 1; } \ No newline at end of file diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index dd8146aa3a1..8c323490cc9 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -76,6 +76,7 @@ if not defined PYTHON_ROOT set PYTHON_ROOT=C:\Python37 rem -------set cache build directory----------- rmdir build\python /s/q +rmdir build\paddle\third_party\externalError /s/q rmdir build\paddle\fluid\pybind /s/q rmdir build\paddle_install_dir /s/q rmdir build\paddle_inference_install_dir /s/q @@ -506,7 +507,6 @@ echo ======================================== echo Step 4. Running unit tests ... echo ======================================== - : set CI_SKIP_CPP_TEST if only *.py changed git diff --name-only %BRANCH% | findstr /V "\.py" || set CI_SKIP_CPP_TEST=ON diff --git a/python/setup.py.in b/python/setup.py.in index 79c67182f9c..3fbe796a813 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -412,7 +412,8 @@ if '${WITH_MKLDNN}' == 'ON': headers += list(find_files('*', '${MKLDNN_INSTALL_DIR}/include')) # mkldnn if '${WITH_GPU}' == 'ON' or '${WITH_ROCM}' == 'ON': - headers += list(find_files('*.pb', '${cudaerror_INCLUDE_DIR}')) # errorMessage.pb for errormessage + # externalErrorMsg.pb for External Error message + headers += list(find_files('*.pb', '${externalError_INCLUDE_DIR}')) class InstallCommand(InstallCommandBase): def finalize_options(self): diff --git a/tools/cudaError/README.md b/tools/cudaError/README.md deleted file mode 100644 index df7434c33a9..00000000000 --- a/tools/cudaError/README.md +++ /dev/null @@ -1,22 +0,0 @@ -Usage: - -Please run: -``` -bash start.sh -``` - -The error message of CUDA9.0 / CUDA10.0 / CUDA-latest-version will be crawled by default. - -If you want to crawl a specified version of CUDA, Please run: -``` -bash start.sh -``` -URL can be derived by default, so you don't have to enter a URL. - -for example: -``` -bash start.sh 11.0 -``` -will capture error message of CUDA11.0(in future). - -Every time when Nvidia upgrade the CUDA major version, you need to run `bash start.sh` in current directory, and upload cudaErrorMessage.tar.gz to https://paddlepaddledeps.bj.bcebos.com/cudaErrorMessage.tar.gz diff --git a/tools/cudaError/spider.py b/tools/cudaError/spider.py deleted file mode 100644 index c2c3dc97f42..00000000000 --- a/tools/cudaError/spider.py +++ /dev/null @@ -1,124 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import ssl -import re -import urllib2 -import json -import collections -import sys, getopt -import cuda_error_pb2 - - -def parsing(cuda_errorDesc, version, url): - All_Messages = cuda_errorDesc.AllMessages.add() - All_Messages.version = int(version) - - ssl._create_default_https_context = ssl._create_unverified_context - html = urllib2.urlopen(url).read() - res_div = r'
.*?

CUDA error types

.*?
.*?
(.*?)
' - m_div = re.findall(res_div, html, re.S | re.M) - - url_list = url.split('/') - url_prefix = '/'.join(url_list[0:url_list.index('cuda-runtime-api') + 1]) - - dic = collections.OrderedDict() - dic_message = collections.OrderedDict() - for line in m_div: - res_dt = r'
(.*?)
.*?
(.*?)
' - m_dt = re.findall(res_dt, line, re.S | re.M) - for error in m_dt: - res_type = r'(.*?)' - m_type = re.findall(res_type, error[0], re.S | re.M)[0] - m_message = error[1] - m_message = m_message.replace('\n', '') - res_a = r'()' - res_shape = r'(.*?)' - list_a = re.findall(res_a, m_message, re.S | re.M) - list_shape = re.findall(res_shape, m_message, re.S | re.M) - assert len(list_a) == len(list_shape) - for idx in range(len(list_a)): - m_message = m_message.replace(list_a[idx], list_shape[idx]) - - m_message = m_message.replace( - '
Deprecated
', '') - - res_span = r'()' - res_span_detail = r'(.*?)' - list_span = re.findall(res_span, m_message, re.S | re.M) - list_span_detail = re.findall(res_span_detail, m_message, re.S | - re.M) - assert len(list_span) == len(list_span_detail) - for idx in range(len(list_span)): - m_message = m_message.replace(list_span[idx], - list_span_detail[idx]) - - res_p = r'(

.*?

)' - res_p_detail = r'

(.*?)

' - list_p = re.findall(res_p, m_message, re.S | re.M) - list_p_detail = re.findall(res_p_detail, m_message, re.S | re.M) - assert len(list_p) == len(list_p_detail) - for idx in range(len(list_p)): - m_message = m_message.replace(list_p[idx], list_p_detail[idx]) - - m_message = m_message.replace(' ', '') - _Messages = All_Messages.Messages.add() - try: - _Messages.errorCode = int(m_type) - except ValueError: - if re.match('0x', m_type): - _Messages.errorCode = int(m_type, 16) - else: - raise ValueError - _Messages.errorMessage = m_message # save for cudaErrorMessage.pb from python-protobuf interface - - -def main(argv): - version = [] - url = [] - try: - opts, args = getopt.getopt(argv, "hv:u:", ["help", "version=", "url="]) - except getopt.GetoptError: - print 'python spider.py -v -u ' - sys.exit(2) - for opt, arg in opts: - if opt in ("-h", "--help"): - print 'python spider.py -v -u ' - sys.exit() - elif opt in ("-v", "--version"): - version = arg - elif opt in ("-u", "--url"): - url = arg - version = version.split(',') - url = url.split(',') - assert len(version) == len(url) - cuda_errorDesc = cuda_error_pb2.cudaerrorDesc() - for idx in range(len(version)): - if version[idx] == "-1": - print("crawling errorMessage for CUDA%s from %s" % - ("-latest-version", url[idx])) - else: - print("crawling errorMessage for CUDA%s from %s" % - (version[idx], url[idx])) - parsing(cuda_errorDesc, version[idx], url[idx]) - - serializeToString = cuda_errorDesc.SerializeToString() - with open("cudaErrorMessage.pb", "wb") as f: - f.write(serializeToString - ) # save for cudaErrorMessage.pb from python-protobuf interface - print("crawling errorMessage for CUDA has been done!!!") - - -if __name__ == "__main__": - main(sys.argv[1:]) diff --git a/tools/externalError/README.md b/tools/externalError/README.md new file mode 100644 index 00000000000..029efd8cb94 --- /dev/null +++ b/tools/externalError/README.md @@ -0,0 +1,9 @@ +Usage: + +Please run: +``` +bash start.sh +``` + +If you want to update all external error message, you need to run command `bash start.sh` in current directory, +and upload the generated file `externalErrorMsg.tar.gz` to https://paddlepaddledeps.bj.bcebos.com/externalErrorMsg.tar.gz diff --git a/tools/externalError/spider.py b/tools/externalError/spider.py new file mode 100644 index 00000000000..a74d82f40eb --- /dev/null +++ b/tools/externalError/spider.py @@ -0,0 +1,363 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import ssl +import re +import urllib.request +import json +import collections +import sys, getopt +import external_error_pb2 + + +def parsing(externalErrorDesc): + #*********************************************************************************************# + #*********************************** CUDA Error Message **************************************# + print("start crawling errorMessage for nvidia CUDA API--->") + url = 'https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038' + + allMessageDesc = externalErrorDesc.errors.add() + allMessageDesc.type = external_error_pb2.CUDA + + ssl._create_default_https_context = ssl._create_unverified_context + html = urllib.request.urlopen(url).read().decode('utf-8') + res_div = r'
.*?

CUDA error types

.*?
.*?
(.*?)
' + m_div = re.findall(res_div, html, re.S | re.M)[0] + + res_dt = r'
(.*?)
.*?
(.*?)
' + m_dt = re.findall(res_dt, m_div, re.S | re.M) + for error in m_dt: + res_type = r'(.*?) = (.*?)' + m_type = re.findall(res_type, error[0], re.S | re.M)[0] + m_message = error[1] + m_message = m_message.replace('\n', '') + res_a = r'()' + res_shape = r'(.*?)' + list_a = re.findall(res_a, m_message, re.S | re.M) + list_shape = re.findall(res_shape, m_message, re.S | re.M) + assert len(list_a) == len(list_shape) + for idx in range(len(list_a)): + m_message = m_message.replace(list_a[idx], list_shape[idx]) + + m_message = m_message.replace( + '
Deprecated
', '') + + res_span = r'()' + res_span_detail = r'(.*?)' + list_span = re.findall(res_span, m_message, re.S | re.M) + list_span_detail = re.findall(res_span_detail, m_message, re.S | re.M) + assert len(list_span) == len(list_span_detail) + for idx in range(len(list_span)): + m_message = m_message.replace(list_span[idx], list_span_detail[idx]) + + res_p = r'(

.*?

)' + res_p_detail = r'

(.*?)

' + list_p = re.findall(res_p, m_message, re.S | re.M) + list_p_detail = re.findall(res_p_detail, m_message, re.S | re.M) + assert len(list_p) == len(list_p_detail) + for idx in range(len(list_p)): + m_message = m_message.replace(list_p[idx], list_p_detail[idx]) + + m_message = m_message.replace(' ', '') + _Messages = allMessageDesc.messages.add() + try: + _Messages.code = int(m_type[1]) + except ValueError: + if re.match('0x', m_type[1]): + _Messages.code = int(m_type[1], 16) + else: + raise ValueError + _Messages.message = "'%s'. %s" % (m_type[0], m_message) + print("End crawling errorMessage for nvidia CUDA API!\n") + + #***********************************************************************************************# + #*********************************** CURAND Error Message **************************************# + print("start crawling errorMessage for nvidia CURAND API--->") + url = 'https://docs.nvidia.com/cuda/curand/group__HOST.html#group__HOST_1gb94a31d5c165858c96b6c18b70644437' + + allMessageDesc = externalErrorDesc.errors.add() + allMessageDesc.type = external_error_pb2.CURAND + + html = urllib.request.urlopen(url).read().decode('utf-8') + + res_div = r'
.*?

CURAND function call status types

.*?
.*?
(.*?)
' + m_div = re.findall(res_div, html, re.S | re.M)[0] + + res_dt = r'
(.*?)
.*?
(.*?)
' + m_dt = re.findall(res_dt, m_div, re.S | re.M) + for error in m_dt: + res_type = r'(.*?) = (.*?)' + m_type = re.findall(res_type, error[0], re.S | re.M)[0] + m_message = error[1] + + _Messages = allMessageDesc.messages.add() + try: + _Messages.code = int(m_type[1]) + except ValueError: + if re.match('0x', m_type[1]): + _Messages.code = int(m_type[1], 16) + else: + raise ValueError + _Messages.message = "'%s'. %s" % (m_type[0], m_message) + print("End crawling errorMessage for nvidia CURAND API!\n") + + #**************************************************************************************************# + #*********************************** CUDNN Error Message ******************************************# + cudnnStatus_t = { + "CUDNN_STATUS_SUCCESS": 0, + "CUDNN_STATUS_NOT_INITIALIZED": 1, + "CUDNN_STATUS_ALLOC_FAILED": 2, + "CUDNN_STATUS_BAD_PARAM": 3, + "CUDNN_STATUS_INTERNAL_ERROR": 4, + "CUDNN_STATUS_INVALID_VALUE": 5, + "CUDNN_STATUS_ARCH_MISMATCH": 6, + "CUDNN_STATUS_MAPPING_ERROR": 7, + "CUDNN_STATUS_EXECUTION_FAILED": 8, + "CUDNN_STATUS_NOT_SUPPORTED": 9, + "CUDNN_STATUS_LICENSE_ERROR": 10, + "CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING": 11, + "CUDNN_STATUS_RUNTIME_IN_PROGRESS": 12, + "CUDNN_STATUS_RUNTIME_FP_OVERFLOW": 13, + } + + print("start crawling errorMessage for nvidia CUDNN API--->") + url = 'https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnStatus_t' + + allMessageDesc = externalErrorDesc.errors.add() + allMessageDesc.type = external_error_pb2.CUDNN + + html = urllib.request.urlopen(url).read().decode('utf-8') + f = open('1.txt', 'w') + f.write(html) + + res_div = r'' + m_div = re.findall(res_div, html, re.S | re.M)[0] + + res_dt = r'
(.*?)
.*?
(.*?)
' + m_dt = re.findall(res_dt, m_div, re.S | re.M) + for error in m_dt: + m_message = error[1] + + res_class = r'

.*?

' + res_class_detail = r'

(.*?)

' + list_class = re.findall(res_class, m_message, re.S | re.M) + list_class_detail = re.findall(res_class_detail, m_message, re.S | re.M) + assert len(list_class) == len(list_class_detail) + for idx in range(len(list_class)): + m_message = m_message.replace(list_class[idx], + list_class_detail[idx]) + + res_a = r'(
)' + res_shape = r'(.*?)' + list_a = re.findall(res_a, m_message, re.S | re.M) + list_shape = re.findall(res_shape, m_message, re.S | re.M) + assert len(list_a) == len(list_shape) + for idx in range(len(list_a)): + m_message = m_message.replace(list_a[idx], list_shape[idx]) + + res_span = r'(.*?)' + res_span_detail = r'(.*?)' + list_span = re.findall(res_span, m_message, re.S | re.M) + list_span_detail = re.findall(res_span_detail, m_message, re.S | re.M) + assert len(list_span) == len(list_span_detail) + for idx in range(len(list_span)): + m_message = m_message.replace(list_span[idx], list_span_detail[idx]) + + res_samp = r'(.*?)' + res_samp_detail = r'(.*?)' + list_samp = re.findall(res_samp, m_message, re.S | re.M) + list_samp_detail = re.findall(res_samp_detail, m_message, re.S | re.M) + assert len(list_samp) == len(list_samp_detail) + for idx in range(len(list_samp)): + m_message = m_message.replace(list_samp[idx], list_samp_detail[idx]) + + m_message = re.sub(r'\n +', ' ', m_message) + + _Messages = allMessageDesc.messages.add() + _Messages.code = int(cudnnStatus_t[error[0]]) + _Messages.message = "'%s'. %s" % (error[0], m_message) + print("End crawling errorMessage for nvidia CUDNN API!\n") + + #*************************************************************************************************# + #*********************************** CUBLAS Error Message ****************************************# + cublasStatus_t = { + "CUBLAS_STATUS_SUCCESS": 0, + "CUBLAS_STATUS_NOT_INITIALIZED": 1, + "CUBLAS_STATUS_ALLOC_FAILED": 3, + "CUBLAS_STATUS_INVALID_VALUE": 7, + "CUBLAS_STATUS_ARCH_MISMATCH": 8, + "CUBLAS_STATUS_MAPPING_ERROR": 11, + "CUBLAS_STATUS_EXECUTION_FAILED": 13, + "CUBLAS_STATUS_INTERNAL_ERROR": 14, + "CUBLAS_STATUS_NOT_SUPPORTED": 15, + "CUBLAS_STATUS_LICENSE_ERROR": 16 + } + + print("start crawling errorMessage for nvidia CUBLAS API--->") + url = 'https://docs.nvidia.com/cuda/cublas/index.html#cublasstatus_t' + + allMessageDesc = externalErrorDesc.errors.add() + allMessageDesc.type = external_error_pb2.CUBLAS + + html = urllib.request.urlopen(url).read().decode('utf-8') + + res_div = r'

The type is used for function status returns. All cuBLAS library.*?

(.*?)
' + m_div = re.findall(res_div, html, re.S | re.M)[0] + + res_dt = r'

(.*?)

.*?colspan="1">(.*?)' + m_dt = re.findall(res_dt, m_div, re.S | re.M) + + for error in m_dt: + m_message = error[1] + m_message = re.sub(r'\n +', ' ', m_message) + + res_p = r'

.*?

' + res_p_detail = r'

(.*?)

' + list_p = re.findall(res_p, m_message, re.S | re.M) + list_p_detail = re.findall(res_p_detail, m_message, re.S | re.M) + assert len(list_p) == len(list_p_detail) + for idx in range(len(list_p)): + m_message = m_message.replace(list_p[idx], list_p_detail[idx]) + + res_samp = r'.*?' + res_samp_detail = r'(.*?)' + list_samp = re.findall(res_samp, m_message, re.S | re.M) + list_samp_detail = re.findall(res_samp_detail, m_message, re.S | re.M) + assert len(list_samp) == len(list_samp_detail) + for idx in range(len(list_samp)): + m_message = m_message.replace(list_samp[idx], list_samp_detail[idx]) + + _Messages = allMessageDesc.messages.add() + _Messages.code = int(cublasStatus_t[error[0]]) + _Messages.message = "'%s'. %s" % (error[0], m_message) + print("End crawling errorMessage for nvidia CUBLAS API!\n") + + #*************************************************************************************************# + #*********************************** CUSOLVER Error Message **************************************# + cusolverStatus_t = { + "CUSOLVER_STATUS_SUCCESS": 0, + "CUSOLVER_STATUS_NOT_INITIALIZED": 1, + "CUSOLVER_STATUS_ALLOC_FAILED": 2, + "CUSOLVER_STATUS_INVALID_VALUE": 3, + "CUSOLVER_STATUS_ARCH_MISMATCH": 4, + "CUSOLVER_STATUS_MAPPING_ERROR": 5, + "CUSOLVER_STATUS_EXECUTION_FAILED": 6, + "CUSOLVER_STATUS_INTERNAL_ERROR": 7, + "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED": 8, + "CUSOLVER_STATUS_NOT_SUPPORTED": 9, + "CUSOLVER_STATUS_ZERO_PIVOT": 10, + "CUSOLVER_STATUS_INVALID_LICENSE": 11, + "CUSOLVER_STATUS_IRS_PARAMS_NOT_INITIALIZED": 12, + "CUSOLVER_STATUS_IRS_PARAMS_INVALID": 13, + "CUSOLVER_STATUS_IRS_INTERNAL_ERROR": 14, + "CUSOLVER_STATUS_IRS_NOT_SUPPORTED": 15, + "CUSOLVER_STATUS_IRS_OUT_OF_RANGE": 16, + "CUSOLVER_STATUS_IRS_NRHS_NOT_SUPPORTED_FOR_REFINE_GMRES": 17, + "CUSOLVER_STATUS_IRS_INFOS_NOT_INITIALIZED": 18 + } + print("start crawling errorMessage for nvidia CUSOLVER API--->") + url = 'https://docs.nvidia.com/cuda/cusolver/index.html#cuSolverSPstatus' + + allMessageDesc = externalErrorDesc.errors.add() + allMessageDesc.type = external_error_pb2.CUSOLVER + + html = urllib.request.urlopen(url).read().decode('utf-8') + + res_div = r'This is a status type returned by the library functions and.*?
(.*?)
' + m_div = re.findall(res_div, html, re.S | re.M)[0] + + res_dt = r'(.*?).*?colspan="1">(.*?)' + m_dt = re.findall(res_dt, m_div, re.S | re.M) + + for error in m_dt: + m_message = error[1] + m_message = re.sub(r'\n +', '', m_message) + m_message = re.sub(r'

', '', m_message) + + res_p = r'

.*?

' + res_p_detail = r'

(.*?)

' + list_p = re.findall(res_p, m_message, re.S | re.M) + list_p_detail = re.findall(res_p_detail, m_message, re.S | re.M) + assert len(list_p) == len(list_p_detail) + for idx in range(len(list_p)): + m_message = m_message.replace(list_p[idx], list_p_detail[idx]) + + res_samp = r'.*?' + res_samp_detail = r'(.*?)' + list_samp = re.findall(res_samp, m_message, re.S | re.M) + list_samp_detail = re.findall(res_samp_detail, m_message, re.S | re.M) + assert len(list_samp) == len(list_samp_detail) + for idx in range(len(list_samp)): + m_message = m_message.replace(list_samp[idx], list_samp_detail[idx]) + + res_strong = r'.*?' + res_strong_detail = r'(.*?)' + list_strong = re.findall(res_strong, m_message, re.S | re.M) + list_strong_detail = re.findall(res_strong_detail, m_message, re.S | + re.M) + assert len(list_strong) == len(list_strong_detail) + for idx in range(len(list_strong)): + m_message = m_message.replace(list_strong[idx], + list_strong_detail[idx]) + + _Messages = allMessageDesc.messages.add() + _Messages.code = int(cusolverStatus_t[error[0]]) + _Messages.message = "'%s'. %s" % (error[0], m_message) + print("End crawling errorMessage for nvidia CUSOLVER API!\n") + + #**********************************************************************************************# + #*************************************** NCCL error *******************************************# + print("start crawling errorMessage for nvidia NCCL API--->") + url = 'https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/types.html#ncclresult-t' + allMessageDesc = externalErrorDesc.errors.add() + allMessageDesc.type = external_error_pb2.NCCL + html = urllib.request.urlopen(url).read().decode('utf-8') + res_div = r'ncclResult_t(.*?)' + m_div = re.findall(res_div, html, re.S | re.M)[0] + + res_dt = r'(.*?).*?(.*?)\)(.*?)

\n' + m_dt = re.findall(res_dt, m_div, re.S | re.M) + for error in m_dt: + m_message = re.sub(r'\n', '', error[2]) + _Messages = allMessageDesc.messages.add() + _Messages.code = int(error[1]) + _Messages.message = "'%s'. %s" % (error[0], m_message) + print("End crawling errorMessage for nvidia NCCL API!\n") + + +def main(argv): + try: + opts, _ = getopt.getopt(argv, "h", ["help"]) + except getopt.GetoptError: + print('python spider.py') + sys.exit(2) + for opt, _ in opts: + if opt in ("-h", "--help"): + print('python spider.py') + sys.exit(2) + externalErrorDesc = external_error_pb2.ExternalErrorDesc() + parsing(externalErrorDesc) + + serializedString = externalErrorDesc.SerializeToString() + with open("externalErrorMsg.pb", "wb") as f: + # save for externalErrorMsg.pb from Python-protobuf interface + # load from C++-protobuf interface and get error message + f.write(serializedString) + print( + "Generating data file [externalErrorMsg.pb] for external third_party API error has been done!" + ) + + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/tools/cudaError/start.sh b/tools/externalError/start.sh similarity index 59% rename from tools/cudaError/start.sh rename to tools/externalError/start.sh index 66e56b8485d..32ef63c2612 100644 --- a/tools/cudaError/start.sh +++ b/tools/externalError/start.sh @@ -29,19 +29,7 @@ else echo "please run on Mac/Linux" exit 1 fi -protobuf/bin/protoc -I../../paddle/fluid/platform/ --python_out . ../../paddle/fluid/platform/cuda_error.proto +protobuf/bin/protoc -I../../paddle/fluid/platform/ --python_out . ../../paddle/fluid/platform/external_error.proto -version=90,100,-1 # -1 represent the latest cuda-version -url=https://docs.nvidia.com/cuda/archive/9.0/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038,https://docs.nvidia.com/cuda/archive/10.0/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038,https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038 - -if [ "$1" != "" ]; then - version=$version,$(($1*10)) - if [ "$2" != "" ]; then - url=$url,$2 - else - url=$url,https://docs.nvidia.com/cuda/archive/$1/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038 - fi -fi - -python spider.py --version=$version --url=$url -tar czf cudaErrorMessage.tar.gz cudaErrorMessage.pb +python3.7 spider.py +tar czvf externalErrorMsg.tar.gz externalErrorMsg.pb -- GitLab