未验证 提交 b425215a 编写于 作者: Z Zhou Wei 提交者: GitHub

Unify all external API error message mechanism and enhance third-party API error msg (#33003)

* Unify all external API error message mechanism and enhance third-party API error msg

* fix some comment

* fix some comment
上级 e05a7a49
......@@ -146,11 +146,11 @@ copy(inference_lib_dist
SRCS ${THREADPOOL_INCLUDE_DIR}/ThreadPool.h
DSTS ${dst_dir})
# Only GPU need cudaErrorMessage.pb
# GPU must copy externalErrorMsg.pb
IF(WITH_GPU)
set(dst_dir "${PADDLE_INFERENCE_INSTALL_DIR}/third_party/cudaerror/data")
set(dst_dir "${PADDLE_INFERENCE_INSTALL_DIR}/third_party/externalError/data")
copy(inference_lib_dist
SRCS ${cudaerror_INCLUDE_DIR}
SRCS ${externalError_INCLUDE_DIR}
DSTS ${dst_dir})
ENDIF()
......@@ -259,7 +259,7 @@ copy(fluid_lib_dist
set(module "platform")
set(platform_lib_deps profiler_proto error_codes_proto)
if(WITH_GPU)
set(platform_lib_deps ${platform_lib_deps} cuda_error_proto)
set(platform_lib_deps ${platform_lib_deps} external_error_proto)
endif(WITH_GPU)
add_dependencies(fluid_lib_dist ${platform_lib_deps})
......
......@@ -111,10 +111,11 @@ FUNCTION(file_download_and_uncompress URL NAME)
MESSAGE(STATUS "Download dependence[${NAME}] from ${URL}")
SET(${NAME}_INCLUDE_DIR ${THIRD_PARTY_PATH}/${NAME}/data PARENT_SCOPE)
ExternalProject_Add(
extern_download_${NAME}
download_${NAME}
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${THIRD_PARTY_PATH}/${NAME}
URL ${URL}
TIMEOUT 120
DOWNLOAD_DIR ${THIRD_PARTY_PATH}/${NAME}/data/
SOURCE_DIR ${THIRD_PARTY_PATH}/${NAME}/data/
DOWNLOAD_NO_PROGRESS 1
......@@ -123,7 +124,7 @@ FUNCTION(file_download_and_uncompress URL NAME)
UPDATE_COMMAND ""
INSTALL_COMMAND ""
)
set(third_party_deps ${third_party_deps} extern_download_${NAME} PARENT_SCOPE)
set(third_party_deps ${third_party_deps} download_${NAME} PARENT_SCOPE)
ENDFUNCTION()
......@@ -242,8 +243,20 @@ if(WITH_GPU)
include(external/cub) # download cub
list(APPEND third_party_deps extern_cub)
endif()
set(CUDAERROR_URL "http://paddlepaddledeps.bj.bcebos.com/cudaErrorMessage.tar.gz" CACHE STRING "" FORCE)
file_download_and_uncompress(${CUDAERROR_URL} "cudaerror") # download file cudaErrorMessage
set(URL "https://paddlepaddledeps.bj.bcebos.com/externalErrorMsg.tar.gz" CACHE STRING "" FORCE)
file_download_and_uncompress(${URL} "externalError") # download file externalErrorMsg.tar.gz
if(WITH_TESTING)
  # Copy the downloaded externalErrorMsg.pb data directory into the build tree
  # for the unit test 'enforce_test'.
  set(SRC_DIR ${THIRD_PARTY_PATH}/externalError/data)
  # Windows builds that do not use the Ninja generator get a destination under
  # paddle/fluid; every other configuration uses paddle/third_party.
  if(WIN32 AND (NOT "${CMAKE_GENERATOR}" STREQUAL "Ninja"))
    set(DST_DIR ${CMAKE_BINARY_DIR}/paddle/fluid/third_party/externalError/data)
  else()
    set(DST_DIR ${CMAKE_BINARY_DIR}/paddle/third_party/externalError/data)
  endif()
  # Runs after the download target has been built. VERBATIM makes the argument
  # escaping identical on every platform; the quotes keep each path a single
  # argument.
  add_custom_command(TARGET download_externalError POST_BUILD
    COMMAND ${CMAKE_COMMAND} -E copy_directory "${SRC_DIR}" "${DST_DIR}"
    COMMENT "copy_directory from ${SRC_DIR} to ${DST_DIR}"
    VERBATIM)
endif()
endif(WITH_GPU)
if(WITH_XPU)
......
proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto simple_threadpool)
proto_library(error_codes_proto SRCS error_codes.proto)
if(WITH_GPU)
proto_library(cuda_error_proto SRCS cuda_error.proto)
proto_library(external_error_proto SRCS external_error.proto)
endif(WITH_GPU)
if(WITH_XPU)
......@@ -45,7 +45,7 @@ cc_test(errors_test SRCS errors_test.cc DEPS errors enforce)
set(enforce_deps flags errors boost)
if(WITH_GPU)
set(enforce_deps ${enforce_deps} cuda_error_proto)
set(enforce_deps ${enforce_deps} external_error_proto)
endif()
cc_library(enforce INTERFACE SRCS enforce.cc DEPS ${enforce_deps})
cc_library(monitor SRCS monitor.cc)
......
......@@ -34,35 +34,6 @@ DECLARE_bool(cudnn_deterministic);
namespace paddle {
namespace platform {
// Returns the enumerator name of a cuDNN status code as a C string.
// Any status value not listed below yields "Unknown cudnn error number".
inline const char* cudnnGetErrorString(cudnnStatus_t status) {
// Expands to a `case` label that returns the enumerator's own spelling.
#define PADDLE_CUDNN_STATUS_NAME(enumerator) \
  case enumerator:                           \
    return #enumerator

  switch (status) {
    PADDLE_CUDNN_STATUS_NAME(CUDNN_STATUS_SUCCESS);
    PADDLE_CUDNN_STATUS_NAME(CUDNN_STATUS_NOT_INITIALIZED);
    PADDLE_CUDNN_STATUS_NAME(CUDNN_STATUS_ALLOC_FAILED);
    PADDLE_CUDNN_STATUS_NAME(CUDNN_STATUS_BAD_PARAM);
    PADDLE_CUDNN_STATUS_NAME(CUDNN_STATUS_INTERNAL_ERROR);
    PADDLE_CUDNN_STATUS_NAME(CUDNN_STATUS_INVALID_VALUE);
    PADDLE_CUDNN_STATUS_NAME(CUDNN_STATUS_ARCH_MISMATCH);
    PADDLE_CUDNN_STATUS_NAME(CUDNN_STATUS_MAPPING_ERROR);
    PADDLE_CUDNN_STATUS_NAME(CUDNN_STATUS_EXECUTION_FAILED);
    PADDLE_CUDNN_STATUS_NAME(CUDNN_STATUS_NOT_SUPPORTED);
    PADDLE_CUDNN_STATUS_NAME(CUDNN_STATUS_LICENSE_ERROR);
    default:
      return "Unknown cudnn error number";
  }

#undef PADDLE_CUDNN_STATUS_NAME
}
#define CUDNN_VERSION_MIN(major, minor, patch) \
(CUDNN_VERSION >= ((major)*1000 + (minor)*100 + (patch)))
......
......@@ -34,7 +34,7 @@ limitations under the License. */
#include <curand.h>
#include <thrust/system/cuda/error.h>
#include <thrust/system_error.h>
#include "paddle/fluid/platform/cuda_error.pb.h"
#include "paddle/fluid/platform/external_error.pb.h"
#endif // PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_HIP
......@@ -682,41 +682,83 @@ struct EOFException : public std::exception {
END_HANDLE_THE_ERROR \
} while (0)
/** CUDA PADDLE ENFORCE FUNCTIONS AND MACROS **/
/**************************************************************************/
/**************************** NVIDIA ERROR ********************************/
#ifdef PADDLE_WITH_CUDA
/***** CUDA ERROR *****/
inline bool is_error(cudaError_t e) { return e != cudaSuccess; }
namespace details {
template <typename T>
struct ExternalApiType {};
inline std::string GetCudaErrorWebsite(int32_t cuda_version) {
std::ostringstream webstr;
webstr << "https://docs.nvidia.com/cuda/";
if (cuda_version != -1) {
double version = cuda_version / 10;
webstr << "archive/" << std::fixed << std::setprecision(1) << version;
#define DEFINE_EXTERNAL_API_TYPE(type, success_value, proto_type) \
template <> \
struct ExternalApiType<type> { \
using Type = type; \
static constexpr Type kSuccess = success_value; \
static constexpr const char* kTypeString = #proto_type; \
static constexpr platform::proto::ApiType kProtoType = \
platform::proto::ApiType::proto_type; \
}
webstr << "/cuda-runtime-api/group__CUDART__TYPES.html"
"#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038";
return webstr.str();
}
inline std::string build_nvidia_error_msg(cudaError_t e) {
#if CUDA_VERSION >= 10000 && CUDA_VERSION < 11000
int32_t cuda_version = 100;
#elif CUDA_VERSION >= 9000
int32_t cuda_version = 90;
#else
int32_t cuda_version = -1;
DEFINE_EXTERNAL_API_TYPE(cudaError_t, cudaSuccess, CUDA);
DEFINE_EXTERNAL_API_TYPE(curandStatus_t, CURAND_STATUS_SUCCESS, CURAND);
DEFINE_EXTERNAL_API_TYPE(cudnnStatus_t, CUDNN_STATUS_SUCCESS, CUDNN);
DEFINE_EXTERNAL_API_TYPE(cublasStatus_t, CUBLAS_STATUS_SUCCESS, CUBLAS);
DEFINE_EXTERNAL_API_TYPE(cusolverStatus_t, CUSOLVER_STATUS_SUCCESS, CUSOLVER);
#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
DEFINE_EXTERNAL_API_TYPE(ncclResult_t, ncclSuccess, NCCL);
#endif
} // namespace details
template <typename T>
inline const char* GetErrorMsgUrl(T status) {
using __CUDA_STATUS_TYPE__ = decltype(status);
platform::proto::ApiType proto_type =
details::ExternalApiType<__CUDA_STATUS_TYPE__>::kProtoType;
switch (proto_type) {
case platform::proto::ApiType::CUDA:
return "https://docs.nvidia.com/cuda/cuda-runtime-api/"
"group__CUDART__TYPES.html#group__CUDART__TYPES_"
"1g3f51e3575c2178246db0a94a430e0038";
break;
case platform::proto::ApiType::CURAND:
return "https://docs.nvidia.com/cuda/curand/"
"group__HOST.html#group__HOST_1gb94a31d5c165858c96b6c18b70644437";
break;
case platform::proto::ApiType::CUDNN:
return "https://docs.nvidia.com/deeplearning/cudnn/api/"
"index.html#cudnnStatus_t";
break;
case platform::proto::ApiType::CUBLAS:
return "https://docs.nvidia.com/cuda/cublas/index.html#cublasstatus_t";
break;
case platform::proto::ApiType::CUSOLVER:
return "https://docs.nvidia.com/cuda/cusolver/"
"index.html#cuSolverSPstatus";
break;
case platform::proto::ApiType::NCCL:
return "https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/"
"types.html#ncclresult-t";
break;
default:
return "Unknown type of External API, can't get error message URL!";
break;
}
}
template <typename T>
inline std::string GetExternalErrorMsg(T status) {
std::ostringstream sout;
sout << " Cuda error(" << e << "), " << cudaGetErrorString(e) << ".";
static platform::proto::cudaerrorDesc cudaerror;
static bool _initSucceed = false;
if (cudaerror.ByteSizeLong() == 0) {
bool _initSucceed = false;
platform::proto::ExternalErrorDesc externalError;
if (externalError.ByteSizeLong() == 0) {
std::string filePath;
#if !defined(_WIN32)
Dl_info info;
if (dladdr(reinterpret_cast<void*>(GetCudaErrorWebsite), &info)) {
if (dladdr(reinterpret_cast<void*>(GetCurrentTraceBackString), &info)) {
std::string strModule(info.dli_fname);
const size_t last_slash_idx = strModule.find_last_of("/");
std::string compare_path = strModule.substr(strModule.length() - 6);
......@@ -724,18 +766,19 @@ inline std::string build_nvidia_error_msg(cudaError_t e) {
strModule.erase(last_slash_idx, std::string::npos);
}
if (compare_path.compare("avx.so") == 0) {
filePath = strModule +
"/../include/third_party/cudaerror/data/cudaErrorMessage.pb";
} else {
filePath =
strModule + "/../../thirl_party/cudaerror/data/cudaErrorMessage.pb";
strModule +
"/../include/third_party/externalError/data/externalErrorMsg.pb";
} else {
filePath = strModule +
"/../../third_party/externalError/data/externalErrorMsg.pb";
}
}
#else
char buf[100];
MEMORY_BASIC_INFORMATION mbi;
HMODULE h_module =
(::VirtualQuery(GetCudaErrorWebsite, &mbi, sizeof(mbi)) != 0)
(::VirtualQuery(GetCurrentTraceBackString, &mbi, sizeof(mbi)) != 0)
? (HMODULE)mbi.AllocationBase
: NULL;
GetModuleFileName(h_module, buf, 100);
......@@ -746,198 +789,118 @@ inline std::string build_nvidia_error_msg(cudaError_t e) {
strModule.erase(last_slash_idx, std::string::npos);
}
if (compare_path.compare("avx.pyd") == 0) {
filePath =
strModule +
"\\..\\include\\third_party\\cudaerror\\data\\cudaErrorMessage.pb";
filePath = strModule +
"\\..\\include\\third_"
"party\\externalerror\\data\\externalErrorMsg.pb";
} else {
filePath =
strModule + "\\..\\third_party\\cudaerror\\data\\cudaErrorMessage.pb";
strModule +
"\\..\\..\\third_party\\externalerror\\data\\externalErrorMsg.pb";
}
#endif
std::ifstream fin(filePath, std::ios::in | std::ios::binary);
_initSucceed = cudaerror.ParseFromIstream(&fin);
_initSucceed = externalError.ParseFromIstream(&fin);
}
using __CUDA_STATUS_TYPE__ = decltype(status);
platform::proto::ApiType proto_type =
details::ExternalApiType<__CUDA_STATUS_TYPE__>::kProtoType;
if (_initSucceed) {
for (int i = 0; i < cudaerror.allmessages_size(); ++i) {
if (cuda_version == cudaerror.allmessages(i).version()) {
for (int j = 0; j < cudaerror.allmessages(i).messages_size(); ++j) {
if (e == cudaerror.allmessages(i).messages(j).errorcode()) {
sout << "\n [Advise: "
<< cudaerror.allmessages(i).messages(j).errormessage() << "]";
for (int i = 0; i < externalError.errors_size(); ++i) {
if (proto_type == externalError.errors(i).type()) {
for (int j = 0; j < externalError.errors(i).messages_size(); ++j) {
if (status == externalError.errors(i).messages(j).code()) {
sout << "\n [Hint: "
<< externalError.errors(i).messages(j).message() << "]";
return sout.str();
}
}
}
}
}
sout << "\n [Advise: Please search for the error code(" << e
<< ") on website( " << GetCudaErrorWebsite(cuda_version)
<< " ) to get Nvidia's official solution about CUDA Error.]";
sout << "\n [Hint: Please search for the error code(" << status
<< ") on website (" << GetErrorMsgUrl(status)
<< ") to get Nvidia's official solution and advice about "
<< details::ExternalApiType<__CUDA_STATUS_TYPE__>::kTypeString
<< " Error.]";
return sout.str();
}
/** curand ERROR **/
inline bool is_error(curandStatus_t stat) {
return stat != CURAND_STATUS_SUCCESS;
template std::string GetExternalErrorMsg<cudaError_t>(cudaError_t);
template std::string GetExternalErrorMsg<curandStatus_t>(curandStatus_t);
template std::string GetExternalErrorMsg<cudnnStatus_t>(cudnnStatus_t);
template std::string GetExternalErrorMsg<cublasStatus_t>(cublasStatus_t);
template std::string GetExternalErrorMsg<cusolverStatus_t>(cusolverStatus_t);
#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
template std::string GetExternalErrorMsg<ncclResult_t>(ncclResult_t);
#endif
/*************** CUDA ERROR ***************/
inline bool is_error(cudaError_t e) { return e != cudaSuccess; }
inline std::string build_nvidia_error_msg(cudaError_t e) {
std::ostringstream sout;
sout << "CUDA error(" << e << "), " << cudaGetErrorString(e) << ". "
<< GetExternalErrorMsg(e);
return sout.str();
}
inline const char* curandGetErrorString(curandStatus_t stat) {
switch (stat) {
case CURAND_STATUS_SUCCESS:
return "`CURAND_STATUS_SUCCESS`. No errors.";
case CURAND_STATUS_VERSION_MISMATCH:
return "`CURAND_STATUS_VERSION_MISMATCH`. Header file and linked library "
"version do not match.";
case CURAND_STATUS_NOT_INITIALIZED:
return "`CURAND_STATUS_NOT_INITIALIZED`. Generator not initialized.";
case CURAND_STATUS_ALLOCATION_FAILED:
return "`CURAND_STATUS_ALLOCATION_FAILED`. Memory allocation failed.";
case CURAND_STATUS_TYPE_ERROR:
return "`CURAND_STATUS_TYPE_ERROR`. Generator is wrong type.";
case CURAND_STATUS_OUT_OF_RANGE:
return "`CURAND_STATUS_OUT_OF_RANGE`. Argument out of range.";
case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
return "`CURAND_STATUS_LENGTH_NOT_MULTIPLE`. Length requested is not a "
"multple of dimension.";
case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
return "`CURAND_STATUS_DOUBLE_PRECISION_REQUIRED`. GPU does not have "
"double precision required by MRG32k3a.";
case CURAND_STATUS_LAUNCH_FAILURE:
return "`CURAND_STATUS_LAUNCH_FAILURE`. Kernel launch failure.";
case CURAND_STATUS_PREEXISTING_FAILURE:
return "`CURAND_STATUS_PREEXISTING_FAILURE`. Preexisting failure on "
"library entry.";
case CURAND_STATUS_INITIALIZATION_FAILED:
return "`CURAND_STATUS_INITIALIZATION_FAILED`. Initialization of CUDA "
"failed.";
case CURAND_STATUS_ARCH_MISMATCH:
return "`CURAND_STATUS_ARCH_MISMATCH`. Architecture mismatch, GPU does "
"not support requested feature.";
case CURAND_STATUS_INTERNAL_ERROR:
return "`CURAND_STATUS_INTERNAL_ERROR`. Internal library error.";
default:
return "Unknown curand status";
}
/*************** CURAND ERROR ***************/
inline bool is_error(curandStatus_t stat) {
return stat != CURAND_STATUS_SUCCESS;
}
inline std::string build_nvidia_error_msg(curandStatus_t stat) {
std::string msg(" Curand error, ");
return msg + curandGetErrorString(stat) + " ";
std::ostringstream sout;
sout << "CURAND error(" << stat << "). " << GetExternalErrorMsg(stat);
return sout.str();
}
/***** CUDNN ERROR *****/
/*************** CUDNN ERROR ***************/
inline bool is_error(cudnnStatus_t stat) {
return stat != CUDNN_STATUS_SUCCESS;
}
inline std::string build_nvidia_error_msg(cudnnStatus_t stat) {
std::string msg(" Cudnn error, ");
return msg + platform::dynload::cudnnGetErrorString(stat) + " ";
std::ostringstream sout;
sout << "CUDNN error(" << stat << "), "
<< platform::dynload::cudnnGetErrorString(stat) << ". "
<< GetExternalErrorMsg(stat);
return sout.str();
}
/***** CUBLAS ERROR *****/
/*************** CUBLAS ERROR ***************/
inline bool is_error(cublasStatus_t stat) {
return stat != CUBLAS_STATUS_SUCCESS;
}
inline const char* cublasGetErrorString(cublasStatus_t stat) {
switch (stat) {
case CUBLAS_STATUS_NOT_INITIALIZED:
return "`CUBLAS_STATUS_NOT_INITIALIZED`. The cuBLAS library was not "
"initialized.";
case CUBLAS_STATUS_ALLOC_FAILED:
return "`CUBLAS_STATUS_ALLOC_FAILED`. Resource allocation failed inside "
"the cuBLAS library.";
case CUBLAS_STATUS_INVALID_VALUE:
return "`CUBLAS_STATUS_INVALID_VALUE`. An unsupported value or parameter "
"was passed to the function (a negative vector size, for "
"example).";
case CUBLAS_STATUS_ARCH_MISMATCH:
return "`CUBLAS_STATUS_ARCH_MISMATCH`. The function requires a feature "
"absent from the device architecture; usually caused by the lack "
"of support for double precision.";
case CUBLAS_STATUS_MAPPING_ERROR:
return "`CUBLAS_STATUS_MAPPING_ERROR`. An access to GPU memory space "
"failed, which is usually caused by a failure to bind a texture.";
case CUBLAS_STATUS_EXECUTION_FAILED:
return "`CUBLAS_STATUS_EXECUTION_FAILED`. The GPU program failed to "
"execute. This is often caused by a launch failure of the kernel "
"on the GPU, which can be caused by multiple reasons.";
case CUBLAS_STATUS_INTERNAL_ERROR:
return "`CUBLAS_STATUS_INTERNAL_ERROR`. An internal cuBLAS operation "
"failed. This error is usually caused by a cudaMemcpyAsync() "
"failure.";
case CUBLAS_STATUS_NOT_SUPPORTED:
return "`CUBLAS_STATUS_NOT_SUPPORTED`. The functionality requested is "
"not supported.";
case CUBLAS_STATUS_LICENSE_ERROR:
return "`CUBLAS_STATUS_LICENSE_ERROR`. The functionality requested "
"requires some license and an error was detected when trying to "
"check the current licensing.";
default:
return "Unknown cublas status";
}
}
inline std::string build_nvidia_error_msg(cublasStatus_t stat) {
std::string msg(" Cublas error, ");
return msg + cublasGetErrorString(stat) + " ";
std::ostringstream sout;
sout << "CUBLAS error(" << stat << "). " << GetExternalErrorMsg(stat);
return sout.str();
}
/***** CUSOLVER ERROR *****/
/*************** CUSOLVER ERROR ***************/
inline bool is_error(cusolverStatus_t stat) {
return stat != CUSOLVER_STATUS_SUCCESS;
}
inline const char* cusolverGetErrorString(cusolverStatus_t stat) {
switch (stat) {
case CUSOLVER_STATUS_NOT_INITIALIZED:
return "`CUSOLVER_STATUS_NOT_INITIALIZED`. The cuSolver library was not "
"initialized. This is usually caused by the lack of a prior call, "
"an error in the CUDA Runtime API called by the cuSolver routine, "
"or an error in the hardware setup.";
case CUSOLVER_STATUS_ALLOC_FAILED:
return "`CUSOLVER_STATUS_ALLOC_FAILED`. Resource allocation failed "
"inside the cuSolver library. This is usually caused by a "
"cudaMalloc() failure.";
case CUSOLVER_STATUS_INVALID_VALUE:
return "`CUSOLVER_STATUS_INVALID_VALUE`. An unsupported value or "
"parameter was passed to the function (a negative vector size, "
"for example).";
case CUSOLVER_STATUS_ARCH_MISMATCH:
return "`CUSOLVER_STATUS_ARCH_MISMATCH`. The function requires a feature "
"absent from the device architecture; usually caused by the lack "
"of support for atomic operations or double precision.";
case CUSOLVER_STATUS_EXECUTION_FAILED:
return "`CUSOLVER_STATUS_EXECUTION_FAILED`. The GPU program failed to "
"execute. This is often caused by a launch failure of the kernel "
"on the GPU, which can be caused by multiple reasons.";
case CUSOLVER_STATUS_INTERNAL_ERROR:
return "`CUSOLVER_STATUS_INTERNAL_ERROR`. An internal cuSolver operation "
"failed. This error is usually caused by a cudaMemcpyAsync() "
"failure.";
case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
return "`CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED`. The matrix type is "
"not supported by this function. This is usually caused by "
"passing an invalid matrix descriptor to the function.";
default:
return "Unknown cusolver status";
}
}
inline std::string build_nvidia_error_msg(cusolverStatus_t stat) {
std::string msg(" Cublas error, ");
return msg + cusolverGetErrorString(stat) + " ";
std::ostringstream sout;
sout << "CUSOLVER error(" << stat << "). " << GetExternalErrorMsg(stat);
return sout.str();
}
/****** NCCL ERROR ******/
/**************** NCCL ERROR ****************/
#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
inline bool is_error(ncclResult_t nccl_result) {
return nccl_result != ncclSuccess;
}
inline std::string build_nvidia_error_msg(ncclResult_t nccl_result) {
std::string msg(" Nccl error, ");
std::ostringstream sout;
sout << "NCCL error(" << nccl_result << "), "
<< platform::dynload::ncclGetErrorString(nccl_result) << ". ";
if (errno == ENOSPC || errno == EAGAIN) {
std::string detail(strerror(errno));
detail += "\nPlease try one of the following solutions:";
......@@ -947,42 +910,19 @@ inline std::string build_nvidia_error_msg(ncclResult_t nccl_result) {
"\n3. Increase shared memory by setting the -shm-size "
"option when starting docker container, e.g., setting "
" -shm-size=2g.\n";
return msg + platform::dynload::ncclGetErrorString(nccl_result) +
", detail: " + detail + " ";
sout << " Detail: " + detail;
}
return msg + platform::dynload::ncclGetErrorString(nccl_result) + " ";
sout << GetExternalErrorMsg(nccl_result);
return sout.str();
}
#endif // not(__APPLE__) and PADDLE_WITH_NCCL
namespace details {
template <typename T>
struct CudaStatusType {};
#define DEFINE_CUDA_STATUS_TYPE(type, success_value) \
template <> \
struct CudaStatusType<type> { \
using Type = type; \
static constexpr Type kSuccess = success_value; \
}
DEFINE_CUDA_STATUS_TYPE(cudaError_t, cudaSuccess);
DEFINE_CUDA_STATUS_TYPE(curandStatus_t, CURAND_STATUS_SUCCESS);
DEFINE_CUDA_STATUS_TYPE(cudnnStatus_t, CUDNN_STATUS_SUCCESS);
DEFINE_CUDA_STATUS_TYPE(cublasStatus_t, CUBLAS_STATUS_SUCCESS);
DEFINE_CUDA_STATUS_TYPE(cusolverStatus_t, CUSOLVER_STATUS_SUCCESS);
#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess);
#endif
} // namespace details
#define PADDLE_ENFORCE_CUDA_SUCCESS(COND) \
do { \
auto __cond__ = (COND); \
using __CUDA_STATUS_TYPE__ = decltype(__cond__); \
constexpr auto __success_type__ = \
::paddle::platform::details::CudaStatusType< \
::paddle::platform::details::ExternalApiType< \
__CUDA_STATUS_TYPE__>::kSuccess; \
if (UNLIKELY(__cond__ != __success_type__)) { \
auto __summary__ = ::paddle::platform::errors::External( \
......@@ -1023,7 +963,7 @@ inline void retry_sleep(unsigned milliseconds) {
int retry_count = 1; \
using __CUDA_STATUS_TYPE__ = decltype(__cond__); \
constexpr auto __success_type__ = \
::paddle::platform::details::CudaStatusType< \
::paddle::platform::details::ExternalApiType< \
__CUDA_STATUS_TYPE__>::kSuccess; \
while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \
retry_sleep(FLAGS_gpu_allocator_retry_time); \
......@@ -1037,10 +977,11 @@ inline void retry_sleep(unsigned milliseconds) {
} \
} while (0)
#undef DEFINE_CUDA_STATUS_TYPE
#undef DEFINE_EXTERNAL_API_TYPE
#endif // PADDLE_WITH_CUDA
/** HIP PADDLE ENFORCE FUNCTIONS AND MACROS **/
/**************************************************************************/
/***************************** HIP ERROR **********************************/
#ifdef PADDLE_WITH_HIP
/***** HIP ERROR *****/
......@@ -1052,7 +993,7 @@ inline std::string build_rocm_error_msg(hipError_t e) {
return sout.str();
}
/** HIPRAND ERROR **/
/***** HIPRAND ERROR *****/
inline bool is_error(hiprandStatus_t stat) {
return stat != HIPRAND_STATUS_SUCCESS;
}
......@@ -1153,22 +1094,22 @@ inline std::string build_rocm_error_msg(ncclResult_t nccl_result) {
namespace details {
template <typename T>
struct CudaStatusType {};
struct ExternalApiType {};
#define DEFINE_CUDA_STATUS_TYPE(type, success_value) \
#define DEFINE_EXTERNAL_API_TYPE(type, success_value) \
template <> \
struct CudaStatusType<type> { \
struct ExternalApiType<type> { \
using Type = type; \
static constexpr Type kSuccess = success_value; \
}
DEFINE_CUDA_STATUS_TYPE(hipError_t, hipSuccess);
DEFINE_CUDA_STATUS_TYPE(hiprandStatus_t, HIPRAND_STATUS_SUCCESS);
DEFINE_CUDA_STATUS_TYPE(miopenStatus_t, miopenStatusSuccess);
DEFINE_CUDA_STATUS_TYPE(rocblas_status, rocblas_status_success);
DEFINE_EXTERNAL_API_TYPE(hipError_t, hipSuccess);
DEFINE_EXTERNAL_API_TYPE(hiprandStatus_t, HIPRAND_STATUS_SUCCESS);
DEFINE_EXTERNAL_API_TYPE(miopenStatus_t, miopenStatusSuccess);
DEFINE_EXTERNAL_API_TYPE(rocblas_status, rocblas_status_success);
#if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL)
DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess);
DEFINE_EXTERNAL_API_TYPE(ncclResult_t, ncclSuccess);
#endif
} // namespace details
......@@ -1178,7 +1119,7 @@ DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess);
auto __cond__ = (COND); \
using __CUDA_STATUS_TYPE__ = decltype(__cond__); \
constexpr auto __success_type__ = \
::paddle::platform::details::CudaStatusType< \
::paddle::platform::details::ExternalApiType< \
__CUDA_STATUS_TYPE__>::kSuccess; \
if (UNLIKELY(__cond__ != __success_type__)) { \
auto __summary__ = ::paddle::platform::errors::External( \
......@@ -1201,7 +1142,7 @@ inline void retry_sleep(unsigned millisecond) {
int retry_count = 1; \
using __CUDA_STATUS_TYPE__ = decltype(__cond__); \
constexpr auto __success_type__ = \
::paddle::platform::details::CudaStatusType< \
::paddle::platform::details::ExternalApiType< \
__CUDA_STATUS_TYPE__>::kSuccess; \
while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \
retry_sleep(FLAGS_gpu_allocator_retry_time); \
......@@ -1215,7 +1156,7 @@ inline void retry_sleep(unsigned millisecond) {
} \
} while (0)
#undef DEFINE_CUDA_STATUS_TYPE
#undef DEFINE_EXTERNAL_API_TYPE
#endif // PADDLE_WITH_HIP
#ifdef PADDLE_WITH_ASCEND_CL
......
......@@ -304,6 +304,7 @@ bool CheckCudaStatusFailure(T value, const std::string& msg) {
return false;
} catch (paddle::platform::EnforceNotMet& error) {
std::string ex_msg = error.what();
std::cout << ex_msg << std::endl;
return ex_msg.find(msg) != std::string::npos;
}
}
......@@ -338,30 +339,98 @@ TEST(enforce, hip_success) {
#else
TEST(enforce, cuda_success) {
EXPECT_TRUE(CheckCudaStatusSuccess(cudaSuccess));
EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorInvalidValue, "Cuda error"));
EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorMemoryAllocation, "Cuda error"));
EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorInvalidValue, "CUDA error"));
EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorMemoryAllocation, "CUDA error"));
EXPECT_TRUE(CheckCudaStatusFailure(
cudaErrorInsufficientDriver,
"This indicates that the installed NVIDIA CUDA driver is older than the "
"CUDA runtime library. This is not a supported configuration.Users "
"should install an updated NVIDIA display driver to allow the "
"application to run"));
EXPECT_TRUE(CheckCudaStatusFailure(
cudaErrorContextIsDestroyed,
"This error indicates that the context current to the calling thread has "
"been destroyed using cuCtxDestroy, or is a primary context which has "
"not yet been initialized"));
EXPECT_TRUE(CheckCudaStatusSuccess(CURAND_STATUS_SUCCESS));
EXPECT_TRUE(
CheckCudaStatusFailure(CURAND_STATUS_VERSION_MISMATCH, "Curand error"));
CheckCudaStatusFailure(CURAND_STATUS_VERSION_MISMATCH, "CURAND error"));
EXPECT_TRUE(
CheckCudaStatusFailure(CURAND_STATUS_NOT_INITIALIZED, "Curand error"));
CheckCudaStatusFailure(CURAND_STATUS_NOT_INITIALIZED, "CURAND error"));
EXPECT_TRUE(CheckCudaStatusFailure(
CURAND_STATUS_ARCH_MISMATCH,
"Architecture mismatch, GPU does not support requested feature"));
EXPECT_TRUE(
CheckCudaStatusFailure(CURAND_STATUS_LENGTH_NOT_MULTIPLE,
"Length requested is not a multple of dimension"));
EXPECT_TRUE(CheckCudaStatusSuccess(CUDNN_STATUS_SUCCESS));
EXPECT_TRUE(
CheckCudaStatusFailure(CUDNN_STATUS_NOT_INITIALIZED, "Cudnn error"));
EXPECT_TRUE(CheckCudaStatusFailure(CUDNN_STATUS_ALLOC_FAILED, "Cudnn error"));
CheckCudaStatusFailure(CUDNN_STATUS_NOT_INITIALIZED, "CUDNN error"));
EXPECT_TRUE(CheckCudaStatusFailure(CUDNN_STATUS_ALLOC_FAILED, "CUDNN error"));
EXPECT_TRUE(CheckCudaStatusFailure(
CUDNN_STATUS_BAD_PARAM,
"An incorrect value or parameter was passed to the function. To correct, "
"ensure that all the parameters being passed have valid values"));
EXPECT_TRUE(CheckCudaStatusFailure(
CUDNN_STATUS_LICENSE_ERROR,
"The functionality requested requires some license and an error was "
"detected when trying to check the current licensing. This error can "
"happen if the license is not present or is expired or if the "
"environment variable NVIDIA_LICENSE_FILE is not set properly"));
EXPECT_TRUE(CheckCudaStatusSuccess(CUBLAS_STATUS_SUCCESS));
EXPECT_TRUE(
CheckCudaStatusFailure(CUBLAS_STATUS_NOT_INITIALIZED, "Cublas error"));
CheckCudaStatusFailure(CUBLAS_STATUS_NOT_INITIALIZED, "CUBLAS error"));
EXPECT_TRUE(
CheckCudaStatusFailure(CUBLAS_STATUS_INVALID_VALUE, "CUBLAS error"));
EXPECT_TRUE(CheckCudaStatusFailure(
CUBLAS_STATUS_EXECUTION_FAILED,
"The GPU program failed to execute. This is often caused by a launch "
"failure of the kernel on the GPU, which can be caused by multiple "
"reasons. To correct: check that the hardware, an appropriate version "
"of the driver, and the cuBLAS library are correctly installed"));
EXPECT_TRUE(CheckCudaStatusFailure(
CUBLAS_STATUS_MAPPING_ERROR,
"An access to GPU memory space failed, which is usually caused by a "
"failure to bind a texture. To correct: prior to the function call, "
"unbind any previously bound textures"));
EXPECT_TRUE(CheckCudaStatusSuccess(CUSOLVER_STATUS_SUCCESS));
EXPECT_TRUE(CheckCudaStatusFailure(CUSOLVER_STATUS_NOT_INITIALIZED,
"CUSOLVER error"));
EXPECT_TRUE(
CheckCudaStatusFailure(CUBLAS_STATUS_INVALID_VALUE, "Cublas error"));
CheckCudaStatusFailure(CUSOLVER_STATUS_ALLOC_FAILED, "CUSOLVER error"));
EXPECT_TRUE(CheckCudaStatusFailure(
CUSOLVER_STATUS_INTERNAL_ERROR,
"An internal cuSolver operation failed. This error is usually caused by "
"a cudaMemcpyAsync() failure.To correct: check that the hardware, an "
"appropriate version of the driver, and the cuSolver library are "
"correctly installed. Also, check that the memory passed as a parameter "
"to the routine is not being deallocated prior to the routine’s "
"completion"));
EXPECT_TRUE(CheckCudaStatusFailure(
CUSOLVER_STATUS_INVALID_VALUE,
"An unsupported value or parameter was passed to the function (a "
"negative vector size, for example).To correct: ensure that all the "
"parameters being passed have valid values"));
/*
#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess));
EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError, "Nccl error"));
EXPECT_TRUE(CheckCudaStatusFailure(ncclSystemError, "Nccl error"));
EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError, "NCCL error"));
EXPECT_TRUE(CheckCudaStatusFailure(ncclSystemError, "NCCL error"));
EXPECT_TRUE(CheckCudaStatusFailure(ncclInternalError,
"An internal check failed. This is either "
"a bug in NCCL or due to memory "
"corruption"));
EXPECT_TRUE(CheckCudaStatusFailure(ncclInvalidUsage,
"The call to NCCL is incorrect. This is "
"usually reflecting a programming error"));
#endif
*/
}
#endif
#endif
......
......@@ -15,21 +15,32 @@ limitations under the License. */
syntax = "proto2";
package paddle.platform.proto;
// (NOTE:zhouwei): ApiType identifies which external third-party API an error
// message belongs to. More external third-party APIs can be added here.
enum ApiType {
CUDA = 0;
CURAND = 1;
CUDNN = 2;
CUBLAS = 3;
CUSOLVER = 4;
NCCL = 5;
}
message MessageDesc {
// Indicates the type of error
required int32 errorCode = 1;
// Indicates the code of error
required int32 code = 1;
// Indicates the message of error
required string errorMessage = 2;
required string message = 2;
}
message AllMessageDesc {
// Version of cuda API
required int32 version = 1;
// Indicates which kind of third-party API
required ApiType type = 1;
// Error messages of different errortype
repeated MessageDesc Messages = 2;
repeated MessageDesc messages = 2;
}
message cudaerrorDesc {
// Error messages of different cuda versions(9.0/10.0/10.2)
repeated AllMessageDesc AllMessages = 2;
message ExternalErrorDesc {
// Error messages of different kind of external third party API
repeated AllMessageDesc errors = 1;
}
\ No newline at end of file
......@@ -76,6 +76,7 @@ if not defined PYTHON_ROOT set PYTHON_ROOT=C:\Python37
rem -------set cache build directory-----------
rmdir build\python /s/q
rmdir build\paddle\third_party\externalError /s/q
rmdir build\paddle\fluid\pybind /s/q
rmdir build\paddle_install_dir /s/q
rmdir build\paddle_inference_install_dir /s/q
......@@ -506,7 +507,6 @@ echo ========================================
echo Step 4. Running unit tests ...
echo ========================================
: set CI_SKIP_CPP_TEST if only *.py changed
git diff --name-only %BRANCH% | findstr /V "\.py" || set CI_SKIP_CPP_TEST=ON
......
......@@ -412,7 +412,8 @@ if '${WITH_MKLDNN}' == 'ON':
headers += list(find_files('*', '${MKLDNN_INSTALL_DIR}/include')) # mkldnn
if '${WITH_GPU}' == 'ON' or '${WITH_ROCM}' == 'ON':
headers += list(find_files('*.pb', '${cudaerror_INCLUDE_DIR}')) # errorMessage.pb for errormessage
# externalErrorMsg.pb for External Error message
headers += list(find_files('*.pb', '${externalError_INCLUDE_DIR}'))
class InstallCommand(InstallCommandBase):
def finalize_options(self):
......
Usage:
Please run:
```
bash start.sh
```
By default, the error messages of CUDA 9.0, CUDA 10.0 and the latest CUDA version are crawled.
If you want to crawl the error messages of a specific CUDA version, please run:
```
bash start.sh <version> <URL(optional)>
```
The URL is derived from the version by default, so you do not have to enter one.
For example:
```
bash start.sh 11.0
```
will crawl the error messages of CUDA 11.0 (in the future).
Whenever NVIDIA releases a new major version of CUDA, run `bash start.sh` in the current directory and upload the generated cudaErrorMessage.tar.gz to https://paddlepaddledeps.bj.bcebos.com/cudaErrorMessage.tar.gz
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import ssl
import re
import urllib2
import json
import collections
import sys, getopt
import cuda_error_pb2
def _replace_tags_with_text(text, tag_pattern, inner_pattern):
    """Replace each HTML fragment matched by ``tag_pattern`` with its inner text.

    ``tag_pattern`` matches a whole element and ``inner_pattern`` captures the
    text inside the same kind of element. Both patterns are scanned over
    ``text`` and must produce the same number of matches.
    """
    fragments = re.findall(tag_pattern, text, re.S | re.M)
    contents = re.findall(inner_pattern, text, re.S | re.M)
    assert len(fragments) == len(contents)
    for fragment, content in zip(fragments, contents):
        text = text.replace(fragment, content)
    return text


def parsing(cuda_errorDesc, version, url):
    """Crawl the CUDA error table at ``url`` and append it to ``cuda_errorDesc``.

    Args:
        cuda_errorDesc: message object with a repeated ``AllMessages`` field;
            one new entry is added to it for this version.
        version: CUDA version tag; it is converted with ``int()``.
        url: address of the documentation page that lists the CUDA error types.

    Raises:
        ValueError: if an error code on the page is neither a decimal number
            nor a ``0x``-prefixed hexadecimal number.
    """
    All_Messages = cuda_errorDesc.AllMessages.add()
    All_Messages.version = int(version)

    # NOTE(review): this switches off HTTPS certificate verification for every
    # later request made through the default context -- confirm it is intended.
    ssl._create_default_https_context = ssl._create_unverified_context
    html = urllib2.urlopen(url).read()

    res_div = r'<div class="section">.*?<p>CUDA error types </p>.*?</div>.*?<div class="enum-members">(.*?)</div>'
    res_dt = r'<dt>(.*?)</dt>.*?<dd>(.*?)</dd>'
    res_type = r'<span class="ph ph apiData">(.*?)</span>'

    for section in re.findall(res_div, html, re.S | re.M):
        for error in re.findall(res_dt, section, re.S | re.M):
            # error[0] holds the enumerator definition, error[1] its description.
            m_type = re.findall(res_type, error[0], re.S | re.M)[0]

            m_message = error[1].replace('\n', '')
            m_message = _replace_tags_with_text(
                m_message, r'(<a class=.*?</a>)', r'<a class=.*?>(.*?)</a>')
            m_message = m_message.replace(
                '<h6 class=\"deprecated_header\">Deprecated</h6>', '')
            m_message = _replace_tags_with_text(
                m_message, r'(<span class=.*?</span>)',
                r'<span class=.*?>(.*?)</span>')
            m_message = _replace_tags_with_text(m_message, r'(<p>.*?</p>)',
                                                r'<p>(.*?)</p>')
            # NOTE(review): as written this removes every single space from the
            # message; the literal may have been collapsed from a longer run of
            # spaces -- confirm against the upstream file.
            m_message = m_message.replace(' ', '')

            _Messages = All_Messages.Messages.add()
            try:
                _Messages.errorCode = int(m_type)
            except ValueError:
                # Some enumerators are documented in hexadecimal (0x...).
                if re.match('0x', m_type):
                    _Messages.errorCode = int(m_type, 16)
                else:
                    raise ValueError("unrecognized CUDA error code: %r" %
                                     (m_type, ))
            # save for cudaErrorMessage.pb from python-protobuf interface
            _Messages.errorMessage = m_message
def main(argv):
    """Command-line entry point: crawl CUDA error messages into cudaErrorMessage.pb.

    Args:
        argv: command-line arguments without the program name. ``-v/--version``
            and ``-u/--url`` each take a comma-separated list; both lists must
            have the same length. ``-h/--help`` prints the usage line.

    Exits with status 2 when the options cannot be parsed or when ``-v`` or
    ``-u`` is missing.
    """
    usage = 'python spider.py -v <version1,version2,...,> -u <url1,url2,...,>'
    version = None
    url = None
    try:
        opts, _ = getopt.getopt(argv, "hv:u:", ["help", "version=", "url="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(usage)
            sys.exit()
        elif opt in ("-v", "--version"):
            version = arg
        elif opt in ("-u", "--url"):
            url = arg

    # Both options are required. Without this check a missing option would only
    # surface later as an AttributeError from .split().
    if version is None or url is None:
        print(usage)
        sys.exit(2)

    version = version.split(',')
    url = url.split(',')
    assert len(version) == len(url)

    cuda_errorDesc = cuda_error_pb2.cudaerrorDesc()
    for idx in range(len(version)):
        # "-1" stands for the latest CUDA version.
        if version[idx] == "-1":
            label = "-latest-version"
        else:
            label = version[idx]
        print("crawling errorMessage for CUDA%s from %s" % (label, url[idx]))
        parsing(cuda_errorDesc, version[idx], url[idx])

    serializeToString = cuda_errorDesc.SerializeToString()
    with open("cudaErrorMessage.pb", "wb") as f:
        # save for cudaErrorMessage.pb from python-protobuf interface
        f.write(serializeToString)
    print("crawling errorMessage for CUDA has been done!!!")
if __name__ == "__main__":
main(sys.argv[1:])
Usage:
Please run:
```
bash start.sh
```
To update all external error messages, run `bash start.sh` in the current directory,
then upload the generated file `externalErrorMsg.tar.gz` to https://paddlepaddledeps.bj.bcebos.com/externalErrorMsg.tar.gz
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import ssl
import re
import urllib.request
import json
import collections
import sys, getopt
import external_error_pb2
def _fetch_html(url):
    """Download ``url`` and return its body decoded as UTF-8."""
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8')


def _unwrap_tags(text, element_pattern, content_pattern):
    """Replace each HTML fragment matched by ``element_pattern`` with its inner text.

    ``element_pattern`` matches a whole element and ``content_pattern``
    captures the text inside the same kind of element. Both are scanned over
    ``text`` and must produce the same number of matches.
    """
    elements = re.findall(element_pattern, text, re.S | re.M)
    contents = re.findall(content_pattern, text, re.S | re.M)
    assert len(elements) == len(contents)
    for element, content in zip(elements, contents):
        text = text.replace(element, content)
    return text


def _parse_error_code(literal):
    """Convert a decimal or ``0x``-prefixed hexadecimal literal to an int.

    Raises:
        ValueError: if ``literal`` is in neither form.
    """
    try:
        return int(literal)
    except ValueError:
        if re.match('0x', literal):
            return int(literal, 16)
        raise ValueError("unrecognized error code literal: %r" % (literal, ))


def _add_message(api_desc, code, name, description):
    """Append one ``code`` / "'name'. description" entry to ``api_desc.messages``."""
    entry = api_desc.messages.add()
    entry.code = code
    entry.message = "'%s'. %s" % (name, description)


def parsing(externalErrorDesc):
    """Crawl the NVIDIA documentation pages and fill ``externalErrorDesc``.

    One entry is added to ``externalErrorDesc.errors`` for each of CUDA,
    CURAND, CUDNN, CUBLAS, CUSOLVER and NCCL, in that order, and every error
    found on the corresponding page is appended to that entry's ``messages``.

    Args:
        externalErrorDesc: message object with a repeated ``errors`` field.

    Raises:
        ValueError: if a CUDA/CURAND error code is neither decimal nor
            ``0x``-prefixed hexadecimal.
        KeyError: if a CUDNN/CUBLAS/CUSOLVER status name found on the page is
            missing from the hard-coded name-to-code table below.
        IndexError: if a page no longer contains the expected section.
    """
    flags = re.S | re.M
    # The CUDA and CURAND pages describe enumerators with the same markup.
    res_dt_enum = r'<dt>(.*?)</dt>.*?<dd>(.*?)</dd>'
    res_type = r'<span class="enum-member-name-def">(.*?) = <span class="ph ph apiData">(.*?)</span></span>'

    #*********************************************************************************************#
    #*********************************** CUDA Error Message **************************************#
    print("start crawling errorMessage for nvidia CUDA API--->")
    url = 'https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038'
    allMessageDesc = externalErrorDesc.errors.add()
    allMessageDesc.type = external_error_pb2.CUDA

    # NOTE(review): this switches off HTTPS certificate verification for every
    # later request made through the default context -- confirm it is intended.
    ssl._create_default_https_context = ssl._create_unverified_context
    html = _fetch_html(url)
    res_div = r'<div class="section">.*?<p>CUDA error types </p>.*?</div>.*?<div class="enum-members">(.*?)</div>'
    m_div = re.findall(res_div, html, flags)[0]
    for error in re.findall(res_dt_enum, m_div, flags):
        # m_type is (enumerator name, code literal).
        m_type = re.findall(res_type, error[0], flags)[0]

        m_message = error[1].replace('\n', '')
        m_message = _unwrap_tags(m_message, r'(<a class=.*?</a>)',
                                 r'<a class=.*?>(.*?)</a>')
        m_message = m_message.replace(
            '<h6 class=\"deprecated_header\">Deprecated</h6>', '')
        m_message = _unwrap_tags(m_message, r'(<span class=.*?</span>)',
                                 r'<span class=.*?>(.*?)</span>')
        m_message = _unwrap_tags(m_message, r'(<p>.*?</p>)', r'<p>(.*?)</p>')
        # NOTE(review): as written this removes every single space from the
        # message; the literal may have been collapsed from a longer run of
        # spaces -- confirm against the upstream file.
        m_message = m_message.replace(' ', '')

        _add_message(allMessageDesc,
                     _parse_error_code(m_type[1]), m_type[0], m_message)
    print("End crawling errorMessage for nvidia CUDA API!\n")

    #***********************************************************************************************#
    #*********************************** CURAND Error Message **************************************#
    print("start crawling errorMessage for nvidia CURAND API--->")
    url = 'https://docs.nvidia.com/cuda/curand/group__HOST.html#group__HOST_1gb94a31d5c165858c96b6c18b70644437'
    allMessageDesc = externalErrorDesc.errors.add()
    allMessageDesc.type = external_error_pb2.CURAND

    html = _fetch_html(url)
    res_div = r'<div class="section">.*?<p>CURAND function call status types </p>.*?</div>.*?<div class="enum-members">(.*?)</div>'
    m_div = re.findall(res_div, html, flags)[0]
    for error in re.findall(res_dt_enum, m_div, flags):
        m_type = re.findall(res_type, error[0], flags)[0]
        # The CURAND description is stored as found on the page.
        _add_message(allMessageDesc,
                     _parse_error_code(m_type[1]), m_type[0], error[1])
    print("End crawling errorMessage for nvidia CURAND API!\n")

    #**************************************************************************************************#
    #*********************************** CUDNN Error Message ******************************************#
    # The CUDNN page is only used for names and descriptions; the numeric
    # codes come from this table.
    cudnnStatus_t = {
        "CUDNN_STATUS_SUCCESS": 0,
        "CUDNN_STATUS_NOT_INITIALIZED": 1,
        "CUDNN_STATUS_ALLOC_FAILED": 2,
        "CUDNN_STATUS_BAD_PARAM": 3,
        "CUDNN_STATUS_INTERNAL_ERROR": 4,
        "CUDNN_STATUS_INVALID_VALUE": 5,
        "CUDNN_STATUS_ARCH_MISMATCH": 6,
        "CUDNN_STATUS_MAPPING_ERROR": 7,
        "CUDNN_STATUS_EXECUTION_FAILED": 8,
        "CUDNN_STATUS_NOT_SUPPORTED": 9,
        "CUDNN_STATUS_LICENSE_ERROR": 10,
        "CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING": 11,
        "CUDNN_STATUS_RUNTIME_IN_PROGRESS": 12,
        "CUDNN_STATUS_RUNTIME_FP_OVERFLOW": 13,
    }

    print("start crawling errorMessage for nvidia CUDNN API--->")
    url = 'https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnStatus_t'
    allMessageDesc = externalErrorDesc.errors.add()
    allMessageDesc.type = external_error_pb2.CUDNN

    html = _fetch_html(url)
    res_div = r'<div class="section" id="cudnnStatus_t__section_lmp_dgr_2jb"><a name="cudnnStatus_t__section_lmp_dgr_2jb" shape="rect">(.*?)</div>'
    m_div = re.findall(res_div, html, flags)[0]
    res_dt = r'<dt class="dt dlterm"><samp class="ph codeph">(.*?)</samp></dt>.*?<dd class="dd">(.*?)</dd>'
    for error in re.findall(res_dt, m_div, flags):
        m_message = error[1]
        m_message = _unwrap_tags(m_message, r'<p class="p">.*?</p>',
                                 r'<p class="p">(.*?)</p>')
        m_message = _unwrap_tags(m_message, r'(<a class="xref".*?</a>)',
                                 r'<a class="xref".*?>(.*?)</a>')
        m_message = _unwrap_tags(m_message, r'(<span class="ph">.*?</span>)',
                                 r'<span class="ph">(.*?)</span>')
        m_message = _unwrap_tags(m_message,
                                 r'(<samp class="ph codeph">.*?</samp>)',
                                 r'<samp class="ph codeph">(.*?)</samp>')
        m_message = re.sub(r'\n +', ' ', m_message)

        _add_message(allMessageDesc, cudnnStatus_t[error[0]], error[0],
                     m_message)
    print("End crawling errorMessage for nvidia CUDNN API!\n")

    #*************************************************************************************************#
    #*********************************** CUBLAS Error Message ****************************************#
    cublasStatus_t = {
        "CUBLAS_STATUS_SUCCESS": 0,
        "CUBLAS_STATUS_NOT_INITIALIZED": 1,
        "CUBLAS_STATUS_ALLOC_FAILED": 3,
        "CUBLAS_STATUS_INVALID_VALUE": 7,
        "CUBLAS_STATUS_ARCH_MISMATCH": 8,
        "CUBLAS_STATUS_MAPPING_ERROR": 11,
        "CUBLAS_STATUS_EXECUTION_FAILED": 13,
        "CUBLAS_STATUS_INTERNAL_ERROR": 14,
        "CUBLAS_STATUS_NOT_SUPPORTED": 15,
        "CUBLAS_STATUS_LICENSE_ERROR": 16
    }

    print("start crawling errorMessage for nvidia CUBLAS API--->")
    url = 'https://docs.nvidia.com/cuda/cublas/index.html#cublasstatus_t'
    allMessageDesc = externalErrorDesc.errors.add()
    allMessageDesc.type = external_error_pb2.CUBLAS

    html = _fetch_html(url)
    res_div = r'<p class="p">The type is used for function status returns. All cuBLAS library.*?<div class="tablenoborder">(.*?)</div>'
    m_div = re.findall(res_div, html, flags)[0]
    res_dt = r'<p class="p"><samp class="ph codeph">(.*?)</samp></p>.*?colspan="1">(.*?)</td>'
    for error in re.findall(res_dt, m_div, flags):
        m_message = re.sub(r'\n +', ' ', error[1])
        m_message = _unwrap_tags(m_message, r'<p class="p">.*?</p>',
                                 r'<p class="p">(.*?)</p>')
        m_message = _unwrap_tags(m_message,
                                 r'<samp class="ph codeph">.*?</samp>',
                                 r'<samp class="ph codeph">(.*?)</samp>')

        _add_message(allMessageDesc, cublasStatus_t[error[0]], error[0],
                     m_message)
    print("End crawling errorMessage for nvidia CUBLAS API!\n")

    #*************************************************************************************************#
    #*********************************** CUSOLVER Error Message **************************************#
    cusolverStatus_t = {
        "CUSOLVER_STATUS_SUCCESS": 0,
        "CUSOLVER_STATUS_NOT_INITIALIZED": 1,
        "CUSOLVER_STATUS_ALLOC_FAILED": 2,
        "CUSOLVER_STATUS_INVALID_VALUE": 3,
        "CUSOLVER_STATUS_ARCH_MISMATCH": 4,
        "CUSOLVER_STATUS_MAPPING_ERROR": 5,
        "CUSOLVER_STATUS_EXECUTION_FAILED": 6,
        "CUSOLVER_STATUS_INTERNAL_ERROR": 7,
        "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED": 8,
        "CUSOLVER_STATUS_NOT_SUPPORTED": 9,
        "CUSOLVER_STATUS_ZERO_PIVOT": 10,
        "CUSOLVER_STATUS_INVALID_LICENSE": 11,
        "CUSOLVER_STATUS_IRS_PARAMS_NOT_INITIALIZED": 12,
        "CUSOLVER_STATUS_IRS_PARAMS_INVALID": 13,
        "CUSOLVER_STATUS_IRS_INTERNAL_ERROR": 14,
        "CUSOLVER_STATUS_IRS_NOT_SUPPORTED": 15,
        "CUSOLVER_STATUS_IRS_OUT_OF_RANGE": 16,
        "CUSOLVER_STATUS_IRS_NRHS_NOT_SUPPORTED_FOR_REFINE_GMRES": 17,
        "CUSOLVER_STATUS_IRS_INFOS_NOT_INITIALIZED": 18
    }

    print("start crawling errorMessage for nvidia CUSOLVER API--->")
    url = 'https://docs.nvidia.com/cuda/cusolver/index.html#cuSolverSPstatus'
    allMessageDesc = externalErrorDesc.errors.add()
    allMessageDesc.type = external_error_pb2.CUSOLVER

    html = _fetch_html(url)
    res_div = r'This is a status type returned by the library functions and.*?<div class="tablenoborder">(.*?)</div>'
    m_div = re.findall(res_div, html, flags)[0]
    res_dt = r'<samp class="ph codeph">(.*?)</samp></td>.*?colspan="1">(.*?)</td>'
    for error in re.findall(res_dt, m_div, flags):
        m_message = re.sub(r'\n +', '', error[1])
        m_message = re.sub(r'<p class="p"></p>', '', m_message)
        m_message = _unwrap_tags(m_message, r'<p class="p">.*?</p>',
                                 r'<p class="p">(.*?)</p>')
        m_message = _unwrap_tags(m_message,
                                 r'<samp class="ph codeph">.*?</samp>',
                                 r'<samp class="ph codeph">(.*?)</samp>')
        m_message = _unwrap_tags(m_message,
                                 r'<strong class="ph b">.*?</strong>',
                                 r'<strong class="ph b">(.*?)</strong>')

        _add_message(allMessageDesc, cusolverStatus_t[error[0]], error[0],
                     m_message)
    print("End crawling errorMessage for nvidia CUSOLVER API!\n")

    #**********************************************************************************************#
    #*************************************** NCCL error *******************************************#
    print("start crawling errorMessage for nvidia NCCL API--->")
    url = 'https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/types.html#ncclresult-t'
    allMessageDesc = externalErrorDesc.errors.add()
    allMessageDesc.type = external_error_pb2.NCCL

    html = _fetch_html(url)
    res_div = r'<code class="descname">ncclResult_t</code>(.*?)</div>'
    m_div = re.findall(res_div, html, flags)[0]
    res_dt = r'<code class="descname">(.*?)</code>.*?<span class="pre">(.*?)</span></code>\)(.*?)</p>\n</dd></dl>'
    for error in re.findall(res_dt, m_div, flags):
        # error is (status name, code literal, description).
        m_message = re.sub(r'\n', '', error[2])
        _add_message(allMessageDesc, int(error[1]), error[0], m_message)
    print("End crawling errorMessage for nvidia NCCL API!\n")
def main(argv):
    """Command-line entry point: crawl all error messages into externalErrorMsg.pb.

    Args:
        argv: command-line arguments without the program name. Only
            ``-h/--help`` is accepted.

    Exits with status 0 after printing the usage line for ``-h/--help`` and
    with status 2 when the options cannot be parsed.
    """
    usage = 'python spider.py'
    try:
        opts, _ = getopt.getopt(argv, "h", ["help"])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    for opt, _ in opts:
        if opt in ("-h", "--help"):
            print(usage)
            # Asking for help is not a failure, so report success.
            sys.exit(0)

    externalErrorDesc = external_error_pb2.ExternalErrorDesc()
    parsing(externalErrorDesc)

    serializedString = externalErrorDesc.SerializeToString()
    with open("externalErrorMsg.pb", "wb") as f:
        # save for externalErrorMsg.pb from Python-protobuf interface
        # load from C++-protobuf interface and get error message
        f.write(serializedString)
    print(
        "Generating data file [externalErrorMsg.pb] for external third_party API error has been done!"
    )
if __name__ == "__main__":
main(sys.argv[1:])
......@@ -29,19 +29,7 @@ else
echo "please run on Mac/Linux"
exit 1
fi
# Generate the Python protobuf module from the .proto definition into the
# current directory (--python_out .).
protobuf/bin/protoc -I../../paddle/fluid/platform/ --python_out . ../../paddle/fluid/platform/cuda_error.proto
protobuf/bin/protoc -I../../paddle/fluid/platform/ --python_out . ../../paddle/fluid/platform/external_error.proto
version=90,100,-1 # -1 represents the latest CUDA version
url=https://docs.nvidia.com/cuda/archive/9.0/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038,https://docs.nvidia.com/cuda/archive/10.0/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038,https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038
# An optional first argument appends one more CUDA version to crawl; an
# optional second argument supplies its URL, otherwise the archive URL is
# built from the first argument.
if [ "$1" != "" ]; then
version=$version,$(($1*10))
if [ "$2" != "" ]; then
url=$url,$2
else
url=$url,https://docs.nvidia.com/cuda/archive/$1/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038
fi
fi
python spider.py --version=$version --url=$url
tar czf cudaErrorMessage.tar.gz cudaErrorMessage.pb
# Run the crawler, then pack the generated externalErrorMsg.pb into a tarball.
python3.7 spider.py
tar czvf externalErrorMsg.tar.gz externalErrorMsg.pb
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册