未验证 提交 e64ce0bb 编写于 作者: Z zyfncg 提交者: GitHub

move some function of cuda error from enforce.h to enforce.cc (#52828)

上级 802129b3
...@@ -23,6 +23,10 @@ limitations under the License. */ ...@@ -23,6 +23,10 @@ limitations under the License. */
#include "paddle/phi/common/scalar.h" #include "paddle/phi/common/scalar.h"
#include "paddle/utils/blank.h" #include "paddle/utils/blank.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/phi/core/external_error.pb.h"
#endif // PADDLE_WITH_CUDA
DECLARE_int32(call_stack_level); DECLARE_int32(call_stack_level);
namespace egr { namespace egr {
...@@ -177,5 +181,174 @@ std::string SimplifyErrorTypeFormat(const std::string& str) { ...@@ -177,5 +181,174 @@ std::string SimplifyErrorTypeFormat(const std::string& str) {
return sout.str(); return sout.str();
} }
/**************************************************************************/
/**************************** NVIDIA ERROR ********************************/
#ifdef PADDLE_WITH_CUDA
namespace details {

// Compile-time lookup from an external-library status type to its protobuf
// API tag. The primary template has no members, so asking for kProtoType or
// kTypeString with a status type that was never registered below does not
// compile.
template <typename T>
struct ExternalApiProtoType {};

// Registers one status type. `api_tag` must name an enumerator of
// phi::proto::ApiType; the same token is stringified to form kTypeString.
#define DEFINE_EXTERNAL_API_PROTO_TYPE(status_type, api_tag) \
  template <>                                                 \
  struct ExternalApiProtoType<status_type> {                  \
    using Type = status_type;                                 \
    static constexpr phi::proto::ApiType kProtoType =         \
        phi::proto::ApiType::api_tag;                         \
    static constexpr const char* kTypeString = #api_tag;      \
  }

DEFINE_EXTERNAL_API_PROTO_TYPE(cudaError_t, CUDA);
DEFINE_EXTERNAL_API_PROTO_TYPE(curandStatus_t, CURAND);
DEFINE_EXTERNAL_API_PROTO_TYPE(cudnnStatus_t, CUDNN);
DEFINE_EXTERNAL_API_PROTO_TYPE(cublasStatus_t, CUBLAS);
DEFINE_EXTERNAL_API_PROTO_TYPE(cusparseStatus_t, CUSPARSE);
DEFINE_EXTERNAL_API_PROTO_TYPE(cusolverStatus_t, CUSOLVER);
DEFINE_EXTERNAL_API_PROTO_TYPE(cufftResult_t, CUFFT);
DEFINE_EXTERNAL_API_PROTO_TYPE(CUresult, CU);

// ncclResult_t is registered only when NCCL support is compiled in.
#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
DEFINE_EXTERNAL_API_PROTO_TYPE(ncclResult_t, NCCL);
#endif

// The registration macro is local to this translation unit section.
#undef DEFINE_EXTERNAL_API_PROTO_TYPE

}  // namespace details
template <typename T>
inline const char* GetErrorMsgUrl(T status) {
using __CUDA_STATUS_TYPE__ = decltype(status);
phi::proto::ApiType proto_type =
details::ExternalApiProtoType<__CUDA_STATUS_TYPE__>::kProtoType;
switch (proto_type) {
case phi::proto::ApiType::CUDA:
case phi::proto::ApiType::CU:
return "https://docs.nvidia.com/cuda/cuda-runtime-api/"
"group__CUDART__TYPES.html#group__CUDART__TYPES_"
"1g3f51e3575c2178246db0a94a430e0038";
break;
case phi::proto::ApiType::CURAND:
return "https://docs.nvidia.com/cuda/curand/"
"group__HOST.html#group__HOST_1gb94a31d5c165858c96b6c18b70644437";
break;
case phi::proto::ApiType::CUDNN:
return "https://docs.nvidia.com/deeplearning/cudnn/api/"
"index.html#cudnnStatus_t";
break;
case phi::proto::ApiType::CUBLAS:
return "https://docs.nvidia.com/cuda/cublas/index.html#cublasstatus_t";
break;
case phi::proto::ApiType::CUSOLVER:
return "https://docs.nvidia.com/cuda/cusolver/"
"index.html#cuSolverSPstatus";
break;
case phi::proto::ApiType::NCCL:
return "https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/"
"types.html#ncclresult-t";
break;
case phi::proto::ApiType::CUFFT:
return "https://docs.nvidia.com/cuda/cufft/index.html#cufftresult";
case phi::proto::ApiType::CUSPARSE:
return "https://docs.nvidia.com/cuda/cusparse/"
"index.html#cusparseStatus_t";
break;
default:
return "Unknown type of External API, can't get error message URL!";
break;
}
}
template <typename T>
std::string GetExternalErrorMsg(T status) {
std::ostringstream sout;
bool _initSucceed = false;
phi::proto::ExternalErrorDesc externalError;
if (externalError.ByteSizeLong() == 0) {
std::string filePath;
#if !defined(_WIN32)
Dl_info info;
if (dladdr(reinterpret_cast<void*>(GetCurrentTraceBackString), &info)) {
std::string strModule(info.dli_fname);
const size_t last_slash_idx = strModule.find_last_of("/");
std::string compare_path = strModule.substr(strModule.length() - 6);
if (std::string::npos != last_slash_idx) {
strModule.erase(last_slash_idx, std::string::npos);
}
if (compare_path.compare("avx.so") == 0) {
filePath =
strModule +
"/../include/third_party/externalError/data/externalErrorMsg.pb";
} else {
filePath = strModule +
"/../../third_party/externalError/data/externalErrorMsg.pb";
}
}
#else
char buf[512];
MEMORY_BASIC_INFORMATION mbi;
HMODULE h_module =
(::VirtualQuery(GetCurrentTraceBackString, &mbi, sizeof(mbi)) != 0)
? (HMODULE)mbi.AllocationBase
: NULL;
GetModuleFileName(h_module, buf, 512);
std::string strModule(buf);
const size_t last_slash_idx = strModule.find_last_of("\\");
std::string compare_path = strModule.substr(strModule.length() - 7);
if (std::string::npos != last_slash_idx) {
strModule.erase(last_slash_idx, std::string::npos);
}
if (compare_path.compare("avx.pyd") == 0) {
filePath = strModule +
"\\..\\include\\third_"
"party\\externalerror\\data\\externalErrorMsg.pb";
} else {
filePath =
strModule +
"\\..\\..\\third_party\\externalerror\\data\\externalErrorMsg.pb";
}
#endif
std::ifstream fin(filePath, std::ios::in | std::ios::binary);
_initSucceed = externalError.ParseFromIstream(&fin);
}
using __CUDA_STATUS_TYPE__ = decltype(status);
phi::proto::ApiType proto_type =
details::ExternalApiProtoType<__CUDA_STATUS_TYPE__>::kProtoType;
if (_initSucceed) {
for (int i = 0; i < externalError.errors_size(); ++i) {
if (proto_type == externalError.errors(i).type()) {
for (int j = 0; j < externalError.errors(i).messages_size(); ++j) {
if (status == externalError.errors(i).messages(j).code()) {
sout << "\n [Hint: "
<< externalError.errors(i).messages(j).message() << "]";
return sout.str();
}
}
}
}
}
sout << "\n [Hint: Please search for the error code(" << status
<< ") on website (" << GetErrorMsgUrl(status)
<< ") to get Nvidia's official solution and advice about "
<< details::ExternalApiProtoType<__CUDA_STATUS_TYPE__>::kTypeString
<< " Error.]";
return sout.str();
}
// Explicit instantiation definitions of GetExternalErrorMsg, one for each
// status type registered through DEFINE_EXTERNAL_API_PROTO_TYPE.
template std::string GetExternalErrorMsg<cudaError_t>(cudaError_t);
template std::string GetExternalErrorMsg<curandStatus_t>(curandStatus_t);
template std::string GetExternalErrorMsg<cudnnStatus_t>(cudnnStatus_t);
template std::string GetExternalErrorMsg<cublasStatus_t>(cublasStatus_t);
template std::string GetExternalErrorMsg<cusparseStatus_t>(cusparseStatus_t);
template std::string GetExternalErrorMsg<cusolverStatus_t>(cusolverStatus_t);
template std::string GetExternalErrorMsg<cufftResult_t>(cufftResult_t);
template std::string GetExternalErrorMsg<CUresult>(CUresult);
// The NCCL instantiation uses the same guard as the NCCL type registration.
#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
template std::string GetExternalErrorMsg<ncclResult_t>(ncclResult_t);
#endif
#endif // PADDLE_WITH_CUDA
} // namespace enforce } // namespace enforce
} // namespace phi } // namespace phi
...@@ -33,8 +33,6 @@ limitations under the License. */ ...@@ -33,8 +33,6 @@ limitations under the License. */
#include <cusparse.h> #include <cusparse.h>
#include <thrust/system/cuda/error.h> #include <thrust/system/cuda/error.h>
#include <thrust/system_error.h> #include <thrust/system_error.h>
#include "paddle/phi/core/external_error.pb.h"
#endif // PADDLE_WITH_CUDA #endif // PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
...@@ -90,7 +88,6 @@ limitations under the License. */ ...@@ -90,7 +88,6 @@ limitations under the License. */
#endif // PADDLE_WITH_HIP #endif // PADDLE_WITH_HIP
// Note: these headers for simplify demangle type string // Note: these headers for simplify demangle type string
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/type_defs.h" #include "paddle/phi/core/type_defs.h"
// Note: this header for simplify HIP and CUDA type string // Note: this header for simplify HIP and CUDA type string
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
...@@ -615,162 +612,30 @@ namespace details { ...@@ -615,162 +612,30 @@ namespace details {
template <typename T> template <typename T>
struct ExternalApiType {}; struct ExternalApiType {};
#define DEFINE_EXTERNAL_API_TYPE(type, success_value, proto_type) \ #define DEFINE_EXTERNAL_API_TYPE(type, success_value) \
template <> \ template <> \
struct ExternalApiType<type> { \ struct ExternalApiType<type> { \
using Type = type; \ using Type = type; \
static constexpr Type kSuccess = success_value; \ static constexpr Type kSuccess = success_value; \
static constexpr const char* kTypeString = #proto_type; \
static constexpr phi::proto::ApiType kProtoType = \
phi::proto::ApiType::proto_type; \
} }
DEFINE_EXTERNAL_API_TYPE(cudaError_t, cudaSuccess, CUDA); DEFINE_EXTERNAL_API_TYPE(cudaError_t, cudaSuccess);
DEFINE_EXTERNAL_API_TYPE(curandStatus_t, CURAND_STATUS_SUCCESS, CURAND); DEFINE_EXTERNAL_API_TYPE(curandStatus_t, CURAND_STATUS_SUCCESS);
DEFINE_EXTERNAL_API_TYPE(cudnnStatus_t, CUDNN_STATUS_SUCCESS, CUDNN); DEFINE_EXTERNAL_API_TYPE(cudnnStatus_t, CUDNN_STATUS_SUCCESS);
DEFINE_EXTERNAL_API_TYPE(cublasStatus_t, CUBLAS_STATUS_SUCCESS, CUBLAS); DEFINE_EXTERNAL_API_TYPE(cublasStatus_t, CUBLAS_STATUS_SUCCESS);
DEFINE_EXTERNAL_API_TYPE(cusparseStatus_t, CUSPARSE_STATUS_SUCCESS, CUSPARSE); DEFINE_EXTERNAL_API_TYPE(cusparseStatus_t, CUSPARSE_STATUS_SUCCESS);
DEFINE_EXTERNAL_API_TYPE(cusolverStatus_t, CUSOLVER_STATUS_SUCCESS, CUSOLVER); DEFINE_EXTERNAL_API_TYPE(cusolverStatus_t, CUSOLVER_STATUS_SUCCESS);
DEFINE_EXTERNAL_API_TYPE(cufftResult_t, CUFFT_SUCCESS, CUFFT); DEFINE_EXTERNAL_API_TYPE(cufftResult_t, CUFFT_SUCCESS);
DEFINE_EXTERNAL_API_TYPE(CUresult, CUDA_SUCCESS, CU); DEFINE_EXTERNAL_API_TYPE(CUresult, CUDA_SUCCESS);
#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
DEFINE_EXTERNAL_API_TYPE(ncclResult_t, ncclSuccess, NCCL); DEFINE_EXTERNAL_API_TYPE(ncclResult_t, ncclSuccess);
#endif #endif
} // namespace details } // namespace details
template <typename T> template <typename T>
inline const char* GetErrorMsgUrl(T status) { std::string GetExternalErrorMsg(T status);
using __CUDA_STATUS_TYPE__ = decltype(status);
phi::proto::ApiType proto_type =
details::ExternalApiType<__CUDA_STATUS_TYPE__>::kProtoType;
switch (proto_type) {
case phi::proto::ApiType::CUDA:
case phi::proto::ApiType::CU:
return "https://docs.nvidia.com/cuda/cuda-runtime-api/"
"group__CUDART__TYPES.html#group__CUDART__TYPES_"
"1g3f51e3575c2178246db0a94a430e0038";
break;
case phi::proto::ApiType::CURAND:
return "https://docs.nvidia.com/cuda/curand/"
"group__HOST.html#group__HOST_1gb94a31d5c165858c96b6c18b70644437";
break;
case phi::proto::ApiType::CUDNN:
return "https://docs.nvidia.com/deeplearning/cudnn/api/"
"index.html#cudnnStatus_t";
break;
case phi::proto::ApiType::CUBLAS:
return "https://docs.nvidia.com/cuda/cublas/index.html#cublasstatus_t";
break;
case phi::proto::ApiType::CUSOLVER:
return "https://docs.nvidia.com/cuda/cusolver/"
"index.html#cuSolverSPstatus";
break;
case phi::proto::ApiType::NCCL:
return "https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/"
"types.html#ncclresult-t";
break;
case phi::proto::ApiType::CUFFT:
return "https://docs.nvidia.com/cuda/cufft/index.html#cufftresult";
case phi::proto::ApiType::CUSPARSE:
return "https://docs.nvidia.com/cuda/cusparse/"
"index.html#cusparseStatus_t";
break;
default:
return "Unknown type of External API, can't get error message URL!";
break;
}
}
template <typename T>
inline std::string GetExternalErrorMsg(T status) {
std::ostringstream sout;
bool _initSucceed = false;
phi::proto::ExternalErrorDesc externalError;
if (externalError.ByteSizeLong() == 0) {
std::string filePath;
#if !defined(_WIN32)
Dl_info info;
if (dladdr(reinterpret_cast<void*>(GetCurrentTraceBackString), &info)) {
std::string strModule(info.dli_fname);
const size_t last_slash_idx = strModule.find_last_of("/");
std::string compare_path = strModule.substr(strModule.length() - 6);
if (std::string::npos != last_slash_idx) {
strModule.erase(last_slash_idx, std::string::npos);
}
if (compare_path.compare("avx.so") == 0) {
filePath =
strModule +
"/../include/third_party/externalError/data/externalErrorMsg.pb";
} else {
filePath = strModule +
"/../../third_party/externalError/data/externalErrorMsg.pb";
}
}
#else
char buf[512];
MEMORY_BASIC_INFORMATION mbi;
HMODULE h_module =
(::VirtualQuery(GetCurrentTraceBackString, &mbi, sizeof(mbi)) != 0)
? (HMODULE)mbi.AllocationBase
: NULL;
GetModuleFileName(h_module, buf, 512);
std::string strModule(buf);
const size_t last_slash_idx = strModule.find_last_of("\\");
std::string compare_path = strModule.substr(strModule.length() - 7);
if (std::string::npos != last_slash_idx) {
strModule.erase(last_slash_idx, std::string::npos);
}
if (compare_path.compare("avx.pyd") == 0) {
filePath = strModule +
"\\..\\include\\third_"
"party\\externalerror\\data\\externalErrorMsg.pb";
} else {
filePath =
strModule +
"\\..\\..\\third_party\\externalerror\\data\\externalErrorMsg.pb";
}
#endif
std::ifstream fin(filePath, std::ios::in | std::ios::binary);
_initSucceed = externalError.ParseFromIstream(&fin);
}
using __CUDA_STATUS_TYPE__ = decltype(status);
phi::proto::ApiType proto_type =
details::ExternalApiType<__CUDA_STATUS_TYPE__>::kProtoType;
if (_initSucceed) {
for (int i = 0; i < externalError.errors_size(); ++i) {
if (proto_type == externalError.errors(i).type()) {
for (int j = 0; j < externalError.errors(i).messages_size(); ++j) {
if (status == externalError.errors(i).messages(j).code()) {
sout << "\n [Hint: "
<< externalError.errors(i).messages(j).message() << "]";
return sout.str();
}
}
}
}
}
sout << "\n [Hint: Please search for the error code(" << status
<< ") on website (" << GetErrorMsgUrl(status)
<< ") to get Nvidia's official solution and advice about "
<< details::ExternalApiType<__CUDA_STATUS_TYPE__>::kTypeString
<< " Error.]";
return sout.str();
}
// Explicit instantiation definitions of GetExternalErrorMsg, one for each
// supported external-API status type.
template std::string GetExternalErrorMsg<cudaError_t>(cudaError_t);
template std::string GetExternalErrorMsg<curandStatus_t>(curandStatus_t);
template std::string GetExternalErrorMsg<cudnnStatus_t>(cudnnStatus_t);
template std::string GetExternalErrorMsg<cublasStatus_t>(cublasStatus_t);
template std::string GetExternalErrorMsg<cusparseStatus_t>(cusparseStatus_t);
template std::string GetExternalErrorMsg<cusolverStatus_t>(cusolverStatus_t);
template std::string GetExternalErrorMsg<cufftResult_t>(cufftResult_t);
template std::string GetExternalErrorMsg<CUresult>(CUresult);
// The NCCL instantiation is compiled only when NCCL support is enabled.
#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
template std::string GetExternalErrorMsg<ncclResult_t>(ncclResult_t);
#endif
/*************** CUDA ERROR ***************/ /*************** CUDA ERROR ***************/
inline bool is_error(cudaError_t e) { return e != cudaSuccess; } inline bool is_error(cudaError_t e) { return e != cudaSuccess; }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册