未验证 提交 ecda253a 编写于 作者: zhouweiwei2014's avatar zhouweiwei2014 提交者: GitHub

[BUG] Optimize GPU error message file search path (#54180)

上级 dd9a04b7
......@@ -333,23 +333,21 @@ if(WITH_GPU)
${URL} "externalError" MD5 a712a49384e77ca216ad866712f7cafa
)# download file externalErrorMsg.tar.gz
if(WITH_TESTING)
# Copy the externalErrorMsg.pb data directory into the build tree so that
# unit tests can look up the error messages.
set(SRC_DIR ${THIRD_PARTY_PATH}/externalError/data)
if(WIN32 AND (NOT "${CMAKE_GENERATOR}" STREQUAL "Ninja"))
# Destination for the Python unit test 'test_exception.py'.
set(DST_DIR1
${CMAKE_BINARY_DIR}/paddle/fluid/third_party/externalError/data)
else()
set(DST_DIR1 ${CMAKE_BINARY_DIR}/paddle/third_party/externalError/data)
endif()
# NOTE(review): DST_DIR2 is assigned twice in this block; the second set()
# further down overrides this value, so this path is never copied to.
# Confirm which assignment is intended.
set(DST_DIR2
${CMAKE_BINARY_DIR}/python/paddle/include/third_party/externalError/data
)
# Destination for the C++ unit test 'enforce_test'.
set(DST_DIR2 ${CMAKE_BINARY_DIR}/paddle/third_party/externalError/data)
# Once the download_externalError target has been built, copy SRC_DIR into
# DST_DIR1 and DST_DIR2.
# NOTE(review): COMMENT is passed three times below, and ${DST_DIR} is not
# set anywhere in this block (it would expand to an empty string). Confirm
# that a single COMMENT naming the real destinations is what is wanted.
add_custom_command(
TARGET download_externalError
POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_directory ${SRC_DIR} ${DST_DIR1}
COMMAND ${CMAKE_COMMAND} -E copy_directory ${SRC_DIR} ${DST_DIR2}
COMMENT "copy_directory from ${SRC_DIR} to ${DST_DIR}")
COMMENT "copy_directory from ${SRC_DIR} to ${DST_DIR1}"
COMMENT "copy_directory from ${SRC_DIR} to ${DST_DIR2}")
endif()
endif()
......
......@@ -1924,6 +1924,11 @@ All parameter, weight, gradient are variables in Paddle.
// Python binding: fetch the device context registered for `place` from the
// global pool and block on its Wait() call.
m.def("_cuda_synchronize", [](const platform::CUDAPlace &place) {
  auto &&ctx_pool = platform::DeviceContextPool::Instance();
  auto *device_ctx = ctx_pool.Get(place);
  device_ctx->Wait();
});
// Test-only Python binding: passes the fixed status
// cudaErrorInsufficientDriver to PADDLE_ENFORCE_GPU_SUCCESS. When
// PADDLE_WITH_CUDA is not defined the lambda body is empty and the call does
// nothing.
m.def("_test_enforce_gpu_success", []() {
#ifdef PADDLE_WITH_CUDA
  PADDLE_ENFORCE_GPU_SUCCESS(cudaErrorInsufficientDriver);
#endif  // PADDLE_WITH_CUDA
});
m.def("get_float_stats", []() {
std::vector<paddle::platform::ExportedStatValue<float>> float_stats;
......
......@@ -270,30 +270,27 @@ std::string GetExternalErrorMsg(T status) {
bool _initSucceed = false;
phi::proto::ExternalErrorDesc externalError;
if (externalError.ByteSizeLong() == 0) {
std::string filePath;
std::string search_path_1;
std::string search_path_2;
std::string search_path_3;
#if !defined(_WIN32)
Dl_info info;
if (dladdr(reinterpret_cast<void*>(GetCurrentTraceBackString), &info)) {
std::string strModule(info.dli_fname);
const size_t last_slash_idx = strModule.find_last_of("/");
std::string compare_path = strModule.substr(strModule.length() - 6);
std::string phi_so_path(info.dli_fname);
const size_t last_slash_idx = phi_so_path.find_last_of("/");
if (std::string::npos != last_slash_idx) {
strModule.erase(last_slash_idx, std::string::npos);
phi_so_path.erase(last_slash_idx, std::string::npos);
}
// TODO(lizhiyu02): I don't know what the 'compare_path.compare("avx.so")
// == 0' means, while
// 'compare_path.find("dist-packages") != std::string::npos' means that
// after using 'pip install paddle'.
if (compare_path.compare("avx.so") == 0 ||
strModule.find("dist-packages") != std::string::npos) {
filePath =
strModule +
// Because 'phi_so_path' may be 'site-packages/paddle/libs/libphi.so' or
// 'build/paddle/phi/libphi.so', we try several different search paths.
search_path_1 =
phi_so_path +
"/../include/third_party/externalError/data/externalErrorMsg.pb";
} else {
// Just for unittest
filePath = strModule +
search_path_2 = phi_so_path +
"/../third_party/externalError/data/externalErrorMsg.pb";
}
search_path_3 =
phi_so_path +
"/../../third_party/externalError/data/externalErrorMsg.pb";
}
#else
char buf[512];
......@@ -303,24 +300,34 @@ std::string GetExternalErrorMsg(T status) {
? (HMODULE)mbi.AllocationBase
: NULL;
GetModuleFileName(h_module, buf, 512);
std::string strModule(buf);
const size_t last_slash_idx = strModule.find_last_of("\\");
std::string compare_path = strModule.substr(strModule.length() - 7);
std::string exe_path(buf);
const size_t last_slash_idx = exe_path.find_last_of("\\");
if (std::string::npos != last_slash_idx) {
strModule.erase(last_slash_idx, std::string::npos);
}
if (strModule.find("dist-packages") != std::string::npos) {
filePath = strModule +
"\\..\\include\\third_"
"party\\externalerror\\data\\externalErrorMsg.pb";
} else {
filePath = strModule +
"\\..\\..\\third_party"
"\\externalerror\\data\\externalErrorMsg.pb";
exe_path.erase(last_slash_idx, std::string::npos);
}
// Because 'exe_path' may be 'site-packages\\paddle\\fluid\\libpaddle.pyd' or
// 'build\\paddle\\fluid\\platform\\enforce_test.exe', we try several
// different search paths.
search_path_1 =
exe_path +
"\\..\\include\\third_party\\externalError\\data\\externalErrorMsg.pb";
search_path_2 =
exe_path +
"\\..\\third_party\\externalError\\data\\externalErrorMsg.pb";
search_path_3 =
exe_path +
"\\..\\..\\third_party\\externalError\\data\\externalErrorMsg.pb";
#endif
std::ifstream fin(filePath, std::ios::in | std::ios::binary);
std::ifstream fin(search_path_1, std::ios::in | std::ios::binary);
_initSucceed = externalError.ParseFromIstream(&fin);
if (!_initSucceed) {
std::ifstream fin(search_path_2, std::ios::in | std::ios::binary);
_initSucceed = externalError.ParseFromIstream(&fin);
}
if (!_initSucceed) {
std::ifstream fin(search_path_3, std::ios::in | std::ios::binary);
_initSucceed = externalError.ParseFromIstream(&fin);
}
}
using __CUDA_STATUS_TYPE__ = decltype(status);
phi::proto::ApiType proto_type =
......
......@@ -298,6 +298,7 @@ try:
from .libpaddle import _set_paddle_lib_path
from .libpaddle import _create_loaded_parameter
from .libpaddle import _cuda_synchronize
from .libpaddle import _test_enforce_gpu_success
from .libpaddle import _is_compiled_with_heterps
from .libpaddle import _promote_types_if_complex_exists
from .libpaddle import _set_cached_executor_build_strategy
......
......@@ -32,6 +32,23 @@ class TestException(unittest.TestCase):
self.assertIsNotNone(exception)
def test_gpu_success(self):
    """Check the error text produced by ``core._test_enforce_gpu_success``.

    The test returns immediately when Paddle is not compiled with CUDA.
    Otherwise, any exception raised by the call must be an ``OSError`` whose
    message contains both the CUDA error summary and the hint text.
    """
    # NOTE(review): if the call raises nothing, this test passes without
    # asserting anything -- confirm whether that is intended.
    if not paddle.is_compiled_with_cuda():
        return

    expected_summary = "CUDA error(35), CUDA driver version is insufficient for CUDA runtime version."
    expected_hint = "[Hint: 'cudaErrorInsufficientDriver'. This indicates that the installed NVIDIA CUDA driver is older than the CUDA runtime library. This is not a supported configuration.Users should install an updated NVIDIA display driver to allow the application to run.]"

    try:
        core._test_enforce_gpu_success()
    except Exception as err:
        self.assertTrue(isinstance(err, OSError))
        message = str(err)
        self.assertIn(expected_summary, message)
        self.assertIn(expected_hint, message)
class TestExceptionNoCStack(unittest.TestCase):
def setUp(self):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册