From ecda253a8506af39ae04eedfd757219b86819623 Mon Sep 17 00:00:00 2001
From: zhwesky2010 <1183042833@qq.com>
Date: Tue, 30 May 2023 19:31:46 +0800
Subject: [PATCH] [BUG] Optimize GPU error message file search path (#54180)

---
Review notes (commentary area; ignored by `git am`, not part of the commit):

  * This file was recovered from a copy whose line breaks had been collapsed.
    Line structure was rebuilt so that every hunk matches its `@@` counts and
    the diffstat below. Leading whitespace of context lines was inferred from
    the projects' formatters -- TODO confirm against the target tree.
  * Two context lines had lost their template arguments and were restored as
    `reinterpret_cast<void*>(...)` (enforce.cc) and
    `std::vector<paddle::platform::ExportedStatValue<float>>` (pybind.cc).
    NOTE(review): restored from memory of the tree -- confirm before applying.
  * cmake/third_party.cmake: add_custom_command() is given two COMMENT
    arguments. The command documents a single COMMENT, so presumably only one
    message is shown -- consider merging them into one string.
  * test_exception.py: test_gpu_success() only asserts inside `except`. If
    core._test_enforce_gpu_success() raises nothing, the test passes without
    checking anything -- consider `with self.assertRaises(OSError)`.
  * enforce.cc: the `fin` declared inside each fallback `if` shadows the outer
    `fin`. Behaviour is as intended (a fresh stream per path), but distinct
    names or a loop over the three search paths would read more clearly.

 cmake/third_party.cmake                       | 16 ++--
 paddle/fluid/pybind/pybind.cc                 |  5 ++
 paddle/phi/core/enforce.cc                    | 73 ++++++++++---------
 python/paddle/fluid/core.py                   |  1 +
 .../fluid/tests/unittests/test_exception.py   | 17 +++++
 5 files changed, 70 insertions(+), 42 deletions(-)

diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake
index 28032f50726..8587d025dbb 100755
--- a/cmake/third_party.cmake
+++ b/cmake/third_party.cmake
@@ -333,23 +333,21 @@ if(WITH_GPU)
     ${URL} "externalError" MD5 a712a49384e77ca216ad866712f7cafa
   )# download file externalErrorMsg.tar.gz
   if(WITH_TESTING)
-    # copy externalErrorMsg.pb, just for unittest can get error message correctly.
+    # copy externalErrorMsg.pb for UnitTest
     set(SRC_DIR ${THIRD_PARTY_PATH}/externalError/data)
-    if(WIN32 AND (NOT "${CMAKE_GENERATOR}" STREQUAL "Ninja"))
-      set(DST_DIR1
-          ${CMAKE_BINARY_DIR}/paddle/fluid/third_party/externalError/data)
-    else()
-      set(DST_DIR1 ${CMAKE_BINARY_DIR}/paddle/third_party/externalError/data)
-    endif()
-    set(DST_DIR2
+    # for python UT 'test_exception.py'
+    set(DST_DIR1
         ${CMAKE_BINARY_DIR}/python/paddle/include/third_party/externalError/data
     )
+    # for C++ UT 'enforce_test'
+    set(DST_DIR2 ${CMAKE_BINARY_DIR}/paddle/third_party/externalError/data)
     add_custom_command(
       TARGET download_externalError
       POST_BUILD
       COMMAND ${CMAKE_COMMAND} -E copy_directory ${SRC_DIR} ${DST_DIR1}
       COMMAND ${CMAKE_COMMAND} -E copy_directory ${SRC_DIR} ${DST_DIR2}
-      COMMENT "copy_directory from ${SRC_DIR} to ${DST_DIR}")
+      COMMENT "copy_directory from ${SRC_DIR} to ${DST_DIR1}"
+      COMMENT "copy_directory from ${SRC_DIR} to ${DST_DIR2}")
   endif()
 endif()
 
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 7e09266271c..7db02616fa8 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -1924,6 +1924,11 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("_cuda_synchronize", [](const platform::CUDAPlace &place) {
     platform::DeviceContextPool::Instance().Get(place)->Wait();
   });
+  m.def("_test_enforce_gpu_success", []() {
+#if defined(PADDLE_WITH_CUDA)
+    PADDLE_ENFORCE_GPU_SUCCESS(cudaErrorInsufficientDriver);
+#endif
+  });
 
   m.def("get_float_stats", []() {
     std::vector<paddle::platform::ExportedStatValue<float>> float_stats;
diff --git a/paddle/phi/core/enforce.cc b/paddle/phi/core/enforce.cc
index 1291571141c..174326c2368 100644
--- a/paddle/phi/core/enforce.cc
+++ b/paddle/phi/core/enforce.cc
@@ -270,30 +270,27 @@ std::string GetExternalErrorMsg(T status) {
   bool _initSucceed = false;
   phi::proto::ExternalErrorDesc externalError;
   if (externalError.ByteSizeLong() == 0) {
-    std::string filePath;
+    std::string search_path_1;
+    std::string search_path_2;
+    std::string search_path_3;
 #if !defined(_WIN32)
     Dl_info info;
     if (dladdr(reinterpret_cast<void*>(GetCurrentTraceBackString), &info)) {
-      std::string strModule(info.dli_fname);
-      const size_t last_slash_idx = strModule.find_last_of("/");
-      std::string compare_path = strModule.substr(strModule.length() - 6);
+      std::string phi_so_path(info.dli_fname);
+      const size_t last_slash_idx = phi_so_path.find_last_of("/");
       if (std::string::npos != last_slash_idx) {
-        strModule.erase(last_slash_idx, std::string::npos);
-      }
-      // TODO(lizhiyu02): I don't know what the 'compare_path.compare("avx.so")
-      // == 0' means, while
-      // 'compare_path.find("dist-packages") != std::string::npos' means that
-      // after using 'pip install paddle'.
-      if (compare_path.compare("avx.so") == 0 ||
-          strModule.find("dist-packages") != std::string::npos) {
-        filePath =
-            strModule +
-            "/../include/third_party/externalError/data/externalErrorMsg.pb";
-      } else {
-        // Just for unittest
-        filePath = strModule +
-                   "/../third_party/externalError/data/externalErrorMsg.pb";
+        phi_so_path.erase(last_slash_idx, std::string::npos);
       }
+      // due to 'phi_so_path' may be 'site-packages/paddle/libs/libphi.so' or
+      // 'build/paddle/phi/libphi.so', we have different search path
+      search_path_1 =
+          phi_so_path +
+          "/../include/third_party/externalError/data/externalErrorMsg.pb";
+      search_path_2 = phi_so_path +
+                      "/../third_party/externalError/data/externalErrorMsg.pb";
+      search_path_3 =
+          phi_so_path +
+          "/../../third_party/externalError/data/externalErrorMsg.pb";
     }
 #else
     char buf[512];
@@ -303,24 +300,34 @@ std::string GetExternalErrorMsg(T status) {
             ? (HMODULE)mbi.AllocationBase
             : NULL;
     GetModuleFileName(h_module, buf, 512);
-    std::string strModule(buf);
-    const size_t last_slash_idx = strModule.find_last_of("\\");
-    std::string compare_path = strModule.substr(strModule.length() - 7);
+    std::string exe_path(buf);
+    const size_t last_slash_idx = exe_path.find_last_of("\\");
     if (std::string::npos != last_slash_idx) {
-      strModule.erase(last_slash_idx, std::string::npos);
-    }
-    if (strModule.find("dist-packages") != std::string::npos) {
-      filePath = strModule +
-                 "\\..\\include\\third_"
-                 "party\\externalerror\\data\\externalErrorMsg.pb";
-    } else {
-      filePath = strModule +
-                 "\\..\\..\\third_party"
-                 "\\externalerror\\data\\externalErrorMsg.pb";
+      exe_path.erase(last_slash_idx, std::string::npos);
     }
+    // due to 'exe_path' may be 'site-packages\\paddle\\fluid\\libpaddle.pyd' or
+    // 'build\\paddle\\fluid\\platform\\enforce_test.exe', we have different
+    // search path
+    search_path_1 =
+        exe_path +
+        "\\..\\include\\third_party\\externalError\\data\\externalErrorMsg.pb";
+    search_path_2 =
+        exe_path +
+        "\\..\\third_party\\externalError\\data\\externalErrorMsg.pb";
+    search_path_3 =
+        exe_path +
+        "\\..\\..\\third_party\\externalError\\data\\externalErrorMsg.pb";
 #endif
-    std::ifstream fin(filePath, std::ios::in | std::ios::binary);
+    std::ifstream fin(search_path_1, std::ios::in | std::ios::binary);
     _initSucceed = externalError.ParseFromIstream(&fin);
+    if (!_initSucceed) {
+      std::ifstream fin(search_path_2, std::ios::in | std::ios::binary);
+      _initSucceed = externalError.ParseFromIstream(&fin);
+    }
+    if (!_initSucceed) {
+      std::ifstream fin(search_path_3, std::ios::in | std::ios::binary);
+      _initSucceed = externalError.ParseFromIstream(&fin);
+    }
   }
   using __CUDA_STATUS_TYPE__ = decltype(status);
   phi::proto::ApiType proto_type =
diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py
index 1d587c44912..bcfd845d7fc 100644
--- a/python/paddle/fluid/core.py
+++ b/python/paddle/fluid/core.py
@@ -298,6 +298,7 @@ try:
     from .libpaddle import _set_paddle_lib_path
     from .libpaddle import _create_loaded_parameter
     from .libpaddle import _cuda_synchronize
+    from .libpaddle import _test_enforce_gpu_success
     from .libpaddle import _is_compiled_with_heterps
     from .libpaddle import _promote_types_if_complex_exists
     from .libpaddle import _set_cached_executor_build_strategy
diff --git a/python/paddle/fluid/tests/unittests/test_exception.py b/python/paddle/fluid/tests/unittests/test_exception.py
index fb8dd8e1d0f..5194acb9890 100644
--- a/python/paddle/fluid/tests/unittests/test_exception.py
+++ b/python/paddle/fluid/tests/unittests/test_exception.py
@@ -32,6 +32,23 @@ class TestException(unittest.TestCase):
 
         self.assertIsNotNone(exception)
 
+    def test_gpu_success(self):
+        if not paddle.is_compiled_with_cuda():
+            return
+
+        try:
+            core._test_enforce_gpu_success()
+        except Exception as e:
+            self.assertTrue(isinstance(e, OSError))
+            self.assertIn(
+                "CUDA error(35), CUDA driver version is insufficient for CUDA runtime version.",
+                str(e),
+            )
+            self.assertIn(
+                "[Hint: 'cudaErrorInsufficientDriver'. This indicates that the installed NVIDIA CUDA driver is older than the CUDA runtime library. This is not a supported configuration.Users should install an updated NVIDIA display driver to allow the application to run.]",
+                str(e),
+            )
+
 
 class TestExceptionNoCStack(unittest.TestCase):
     def setUp(self):
-- 
GitLab