Commit 04b1a45a authored by Megvii Engine Team

fix(dnn): fix cudnn crash when finalize called after cudnn dtor

GitOrigin-RevId: b0ad639921e8ba1e370696f16a9d87024a83f4c9
Parent 14a089c4
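Why the crash happens, in brief: at process exit, cuDNN's statically-initialized internal state can be destroyed before MegEngine runs `CompNode::finalize`, so finalization then touches an already-dead cuDNN context. The fix (see the `comp_node_env.cpp` hunks below) registers `CompNode::finalize` with `atexit` during device initialization; an `atexit` handler registered after a static object finishes construction runs before that object's destructor, so finalize runs while cuDNN is still alive. A minimal sketch of that ordering, where every name except `atexit` is an illustrative stand-in rather than a MegEngine or cuDNN symbol:

```cpp
#include <cstdio>
#include <cstdlib>

// Illustrative stand-ins for cuDNN's internal state and CompNode::finalize.
struct Library {
    ~Library() { std::puts("library state destroyed"); }
    void release_handle() { std::puts("handle released while library is alive"); }
};
static Library g_library;  // torn down during static destruction, like cuDNN's singletons

static void finalize() {   // plays the role of CompNode::finalize
    g_library.release_handle();
}

int main() {
    // Registered after g_library was constructed, so the runtime calls
    // finalize() before ~Library(): the handle is released first.
    if (atexit(&finalize) != 0)
        return 1;
    return 0;
}
```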
@@ -40,7 +40,8 @@ option(MGE_CUDA_USE_STATIC "Enable MegEngine CUDA static linking." ON)
 option(MGE_WITH_TRT "Build MegEngine with TensorRT." ON)
 option(MGE_WITH_CUDA_STUB "Build MegEngine with CUDA stub." ON)
 option(MGE_WITH_NVRTC_STUB "Build MegEngine with NVRTC stub." OFF)
-option(MGE_WITH_CUDNN_SHARED "Build MegEngine with CUDNN shared." OFF)
+option(MGE_WITH_CUDNN_SHARED "Build MegEngine with CUDNN shared." ON)
+option(MGE_WITH_CUBLAS_SHARED "Build MegEngine with CUBLAS shared." OFF)
 option(MGE_USE_SYSTEM_LIB "Build MegEngine with system libraries." OFF)
 option(MGB_WITH_FLATBUFFERS "Build MegBrain with FlatBuffers serialization support." ON)
 option(MGE_WITH_CAMBRICON "Build MegEngine with Cambricon support" OFF)
@@ -60,6 +61,11 @@ option(MGE_WITH_ROCM "Enable ROCM support" OFF)
 option(MGE_WITH_LARGE_ARCHIVE "Enable big archive link support" OFF)
+if(MSVC OR WIN32)
+    message(STATUS "windows force cudnn static link")
+    set(MGE_WITH_CUDNN_SHARED OFF)
+endif()
+
 if(MGE_WITH_NVRTC_STUB OR MGE_WITH_CUDA_STUB)
     set(MGE_WITH_ANY_CUDA_STUB ON)
 else()
@@ -472,15 +478,28 @@ if(MGE_WITH_CUDA)
         endif()
     endif()
     if(MSVC OR WIN32)
-        list(APPEND MGE_CUDA_LIBS cusolver.lib cublas.lib curand.lib cudart_static.lib cusparse.lib)
+        list(APPEND MGE_CUDA_LIBS cusolver.lib curand.lib cudart_static.lib cusparse.lib)
+    else()
+        list(APPEND MGE_CUDA_LIBS cusolver_static curand_static culibos cudart_static cusparse_static)
+    endif()
+    if(MSVC OR WIN32)
+        list(APPEND MGE_CUDA_LIBS cublas.lib)
     else()
-        list(APPEND MGE_CUDA_LIBS cusolver_static cublas_static curand_static culibos cudart_static cusparse_static)
+        if(MGE_WITH_CUBLAS_SHARED)
+            list(APPEND MGE_CUDA_LIBS cublas)
+        else()
+            list(APPEND MGE_CUDA_LIBS cublas_static)
+        endif()
     endif()
     if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.1.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.1.0")
         if(MSVC OR WIN32)
             list(APPEND MGE_CUDA_LIBS cublasLt.lib)
         else()
-            list(APPEND MGE_CUDA_LIBS cublasLt_static)
+            if(MGE_WITH_CUBLAS_SHARED)
+                list(APPEND MGE_CUDA_LIBS cublasLt)
+            else()
+                list(APPEND MGE_CUDA_LIBS cublasLt_static)
+            endif()
         endif()
     endif()
     if((${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.0.0") AND NOT MSVC AND NOT WIN32)
...
@@ -54,6 +54,12 @@ HandleImpl::HandleImpl(megcoreComputingHandle_t comp_handle):
 #if CUDA_VERSION >= 10010
     megdnn_assert(cublasLtGetVersion() >= 10010,
             "cuda library version is too low to run cublasLt");
 #endif
+#if CUDNN_VERSION >= 8000
+    megdnn_log_warn(R"(
+        Cudnn8 will jit ptx code with cache. You can set
+        CUDA_CACHE_MAXSIZE and CUDA_CACHE_PATH environment var to avoid repeat jit(very slow).
+        For example `export CUDA_CACHE_MAXSIZE=2147483647` and `export CUDA_CACHE_PATH=/data/.cuda_cache`)");
+#endif
     cudnn_check(cudnnCreate(&m_cudnn_handle));
     cublas_check(cublasCreate(&m_cublas_handle));
...
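The new warning only recommends setting `CUDA_CACHE_MAXSIZE` and `CUDA_CACHE_PATH`; the CUDA driver reads those variables by itself, and nothing in the handle constructor sets them. For a launcher that wants a pre-flight check, a sketch might look like the following; `report_cuda_cache_config` is hypothetical and not part of MegEngine:

```cpp
#include <cstdio>
#include <cstdlib>

// Hypothetical pre-flight check: report whether the CUDA JIT cache variables
// recommended by the warning above are set. The CUDA driver reads these
// variables itself; this code only inspects the environment.
static void report_cuda_cache_config() {
    const char* max_size = std::getenv("CUDA_CACHE_MAXSIZE");
    const char* path = std::getenv("CUDA_CACHE_PATH");
    if (!max_size || !path) {
        std::fprintf(stderr,
                "CUDA_CACHE_MAXSIZE/CUDA_CACHE_PATH not set; cuDNN 8 may "
                "re-JIT PTX on every run, which is very slow\n");
    } else {
        std::fprintf(stderr, "CUDA JIT cache: up to %s bytes under %s\n",
                max_size, path);
    }
}

int main() {
    report_cuda_cache_config();  // call before creating the first cuDNN handle
    return 0;
}
```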
@@ -199,4 +199,4 @@ def test_dp_correctness():
     model_name = "mnist_model_with_test.mge"
     model_path = os.path.join(os.path.dirname(__file__), model_name)
     set_execution_strategy("HEURISTIC_REPRODUCIBLE")
-    run_test(model_path, False, False, max_err=1e-5)
+    run_test(model_path, False, False, max_err=5e-5)
@@ -22,7 +22,7 @@ from megengine.utils.comp_graph_tools import GraphInference
 from megengine.utils.network import Network as Net


-def check_pygraph_dump(trace_func, inp_data, expect_results):
+def check_pygraph_dump(trace_func, inp_data, expect_results, max_err=None):
     orig_model = io.BytesIO()
     inp_size = len(inp_data)
     out_size = len(expect_results)
@@ -46,7 +46,12 @@ def check_pygraph_dump(trace_func, inp_data, expect_results):
     results = graph.run(inp_dict=inp_dict)
     for ind, tensor in enumerate(expect_results):
-        np.testing.assert_equal(tensor.numpy(), results[output_names[ind]])
+        if max_err:
+            np.testing.assert_almost_equal(
+                tensor.numpy(), results[output_names[ind]], max_err
+            )
+        else:
+            np.testing.assert_equal(tensor.numpy(), results[output_names[ind]])
         assert tensor.dtype == results[output_names[ind]].dtype
@@ -178,7 +183,8 @@ def test_convtranspose():
     data = Tensor(np.random.random((1, 32, 32, 32)))
     result = fwd(data)
-    check_pygraph_dump(fwd, [data], [result])
+    # cu111 has 1e-7 diff
+    check_pygraph_dump(fwd, [data], [result], 5)


 @pytest.mark.skip(reason="pytest aborted")
...
@@ -31,7 +31,7 @@ echo "Build with ${SDK_NAME}"
 if [ $SDK_NAME == "cu101" ];then
     CUDA_COPY_LIB_LIST="${CUDA_LIB_DIR}/libnvrtc.so.10.1"
-    EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=OFF"
+    EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=OFF -DMGE_WITH_CUBLAS_SHARED=OFF"
     BUILD_GCC8="ON"
     REQUIR_CUDA_VERSION="10010"
     REQUIR_CUDNN_VERSION="7.6.3"
@@ -49,7 +49,7 @@ elif [ $SDK_NAME == "cu111" ];then
 ${CUDNN_LIB_DIR}/libcudnn_ops_infer.so.8:\
 ${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\
 ${CUDNN_LIB_DIR}/libcudnn.so.8"
-    EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON\
+    EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON -DMGE_WITH_CUBLAS_SHARED=ON \
     -gencode arch=compute_61,code=sm_61 \
              arch=compute_70,code=sm_70 \
              arch=compute_75,code=sm_75 \
@@ -72,7 +72,7 @@ elif [ $SDK_NAME == "cu112" ];then
 ${CUDNN_LIB_DIR}/libcudnn_ops_infer.so.8:\
 ${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\
 ${CUDNN_LIB_DIR}/libcudnn.so.8"
-    EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON \
+    EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON -DMGE_WITH_CUBLAS_SHARED=ON \
     -gencode arch=compute_61,code=sm_61 \
              arch=compute_70,code=sm_70 \
              arch=compute_75,code=sm_75 \
...
@@ -214,6 +214,8 @@ void CompNodeEnv::init_cuda_async(int dev, CompNode comp_node,
         mgb_assert(
                 m_property.mem_alignment ==
                 MegDNNHandle::get(*this).handle()->alignment_requirement());
+        auto err = atexit(&CompNode::finalize);
+        mgb_assert(!err, "failed to register CompNode::finalize at exit");
     }
     MGB_CATCH(std::exception & exc, {
         mgb_log_error("async cuda init failed: %s", exc.what());
@@ -304,6 +306,8 @@ void CompNodeEnv::init_rocm_async(int dev, CompNode comp_node,
         mgb_assert(
                 m_property.mem_alignment ==
                 MegDNNHandle::get(*this).handle()->alignment_requirement());
+        auto err = atexit(&CompNode::finalize);
+        mgb_assert(!err, "failed to register CompNode::finalize at exit");
     }
     MGB_CATCH(std::exception & exc, {
         mgb_log_error("async rocm init failed: %s", exc.what());
...
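Both `init_cuda_async` and `init_rocm_async` register the same handler, and `atexit` calls a function once per registration, in reverse order of registration. The diff does not show whether `CompNode::finalize` guards against being invoked twice; the sketch below assumes it does, via a simple flag, and all names other than `atexit` are illustrative:

```cpp
#include <cstdio>
#include <cstdlib>

// atexit handlers run in reverse order of registration, and a handler
// registered twice runs twice -- so a finalize routine registered from
// both the CUDA and the ROCm init path must be safe to call again.
static bool g_finalized = false;

static void finalize() {   // stand-in for CompNode::finalize
    if (g_finalized) {     // second invocation becomes a no-op
        std::puts("finalize: already done, skipping");
        return;
    }
    g_finalized = true;
    std::puts("finalize: releasing resources");
}

int main() {
    atexit(&finalize);     // e.g. from init_cuda_async
    atexit(&finalize);     // e.g. from init_rocm_async
    return 0;              // prints "releasing resources", then "skipping"
}
```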
@@ -1850,8 +1850,6 @@ TEST(TestEnableTensorCore, SmallInputShape) {
     MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
 }

-//! close for cu111 ci, reopen it when bug fixed
-#if CUDA_VERSION < 11000
 TEST(TestEnableTensorCore, Nchw4Nchw) {
     REQUIRE_GPU(1);
     auto cn = CompNode::load("gpu0");
@@ -1957,7 +1955,6 @@ TEST(TestEnableTensorCore, Nchw4Nchw) {
     MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
 }
 }
-#endif

 TEST(TestEnableTensorCore, ConvBiasWithZ) {
     REQUIRE_GPU(1);
...