Commit 04b1a45a authored by Megvii Engine Team

fix(dnn): fix cudnn crash when finalize called after cudnn dtor

GitOrigin-RevId: b0ad639921e8ba1e370696f16a9d87024a83f4c9
Parent 14a089c4
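The crash addressed here happens when CompNode::finalize runs after cuDNN has already torn itself down at process exit. The diff fixes this by registering finalize with atexit during CUDA/ROCm async init (see comp_node_env.cpp below), relying on atexit's LIFO ordering: a hook registered after the library is initialized runs before the library's own exit-time cleanup. A minimal standalone sketch of that ordering idea — not MegEngine code; fake_cudnn_init, fake_cudnn_teardown, and finalize are hypothetical names:

```cpp
#include <cstdio>
#include <cstdlib>

// stands in for the library's own exit-time teardown
static void fake_cudnn_teardown() { std::printf("library teardown\n"); }
// stands in for CompNode::finalize, which must release handles first
static void finalize() { std::printf("finalize (releases handles first)\n"); }

static void fake_cudnn_init() {
    // the library registers its teardown when it is first used
    std::atexit(&fake_cudnn_teardown);
}

int main() {
    fake_cudnn_init();       // library initialized first ...
    std::atexit(&finalize);  // ... our hook registered afterwards, so at exit
                             // it runs first (LIFO), before the library
                             // tears itself down
    return 0;
}
```

Running this prints "finalize (releases handles first)" before "library teardown", which mirrors why the diff registers the hook only once CUDA init has succeeded.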
......@@ -40,7 +40,8 @@ option(MGE_CUDA_USE_STATIC "Enable MegEngine CUDA static linking." ON)
option(MGE_WITH_TRT "Build MegEngine with TensorRT." ON)
option(MGE_WITH_CUDA_STUB "Build MegEngine with CUDA stub." ON)
option(MGE_WITH_NVRTC_STUB "Build MegEngine with NVRTC stub." OFF)
option(MGE_WITH_CUDNN_SHARED "Build MegEngine with CUDNN shared." OFF)
option(MGE_WITH_CUDNN_SHARED "Build MegEngine with CUDNN shared." ON)
option(MGE_WITH_CUBLAS_SHARED "Build MegEngine with CUBLAS shared." OFF)
option(MGE_USE_SYSTEM_LIB "Build MegEngine with system libraries." OFF)
option(MGB_WITH_FLATBUFFERS "Build MegBrain with FlatBuffers serialization support." ON)
option(MGE_WITH_CAMBRICON "Build MegEngine with Cambricon support" OFF)
......@@ -60,6 +61,11 @@ option(MGE_WITH_ROCM "Enable ROCM support" OFF)
option(MGE_WITH_LARGE_ARCHIVE "Enable big archive link support" OFF)
if(MSVC OR WIN32)
message(STATUS "windows force cudnn static link")
set(MGE_WITH_CUDNN_SHARED OFF)
endif()
if(MGE_WITH_NVRTC_STUB OR MGE_WITH_CUDA_STUB)
set(MGE_WITH_ANY_CUDA_STUB ON)
else()
......@@ -472,15 +478,28 @@ if(MGE_WITH_CUDA)
endif()
endif()
if(MSVC OR WIN32)
list(APPEND MGE_CUDA_LIBS cusolver.lib cublas.lib curand.lib cudart_static.lib cusparse.lib)
list(APPEND MGE_CUDA_LIBS cusolver.lib curand.lib cudart_static.lib cusparse.lib)
else()
list(APPEND MGE_CUDA_LIBS cusolver_static curand_static culibos cudart_static cusparse_static)
endif()
if(MSVC OR WIN32)
list(APPEND MGE_CUDA_LIBS cublas.lib)
else()
list(APPEND MGE_CUDA_LIBS cusolver_static cublas_static curand_static culibos cudart_static cusparse_static)
if(MGE_WITH_CUBLAS_SHARED)
list(APPEND MGE_CUDA_LIBS cublas)
else()
list(APPEND MGE_CUDA_LIBS cublas_static)
endif()
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.1.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.1.0")
if(MSVC OR WIN32)
list(APPEND MGE_CUDA_LIBS cublasLt.lib)
else()
list(APPEND MGE_CUDA_LIBS cublasLt_static)
if(MGE_WITH_CUBLAS_SHARED)
list(APPEND MGE_CUDA_LIBS cublasLt)
else()
list(APPEND MGE_CUDA_LIBS cublasLt_static)
endif()
endif()
endif()
if((${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.0.0") AND NOT MSVC AND NOT WIN32)
......
......@@ -54,6 +54,12 @@ HandleImpl::HandleImpl(megcoreComputingHandle_t comp_handle):
#if CUDA_VERSION >= 10010
megdnn_assert(cublasLtGetVersion() >= 10010,
"cuda library version is too low to run cublasLt");
#endif
#if CUDNN_VERSION >= 8000
megdnn_log_warn(R"(
cuDNN 8 JIT-compiles PTX code and caches the result. You can set the
CUDA_CACHE_MAXSIZE and CUDA_CACHE_PATH environment variables to avoid repeated JIT compilation (which is very slow).
For example `export CUDA_CACHE_MAXSIZE=2147483647` and `export CUDA_CACHE_PATH=/data/.cuda_cache`)");
#endif
cudnn_check(cudnnCreate(&m_cudnn_handle));
cublas_check(cublasCreate(&m_cublas_handle));
......
......@@ -199,4 +199,4 @@ def test_dp_correctness():
model_name = "mnist_model_with_test.mge"
model_path = os.path.join(os.path.dirname(__file__), model_name)
set_execution_strategy("HEURISTIC_REPRODUCIBLE")
run_test(model_path, False, False, max_err=1e-5)
run_test(model_path, False, False, max_err=5e-5)
......@@ -22,7 +22,7 @@ from megengine.utils.comp_graph_tools import GraphInference
from megengine.utils.network import Network as Net
def check_pygraph_dump(trace_func, inp_data, expect_results):
def check_pygraph_dump(trace_func, inp_data, expect_results, max_err=None):
orig_model = io.BytesIO()
inp_size = len(inp_data)
out_size = len(expect_results)
......@@ -46,7 +46,12 @@ def check_pygraph_dump(trace_func, inp_data, expect_results):
results = graph.run(inp_dict=inp_dict)
for ind, tensor in enumerate(expect_results):
np.testing.assert_equal(tensor.numpy(), results[output_names[ind]])
if max_err:
np.testing.assert_almost_equal(
tensor.numpy(), results[output_names[ind]], max_err
)
else:
np.testing.assert_equal(tensor.numpy(), results[output_names[ind]])
assert tensor.dtype == results[output_names[ind]].dtype
......@@ -178,7 +183,8 @@ def test_convtranspose():
data = Tensor(np.random.random((1, 32, 32, 32)))
result = fwd(data)
check_pygraph_dump(fwd, [data], [result])
# on cu111 the result differs by ~1e-7, so compare to 5 decimal places instead of exact equality
check_pygraph_dump(fwd, [data], [result], 5)
@pytest.mark.skip(reason="pytest aborted")
......
......@@ -31,7 +31,7 @@ echo "Build with ${SDK_NAME}"
if [ $SDK_NAME == "cu101" ];then
CUDA_COPY_LIB_LIST="${CUDA_LIB_DIR}/libnvrtc.so.10.1"
EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=OFF"
EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=OFF -DMGE_WITH_CUBLAS_SHARED=OFF"
BUILD_GCC8="ON"
REQUIR_CUDA_VERSION="10010"
REQUIR_CUDNN_VERSION="7.6.3"
......@@ -49,7 +49,7 @@ elif [ $SDK_NAME == "cu111" ];then
${CUDNN_LIB_DIR}/libcudnn_ops_infer.so.8:\
${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\
${CUDNN_LIB_DIR}/libcudnn.so.8"
EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON\
EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON -DMGE_WITH_CUBLAS_SHARED=ON \
-gencode arch=compute_61,code=sm_61 \
arch=compute_70,code=sm_70 \
arch=compute_75,code=sm_75 \
......@@ -72,7 +72,7 @@ elif [ $SDK_NAME == "cu112" ];then
${CUDNN_LIB_DIR}/libcudnn_ops_infer.so.8:\
${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\
${CUDNN_LIB_DIR}/libcudnn.so.8"
EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON \
EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON -DMGE_WITH_CUBLAS_SHARED=ON \
-gencode arch=compute_61,code=sm_61 \
arch=compute_70,code=sm_70 \
arch=compute_75,code=sm_75 \
......
......@@ -214,6 +214,8 @@ void CompNodeEnv::init_cuda_async(int dev, CompNode comp_node,
mgb_assert(
m_property.mem_alignment ==
MegDNNHandle::get(*this).handle()->alignment_requirement());
auto err = atexit(&CompNode::finalize);
mgb_assert(!err, "failed to register CompNode::finalize at exit");
}
MGB_CATCH(std::exception & exc, {
mgb_log_error("async cuda init failed: %s", exc.what());
......@@ -304,6 +306,8 @@ void CompNodeEnv::init_rocm_async(int dev, CompNode comp_node,
mgb_assert(
m_property.mem_alignment ==
MegDNNHandle::get(*this).handle()->alignment_requirement());
auto err = atexit(&CompNode::finalize);
mgb_assert(!err, "failed to register CompNode::finalize at exit");
}
MGB_CATCH(std::exception & exc, {
mgb_log_error("async rocm init failed: %s", exc.what());
......
......@@ -1850,8 +1850,6 @@ TEST(TestEnableTensorCore, SmallInputShape) {
MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
}
//! close for cu111 ci, reopen it when bug fixed
#if CUDA_VERSION < 11000
TEST(TestEnableTensorCore, Nchw4Nchw) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
......@@ -1957,7 +1955,6 @@ TEST(TestEnableTensorCore, Nchw4Nchw) {
MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
}
}
#endif
TEST(TestEnableTensorCore, ConvBiasWithZ) {
REQUIRE_GPU(1);
......