diff --git a/CMakeLists.txt b/CMakeLists.txt
index de972698499c3335f571956ccba812adc71c5d5e..b78b7a9517c9dbc32f9d0f61ef77569caf7eea0b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -40,7 +40,8 @@ option(MGE_CUDA_USE_STATIC "Enable MegEngine CUDA static linking." ON)
 option(MGE_WITH_TRT "Build MegEngine with TensorRT." ON)
 option(MGE_WITH_CUDA_STUB "Build MegEngine with CUDA stub." ON)
 option(MGE_WITH_NVRTC_STUB "Build MegEngine with NVRTC stub." OFF)
-option(MGE_WITH_CUDNN_SHARED "Build MegEngine with CUDNN shared." OFF)
+option(MGE_WITH_CUDNN_SHARED "Build MegEngine with CUDNN shared." ON)
+option(MGE_WITH_CUBLAS_SHARED "Build MegEngine with CUBLAS shared." OFF)
 option(MGE_USE_SYSTEM_LIB "Build MegEngine with system libraries." OFF)
 option(MGB_WITH_FLATBUFFERS "Build MegBrain with FlatBuffers serialization support." ON)
 option(MGE_WITH_CAMBRICON "Build MegEngine with Cambricon support" OFF)
@@ -60,6 +61,11 @@ option(MGE_WITH_ROCM "Enable ROCM support" OFF)
 option(MGE_WITH_LARGE_ARCHIVE "Enable big archive link support" OFF)
 
+if(MSVC OR WIN32)
+    message(STATUS "Windows build: forcing static cuDNN linking")
+    set(MGE_WITH_CUDNN_SHARED OFF)
+endif()
+
 if(MGE_WITH_NVRTC_STUB OR MGE_WITH_CUDA_STUB)
     set(MGE_WITH_ANY_CUDA_STUB ON)
 else()
@@ -472,15 +478,28 @@ if(MGE_WITH_CUDA)
         endif()
     endif()
     if(MSVC OR WIN32)
-        list(APPEND MGE_CUDA_LIBS cusolver.lib cublas.lib curand.lib cudart_static.lib cusparse.lib)
+        list(APPEND MGE_CUDA_LIBS cusolver.lib curand.lib cudart_static.lib cusparse.lib)
+    else()
+        list(APPEND MGE_CUDA_LIBS cusolver_static curand_static culibos cudart_static cusparse_static)
+    endif()
+    if(MSVC OR WIN32)
+        list(APPEND MGE_CUDA_LIBS cublas.lib)
     else()
-        list(APPEND MGE_CUDA_LIBS cusolver_static cublas_static curand_static culibos cudart_static cusparse_static)
+        if(MGE_WITH_CUBLAS_SHARED)
+            list(APPEND MGE_CUDA_LIBS cublas)
+        else()
+            list(APPEND MGE_CUDA_LIBS cublas_static)
+        endif()
     endif()
     if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.1.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.1.0")
         if(MSVC OR WIN32)
             list(APPEND MGE_CUDA_LIBS cublasLt.lib)
         else()
-            list(APPEND MGE_CUDA_LIBS cublasLt_static)
+            if(MGE_WITH_CUBLAS_SHARED)
+                list(APPEND MGE_CUDA_LIBS cublasLt)
+            else()
+                list(APPEND MGE_CUDA_LIBS cublasLt_static)
+            endif()
         endif()
     endif()
     if((${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.0.0") AND NOT MSVC AND NOT WIN32)
diff --git a/dnn/src/cuda/handle.cpp b/dnn/src/cuda/handle.cpp
index fe0650aa7ed0235926a1f348f501ec43a52c27cb..e81241fbaaf3be6e7c8b9140e3288a2a059c76f5 100644
--- a/dnn/src/cuda/handle.cpp
+++ b/dnn/src/cuda/handle.cpp
@@ -54,6 +54,12 @@ HandleImpl::HandleImpl(megcoreComputingHandle_t comp_handle):
 #if CUDA_VERSION >= 10010
     megdnn_assert(cublasLtGetVersion() >= 10010,
                   "cuda library version is too low to run cublasLt");
+#endif
+#if CUDNN_VERSION >= 8000
+    megdnn_log_warn(R"(
+        cuDNN 8 JIT-compiles PTX code and caches the result. Set the CUDA_CACHE_MAXSIZE and
+        CUDA_CACHE_PATH environment variables to avoid repeated (very slow) JIT compilation.
+        For example: `export CUDA_CACHE_MAXSIZE=2147483647` and `export CUDA_CACHE_PATH=/data/.cuda_cache`)");
 #endif
     cudnn_check(cudnnCreate(&m_cudnn_handle));
     cublas_check(cublasCreate(&m_cublas_handle));
diff --git a/imperative/python/test/integration/test_dp_correctness.py b/imperative/python/test/integration/test_dp_correctness.py
index 1ef34175380e747b14bdb6c9d2a8f82fe9e45413..f4e986329f1c617385238e348d87a3544b241e66 100644
--- a/imperative/python/test/integration/test_dp_correctness.py
+++ b/imperative/python/test/integration/test_dp_correctness.py
@@ -199,4 +199,4 @@ def test_dp_correctness():
    model_name = "mnist_model_with_test.mge"
    model_path = os.path.join(os.path.dirname(__file__), model_name)
    set_execution_strategy("HEURISTIC_REPRODUCIBLE")
-    run_test(model_path, False, False, max_err=1e-5)
+    run_test(model_path, False, False, max_err=5e-5)
diff --git a/imperative/python/test/unit/utils/test_network_node.py b/imperative/python/test/unit/utils/test_network_node.py
index d293c3c475f59b42e34c91f42ff649e198eb7c4f..34e7c599f31b311e4ab7b588f21287a43ec81d67 100644
--- a/imperative/python/test/unit/utils/test_network_node.py
+++ b/imperative/python/test/unit/utils/test_network_node.py
@@ -22,7 +22,7 @@ from megengine.utils.comp_graph_tools import GraphInference
 from megengine.utils.network import Network as Net
 
 
-def check_pygraph_dump(trace_func, inp_data, expect_results):
+def check_pygraph_dump(trace_func, inp_data, expect_results, max_err=None):
     orig_model = io.BytesIO()
     inp_size = len(inp_data)
     out_size = len(expect_results)
@@ -46,7 +46,12 @@ def check_pygraph_dump(trace_func, inp_data, expect_results):
     results = graph.run(inp_dict=inp_dict)
 
     for ind, tensor in enumerate(expect_results):
-        np.testing.assert_equal(tensor.numpy(), results[output_names[ind]])
+        if max_err:
+            np.testing.assert_almost_equal(
+                tensor.numpy(), results[output_names[ind]], max_err
+            )
+        else:
+            np.testing.assert_equal(tensor.numpy(), results[output_names[ind]])
         assert tensor.dtype == results[output_names[ind]].dtype
 
 
@@ -178,7 +183,8 @@ def test_convtranspose():
     data = Tensor(np.random.random((1, 32, 32, 32)))
     result = fwd(data)
 
-    check_pygraph_dump(fwd, [data], [result])
+    # results differ by ~1e-7 on cu111, so compare to 5 decimal places
+    check_pygraph_dump(fwd, [data], [result], 5)
 
 
 @pytest.mark.skip(reason="pytest aborted")
diff --git a/scripts/whl/manylinux2014/build_wheel_common.sh b/scripts/whl/manylinux2014/build_wheel_common.sh
index b1d39fa626ec9c2956d1fd81d639d6ae67fa154c..06303852660766edfec5d490ec354b88a9c709c8 100755
--- a/scripts/whl/manylinux2014/build_wheel_common.sh
+++ b/scripts/whl/manylinux2014/build_wheel_common.sh
@@ -31,7 +31,7 @@ echo "Build with ${SDK_NAME}"
 
 if [ $SDK_NAME == "cu101" ];then
     CUDA_COPY_LIB_LIST="${CUDA_LIB_DIR}/libnvrtc.so.10.1"
-    EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=OFF"
+    EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=OFF -DMGE_WITH_CUBLAS_SHARED=OFF"
     BUILD_GCC8="ON"
     REQUIR_CUDA_VERSION="10010"
     REQUIR_CUDNN_VERSION="7.6.3"
@@ -49,7 +49,7 @@ elif [ $SDK_NAME == "cu111" ];then
 ${CUDNN_LIB_DIR}/libcudnn_ops_infer.so.8:\
 ${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\
 ${CUDNN_LIB_DIR}/libcudnn.so.8"
-    EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON\
+    EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON -DMGE_WITH_CUBLAS_SHARED=ON \
     -gencode arch=compute_61,code=sm_61 \
     arch=compute_70,code=sm_70 \
     arch=compute_75,code=sm_75 \
@@ -72,7 +72,7 @@ elif [ $SDK_NAME == "cu112" ];then
 ${CUDNN_LIB_DIR}/libcudnn_ops_infer.so.8:\
 ${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\
 ${CUDNN_LIB_DIR}/libcudnn.so.8"
-    EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON \
+    EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON -DMGE_WITH_CUBLAS_SHARED=ON \
     -gencode arch=compute_61,code=sm_61 \
     arch=compute_70,code=sm_70 \
     arch=compute_75,code=sm_75 \
diff --git a/src/core/impl/comp_node_env.cpp b/src/core/impl/comp_node_env.cpp
index 3ca26df4337b3788c04f4bd084108b4ff45f1fb5..41594ed90a9896905e96ab8cdd9c16e8842ce880 100644
--- a/src/core/impl/comp_node_env.cpp
+++ b/src/core/impl/comp_node_env.cpp
@@ -214,6 +214,8 @@ void CompNodeEnv::init_cuda_async(int dev, CompNode comp_node,
         mgb_assert(
                 m_property.mem_alignment ==
                 MegDNNHandle::get(*this).handle()->alignment_requirement());
+        auto err = atexit(&CompNode::finalize);
+        mgb_assert(!err, "failed to register CompNode::finalize at exit");
     }
     MGB_CATCH(std::exception & exc, {
         mgb_log_error("async cuda init failed: %s", exc.what());
@@ -304,6 +306,8 @@ void CompNodeEnv::init_rocm_async(int dev, CompNode comp_node,
         mgb_assert(
                 m_property.mem_alignment ==
                 MegDNNHandle::get(*this).handle()->alignment_requirement());
+        auto err = atexit(&CompNode::finalize);
+        mgb_assert(!err, "failed to register CompNode::finalize at exit");
     }
     MGB_CATCH(std::exception & exc, {
         mgb_log_error("async rocm init failed: %s", exc.what());
diff --git a/src/gopt/test/inference.cpp b/src/gopt/test/inference.cpp
index c18e774ffd68448ed70df8039bd6abbdba59f0ce..5646ea3d741348464a75ab40bd4ef2d88a9f57c6 100644
--- a/src/gopt/test/inference.cpp
+++ b/src/gopt/test/inference.cpp
@@ -1850,8 +1850,6 @@ TEST(TestEnableTensorCore, SmallInputShape) {
     MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
 }
 
-//! close for cu111 ci, reopen it when bug fixed
-#if CUDA_VERSION < 11000
 TEST(TestEnableTensorCore, Nchw4Nchw) {
     REQUIRE_GPU(1);
     auto cn = CompNode::load("gpu0");
@@ -1957,7 +1955,6 @@ TEST(TestEnableTensorCore, Nchw4Nchw) {
         MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
     }
 }
-#endif
 
 TEST(TestEnableTensorCore, ConvBiasWithZ) {
     REQUIRE_GPU(1);
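
A minimal sketch of how the new flags could be exercised in a local CUDA build. The option names come from the CMakeLists.txt change above; the checkout path, build directory, and use of `cmake --build` are assumptions for illustration, not part of the patch:

    # Configure with cuDNN and cuBLAS/cuBLASLt linked dynamically, mirroring
    # the cu111/cu112 wheel settings in build_wheel_common.sh.
    mkdir -p build && cd build
    cmake /path/to/MegEngine \
        -DMGE_WITH_CUDA=ON \
        -DMGE_WITH_CUDNN_SHARED=ON \
        -DMGE_WITH_CUBLAS_SHARED=ON
    cmake --build . --parallel

    # With cuDNN >= 8, pre-size the on-disk JIT cache as the new runtime
    # warning in dnn/src/cuda/handle.cpp suggests:
    export CUDA_CACHE_MAXSIZE=2147483647
    export CUDA_CACHE_PATH=/data/.cuda_cache

Note that on Windows the new top-level CMake check forces MGE_WITH_CUDNN_SHARED back to OFF and cublas.lib is linked unconditionally, so MGE_WITH_CUBLAS_SHARED only affects non-Windows builds.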