diff --git a/CMakeLists.txt b/CMakeLists.txt
index de972698499c3335f571956ccba812adc71c5d5e..b78b7a9517c9dbc32f9d0f61ef77569caf7eea0b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -40,7 +40,8 @@ option(MGE_CUDA_USE_STATIC "Enable MegEngine CUDA static linking." ON)
 option(MGE_WITH_TRT "Build MegEngine with TensorRT." ON)
 option(MGE_WITH_CUDA_STUB "Build MegEngine with CUDA stub." ON)
 option(MGE_WITH_NVRTC_STUB "Build MegEngine with NVRTC stub." OFF)
-option(MGE_WITH_CUDNN_SHARED "Build MegEngine with CUDNN shared." OFF)
+option(MGE_WITH_CUDNN_SHARED "Build MegEngine with CUDNN shared." ON)
+option(MGE_WITH_CUBLAS_SHARED "Build MegEngine with CUBLAS shared." OFF)
 option(MGE_USE_SYSTEM_LIB "Build MegEngine with system libraries." OFF)
 option(MGB_WITH_FLATBUFFERS "Build MegBrain with FlatBuffers serialization support." ON)
 option(MGE_WITH_CAMBRICON "Build MegEngine with Cambricon support" OFF)
@@ -60,6 +61,11 @@ option(MGE_WITH_ROCM "Enable ROCM support" OFF)
 option(MGE_WITH_LARGE_ARCHIVE "Enable big archive link support" OFF)
 
+if(MSVC OR WIN32)
+    message(STATUS "Windows build: forcing static cuDNN linking")
+    set(MGE_WITH_CUDNN_SHARED OFF)
+endif()
+
 if(MGE_WITH_NVRTC_STUB OR MGE_WITH_CUDA_STUB)
     set(MGE_WITH_ANY_CUDA_STUB ON)
 else()
@@ -472,15 +478,28 @@ if(MGE_WITH_CUDA)
         endif()
     endif()
     if(MSVC OR WIN32)
-        list(APPEND MGE_CUDA_LIBS cusolver.lib cublas.lib curand.lib cudart_static.lib cusparse.lib)
+        list(APPEND MGE_CUDA_LIBS cusolver.lib curand.lib cudart_static.lib cusparse.lib)
+    else()
+        list(APPEND MGE_CUDA_LIBS cusolver_static curand_static culibos cudart_static cusparse_static)
+    endif()
+    if(MSVC OR WIN32)
+        list(APPEND MGE_CUDA_LIBS cublas.lib)
     else()
-        list(APPEND MGE_CUDA_LIBS cusolver_static cublas_static curand_static culibos cudart_static cusparse_static)
+        if(MGE_WITH_CUBLAS_SHARED)
+            list(APPEND MGE_CUDA_LIBS cublas)
+        else()
+            list(APPEND MGE_CUDA_LIBS cublas_static)
+        endif()
     endif()
     if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.1.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.1.0")
         if(MSVC OR WIN32)
             list(APPEND MGE_CUDA_LIBS cublasLt.lib)
         else()
-            list(APPEND MGE_CUDA_LIBS cublasLt_static)
+            if(MGE_WITH_CUBLAS_SHARED)
+                list(APPEND MGE_CUDA_LIBS cublasLt)
+            else()
+                list(APPEND MGE_CUDA_LIBS cublasLt_static)
+            endif()
         endif()
     endif()
     if((${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.0.0") AND NOT MSVC AND NOT WIN32)
diff --git a/dnn/src/cuda/handle.cpp b/dnn/src/cuda/handle.cpp
index fe0650aa7ed0235926a1f348f501ec43a52c27cb..e81241fbaaf3be6e7c8b9140e3288a2a059c76f5 100644
--- a/dnn/src/cuda/handle.cpp
+++ b/dnn/src/cuda/handle.cpp
@@ -54,6 +54,12 @@ HandleImpl::HandleImpl(megcoreComputingHandle_t comp_handle):
 #if CUDA_VERSION >= 10010
     megdnn_assert(cublasLtGetVersion() >= 10010,
                   "cuda library version is too low to run cublasLt");
+#endif
+#if CUDNN_VERSION >= 8000
+    megdnn_log_warn(R"(
+        cuDNN 8 JIT-compiles PTX code and caches the result. Set the CUDA_CACHE_MAXSIZE and
+        CUDA_CACHE_PATH environment variables to avoid repeated (very slow) JIT compilation.
+        For example: `export CUDA_CACHE_MAXSIZE=2147483647` and `export CUDA_CACHE_PATH=/data/.cuda_cache`)");
 #endif
     cudnn_check(cudnnCreate(&m_cudnn_handle));
     cublas_check(cublasCreate(&m_cublas_handle));
diff --git a/imperative/python/test/integration/test_dp_correctness.py b/imperative/python/test/integration/test_dp_correctness.py
index 1ef34175380e747b14bdb6c9d2a8f82fe9e45413..f4e986329f1c617385238e348d87a3544b241e66 100644
--- a/imperative/python/test/integration/test_dp_correctness.py
+++ b/imperative/python/test/integration/test_dp_correctness.py
@@ -199,4 +199,4 @@ def test_dp_correctness():
    model_name = "mnist_model_with_test.mge"
    model_path = os.path.join(os.path.dirname(__file__), model_name)
    set_execution_strategy("HEURISTIC_REPRODUCIBLE")
-    run_test(model_path, False, False, max_err=1e-5)
+    run_test(model_path, False, False, max_err=5e-5)
diff --git a/imperative/python/test/unit/utils/test_network_node.py b/imperative/python/test/unit/utils/test_network_node.py
index d293c3c475f59b42e34c91f42ff649e198eb7c4f..34e7c599f31b311e4ab7b588f21287a43ec81d67 100644
--- a/imperative/python/test/unit/utils/test_network_node.py
+++ b/imperative/python/test/unit/utils/test_network_node.py
@@ -22,7 +22,7 @@ from megengine.utils.comp_graph_tools import GraphInference
 from megengine.utils.network import Network as Net
 
 
-def check_pygraph_dump(trace_func, inp_data, expect_results):
+def check_pygraph_dump(trace_func, inp_data, expect_results, max_err=None):
     orig_model = io.BytesIO()
     inp_size = len(inp_data)
     out_size = len(expect_results)
@@ -46,7 +46,12 @@ def check_pygraph_dump(trace_func, inp_data, expect_results):
     results = graph.run(inp_dict=inp_dict)
 
     for ind, tensor in enumerate(expect_results):
-        np.testing.assert_equal(tensor.numpy(), results[output_names[ind]])
+        if max_err:
+            np.testing.assert_almost_equal(
+                tensor.numpy(), results[output_names[ind]], max_err
+            )
+        else:
+            np.testing.assert_equal(tensor.numpy(), results[output_names[ind]])
         assert tensor.dtype == results[output_names[ind]].dtype
 
 
@@ -178,7 +183,8 @@ def test_convtranspose():
     data = Tensor(np.random.random((1, 32, 32, 32)))
     result = fwd(data)
 
-    check_pygraph_dump(fwd, [data], [result])
+    # results differ by ~1e-7 on cu111, so compare to 5 decimal places
+    check_pygraph_dump(fwd, [data], [result], 5)
 
 
 @pytest.mark.skip(reason="pytest aborted")
diff --git a/scripts/whl/manylinux2014/build_wheel_common.sh b/scripts/whl/manylinux2014/build_wheel_common.sh
index b1d39fa626ec9c2956d1fd81d639d6ae67fa154c..06303852660766edfec5d490ec354b88a9c709c8 100755
--- a/scripts/whl/manylinux2014/build_wheel_common.sh
+++ b/scripts/whl/manylinux2014/build_wheel_common.sh
@@ -31,7 +31,7 @@ echo "Build with ${SDK_NAME}"
 
 if [ $SDK_NAME == "cu101" ];then
     CUDA_COPY_LIB_LIST="${CUDA_LIB_DIR}/libnvrtc.so.10.1"
-    EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=OFF"
+    EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=OFF -DMGE_WITH_CUBLAS_SHARED=OFF"
     BUILD_GCC8="ON"
     REQUIR_CUDA_VERSION="10010"
     REQUIR_CUDNN_VERSION="7.6.3"
@@ -49,7 +49,7 @@ elif [ $SDK_NAME == "cu111" ];then
 ${CUDNN_LIB_DIR}/libcudnn_ops_infer.so.8:\
 ${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\
 ${CUDNN_LIB_DIR}/libcudnn.so.8"
-    EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON\
+    EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON -DMGE_WITH_CUBLAS_SHARED=ON \
     -gencode arch=compute_61,code=sm_61 \
     arch=compute_70,code=sm_70 \
     arch=compute_75,code=sm_75 \
@@ -72,7 +72,7 @@ elif [ $SDK_NAME == "cu112" ];then
 ${CUDNN_LIB_DIR}/libcudnn_ops_infer.so.8:\
 ${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\
 ${CUDNN_LIB_DIR}/libcudnn.so.8"
-    EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON \
+    EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON -DMGE_WITH_CUBLAS_SHARED=ON \
     -gencode arch=compute_61,code=sm_61 \
     arch=compute_70,code=sm_70 \
     arch=compute_75,code=sm_75 \
diff --git a/src/core/impl/comp_node_env.cpp b/src/core/impl/comp_node_env.cpp
index 3ca26df4337b3788c04f4bd084108b4ff45f1fb5..41594ed90a9896905e96ab8cdd9c16e8842ce880 100644
--- a/src/core/impl/comp_node_env.cpp
+++ b/src/core/impl/comp_node_env.cpp
@@ -214,6 +214,8 @@ void CompNodeEnv::init_cuda_async(int dev, CompNode comp_node,
         mgb_assert(
                 m_property.mem_alignment ==
                 MegDNNHandle::get(*this).handle()->alignment_requirement());
+        auto err = atexit(&CompNode::finalize);
+        mgb_assert(!err, "failed to register CompNode::finalize at exit");
     }
     MGB_CATCH(std::exception & exc, {
         mgb_log_error("async cuda init failed: %s", exc.what());
@@ -304,6 +306,8 @@ void CompNodeEnv::init_rocm_async(int dev, CompNode comp_node,
         mgb_assert(
                 m_property.mem_alignment ==
                 MegDNNHandle::get(*this).handle()->alignment_requirement());
+        auto err = atexit(&CompNode::finalize);
+        mgb_assert(!err, "failed to register CompNode::finalize at exit");
     }
     MGB_CATCH(std::exception & exc, {
         mgb_log_error("async rocm init failed: %s", exc.what());
diff --git a/src/gopt/test/inference.cpp b/src/gopt/test/inference.cpp
index c18e774ffd68448ed70df8039bd6abbdba59f0ce..5646ea3d741348464a75ab40bd4ef2d88a9f57c6 100644
--- a/src/gopt/test/inference.cpp
+++ b/src/gopt/test/inference.cpp
@@ -1850,8 +1850,6 @@ TEST(TestEnableTensorCore, SmallInputShape) {
     MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
 }
 
-//! close for cu111 ci, reopen it when bug fixed
-#if CUDA_VERSION < 11000
 TEST(TestEnableTensorCore, Nchw4Nchw) {
     REQUIRE_GPU(1);
     auto cn = CompNode::load("gpu0");
@@ -1957,7 +1955,6 @@ TEST(TestEnableTensorCore, Nchw4Nchw) {
         MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
     }
 }
-#endif
 
 TEST(TestEnableTensorCore, ConvBiasWithZ) {
     REQUIRE_GPU(1);
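
A minimal sketch of how the new flags could be exercised in a local CUDA build. The option names come from the CMakeLists.txt change above; the checkout path, build directory, and use of `cmake --build` are assumptions for illustration, not part of the patch:

    # Configure with cuDNN and cuBLAS/cuBLASLt linked dynamically, mirroring
    # the cu111/cu112 wheel settings in build_wheel_common.sh.
    mkdir -p build && cd build
    cmake /path/to/MegEngine \
        -DMGE_WITH_CUDA=ON \
        -DMGE_WITH_CUDNN_SHARED=ON \
        -DMGE_WITH_CUBLAS_SHARED=ON
    cmake --build . --parallel

    # With cuDNN >= 8, pre-size the on-disk JIT cache as the new runtime
    # warning in dnn/src/cuda/handle.cpp suggests:
    export CUDA_CACHE_MAXSIZE=2147483647
    export CUDA_CACHE_PATH=/data/.cuda_cache

Note that on Windows the new top-level CMake check forces MGE_WITH_CUDNN_SHARED back to OFF and cublas.lib is linked unconditionally, so MGE_WITH_CUBLAS_SHARED only affects non-Windows builds.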