From 4a178a8dba03be3b18c7089d714f3b84816b95a1 Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Thu, 20 Aug 2020 22:30:17 +0800 Subject: [PATCH] feat(windows/cuda/cmake): support cmake cuda build on windows GitOrigin-RevId: 4d9832e5592cb0e2e39172f0ebf52546e1e5b269 --- CMakeLists.txt | 70 +++++++++++++-- cmake/cudnn.cmake | 4 +- cmake/tensorrt.cmake | 6 +- dnn/cuda-stub/CMakeLists.txt | 12 ++- dnn/include/megdnn/arch.h | 5 ++ dnn/include/megdnn/basic_types.h | 2 +- dnn/src/cuda/conv_bias/chanwise/fwd.cu | 53 +++++------ dnn/src/cuda/convolution/chanwise/bwd_data.cu | 79 +++++++++-------- .../cuda/convolution/chanwise/bwd_filter.cu | 88 ++++++++++++------- .../block_tile_iterator_basic.cuh | 2 +- .../block_tile_iterator_coxhw.cuh | 2 +- .../block_tile_iterator_unroll_width.cuh | 2 +- .../block_tile_iterator_unroll_width_v2.cuh | 2 +- dnn/src/cuda/convolution_helper/kernel.cuh | 8 +- .../deformable_ps_roi_pooling/kimpl/kern.cu | 8 +- dnn/src/cuda/elemwise_helper.cuh | 4 +- scripts/cmake-build/BUILD_README.md | 18 +++- scripts/cmake-build/host_build.sh | 16 +++- scripts/whl/BUILD_PYTHON_WHL_README.md | 6 +- src/CMakeLists.txt | 6 +- src/core/impl/utils/debug.cpp | 2 + 21 files changed, 262 insertions(+), 133 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 37359316..6bdb21c5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -116,7 +116,7 @@ endif() if(MSVC OR WIN32) add_compile_definitions(NOMINMAX=1 _USE_MATH_DEFINES=1 WIN32=1) message("-- into windows build...") - message(" -- CMAKE_C_COMPILER_ID: ${CMAKE_C_COMPILER_ID}") + message("-- CMAKE_C_COMPILER_ID: ${CMAKE_C_COMPILER_ID}") if (${CMAKE_C_COMPILER_ID} STREQUAL "Clang-cl") message(FATAL_ERROR "only support clang-cl for windows build, pls check detail: scripts/cmake-build/BUILD_README.md") endif() @@ -131,12 +131,20 @@ if(MSVC OR WIN32) set(WIN_FLAGS "${WIN_FLAGS} -Wno-error=zero-as-null-pointer-constant -Wno-error=implicit-int-conversion") set(WIN_FLAGS "${WIN_FLAGS} -Wno-error=float-conversion -Wno-error=shadow-field -Wno-error=covered-switch-default") set(WIN_FLAGS "${WIN_FLAGS} -Wno-error=deprecated -Wno-error=documentation -Wno-error=unreachable-code-break") - set(WIN_FLAGS "${WIN_FLAGS} /DWIN32 -Wno-macro-redefined /D_WIN32_WINNT=0x0601") + set(WIN_FLAGS "${WIN_FLAGS} /DWIN32 -Wno-macro-redefined /D_WIN32_WINNT=0x0601 /wd4819") set(WIN_FLAGS "${WIN_FLAGS} /D_CRT_SECURE_NO_DEPRECATE /D_CRT_SECURE_NO_WARNINGS /DNOGDI /D_USE_MATH_DEFINES /bigobj") set(WIN_FLAGS "${WIN_FLAGS} /Zm500 /EHs /wd4351 /wd4291 /wd4250 /wd4996 /wd4819 -Wno-inconsistent-dllimport") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${WIN_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${WIN_FLAGS}") + + #FIXME: fix halide JIT on windows + message("-- disable jit and halide on windows host build...") + set(MGE_WITH_HALIDE OFF) + set(MGE_WITH_JIT OFF) + #FIXME: fix MegRay on windows + message("-- Disable distributed build on windows host build...") + set(MGE_WITH_DISTRIBUTED OFF) else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra") set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g") @@ -286,7 +294,16 @@ if(MGE_WITH_CUDA) set(CMAKE_CUDA_FLAGS_RELEASE "-O3") set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O3 -g") set(CMAKE_CUDA_FLAGS_MINSIZEREL "-Os") - set(CMAKE_CUDA_FLAGS "-Xcompiler -Wall,-Wextra -Xfatbin -compress-all") + if(MSVC OR WIN32) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xfatbin -compress-all") + set(CCBIN_FLAG "${CCBIN_FLAG} /wd4819 /wd4334 /wd4267 /wd4002 /wd4244 /wd4068") + if(${CMAKE_BUILD_TYPE} STREQUAL "Debug" OR ${CMAKE_BUILD_TYPE} STREQUAL 
"RelWithDebInfo") + set(CCBIN_FLAG "${CCBIN_FLAG} -D_ITERATOR_DEBUG_LEVEL=2 -MTd") + endif() + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --compiler-options \" ${CCBIN_FLAG} \" ") + else() + set(CMAKE_CUDA_FLAGS "-Xcompiler -Wall,-Wextra -Xfatbin -compress-all") + endif() if(NOT MGE_ENABLE_RTTI) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -fno-rtti") @@ -332,15 +349,29 @@ if(MGE_WITH_CUDA) endif() if(MGE_CUDA_USE_STATIC) if(MGE_WITH_TRT) - list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libnvinfer libcudnn -Wl,--no-whole-archive) + if(MSVC OR WIN32) + list(APPEND MGE_CUDA_LIBS ${TRT_LIBRARY} ${CUDNN_LIBRARY}) + message("-- windows TRT_LIBRARY: ${TRT_LIBRARY}") + message("-- windows CUDNN_LIBRARY: ${CUDNN_LIBRARY}") + else() + list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libnvinfer libcudnn -Wl,--no-whole-archive) + endif() else() list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libcudnn -Wl,--no-whole-archive) endif() - list(APPEND MGE_CUDA_LIBS cusolver_static cublas_static curand_static culibos cudart_static cusparse_static) + if(MSVC OR WIN32) + list(APPEND MGE_CUDA_LIBS cusolver.lib cublas.lib curand.lib cudart_static.lib cusparse.lib) + else() + list(APPEND MGE_CUDA_LIBS cusolver_static cublas_static curand_static culibos cudart_static cusparse_static) + endif() if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.1.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.1.0") - list(APPEND MGE_CUDA_LIBS cublasLt_static) + if(MSVC OR WIN32) + list(APPEND MGE_CUDA_LIBS cublasLt.lib) + else() + list(APPEND MGE_CUDA_LIBS cublasLt_static) + endif() endif() - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.0.0") + if((${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.0.0") AND NOT MSVC AND NOT WIN32) # mark all symbols from liblapack_static.a as weak to avoid # duplicated definition with mkl find_library( @@ -377,7 +408,11 @@ if(MGE_WITH_CUDA) endif() add_subdirectory(dnn/cuda-stub) - list(APPEND MGE_CUDA_LIBS nvrtc cuda-stub nvToolsExt) + if(MSVC OR WIN32) + list(APPEND MGE_CUDA_LIBS nvrtc.lib cuda-stub) + else() + list(APPEND MGE_CUDA_LIBS nvrtc cuda-stub nvToolsExt) + endif() set(MGE_CUDA_LIBS "${MGE_CUDA_LIBS}") endif() @@ -699,3 +734,22 @@ if (NOT MGE_WITH_DISTRIBUTED) ${CMAKE_CURRENT_BINARY_DIR}/MegEngineConfigVersion.cmake DESTINATION ${MGE_INSTALL_CMAKEDIR}) endif() + +if(MSVC OR WIN32) + add_compile_options( + $<$:/MT> + $<$:/MTd> + $<$:/MT> + ) + foreach (CompilerFlag + CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE + CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO + CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) + if(${CompilerFlag} MATCHES "/MD") + string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}") + set(${CompilerFlag} "${${CompilerFlag}}" CACHE STRING "msvc compiler flags" FORCE) + message("MSVC flags: ${CompilerFlag}:${${CompilerFlag}}") + endif() + endforeach() +endif() diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index b8cef397..6310c95d 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -9,7 +9,7 @@ endif() if(MGE_CUDA_USE_STATIC) find_library(CUDNN_LIBRARY - NAMES libcudnn_static.a libcudnn_static.lib + NAMES libcudnn_static.a cudnn.lib PATHS $ENV{LD_LIBRARY_PATH} ${CUDNN_ROOT_DIR} ${PC_CUDNN_LIBRARY_DIRS} ${CMAKE_INSTALL_PREFIX} HINTS ${SYSTEM_LIBRARY_PATHS} PATH_SUFFIXES lib lib64 @@ -30,7 +30,7 @@ endif() 
get_filename_component(__found_cudnn_root ${CUDNN_LIBRARY}/../.. REALPATH) find_path(CUDNN_INCLUDE_DIR NAMES cudnn.h - HINTS ${PC_CUDNN_INCLUDE_DIRS} ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_INCLUDE} ${__found_cudnn_root} + HINTS $ENV{PC_CUDNN_INCLUDE_DIRS} ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_INCLUDE} ${__found_cudnn_root} PATH_SUFFIXES include DOC "Path to CUDNN include directory." ) diff --git a/cmake/tensorrt.cmake b/cmake/tensorrt.cmake index 7205f907..49062cf8 100644 --- a/cmake/tensorrt.cmake +++ b/cmake/tensorrt.cmake @@ -1,17 +1,17 @@ -if($ENV{LIBRARY_PATH}) +if(NOT "$ENV{LIBRARY_PATH}" STREQUAL "") string(REPLACE ":" ";" SYSTEM_LIBRARY_PATHS $ENV{LIBRARY_PATH}) endif() if(MGE_CUDA_USE_STATIC) find_library(TRT_LIBRARY - NAMES libnvinfer_static.a libnvinfer_static.lib + NAMES libnvinfer_static.a nvinfer.lib PATHS $ENV{LD_LIBRARY_PATH} ${TRT_ROOT_DIR} ${CMAKE_INSTALL_PREFIX} HINTS ${SYSTEM_LIBRARY_PATHS} PATH_SUFFIXES lib lib64 DOC "TRT library." ) else() find_library(TRT_LIBRARY - NAMES libnvinfer.so libnvinfer.dylib + NAMES libnvinfer.so libnvinfer.dylib nvinfer.dll PATHS $ENV{LD_LIBRARY_PATH} ${TRT_ROOT_DIR} ${CMAKE_INSTALL_PREFIX} HINTS ${SYSTEM_LIBRARY_PATHS} PATH_SUFFIXES lib lib64 diff --git a/dnn/cuda-stub/CMakeLists.txt b/dnn/cuda-stub/CMakeLists.txt index 2bd634a2..ac75d255 100644 --- a/dnn/cuda-stub/CMakeLists.txt +++ b/dnn/cuda-stub/CMakeLists.txt @@ -1,7 +1,15 @@ file (GLOB_RECURSE SOURCES src/*.cpp) -add_library (cuda-stub SHARED ${SOURCES}) +if(MSVC OR WIN32) + add_library (cuda-stub STATIC ${SOURCES}) +else() + add_library (cuda-stub SHARED ${SOURCES}) +endif() set_target_properties(cuda-stub PROPERTIES OUTPUT_NAME cuda) target_compile_definitions(cuda-stub PRIVATE __CUDA_API_VERSION_INTERNAL) -target_link_libraries(cuda-stub PRIVATE dl -Wl,--no-undefined) +if (MSVC OR WIN32) + target_link_libraries(cuda-stub PRIVATE -Wl,--no-undefined) +else() + target_link_libraries(cuda-stub PRIVATE dl -Wl,--no-undefined) +endif() install (TARGETS cuda-stub EXPORT ${MGE_EXPORT_TARGETS}) diff --git a/dnn/include/megdnn/arch.h b/dnn/include/megdnn/arch.h index 0eef2351..c83323b7 100644 --- a/dnn/include/megdnn/arch.h +++ b/dnn/include/megdnn/arch.h @@ -140,4 +140,9 @@ #define MEGDNN_DEVICE #endif +#if defined(_MSC_VER) || defined(WIN32) + #define ATTR_ALIGNED(v) __declspec(align(v)) +#else + #define ATTR_ALIGNED(v) __attribute__((aligned(v))) +#endif // vim: syntax=cpp.doxygen diff --git a/dnn/include/megdnn/basic_types.h b/dnn/include/megdnn/basic_types.h index 6c8c8cf1..1ba892a7 100644 --- a/dnn/include/megdnn/basic_types.h +++ b/dnn/include/megdnn/basic_types.h @@ -215,9 +215,9 @@ struct TensorLayout : public TensorShape { DType dtype; Format format; -#if MEGDNN_CC_HOST TensorLayout(); +#if MEGDNN_CC_HOST TensorLayout(const TensorLayout& layout) = default; //! 
create empty layout with given dtype diff --git a/dnn/src/cuda/conv_bias/chanwise/fwd.cu b/dnn/src/cuda/conv_bias/chanwise/fwd.cu index 83e71ee2..477857bb 100644 --- a/dnn/src/cuda/conv_bias/chanwise/fwd.cu +++ b/dnn/src/cuda/conv_bias/chanwise/fwd.cu @@ -275,51 +275,52 @@ __global__ void kern_fwd_half(__half* dst, const __half* src, #define SET_SW(func, type, sw) \ if (param.flt_h == 2 && param.flt_w == 2) { \ - kern = func; \ + f_struct.f = func; \ } else if (param.flt_h == 3 && param.flt_w == 3) { \ - kern = func; \ + f_struct.f = func; \ } else if (param.flt_h == 5 && param.flt_w == 5) { \ - kern = func; \ + f_struct.f = func; \ } else if (param.flt_h == 7 && param.flt_w == 7) { \ - kern = func; \ + f_struct.f = func; \ } else { \ - kern = func; \ + f_struct.f = func; \ } -#define GET_KERN(func, type) \ - void (*kern)(type*, const type*, const type*, Param); \ - if (param.chl_mul == 1) { \ - if (param.stride_w == 1) { \ - SET_SW(func, type, 1) \ - } else { \ - SET_SW(func, type, 0) \ - } \ - } else { \ - kern = func; \ - } \ - return kern; +#define GET_KERN(func, type) \ + FixFunction f_struct; \ + if (param.chl_mul == 1) { \ + if (param.stride_w == 1) { \ + SET_SW(func, type, 1) \ + } else { \ + SET_SW(func, type, 0) \ + } \ + } else { \ + f_struct.f = func; \ + } \ + return f_struct; template -void (*get_kern(const Param& param))(T*, const T*, const T*, const Param); +struct FixFunction { + void (*f)(T*, const T*, const T*, Param); +}; + +template +FixFunction get_kern(const Param& param); template <> -void (*get_kern(const Param& param))(float*, const float*, const float*, - const Param) { +FixFunction get_kern(const Param& param) { GET_KERN(kern_fwd_float, float); } #if CUDA_VERSION >= 9000 template <> -void (*get_kern<__half>(const Param& param))(__half*, const __half*, - const __half*, const Param) { +FixFunction<__half> get_kern<__half>(const Param& param) { GET_KERN(kern_fwd_half, __half); } #endif template <> -void (*get_kern(const Param& param))(dt_float16*, const dt_float16*, - const dt_float16*, - const Param) { +FixFunction get_kern(const Param& param) { GET_KERN(kern_fwd_float, dt_float16); } @@ -337,7 +338,7 @@ template void run_fwd(T* dst, const T* src, const T* flt, const Param& param, cudaStream_t stream) { void (*kern)(T*, const T*, const T*, Param); - kern = get_kern(param); + kern = get_kern(param).f; int nr_thread = query_blocksize_for_kernel(kern), nr_out_dimx = param.out_h * param.out_w * param.batch * param.chl_mul; diff --git a/dnn/src/cuda/convolution/chanwise/bwd_data.cu b/dnn/src/cuda/convolution/chanwise/bwd_data.cu index b0d345a5..6c50c6bd 100644 --- a/dnn/src/cuda/convolution/chanwise/bwd_data.cu +++ b/dnn/src/cuda/convolution/chanwise/bwd_data.cu @@ -178,25 +178,29 @@ __global__ void kern_bwd_data_hf(__half* src_grad, const __half* dst_grad, __half2 dst2 = {0.0, 0.0}; if (static_cast(ow) < static_cast(owmin_y)) { - dst2 = {*(pd + ow), 0.0}; + dst2.x = *(pd + ow); + dst2.y = 0.0; sum = fma2(dst2, flt3, sum); ++ow; --fw; } if (static_cast(owmax_x) < static_cast(owmax)) { - dst2 = {0.0, *(pd + owmax)}; + dst2.x = 0.0; + dst2.y = *(pd + owmax); sum = fma2(dst2, flt0, sum); } if (static_cast(fw) == 1) { - dst2 = {*(pd + ow), *(pd + ow)}; + dst2.x = *(pd + ow); + dst2.y = *(pd + ow); sum = fma2(dst2, flt2, sum); ++ow; --fw; } if (static_cast(ow) <= static_cast(owmax_x)) { - dst2 = {*(pd + ow), *(pd + ow)}; + dst2.x = *(pd + ow); + dst2.y = *(pd + ow); sum = fma2(dst2, flt1, sum); } @@ -218,18 +222,21 @@ __global__ void kern_bwd_data_hf(__half* src_grad, const 
__half* dst_grad, __half2 dst2 = {0.0, 0.0}; if (static_cast(ow) < static_cast(owmin_y)) { - dst2 = {*(pd + ow), 0.0}; + dst2.x = *(pd + ow); + dst2.y = 0.0; sum = fma2(dst2, flt5, sum); ++ow; --fw; } if (static_cast(owmax_x) < static_cast(owmax)) { - dst2 = {0.0, *(pd + owmax)}; + dst2.x = 0.0; + dst2.y = *(pd + owmax); sum = fma2(dst2, flt0, sum); } if (static_cast(fw) == 3) { - dst2 = {*(pd + ow), *(pd + ow)}; + dst2.x = *(pd + ow); + dst2.y = *(pd + ow); sum = fma2(dst2, flt4, sum); ++ow; --fw; @@ -237,7 +244,8 @@ __global__ void kern_bwd_data_hf(__half* src_grad, const __half* dst_grad, if (static_cast(fw) == 2 && static_cast(ow) <= static_cast(owmax_x)) { - dst2 = {*(pd + ow), *(pd + ow)}; + dst2.x = *(pd + ow); + dst2.y = *(pd + ow); sum = fma2(dst2, flt3, sum); ++ow; --fw; @@ -245,7 +253,8 @@ __global__ void kern_bwd_data_hf(__half* src_grad, const __half* dst_grad, if (static_cast(fw) == 1 && static_cast(ow) <= static_cast(owmax_x)) { - dst2 = {*(pd + ow), *(pd + ow)}; + dst2.x = *(pd + ow); + dst2.y = *(pd + ow); sum = fma2(dst2, flt2, sum); ++ow; --fw; @@ -253,7 +262,8 @@ __global__ void kern_bwd_data_hf(__half* src_grad, const __half* dst_grad, if (static_cast(fw) == 0 && static_cast(ow) <= static_cast(owmax_x)) { - dst2 = {*(pd + ow), *(pd + ow)}; + dst2.x = *(pd + ow); + dst2.y = *(pd + ow); sum = fma2(dst2, flt1, sum); } @@ -270,8 +280,10 @@ __global__ void kern_bwd_data_hf(__half* src_grad, const __half* dst_grad, uint32_t fw = iw - ow + PW; if (static_cast(ow) <= static_cast(owmax)) { - pd2 = {*(pd + ow), *(pd + ow)}; - pf2 = {0.0, 0.0}; + pd2.x = *(pd + ow); + pd2.y = *(pd + ow); + pf2.x = 0.0; + pf2.y = 0.0; if (static_cast(ow) >= static_cast(owmin_y)) pf2.y = *(pf + fw + 1); @@ -425,16 +437,17 @@ __global__ void kern_bwd_data_hf(__half* src_grad, const __half* dst_grad, #define sh param.stride_h #define sw param.stride_w -#define SET_STRIDE(func, type, chl_mul, fh, fw) \ - if (sh == 1 && sw == 1) { \ - kern_ptr = func; \ - } else if (sh == 2 && sw == 2) { \ - kern_ptr = func; \ - } else { \ - kern_ptr = func; \ +#define SET_STRIDE(func, type, chl_mul, fh, fw) \ + if (sh == 1 && sw == 1) { \ + f_struct.f = func; \ + } else if (sh == 2 && sw == 2) { \ + f_struct.f = func; \ + } else { \ + f_struct.f = func; \ } #define GET_KERN(func, type) \ + FixFunction f_struct; \ if (param.chl_mul == 1) { \ if (param.flt_h == 3 && param.flt_w == 3) { \ SET_STRIDE(func, type, 1, 3, 3); \ @@ -447,36 +460,32 @@ __global__ void kern_bwd_data_hf(__half* src_grad, const __half* dst_grad, } \ } else { \ SET_STRIDE(func, type, 0, 0, 0); \ - } + } \ + return f_struct; + +template +struct FixFunction { + void (*f)(T*, const T*, const T*, const Param); +}; template -void (*get_kern(const Param& param))(T*, const T*, const T*, const Param); +FixFunction get_kern(const Param& param); template <> -void (*get_kern(const Param& param))(float*, const float*, const float*, - const Param) { - void (*kern_ptr)(float*, const float*, const float*, Param); +FixFunction get_kern(const Param& param) { GET_KERN(kern_bwd_data_float, float); - return kern_ptr; } #if CUDA_VERSION >= 9000 template <> -void (*get_kern<__half>(const Param& param))(__half*, const __half*, - const __half*, const Param) { - void (*kern_ptr)(__half*, const __half*, const __half*, Param); +FixFunction<__half> get_kern<__half>(const Param& param) { GET_KERN(kern_bwd_data_hf, __half); - return kern_ptr; } #endif template <> -void (*get_kern(const Param& param))(dt_float16*, const dt_float16*, - const dt_float16*, - const Param) { - void 
(*kern_ptr)(dt_float16*, const dt_float16*, const dt_float16*, Param); +FixFunction get_kern(const Param& param) { GET_KERN(kern_bwd_data_float, dt_float16); - return kern_ptr; } #undef sh @@ -494,7 +503,7 @@ template void run_bwd_data(T* src_grad, const T* dst_grad, const T* flt, const Param& param, cudaStream_t stream) { void (*kern)(T*, const T*, const T*, Param); - kern = get_kern(param); + kern = get_kern(param).f; int nr_thread = query_blocksize_for_kernel(kern), nr_out_dimx = param.src_h * param.src_w * param.batch; diff --git a/dnn/src/cuda/convolution/chanwise/bwd_filter.cu b/dnn/src/cuda/convolution/chanwise/bwd_filter.cu index 6a317b86..0fbab61a 100644 --- a/dnn/src/cuda/convolution/chanwise/bwd_filter.cu +++ b/dnn/src/cuda/convolution/chanwise/bwd_filter.cu @@ -193,7 +193,8 @@ __global__ void kern_bwd_filter_hf( return; } - sum2 = {0.0, 0.0}; + sum2.x = 0.0; + sum2.y = 0.0; __half2 src2{0.0, 0.0}; __half2 dst2{0.0, 0.0}; @@ -330,51 +331,74 @@ __global__ void kern_bwd_filter_hf( } #endif -#define GET_KERN(func, type) \ - switch(_p) { \ - case 1<<10: kern_ptr = func; break; \ - case 1<<9: kern_ptr = func; break; \ - case 1<<8: kern_ptr = func; break; \ - case 1<<7: kern_ptr = func; break; \ - case 1<<6: kern_ptr = func; break; \ - case 1<<5: kern_ptr = func; break; \ - case 1<<4: kern_ptr = func; break; \ - case 1<<3: kern_ptr = func; break; \ - case 1<<2: kern_ptr = func; break; \ - case 1<<1: kern_ptr = func; break; \ - case 1<<0: kern_ptr = func; break; \ - } +#define GET_KERN(func, type) \ + FixFunction f_struct; \ + switch (_p) { \ + case 1 << 10: \ + f_struct.f = func; \ + break; \ + case 1 << 9: \ + f_struct.f = func; \ + break; \ + case 1 << 8: \ + f_struct.f = func; \ + break; \ + case 1 << 7: \ + f_struct.f = func; \ + break; \ + case 1 << 6: \ + f_struct.f = func; \ + break; \ + case 1 << 5: \ + f_struct.f = func; \ + break; \ + case 1 << 4: \ + f_struct.f = func; \ + break; \ + case 1 << 3: \ + f_struct.f = func; \ + break; \ + case 1 << 2: \ + f_struct.f = func; \ + break; \ + case 1 << 1: \ + f_struct.f = func; \ + break; \ + case 1 << 0: \ + f_struct.f = func; \ + break; \ + default: \ + megdnn_assert(false, "DO NOT IMP CASE FUNCTION!!"); \ + } \ + return f_struct; + +template +struct FixFunction { + void (*f)(T*, const T*, const T*, Param); +}; template -void (*get_kern(const uint32_t& _p))(T*, const T*, const T*, Param); +FixFunction get_kern(const uint32_t& _p); template <> -void (*get_kern(const uint32_t& _p))(float*, const float*, const float*, Param) { - void (*kern_ptr)(float*, const float*, const float*, Param) = NULL; - GET_KERN(kern_bwd_filter_float, float); - return kern_ptr; +FixFunction get_kern(const uint32_t& _p) { + GET_KERN(kern_bwd_filter_float, float); } #if CUDA_VERSION >= 9000 template <> -void (*get_kern<__half>(const uint32_t& _p))(__half*, const __half*, const __half*, Param) { - void (*kern_ptr)(__half*, const __half*, const __half*, Param) = NULL; - GET_KERN(kern_bwd_filter_hf, __half); - return kern_ptr; +FixFunction<__half> get_kern<__half>(const uint32_t& _p) { + GET_KERN(kern_bwd_filter_hf, __half); } #endif template <> -void (*get_kern(const uint32_t& _p))(dt_float16*, const dt_float16*, - const dt_float16*, Param) { - void (*kern_ptr)(dt_float16*, const dt_float16*, const dt_float16*, Param) = NULL; +FixFunction get_kern(const uint32_t& _p) { GET_KERN(kern_bwd_filter_float, dt_float16); - return kern_ptr; } #undef GET_KERN -} // anonymous namespace - +} // anonymous namespace namespace megdnn { namespace cuda { @@ -385,7 +409,7 @@ 
void run_bwd_filter(T *filter_grad, const T *src, const T *dst_grad, const Param ¶m, cudaStream_t stream) { void (*kern)(T*, const T*, const T*, Param) = NULL; uint32_t - nr_thread = query_blocksize_for_kernel(get_kern(1024)), + nr_thread = query_blocksize_for_kernel(get_kern(1024).f), nr_thpf = std::min(nr_thread, std::max( 1, @@ -395,7 +419,7 @@ void run_bwd_filter(T *filter_grad, const T *src, const T *dst_grad, do { #define CK(_n) \ if (nr_thpf >= _n) { \ - kern = get_kern(_n); \ + kern = get_kern(_n).f; \ nr_thpf = _n; \ break; \ } diff --git a/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_basic.cuh b/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_basic.cuh index 2b3764d7..2d521326 100644 --- a/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_basic.cuh +++ b/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_basic.cuh @@ -155,7 +155,7 @@ struct BlockTileIteratorBasic { filter_gl2sh_visitor.copy(); } - consumer.template consume_block(src_gl2sh_visitor, + consumer.consume_block(src_gl2sh_visitor, filter_gl2sh_visitor); if (!(ci_outer == ci_blks - 1 && h == h_end && diff --git a/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_coxhw.cuh b/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_coxhw.cuh index 5b7d5bfc..ce06eea1 100644 --- a/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_coxhw.cuh +++ b/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_coxhw.cuh @@ -171,7 +171,7 @@ struct BlockTileIterator_COxHW { filter_gl2sh_visitor.copy(); } - consumer.template consume_block(src_gl2sh_visitor, + consumer.consume_block(src_gl2sh_visitor, filter_gl2sh_visitor); if (!(ci_outer == ci_blks - 1 && f == filter_pixels - 1)) { diff --git a/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_unroll_width.cuh b/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_unroll_width.cuh index d93ad24c..5002c825 100644 --- a/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_unroll_width.cuh +++ b/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_unroll_width.cuh @@ -162,7 +162,7 @@ struct BlockTileIteratorUnrollWidth { filter_gl2sh_visitor.copy(); } - consumer.template consume_block(src_gl2sh_visitor, + consumer.consume_block(src_gl2sh_visitor, filter_gl2sh_visitor); if (!(ci_outer == ci_blks - 1 && h == h_end && diff --git a/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_unroll_width_v2.cuh b/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_unroll_width_v2.cuh index 3dffebbd..130968d8 100644 --- a/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_unroll_width_v2.cuh +++ b/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_unroll_width_v2.cuh @@ -154,7 +154,7 @@ struct BlockTileIteratorUnrollWidthV2 { filter_gl2sh_visitor.copy(); } - consumer.template consume_block(src_gl2sh_visitor, + consumer.consume_block(src_gl2sh_visitor, filter_gl2sh_visitor); if (!(ci_outer == ci_blks - 1 && h == h_end)) { diff --git a/dnn/src/cuda/convolution_helper/kernel.cuh b/dnn/src/cuda/convolution_helper/kernel.cuh index 72b598fe..8b10dfde 100644 --- a/dnn/src/cuda/convolution_helper/kernel.cuh +++ b/dnn/src/cuda/convolution_helper/kernel.cuh @@ -72,7 +72,7 @@ __global__ void convolution_kernel( DataGlobal2ShareMemVisitor src_gl2sh_visitor{smem_src}; 
FilterGlobal2ShareMemVisitor filter_gl2sh_visitor{smem_filter}; if (check_bounds) { - block_iterator.template set_remain(src_gl2sh_visitor, + block_iterator.set_remain(src_gl2sh_visitor, filter_gl2sh_visitor); } @@ -89,7 +89,7 @@ __global__ void convolution_kernel( GlobalMemoryWriter global_memory_writer; global_memory_writer.init(smem_dst, alpha, beta); if (check_bounds) { - block_iterator.template set_remain(global_memory_writer); + block_iterator.set_remain(global_memory_writer); } bias.move(block_iterator.block_batch, block_iterator.block_out_channel, block_iterator.block_out_height, block_iterator.block_out_width); @@ -130,7 +130,7 @@ __global__ void convolution_kernel_precomp_offset( DataGlobal2ShareMemVisitor src_gl2sh_visitor{smem_src, offset}; FilterGlobal2ShareMemVisitor filter_gl2sh_visitor{smem_filter}; if (check_bounds) { - block_iterator.template set_remain(src_gl2sh_visitor, + block_iterator.set_remain(src_gl2sh_visitor, filter_gl2sh_visitor); } @@ -147,7 +147,7 @@ __global__ void convolution_kernel_precomp_offset( GlobalMemoryWriter global_memory_writer; global_memory_writer.init(smem_dst, alpha, beta); if (check_bounds) { - block_iterator.template set_remain(global_memory_writer); + block_iterator.set_remain(global_memory_writer); } bias.move(block_iterator.block_batch, block_iterator.block_out_channel, block_iterator.block_out_height, block_iterator.block_out_width); diff --git a/dnn/src/cuda/deformable_ps_roi_pooling/kimpl/kern.cu b/dnn/src/cuda/deformable_ps_roi_pooling/kimpl/kern.cu index a7877697..514d1d55 100644 --- a/dnn/src/cuda/deformable_ps_roi_pooling/kimpl/kern.cu +++ b/dnn/src/cuda/deformable_ps_roi_pooling/kimpl/kern.cu @@ -259,8 +259,8 @@ void DeformablePSROIPoolForward(const TensorND& data, const TensorND& rois, auto&& out_data_elems = out_data.layout.total_nr_elems(); auto&& out_count_elems = out_count.layout.total_nr_elems(); - size_t out_data_bytes = sizeof(float[out_data_elems]); - size_t out_count_bytes = sizeof(float[out_count_elems]); + size_t out_data_bytes = sizeof(float) * out_data_elems; + size_t out_count_bytes = sizeof(float) * out_count_elems; cudaMemsetAsync(out_data_ptr, 0, out_data_bytes, p.stream); cudaMemsetAsync(out_count_ptr, 0, out_count_bytes, p.stream); @@ -292,8 +292,8 @@ void DeformablePSROIPoolBackwardAcc(const TensorND& data, const TensorND& rois, auto&& data_diff_elems = data_diff.layout.total_nr_elems(); auto&& trans_diff_elems = trans_diff.layout.total_nr_elems(); - size_t data_diff_bytes = sizeof(float[data_diff_elems]); - size_t trans_diff_bytes = sizeof(float[trans_diff_elems]); + size_t data_diff_bytes = sizeof(float) * data_diff_elems; + size_t trans_diff_bytes = sizeof(float) * trans_diff_elems; cudaMemsetAsync(data_diff_ptr, 0, data_diff_bytes, p.stream); cudaMemsetAsync(trans_diff_ptr, 0, trans_diff_bytes, p.stream); diff --git a/dnn/src/cuda/elemwise_helper.cuh b/dnn/src/cuda/elemwise_helper.cuh index 94e184f6..db548bc6 100644 --- a/dnn/src/cuda/elemwise_helper.cuh +++ b/dnn/src/cuda/elemwise_helper.cuh @@ -58,7 +58,7 @@ enum BcastType { template class VectTypeTrait; -struct __attribute__((aligned(8))) half4 { +struct ATTR_ALIGNED(8) half4 { dt_float16 x, y, z, w; }; @@ -69,7 +69,7 @@ __device__ __forceinline__ half4 make_half4(dt_float16 x, dt_float16 y, return t; } -struct __attribute__((aligned(8))) bhalf4 { +struct ATTR_ALIGNED(8) bhalf4 { dt_bfloat16 x, y, z, w; }; diff --git a/scripts/cmake-build/BUILD_README.md b/scripts/cmake-build/BUILD_README.md index d973b706..f9c70a51 100644 --- 
a/scripts/cmake-build/BUILD_README.md +++ b/scripts/cmake-build/BUILD_README.md @@ -1,8 +1,8 @@ # build support status ## host build -* windows build (ok) -* linux build (ok) -* macos build (ok) +* windows build (cpu + gpu) +* linux build (cpu + gpu) +* macos build (cpu only) ## cross build * windows cross build arm-android (ok) * windows cross build arm-linux (ok) @@ -17,9 +17,19 @@ ### windows host build ``` 1: installl Visual Studio (need support LLVM/clang-cl), eg 2019 - clang-cl 9 linker have crash issue, pls install 7/8/10 + pls install LLVM-10, VS llvm linker have issue, pls replace lld-link.exe, + download from https://releases.llvm.org/download.html#10.0.0 2: install extension of VS: python/cmake/LLVM 3: CUDA env(if enable CUDA), version detail: project_root_dir/README.md + 4: now we support cuda10.1+cudnn7.6+TensorRT6.0 on windows, as windows can + only use dll in fact with cudnn/TensorRT, so please install the same version; + 4a: install cuda10.1 to C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1 + 4b: install cudnn7.6 to C:\Program Files\NVIDIA GPU Computing Toolkit\cudnn-10.1-windows10-x64-v7.6.5.32 + 4c: install TensorRT6.0 to C:\Program Files\NVIDIA GPU Computing Toolkit\TensorRT-6.0.1.5 + 4d: add C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1\bin to system Path env + 4e: add C:\Program Files\NVIDIA GPU Computing Toolkit\cudnn-10.1-windows10-x64-v7.6.5.32\cuda\bin to system Path env + 4f: add C:\Program Files\NVIDIA GPU Computing Toolkit\TensorRT-6.0.1.5\lib Path + if u do not do 4d/4e/4f, CUDA runtime can not find dll ``` ### linux host build ``` diff --git a/scripts/cmake-build/host_build.sh b/scripts/cmake-build/host_build.sh index 01aaecc2..78e1562e 100755 --- a/scripts/cmake-build/host_build.sh +++ b/scripts/cmake-build/host_build.sh @@ -162,8 +162,22 @@ function prepare_env_for_windows_build() { windows_env_err fi - export PATH=$VS_PATH/VC/Auxiliary/Build:$PATH echo "put vcvarsall.bat path to PATH env.." + export PATH=$VS_PATH/VC/Auxiliary/Build:$PATH + + echo "config cuda/cudnn/TensorRT env..." 
+ export NIVIDA_INSTALL_PRE=/c/Program\ Files/NVIDIA\ GPU\ Computing\ Toolkit + export CUDA_V=v10.1 + export CUDNN_V=cudnn-10.1-windows10-x64-v7.6.5.32 + export TRT_V=TensorRT-6.0.1.5 + export CUDA_PATH=$NIVIDA_INSTALL_PRE/CUDA/${CUDA_V} + export PATH=$PATH:$CUDA_PATH/bin + export CUDA_BIN_PATH=$CUDA_PATH + export PC_CUDNN_INCLUDE_DIRS=$NIVIDA_INSTALL_PRE/${CUDNN_V}/cuda/include + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$NIVIDA_INSTALL_PRE/${TRT_V}/lib:$NIVIDA_INSTALL_PRE/CUDA/${CUDA_V}/lib/x64:$NIVIDA_INSTALL_PRE/${CUDNN_V}/cuda/lib/x64 + export CPATH=$CPATH:$NIVIDA_INSTALL_PRE/${TRT_V}/include:$NIVIDA_INSTALL_PRE/CUDA/${CUDA_V}/include:$NIVIDA_INSTALL_PRE/CUDA/${CUDA_V}/include/nvtx3:$PC_CUDNN_INCLUDE_DIRS + export LIBRARY_PATH=$LIBRARY_PATH:$LD_LIBRARY_PATH + export INCLUDE=$INCLUDE:$CPATH } WINDOWS_BUILD_TARGET="Ninja all > build.log" diff --git a/scripts/whl/BUILD_PYTHON_WHL_README.md b/scripts/whl/BUILD_PYTHON_WHL_README.md index 10066640..8b86b179 100644 --- a/scripts/whl/BUILD_PYTHON_WHL_README.md +++ b/scripts/whl/BUILD_PYTHON_WHL_README.md @@ -1,7 +1,7 @@ # python whl package build support status -* windows build (ok,cpu only) -* linux build (ok, cpu or gpu) -* macos build (ok,cpu only) +* windows build (cpu + gpu) +* linux build (cpu + gpu) +* macos build (cpu only) # build env prepare ## linux diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index d4bedb37..4857ed2e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -52,8 +52,10 @@ foreach (INCPATH IN LISTS MGB_INC) endforeach() if(MGE_WITH_CUDA) - target_compile_options(megbrain PRIVATE "$<$:-Xcompiler=-Wno-unused-parameter>" - "$<$>:-Wno-unused-parameter>") + if(NOT WIN32 AND NOT MSVC) + target_compile_options(megbrain PRIVATE "$<$:-Xcompiler=-Wno-unused-parameter>" + "$<$>:-Wno-unused-parameter>") + endif() else() target_compile_options(megbrain PRIVATE "-Wno-unused-parameter") endif() diff --git a/src/core/impl/utils/debug.cpp b/src/core/impl/utils/debug.cpp index 23b0017d..63426ce0 100644 --- a/src/core/impl/utils/debug.cpp +++ b/src/core/impl/utils/debug.cpp @@ -195,12 +195,14 @@ public: static void set_flag(int f) { flag() = f; } static void init() { +#if !defined(WIN32) int err = pthread_atfork(&CudaCheckOnFork::atfork_prepare, nullptr, nullptr); if (err) { mgb_throw(SystemError, "failed to setup atfork handler: %s", strerror(err)); } +#endif } }; #endif -- GitLab
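Editor's note on the recurring `FixFunction` refactor in this patch: the chanwise forward/backward sources previously selected a CUDA kernel by returning a raw function pointer from an explicitly specialized template function (`void (*get_kern<T>(const Param&))(T*, const T*, const T*, const Param)`), and the patch rewrites every such selector to return a small `FixFunction<T>` struct holding the pointer instead, presumably because the raw-pointer-returning specialization syntax is problematic for the MSVC/clang-cl host toolchain targeted by this Windows build. The sketch below is a minimal, self-contained illustration of that pattern under simplified assumptions: the `Param` struct, the kernel body, and the reduced argument list are placeholders, not MegEngine code; only the `FixFunction`/`get_kern`/`f_struct` naming follows the patch.

```cuda
// fixfunction_sketch.cu -- illustrative only; compile with `nvcc fixfunction_sketch.cu`
#include <cuda_runtime.h>

// Placeholder for the real chanwise Param (stride, filter size, ...).
struct Param {
    int stride_w;
};

// Placeholder kernel standing in for kern_fwd_float / kern_bwd_data_float / etc.
template <typename T>
__global__ void kern_fwd(T* dst, const T* src, Param /*param*/) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    dst[i] = src[i];
}

// Instead of returning the function pointer directly from a specialized
// template function, wrap it in a struct and return the struct by value.
template <typename T>
struct FixFunction {
    void (*f)(T*, const T*, Param);
};

template <typename T>
FixFunction<T> get_kern(const Param& param);

template <>
FixFunction<float> get_kern<float>(const Param& param) {
    FixFunction<float> f_struct;
    // The real code dispatches on param (stride, filter size, channel multiplier);
    // here there is only one kernel to pick.
    (void)param;
    f_struct.f = kern_fwd<float>;
    return f_struct;
}

int main() {
    Param p{1};
    // Callers recover the pointer with `.f`, mirroring `kern = get_kern<T>(param).f;`
    // in the patched run_fwd / run_bwd_data / run_bwd_filter functions.
    void (*kern)(float*, const float*, Param) = get_kern<float>(p).f;

    float *dst = nullptr, *src = nullptr;
    cudaMalloc(&dst, 32 * sizeof(float));
    cudaMalloc(&src, 32 * sizeof(float));
    kern<<<1, 32>>>(dst, src, p);   // launching through a pointer to a __global__ function
    cudaDeviceSynchronize();
    cudaFree(dst);
    cudaFree(src);
    return 0;
}
```

The wrapper changes nothing at runtime: the struct is returned by value, the caller immediately reads `.f`, and the kernel is launched through the recovered pointer exactly as before; only the declaration syntax that the Windows host compiler has to parse is simplified.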