From 4a178a8dba03be3b18c7089d714f3b84816b95a1 Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Thu, 20 Aug 2020 22:30:17 +0800 Subject: [PATCH] feat(windows/cuda/cmake): support cmake cuda build on windows GitOrigin-RevId: 4d9832e5592cb0e2e39172f0ebf52546e1e5b269 --- CMakeLists.txt | 70 +++++++++++++-- cmake/cudnn.cmake | 4 +- cmake/tensorrt.cmake | 6 +- dnn/cuda-stub/CMakeLists.txt | 12 ++- dnn/include/megdnn/arch.h | 5 ++ dnn/include/megdnn/basic_types.h | 2 +- dnn/src/cuda/conv_bias/chanwise/fwd.cu | 53 +++++------ dnn/src/cuda/convolution/chanwise/bwd_data.cu | 79 +++++++++-------- .../cuda/convolution/chanwise/bwd_filter.cu | 88 ++++++++++++------- .../block_tile_iterator_basic.cuh | 2 +- .../block_tile_iterator_coxhw.cuh | 2 +- .../block_tile_iterator_unroll_width.cuh | 2 +- .../block_tile_iterator_unroll_width_v2.cuh | 2 +- dnn/src/cuda/convolution_helper/kernel.cuh | 8 +- .../deformable_ps_roi_pooling/kimpl/kern.cu | 8 +- dnn/src/cuda/elemwise_helper.cuh | 4 +- scripts/cmake-build/BUILD_README.md | 18 +++- scripts/cmake-build/host_build.sh | 16 +++- scripts/whl/BUILD_PYTHON_WHL_README.md | 6 +- src/CMakeLists.txt | 6 +- src/core/impl/utils/debug.cpp | 2 + 21 files changed, 262 insertions(+), 133 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 37359316..6bdb21c5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -116,7 +116,7 @@ endif() if(MSVC OR WIN32) add_compile_definitions(NOMINMAX=1 _USE_MATH_DEFINES=1 WIN32=1) message("-- into windows build...") - message(" -- CMAKE_C_COMPILER_ID: ${CMAKE_C_COMPILER_ID}") + message("-- CMAKE_C_COMPILER_ID: ${CMAKE_C_COMPILER_ID}") if (${CMAKE_C_COMPILER_ID} STREQUAL "Clang-cl") message(FATAL_ERROR "only support clang-cl for windows build, pls check detail: scripts/cmake-build/BUILD_README.md") endif() @@ -131,12 +131,20 @@ if(MSVC OR WIN32) set(WIN_FLAGS "${WIN_FLAGS} -Wno-error=zero-as-null-pointer-constant -Wno-error=implicit-int-conversion") set(WIN_FLAGS "${WIN_FLAGS} -Wno-error=float-conversion -Wno-error=shadow-field -Wno-error=covered-switch-default") set(WIN_FLAGS "${WIN_FLAGS} -Wno-error=deprecated -Wno-error=documentation -Wno-error=unreachable-code-break") - set(WIN_FLAGS "${WIN_FLAGS} /DWIN32 -Wno-macro-redefined /D_WIN32_WINNT=0x0601") + set(WIN_FLAGS "${WIN_FLAGS} /DWIN32 -Wno-macro-redefined /D_WIN32_WINNT=0x0601 /wd4819") set(WIN_FLAGS "${WIN_FLAGS} /D_CRT_SECURE_NO_DEPRECATE /D_CRT_SECURE_NO_WARNINGS /DNOGDI /D_USE_MATH_DEFINES /bigobj") set(WIN_FLAGS "${WIN_FLAGS} /Zm500 /EHs /wd4351 /wd4291 /wd4250 /wd4996 /wd4819 -Wno-inconsistent-dllimport") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${WIN_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${WIN_FLAGS}") + + #FIXME: fix halide JIT on windows + message("-- disable jit and halide on windows host build...") + set(MGE_WITH_HALIDE OFF) + set(MGE_WITH_JIT OFF) + #FIXME: fix MegRay on windows + message("-- Disable distributed build on windows host build...") + set(MGE_WITH_DISTRIBUTED OFF) else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra") set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g") @@ -286,7 +294,16 @@ if(MGE_WITH_CUDA) set(CMAKE_CUDA_FLAGS_RELEASE "-O3") set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O3 -g") set(CMAKE_CUDA_FLAGS_MINSIZEREL "-Os") - set(CMAKE_CUDA_FLAGS "-Xcompiler -Wall,-Wextra -Xfatbin -compress-all") + if(MSVC OR WIN32) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xfatbin -compress-all") + set(CCBIN_FLAG "${CCBIN_FLAG} /wd4819 /wd4334 /wd4267 /wd4002 /wd4244 /wd4068") + if(${CMAKE_BUILD_TYPE} STREQUAL "Debug" OR ${CMAKE_BUILD_TYPE} STREQUAL 
"RelWithDebInfo") + set(CCBIN_FLAG "${CCBIN_FLAG} -D_ITERATOR_DEBUG_LEVEL=2 -MTd") + endif() + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --compiler-options \" ${CCBIN_FLAG} \" ") + else() + set(CMAKE_CUDA_FLAGS "-Xcompiler -Wall,-Wextra -Xfatbin -compress-all") + endif() if(NOT MGE_ENABLE_RTTI) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -fno-rtti") @@ -332,15 +349,29 @@ if(MGE_WITH_CUDA) endif() if(MGE_CUDA_USE_STATIC) if(MGE_WITH_TRT) - list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libnvinfer libcudnn -Wl,--no-whole-archive) + if(MSVC OR WIN32) + list(APPEND MGE_CUDA_LIBS ${TRT_LIBRARY} ${CUDNN_LIBRARY}) + message("-- windows TRT_LIBRARY: ${TRT_LIBRARY}") + message("-- windows CUDNN_LIBRARY: ${CUDNN_LIBRARY}") + else() + list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libnvinfer libcudnn -Wl,--no-whole-archive) + endif() else() list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libcudnn -Wl,--no-whole-archive) endif() - list(APPEND MGE_CUDA_LIBS cusolver_static cublas_static curand_static culibos cudart_static cusparse_static) + if(MSVC OR WIN32) + list(APPEND MGE_CUDA_LIBS cusolver.lib cublas.lib curand.lib cudart_static.lib cusparse.lib) + else() + list(APPEND MGE_CUDA_LIBS cusolver_static cublas_static curand_static culibos cudart_static cusparse_static) + endif() if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.1.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.1.0") - list(APPEND MGE_CUDA_LIBS cublasLt_static) + if(MSVC OR WIN32) + list(APPEND MGE_CUDA_LIBS cublasLt.lib) + else() + list(APPEND MGE_CUDA_LIBS cublasLt_static) + endif() endif() - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.0.0") + if((${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.0.0") AND NOT MSVC AND NOT WIN32) # mark all symbols from liblapack_static.a as weak to avoid # duplicated definition with mkl find_library( @@ -377,7 +408,11 @@ if(MGE_WITH_CUDA) endif() add_subdirectory(dnn/cuda-stub) - list(APPEND MGE_CUDA_LIBS nvrtc cuda-stub nvToolsExt) + if(MSVC OR WIN32) + list(APPEND MGE_CUDA_LIBS nvrtc.lib cuda-stub) + else() + list(APPEND MGE_CUDA_LIBS nvrtc cuda-stub nvToolsExt) + endif() set(MGE_CUDA_LIBS "${MGE_CUDA_LIBS}") endif() @@ -699,3 +734,22 @@ if (NOT MGE_WITH_DISTRIBUTED) ${CMAKE_CURRENT_BINARY_DIR}/MegEngineConfigVersion.cmake DESTINATION ${MGE_INSTALL_CMAKEDIR}) endif() + +if(MSVC OR WIN32) + add_compile_options( + $<$:/MT> + $<$:/MTd> + $<$:/MT> + ) + foreach (CompilerFlag + CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE + CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO + CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) + if(${CompilerFlag} MATCHES "/MD") + string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}") + set(${CompilerFlag} "${${CompilerFlag}}" CACHE STRING "msvc compiler flags" FORCE) + message("MSVC flags: ${CompilerFlag}:${${CompilerFlag}}") + endif() + endforeach() +endif() diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index b8cef397..6310c95d 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -9,7 +9,7 @@ endif() if(MGE_CUDA_USE_STATIC) find_library(CUDNN_LIBRARY - NAMES libcudnn_static.a libcudnn_static.lib + NAMES libcudnn_static.a cudnn.lib PATHS $ENV{LD_LIBRARY_PATH} ${CUDNN_ROOT_DIR} ${PC_CUDNN_LIBRARY_DIRS} ${CMAKE_INSTALL_PREFIX} HINTS ${SYSTEM_LIBRARY_PATHS} PATH_SUFFIXES lib lib64 @@ -30,7 +30,7 @@ endif() 
get_filename_component(__found_cudnn_root ${CUDNN_LIBRARY}/../.. REALPATH) find_path(CUDNN_INCLUDE_DIR NAMES cudnn.h - HINTS ${PC_CUDNN_INCLUDE_DIRS} ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_INCLUDE} ${__found_cudnn_root} + HINTS $ENV{PC_CUDNN_INCLUDE_DIRS} ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_INCLUDE} ${__found_cudnn_root} PATH_SUFFIXES include DOC "Path to CUDNN include directory." ) diff --git a/cmake/tensorrt.cmake b/cmake/tensorrt.cmake index 7205f907..49062cf8 100644 --- a/cmake/tensorrt.cmake +++ b/cmake/tensorrt.cmake @@ -1,17 +1,17 @@ -if($ENV{LIBRARY_PATH}) +if(NOT "$ENV{LIBRARY_PATH}" STREQUAL "") string(REPLACE ":" ";" SYSTEM_LIBRARY_PATHS $ENV{LIBRARY_PATH}) endif() if(MGE_CUDA_USE_STATIC) find_library(TRT_LIBRARY - NAMES libnvinfer_static.a libnvinfer_static.lib + NAMES libnvinfer_static.a nvinfer.lib PATHS $ENV{LD_LIBRARY_PATH} ${TRT_ROOT_DIR} ${CMAKE_INSTALL_PREFIX} HINTS ${SYSTEM_LIBRARY_PATHS} PATH_SUFFIXES lib lib64 DOC "TRT library." ) else() find_library(TRT_LIBRARY - NAMES libnvinfer.so libnvinfer.dylib + NAMES libnvinfer.so libnvinfer.dylib nvinfer.dll PATHS $ENV{LD_LIBRARY_PATH} ${TRT_ROOT_DIR} ${CMAKE_INSTALL_PREFIX} HINTS ${SYSTEM_LIBRARY_PATHS} PATH_SUFFIXES lib lib64 diff --git a/dnn/cuda-stub/CMakeLists.txt b/dnn/cuda-stub/CMakeLists.txt index 2bd634a2..ac75d255 100644 --- a/dnn/cuda-stub/CMakeLists.txt +++ b/dnn/cuda-stub/CMakeLists.txt @@ -1,7 +1,15 @@ file (GLOB_RECURSE SOURCES src/*.cpp) -add_library (cuda-stub SHARED ${SOURCES}) +if(MSVC OR WIN32) + add_library (cuda-stub STATIC ${SOURCES}) +else() + add_library (cuda-stub SHARED ${SOURCES}) +endif() set_target_properties(cuda-stub PROPERTIES OUTPUT_NAME cuda) target_compile_definitions(cuda-stub PRIVATE __CUDA_API_VERSION_INTERNAL) -target_link_libraries(cuda-stub PRIVATE dl -Wl,--no-undefined) +if (MSVC OR WIN32) + target_link_libraries(cuda-stub PRIVATE -Wl,--no-undefined) +else() + target_link_libraries(cuda-stub PRIVATE dl -Wl,--no-undefined) +endif() install (TARGETS cuda-stub EXPORT ${MGE_EXPORT_TARGETS}) diff --git a/dnn/include/megdnn/arch.h b/dnn/include/megdnn/arch.h index 0eef2351..c83323b7 100644 --- a/dnn/include/megdnn/arch.h +++ b/dnn/include/megdnn/arch.h @@ -140,4 +140,9 @@ #define MEGDNN_DEVICE #endif +#if defined(_MSC_VER) || defined(WIN32) + #define ATTR_ALIGNED(v) __declspec(align(v)) +#else + #define ATTR_ALIGNED(v) __attribute__((aligned(v))) +#endif // vim: syntax=cpp.doxygen diff --git a/dnn/include/megdnn/basic_types.h b/dnn/include/megdnn/basic_types.h index 6c8c8cf1..1ba892a7 100644 --- a/dnn/include/megdnn/basic_types.h +++ b/dnn/include/megdnn/basic_types.h @@ -215,9 +215,9 @@ struct TensorLayout : public TensorShape { DType dtype; Format format; -#if MEGDNN_CC_HOST TensorLayout(); +#if MEGDNN_CC_HOST TensorLayout(const TensorLayout& layout) = default; //! 
create empty layout with given dtype diff --git a/dnn/src/cuda/conv_bias/chanwise/fwd.cu b/dnn/src/cuda/conv_bias/chanwise/fwd.cu index 83e71ee2..477857bb 100644 --- a/dnn/src/cuda/conv_bias/chanwise/fwd.cu +++ b/dnn/src/cuda/conv_bias/chanwise/fwd.cu @@ -275,51 +275,52 @@ __global__ void kern_fwd_half(__half* dst, const __half* src, #define SET_SW(func, type, sw) \ if (param.flt_h == 2 && param.flt_w == 2) { \ - kern = func; \ + f_struct.f = func; \ } else if (param.flt_h == 3 && param.flt_w == 3) { \ - kern = func; \ + f_struct.f = func; \ } else if (param.flt_h == 5 && param.flt_w == 5) { \ - kern = func; \ + f_struct.f = func; \ } else if (param.flt_h == 7 && param.flt_w == 7) { \ - kern = func; \ + f_struct.f = func; \ } else { \ - kern = func; \ + f_struct.f = func; \ } -#define GET_KERN(func, type) \ - void (*kern)(type*, const type*, const type*, Param); \ - if (param.chl_mul == 1) { \ - if (param.stride_w == 1) { \ - SET_SW(func, type, 1) \ - } else { \ - SET_SW(func, type, 0) \ - } \ - } else { \ - kern = func; \ - } \ - return kern; +#define GET_KERN(func, type) \ + FixFunction f_struct; \ + if (param.chl_mul == 1) { \ + if (param.stride_w == 1) { \ + SET_SW(func, type, 1) \ + } else { \ + SET_SW(func, type, 0) \ + } \ + } else { \ + f_struct.f = func; \ + } \ + return f_struct; template -void (*get_kern(const Param& param))(T*, const T*, const T*, const Param); +struct FixFunction { + void (*f)(T*, const T*, const T*, Param); +}; + +template +FixFunction get_kern(const Param& param); template <> -void (*get_kern(const Param& param))(float*, const float*, const float*, - const Param) { +FixFunction get_kern(const Param& param) { GET_KERN(kern_fwd_float, float); } #if CUDA_VERSION >= 9000 template <> -void (*get_kern<__half>(const Param& param))(__half*, const __half*, - const __half*, const Param) { +FixFunction<__half> get_kern<__half>(const Param& param) { GET_KERN(kern_fwd_half, __half); } #endif template <> -void (*get_kern(const Param& param))(dt_float16*, const dt_float16*, - const dt_float16*, - const Param) { +FixFunction get_kern(const Param& param) { GET_KERN(kern_fwd_float, dt_float16); } @@ -337,7 +338,7 @@ template void run_fwd(T* dst, const T* src, const T* flt, const Param& param, cudaStream_t stream) { void (*kern)(T*, const T*, const T*, Param); - kern = get_kern(param); + kern = get_kern(param).f; int nr_thread = query_blocksize_for_kernel(kern), nr_out_dimx = param.out_h * param.out_w * param.batch * param.chl_mul; diff --git a/dnn/src/cuda/convolution/chanwise/bwd_data.cu b/dnn/src/cuda/convolution/chanwise/bwd_data.cu index b0d345a5..6c50c6bd 100644 --- a/dnn/src/cuda/convolution/chanwise/bwd_data.cu +++ b/dnn/src/cuda/convolution/chanwise/bwd_data.cu @@ -178,25 +178,29 @@ __global__ void kern_bwd_data_hf(__half* src_grad, const __half* dst_grad, __half2 dst2 = {0.0, 0.0}; if (static_cast(ow) < static_cast(owmin_y)) { - dst2 = {*(pd + ow), 0.0}; + dst2.x = *(pd + ow); + dst2.y = 0.0; sum = fma2(dst2, flt3, sum); ++ow; --fw; } if (static_cast(owmax_x) < static_cast(owmax)) { - dst2 = {0.0, *(pd + owmax)}; + dst2.x = 0.0; + dst2.y = *(pd + owmax); sum = fma2(dst2, flt0, sum); } if (static_cast(fw) == 1) { - dst2 = {*(pd + ow), *(pd + ow)}; + dst2.x = *(pd + ow); + dst2.y = *(pd + ow); sum = fma2(dst2, flt2, sum); ++ow; --fw; } if (static_cast(ow) <= static_cast(owmax_x)) { - dst2 = {*(pd + ow), *(pd + ow)}; + dst2.x = *(pd + ow); + dst2.y = *(pd + ow); sum = fma2(dst2, flt1, sum); } @@ -218,18 +222,21 @@ __global__ void kern_bwd_data_hf(__half* src_grad, const 
__half* dst_grad, __half2 dst2 = {0.0, 0.0}; if (static_cast(ow) < static_cast(owmin_y)) { - dst2 = {*(pd + ow), 0.0}; + dst2.x = *(pd + ow); + dst2.y = 0.0; sum = fma2(dst2, flt5, sum); ++ow; --fw; } if (static_cast(owmax_x) < static_cast(owmax)) { - dst2 = {0.0, *(pd + owmax)}; + dst2.x = 0.0; + dst2.y = *(pd + owmax); sum = fma2(dst2, flt0, sum); } if (static_cast(fw) == 3) { - dst2 = {*(pd + ow), *(pd + ow)}; + dst2.x = *(pd + ow); + dst2.y = *(pd + ow); sum = fma2(dst2, flt4, sum); ++ow; --fw; @@ -237,7 +244,8 @@ __global__ void kern_bwd_data_hf(__half* src_grad, const __half* dst_grad, if (static_cast(fw) == 2 && static_cast(ow) <= static_cast(owmax_x)) { - dst2 = {*(pd + ow), *(pd + ow)}; + dst2.x = *(pd + ow); + dst2.y = *(pd + ow); sum = fma2(dst2, flt3, sum); ++ow; --fw; @@ -245,7 +253,8 @@ __global__ void kern_bwd_data_hf(__half* src_grad, const __half* dst_grad, if (static_cast(fw) == 1 && static_cast(ow) <= static_cast(owmax_x)) { - dst2 = {*(pd + ow), *(pd + ow)}; + dst2.x = *(pd + ow); + dst2.y = *(pd + ow); sum = fma2(dst2, flt2, sum); ++ow; --fw; @@ -253,7 +262,8 @@ __global__ void kern_bwd_data_hf(__half* src_grad, const __half* dst_grad, if (static_cast(fw) == 0 && static_cast(ow) <= static_cast(owmax_x)) { - dst2 = {*(pd + ow), *(pd + ow)}; + dst2.x = *(pd + ow); + dst2.y = *(pd + ow); sum = fma2(dst2, flt1, sum); } @@ -270,8 +280,10 @@ __global__ void kern_bwd_data_hf(__half* src_grad, const __half* dst_grad, uint32_t fw = iw - ow + PW; if (static_cast(ow) <= static_cast(owmax)) { - pd2 = {*(pd + ow), *(pd + ow)}; - pf2 = {0.0, 0.0}; + pd2.x = *(pd + ow); + pd2.y = *(pd + ow); + pf2.x = 0.0; + pf2.y = 0.0; if (static_cast(ow) >= static_cast(owmin_y)) pf2.y = *(pf + fw + 1); @@ -425,16 +437,17 @@ __global__ void kern_bwd_data_hf(__half* src_grad, const __half* dst_grad, #define sh param.stride_h #define sw param.stride_w -#define SET_STRIDE(func, type, chl_mul, fh, fw) \ - if (sh == 1 && sw == 1) { \ - kern_ptr = func; \ - } else if (sh == 2 && sw == 2) { \ - kern_ptr = func; \ - } else { \ - kern_ptr = func; \ +#define SET_STRIDE(func, type, chl_mul, fh, fw) \ + if (sh == 1 && sw == 1) { \ + f_struct.f = func; \ + } else if (sh == 2 && sw == 2) { \ + f_struct.f = func; \ + } else { \ + f_struct.f = func; \ } #define GET_KERN(func, type) \ + FixFunction f_struct; \ if (param.chl_mul == 1) { \ if (param.flt_h == 3 && param.flt_w == 3) { \ SET_STRIDE(func, type, 1, 3, 3); \ @@ -447,36 +460,32 @@ __global__ void kern_bwd_data_hf(__half* src_grad, const __half* dst_grad, } \ } else { \ SET_STRIDE(func, type, 0, 0, 0); \ - } + } \ + return f_struct; + +template +struct FixFunction { + void (*f)(T*, const T*, const T*, const Param); +}; template -void (*get_kern(const Param& param))(T*, const T*, const T*, const Param); +FixFunction get_kern(const Param& param); template <> -void (*get_kern(const Param& param))(float*, const float*, const float*, - const Param) { - void (*kern_ptr)(float*, const float*, const float*, Param); +FixFunction get_kern(const Param& param) { GET_KERN(kern_bwd_data_float, float); - return kern_ptr; } #if CUDA_VERSION >= 9000 template <> -void (*get_kern<__half>(const Param& param))(__half*, const __half*, - const __half*, const Param) { - void (*kern_ptr)(__half*, const __half*, const __half*, Param); +FixFunction<__half> get_kern<__half>(const Param& param) { GET_KERN(kern_bwd_data_hf, __half); - return kern_ptr; } #endif template <> -void (*get_kern(const Param& param))(dt_float16*, const dt_float16*, - const dt_float16*, - const Param) { - void 
(*kern_ptr)(dt_float16*, const dt_float16*, const dt_float16*, Param); +FixFunction get_kern(const Param& param) { GET_KERN(kern_bwd_data_float, dt_float16); - return kern_ptr; } #undef sh @@ -494,7 +503,7 @@ template void run_bwd_data(T* src_grad, const T* dst_grad, const T* flt, const Param& param, cudaStream_t stream) { void (*kern)(T*, const T*, const T*, Param); - kern = get_kern(param); + kern = get_kern(param).f; int nr_thread = query_blocksize_for_kernel(kern), nr_out_dimx = param.src_h * param.src_w * param.batch; diff --git a/dnn/src/cuda/convolution/chanwise/bwd_filter.cu b/dnn/src/cuda/convolution/chanwise/bwd_filter.cu index 6a317b86..0fbab61a 100644 --- a/dnn/src/cuda/convolution/chanwise/bwd_filter.cu +++ b/dnn/src/cuda/convolution/chanwise/bwd_filter.cu @@ -193,7 +193,8 @@ __global__ void kern_bwd_filter_hf( return; } - sum2 = {0.0, 0.0}; + sum2.x = 0.0; + sum2.y = 0.0; __half2 src2{0.0, 0.0}; __half2 dst2{0.0, 0.0}; @@ -330,51 +331,74 @@ __global__ void kern_bwd_filter_hf( } #endif -#define GET_KERN(func, type) \ - switch(_p) { \ - case 1<<10: kern_ptr = func; break; \ - case 1<<9: kern_ptr = func; break; \ - case 1<<8: kern_ptr = func; break; \ - case 1<<7: kern_ptr = func; break; \ - case 1<<6: kern_ptr = func; break; \ - case 1<<5: kern_ptr = func; break; \ - case 1<<4: kern_ptr = func; break; \ - case 1<<3: kern_ptr = func; break; \ - case 1<<2: kern_ptr = func; break; \ - case 1<<1: kern_ptr = func; break; \ - case 1<<0: kern_ptr = func; break; \ - } +#define GET_KERN(func, type) \ + FixFunction f_struct; \ + switch (_p) { \ + case 1 << 10: \ + f_struct.f = func; \ + break; \ + case 1 << 9: \ + f_struct.f = func; \ + break; \ + case 1 << 8: \ + f_struct.f = func; \ + break; \ + case 1 << 7: \ + f_struct.f = func; \ + break; \ + case 1 << 6: \ + f_struct.f = func; \ + break; \ + case 1 << 5: \ + f_struct.f = func; \ + break; \ + case 1 << 4: \ + f_struct.f = func; \ + break; \ + case 1 << 3: \ + f_struct.f = func; \ + break; \ + case 1 << 2: \ + f_struct.f = func; \ + break; \ + case 1 << 1: \ + f_struct.f = func; \ + break; \ + case 1 << 0: \ + f_struct.f = func; \ + break; \ + default: \ + megdnn_assert(false, "DO NOT IMP CASE FUNCTION!!"); \ + } \ + return f_struct; + +template +struct FixFunction { + void (*f)(T*, const T*, const T*, Param); +}; template -void (*get_kern(const uint32_t& _p))(T*, const T*, const T*, Param); +FixFunction get_kern(const uint32_t& _p); template <> -void (*get_kern(const uint32_t& _p))(float*, const float*, const float*, Param) { - void (*kern_ptr)(float*, const float*, const float*, Param) = NULL; - GET_KERN(kern_bwd_filter_float, float); - return kern_ptr; +FixFunction get_kern(const uint32_t& _p) { + GET_KERN(kern_bwd_filter_float, float); } #if CUDA_VERSION >= 9000 template <> -void (*get_kern<__half>(const uint32_t& _p))(__half*, const __half*, const __half*, Param) { - void (*kern_ptr)(__half*, const __half*, const __half*, Param) = NULL; - GET_KERN(kern_bwd_filter_hf, __half); - return kern_ptr; +FixFunction<__half> get_kern<__half>(const uint32_t& _p) { + GET_KERN(kern_bwd_filter_hf, __half); } #endif template <> -void (*get_kern(const uint32_t& _p))(dt_float16*, const dt_float16*, - const dt_float16*, Param) { - void (*kern_ptr)(dt_float16*, const dt_float16*, const dt_float16*, Param) = NULL; +FixFunction get_kern(const uint32_t& _p) { GET_KERN(kern_bwd_filter_float, dt_float16); - return kern_ptr; } #undef GET_KERN -} // anonymous namespace - +} // anonymous namespace namespace megdnn { namespace cuda { @@ -385,7 +409,7 @@ 
void run_bwd_filter(T *filter_grad, const T *src, const T *dst_grad, const Param ¶m, cudaStream_t stream) { void (*kern)(T*, const T*, const T*, Param) = NULL; uint32_t - nr_thread = query_blocksize_for_kernel(get_kern(1024)), + nr_thread = query_blocksize_for_kernel(get_kern(1024).f), nr_thpf = std::min(nr_thread, std::max( 1, @@ -395,7 +419,7 @@ void run_bwd_filter(T *filter_grad, const T *src, const T *dst_grad, do { #define CK(_n) \ if (nr_thpf >= _n) { \ - kern = get_kern(_n); \ + kern = get_kern(_n).f; \ nr_thpf = _n; \ break; \ } diff --git a/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_basic.cuh b/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_basic.cuh index 2b3764d7..2d521326 100644 --- a/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_basic.cuh +++ b/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_basic.cuh @@ -155,7 +155,7 @@ struct BlockTileIteratorBasic { filter_gl2sh_visitor.copy(); } - consumer.template consume_block(src_gl2sh_visitor, + consumer.consume_block(src_gl2sh_visitor, filter_gl2sh_visitor); if (!(ci_outer == ci_blks - 1 && h == h_end && diff --git a/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_coxhw.cuh b/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_coxhw.cuh index 5b7d5bfc..ce06eea1 100644 --- a/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_coxhw.cuh +++ b/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_coxhw.cuh @@ -171,7 +171,7 @@ struct BlockTileIterator_COxHW { filter_gl2sh_visitor.copy(); } - consumer.template consume_block(src_gl2sh_visitor, + consumer.consume_block(src_gl2sh_visitor, filter_gl2sh_visitor); if (!(ci_outer == ci_blks - 1 && f == filter_pixels - 1)) { diff --git a/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_unroll_width.cuh b/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_unroll_width.cuh index d93ad24c..5002c825 100644 --- a/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_unroll_width.cuh +++ b/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_unroll_width.cuh @@ -162,7 +162,7 @@ struct BlockTileIteratorUnrollWidth { filter_gl2sh_visitor.copy(); } - consumer.template consume_block(src_gl2sh_visitor, + consumer.consume_block(src_gl2sh_visitor, filter_gl2sh_visitor); if (!(ci_outer == ci_blks - 1 && h == h_end && diff --git a/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_unroll_width_v2.cuh b/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_unroll_width_v2.cuh index 3dffebbd..130968d8 100644 --- a/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_unroll_width_v2.cuh +++ b/dnn/src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator_unroll_width_v2.cuh @@ -154,7 +154,7 @@ struct BlockTileIteratorUnrollWidthV2 { filter_gl2sh_visitor.copy(); } - consumer.template consume_block(src_gl2sh_visitor, + consumer.consume_block(src_gl2sh_visitor, filter_gl2sh_visitor); if (!(ci_outer == ci_blks - 1 && h == h_end)) { diff --git a/dnn/src/cuda/convolution_helper/kernel.cuh b/dnn/src/cuda/convolution_helper/kernel.cuh index 72b598fe..8b10dfde 100644 --- a/dnn/src/cuda/convolution_helper/kernel.cuh +++ b/dnn/src/cuda/convolution_helper/kernel.cuh @@ -72,7 +72,7 @@ __global__ void convolution_kernel( DataGlobal2ShareMemVisitor src_gl2sh_visitor{smem_src}; 
FilterGlobal2ShareMemVisitor filter_gl2sh_visitor{smem_filter}; if (check_bounds) { - block_iterator.template set_remain(src_gl2sh_visitor, + block_iterator.set_remain(src_gl2sh_visitor, filter_gl2sh_visitor); } @@ -89,7 +89,7 @@ __global__ void convolution_kernel( GlobalMemoryWriter global_memory_writer; global_memory_writer.init(smem_dst, alpha, beta); if (check_bounds) { - block_iterator.template set_remain(global_memory_writer); + block_iterator.set_remain(global_memory_writer); } bias.move(block_iterator.block_batch, block_iterator.block_out_channel, block_iterator.block_out_height, block_iterator.block_out_width); @@ -130,7 +130,7 @@ __global__ void convolution_kernel_precomp_offset( DataGlobal2ShareMemVisitor src_gl2sh_visitor{smem_src, offset}; FilterGlobal2ShareMemVisitor filter_gl2sh_visitor{smem_filter}; if (check_bounds) { - block_iterator.template set_remain(src_gl2sh_visitor, + block_iterator.set_remain(src_gl2sh_visitor, filter_gl2sh_visitor); } @@ -147,7 +147,7 @@ __global__ void convolution_kernel_precomp_offset( GlobalMemoryWriter global_memory_writer; global_memory_writer.init(smem_dst, alpha, beta); if (check_bounds) { - block_iterator.template set_remain(global_memory_writer); + block_iterator.set_remain(global_memory_writer); } bias.move(block_iterator.block_batch, block_iterator.block_out_channel, block_iterator.block_out_height, block_iterator.block_out_width); diff --git a/dnn/src/cuda/deformable_ps_roi_pooling/kimpl/kern.cu b/dnn/src/cuda/deformable_ps_roi_pooling/kimpl/kern.cu index a7877697..514d1d55 100644 --- a/dnn/src/cuda/deformable_ps_roi_pooling/kimpl/kern.cu +++ b/dnn/src/cuda/deformable_ps_roi_pooling/kimpl/kern.cu @@ -259,8 +259,8 @@ void DeformablePSROIPoolForward(const TensorND& data, const TensorND& rois, auto&& out_data_elems = out_data.layout.total_nr_elems(); auto&& out_count_elems = out_count.layout.total_nr_elems(); - size_t out_data_bytes = sizeof(float[out_data_elems]); - size_t out_count_bytes = sizeof(float[out_count_elems]); + size_t out_data_bytes = sizeof(float) * out_data_elems; + size_t out_count_bytes = sizeof(float) * out_count_elems; cudaMemsetAsync(out_data_ptr, 0, out_data_bytes, p.stream); cudaMemsetAsync(out_count_ptr, 0, out_count_bytes, p.stream); @@ -292,8 +292,8 @@ void DeformablePSROIPoolBackwardAcc(const TensorND& data, const TensorND& rois, auto&& data_diff_elems = data_diff.layout.total_nr_elems(); auto&& trans_diff_elems = trans_diff.layout.total_nr_elems(); - size_t data_diff_bytes = sizeof(float[data_diff_elems]); - size_t trans_diff_bytes = sizeof(float[trans_diff_elems]); + size_t data_diff_bytes = sizeof(float) * data_diff_elems; + size_t trans_diff_bytes = sizeof(float) * trans_diff_elems; cudaMemsetAsync(data_diff_ptr, 0, data_diff_bytes, p.stream); cudaMemsetAsync(trans_diff_ptr, 0, trans_diff_bytes, p.stream); diff --git a/dnn/src/cuda/elemwise_helper.cuh b/dnn/src/cuda/elemwise_helper.cuh index 94e184f6..db548bc6 100644 --- a/dnn/src/cuda/elemwise_helper.cuh +++ b/dnn/src/cuda/elemwise_helper.cuh @@ -58,7 +58,7 @@ enum BcastType { template class VectTypeTrait; -struct __attribute__((aligned(8))) half4 { +struct ATTR_ALIGNED(8) half4 { dt_float16 x, y, z, w; }; @@ -69,7 +69,7 @@ __device__ __forceinline__ half4 make_half4(dt_float16 x, dt_float16 y, return t; } -struct __attribute__((aligned(8))) bhalf4 { +struct ATTR_ALIGNED(8) bhalf4 { dt_bfloat16 x, y, z, w; }; diff --git a/scripts/cmake-build/BUILD_README.md b/scripts/cmake-build/BUILD_README.md index d973b706..f9c70a51 100644 --- 
a/scripts/cmake-build/BUILD_README.md +++ b/scripts/cmake-build/BUILD_README.md @@ -1,8 +1,8 @@ # build support status ## host build -* windows build (ok) -* linux build (ok) -* macos build (ok) +* windows build (cpu + gpu) +* linux build (cpu + gpu) +* macos build (cpu only) ## cross build * windows cross build arm-android (ok) * windows cross build arm-linux (ok) @@ -17,9 +17,19 @@ ### windows host build ``` 1: installl Visual Studio (need support LLVM/clang-cl), eg 2019 - clang-cl 9 linker have crash issue, pls install 7/8/10 + pls install LLVM-10, VS llvm linker have issue, pls replace lld-link.exe, + download from https://releases.llvm.org/download.html#10.0.0 2: install extension of VS: python/cmake/LLVM 3: CUDA env(if enable CUDA), version detail: project_root_dir/README.md + 4: now we support cuda10.1+cudnn7.6+TensorRT6.0 on windows, as windows can + only use dll in fact with cudnn/TensorRT, so please install the same version; + 4a: install cuda10.1 to C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1 + 4b: install cudnn7.6 to C:\Program Files\NVIDIA GPU Computing Toolkit\cudnn-10.1-windows10-x64-v7.6.5.32 + 4c: install TensorRT6.0 to C:\Program Files\NVIDIA GPU Computing Toolkit\TensorRT-6.0.1.5 + 4d: add C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1\bin to system Path env + 4e: add C:\Program Files\NVIDIA GPU Computing Toolkit\cudnn-10.1-windows10-x64-v7.6.5.32\cuda\bin to system Path env + 4f: add C:\Program Files\NVIDIA GPU Computing Toolkit\TensorRT-6.0.1.5\lib Path + if u do not do 4d/4e/4f, CUDA runtime can not find dll ``` ### linux host build ``` diff --git a/scripts/cmake-build/host_build.sh b/scripts/cmake-build/host_build.sh index 01aaecc2..78e1562e 100755 --- a/scripts/cmake-build/host_build.sh +++ b/scripts/cmake-build/host_build.sh @@ -162,8 +162,22 @@ function prepare_env_for_windows_build() { windows_env_err fi - export PATH=$VS_PATH/VC/Auxiliary/Build:$PATH echo "put vcvarsall.bat path to PATH env.." + export PATH=$VS_PATH/VC/Auxiliary/Build:$PATH + + echo "config cuda/cudnn/TensorRT env..." 
+ export NIVIDA_INSTALL_PRE=/c/Program\ Files/NVIDIA\ GPU\ Computing\ Toolkit + export CUDA_V=v10.1 + export CUDNN_V=cudnn-10.1-windows10-x64-v7.6.5.32 + export TRT_V=TensorRT-6.0.1.5 + export CUDA_PATH=$NIVIDA_INSTALL_PRE/CUDA/${CUDA_V} + export PATH=$PATH:$CUDA_PATH/bin + export CUDA_BIN_PATH=$CUDA_PATH + export PC_CUDNN_INCLUDE_DIRS=$NIVIDA_INSTALL_PRE/${CUDNN_V}/cuda/include + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$NIVIDA_INSTALL_PRE/${TRT_V}/lib:$NIVIDA_INSTALL_PRE/CUDA/${CUDA_V}/lib/x64:$NIVIDA_INSTALL_PRE/${CUDNN_V}/cuda/lib/x64 + export CPATH=$CPATH:$NIVIDA_INSTALL_PRE/${TRT_V}/include:$NIVIDA_INSTALL_PRE/CUDA/${CUDA_V}/include:$NIVIDA_INSTALL_PRE/CUDA/${CUDA_V}/include/nvtx3:$PC_CUDNN_INCLUDE_DIRS + export LIBRARY_PATH=$LIBRARY_PATH:$LD_LIBRARY_PATH + export INCLUDE=$INCLUDE:$CPATH } WINDOWS_BUILD_TARGET="Ninja all > build.log" diff --git a/scripts/whl/BUILD_PYTHON_WHL_README.md b/scripts/whl/BUILD_PYTHON_WHL_README.md index 10066640..8b86b179 100644 --- a/scripts/whl/BUILD_PYTHON_WHL_README.md +++ b/scripts/whl/BUILD_PYTHON_WHL_README.md @@ -1,7 +1,7 @@ # python whl package build support status -* windows build (ok,cpu only) -* linux build (ok, cpu or gpu) -* macos build (ok,cpu only) +* windows build (cpu + gpu) +* linux build (cpu + gpu) +* macos build (cpu only) # build env prepare ## linux diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index d4bedb37..4857ed2e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -52,8 +52,10 @@ foreach (INCPATH IN LISTS MGB_INC) endforeach() if(MGE_WITH_CUDA) - target_compile_options(megbrain PRIVATE "$<$:-Xcompiler=-Wno-unused-parameter>" - "$<$>:-Wno-unused-parameter>") + if(NOT WIN32 AND NOT MSVC) + target_compile_options(megbrain PRIVATE "$<$:-Xcompiler=-Wno-unused-parameter>" + "$<$>:-Wno-unused-parameter>") + endif() else() target_compile_options(megbrain PRIVATE "-Wno-unused-parameter") endif() diff --git a/src/core/impl/utils/debug.cpp b/src/core/impl/utils/debug.cpp index 23b0017d..63426ce0 100644 --- a/src/core/impl/utils/debug.cpp +++ b/src/core/impl/utils/debug.cpp @@ -195,12 +195,14 @@ public: static void set_flag(int f) { flag() = f; } static void init() { +#if !defined(WIN32) int err = pthread_atfork(&CudaCheckOnFork::atfork_prepare, nullptr, nullptr); if (err) { mgb_throw(SystemError, "failed to setup atfork handler: %s", strerror(err)); } +#endif } }; #endif -- GitLab
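Editor's note on the recurring `FixFunction` refactor in this patch: the chanwise forward/backward sources previously selected a CUDA kernel by returning a raw function pointer from an explicitly specialized template function (`void (*get_kern<T>(const Param&))(T*, const T*, const T*, const Param)`), and the patch rewrites every such selector to return a small `FixFunction<T>` struct holding the pointer instead, presumably because the raw-pointer-returning specialization syntax is problematic for the MSVC/clang-cl host toolchain targeted by this Windows build. The sketch below is a minimal, self-contained illustration of that pattern under simplified assumptions: the `Param` struct, the kernel body, and the reduced argument list are placeholders, not MegEngine code; only the `FixFunction`/`get_kern`/`f_struct` naming follows the patch.

```cuda
// fixfunction_sketch.cu -- illustrative only; compile with `nvcc fixfunction_sketch.cu`
#include <cuda_runtime.h>

// Placeholder for the real chanwise Param (stride, filter size, ...).
struct Param {
    int stride_w;
};

// Placeholder kernel standing in for kern_fwd_float / kern_bwd_data_float / etc.
template <typename T>
__global__ void kern_fwd(T* dst, const T* src, Param /*param*/) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    dst[i] = src[i];
}

// Instead of returning the function pointer directly from a specialized
// template function, wrap it in a struct and return the struct by value.
template <typename T>
struct FixFunction {
    void (*f)(T*, const T*, Param);
};

template <typename T>
FixFunction<T> get_kern(const Param& param);

template <>
FixFunction<float> get_kern<float>(const Param& param) {
    FixFunction<float> f_struct;
    // The real code dispatches on param (stride, filter size, channel multiplier);
    // here there is only one kernel to pick.
    (void)param;
    f_struct.f = kern_fwd<float>;
    return f_struct;
}

int main() {
    Param p{1};
    // Callers recover the pointer with `.f`, mirroring `kern = get_kern<T>(param).f;`
    // in the patched run_fwd / run_bwd_data / run_bwd_filter functions.
    void (*kern)(float*, const float*, Param) = get_kern<float>(p).f;

    float *dst = nullptr, *src = nullptr;
    cudaMalloc(&dst, 32 * sizeof(float));
    cudaMalloc(&src, 32 * sizeof(float));
    kern<<<1, 32>>>(dst, src, p);   // launching through a pointer to a __global__ function
    cudaDeviceSynchronize();
    cudaFree(dst);
    cudaFree(src);
    return 0;
}
```

The wrapper changes nothing at runtime: the struct is returned by value, the caller immediately reads `.f`, and the kernel is launched through the recovered pointer exactly as before; only the declaration syntax that the Windows host compiler has to parse is simplified.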