提交 4a178a8d 编写于 作者: M Megvii Engine Team 提交者: Xinran Xu

feat(windows/cuda/cmake): support cmake cuda build on windows

GitOrigin-RevId: 4d9832e5592cb0e2e39172f0ebf52546e1e5b269
上级 1915593b
...@@ -116,7 +116,7 @@ endif() ...@@ -116,7 +116,7 @@ endif()
if(MSVC OR WIN32) if(MSVC OR WIN32)
add_compile_definitions(NOMINMAX=1 _USE_MATH_DEFINES=1 WIN32=1) add_compile_definitions(NOMINMAX=1 _USE_MATH_DEFINES=1 WIN32=1)
message("-- into windows build...") message("-- into windows build...")
message(" -- CMAKE_C_COMPILER_ID: ${CMAKE_C_COMPILER_ID}") message("-- CMAKE_C_COMPILER_ID: ${CMAKE_C_COMPILER_ID}")
if (${CMAKE_C_COMPILER_ID} STREQUAL "Clang-cl") if (${CMAKE_C_COMPILER_ID} STREQUAL "Clang-cl")
message(FATAL_ERROR "only support clang-cl for windows build, pls check detail: scripts/cmake-build/BUILD_README.md") message(FATAL_ERROR "only support clang-cl for windows build, pls check detail: scripts/cmake-build/BUILD_README.md")
endif() endif()
...@@ -131,12 +131,20 @@ if(MSVC OR WIN32) ...@@ -131,12 +131,20 @@ if(MSVC OR WIN32)
set(WIN_FLAGS "${WIN_FLAGS} -Wno-error=zero-as-null-pointer-constant -Wno-error=implicit-int-conversion") set(WIN_FLAGS "${WIN_FLAGS} -Wno-error=zero-as-null-pointer-constant -Wno-error=implicit-int-conversion")
set(WIN_FLAGS "${WIN_FLAGS} -Wno-error=float-conversion -Wno-error=shadow-field -Wno-error=covered-switch-default") set(WIN_FLAGS "${WIN_FLAGS} -Wno-error=float-conversion -Wno-error=shadow-field -Wno-error=covered-switch-default")
set(WIN_FLAGS "${WIN_FLAGS} -Wno-error=deprecated -Wno-error=documentation -Wno-error=unreachable-code-break") set(WIN_FLAGS "${WIN_FLAGS} -Wno-error=deprecated -Wno-error=documentation -Wno-error=unreachable-code-break")
set(WIN_FLAGS "${WIN_FLAGS} /DWIN32 -Wno-macro-redefined /D_WIN32_WINNT=0x0601") set(WIN_FLAGS "${WIN_FLAGS} /DWIN32 -Wno-macro-redefined /D_WIN32_WINNT=0x0601 /wd4819")
set(WIN_FLAGS "${WIN_FLAGS} /D_CRT_SECURE_NO_DEPRECATE /D_CRT_SECURE_NO_WARNINGS /DNOGDI /D_USE_MATH_DEFINES /bigobj") set(WIN_FLAGS "${WIN_FLAGS} /D_CRT_SECURE_NO_DEPRECATE /D_CRT_SECURE_NO_WARNINGS /DNOGDI /D_USE_MATH_DEFINES /bigobj")
set(WIN_FLAGS "${WIN_FLAGS} /Zm500 /EHs /wd4351 /wd4291 /wd4250 /wd4996 /wd4819 -Wno-inconsistent-dllimport") set(WIN_FLAGS "${WIN_FLAGS} /Zm500 /EHs /wd4351 /wd4291 /wd4250 /wd4996 /wd4819 -Wno-inconsistent-dllimport")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${WIN_FLAGS}") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${WIN_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${WIN_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${WIN_FLAGS}")
# FIXME: Halide and the JIT still need fixing on Windows, so both options are
# forced OFF here for Windows host builds, whatever was requested earlier.
#FIXME: fix halide JIT on windows
message("-- disable jit and halide on windows host build...")
set(MGE_WITH_HALIDE OFF)
set(MGE_WITH_JIT OFF)
# FIXME: MegRay still needs fixing on Windows, so the distributed build is
# forced OFF here for Windows host builds.
#FIXME: fix MegRay on windows
message("-- Disable distributed build on windows host build...")
set(MGE_WITH_DISTRIBUTED OFF)
else() else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra")
set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g") set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g")
...@@ -286,7 +294,16 @@ if(MGE_WITH_CUDA) ...@@ -286,7 +294,16 @@ if(MGE_WITH_CUDA)
set(CMAKE_CUDA_FLAGS_RELEASE "-O3") set(CMAKE_CUDA_FLAGS_RELEASE "-O3")
set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O3 -g") set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O3 -g")
set(CMAKE_CUDA_FLAGS_MINSIZEREL "-Os") set(CMAKE_CUDA_FLAGS_MINSIZEREL "-Os")
if(MSVC OR WIN32)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xfatbin -compress-all")
set(CCBIN_FLAG "${CCBIN_FLAG} /wd4819 /wd4334 /wd4267 /wd4002 /wd4244 /wd4068")
# For Debug and RelWithDebInfo builds, hand the host compiler the debug static
# runtime (-MTd) and iterator debug level 2 through CCBIN_FLAG.
# CMAKE_BUILD_TYPE is quoted on purpose: when it is empty (no build type given,
# or a multi-config generator) an unquoted expansion disappears from the if()
# argument list and CMake aborts with "Unknown arguments specified".
# NOTE(review): -MTd is also applied for RelWithDebInfo here; confirm this
# matches the runtime library the host C/C++ objects use in that configuration.
if("${CMAKE_BUILD_TYPE}" STREQUAL "Debug" OR "${CMAKE_BUILD_TYPE}" STREQUAL "RelWithDebInfo")
    set(CCBIN_FLAG "${CCBIN_FLAG} -D_ITERATOR_DEBUG_LEVEL=2 -MTd")
endif()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --compiler-options \" ${CCBIN_FLAG} \" ")
else()
set(CMAKE_CUDA_FLAGS "-Xcompiler -Wall,-Wextra -Xfatbin -compress-all") set(CMAKE_CUDA_FLAGS "-Xcompiler -Wall,-Wextra -Xfatbin -compress-all")
endif()
if(NOT MGE_ENABLE_RTTI) if(NOT MGE_ENABLE_RTTI)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -fno-rtti") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -fno-rtti")
...@@ -332,15 +349,29 @@ if(MGE_WITH_CUDA) ...@@ -332,15 +349,29 @@ if(MGE_WITH_CUDA)
endif() endif()
if(MGE_CUDA_USE_STATIC) if(MGE_CUDA_USE_STATIC)
if(MGE_WITH_TRT) if(MGE_WITH_TRT)
if(MSVC OR WIN32)
list(APPEND MGE_CUDA_LIBS ${TRT_LIBRARY} ${CUDNN_LIBRARY})
message("-- windows TRT_LIBRARY: ${TRT_LIBRARY}")
message("-- windows CUDNN_LIBRARY: ${CUDNN_LIBRARY}")
else()
list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libnvinfer libcudnn -Wl,--no-whole-archive) list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libnvinfer libcudnn -Wl,--no-whole-archive)
endif()
else() else()
list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libcudnn -Wl,--no-whole-archive) list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libcudnn -Wl,--no-whole-archive)
endif() endif()
if(MSVC OR WIN32)
list(APPEND MGE_CUDA_LIBS cusolver.lib cublas.lib curand.lib cudart_static.lib cusparse.lib)
else()
list(APPEND MGE_CUDA_LIBS cusolver_static cublas_static curand_static culibos cudart_static cusparse_static) list(APPEND MGE_CUDA_LIBS cusolver_static cublas_static curand_static culibos cudart_static cusparse_static)
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.1.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.1.0") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.1.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.1.0")
if(MSVC OR WIN32)
list(APPEND MGE_CUDA_LIBS cublasLt.lib)
else()
list(APPEND MGE_CUDA_LIBS cublasLt_static) list(APPEND MGE_CUDA_LIBS cublasLt_static)
endif() endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.0.0") endif()
if((${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.0.0") AND NOT MSVC AND NOT WIN32)
# mark all symbols from liblapack_static.a as weak to avoid # mark all symbols from liblapack_static.a as weak to avoid
# duplicated definition with mkl # duplicated definition with mkl
find_library( find_library(
...@@ -377,7 +408,11 @@ if(MGE_WITH_CUDA) ...@@ -377,7 +408,11 @@ if(MGE_WITH_CUDA)
endif() endif()
add_subdirectory(dnn/cuda-stub) add_subdirectory(dnn/cuda-stub)
if(MSVC OR WIN32)
list(APPEND MGE_CUDA_LIBS nvrtc.lib cuda-stub)
else()
list(APPEND MGE_CUDA_LIBS nvrtc cuda-stub nvToolsExt) list(APPEND MGE_CUDA_LIBS nvrtc cuda-stub nvToolsExt)
endif()
set(MGE_CUDA_LIBS "${MGE_CUDA_LIBS}") set(MGE_CUDA_LIBS "${MGE_CUDA_LIBS}")
endif() endif()
...@@ -699,3 +734,22 @@ if (NOT MGE_WITH_DISTRIBUTED) ...@@ -699,3 +734,22 @@ if (NOT MGE_WITH_DISTRIBUTED)
${CMAKE_CURRENT_BINARY_DIR}/MegEngineConfigVersion.cmake ${CMAKE_CURRENT_BINARY_DIR}/MegEngineConfigVersion.cmake
DESTINATION ${MGE_INSTALL_CMAKEDIR}) DESTINATION ${MGE_INSTALL_CMAKEDIR})
endif() endif()
# Windows only: select the static MSVC runtime (/MT, /MTd) for C and C++.
if(MSVC OR WIN32)
# Add the runtime switch per configuration via generator expressions:
# /MT when the configuration is empty or Release, /MTd for Debug.
# RelWithDebInfo and MinSizeRel get no switch from this call; they are only
# covered by the flag rewrite in the loop below.
add_compile_options(
$<$<CONFIG:>:/MT>
$<$<CONFIG:Debug>:/MTd>
$<$<CONFIG:Release>:/MT>
)
# Walk the global and per-configuration C/C++ flag variables. Whenever one
# contains "/MD", replace every "/MD" with "/MT" (so "/MDd" becomes "/MTd"),
# force the rewritten value into the cache and print it.
foreach (CompilerFlag
CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO
CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
# ${CompilerFlag} expands to the variable's name; if() then dereferences that
# name, so the match is made against the flag string itself.
if(${CompilerFlag} MATCHES "/MD")
string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}")
# FORCE overwrites any existing cache entry with the rewritten flags.
set(${CompilerFlag} "${${CompilerFlag}}" CACHE STRING "msvc compiler flags" FORCE)
message("MSVC flags: ${CompilerFlag}:${${CompilerFlag}}")
endif()
endforeach()
endif()
...@@ -9,7 +9,7 @@ endif() ...@@ -9,7 +9,7 @@ endif()
if(MGE_CUDA_USE_STATIC) if(MGE_CUDA_USE_STATIC)
find_library(CUDNN_LIBRARY find_library(CUDNN_LIBRARY
NAMES libcudnn_static.a libcudnn_static.lib NAMES libcudnn_static.a cudnn.lib
PATHS $ENV{LD_LIBRARY_PATH} ${CUDNN_ROOT_DIR} ${PC_CUDNN_LIBRARY_DIRS} ${CMAKE_INSTALL_PREFIX} PATHS $ENV{LD_LIBRARY_PATH} ${CUDNN_ROOT_DIR} ${PC_CUDNN_LIBRARY_DIRS} ${CMAKE_INSTALL_PREFIX}
HINTS ${SYSTEM_LIBRARY_PATHS} HINTS ${SYSTEM_LIBRARY_PATHS}
PATH_SUFFIXES lib lib64 PATH_SUFFIXES lib lib64
...@@ -30,7 +30,7 @@ endif() ...@@ -30,7 +30,7 @@ endif()
get_filename_component(__found_cudnn_root ${CUDNN_LIBRARY}/../.. REALPATH) get_filename_component(__found_cudnn_root ${CUDNN_LIBRARY}/../.. REALPATH)
find_path(CUDNN_INCLUDE_DIR find_path(CUDNN_INCLUDE_DIR
NAMES cudnn.h NAMES cudnn.h
HINTS ${PC_CUDNN_INCLUDE_DIRS} ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_INCLUDE} ${__found_cudnn_root} HINTS $ENV{PC_CUDNN_INCLUDE_DIRS} ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_INCLUDE} ${__found_cudnn_root}
PATH_SUFFIXES include PATH_SUFFIXES include
DOC "Path to CUDNN include directory." ) DOC "Path to CUDNN include directory." )
......
if($ENV{LIBRARY_PATH}) if(NOT "$ENV{LIBRARY_PATH}" STREQUAL "")
string(REPLACE ":" ";" SYSTEM_LIBRARY_PATHS $ENV{LIBRARY_PATH}) string(REPLACE ":" ";" SYSTEM_LIBRARY_PATHS $ENV{LIBRARY_PATH})
endif() endif()
if(MGE_CUDA_USE_STATIC) if(MGE_CUDA_USE_STATIC)
find_library(TRT_LIBRARY find_library(TRT_LIBRARY
NAMES libnvinfer_static.a libnvinfer_static.lib NAMES libnvinfer_static.a nvinfer.lib
PATHS $ENV{LD_LIBRARY_PATH} ${TRT_ROOT_DIR} ${CMAKE_INSTALL_PREFIX} PATHS $ENV{LD_LIBRARY_PATH} ${TRT_ROOT_DIR} ${CMAKE_INSTALL_PREFIX}
HINTS ${SYSTEM_LIBRARY_PATHS} HINTS ${SYSTEM_LIBRARY_PATHS}
PATH_SUFFIXES lib lib64 PATH_SUFFIXES lib lib64
DOC "TRT library." ) DOC "TRT library." )
else() else()
find_library(TRT_LIBRARY find_library(TRT_LIBRARY
NAMES libnvinfer.so libnvinfer.dylib NAMES libnvinfer.so libnvinfer.dylib nvinfer.dll
PATHS $ENV{LD_LIBRARY_PATH} ${TRT_ROOT_DIR} ${CMAKE_INSTALL_PREFIX} PATHS $ENV{LD_LIBRARY_PATH} ${TRT_ROOT_DIR} ${CMAKE_INSTALL_PREFIX}
HINTS ${SYSTEM_LIBRARY_PATHS} HINTS ${SYSTEM_LIBRARY_PATHS}
PATH_SUFFIXES lib lib64 PATH_SUFFIXES lib lib64
......
file (GLOB_RECURSE SOURCES src/*.cpp) file (GLOB_RECURSE SOURCES src/*.cpp)
add_library (cuda-stub SHARED ${SOURCES}) if(MSVC OR WIN32)
add_library (cuda-stub STATIC ${SOURCES})
else()
add_library (cuda-stub SHARED ${SOURCES})
endif()
set_target_properties(cuda-stub PROPERTIES OUTPUT_NAME cuda) set_target_properties(cuda-stub PROPERTIES OUTPUT_NAME cuda)
target_compile_definitions(cuda-stub PRIVATE __CUDA_API_VERSION_INTERNAL) target_compile_definitions(cuda-stub PRIVATE __CUDA_API_VERSION_INTERNAL)
target_link_libraries(cuda-stub PRIVATE dl -Wl,--no-undefined) if (MSVC OR WIN32)
target_link_libraries(cuda-stub PRIVATE -Wl,--no-undefined)
else()
target_link_libraries(cuda-stub PRIVATE dl -Wl,--no-undefined)
endif()
install (TARGETS cuda-stub EXPORT ${MGE_EXPORT_TARGETS}) install (TARGETS cuda-stub EXPORT ${MGE_EXPORT_TARGETS})
...@@ -140,4 +140,9 @@ ...@@ -140,4 +140,9 @@
#define MEGDNN_DEVICE #define MEGDNN_DEVICE
#endif #endif
// ATTR_ALIGNED(v): alignment specifier for a type declaration, requesting
// v-byte alignment. Expands to __declspec(align(v)) when _MSC_VER or WIN32 is
// defined, and to __attribute__((aligned(v))) otherwise.
#if !defined(_MSC_VER) && !defined(WIN32)
#define ATTR_ALIGNED(v) __attribute__((aligned(v)))
#else
#define ATTR_ALIGNED(v) __declspec(align(v))
#endif
// vim: syntax=cpp.doxygen // vim: syntax=cpp.doxygen
...@@ -215,9 +215,9 @@ struct TensorLayout : public TensorShape { ...@@ -215,9 +215,9 @@ struct TensorLayout : public TensorShape {
DType dtype; DType dtype;
Format format; Format format;
#if MEGDNN_CC_HOST
TensorLayout(); TensorLayout();
#if MEGDNN_CC_HOST
TensorLayout(const TensorLayout& layout) = default; TensorLayout(const TensorLayout& layout) = default;
//! create empty layout with given dtype //! create empty layout with given dtype
......
...@@ -275,19 +275,19 @@ __global__ void kern_fwd_half(__half* dst, const __half* src, ...@@ -275,19 +275,19 @@ __global__ void kern_fwd_half(__half* dst, const __half* src,
#define SET_SW(func, type, sw) \ #define SET_SW(func, type, sw) \
if (param.flt_h == 2 && param.flt_w == 2) { \ if (param.flt_h == 2 && param.flt_w == 2) { \
kern = func<type, 1, 2, 2, sw>; \ f_struct.f = func<type, 1, 2, 2, sw>; \
} else if (param.flt_h == 3 && param.flt_w == 3) { \ } else if (param.flt_h == 3 && param.flt_w == 3) { \
kern = func<type, 1, 3, 3, sw>; \ f_struct.f = func<type, 1, 3, 3, sw>; \
} else if (param.flt_h == 5 && param.flt_w == 5) { \ } else if (param.flt_h == 5 && param.flt_w == 5) { \
kern = func<type, 1, 5, 5, sw>; \ f_struct.f = func<type, 1, 5, 5, sw>; \
} else if (param.flt_h == 7 && param.flt_w == 7) { \ } else if (param.flt_h == 7 && param.flt_w == 7) { \
kern = func<type, 1, 7, 7, sw>; \ f_struct.f = func<type, 1, 7, 7, sw>; \
} else { \ } else { \
kern = func<type, 1, 0, 0, sw>; \ f_struct.f = func<type, 1, 0, 0, sw>; \
} }
#define GET_KERN(func, type) \ #define GET_KERN(func, type) \
void (*kern)(type*, const type*, const type*, Param); \ FixFunction<type> f_struct; \
if (param.chl_mul == 1) { \ if (param.chl_mul == 1) { \
if (param.stride_w == 1) { \ if (param.stride_w == 1) { \
SET_SW(func, type, 1) \ SET_SW(func, type, 1) \
...@@ -295,31 +295,32 @@ __global__ void kern_fwd_half(__half* dst, const __half* src, ...@@ -295,31 +295,32 @@ __global__ void kern_fwd_half(__half* dst, const __half* src,
SET_SW(func, type, 0) \ SET_SW(func, type, 0) \
} \ } \
} else { \ } else { \
kern = func<type, 0, 0, 0, 0>; \ f_struct.f = func<type, 0, 0, 0, 0>; \
} \ } \
return kern; return f_struct;
template <typename T> template <typename T>
void (*get_kern(const Param& param))(T*, const T*, const T*, const Param); struct FixFunction {
void (*f)(T*, const T*, const T*, Param);
};
template <typename T>
FixFunction<T> get_kern(const Param& param);
template <> template <>
void (*get_kern<float>(const Param& param))(float*, const float*, const float*, FixFunction<float> get_kern<float>(const Param& param) {
const Param) {
GET_KERN(kern_fwd_float, float); GET_KERN(kern_fwd_float, float);
} }
#if CUDA_VERSION >= 9000 #if CUDA_VERSION >= 9000
template <> template <>
void (*get_kern<__half>(const Param& param))(__half*, const __half*, FixFunction<__half> get_kern<__half>(const Param& param) {
const __half*, const Param) {
GET_KERN(kern_fwd_half, __half); GET_KERN(kern_fwd_half, __half);
} }
#endif #endif
template <> template <>
void (*get_kern<dt_float16>(const Param& param))(dt_float16*, const dt_float16*, FixFunction<dt_float16> get_kern<dt_float16>(const Param& param) {
const dt_float16*,
const Param) {
GET_KERN(kern_fwd_float, dt_float16); GET_KERN(kern_fwd_float, dt_float16);
} }
...@@ -337,7 +338,7 @@ template <typename T> ...@@ -337,7 +338,7 @@ template <typename T>
void run_fwd(T* dst, const T* src, const T* flt, const Param& param, void run_fwd(T* dst, const T* src, const T* flt, const Param& param,
cudaStream_t stream) { cudaStream_t stream) {
void (*kern)(T*, const T*, const T*, Param); void (*kern)(T*, const T*, const T*, Param);
kern = get_kern<T>(param); kern = get_kern<T>(param).f;
int nr_thread = query_blocksize_for_kernel(kern), int nr_thread = query_blocksize_for_kernel(kern),
nr_out_dimx = param.out_h * param.out_w * param.batch * param.chl_mul; nr_out_dimx = param.out_h * param.out_w * param.batch * param.chl_mul;
......
...@@ -178,25 +178,29 @@ __global__ void kern_bwd_data_hf(__half* src_grad, const __half* dst_grad, ...@@ -178,25 +178,29 @@ __global__ void kern_bwd_data_hf(__half* src_grad, const __half* dst_grad,
__half2 dst2 = {0.0, 0.0}; __half2 dst2 = {0.0, 0.0};
if (static_cast<uint32_t>(ow) < if (static_cast<uint32_t>(ow) <
static_cast<uint32_t>(owmin_y)) { static_cast<uint32_t>(owmin_y)) {
dst2 = {*(pd + ow), 0.0}; dst2.x = *(pd + ow);
dst2.y = 0.0;
sum = fma2(dst2, flt3, sum); sum = fma2(dst2, flt3, sum);
++ow; ++ow;
--fw; --fw;
} }
if (static_cast<uint32_t>(owmax_x) < if (static_cast<uint32_t>(owmax_x) <
static_cast<uint32_t>(owmax)) { static_cast<uint32_t>(owmax)) {
dst2 = {0.0, *(pd + owmax)}; dst2.x = 0.0;
dst2.y = *(pd + owmax);
sum = fma2(dst2, flt0, sum); sum = fma2(dst2, flt0, sum);
} }
if (static_cast<uint32_t>(fw) == 1) { if (static_cast<uint32_t>(fw) == 1) {
dst2 = {*(pd + ow), *(pd + ow)}; dst2.x = *(pd + ow);
dst2.y = *(pd + ow);
sum = fma2(dst2, flt2, sum); sum = fma2(dst2, flt2, sum);
++ow; ++ow;
--fw; --fw;
} }
if (static_cast<uint32_t>(ow) <= if (static_cast<uint32_t>(ow) <=
static_cast<uint32_t>(owmax_x)) { static_cast<uint32_t>(owmax_x)) {
dst2 = {*(pd + ow), *(pd + ow)}; dst2.x = *(pd + ow);
dst2.y = *(pd + ow);
sum = fma2(dst2, flt1, sum); sum = fma2(dst2, flt1, sum);
} }
...@@ -218,18 +222,21 @@ __global__ void kern_bwd_data_hf(__half* src_grad, const __half* dst_grad, ...@@ -218,18 +222,21 @@ __global__ void kern_bwd_data_hf(__half* src_grad, const __half* dst_grad,
__half2 dst2 = {0.0, 0.0}; __half2 dst2 = {0.0, 0.0};
if (static_cast<uint32_t>(ow) < if (static_cast<uint32_t>(ow) <
static_cast<uint32_t>(owmin_y)) { static_cast<uint32_t>(owmin_y)) {
dst2 = {*(pd + ow), 0.0}; dst2.x = *(pd + ow);
dst2.y = 0.0;
sum = fma2(dst2, flt5, sum); sum = fma2(dst2, flt5, sum);
++ow; ++ow;
--fw; --fw;
} }
if (static_cast<uint32_t>(owmax_x) < if (static_cast<uint32_t>(owmax_x) <
static_cast<uint32_t>(owmax)) { static_cast<uint32_t>(owmax)) {
dst2 = {0.0, *(pd + owmax)}; dst2.x = 0.0;
dst2.y = *(pd + owmax);
sum = fma2(dst2, flt0, sum); sum = fma2(dst2, flt0, sum);
} }
if (static_cast<uint32_t>(fw) == 3) { if (static_cast<uint32_t>(fw) == 3) {
dst2 = {*(pd + ow), *(pd + ow)}; dst2.x = *(pd + ow);
dst2.y = *(pd + ow);
sum = fma2(dst2, flt4, sum); sum = fma2(dst2, flt4, sum);
++ow; ++ow;
--fw; --fw;
...@@ -237,7 +244,8 @@ __global__ void kern_bwd_data_hf(__half* src_grad, const __half* dst_grad, ...@@ -237,7 +244,8 @@ __global__ void kern_bwd_data_hf(__half* src_grad, const __half* dst_grad,
if (static_cast<uint32_t>(fw) == 2 && if (static_cast<uint32_t>(fw) == 2 &&
static_cast<uint32_t>(ow) <= static_cast<uint32_t>(ow) <=
static_cast<uint32_t>(owmax_x)) { static_cast<uint32_t>(owmax_x)) {
dst2 = {*(pd + ow), *(pd + ow)}; dst2.x = *(pd + ow);
dst2.y = *(pd + ow);
sum = fma2(dst2, flt3, sum); sum = fma2(dst2, flt3, sum);
++ow; ++ow;
--fw; --fw;
...@@ -245,7 +253,8 @@ __global__ void kern_bwd_data_hf(__half* src_grad, const __half* dst_grad, ...@@ -245,7 +253,8 @@ __global__ void kern_bwd_data_hf(__half* src_grad, const __half* dst_grad,
if (static_cast<uint32_t>(fw) == 1 && if (static_cast<uint32_t>(fw) == 1 &&
static_cast<uint32_t>(ow) <= static_cast<uint32_t>(ow) <=
static_cast<uint32_t>(owmax_x)) { static_cast<uint32_t>(owmax_x)) {
dst2 = {*(pd + ow), *(pd + ow)}; dst2.x = *(pd + ow);
dst2.y = *(pd + ow);
sum = fma2(dst2, flt2, sum); sum = fma2(dst2, flt2, sum);
++ow; ++ow;
--fw; --fw;
...@@ -253,7 +262,8 @@ __global__ void kern_bwd_data_hf(__half* src_grad, const __half* dst_grad, ...@@ -253,7 +262,8 @@ __global__ void kern_bwd_data_hf(__half* src_grad, const __half* dst_grad,
if (static_cast<uint32_t>(fw) == 0 && if (static_cast<uint32_t>(fw) == 0 &&
static_cast<uint32_t>(ow) <= static_cast<uint32_t>(ow) <=
static_cast<uint32_t>(owmax_x)) { static_cast<uint32_t>(owmax_x)) {
dst2 = {*(pd + ow), *(pd + ow)}; dst2.x = *(pd + ow);
dst2.y = *(pd + ow);
sum = fma2(dst2, flt1, sum); sum = fma2(dst2, flt1, sum);
} }
...@@ -270,8 +280,10 @@ __global__ void kern_bwd_data_hf(__half* src_grad, const __half* dst_grad, ...@@ -270,8 +280,10 @@ __global__ void kern_bwd_data_hf(__half* src_grad, const __half* dst_grad,
uint32_t fw = iw - ow + PW; uint32_t fw = iw - ow + PW;
if (static_cast<uint32_t>(ow) <= if (static_cast<uint32_t>(ow) <=
static_cast<uint32_t>(owmax)) { static_cast<uint32_t>(owmax)) {
pd2 = {*(pd + ow), *(pd + ow)}; pd2.x = *(pd + ow);
pf2 = {0.0, 0.0}; pd2.y = *(pd + ow);
pf2.x = 0.0;
pf2.y = 0.0;
if (static_cast<uint32_t>(ow) >= if (static_cast<uint32_t>(ow) >=
static_cast<uint32_t>(owmin_y)) static_cast<uint32_t>(owmin_y))
pf2.y = *(pf + fw + 1); pf2.y = *(pf + fw + 1);
...@@ -427,14 +439,15 @@ __global__ void kern_bwd_data_hf(__half* src_grad, const __half* dst_grad, ...@@ -427,14 +439,15 @@ __global__ void kern_bwd_data_hf(__half* src_grad, const __half* dst_grad,
#define sw param.stride_w #define sw param.stride_w
#define SET_STRIDE(func, type, chl_mul, fh, fw) \ #define SET_STRIDE(func, type, chl_mul, fh, fw) \
if (sh == 1 && sw == 1) { \ if (sh == 1 && sw == 1) { \
kern_ptr = func<type, chl_mul, fh, fw, 1, 1>; \ f_struct.f = func<type, chl_mul, fh, fw, 1, 1>; \
} else if (sh == 2 && sw == 2) { \ } else if (sh == 2 && sw == 2) { \
kern_ptr = func<type, chl_mul, fh, fw, 2, 2>; \ f_struct.f = func<type, chl_mul, fh, fw, 2, 2>; \
} else { \ } else { \
kern_ptr = func<type, chl_mul, fh, fw, 0, 0>; \ f_struct.f = func<type, chl_mul, fh, fw, 0, 0>; \
} }
#define GET_KERN(func, type) \ #define GET_KERN(func, type) \
FixFunction<type> f_struct; \
if (param.chl_mul == 1) { \ if (param.chl_mul == 1) { \
if (param.flt_h == 3 && param.flt_w == 3) { \ if (param.flt_h == 3 && param.flt_w == 3) { \
SET_STRIDE(func, type, 1, 3, 3); \ SET_STRIDE(func, type, 1, 3, 3); \
...@@ -447,36 +460,32 @@ __global__ void kern_bwd_data_hf(__half* src_grad, const __half* dst_grad, ...@@ -447,36 +460,32 @@ __global__ void kern_bwd_data_hf(__half* src_grad, const __half* dst_grad,
} \ } \
} else { \ } else { \
SET_STRIDE(func, type, 0, 0, 0); \ SET_STRIDE(func, type, 0, 0, 0); \
} } \
return f_struct;
// Wrapper around a kernel function pointer of type
// void (*)(T*, const T*, const T*, const Param). get_kern<T>() returns this
// struct by value and callers read the pointer from the member `f`.
// NOTE(review): presumably introduced so get_kern no longer has to be declared
// with the "function returning function pointer" declarator for the Windows
// CUDA build -- confirm.
template <typename T>
struct FixFunction {
// Selected kernel; assigned by the GET_KERN / SET_STRIDE macros.
void (*f)(T*, const T*, const T*, const Param);
};
template <typename T> template <typename T>
void (*get_kern(const Param& param))(T*, const T*, const T*, const Param); FixFunction<T> get_kern(const Param& param);
template <> template <>
void (*get_kern<float>(const Param& param))(float*, const float*, const float*, FixFunction<float> get_kern<float>(const Param& param) {
const Param) {
void (*kern_ptr)(float*, const float*, const float*, Param);
GET_KERN(kern_bwd_data_float, float); GET_KERN(kern_bwd_data_float, float);
return kern_ptr;
} }
#if CUDA_VERSION >= 9000 #if CUDA_VERSION >= 9000
template <> template <>
void (*get_kern<__half>(const Param& param))(__half*, const __half*, FixFunction<__half> get_kern<__half>(const Param& param) {
const __half*, const Param) {
void (*kern_ptr)(__half*, const __half*, const __half*, Param);
GET_KERN(kern_bwd_data_hf, __half); GET_KERN(kern_bwd_data_hf, __half);
return kern_ptr;
} }
#endif #endif
template <> template <>
void (*get_kern<dt_float16>(const Param& param))(dt_float16*, const dt_float16*, FixFunction<dt_float16> get_kern<dt_float16>(const Param& param) {
const dt_float16*,
const Param) {
void (*kern_ptr)(dt_float16*, const dt_float16*, const dt_float16*, Param);
GET_KERN(kern_bwd_data_float, dt_float16); GET_KERN(kern_bwd_data_float, dt_float16);
return kern_ptr;
} }
#undef sh #undef sh
...@@ -494,7 +503,7 @@ template <typename T> ...@@ -494,7 +503,7 @@ template <typename T>
void run_bwd_data(T* src_grad, const T* dst_grad, const T* flt, void run_bwd_data(T* src_grad, const T* dst_grad, const T* flt,
const Param& param, cudaStream_t stream) { const Param& param, cudaStream_t stream) {
void (*kern)(T*, const T*, const T*, Param); void (*kern)(T*, const T*, const T*, Param);
kern = get_kern<T>(param); kern = get_kern<T>(param).f;
int nr_thread = query_blocksize_for_kernel(kern), int nr_thread = query_blocksize_for_kernel(kern),
nr_out_dimx = param.src_h * param.src_w * param.batch; nr_out_dimx = param.src_h * param.src_w * param.batch;
......
...@@ -193,7 +193,8 @@ __global__ void kern_bwd_filter_hf( ...@@ -193,7 +193,8 @@ __global__ void kern_bwd_filter_hf(
return; return;
} }
sum2 = {0.0, 0.0}; sum2.x = 0.0;
sum2.y = 0.0;
__half2 src2{0.0, 0.0}; __half2 src2{0.0, 0.0};
__half2 dst2{0.0, 0.0}; __half2 dst2{0.0, 0.0};
...@@ -331,51 +332,74 @@ __global__ void kern_bwd_filter_hf( ...@@ -331,51 +332,74 @@ __global__ void kern_bwd_filter_hf(
#endif #endif
#define GET_KERN(func, type) \ #define GET_KERN(func, type) \
switch(_p) { \ FixFunction<type> f_struct; \
case 1<<10: kern_ptr = func<type, 1<<10>; break; \ switch (_p) { \
case 1<<9: kern_ptr = func<type, 1<<9>; break; \ case 1 << 10: \
case 1<<8: kern_ptr = func<type, 1<<8>; break; \ f_struct.f = func<type, 1 << 10>; \
case 1<<7: kern_ptr = func<type, 1<<7>; break; \ break; \
case 1<<6: kern_ptr = func<type, 1<<6>; break; \ case 1 << 9: \
case 1<<5: kern_ptr = func<type, 1<<5>; break; \ f_struct.f = func<type, 1 << 9>; \
case 1<<4: kern_ptr = func<type, 1<<4>; break; \ break; \
case 1<<3: kern_ptr = func<type, 1<<3>; break; \ case 1 << 8: \
case 1<<2: kern_ptr = func<type, 1<<2>; break; \ f_struct.f = func<type, 1 << 8>; \
case 1<<1: kern_ptr = func<type, 1<<1>; break; \ break; \
case 1<<0: kern_ptr = func<type, 1<<0>; break; \ case 1 << 7: \
} f_struct.f = func<type, 1 << 7>; \
break; \
case 1 << 6: \
f_struct.f = func<type, 1 << 6>; \
break; \
case 1 << 5: \
f_struct.f = func<type, 1 << 5>; \
break; \
case 1 << 4: \
f_struct.f = func<type, 1 << 4>; \
break; \
case 1 << 3: \
f_struct.f = func<type, 1 << 3>; \
break; \
case 1 << 2: \
f_struct.f = func<type, 1 << 2>; \
break; \
case 1 << 1: \
f_struct.f = func<type, 1 << 1>; \
break; \
case 1 << 0: \
f_struct.f = func<type, 1 << 0>; \
break; \
default: \
megdnn_assert(false, "DO NOT IMP CASE FUNCTION!!"); \
} \
return f_struct;
template <typename T> template <typename T>
void (*get_kern(const uint32_t& _p))(T*, const T*, const T*, Param); struct FixFunction {
void (*f)(T*, const T*, const T*, Param);
};
template <typename T>
FixFunction<T> get_kern(const uint32_t& _p);
template <> template <>
void (*get_kern<float>(const uint32_t& _p))(float*, const float*, const float*, Param) { FixFunction<float> get_kern<float>(const uint32_t& _p) {
void (*kern_ptr)(float*, const float*, const float*, Param) = NULL;
GET_KERN(kern_bwd_filter_float, float); GET_KERN(kern_bwd_filter_float, float);
return kern_ptr;
} }
#if CUDA_VERSION >= 9000 #if CUDA_VERSION >= 9000
template <> template <>
void (*get_kern<__half>(const uint32_t& _p))(__half*, const __half*, const __half*, Param) { FixFunction<__half> get_kern<__half>(const uint32_t& _p) {
void (*kern_ptr)(__half*, const __half*, const __half*, Param) = NULL;
GET_KERN(kern_bwd_filter_hf, __half); GET_KERN(kern_bwd_filter_hf, __half);
return kern_ptr;
} }
#endif #endif
template <> template <>
void (*get_kern<dt_float16>(const uint32_t& _p))(dt_float16*, const dt_float16*, FixFunction<dt_float16> get_kern<dt_float16>(const uint32_t& _p) {
const dt_float16*, Param) {
void (*kern_ptr)(dt_float16*, const dt_float16*, const dt_float16*, Param) = NULL;
GET_KERN(kern_bwd_filter_float, dt_float16); GET_KERN(kern_bwd_filter_float, dt_float16);
return kern_ptr;
} }
#undef GET_KERN #undef GET_KERN
} // anonymous namespace } // anonymous namespace
namespace megdnn { namespace megdnn {
namespace cuda { namespace cuda {
namespace convolution { namespace convolution {
...@@ -385,7 +409,7 @@ void run_bwd_filter(T *filter_grad, const T *src, const T *dst_grad, ...@@ -385,7 +409,7 @@ void run_bwd_filter(T *filter_grad, const T *src, const T *dst_grad,
const Param &param, cudaStream_t stream) { const Param &param, cudaStream_t stream) {
void (*kern)(T*, const T*, const T*, Param) = NULL; void (*kern)(T*, const T*, const T*, Param) = NULL;
uint32_t uint32_t
nr_thread = query_blocksize_for_kernel(get_kern<T>(1024)), nr_thread = query_blocksize_for_kernel(get_kern<T>(1024).f),
nr_thpf = std::min(nr_thread, nr_thpf = std::min(nr_thread,
std::max<uint32_t>( std::max<uint32_t>(
1, 1,
...@@ -395,7 +419,7 @@ void run_bwd_filter(T *filter_grad, const T *src, const T *dst_grad, ...@@ -395,7 +419,7 @@ void run_bwd_filter(T *filter_grad, const T *src, const T *dst_grad,
do { do {
#define CK(_n) \ #define CK(_n) \
if (nr_thpf >= _n) { \ if (nr_thpf >= _n) { \
kern = get_kern<T>(_n); \ kern = get_kern<T>(_n).f; \
nr_thpf = _n; \ nr_thpf = _n; \
break; \ break; \
} }
......
...@@ -155,7 +155,7 @@ struct BlockTileIteratorBasic { ...@@ -155,7 +155,7 @@ struct BlockTileIteratorBasic {
filter_gl2sh_visitor.copy(); filter_gl2sh_visitor.copy();
} }
consumer.template consume_block(src_gl2sh_visitor, consumer.consume_block(src_gl2sh_visitor,
filter_gl2sh_visitor); filter_gl2sh_visitor);
if (!(ci_outer == ci_blks - 1 && h == h_end && if (!(ci_outer == ci_blks - 1 && h == h_end &&
......
...@@ -171,7 +171,7 @@ struct BlockTileIterator_COxHW { ...@@ -171,7 +171,7 @@ struct BlockTileIterator_COxHW {
filter_gl2sh_visitor.copy(); filter_gl2sh_visitor.copy();
} }
consumer.template consume_block(src_gl2sh_visitor, consumer.consume_block(src_gl2sh_visitor,
filter_gl2sh_visitor); filter_gl2sh_visitor);
if (!(ci_outer == ci_blks - 1 && f == filter_pixels - 1)) { if (!(ci_outer == ci_blks - 1 && f == filter_pixels - 1)) {
......
...@@ -162,7 +162,7 @@ struct BlockTileIteratorUnrollWidth { ...@@ -162,7 +162,7 @@ struct BlockTileIteratorUnrollWidth {
filter_gl2sh_visitor.copy(); filter_gl2sh_visitor.copy();
} }
consumer.template consume_block(src_gl2sh_visitor, consumer.consume_block(src_gl2sh_visitor,
filter_gl2sh_visitor); filter_gl2sh_visitor);
if (!(ci_outer == ci_blks - 1 && h == h_end && if (!(ci_outer == ci_blks - 1 && h == h_end &&
......
...@@ -154,7 +154,7 @@ struct BlockTileIteratorUnrollWidthV2 { ...@@ -154,7 +154,7 @@ struct BlockTileIteratorUnrollWidthV2 {
filter_gl2sh_visitor.copy(); filter_gl2sh_visitor.copy();
} }
consumer.template consume_block(src_gl2sh_visitor, consumer.consume_block(src_gl2sh_visitor,
filter_gl2sh_visitor); filter_gl2sh_visitor);
if (!(ci_outer == ci_blks - 1 && h == h_end)) { if (!(ci_outer == ci_blks - 1 && h == h_end)) {
......
...@@ -72,7 +72,7 @@ __global__ void convolution_kernel( ...@@ -72,7 +72,7 @@ __global__ void convolution_kernel(
DataGlobal2ShareMemVisitor src_gl2sh_visitor{smem_src}; DataGlobal2ShareMemVisitor src_gl2sh_visitor{smem_src};
FilterGlobal2ShareMemVisitor filter_gl2sh_visitor{smem_filter}; FilterGlobal2ShareMemVisitor filter_gl2sh_visitor{smem_filter};
if (check_bounds) { if (check_bounds) {
block_iterator.template set_remain(src_gl2sh_visitor, block_iterator.set_remain(src_gl2sh_visitor,
filter_gl2sh_visitor); filter_gl2sh_visitor);
} }
...@@ -89,7 +89,7 @@ __global__ void convolution_kernel( ...@@ -89,7 +89,7 @@ __global__ void convolution_kernel(
GlobalMemoryWriter global_memory_writer; GlobalMemoryWriter global_memory_writer;
global_memory_writer.init(smem_dst, alpha, beta); global_memory_writer.init(smem_dst, alpha, beta);
if (check_bounds) { if (check_bounds) {
block_iterator.template set_remain(global_memory_writer); block_iterator.set_remain(global_memory_writer);
} }
bias.move(block_iterator.block_batch, block_iterator.block_out_channel, bias.move(block_iterator.block_batch, block_iterator.block_out_channel,
block_iterator.block_out_height, block_iterator.block_out_width); block_iterator.block_out_height, block_iterator.block_out_width);
...@@ -130,7 +130,7 @@ __global__ void convolution_kernel_precomp_offset( ...@@ -130,7 +130,7 @@ __global__ void convolution_kernel_precomp_offset(
DataGlobal2ShareMemVisitor src_gl2sh_visitor{smem_src, offset}; DataGlobal2ShareMemVisitor src_gl2sh_visitor{smem_src, offset};
FilterGlobal2ShareMemVisitor filter_gl2sh_visitor{smem_filter}; FilterGlobal2ShareMemVisitor filter_gl2sh_visitor{smem_filter};
if (check_bounds) { if (check_bounds) {
block_iterator.template set_remain(src_gl2sh_visitor, block_iterator.set_remain(src_gl2sh_visitor,
filter_gl2sh_visitor); filter_gl2sh_visitor);
} }
...@@ -147,7 +147,7 @@ __global__ void convolution_kernel_precomp_offset( ...@@ -147,7 +147,7 @@ __global__ void convolution_kernel_precomp_offset(
GlobalMemoryWriter global_memory_writer; GlobalMemoryWriter global_memory_writer;
global_memory_writer.init(smem_dst, alpha, beta); global_memory_writer.init(smem_dst, alpha, beta);
if (check_bounds) { if (check_bounds) {
block_iterator.template set_remain(global_memory_writer); block_iterator.set_remain(global_memory_writer);
} }
bias.move(block_iterator.block_batch, block_iterator.block_out_channel, bias.move(block_iterator.block_batch, block_iterator.block_out_channel,
block_iterator.block_out_height, block_iterator.block_out_width); block_iterator.block_out_height, block_iterator.block_out_width);
......
...@@ -259,8 +259,8 @@ void DeformablePSROIPoolForward(const TensorND& data, const TensorND& rois, ...@@ -259,8 +259,8 @@ void DeformablePSROIPoolForward(const TensorND& data, const TensorND& rois,
auto&& out_data_elems = out_data.layout.total_nr_elems(); auto&& out_data_elems = out_data.layout.total_nr_elems();
auto&& out_count_elems = out_count.layout.total_nr_elems(); auto&& out_count_elems = out_count.layout.total_nr_elems();
size_t out_data_bytes = sizeof(float[out_data_elems]); size_t out_data_bytes = sizeof(float) * out_data_elems;
size_t out_count_bytes = sizeof(float[out_count_elems]); size_t out_count_bytes = sizeof(float) * out_count_elems;
cudaMemsetAsync(out_data_ptr, 0, out_data_bytes, p.stream); cudaMemsetAsync(out_data_ptr, 0, out_data_bytes, p.stream);
cudaMemsetAsync(out_count_ptr, 0, out_count_bytes, p.stream); cudaMemsetAsync(out_count_ptr, 0, out_count_bytes, p.stream);
...@@ -292,8 +292,8 @@ void DeformablePSROIPoolBackwardAcc(const TensorND& data, const TensorND& rois, ...@@ -292,8 +292,8 @@ void DeformablePSROIPoolBackwardAcc(const TensorND& data, const TensorND& rois,
auto&& data_diff_elems = data_diff.layout.total_nr_elems(); auto&& data_diff_elems = data_diff.layout.total_nr_elems();
auto&& trans_diff_elems = trans_diff.layout.total_nr_elems(); auto&& trans_diff_elems = trans_diff.layout.total_nr_elems();
size_t data_diff_bytes = sizeof(float[data_diff_elems]); size_t data_diff_bytes = sizeof(float) * data_diff_elems;
size_t trans_diff_bytes = sizeof(float[trans_diff_elems]); size_t trans_diff_bytes = sizeof(float) * trans_diff_elems;
cudaMemsetAsync(data_diff_ptr, 0, data_diff_bytes, p.stream); cudaMemsetAsync(data_diff_ptr, 0, data_diff_bytes, p.stream);
cudaMemsetAsync(trans_diff_ptr, 0, trans_diff_bytes, p.stream); cudaMemsetAsync(trans_diff_ptr, 0, trans_diff_bytes, p.stream);
......
...@@ -58,7 +58,7 @@ enum BcastType { ...@@ -58,7 +58,7 @@ enum BcastType {
template <typename ctype> template <typename ctype>
class VectTypeTrait; class VectTypeTrait;
struct __attribute__((aligned(8))) half4 { struct ATTR_ALIGNED(8) half4 {
dt_float16 x, y, z, w; dt_float16 x, y, z, w;
}; };
...@@ -69,7 +69,7 @@ __device__ __forceinline__ half4 make_half4(dt_float16 x, dt_float16 y, ...@@ -69,7 +69,7 @@ __device__ __forceinline__ half4 make_half4(dt_float16 x, dt_float16 y,
return t; return t;
} }
struct __attribute__((aligned(8))) bhalf4 { struct ATTR_ALIGNED(8) bhalf4 {
dt_bfloat16 x, y, z, w; dt_bfloat16 x, y, z, w;
}; };
......
# build support status # build support status
## host build ## host build
* windows build (ok) * windows build (cpu + gpu)
* linux build (ok) * linux build (cpu + gpu)
* macos build (ok) * macos build (cpu only)
## cross build ## cross build
* windows cross build arm-android (ok) * windows cross build arm-android (ok)
* windows cross build arm-linux (ok) * windows cross build arm-linux (ok)
...@@ -17,9 +17,19 @@ ...@@ -17,9 +17,19 @@
### windows host build ### windows host build
``` ```
1: install Visual Studio (need support LLVM/clang-cl), eg 2019 1: install Visual Studio (need support LLVM/clang-cl), eg 2019
clang-cl 9 linker have crash issue, pls install 7/8/10 pls install LLVM-10; the LLVM linker shipped with VS has issues, so pls replace lld-link.exe,
download from https://releases.llvm.org/download.html#10.0.0
2: install extension of VS: python/cmake/LLVM 2: install extension of VS: python/cmake/LLVM
3: CUDA env(if enable CUDA), version detail: project_root_dir/README.md 3: CUDA env(if enable CUDA), version detail: project_root_dir/README.md
4: we currently support cuda10.1+cudnn7.6+TensorRT6.0 on windows; because cudnn/TensorRT
can only be used as dll on windows, please install exactly these versions;
4a: install cuda10.1 to C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1
4b: install cudnn7.6 to C:\Program Files\NVIDIA GPU Computing Toolkit\cudnn-10.1-windows10-x64-v7.6.5.32
4c: install TensorRT6.0 to C:\Program Files\NVIDIA GPU Computing Toolkit\TensorRT-6.0.1.5
4d: add C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1\bin to system Path env
4e: add C:\Program Files\NVIDIA GPU Computing Toolkit\cudnn-10.1-windows10-x64-v7.6.5.32\cuda\bin to system Path env
4f: add C:\Program Files\NVIDIA GPU Computing Toolkit\TensorRT-6.0.1.5\lib to system Path env
if you skip 4d/4e/4f, the CUDA runtime cannot find the required dll files
``` ```
### linux host build ### linux host build
``` ```
......
...@@ -162,8 +162,22 @@ function prepare_env_for_windows_build() { ...@@ -162,8 +162,22 @@ function prepare_env_for_windows_build() {
windows_env_err windows_env_err
fi fi
export PATH=$VS_PATH/VC/Auxiliary/Build:$PATH
echo "put vcvarsall.bat path to PATH env.." echo "put vcvarsall.bat path to PATH env.."
export PATH=$VS_PATH/VC/Auxiliary/Build:$PATH
echo "config cuda/cudnn/TensorRT env..."
export NIVIDA_INSTALL_PRE=/c/Program\ Files/NVIDIA\ GPU\ Computing\ Toolkit
export CUDA_V=v10.1
export CUDNN_V=cudnn-10.1-windows10-x64-v7.6.5.32
export TRT_V=TensorRT-6.0.1.5
export CUDA_PATH=$NIVIDA_INSTALL_PRE/CUDA/${CUDA_V}
export PATH=$PATH:$CUDA_PATH/bin
export CUDA_BIN_PATH=$CUDA_PATH
export PC_CUDNN_INCLUDE_DIRS=$NIVIDA_INSTALL_PRE/${CUDNN_V}/cuda/include
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$NIVIDA_INSTALL_PRE/${TRT_V}/lib:$NIVIDA_INSTALL_PRE/CUDA/${CUDA_V}/lib/x64:$NIVIDA_INSTALL_PRE/${CUDNN_V}/cuda/lib/x64
export CPATH=$CPATH:$NIVIDA_INSTALL_PRE/${TRT_V}/include:$NIVIDA_INSTALL_PRE/CUDA/${CUDA_V}/include:$NIVIDA_INSTALL_PRE/CUDA/${CUDA_V}/include/nvtx3:$PC_CUDNN_INCLUDE_DIRS
export LIBRARY_PATH=$LIBRARY_PATH:$LD_LIBRARY_PATH
export INCLUDE=$INCLUDE:$CPATH
} }
WINDOWS_BUILD_TARGET="Ninja all > build.log" WINDOWS_BUILD_TARGET="Ninja all > build.log"
......
# python whl package build support status # python whl package build support status
* windows build (ok,cpu only) * windows build (cpu + gpu)
* linux build (ok, cpu or gpu) * linux build (cpu + gpu)
* macos build (ok,cpu only) * macos build (cpu only)
# build env prepare # build env prepare
## linux ## linux
......
...@@ -52,8 +52,10 @@ foreach (INCPATH IN LISTS MGB_INC) ...@@ -52,8 +52,10 @@ foreach (INCPATH IN LISTS MGB_INC)
endforeach() endforeach()
if(MGE_WITH_CUDA) if(MGE_WITH_CUDA)
if(NOT WIN32 AND NOT MSVC)
target_compile_options(megbrain PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-Wno-unused-parameter>" target_compile_options(megbrain PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-Wno-unused-parameter>"
"$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:-Wno-unused-parameter>") "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:-Wno-unused-parameter>")
endif()
else() else()
target_compile_options(megbrain PRIVATE "-Wno-unused-parameter") target_compile_options(megbrain PRIVATE "-Wno-unused-parameter")
endif() endif()
......
...@@ -195,12 +195,14 @@ public: ...@@ -195,12 +195,14 @@ public:
static void set_flag(int f) { flag() = f; } static void set_flag(int f) { flag() = f; }
static void init() { static void init() {
#if !defined(WIN32)
int err = pthread_atfork(&CudaCheckOnFork::atfork_prepare, nullptr, int err = pthread_atfork(&CudaCheckOnFork::atfork_prepare, nullptr,
nullptr); nullptr);
if (err) { if (err) {
mgb_throw(SystemError, "failed to setup atfork handler: %s", mgb_throw(SystemError, "failed to setup atfork handler: %s",
strerror(err)); strerror(err));
} }
#endif
} }
}; };
#endif #endif
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册