From 211ef78c1e8929276b831a163789f675d376c56c Mon Sep 17 00:00:00 2001 From: T8T9 <62922815+T8T9@users.noreply.github.com> Date: Fri, 5 Jun 2020 11:13:04 +0800 Subject: [PATCH] Builtin cuda (#24904) * support CUDA using cmake built-in way (#24395) * support CUDA using cmake built-in way. test=develop * test=develop * cmake_minimum_required 3.10 * test=develop --- CMakeLists.txt | 17 +++++++-- cmake/configure.cmake | 8 ++-- cmake/cuda.cmake | 87 ++++++++++++++++++------------------------- cmake/flags.cmake | 10 +++-- cmake/generic.cmake | 16 ++++++-- cmake/init.cmake | 29 +++++++++++++++ 6 files changed, 103 insertions(+), 64 deletions(-) create mode 100644 cmake/init.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index eed1ee57a7..f4de578af6 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License -cmake_minimum_required(VERSION 3.0) +cmake_minimum_required(VERSION 3.10) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) @@ -20,6 +20,19 @@ set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) include(system) project(paddle CXX C) + +include(init) + +# enable language CUDA +# TODO(Shibo Tao): remove find_package(CUDA) completely. 
+find_package(CUDA QUIET) +option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) +if(WITH_GPU AND NOT APPLE) + enable_language(CUDA) + message(STATUS "CUDA compiler: ${CMAKE_CUDA_COMPILER}, version: " + "${CMAKE_CUDA_COMPILER_ID} ${CMAKE_CUDA_COMPILER_VERSION}") +endif() + message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: " "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}") message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " @@ -52,14 +65,12 @@ else(WIN32) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations") endif(WIN32) -find_package(CUDA QUIET) find_package(Git REQUIRED) find_package(Threads REQUIRED) include(simd) ################################ Exposed Configurations ####################################### -option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 51a267074d..1814656d24 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -77,7 +77,7 @@ if(WITH_GPU) FIND_PACKAGE(CUDA REQUIRED) - if(${CUDA_VERSION_MAJOR} VERSION_LESS 7) + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 7) message(FATAL_ERROR "Paddle needs CUDA >= 7.0 to compile") endif() @@ -90,7 +90,7 @@ if(WITH_GPU) else() message(STATUS "Cannot find CUPTI, GPU Profiling is incorrect.") endif() - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=\"${SIMD_FLAG}\"") # Include cuda and cudnn include_directories(${CUDNN_INCLUDE_DIR}) @@ -98,11 +98,11 @@ if(WITH_GPU) if(TENSORRT_FOUND) if(WIN32) - if(${CUDA_VERSION_MAJOR} VERSION_LESS 9) + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 9) message(FATAL_ERROR "TensorRT needs CUDA >= 9.0 to 
compile on Windows") endif() else() - if(${CUDA_VERSION_MAJOR} VERSION_LESS 8) + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 8) message(FATAL_ERROR "TensorRT needs CUDA >= 8.0 to compile") endif() if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 4a93e2f26e..82548b8c51 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -104,12 +104,12 @@ function(select_nvcc_arch_flags out_variable) elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal") set(cuda_arch_bin "60 61") elseif(${CUDA_ARCH_NAME} STREQUAL "Volta") - if (NOT ${CUDA_VERSION} LESS 10.0) + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 10.0) add_definitions("-DSUPPORTS_CUDA_FP16") endif() set(cuda_arch_bin "70") elseif(${CUDA_ARCH_NAME} STREQUAL "Turing") - if (NOT ${CUDA_VERSION} LESS 10.0) + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 10.0) add_definitions("-DSUPPORTS_CUDA_FP16") endif() set(cuda_arch_bin "75") @@ -142,19 +142,19 @@ function(select_nvcc_arch_flags out_variable) foreach(arch ${cuda_arch_bin}) if(arch MATCHES "([0-9]+)\\(([0-9]+)\\)") # User explicitly specified PTX for the concrete BIN - list(APPEND nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1}) - list(APPEND nvcc_archs_readable sm_${CMAKE_MATCH_1}) + string(APPEND nvcc_flags " -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1}") + string(APPEND nvcc_archs_readable " sm_${CMAKE_MATCH_1}") else() # User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN - list(APPEND nvcc_flags -gencode arch=compute_${arch},code=sm_${arch}) - list(APPEND nvcc_archs_readable sm_${arch}) + string(APPEND nvcc_flags " -gencode arch=compute_${arch},code=sm_${arch}") + string(APPEND nvcc_archs_readable " sm_${arch}") endif() endforeach() # Tell NVCC to add PTX intermediate code for the specified architectures foreach(arch ${cuda_arch_ptx}) - list(APPEND nvcc_flags -gencode arch=compute_${arch},code=compute_${arch}) - list(APPEND nvcc_archs_readable compute_${arch}) + 
string(APPEND nvcc_flags " -gencode arch=compute_${arch},code=compute_${arch}") + string(APPEND nvcc_archs_readable " compute_${arch}") endforeach() string(REPLACE ";" " " nvcc_archs_readable "${nvcc_archs_readable}") @@ -162,32 +162,32 @@ function(select_nvcc_arch_flags out_variable) set(${out_variable}_readable ${nvcc_archs_readable} PARENT_SCOPE) endfunction() -message(STATUS "CUDA detected: " ${CUDA_VERSION}) -if (${CUDA_VERSION} LESS 7.0) +message(STATUS "CUDA detected: " ${CMAKE_CUDA_COMPILER_VERSION}) +if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 7.0) set(paddle_known_gpu_archs ${paddle_known_gpu_archs}) -elseif (${CUDA_VERSION} LESS 8.0) # CUDA 7.x +elseif (${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 8.0) # CUDA 7.x set(paddle_known_gpu_archs ${paddle_known_gpu_archs7}) - list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") - list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") -elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__") +elseif (${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 9.0) # CUDA 8.x set(paddle_known_gpu_archs ${paddle_known_gpu_archs8}) - list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") - list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__") # CUDA 8 may complain that sm_20 is no longer supported. Suppress the # warning for now. 
- list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets") -elseif (${CUDA_VERSION} LESS 10.0) # CUDA 9.x + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets") +elseif (${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 10.0) # CUDA 9.x set(paddle_known_gpu_archs ${paddle_known_gpu_archs9}) - list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") - list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") -elseif (${CUDA_VERSION} LESS 11.0) # CUDA 10.x + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__") +elseif (${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 11.0) # CUDA 10.x set(paddle_known_gpu_archs ${paddle_known_gpu_archs10}) - list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") - list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__") endif() -add_definitions("-DPADDLE_CUDA_BINVER=\"${CUDA_VERSION_MAJOR}${CUDA_VERSION_MINOR}\"") -include_directories(${CUDA_INCLUDE_DIRS}) +message(STATUS "PADDLE_CUDA_BINVER=${CUDA_VERSION_MAJOR}${CUDA_VERSION_MINOR}") + if(NOT WITH_DSO) if(WIN32) set_property(GLOBAL PROPERTY CUDA_MODULES ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${CUDA_cusolver_LIBRARY}) @@ -196,37 +196,24 @@ endif(NOT WITH_DSO) # setting nvcc arch flags select_nvcc_arch_flags(NVCC_FLAGS_EXTRA) -list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA}) -message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA_readable}") +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${NVCC_FLAGS_EXTRA}") +message(STATUS "NVCC_FLAGS_EXTRA: ${NVCC_FLAGS_EXTRA}") # Set C++11 support set(CUDA_PROPAGATE_HOST_FLAGS OFF) - # Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc. # So, don't set these flags here. -if (NOT WIN32) # windows msvc2015 support c++11 natively. - # -std=c++11 -fPIC not recoginize by msvc, -Xcompiler will be added by cmake. 
- list(APPEND CUDA_NVCC_FLAGS "-std=c++11") - list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC") +if (NOT WIN32) # windows msvc2015 support c++11 natively. + # -std=c++11 -fPIC not recognized by msvc, -Xcompiler will be added by cmake. + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -std=c++11") endif(NOT WIN32) -# in cuda9, suppress cuda warning on eigen -list(APPEND CUDA_NVCC_FLAGS "-w") +# in cuda9, suppress cuda warning on eigen +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -w") # Set :expt-relaxed-constexpr to suppress Eigen warnings -list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr") +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") -if (NOT WIN32) - if(CMAKE_BUILD_TYPE STREQUAL "Debug") - list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) - elseif(CMAKE_BUILD_TYPE STREQUAL "Release") - list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) - elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") - list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}) - elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") - # nvcc 9 does not support -Os. Use Release flags instead - list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) - endif() -else(NOT WIN32) +if (WIN32) list(APPEND CUDA_NVCC_FLAGS "-Xcompiler \"/wd 4244 /wd 4267 /wd 4819\"") list(APPEND CUDA_NVCC_FLAGS "--compiler-options;/bigobj") if(MSVC_STATIC_CRT) @@ -241,9 +228,9 @@ else(NOT WIN32) elseif(CMAKE_BUILD_TYPE STREQUAL "Release") list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG") else() - message(FATAL "Windows only support Release or Debug build now. Please set visual studio build type to Release/Debug, x64 build.") -endif() -endif(NOT WIN32) + message(FATAL_ERROR "Windows only support Release or Debug build now. 
Please set visual studio build type to Release/Debug, x64 build.") + endif() +endif(WIN32) mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD) mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index f3a0660900..9b04ad913b 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -76,7 +76,7 @@ macro(safe_set_nvflag flag_name) CHECK_C_COMPILER_FLAG(${flag_name} C_COMPILER_SUPPORT_FLAG_${safe_name}) set(safe_name C_COMPILER_SUPPORT_FLAG_${safe_name}) if(${safe_name}) - LIST(APPEND CUDA_NVCC_FLAGS -Xcompiler ${flag_name}) + set(SAFE_GPU_COMMON_FLAGS "${SAFE_GPU_COMMON_FLAGS} -Xcompiler=\"${flag_name}\"") endif() endmacro() @@ -169,7 +169,7 @@ if(NOT APPLE) -Wno-error=nonnull-compare # Warning in boost gcc 8.2 -Wno-error=address # Warning in boost gcc 8.2 -Wno-ignored-qualifiers # Warning in boost gcc 8.2 - -Wno-ignored-attributes # Warning in Eigen gcc 8.3 + -Wno-ignored-attributes # Warning in Eigen gcc 8.3 -Wno-parentheses # Warning in Eigen gcc 8.3 ) endif() @@ -187,7 +187,7 @@ set(GPU_COMMON_FLAGS -Wno-error=unused-function # Warnings in Numpy Header. -Wno-error=array-bounds # Warnings in Eigen::array ) -if (NOT WITH_NV_JETSON) +if (NOT WITH_NV_JETSON) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64") endif() endif(NOT WIN32) @@ -212,10 +212,14 @@ foreach(flag ${COMMON_FLAGS}) safe_set_cxxflag(CMAKE_CXX_FLAGS ${flag}) endforeach() +set(SAFE_GPU_COMMON_FLAGS "") foreach(flag ${GPU_COMMON_FLAGS}) safe_set_nvflag(${flag}) endforeach() +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${SAFE_GPU_COMMON_FLAGS}") + + if(WIN32 AND MSVC_STATIC_CRT) # windows build turn off warnings. 
safe_set_static_flag() diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 7dc79919b7..4be0dcfd69 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -411,10 +411,14 @@ function(nv_library TARGET_NAME) set(multiValueArgs SRCS DEPS) cmake_parse_arguments(nv_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) if(nv_library_SRCS) + # Attention: + # 1. cuda_add_library is deprecated after cmake v3.10, use add_library for CUDA please. + # 2. cuda_add_library does not support ccache. + # Reference: https://cmake.org/cmake/help/v3.10/module/FindCUDA.html if (nv_library_SHARED OR nv_library_shared) # build *.so - cuda_add_library(${TARGET_NAME} SHARED ${nv_library_SRCS}) + add_library(${TARGET_NAME} SHARED ${nv_library_SRCS}) else() - cuda_add_library(${TARGET_NAME} STATIC ${nv_library_SRCS}) + add_library(${TARGET_NAME} STATIC ${nv_library_SRCS}) find_fluid_modules(${TARGET_NAME}) endif() if (nv_library_DEPS) @@ -449,7 +453,7 @@ function(nv_binary TARGET_NAME) set(oneValueArgs "") set(multiValueArgs SRCS DEPS) cmake_parse_arguments(nv_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - cuda_add_executable(${TARGET_NAME} ${nv_binary_SRCS}) + add_executable(${TARGET_NAME} ${nv_binary_SRCS}) if(nv_binary_DEPS) target_link_libraries(${TARGET_NAME} ${nv_binary_DEPS}) add_dependencies(${TARGET_NAME} ${nv_binary_DEPS}) @@ -463,7 +467,11 @@ function(nv_test TARGET_NAME) set(oneValueArgs "") set(multiValueArgs SRCS DEPS) cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS}) + # Attention: + # 1. cuda_add_executable is deprecated after cmake v3.10, use add_executable for CUDA please. + # 2. cuda_add_executable does not support ccache. 
+ # Reference: https://cmake.org/cmake/help/v3.10/module/FindCUDA.html + add_executable(${TARGET_NAME} ${nv_test_SRCS}) get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog ${os_dependency_modules}) add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) diff --git a/cmake/init.cmake b/cmake/init.cmake new file mode 100644 index 0000000000..a33bfdbd41 --- /dev/null +++ b/cmake/init.cmake @@ -0,0 +1,29 @@ +# Attention: cmake will append these flags to compile command automatically. +# So if you want to add global option, change this file rather than flags.cmake + +# default: "-g" +set(CMAKE_C_FLAGS_DEBUG "-g") +# default: "-O3 -DNDEBUG" +set(CMAKE_C_FLAGS_RELEASE "-O3 -DNDEBUG") +# default: "-O2 -g -DNDEBUG" +set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG") +# default: "-Os -DNDEBUG" +set(CMAKE_C_FLAGS_MINSIZEREL "-Os -DNDEBUG") + +# default: "-g" +set(CMAKE_CXX_FLAGS_DEBUG "-g") +# default: "-O3 -DNDEBUG" +set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG") +# default: "-O2 -g -DNDEBUG" +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG") +# default: "-Os -DNDEBUG" +set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG") + +# default: "-g" +set(CMAKE_CUDA_FLAGS_DEBUG "-g") +# default: "-O3 -DNDEBUG" +set(CMAKE_CUDA_FLAGS_RELEASE "-O3 -DNDEBUG") +# default: "-O2 -g -DNDEBUG" +set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG") +# default: "-O1 -DNDEBUG" +set(CMAKE_CUDA_FLAGS_MINSIZEREL "-O1 -DNDEBUG") -- GitLab