未验证 提交 76738504 编写于 作者: Y Y_Xuan 提交者: GitHub

添加rocm平台支持代码 (#29342)

* 添加rocm平台支持代码

* 修改一些问题

* 修改一些歧义并添加备注

* 修改代码格式

* 解决冲突后的代码修改

* 修改operators.cmake

* 修改格式

* 修正错误

* 统一接口

* 修改日期
上级 b96dada4
......@@ -129,7 +129,7 @@ option(WITH_DISTRIBUTE "Compile with distributed support" OFF)
option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF)
option(ON_INFER "Turn on inference optimization and inference-lib generation" OFF)
################################ Internal Configurations #######################################
option(WITH_AMD_GPU "Compile PaddlePaddle with AMD GPU" OFF)
option(WITH_ROCM_PLATFORM "Compile PaddlePaddle with ROCM platform" OFF)
option(WITH_NV_JETSON "Compile PaddlePaddle with NV JETSON" OFF)
option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools" OFF)
option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF)
......@@ -260,10 +260,19 @@ include(configure) # add paddle env configuration
include_directories("${PADDLE_SOURCE_DIR}")
if(WITH_AMD_GPU)
if(NOT DEFINED ENV{ROCM_PATH})
set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to which ROCm has been installed")
set(HIP_PATH ${ROCM_PATH}/hip CACHE PATH "Path to which HIP has been installed")
else()
set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCm has been installed")
set(HIP_PATH ${ROCM_PATH}/hip CACHE PATH "Path to which HIP has been installed")
endif()
set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH})
if(WITH_ROCM_PLATFORM)
find_package(HIP)
include(hip)
endif(WITH_AMD_GPU)
endif(WITH_ROCM_PLATFORM)
if(WITH_ARM)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC")
......
......@@ -121,10 +121,14 @@ if(WITH_GPU)
endif()
include_directories(${TENSORRT_INCLUDE_DIR})
endif()
elseif(WITH_AMD_GPU)
elseif(WITH_ROCM_PLATFORM)
add_definitions(-DPADDLE_WITH_HIP)
add_definitions(-DEIGEN_USE_HIP)
add_definitions(-D__HIP_PLATFORM_HCC__)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP")
set(THRUST_DEVICE_SYSTEM THRUST_DEVICE_SYSTEM_HIP)
else()
add_definitions(-DHPPL_STUB_FUNC)
list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
......
......@@ -28,7 +28,7 @@ endif()
# eigen on cuda9.1 missing header of math_funtions.hpp
# https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen
if(WITH_AMD_GPU)
if(WITH_ROCM_PLATFORM)
set(EIGEN_REPOSITORY ${GIT_URL}/sabreshao/hipeigen.git)
set(EIGEN_TAG 7cb2b6e5a4b4a1efe658abb215cd866c6fb2275e)
endif()
......
......@@ -39,7 +39,6 @@ ExternalProject_Add(
# to be modified without triggering incremental compilation, and the
# third-party library version changes cannot be incorporated.
# reference: https://cmake.org/cmake/help/latest/module/ExternalProject.html
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# rocPRIM ("ROCm Parallel Primitives") is a header-only library providing
# HIP and HC parallel primitives for developing performant GPU-accelerated
# code on the AMD ROCm platform.
#
# Default the hcc toolchain location when the caller did not provide one.
if("x${HCC_HOME}" STREQUAL "x")
  set(HCC_HOME "/opt/rocm/hcc")
endif()

include(ExternalProject)

set(ROCPRIM_SOURCE_DIR  ${THIRD_PARTY_PATH}/rocprim)
set(ROCPRIM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/rocprim)
set(ROCPRIM_INCLUDE_DIR ${ROCPRIM_INSTALL_DIR}/include)

# Download rocPRIM at a pinned revision and install only its headers
# (ONLY_INSTALL=ON, BUILD_TEST=OFF) into the third-party install tree.
ExternalProject_Add(
  extern_rocprim
  ${SHALLOW_CLONE}
  GIT_REPOSITORY "${GIT_URL}/ROCmSoftwarePlatform/rocPRIM.git"
  GIT_TAG        5bd41b96ab8d8343330fb2c3e1b96775bde3b3fc
  PREFIX         ${ROCPRIM_SOURCE_DIR}
  UPDATE_COMMAND ""
  CMAKE_ARGS     -DCMAKE_CXX_COMPILER=${HCC_HOME}/bin/hcc
                 -DONLY_INSTALL=ON
                 -DBUILD_TEST=OFF
                 -DCMAKE_INSTALL_PREFIX=${ROCPRIM_INSTALL_DIR}
  INSTALL_DIR    ${ROCPRIM_INSTALL_DIR}
  ${EXTERNAL_PROJECT_LOG_ARGS}
)

include_directories(${ROCPRIM_INCLUDE_DIR})

# Header-only: expose an INTERFACE target that merely orders the download.
add_library(rocprim INTERFACE)
add_dependencies(rocprim extern_rocprim)
......@@ -155,7 +155,7 @@ set(COMMON_FLAGS
)
if(NOT APPLE)
if(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 8.0)
if((${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 8.0) OR (WITH_ROCM_PLATFORM AND ${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 7.3))
set(COMMON_FLAGS
${COMMON_FLAGS}
-Wno-format-truncation # Warning in boost gcc 8.2
......
......@@ -537,12 +537,13 @@ function(nv_test TARGET_NAME)
endfunction(nv_test)
function(hip_library TARGET_NAME)
if (WITH_AMD_GPU)
if (WITH_ROCM_PLATFORM)
set(options STATIC static SHARED shared)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(hip_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(_sources ${hip_library_SRCS})
set_source_files_properties(${_sources} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
HIP_PREPARE_TARGET_COMMANDS(${TARGET_NAME} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options})
if(_source_files)
list(REMOVE_ITEM _sources ${_source_files})
......@@ -554,7 +555,7 @@ function(hip_library TARGET_NAME)
else()
add_library(${TARGET_NAME} STATIC ${_cmake_options} ${_generated_files} ${_sources})
set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE CXX)
target_link_libraries(${TARGET_NAME} /opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a /opt/rocm/rccl/lib/librccl.so /opt/rocm/hiprand/lib/libhiprand.so)
target_link_libraries(${TARGET_NAME} ${ROCM_PATH}/hip/lib/libhip_hcc.so)
find_fluid_modules(${TARGET_NAME})
endif()
if("${hip_library_DEPS}" MATCHES "ARCHIVE_START")
......@@ -585,12 +586,59 @@ function(hip_library TARGET_NAME)
endif()
endfunction(hip_library)
# hip_library_ops(<target> SRCS <src>... DEPS <dep>... [STATIC|SHARED])
#
# Builds a HIP (ROCm) library from operator sources, mirroring nv_library()
# on the CUDA side: compiles SRCS through HIP_PREPARE_TARGET_COMMANDS,
# creates a STATIC library by default (SHARED on request), links DEPS, and
# collects matching headers for cpplint.  No-op unless WITH_ROCM_PLATFORM.
function(hip_library_ops TARGET_NAME)
  if(WITH_ROCM_PLATFORM)
    set(options STATIC static SHARED shared)
    set(oneValueArgs "")
    set(multiValueArgs SRCS DEPS)
    cmake_parse_arguments(hip_library_ops "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    set(_sources ${hip_library_ops_SRCS})
    # Generate hipcc compile rules; _generated_files receives the generated
    # outputs and _source_files the inputs that were consumed by hipcc.
    HIP_PREPARE_TARGET_COMMANDS(${TARGET_NAME} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options})
    if(_source_files)
      list(REMOVE_ITEM _sources ${_source_files})
    endif()
    if(hip_library_ops_SRCS)
      if(hip_library_ops_SHARED OR hip_library_ops_shared) # build *.so
        add_library(${TARGET_NAME} SHARED ${_cmake_options} ${_generated_files} ${_sources})
        set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
      else()
        add_library(${TARGET_NAME} STATIC ${_cmake_options} ${_generated_files} ${_sources})
        set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE CXX)
        target_link_libraries(${TARGET_NAME} ${ROCM_PATH}/hip/lib/libhip_hcc.so)
        find_fluid_modules(${TARGET_NAME})
      endif()
      if("${hip_library_ops_DEPS}" MATCHES "ARCHIVE_START")
        # Support linking flags: --whole-archive (Linux) / -force_load (MacOS).
        # WARNING: Please don't use ARCHIVE_START&ARCHIVE_END if TARGET_NAME will be linked by other libraries.
        target_circle_link_libraries(${TARGET_NAME} ${hip_library_ops_DEPS})
        list(REMOVE_ITEM hip_library_ops_DEPS ARCHIVE_START ARCHIVE_END)
      else()
        target_link_libraries(${TARGET_NAME} ${hip_library_ops_DEPS})
      endif()
      # cpplint code style: pair every source with its sibling header.
      foreach(source_file ${hip_library_ops_SRCS})
        string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
        if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
          list(APPEND hip_library_ops_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
        endif()
      endforeach()
    else()
      if(hip_library_ops_DEPS)
        # No sources given: merge the dependency archives into one library.
        merge_static_libs(${TARGET_NAME} ${hip_library_ops_DEPS})
      else()
        # Fix: original used message(FATAL ...) -- FATAL is not a message
        # mode, so configuration did not abort; it also named nv_library.
        message(FATAL_ERROR "Please specify source file or library in hip_library_ops.")
      endif()
    endif()
  endif()
endfunction()
function(hip_binary TARGET_NAME)
if (WITH_AMD_GPU)
if (WITH_ROCM_PLATFORM)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(hip_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set_source_files_properties(${_sources} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
hip_add_executable(${TARGET_NAME} ${hip_binary_SRCS})
if(hip_binary_DEPS)
target_link_libraries(${TARGET_NAME} ${hip_binary_DEPS})
......@@ -604,12 +652,13 @@ function(hip_binary TARGET_NAME)
endfunction(hip_binary)
function(hip_test TARGET_NAME)
if (WITH_AMD_GPU AND WITH_TESTING)
if (WITH_ROCM_PLATFORM AND WITH_TESTING)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(hip_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(_sources ${hip_test_SRCS})
set_source_files_properties(${_sources} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
HIP_PREPARE_TARGET_COMMANDS(${TARGET_NAME} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options})
if(_source_files)
list(REMOVE_ITEM _sources ${_source_files})
......
if(NOT WITH_AMD_GPU)
if(NOT WITH_ROCM_PLATFORM)
return()
endif()
include_directories("/opt/rocm/include")
include_directories("/opt/rocm/hip/include")
include_directories("/opt/rocm/miopen/include")
include_directories("/opt/rocm/hipblas/include")
include_directories("/opt/rocm/hiprand/include")
include_directories("/opt/rocm/rocrand/include")
include_directories("/opt/rocm/rccl/include")
include_directories("/opt/rocm/thrust")
include_directories("${ROCM_PATH}/include")
include_directories("${ROCM_PATH}/hip/include")
include_directories("${ROCM_PATH}/miopen/include")
include_directories("${ROCM_PATH}/hipblas/include")
include_directories("${ROCM_PATH}/rocblas/include")
include_directories("${ROCM_PATH}/hiprand/include")
include_directories("${ROCM_PATH}/rocrand/include")
include_directories("${ROCM_PATH}/rccl/include")
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++11" )
include_directories("${ROCM_PATH}/rocthrust/include/")
include_directories("${ROCM_PATH}/hipcub/include/")
include_directories("${ROCM_PATH}/rocprim/include/")
include_directories("${ROCM_PATH}/hipsparse/include/")
include_directories("${ROCM_PATH}/rocsparse/include/")
include_directories("${ROCM_PATH}/rocfft/include/")
set(HIP_CLANG_PARALLEL_BUILD_COMPILE_OPTIONS "")
set(HIP_CLANG_PARALLEL_BUILD_LINK_OPTIONS "")
# now default is clang
set(HIP_COMPILER "clang")
list(APPEND EXTERNAL_LIBS "-L${ROCM_PATH}/lib/ -lhip_hcc")
set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -DEIGEN_USE_HIP -DEIGEN_USE_GPU -D__HIP_NO_HALF_CONVERSIONS__ -std=c++11 --amdgpu-target=gfx906" )
if(WITH_RCCL)
set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -DPADDLE_WITH_RCCL")
endif()
if(NOT WITH_PYTHON)
set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -DPADDLE_NO_PYTHON")
endif(NOT WITH_PYTHON)
if(WITH_DSO)
set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -DPADDLE_USE_DSO")
endif(WITH_DSO)
if(WITH_TESTING)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_TESTING")
set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -DPADDLE_WITH_TESTING")
endif(WITH_TESTING)
if(WITH_DISTRIBUTE)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_DISTRIBUTE")
set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -DPADDLE_WITH_DISTRIBUTE")
endif(WITH_DISTRIBUTE)
if(WITH_GRPC)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_GRPC")
set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -DPADDLE_WITH_GRPC")
endif(WITH_GRPC)
if(WITH_MKLDNN)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_MKLDNN")
set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -DPADDLE_WITH_MKLDNN")
endif(WITH_MKLDNN)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DANY_IMPL_ANY_CAST_MOVEABLE")
set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -DANY_IMPL_ANY_CAST_MOVEABLE")
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
list(APPEND HIP_HIPCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
list(APPEND HIP_HIPCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL})
list(APPEND HIP_HIPCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL})
endif()
if("x${HCC_HOME}" STREQUAL "x")
set(HCC_HOME "/opt/rocm/hcc")
endif()
if("${HIP_COMPILER}" STREQUAL "hcc")
if("x${HCC_HOME}" STREQUAL "x")
set(HCC_HOME "${ROCM_PATH}/hcc")
endif()
set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES> -ldl --amdgpu-target=gfx906 ")
set(CMAKE_HIP_CREATE_SHARED_LIBRARY "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES> -shared --amdgpu-target=gfx906")
set(CMAKE_HIP_CREATE_SHARED_MODULE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES> -shared --amdgpu-target=gfx906")
elseif("${HIP_COMPILER}" STREQUAL "clang")
if("x${HIP_CLANG_PATH}" STREQUAL "x")
set(HIP_CLANG_PATH "${ROCM_PATH}/llvm/bin")
endif()
set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
set(CMAKE_HIP_CREATE_SHARED_LIBRARY "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES> -shared")
set(CMAKE_HIP_CREATE_SHARED_MODULE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES> -shared")
#Number of parallel jobs by default is 1
if(NOT DEFINED HIP_CLANG_NUM_PARALLEL_JOBS)
set(HIP_CLANG_NUM_PARALLEL_JOBS 1)
endif()
#Add support for parallel build and link
if(${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang")
check_cxx_compiler_flag("-parallel-jobs=1" HIP_CLANG_SUPPORTS_PARALLEL_JOBS)
endif()
if(HIP_CLANG_NUM_PARALLEL_JOBS GREATER 1)
if(${HIP_CLANG_SUPPORTS_PARALLEL_JOBS})
set(HIP_CLANG_PARALLEL_BUILD_COMPILE_OPTIONS "-parallel-jobs=${HIP_CLANG_NUM_PARALLEL_JOBS} -Wno-format-nonliteral")
set(HIP_CLANG_PARALLEL_BUILD_LINK_OPTIONS "-parallel-jobs=${HIP_CLANG_NUM_PARALLEL_JOBS}")
else()
message("clang compiler doesn't support parallel jobs")
endif()
endif()
# Set the CMake Flags to use the HIP-Clang Compiler.
set(CMAKE_HIP_CREATE_SHARED_LIBRARY "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HIP_CLANG_PATH} ${HIP_CLANG_PARALLEL_BUILD_LINK_OPTIONS} <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <SONAME_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES> --amdgpu-target=gfx906")
set(CMAKE_HIP_CREATE_SHARED_MODULE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HIP_CLANG_PATH} ${HIP_CLANG_PARALLEL_BUILD_LINK_OPTIONS} <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> <SONAME_FLAG><TARGET_SONAME> -o <TARGET> <LINK_LIBRARIES> -shared --amdgpu-target=gfx906" )
set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HIP_CLANG_PATH} ${HIP_CLANG_PARALLEL_BUILD_LINK_OPTIONS} <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES> -ldl --amdgpu-target=gfx906")
endif()
......@@ -52,10 +52,24 @@ function(op_library TARGET)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu)
list(APPEND cudnn_cu_srcs ${CUDNN_FILE}.cu)
endif()
if(WITH_AMD_GPU)
if(WITH_ROCM_PLATFORM)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu.cc)
list(APPEND hip_cu_cc_srcs ${TARGET}.hip.cu.cc)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu)
list(APPEND hip_cu_srcs ${TARGET}.hip.cu)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.hip.cu)
set(PART_CUDA_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.hip.cu
${PART_CUDA_KERNEL_FILES} PARENT_SCOPE)
list(APPEND hip_cu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.hip.cu)
endif()
string(REPLACE "_op" "_miopen_op" MIOPEN_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MIOPEN_FILE}.hip.cc)
list(APPEND miopen_hip_cc_srcs ${MIOPEN_FILE}.hip.cc)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MIOPEN_FILE}.hip.cu.cc)
list(APPEND miopen_hip_cu_cc_srcs ${MIOPEN_FILE}.hip.cu.cc)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MIOPEN_FILE}.hip.cu)
list(APPEND miopen_hip_cu_srcs ${MIOPEN_FILE}.hip.cu)
endif()
endif()
if(WITH_MKLDNN)
......@@ -72,16 +86,20 @@ function(op_library TARGET)
endif()
else()
foreach(src ${op_library_SRCS})
if (${src} MATCHES ".*\\.hip.cu$")
if (WITH_ROCM_PLATFORM AND ${src} MATCHES ".*\\.hip.cu$")
list(APPEND hip_cu_srcs ${src})
elseif(WITH_ROCM_PLATFORM AND ${src} MATCHES ".*\\.hip.cu.cc$")
list(APPEND hip_cu_cc_srcs ${src})
elseif(${src} MATCHES ".*_cudnn_op.cu$")
list(APPEND cudnn_cu_srcs ${src})
elseif (${src} MATCHES ".*\\.cu$")
list(APPEND cu_srcs ${src})
elseif(${src} MATCHES ".*_cudnn_op.cu.cc$")
list(APPEND cudnn_cu_cc_srcs ${src})
elseif(WITH_AMD_GPU AND ${src} MATCHES ".*_miopen_op.hip.cc$")
elseif(WITH_ROCM_PLATFORM AND ${src} MATCHES ".*_miopen_op.hip.cc$")
list(APPEND miopen_hip_cc_srcs ${src})
elseif(WITH_ROCM_PLATFORM AND ${src} MATCHES ".*_miopen_op.hip.cu$")
list(APPEND miopen_hip_cu_srcs ${src})
elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$")
list(APPEND mkldnn_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cu.cc$")
......@@ -145,8 +163,8 @@ function(op_library TARGET)
nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${cudnn_cu_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
endif()
elseif (WITH_AMD_GPU)
hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cu_srcs} ${miopen_hip_cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
elseif (WITH_ROCM_PLATFORM)
hip_library_ops(${TARGET} SRCS ${cc_srcs} ${hip_cu_cc_srcs} ${hip_cu_srcs} ${miopen_hip_cu_cc_srcs} ${miopen_hip_cu_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
else()
# Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`.
......@@ -237,8 +255,19 @@ function(op_library TARGET)
endif()
# pybind USE_OP_DEVICE_KERNEL for MIOPEN
if (WITH_AMD_GPU AND ${miopen_hip_cc_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MIOPEN);\n")
list(LENGTH miopen_hip_cu_cc_srcs miopen_hip_cu_cc_srcs_len)
if (WITH_ROCM_PLATFORM AND ${miopen_hip_cu_cc_srcs_len} GREATER 0)
if(${TARGET} STREQUAL "activation")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, CUDNN);\n")
else()
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
endif()
endif()
# pybind USE_OP_DEVICE_KERNEL for MIOPEN
list(LENGTH miopen_hip_cu_srcs miopen_hip_cu_srcs_len)
if (WITH_ROCM_PLATFORM AND ${miopen_hip_cu_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
endif()
if (WITH_XPU AND ${xpu_cc_srcs_len} GREATER 0)
......
......@@ -209,11 +209,6 @@ include(external/warpctc) # download, build, install warpctc
list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog extern_boost extern_xxhash)
list(APPEND third_party_deps extern_zlib extern_dlpack extern_warpctc extern_threadpool)
if(WITH_AMD_GPU)
include(external/rocprim) # download, build, install rocprim
list(APPEND third_party_deps extern_rocprim)
endif()
include(cblas) # find first, then download, build, install openblas
if(${CBLAS_PROVIDER} STREQUAL MKLML)
list(APPEND third_party_deps extern_mklml)
......
......@@ -35,8 +35,8 @@ function(math_library TARGET)
list(LENGTH cc_srcs cc_srcs_len)
if (WITH_GPU)
nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
elseif (WITH_AMD_GPU)
hip_library(${TARGET} SRCS ${cc_srcs} ${hip_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
elseif (WITH_ROCM_PLATFORM AND (${hip_srcs} MATCHES ".*\\.hip.cu$"))
hip_library_ops(${TARGET} SRCS ${cc_srcs} ${hip_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
elseif(${cc_srcs_len} GREATER 0)
cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
endif()
......
cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)
list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc cusolver.cc)
#hip
if (WITH_ROCM_PLATFORM)
list(APPEND HIP_SRCS rocblas.cc miopen.cc hiprand.cc)
endif()
# There is no macOS version of NCCL.
# Disable nvrtc and cuda_driver api on MacOS and Windows, and only do a early test on Linux.
......@@ -9,6 +13,12 @@ if (NOT APPLE AND NOT WIN32)
if (WITH_NCCL)
list(APPEND CUDA_SRCS nccl.cc)
endif()
if (WITH_ROCM_PLATFORM)
list(APPEND HIP_SRCS hiprtc.cc rocm_driver.cc)
if (WITH_RCCL)
list(APPEND HIP_SRCS rccl.cc)
endif()
endif()
endif()
if (TENSORRT_FOUND)
......@@ -19,8 +29,13 @@ configure_file(cupti_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/cupti_lib_path.h)
if (CUPTI_FOUND)
list(APPEND CUDA_SRCS cupti.cc)
endif(CUPTI_FOUND)
nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
if(WITH_ROCM_PLATFORM)
hip_library(dynload_cuda SRCS ${HIP_SRCS} DEPS dynamic_loader)
hip_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
else()
nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
endif()
if (WITH_MKLML)
cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml)
endif()
......
......@@ -46,6 +46,24 @@ DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so.");
DEFINE_string(op_dir, "", "Specify path for loading user-defined op library.");
#ifdef PADDLE_WITH_HIP
DEFINE_string(miopen_dir, "",
"Specify path for loading libMIOpen.so. For instance, "
"/opt/rocm/miopen/lib. If empty [default], dlopen "
"will search miopen from LD_LIBRARY_PATH");
DEFINE_string(rocm_dir, "",
"Specify path for loading rocm library, such as librocblas, "
"libcurand, libcusolver. For instance, /opt/rocm/lib. "
"If default, dlopen will search rocm from LD_LIBRARY_PATH");
DEFINE_string(rccl_dir, "",
"Specify path for loading rccl library, such as librccl.so. "
"For instance, /opt/rocm/rccl/lib. If default, "
"dlopen will search rccl from LD_LIBRARY_PATH");
#endif
namespace paddle {
namespace platform {
namespace dynload {
......@@ -246,6 +264,8 @@ void* GetCublasDsoHandle() {
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cublas_lib, true,
{cuda_lib_path});
#elif PADDLE_WITH_HIP
return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocblas.so");
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so");
#endif
......@@ -272,6 +292,8 @@ void* GetCUDNNDsoHandle() {
"CUDNN version.");
return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, win_cudnn_lib, true,
{cuda_lib_path}, win_warn_meg);
#elif PADDLE_WITH_HIP
return GetDsoHandleFromSearchPath(FLAGS_miopen_dir, "libMIOpen.so", false);
#else
return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", false,
{cuda_lib_path});
......@@ -294,6 +316,8 @@ void* GetCurandDsoHandle() {
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_curand_lib, true,
{cuda_lib_path});
#elif PADDLE_WITH_HIP
return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprand.so");
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so");
#endif
......@@ -313,6 +337,8 @@ void* GetCusolverDsoHandle() {
void* GetNVRTCDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.dylib", false);
#elif PADDLE_WITH_HIP
return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprtc.so");
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.so", false);
#endif
......@@ -321,6 +347,8 @@ void* GetNVRTCDsoHandle() {
void* GetCUDADsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.dylib", false);
#elif PADDLE_WITH_HIP
return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhip_hcc.so");
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.so", false);
#endif
......@@ -348,6 +376,8 @@ void* GetNCCLDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib", true, {},
warning_msg);
#elif defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL)
return GetDsoHandleFromSearchPath(FLAGS_rccl_dir, "librccl.so", true);
#else
return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so", true, {},
warning_msg);
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// Definitions for the lazy dynamic-loading wrappers of the hipRAND library
// (ROCm's random-number API); declarations live in hiprand.h.
#include "paddle/fluid/platform/dynload/hiprand.h"
namespace paddle {
namespace platform {
namespace dynload {
// Guard and handle for the lazily dlopen'ed hipRAND shared library.
std::once_flag hiprand_dso_flag;
void *hiprand_dso_handle;
// Instantiate one DynLoad__<name> functor object per hipRAND symbol
// declared via HIPRAND_RAND_ROUTINE_EACH in the header.
#define DEFINE_WRAP(__name) DynLoad__##__name __name
HIPRAND_RAND_ROUTINE_EACH(DEFINE_WRAP);
} // namespace dynload
} // namespace platform
} // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <hiprand.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
namespace paddle {
namespace platform {
namespace dynload {
// Shared once-flag and dlopen handle for hipRAND; defined in hiprand.cc.
extern std::once_flag hiprand_dso_flag;
extern void *hiprand_dso_handle;
// Declares a functor DynLoad__<name> that resolves <name> from the hipRAND
// DSO on first call (via dlsym) and forwards all arguments to it.
// NOTE(review): the macro keeps the CURAND name from its CUDA counterpart;
// under HIP, GetCurandDsoHandle() resolves the hiprand library instead.
#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
hiprandStatus_t operator()(Args... args) { \
using hiprandFunc = decltype(&::__name); \
std::call_once(hiprand_dso_flag, []() { \
hiprand_dso_handle = paddle::platform::dynload::GetCurandDsoHandle(); \
}); \
static void *p_##__name = dlsym(hiprand_dso_handle, #__name); \
return reinterpret_cast<hiprandFunc>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
// The set of hipRAND entry points wrapped by this header.
#define HIPRAND_RAND_ROUTINE_EACH(__macro) \
__macro(hiprandCreateGenerator); \
__macro(hiprandSetStream); \
__macro(hiprandSetPseudoRandomGeneratorSeed); \
__macro(hiprandGenerateUniform); \
__macro(hiprandGenerateUniformDouble); \
__macro(hiprandGenerateNormal); \
__macro(hiprandDestroyGenerator);
HIPRAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP);
} // namespace dynload
} // namespace platform
} // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// Definitions for the lazy dynamic-loading wrappers of hipRTC (HIP's
// runtime compilation API); declarations live in hiprtc.h.
#include "paddle/fluid/platform/dynload/hiprtc.h"
namespace paddle {
namespace platform {
namespace dynload {
// Guard and handle for the lazily dlopen'ed hipRTC shared library.
std::once_flag hiprtc_dso_flag;
void* hiprtc_dso_handle = nullptr;
// Instantiate one DynLoad__<name> functor object per hipRTC symbol
// declared via HIPRTC_ROUTINE_EACH in the header.
#define DEFINE_WRAP(__name) DynLoad__##__name __name
HIPRTC_ROUTINE_EACH(DEFINE_WRAP);
// Returns true when the runtime-compiler DSO could be loaded.  Keeps the
// NVRTC name so CUDA/HIP call sites stay identical.
bool HasNVRTC() {
std::call_once(hiprtc_dso_flag,
[]() { hiprtc_dso_handle = GetNVRTCDsoHandle(); });
return hiprtc_dso_handle != nullptr;
}
} // namespace dynload
} // namespace platform
} // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <hip/hiprtc.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
namespace paddle {
namespace platform {
namespace dynload {
// Shared once-flag and dlopen handle for hipRTC; defined in hiprtc.cc.
extern std::once_flag hiprtc_dso_flag;
extern void* hiprtc_dso_handle;
// True when the runtime-compiler DSO is available (defined in hiprtc.cc).
extern bool HasNVRTC();
// Declares a functor DynLoad__<name> that resolves <name> from the hipRTC
// DSO on first call (via dlsym) and forwards all arguments, preserving the
// wrapped function's return type.  GetNVRTCDsoHandle() resolves the hiprtc
// library when built for HIP.
#define DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using hiprtc_func = decltype(&::__name); \
std::call_once(hiprtc_dso_flag, []() { \
hiprtc_dso_handle = paddle::platform::dynload::GetNVRTCDsoHandle(); \
}); \
static void* p_##__name = dlsym(hiprtc_dso_handle, #__name); \
return reinterpret_cast<hiprtc_func>(p_##__name)(args...); \
} \
}; \
extern struct DynLoad__##__name __name
/**
* include all needed hiprtc functions
**/
#define HIPRTC_ROUTINE_EACH(__macro) \
__macro(hiprtcGetErrorString); \
__macro(hiprtcCompileProgram); \
__macro(hiprtcCreateProgram); \
__macro(hiprtcDestroyProgram); \
__macro(hiprtcGetCode); \
__macro(hiprtcGetCodeSize); \
__macro(hiprtcGetProgramLog); \
__macro(hiprtcGetProgramLogSize)
HIPRTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP);
#undef DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP
} // namespace dynload
} // namespace platform
} // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/dynload/miopen.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag miopen_dso_flag;
void* miopen_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
MIOPEN_DNN_ROUTINE_EACH(DEFINE_WRAP);
MIOPEN_DNN_ROUTINE_EACH_R2(DEFINE_WRAP);
#ifdef MIOPEN_DNN_ROUTINE_EACH_AFTER_R3
MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(DEFINE_WRAP);
#endif
#ifdef MIOPEN_DNN_ROUTINE_EACH_AFTER_R4
MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP);
#endif
#ifdef MIOPEN_DNN_ROUTINE_EACH_R5
MIOPEN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP);
#endif
#ifdef MIOPEN_DNN_ROUTINE_EACH_R6
MIOPEN_DNN_ROUTINE_EACH_R6(DEFINE_WRAP);
#endif
#ifdef MIOPEN_DNN_ROUTINE_EACH_R7
MIOPEN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
#endif
#ifdef MIOPEN_DNN_ROUTINE_EACH_AFTER_R7
MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP);
#endif
bool HasCUDNN() {
std::call_once(miopen_dso_flag,
[]() { miopen_dso_handle = GetCUDNNDsoHandle(); });
return miopen_dso_handle != nullptr;
}
// Aborts (via PADDLE_ENFORCE) with a descriptive PreconditionNotMet error if
// the MIOpen shared library is not loaded before invoking function fn_name.
void EnforceCUDNNLoaded(const char* fn_name) {
  PADDLE_ENFORCE_NOT_NULL(
      miopen_dso_handle,
      platform::errors::PreconditionNotMet(
          "Cannot load miopen shared library. Cannot invoke method %s.",
          fn_name));
}
} // namespace dynload
} // namespace platform
} // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <glog/logging.h>
#include <miopen/miopen.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
namespace paddle {
namespace platform {
namespace dynload {
// Lazy-load state for the MIOpen shared library (defined in miopen.cc).
extern std::once_flag miopen_dso_flag;
extern void* miopen_dso_handle;
// True iff the MIOpen library could be loaded (CUDNN name kept for parity
// with the CUDA build).
extern bool HasCUDNN();
// Translates a miopenStatus_t code into a cuDNN-style status string.
// Any unrecognized code (including miopenStatusUnknownError) maps to
// "MIOPEN_STATUS_UNKNOWN_ERROR".
inline const char* miopenGetErrorString(miopenStatus_t status) {
  if (status == miopenStatusSuccess) return "MIOPEN_STATUS_SUCCESS";
  if (status == miopenStatusNotInitialized)
    return "MIOPEN_STATUS_NOT_INITIALIZED";
  if (status == miopenStatusInvalidValue) return "MIOPEN_STATUS_INVALID_VALUE";
  if (status == miopenStatusBadParm) return "MIOPEN_STATUS_BAD_PARAM";
  if (status == miopenStatusAllocFailed) return "MIOPEN_STATUS_ALLOC_FAILED";
  if (status == miopenStatusInternalError)
    return "MIOPEN_STATUS_INTERNAL_ERROR";
  if (status == miopenStatusNotImplemented)
    return "MIOPEN_STATUS_NOT_IMPLEMENTED";
  return "MIOPEN_STATUS_UNKNOWN_ERROR";
}
// Defined in miopen.cc: aborts when the MIOpen library is not loaded before a
// wrapped function is invoked.
extern void EnforceCUDNNLoaded(const char* fn_name);
// Generates a struct DynLoad__<name> (plus an extern instance named <name>)
// whose operator() lazily loads MIOpen on first use, verifies the load,
// resolves the symbol once via dlsym, and forwards all arguments to it.
#define DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP(__name) \
  struct DynLoad__##__name { \
    template <typename... Args> \
    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
      using miopen_func = decltype(&::__name); \
      std::call_once(miopen_dso_flag, []() { \
        miopen_dso_handle = paddle::platform::dynload::GetCUDNNDsoHandle(); \
      }); \
      EnforceCUDNNLoaded(#__name); \
      static void* p_##__name = dlsym(miopen_dso_handle, #__name); \
      return reinterpret_cast<miopen_func>(p_##__name)(args...); \
    } \
  }; \
  extern struct DynLoad__##__name __name
/**
 * All MIOpen entry points needed by Paddle; each is declared as a
 * dynamically loaded wrapper via DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP below.
 **/
#define MIOPEN_DNN_ROUTINE_EACH(__macro) \
  __macro(miopenSet4dTensorDescriptor); \
  __macro(miopenSetTensorDescriptor); \
  __macro(miopenInitConvolutionNdDescriptor); \
  __macro(miopenFindConvolutionForwardAlgorithm); \
  __macro(miopenGetConvolutionNdForwardOutputDim); \
  __macro(miopenFindConvolutionBackwardDataAlgorithm); \
  __macro(miopenFindConvolutionBackwardWeightsAlgorithm); \
  __macro(miopenGetTensorDescriptor); \
  __macro(miopenCreateTensorDescriptor); \
  __macro(miopenDestroyTensorDescriptor); \
  __macro(miopenSet2dPoolingDescriptor); \
  __macro(miopenGet2dPoolingDescriptor); \
  __macro(miopenGetPoolingNdForwardOutputDim); \
  __macro(miopenCreateConvolutionDescriptor); \
  __macro(miopenCreatePoolingDescriptor); \
  __macro(miopenDestroyPoolingDescriptor); \
  __macro(miopenPoolingGetWorkSpaceSize); \
  __macro(miopenPoolingGetWorkSpaceSizeV2); \
  __macro(miopenSetNdPoolingDescriptor); \
  __macro(miopenInitConvolutionDescriptor); \
  __macro(miopenDestroyConvolutionDescriptor); \
  __macro(miopenGetConvolutionNdDescriptor); \
  __macro(miopenDeriveBNTensorDescriptor); \
  __macro(miopenCreate); \
  __macro(miopenDestroy); \
  __macro(miopenSetStream); \
  __macro(miopenActivationForward); \
  __macro(miopenActivationBackward); \
  __macro(miopenConvolutionBackwardWeights); \
  __macro(miopenConvolutionForward); \
  __macro(miopenConvolutionBackwardBias); \
  __macro(miopenConvolutionForwardGetWorkSpaceSize); \
  __macro(miopenConvolutionBackwardDataGetWorkSpaceSize); \
  __macro(miopenTransformTensor); \
  __macro(miopenPoolingForward); \
  __macro(miopenPoolingBackward); \
  __macro(miopenSoftmaxBackward); \
  __macro(miopenSoftmaxForward); \
  __macro(miopenCreateDropoutDescriptor); \
  __macro(miopenDropoutGetStatesSize); \
  __macro(miopenSetDropoutDescriptor); \
  __macro(miopenCreateRNNDescriptor); \
  __macro(miopenSetRNNDescriptor); \
  __macro(miopenGetRNNParamsSize); \
  __macro(miopenGetRNNWorkspaceSize); \
  __macro(miopenGetRNNTrainingReserveSize); \
  __macro(miopenRNNForwardTraining); \
  __macro(miopenRNNBackwardData); \
  __macro(miopenRNNBackwardWeights); \
  __macro(miopenRNNForwardInference); \
  __macro(miopenDestroyDropoutDescriptor); \
  __macro(miopenDestroyRNNDescriptor);
// Declare the DynLoad__<name> wrapper structs for the base routine set.
MIOPEN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
// APIs available after R2:
#define MIOPEN_DNN_ROUTINE_EACH_R2(__macro) \
  __macro(miopenConvolutionBackwardData);
MIOPEN_DNN_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
// APIs available after R3:
#define MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(__macro) \
  __macro(miopenConvolutionBackwardWeightsGetWorkSpaceSize);
MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
// APIs available after R4:
#define MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(__macro) \
  __macro(miopenBatchNormalizationForwardTraining); \
  __macro(miopenBatchNormalizationForwardInference); \
  __macro(miopenBatchNormalizationBackward);
MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
// APIs in R5:
#define MIOPEN_DNN_ROUTINE_EACH_R5(__macro) \
  __macro(miopenCreateActivationDescriptor); \
  __macro(miopenSetActivationDescriptor); \
  __macro(miopenGetActivationDescriptor); \
  __macro(miopenDestroyActivationDescriptor);
MIOPEN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
// APIs in R6: miopenSetRNNDescriptor_v6 is deliberately disabled, so this
// macro currently expands to nothing.
#define MIOPEN_DNN_ROUTINE_EACH_R6(__macro) \
  /*__macro(miopenSetRNNDescriptor_v6);*/
MIOPEN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
// APIs in R7:
#define MIOPEN_DNN_ROUTINE_EACH_R7(__macro) \
  __macro(miopenSetConvolutionGroupCount); \
  __macro(miopenCreateCTCLossDescriptor); \
  __macro(miopenDestroyCTCLossDescriptor); \
  __macro(miopenGetCTCLossDescriptor); \
  __macro(miopenSetCTCLossDescriptor); \
  __macro(miopenGetCTCLossWorkspaceSize); \
  __macro(miopenCTCLoss);
MIOPEN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
// The cuDNN "Ex" batch-norm entry points have no MIOpen counterpart here;
// the macro is kept with an empty body so the guarded .cc code still builds.
#define MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(__macro) \
  /*__macro(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize); \
  __macro(cudnnBatchNormalizationForwardTrainingEx); \
  __macro(cudnnGetBatchNormalizationBackwardExWorkspaceSize); \
  __macro(cudnnBatchNormalizationBackwardEx); \
  __macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize);*/
MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
} // namespace dynload
} // namespace platform
} // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/dynload/rccl.h"
namespace paddle {
namespace platform {
namespace dynload {
// Lazy-load state for the RCCL shared library, shared by all wrappers below.
std::once_flag rccl_dso_flag;
void *rccl_dso_handle;
// Defines the single global instance of each DynLoad__<name> functor.
#define DEFINE_WRAP(__name) DynLoad__##__name __name
RCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);
} // namespace dynload
} // namespace platform
} // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <rccl.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
namespace paddle {
namespace platform {
namespace dynload {
// Lazy-load state for the RCCL shared library (defined in rccl.cc).
extern std::once_flag rccl_dso_flag;
extern void* rccl_dso_handle;
// Generates a struct DynLoad__<name> (plus an extern instance named <name>)
// that lazily loads RCCL on first call, resolves the NCCL-compatible symbol
// once via dlsym, and forwards all arguments.
// NOTE(review): unlike the MIOpen wrapper, there is no EnforceLoaded/null
// check on the handle or on dlsym's result before calling -- confirm a load
// failure here is acceptable to surface as a crash.
#define DECLARE_DYNAMIC_LOAD_RCCL_WRAP(__name) \
  struct DynLoad__##__name { \
    template <typename... Args> \
    auto operator()(Args... args) -> decltype(__name(args...)) { \
      using nccl_func = decltype(&::__name); \
      std::call_once(rccl_dso_flag, []() { \
        rccl_dso_handle = paddle::platform::dynload::GetNCCLDsoHandle(); \
      }); \
      static void* p_##__name = dlsym(rccl_dso_handle, #__name); \
      return reinterpret_cast<nccl_func>(p_##__name)(args...); \
    } \
  }; \
  extern DynLoad__##__name __name
// NCCL-compatible RCCL entry points needed by Paddle; each is declared as a
// dynamically loaded wrapper via DECLARE_DYNAMIC_LOAD_RCCL_WRAP below.
#define RCCL_RAND_ROUTINE_EACH(__macro) \
  __macro(ncclCommInitAll); \
  __macro(ncclGetUniqueId); \
  __macro(ncclCommInitRank); \
  __macro(ncclCommDestroy); \
  __macro(ncclCommCount); \
  __macro(ncclCommCuDevice); \
  __macro(ncclCommUserRank); \
  __macro(ncclAllReduce); \
  __macro(ncclBcast); \
  __macro(ncclAllGather); \
  __macro(ncclGroupStart); \
  __macro(ncclGroupEnd); \
  __macro(ncclReduce); \
  __macro(ncclReduceScatter); \
  __macro(ncclGetErrorString);
RCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
} // namespace dynload
} // namespace platform
} // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/dynload/rocblas.h"
namespace paddle {
namespace platform {
namespace dynload {
// Lazy-load state for the rocBLAS shared library, shared by every wrapper.
std::once_flag rocblas_dso_flag;
void *rocblas_dso_handle = nullptr;
// Defines the single global instance of each DynLoad__<name> functor.
#define DEFINE_WRAP(__name) DynLoad__##__name __name
// Instantiate wrappers; release-gated groups are guarded so headers that
// omit a group still compile.
ROCBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP);
#ifdef ROCBLAS_BLAS_ROUTINE_EACH_R2
ROCBLAS_BLAS_ROUTINE_EACH_R2(DEFINE_WRAP);
#endif
#ifdef ROCBLAS_BLAS_ROUTINE_EACH_R3
ROCBLAS_BLAS_ROUTINE_EACH_R3(DEFINE_WRAP);
#endif
#ifdef ROCBLAS_BLAS_ROUTINE_EACH_R4
ROCBLAS_BLAS_ROUTINE_EACH_R4(DEFINE_WRAP);
#endif
} // namespace dynload
} // namespace platform
} // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <hip/hip_runtime.h>
#include <rocblas.h>
#include <mutex> // NOLINT
#include <type_traits>
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
namespace paddle {
namespace platform {
namespace dynload {
// Lazy-load state for the rocBLAS shared library (defined in rocblas.cc).
extern std::once_flag rocblas_dso_flag;
extern void *rocblas_dso_handle;
/**
 * The following macro definition can generate structs
 * (for each function) to dynamically load rocBLAS routines
 * via operator overloading. (The CUBLAS macro name is kept so the
 * declaration lists stay symmetric with the CUDA build.)
 *
 * note: default dynamic linked libs
 */
#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \
  struct DynLoad__##__name { \
    template <typename... Args> \
    inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
      using rocblas_func = \
          decltype(::__name(std::declval<Args>()...)) (*)(Args...); \
      std::call_once(rocblas_dso_flag, []() { \
        rocblas_dso_handle = paddle::platform::dynload::GetCublasDsoHandle(); \
      }); \
      static void *p_##__name = dlsym(rocblas_dso_handle, #__name); \
      return reinterpret_cast<rocblas_func>(p_##__name)(args...); \
    } \
  }; \
  extern DynLoad__##__name __name
// Base rocBLAS entry points (BLAS level 1-3 plus handle/stream management);
// each is declared as a dynamically loaded wrapper below.
#define ROCBLAS_BLAS_ROUTINE_EACH(__macro) \
  __macro(rocblas_saxpy); \
  __macro(rocblas_daxpy); \
  __macro(rocblas_sscal); \
  __macro(rocblas_dscal); \
  __macro(rocblas_scopy); \
  __macro(rocblas_dcopy); \
  __macro(rocblas_sgemv); \
  __macro(rocblas_dgemv); \
  __macro(rocblas_sgemm); \
  __macro(rocblas_dgemm); \
  __macro(rocblas_hgemm); \
  __macro(rocblas_dgeam); \
  /*rocblas_gemm_ex function not support at rocm3.5*/ \
  /*__macro(rocblas_gemm_ex); */ \
  __macro(rocblas_sgemm_batched); \
  __macro(rocblas_dgemm_batched); \
  __macro(rocblas_cgemm_batched); \
  __macro(rocblas_zgemm_batched); \
  __macro(rocblas_create_handle); \
  __macro(rocblas_destroy_handle); \
  __macro(rocblas_add_stream); \
  __macro(rocblas_set_stream); \
  __macro(rocblas_get_stream); \
  __macro(rocblas_set_pointer_mode); \
  __macro(rocblas_get_pointer_mode);
ROCBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
// Strided-batched GEMM variants (R2 group).
#define ROCBLAS_BLAS_ROUTINE_EACH_R2(__macro) \
  __macro(rocblas_sgemm_strided_batched); \
  __macro(rocblas_dgemm_strided_batched); \
  __macro(rocblas_cgemm_strided_batched); \
  __macro(rocblas_zgemm_strided_batched); \
  __macro(rocblas_hgemm_strided_batched);
ROCBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
// R3 currently contributes no symbols; the empty macro is kept for parity
// with the CUDA build's declaration structure.
#define ROCBLAS_BLAS_ROUTINE_EACH_R3(__macro)
ROCBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
// R4 group. rocblas_gemm_strided_batched_ex is still unsupported as of
// ROCm 3.5, so it remains commented out.
// FIX: the previous version left a trailing '\' after the last entry, which
// spliced the following '//' comment line into the macro body; harmless today
// (comments are stripped), but removing that comment line would have made the
// macro silently swallow the next line of code. The continuation is dropped.
#define ROCBLAS_BLAS_ROUTINE_EACH_R4(__macro) \
  __macro(rocblas_gemm_batched_ex);
// __macro(rocblas_gemm_strided_batched_ex);
ROCBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
} // namespace dynload
} // namespace platform
} // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/dynload/rocm_driver.h"
namespace paddle {
namespace platform {
namespace dynload {
// Lazy-load state for the ROCm/HIP driver library, shared by every wrapper.
std::once_flag rocm_dso_flag;
void* rocm_dso_handle = nullptr;
// Defines the single global instance of each DynLoad__<name> functor.
#define DEFINE_WRAP(__name) DynLoad__##__name __name
ROCM_ROUTINE_EACH(DEFINE_WRAP);
// Reports whether the ROCm/HIP driver library is available. The dlopen is
// performed at most once and the handle is cached in rocm_dso_handle.
// (The CUDA name is kept so callers match the CUDA build.)
bool HasCUDADriver() {
  auto load_driver = []() { rocm_dso_handle = GetCUDADsoHandle(); };
  std::call_once(rocm_dso_flag, load_driver);
  return nullptr != rocm_dso_handle;
}
} // namespace dynload
} // namespace platform
} // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <hip/hip_runtime.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
namespace paddle {
namespace platform {
namespace dynload {
// Lazy-load state for the ROCm/HIP driver library (defined in rocm_driver.cc).
extern std::once_flag rocm_dso_flag;
extern void* rocm_dso_handle;
// True iff the driver library could be loaded (CUDA name kept for parity).
extern bool HasCUDADriver();
// Generates a struct DynLoad__<name> (plus an extern instance named <name>)
// that lazily loads the driver library on first call, resolves the symbol
// once via dlsym, and forwards all arguments.
#define DECLARE_DYNAMIC_LOAD_ROCM_WRAP(__name) \
  struct DynLoad__##__name { \
    template <typename... Args> \
    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
      using rocm_func = decltype(&::__name); \
      std::call_once(rocm_dso_flag, []() { \
        rocm_dso_handle = paddle::platform::dynload::GetCUDADsoHandle(); \
      }); \
      static void* p_##__name = dlsym(rocm_dso_handle, #__name); \
      return reinterpret_cast<rocm_func>(p_##__name)(args...); \
    } \
  }; \
  extern struct DynLoad__##__name __name
/**
 * All HIP driver entry points needed by Paddle (mirrors the CUDA driver
 * list); each is declared as a dynamically loaded wrapper below.
 **/
#define ROCM_ROUTINE_EACH(__macro) \
  __macro(hipGetErrorString); \
  __macro(hipModuleLoadData); \
  __macro(hipModuleGetFunction); \
  __macro(hipModuleUnload); \
  /* not supported on ROCm 3.5 */ \
  /* __macro(hipOccupancyMaxActiveBlocksPerMultiprocessor);*/ \
  __macro(hipModuleLaunchKernel); \
  __macro(hipLaunchKernel); \
  __macro(hipGetDevice); \
  __macro(hipDevicePrimaryCtxGetState)
ROCM_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_ROCM_WRAP);
#undef DECLARE_DYNAMIC_LOAD_ROCM_WRAP
} // namespace dynload
} // namespace platform
} // namespace paddle
......@@ -20,6 +20,10 @@ limitations under the License. */
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#endif // PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_HIP
#define CUDA_VERSION 10000
#include <hip/hip_runtime.h>
#endif
#ifdef __GNUC__
#define PADDLE_GNUC_VER (__GNUC__ * 10 + __GNUC_MINOR__)
......@@ -37,6 +41,10 @@ limitations under the License. */
#define PADDLE_CUDA_FP16
#include <cuda_fp16.h>
#endif
#ifdef __HIPCC__
#define PADDLE_CUDA_FP16
#include <hip/hip_fp16.h>
#endif
#if !defined(_WIN32)
#define PADDLE_ALIGN(x) __attribute__((aligned(x)))
......@@ -81,11 +89,13 @@ struct PADDLE_ALIGN(2) float16 {
// Constructors
#ifdef PADDLE_CUDA_FP16
HOSTDEVICE inline explicit float16(const half& h) {
#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP))
#if CUDA_VERSION >= 9000
x = reinterpret_cast<__half_raw*>(const_cast<half*>(&h))->x;
#else
x = h.x;
#endif // CUDA_VERSION >= 9000
#endif
}
#endif // PADDLE_CUDA_FP16
......@@ -100,7 +110,9 @@ struct PADDLE_ALIGN(2) float16 {
#endif
HOSTDEVICE inline explicit float16(float val) {
#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
#if ((defined(PADDLE_CUDA_FP16)) && \
((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300) || \
(defined(__HIP_DEVICE_COMPILE__))))
half tmp = __float2half(val);
x = *reinterpret_cast<uint16_t*>(&tmp);
......@@ -246,7 +258,9 @@ struct PADDLE_ALIGN(2) float16 {
#endif
HOSTDEVICE inline explicit operator float() const {
#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
#if (defined(PADDLE_CUDA_FP16) && \
((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300) || \
(defined(__HIP_DEVICE_COMPILE__))))
half tmp = *reinterpret_cast<const half*>(this);
return __half2float(tmp);
......@@ -353,10 +367,11 @@ struct PADDLE_ALIGN(2) float16 {
// CUDA 7.5 and 8.0 do not. The arithmetic operators defined here are
// for users to write similar CUDA code in CUDA 7.5 and 8.0 as in
// CUDA 9.0 regarding the half data type.
// xuan[TODO] change for rocm
#if defined(PADDLE_CUDA_FP16) && CUDA_VERSION < 9000
DEVICE inline half operator+(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \
(defined(__HIP_DEVICE_COMPILE__)))
return __hadd(a, b);
#else
float res = static_cast<float>(float16(a)) + static_cast<float>(float16(b));
......@@ -365,7 +380,8 @@ DEVICE inline half operator+(const half& a, const half& b) {
}
DEVICE inline half operator-(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \
(defined(__HIP_DEVICE_COMPILE__)))
return __hsub(a, b);
#else
float res = static_cast<float>(float16(a)) - static_cast<float>(float16(b));
......@@ -374,7 +390,8 @@ DEVICE inline half operator-(const half& a, const half& b) {
}
DEVICE inline half operator*(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \
(defined(__HIP_DEVICE_COMPILE__)))
return __hmul(a, b);
#else
float res = static_cast<float>(float16(a)) * static_cast<float>(float16(b));
......@@ -383,7 +400,8 @@ DEVICE inline half operator*(const half& a, const half& b) {
}
DEVICE inline half operator/(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \
(defined(__HIP_DEVICE_COMPILE__)))
float num = __half2float(a);
float denom = __half2float(b);
return __float2half(num / denom);
......@@ -394,7 +412,8 @@ DEVICE inline half operator/(const half& a, const half& b) {
}
DEVICE inline half operator-(const half& a) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \
(defined(__HIP_DEVICE_COMPILE__)))
return __hneg(a);
#else
float res = -static_cast<float>(float16(a));
......@@ -423,7 +442,8 @@ DEVICE inline half& operator/=(half& a, const half& b) { // NOLINT
}
DEVICE inline bool operator==(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \
(defined(__HIP_DEVICE_COMPILE__)))
return __heq(a, b);
#else
return static_cast<float>(float16(a)) == static_cast<float>(float16(b));
......@@ -431,7 +451,8 @@ DEVICE inline bool operator==(const half& a, const half& b) {
}
DEVICE inline bool operator!=(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \
(defined(__HIP_DEVICE_COMPILE__)))
return __hne(a, b);
#else
return static_cast<float>(float16(a)) != static_cast<float>(float16(b));
......@@ -439,7 +460,8 @@ DEVICE inline bool operator!=(const half& a, const half& b) {
}
DEVICE inline bool operator<(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \
(defined(__HIP_DEVICE_COMPILE__)))
return __hlt(a, b);
#else
return static_cast<float>(float16(a)) < static_cast<float>(float16(b));
......@@ -447,7 +469,8 @@ DEVICE inline bool operator<(const half& a, const half& b) {
}
DEVICE inline bool operator<=(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \
(defined(__HIP_DEVICE_COMPILE__)))
return __hle(a, b);
#else
return static_cast<float>(float16(a)) <= static_cast<float>(float16(b));
......@@ -455,7 +478,8 @@ DEVICE inline bool operator<=(const half& a, const half& b) {
}
DEVICE inline bool operator>(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \
(defined(__HIP_DEVICE_COMPILE__)))
return __hgt(a, b);
#else
return static_cast<float>(float16(a)) > static_cast<float>(float16(b));
......@@ -463,7 +487,8 @@ DEVICE inline bool operator>(const half& a, const half& b) {
}
DEVICE inline bool operator>=(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \
(defined(__HIP_DEVICE_COMPILE__)))
return __hge(a, b);
#else
return static_cast<float>(float16(a)) >= static_cast<float>(float16(b));
......@@ -475,7 +500,8 @@ DEVICE inline bool operator>=(const half& a, const half& b) {
// Arithmetic operators for float16 on GPU
#if defined(PADDLE_CUDA_FP16)
HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \
(defined(__HIP_DEVICE_COMPILE__)))
return float16(__hadd(half(a), half(b)));
#else
return float16(static_cast<float>(a) + static_cast<float>(b));
......@@ -483,7 +509,8 @@ HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) {
}
HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \
(defined(__HIP_DEVICE_COMPILE__)))
return float16(__hsub(half(a), half(b)));
#else
return float16(static_cast<float>(a) - static_cast<float>(b));
......@@ -491,7 +518,8 @@ HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) {
}
HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \
(defined(__HIP_DEVICE_COMPILE__)))
return float16(__hmul(half(a), half(b)));
#else
return float16(static_cast<float>(a) * static_cast<float>(b));
......@@ -499,7 +527,8 @@ HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) {
}
HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300) || \
(defined(__HIP_DEVICE_COMPILE__)))
// TODO(kexinzhao): check which cuda version starts to support __hdiv
float num = __half2float(half(a));
float denom = __half2float(half(b));
......@@ -510,7 +539,8 @@ HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) {
}
HOSTDEVICE inline float16 operator-(const float16& a) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \
(defined(__HIP_DEVICE_COMPILE__)))
return float16(__hneg(half(a)));
#else
float16 res;
......@@ -540,7 +570,8 @@ HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) { // NOLINT
}
HOSTDEVICE inline bool operator==(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \
(defined(__HIP_DEVICE_COMPILE__)))
return __heq(half(a), half(b));
#else
return static_cast<float>(a) == static_cast<float>(b);
......@@ -548,7 +579,8 @@ HOSTDEVICE inline bool operator==(const float16& a, const float16& b) {
}
HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \
(defined(__HIP_DEVICE_COMPILE__)))
return __hne(half(a), half(b));
#else
return static_cast<float>(a) != static_cast<float>(b);
......@@ -556,7 +588,8 @@ HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) {
}
HOSTDEVICE inline bool operator<(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \
(defined(__HIP_DEVICE_COMPILE__)))
return __hlt(half(a), half(b));
#else
return static_cast<float>(a) < static_cast<float>(b);
......@@ -564,7 +597,8 @@ HOSTDEVICE inline bool operator<(const float16& a, const float16& b) {
}
HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \
(defined(__HIP_DEVICE_COMPILE__)))
return __hle(half(a), half(b));
#else
return static_cast<float>(a) <= static_cast<float>(b);
......@@ -572,7 +606,8 @@ HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) {
}
HOSTDEVICE inline bool operator>(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \
(defined(__HIP_DEVICE_COMPILE__)))
return __hgt(half(a), half(b));
#else
return static_cast<float>(a) > static_cast<float>(b);
......@@ -580,7 +615,8 @@ HOSTDEVICE inline bool operator>(const float16& a, const float16& b) {
}
HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \
(defined(__HIP_DEVICE_COMPILE__)))
return __hge(half(a), half(b));
#else
return static_cast<float>(a) >= static_cast<float>(b);
......@@ -846,7 +882,9 @@ HOSTDEVICE inline float16 raw_uint16_to_float16(uint16_t a) {
}
HOSTDEVICE inline bool(isnan)(const float16& a) {
#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#if (defined(PADDLE_CUDA_FP16) && \
((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \
(defined(__HIP_DEVICE_COMPILE__))))
return __hisnan(half(a));
#else
return (a.x & 0x7fff) > 0x7c00;
......@@ -936,31 +974,31 @@ struct numeric_limits<paddle::platform::float16> {
static const bool traps = true;
static const bool tinyness_before = false;
static paddle::platform::float16(min)() {
HOSTDEVICE static paddle::platform::float16(min)() {
return paddle::platform::raw_uint16_to_float16(0x400);
}
static paddle::platform::float16 lowest() {
HOSTDEVICE static paddle::platform::float16 lowest() {
return paddle::platform::raw_uint16_to_float16(0xfbff);
}
static paddle::platform::float16(max)() {
HOSTDEVICE static paddle::platform::float16(max)() {
return paddle::platform::raw_uint16_to_float16(0x7bff);
}
static paddle::platform::float16 epsilon() {
HOSTDEVICE static paddle::platform::float16 epsilon() {
return paddle::platform::raw_uint16_to_float16(0x0800);
}
static paddle::platform::float16 round_error() {
HOSTDEVICE static paddle::platform::float16 round_error() {
return paddle::platform::float16(0.5);
}
static paddle::platform::float16 infinity() {
HOSTDEVICE static paddle::platform::float16 infinity() {
return paddle::platform::raw_uint16_to_float16(0x7c00);
}
static paddle::platform::float16 quiet_NaN() {
HOSTDEVICE static paddle::platform::float16 quiet_NaN() {
return paddle::platform::raw_uint16_to_float16(0x7e00);
}
static paddle::platform::float16 signaling_NaN() {
HOSTDEVICE static paddle::platform::float16 signaling_NaN() {
return paddle::platform::raw_uint16_to_float16(0x7e00);
}
static paddle::platform::float16 denorm_min() {
HOSTDEVICE static paddle::platform::float16 denorm_min() {
return paddle::platform::raw_uint16_to_float16(0x1);
}
};
......
......@@ -13,7 +13,11 @@
// limitations under the License.
#pragma once
#ifdef __CUDACC__
#ifdef __HIPCC__
#include <hip/hip_runtime.h>
#endif
#if (defined(__CUDACC__) || defined(__HIPCC__))
#define HOSTDEVICE __host__ __device__
#define DEVICE __device__
#define HOST __host__
......
......@@ -145,11 +145,11 @@ if(WITH_PYTHON)
endif(WITH_MKLDNN)
endif(WIN32)
if(WITH_AMD_GPU)
hip_library(paddle_pybind SHARED
if(WITH_ROCM_PLATFORM)
cc_library(paddle_pybind SHARED
SRCS ${PYBIND_SRCS}
DEPS ARCHIVE_START ${PYBIND_DEPS}
${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ARCHIVE_END)
DEPS ${PYBIND_DEPS}
${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
else()
cc_library(paddle_pybind SHARED
SRCS ${PYBIND_SRCS}
......@@ -158,7 +158,7 @@ if(WITH_PYTHON)
if(NOT APPLE AND NOT WIN32)
target_link_libraries(paddle_pybind rt)
endif(NOT APPLE AND NOT WIN32)
endif(WITH_AMD_GPU)
endif(WITH_ROCM_PLATFORM)
get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(paddle_pybind ${os_dependency_modules})
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册