From 6d5d9f23e929f37b0ec83f522d54cc1b4196ec2d Mon Sep 17 00:00:00 2001 From: hong19860320 <9973393+hong19860320@users.noreply.github.com> Date: Tue, 4 Jul 2023 14:42:20 +0800 Subject: [PATCH] [XPU] Add XPU plugin support (#55101) * Add XPU plugin to support the customized ops or improve the performance of the fusion ops based on hand-written xpu micro kernels. * refine README.md --- CMakeLists.txt | 1 + cmake/external/xpu.cmake | 6 + paddle/phi/CMakeLists.txt | 4 + paddle/phi/backends/xpu/xpu_header.h | 3 + paddle/phi/kernels/xpu/plugin/CMakeLists.txt | 379 ++++++++++++++++++ paddle/phi/kernels/xpu/plugin/README.md | 21 + paddle/phi/kernels/xpu/plugin/build.sh | 28 ++ .../kernels/xpu/plugin/example/CMakeLists.txt | 58 +++ .../phi/kernels/xpu/plugin/example/build.sh | 27 ++ .../phi/kernels/xpu/plugin/example/example.cc | 71 ++++ paddle/phi/kernels/xpu/plugin/example/run.sh | 49 +++ .../kernels/xpu/plugin/include/xpu/plugin.h | 31 ++ .../xpu/plugin/src/kernel/kunlun2cpp/add1.xpu | 57 +++ .../phi/kernels/xpu/plugin/src/linker.specs | 6 + .../kernels/xpu/plugin/src/wrapper/add2.cpp | 70 ++++ 15 files changed, 811 insertions(+) create mode 100644 paddle/phi/kernels/xpu/plugin/CMakeLists.txt create mode 100644 paddle/phi/kernels/xpu/plugin/README.md create mode 100755 paddle/phi/kernels/xpu/plugin/build.sh create mode 100644 paddle/phi/kernels/xpu/plugin/example/CMakeLists.txt create mode 100755 paddle/phi/kernels/xpu/plugin/example/build.sh create mode 100644 paddle/phi/kernels/xpu/plugin/example/example.cc create mode 100755 paddle/phi/kernels/xpu/plugin/example/run.sh create mode 100644 paddle/phi/kernels/xpu/plugin/include/xpu/plugin.h create mode 100644 paddle/phi/kernels/xpu/plugin/src/kernel/kunlun2cpp/add1.xpu create mode 100644 paddle/phi/kernels/xpu/plugin/src/linker.specs create mode 100644 paddle/phi/kernels/xpu/plugin/src/wrapper/add2.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 795a9321f9f..326d4c684b5 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -56,6 +56,7 @@ option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF) option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF) option(WITH_XPU_KP "Compile PaddlePaddle with BAIDU XPU compiler " OFF) option(WITH_XPU_XFT "Compile PaddlePaddle with BAIDU XPU-XFT" OFF) +option(WITH_XPU_PLUGIN "Compile PaddlePaddle with BAIDU XPU plugin" OFF) option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF) option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF) option(WITH_IPU "Compile PaddlePaddle with Graphcore IPU" OFF) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 8aed9944fc4..45adc981562 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -170,6 +170,12 @@ if(WITH_XPTI) set(XPU_XPTI_LIB "${XPU_LIB_DIR}/${XPU_XPTI_LIB_NAME}") endif() +if(WITH_XPU_PLUGIN) + message(STATUS "Compile with XPU PLUGIN!") + add_definitions(-DPADDLE_WITH_XPU_PLUGIN) + include_directories(${CMAKE_SOURCE_DIR}/paddle/phi/kernels/xpu/plugin/include) +endif() + if(WITH_XPU_BKCL AND WITH_XPU_XFT) target_link_libraries(xpulib ${XPU_API_LIB} ${XPU_RT_LIB} ${XPU_BKCL_LIB} ${XPU_XFT_LIB}) diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt index 40e44bae076..1ed3fac1228 100644 --- a/paddle/phi/CMakeLists.txt +++ b/paddle/phi/CMakeLists.txt @@ -86,6 +86,10 @@ endif() if(WITH_XPU) list(APPEND PHI_DEPS xpulib) + if(WITH_XPU_PLUGIN) + add_subdirectory(kernels/xpu/plugin) + list(APPEND PHI_DEPS xpuplugin) + endif() endif() set(PHI_SRCS diff --git a/paddle/phi/backends/xpu/xpu_header.h b/paddle/phi/backends/xpu/xpu_header.h index c787874b22f..36caaf00f5e 100644 --- a/paddle/phi/backends/xpu/xpu_header.h +++ b/paddle/phi/backends/xpu/xpu_header.h @@ -24,6 +24,9 @@ limitations under the License. */ #include "xpu/runtime.h" #include "xpu/runtime_ex.h" #include "xpu/xdnn.h" +#ifdef PADDLE_WITH_XPU_PLUGIN +#include "xpu/plugin.h" +#endif namespace xpu = baidu::xpu::api; diff --git a/paddle/phi/kernels/xpu/plugin/CMakeLists.txt b/paddle/phi/kernels/xpu/plugin/CMakeLists.txt new file mode 100644 index 00000000000..18151fc8e09 --- /dev/null +++ b/paddle/phi/kernels/xpu/plugin/CMakeLists.txt @@ -0,0 +1,379 @@ +cmake_minimum_required(VERSION 3.16) + +project(xpuplugin LANGUAGES CXX) + +if(NOT DEFINED BUILD_STANDALONE) + if(NOT DEFINED XPU_INC_DIR) + message( + FATAL_ERROR + "XPU_INC_DIR not set, or directory ${XPU_INC_DIR} not found, please compile with PaddlePaddle." + ) + endif() + if(NOT DEFINED XPU_LIB_DIR) + message( + FATAL_ERROR + "XPU_LIB_DIR not set, or directory ${XPU_LIB_DIR} not found, please compile with PaddlePaddle." + ) + endif() + set(XDNN_INC_DIR ${XPU_INC_DIR}) + set(XDNN_LIB_DIR ${XPU_LIB_DIR}) + set(XRE_INC_DIR ${XPU_INC_DIR}) + set(XRE_LIB_DIR ${XPU_LIB_DIR}) + set(XPU_DEPS xpulib) # Depends cmake/external/xpu.cmake +else() + if(NOT DEFINED XDNN_PATH) + set(XDNN_PATH $ENV{XDNN_PATH}) + endif() + if(NOT DEFINED XRE_PATH) + set(XRE_PATH $ENV{XRE_PATH}) + endif() + if(NOT IS_DIRECTORY ${XDNN_PATH}) + message( + FATAL_ERROR + "XDNN_PATH not set, or directory ${XDNN_PATH} not found, please export XDNN_PATH=." + ) + endif() + if(NOT IS_DIRECTORY ${XRE_PATH}) + message( + FATAL_ERROR + "XRE_PATH not set, or directory ${XRE_PATH} not found, please export XRE_PATH=." + ) + endif() + set(XDNN_INC_DIR ${XDNN_PATH}/include) + set(XDNN_LIB_DIR ${XDNN_PATH}/so) + set(XRE_INC_DIR ${XRE_PATH}/include) + set(XRE_LIB_DIR ${XRE_PATH}/so) +endif() + +if(NOT DEFINED CLANG_PATH) + set(CLANG_PATH $ENV{CLANG_PATH}) +endif() +if(NOT IS_DIRECTORY ${CLANG_PATH}) + message( + FATAL_ERROR + "Directory ${CLANG_PATH} not found, please export CLANG_PATH=." + ) +endif() + +message(STATUS "Build with CLANG_PATH=" ${CLANG_PATH}) +set(XPU_CLANG ${CLANG_PATH}/bin/clang++) +message(STATUS "Build with XPU_CLANG=" ${XPU_CLANG}) + +if(NOT DEFINED HOST_SYSROOT) + set(HOST_SYSROOT $ENV{HOST_SYSROOT}) +endif() +if(NOT HOST_SYSROOT) + set(HOST_SYSROOT /opt/compiler/gcc-8.2) +endif() +if(NOT IS_DIRECTORY ${HOST_SYSROOT}) + message( + FATAL_ERROR + "Directory ${HOST_SYSROOT} not found, please export HOST_SYSROOT=." + ) +endif() + +if(NOT DEFINED HOST_ARCH) + set(HOST_ARCH $ENV{HOST_ARCH}) +endif() +if(NOT HOST_ARCH) + set(HOST_ARCH x86_64-baidu-linux-gnu) +endif() + +if(NOT DEFINED TARGET_ARCH) + set(TARGET_ARCH $ENV{TARGET_ARCH}) +endif() +if(NOT TARGET_ARCH) + set(TARGET_ARCH x86_64-baidu-linux-gnu) +endif() + +if(NOT DEFINED TOOLCHAIN_ARGS) + set(TOOLCHAIN_ARGS $ENV{TOOLCHAIN_ARGS}) +endif() +set(TOOLCHAIN_ARGS -isystem ${HOST_SYSROOT}/include/c++/8.2.0 -isystem + /usr/include/ -isystem /usr/include/x86_64-linux-gnu) +if(HOST_ARCH MATCHES "x86_64") + if(TARGET_ARCH MATCHES "x86_64") + if(EXISTS ${HOST_SYSROOT}/bin/g++) + set(HOST_CXX ${HOST_SYSROOT}/bin/g++) + if(NOT EXISTS ${HOST_SYSROOT}/bin/ar) + # try gcc-ar + set(HOST_AR ${HOST_SYSROOT}/bin/gcc-ar) + endif() + else() + set(HOST_CXX /usr/bin/g++) + set(HOST_AR /usr/bin/ar) + endif() + endif() + if(TARGET_ARCH MATCHES "aarch64") + set(TOOLCHAIN_ARGS --gcc-toolchain=${HOST_SYSROOT}) + set(HOST_SYSROOT ${HOST_SYSROOT}/aarch64-linux-gnu/libc) + set(HOST_CXX ${CMAKE_CXX_COMPILER}) + set(HOST_AR ${CMAKE_AR}) + endif() +endif() +if(HOST_ARCH MATCHES "aarch64") + if(TARGET_ARCH MATCHES "aarch64") + if(EXISTS ${HOST_SYSROOT}/bin/g++) + set(HOST_CXX ${HOST_SYSROOT}/bin/g++) + set(HOST_AR ${HOST_SYSROOT}/bin/ar) + else() + set(HOST_CXX /usr/bin/g++) + set(HOST_AR /usr/bin/ar) + endif() + endif() +endif() + +set(OPT_LEVEL "-O2") +message(STATUS "Build with TARGET_ARCH=" ${TARGET_ARCH}) +message(STATUS "Build with TOOLCHAIN_ARGS=" ${TOOLCHAIN_ARGS}) +message(STATUS "Build with HOST_SYSROOT=" ${HOST_SYSROOT}) +message(STATUS "Build with HOST_CXX=" ${HOST_CXX}) +message(STATUS "Build with HOST_AR=" ${HOST_AR}) + +# compile xpu kernel macro function +macro( + compile_kernel + kernel_path + kernel_name + xpu_n + rule + device_o_extra_flags + host_o_extra_flags + xpu_n_macro) + set(arg_rule ${rule}) + separate_arguments(arg_rule) + set(arg_device_o_extra_flags ${device_o_extra_flags}) + separate_arguments(arg_device_o_extra_flags) + set(arg_host_o_extra_flags ${host_o_extra_flags}) + separate_arguments(arg_host_o_extra_flags) + + add_custom_command( + OUTPUT ${kernel_name}.device.bin.o ${kernel_name}.o + COMMAND + ${XPU_CLANG} -std=c++11 ${OPT_LEVEL} ${arg_device_o_extra_flags} -c + ${kernel_path} -D ${xpu_n_macro} --target=${TARGET_ARCH} ${HOST_XPU_FLAGS} + --basename ${kernel_name} -fno-builtin --xpu-arch=${xpu_n} -fPIC + -Wno-int-to-void-pointer-cast -Wno-int-to-pointer-cast -Werror -mllvm + --xpu-inline-cost -mllvm --xpu-inline-hot-call + -I${CMAKE_CURRENT_SOURCE_DIR}/include -I${CMAKE_CURRENT_SOURCE_DIR}/src + -I${CMAKE_CURRENT_SOURCE_DIR}/src/kernel + -I${CMAKE_CURRENT_SOURCE_DIR}/src/kernel/include ${arg_rule} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS ${kernel_path} + COMMENT ${kernel_name}.device.bin.o ${kernel_name}.o + VERBATIM) + + list(APPEND xpuplugin_kernels_depends ${kernel_name}.device.bin.o + ${kernel_name}.o) +endmacro() + +macro( + __compile_kernel_with_rules + kernel_path + kernel_name + xpu_n + rules_path + device_o_extra_flags + host_o_extra_flags + xpu_n_macro) + file(STRINGS ${rules_path} rules) + + foreach(rule IN LISTS rules) + message(STATUS " Instantiate with '${rule}'") + execute_process( + COMMAND bash "-c" "echo -n ${rule} | md5sum | cut -c1-6" + OUTPUT_VARIABLE rule_md5 + OUTPUT_STRIP_TRAILING_WHITESPACE) + set(kernel_name_md5 ${kernel_name}_${rule_md5}) + compile_kernel( + ${kernel_path} + ${kernel_name_md5} + ${xpu_n} + ${rule} + ${device_o_extra_flags} + ${host_o_extra_flags} + ${xpu_n_macro}) + endforeach() +endmacro() + +macro( + compile_kernel_with_rules + kernel_path + kernel_name + xpu_n + rules_path + device_o_extra_flags + host_o_extra_flags + xpu_n_macro) + # reconfigure if file |rules_path| was modified + set_property( + DIRECTORY + APPEND + PROPERTY CMAKE_CONFIGURE_DEPENDS ${rules_path}) + __compile_kernel_with_rules( + ${kernel_path} + ${kernel_name} + ${xpu_n} + ${rules_path} + ${device_o_extra_flags} + ${host_o_extra_flags} + ${xpu_n_macro}) +endmacro() + +macro(search_and_compile_kernel xpu_n) + if(${xpu_n} STREQUAL "xpu1") + set(XPU_DEVICE_O_EXTRA_FLAGS " ") + set(XPU_HOST_O_EXTRA_FLAGS " ") + set(XPU_KERNEL_PATH "src/kernel/cpp/*.xpu") + set(xpu_n_macro "__XPU1__") + elseif(${xpu_n} STREQUAL "xpu2") + set(XPU_DEVICE_O_EXTRA_FLAGS "--xpu-arch=xpu2") + set(XPU_HOST_O_EXTRA_FLAGS "--xpu-arch=xpu2") + set(XPU_KERNEL_PATH "src/kernel/kunlun2cpp/*.xpu") + set(xpu_n_macro "__XPU2__") + elseif(${xpu_n} STREQUAL "xpu3") + set(XPU_DEVICE_O_EXTRA_FLAGS "--xpu-arch=xpu3") + set(XPU_HOST_O_EXTRA_FLAGS "--xpu-arch=xpu3") + set(XPU_KERNEL_PATH "src/kernel/kunlun3cpp/*.xpu") + set(xpu_n_macro "__XPU3__") + else() + message(FATAL_ERROR "Are you sure? ${xpu_n}") + endif() + file(GLOB_RECURSE xpu_kernels ${XPU_KERNEL_PATH}) + list(LENGTH xpu_kernels xpu_kernels_num) + message(STATUS "Found ${xpu_kernels_num} ${xpu_n} kernels") + + foreach(xpu_kernel IN LISTS xpu_kernels) + message(STATUS "Process ${xpu_kernel}") + get_filename_component(kernel_name ${xpu_kernel} NAME_WE) + get_filename_component(kernel_dir ${xpu_kernel} DIRECTORY) + set(kernel_rules ${kernel_dir}/${kernel_name}.rules) + set(kernel_name ${xpu_n}_${kernel_name}) + if(EXISTS ${kernel_rules}) + compile_kernel_with_rules( + ${xpu_kernel} + ${kernel_name} + ${xpu_n} + ${kernel_rules} + ${XPU_DEVICE_O_EXTRA_FLAGS} + ${XPU_HOST_O_EXTRA_FLAGS} + ${xpu_n_macro}) + else() + compile_kernel( + ${xpu_kernel} + ${kernel_name} + ${xpu_n} + " " + ${XPU_DEVICE_O_EXTRA_FLAGS} + ${XPU_HOST_O_EXTRA_FLAGS} + ${xpu_n_macro}) + endif() + endforeach() +endmacro() + +# compile xpu kernels +search_and_compile_kernel("xpu1") +search_and_compile_kernel("xpu2") +search_and_compile_kernel("xpu3") + +# compile xpu wrappers +file(GLOB_RECURSE xpu_wrappers src/wrapper/*.cpp) +list(LENGTH xpu_wrappers xpu_wrappers_num) +message(STATUS "Found ${xpu_wrappers_num} XPU wrappers") + +foreach(xpu_wrapper IN LISTS xpu_wrappers) + message(STATUS "Process ${xpu_wrapper}") + get_filename_component(wrapper_name ${xpu_wrapper} NAME_WE) + set(wrapper_target ${wrapper_name}_wrapper) + + add_custom_target( + ${wrapper_target} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS wrapper_build/${wrapper_name}.wrapper.d + wrapper_build/${wrapper_name}.wrapper.o + COMMENT ${wrapper_target} + VERBATIM) + + add_custom_command( + OUTPUT wrapper_build/${wrapper_name}.wrapper.d + COMMAND ${CMAKE_COMMAND} -E make_directory wrapper_build + COMMAND + ${XPU_CLANG} -M -MQ wrapper_build/${wrapper_name}.wrapper.o -MF + wrapper_build/${wrapper_name}.wrapper.d -std=c++11 -x xpu -c + ${xpu_wrapper} -I${XDNN_INC_DIR} -I${XRE_INC_DIR} + -I${CMAKE_CURRENT_SOURCE_DIR}/include -I${CMAKE_CURRENT_SOURCE_DIR}/src + -I${CMAKE_CURRENT_SOURCE_DIR}/src/wrapper -D_GNU_SOURCE + -D__STDC_LIMIT_MACROS -DNDEBUG --sysroot=${HOST_SYSROOT} ${TOOLCHAIN_ARGS} + --target=${TARGET_ARCH} -fPIC -Werror -Wreorder -fvisibility=hidden + --xpu-host-only ${XPU_MF_FLAGS} + COMMAND + ${CMAKE_COMMAND} -E cmake_depends "Unix Makefiles" ${CMAKE_SOURCE_DIR} + ${CMAKE_SOURCE_DIR} ${CMAKE_BINARY_DIR} ${CMAKE_BINARY_DIR} + ${CMAKE_BINARY_DIR}/CMakeFiles/${wrapper_target}.dir/DependInfo.cmake + --color=$(COLOR) + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS ${xpu_wrapper} ${XPU_DEPS} + COMMENT wrapper_build/${wrapper_name}.wrapper.d + VERBATIM) + + add_custom_command( + OUTPUT wrapper_build/${wrapper_name}.wrapper.o + COMMAND ${CMAKE_COMMAND} -E make_directory wrapper_build + COMMAND + ${XPU_CLANG} -std=c++11 ${EXTRA_FLAGS} ${OPT_LEVEL} -x xpu -c + ${xpu_wrapper} -o wrapper_build/${wrapper_name}.wrapper.o + -I${XDNN_INC_DIR} -I${XRE_INC_DIR} -I${CMAKE_CURRENT_SOURCE_DIR}/include + -I${CMAKE_CURRENT_SOURCE_DIR}/src + -I${CMAKE_CURRENT_SOURCE_DIR}/src/wrapper -D_GNU_SOURCE + -D__STDC_LIMIT_MACROS -DNDEBUG --sysroot=${HOST_SYSROOT} ${TOOLCHAIN_ARGS} + --target=${TARGET_ARCH} -fPIC -Wunused-variable -Werror -Wreorder + -fvisibility=hidden --xpu-host-only ${HOST_XPU_FLAGS} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS wrapper_build/${wrapper_name}.wrapper.d + COMMENT wrapper_build/${wrapper_name}.wrapper.o + VERBATIM) + list(APPEND xpuplugin_wrapper_depends wrapper_build/${wrapper_name}.wrapper.o) +endforeach() + +add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libxpuplugin.a + COMMAND ${HOST_AR} rcs ${CMAKE_CURRENT_BINARY_DIR}/libxpuplugin.a + ${xpuplugin_kernels_depends} ${xpuplugin_wrapper_depends} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS ${xpuplugin_kernels_depends} ${xpuplugin_wrapper_depends} + COMMENT ${CMAKE_CURRENT_BINARY_DIR}/libxpuplugin.a + VERBATIM) + +add_custom_target( + xpuplugin_a + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS ${xpuplugin_kernels_depends} ${xpuplugin_wrapper_depends} + ${CMAKE_CURRENT_BINARY_DIR}/libxpuplugin.a + COMMENT xpuplugin_a + VERBATIM) + +add_custom_target( + xpuplugin_so ALL + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS xpuplugin_a ${CMAKE_CURRENT_BINARY_DIR}/libxpuplugin.so + COMMENT xpuplugin_so) + +add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libxpuplugin.so + COMMAND + ${HOST_CXX} -shared -o ${CMAKE_CURRENT_BINARY_DIR}/libxpuplugin.so -Xlinker + \"-\(\" -Wl,--whole-archive ${CMAKE_CURRENT_BINARY_DIR}/libxpuplugin.a + -Wl,--no-whole-archive -L${XDNN_LIB_DIR} -L${XRE_LIB_DIR} -lxpurt -lxpuapi + -Wl,--no-undefined -Wl,-soname,libxpuplugin.so -lstdc++ -ldl -lm -lpthread + -specs=${CMAKE_CURRENT_SOURCE_DIR}/src/linker.specs -Xlinker \"-\)\"\; + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libxpuplugin.a + COMMENT ${CMAKE_CURRENT_BINARY_DIR}/libxpuplugin.so) + +if(NOT DEFINED BUILD_STANDALONE) + add_library(xpuplugin STATIC IMPORTED GLOBAL) + add_dependencies(xpuplugin xpuplugin_a) + set_target_properties( + xpuplugin PROPERTIES IMPORTED_LOCATION + ${CMAKE_CURRENT_BINARY_DIR}/libxpuplugin.a) +endif() diff --git a/paddle/phi/kernels/xpu/plugin/README.md b/paddle/phi/kernels/xpu/plugin/README.md new file mode 100644 index 00000000000..e0783d50e6f --- /dev/null +++ b/paddle/phi/kernels/xpu/plugin/README.md @@ -0,0 +1,21 @@ +# XPU PLUGIN +## Standalone build and test. +``` +$ cd plugin +Modify ./build.sh to set the path of XDNN, XRE and XTDK. +$ ./build.sh + +$ cd example +Modify ./example/build.sh to set the path of XDNN and XRE. +$ ./build.sh +$ ./run.sh +``` +## Build with PaddlePaddle. +### Copy to the source code of PaddlePaddle. +``` +$ cp -rf plugin /paddle/phi/xpu +``` +### Add -DWITH_XPU_PLUGIN=ON as extra cmake arguments. +``` +$ cmake .. -DWITH_XPU_PLUGIN=ON +``` diff --git a/paddle/phi/kernels/xpu/plugin/build.sh b/paddle/phi/kernels/xpu/plugin/build.sh new file mode 100755 index 00000000000..9e446436ab0 --- /dev/null +++ b/paddle/phi/kernels/xpu/plugin/build.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +export XDNN_PATH=/opt/xdnn # +export XRE_PATH=/opt/xre # +export CLANG_PATH=/opt/xtdk # +export HOST_SYSROOT=/opt/compiler/gcc-8.2 # + +rm -rf build +mkdir build +cd build +cmake -DCMAKE_VERBOSE_MAKEFILE=ON -DBUILD_STANDALONE=ON .. +make diff --git a/paddle/phi/kernels/xpu/plugin/example/CMakeLists.txt b/paddle/phi/kernels/xpu/plugin/example/CMakeLists.txt new file mode 100644 index 00000000000..f9b714ade3a --- /dev/null +++ b/paddle/phi/kernels/xpu/plugin/example/CMakeLists.txt @@ -0,0 +1,58 @@ +cmake_minimum_required(VERSION 3.16) + +project(example LANGUAGES CXX) + +set(CMAKE_SYSTEM_NAME Linux) +set(CMAKE_SYSTEM_PROCESSOR x86_64) +set(CMAKE_C_COMPILER "gcc") +set(CMAKE_CXX_COMPILER "g++") + +if(NOT DEFINED XDNN_PATH) + set(XDNN_PATH $ENV{XDNN_PATH}) +endif() +if(NOT DEFINED XRE_PATH) + set(XRE_PATH $ENV{XRE_PATH}) +endif() +if(NOT IS_DIRECTORY ${XDNN_PATH}) + message( + FATAL_ERROR + "XDNN_PATH not set, or directory ${XDNN_PATH} not found, please export XDNN_PATH=." + ) +endif() +if(NOT IS_DIRECTORY ${XRE_PATH}) + message( + FATAL_ERROR + "XRE_PATH not set, or directory ${XRE_PATH} not found, please export XRE_PATH=." + ) +endif() +set(XDNN_INC_DIR ${XDNN_PATH}/include) +set(XDNN_LIB_DIR ${XDNN_PATH}/so) +set(XRE_INC_DIR ${XRE_PATH}/include) +set(XRE_LIB_DIR ${XRE_PATH}/so) + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wl,--allow-shlib-undefined") + +include_directories(${XDNN_INC_DIR}) +include_directories(${XRE_INC_DIR}) +link_directories(${XDNN_LIB_DIR}) +link_directories(${XRE_LIB_DIR}) +set(DEPS ${DEPS} xpurt xpuapi) + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../include) +if(NOT DEFINED LINK_TYPE) + set(LINK_TYPE $ENV{LINK_TYPE}) +endif() +if(LINK_TYPE STREQUAL "static") + set(DEPS ${DEPS} ${CMAKE_CURRENT_SOURCE_DIR}/../build/libxpuplugin.a) +elseif(LINK_TYPE STREQUAL "shared") + link_directories(${CMAKE_CURRENT_SOURCE_DIR}/../build) + set(DEPS ${DEPS} xpuplugin) +else() + message( + FATAL_ERROR + "Unknown LINK_TYPE ${LINK_TYPE}, only supports static or shared.") + return() +endif() + +add_executable(example example.cc) +target_link_libraries(example ${DEPS}) diff --git a/paddle/phi/kernels/xpu/plugin/example/build.sh b/paddle/phi/kernels/xpu/plugin/example/build.sh new file mode 100755 index 00000000000..e5d74dcafc8 --- /dev/null +++ b/paddle/phi/kernels/xpu/plugin/example/build.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +export XDNN_PATH=/opt/xdnn # +export XRE_PATH=/opt/xre # +export LINK_TYPE=static # shared/static + +rm -rf build +mkdir build +cd build +cmake -DCMAKE_VERBOSE_MAKEFILE=ON .. +make diff --git a/paddle/phi/kernels/xpu/plugin/example/example.cc b/paddle/phi/kernels/xpu/plugin/example/example.cc new file mode 100644 index 00000000000..1a669842d0b --- /dev/null +++ b/paddle/phi/kernels/xpu/plugin/example/example.cc @@ -0,0 +1,71 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/* + * copyright (C) 2022 KUNLUNXIN, Inc + */ + +#include +#include "xpu/plugin.h" +#include "xpu/xdnn.h" + +namespace xdnn = baidu::xpu::api; + +int main() { + int num = 5; + int errcode = 0; + auto ctx = xdnn::create_context(); + float* A = nullptr; + errcode = xpu_malloc(reinterpret_cast(&A), num * sizeof(float)); + assert(errcode == 0); + float* B = nullptr; + errcode = xpu_malloc(reinterpret_cast(&B), num * sizeof(float)); + assert(errcode == 0); + + std::vector A_cpu = {1, 2, 3, 4, 5}; + std::vector B_cpu(num, 0.0f); + std::vector B_ref = {3, 4, 5, 6, 7}; + xpu_memcpy(reinterpret_cast(A), + reinterpret_cast(&(A_cpu[0])), + num * sizeof(float), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + errcode = xdnn::plugin::add2(ctx, A, B, num); + assert(errcode == 0); + xpu_memcpy(reinterpret_cast(&(B_cpu[0])), + reinterpret_cast(B), + num * sizeof(float), + XPUMemcpyKind::XPU_DEVICE_TO_HOST); + printf("A(%p):\n", A); + for (size_t i = 0; i < num; i++) { + printf("%f ", A_cpu[i]); + } + printf("\nB(%p):\n", B); + for (size_t i = 0; i < num; i++) { + printf("%f ", B_cpu[i]); + } + bool pass = true; + for (size_t i = 0; i < num; i++) { + if (fabs(B_cpu[i] - B_ref[i]) > 1e-5f) { + pass = false; + break; + } + } + printf("\nCheck %s! \n", pass ? "pass" : "fail"); + + destroy_context(ctx); + errcode = xpu_free(A); + assert(errcode == 0); + errcode = xpu_free(B); + assert(errcode == 0); + return 0; +} diff --git a/paddle/phi/kernels/xpu/plugin/example/run.sh b/paddle/phi/kernels/xpu/plugin/example/run.sh new file mode 100755 index 00000000000..d3936a51a68 --- /dev/null +++ b/paddle/phi/kernels/xpu/plugin/example/run.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +XDNN_PATH=/opt/xdnn # +XRE_PATH=/opt/xre # + +:<= ncores) { + return; + } + + int thread_id = ncores * cluster_id() + cid; + int nthreads = ncores * cluster_num(); + + const int buf_size = 128; + __simd__ float local_x[buf_size]; + __simd__ float local_y[buf_size]; + + float32x16_t v_x; + float32x16_t v_y; + int len_per_loop = 128; + for (int i = thread_id * len_per_loop; i < len; + i += nthreads * len_per_loop) { + int read_len = min(len_per_loop, len - i); + GM2LM(x + i, local_x, read_len * sizeof(float)); + for (int k = 0; k < read_len; k += 16) { + v_x = vload_lm_float32x16(local_x + k); + v_y = svadd_float32x16(1.0f, v_x); + vstore_lm_float32x16((local_y + k), v_y); + } + mfence(); + LM2GM(local_y, y + i, read_len * sizeof(float)); + } +} + +} // namespace plugin +} // namespace xpu2 diff --git a/paddle/phi/kernels/xpu/plugin/src/linker.specs b/paddle/phi/kernels/xpu/plugin/src/linker.specs new file mode 100644 index 00000000000..55f6f783707 --- /dev/null +++ b/paddle/phi/kernels/xpu/plugin/src/linker.specs @@ -0,0 +1,6 @@ +# overwrite incorrect rpath arguments +# its original value is: +# -rpath $ORIGIN:$ORIGIN/lib:$ORIGIN/lib64:$ORIGIN/../lib:$ORIGIN/../lib64:/opt/compiler/gcc-4.8.2/lib:/opt/compiler/gcc-4.8.2/lib64 +# specify your own rpath if needed. +*linker: +collect2 -rpath $ORIGIN diff --git a/paddle/phi/kernels/xpu/plugin/src/wrapper/add2.cpp b/paddle/phi/kernels/xpu/plugin/src/wrapper/add2.cpp new file mode 100644 index 00000000000..31728befc68 --- /dev/null +++ b/paddle/phi/kernels/xpu/plugin/src/wrapper/add2.cpp @@ -0,0 +1,70 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/* + * copyright (C) 2022 KUNLUNXIN, Inc + */ + +#include "xpu/plugin.h" +#include "xpu/refactor/impl_public/wrapper_check.h" + +namespace xpu2 { +namespace plugin { +__attribute__((global)) void add1(const float* x, float* y, int len); +} +} // namespace xpu2 + +namespace baidu { +namespace xpu { +namespace api { +namespace plugin { + +static int cpu_wrapper(Context* ctx, const float* x, float* y, int len) { + for (int i = 0; i < len; i++) { + y[i] = x[i] + 2.0f; + } + return SUCCESS; +} + +static int xpu2_wrapper(Context* ctx, const float* x, float* y, int len) { + ctx_guard RAII_GUARD(ctx); + float* tensor_one = RAII_GUARD.alloc(len); + WRAPPER_ASSERT_WORKSPACE(ctx, tensor_one); + int ret = constant(ctx, tensor_one, len, 1.0f); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + ret = add(ctx, x, tensor_one, y, len); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + xpu2::plugin::add1<<ncluster(), 64, ctx->xpu_stream>>>(y, y, len); + return api::SUCCESS; +} + +int add2(Context* ctx, const float* x, float* y, int len) { + WRAPPER_CHECK_CTX(ctx); + WRAPPER_DUMP_FUNCTION_T1(ctx, "add2", float); + WRAPPER_DUMP_PARAM3(ctx, x, y, len); + WRAPPER_DUMP(ctx); + WRAPPER_ASSERT_GT(ctx, len, 0); + WRAPPER_CHECK_2PTRS(ctx, float, len, x, y); + if (ctx->dev().type() == api::kCPU) { + return cpu_wrapper(ctx, x, y, len); + } + if (ctx->dev().type() == api::kXPU2) { + return xpu2_wrapper(ctx, x, y, len); + } + return NOT_IMPLEMENT; +} + +} // namespace plugin +} // namespace api +} // namespace xpu +} // namespace baidu -- GitLab