From 6d5d9f23e929f37b0ec83f522d54cc1b4196ec2d Mon Sep 17 00:00:00 2001
From: hong19860320 <9973393+hong19860320@users.noreply.github.com>
Date: Tue, 4 Jul 2023 14:42:20 +0800
Subject: [PATCH] [XPU] Add XPU plugin support (#55101)

* Add XPU plugin to support the customized ops or improve the performance of the fusion ops based on hand-written xpu micro kernels.

* refine README.md
---
 CMakeLists.txt                                |   1 +
 cmake/external/xpu.cmake                      |   6 +
 paddle/phi/CMakeLists.txt                     |   4 +
 paddle/phi/backends/xpu/xpu_header.h          |   3 +
 paddle/phi/kernels/xpu/plugin/CMakeLists.txt  | 379 ++++++++++++++++++
 paddle/phi/kernels/xpu/plugin/README.md       |  21 +
 paddle/phi/kernels/xpu/plugin/build.sh        |  28 ++
 .../kernels/xpu/plugin/example/CMakeLists.txt |  58 +++
 .../phi/kernels/xpu/plugin/example/build.sh   |  27 ++
 .../phi/kernels/xpu/plugin/example/example.cc |  71 ++++
 paddle/phi/kernels/xpu/plugin/example/run.sh  |  49 +++
 .../kernels/xpu/plugin/include/xpu/plugin.h   |  31 ++
 .../xpu/plugin/src/kernel/kunlun2cpp/add1.xpu |  57 +++
 .../phi/kernels/xpu/plugin/src/linker.specs   |   6 +
 .../kernels/xpu/plugin/src/wrapper/add2.cpp   |  70 ++++
 15 files changed, 811 insertions(+)
 create mode 100644 paddle/phi/kernels/xpu/plugin/CMakeLists.txt
 create mode 100644 paddle/phi/kernels/xpu/plugin/README.md
 create mode 100755 paddle/phi/kernels/xpu/plugin/build.sh
 create mode 100644 paddle/phi/kernels/xpu/plugin/example/CMakeLists.txt
 create mode 100755 paddle/phi/kernels/xpu/plugin/example/build.sh
 create mode 100644 paddle/phi/kernels/xpu/plugin/example/example.cc
 create mode 100755 paddle/phi/kernels/xpu/plugin/example/run.sh
 create mode 100644 paddle/phi/kernels/xpu/plugin/include/xpu/plugin.h
 create mode 100644 paddle/phi/kernels/xpu/plugin/src/kernel/kunlun2cpp/add1.xpu
 create mode 100644 paddle/phi/kernels/xpu/plugin/src/linker.specs
 create mode 100644 paddle/phi/kernels/xpu/plugin/src/wrapper/add2.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 795a9321f9f..326d4c684b5 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -56,6 +56,7 @@ option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF)
 option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF)
 option(WITH_XPU_KP "Compile PaddlePaddle with BAIDU XPU compiler " OFF)
 option(WITH_XPU_XFT "Compile PaddlePaddle with BAIDU XPU-XFT" OFF)
+option(WITH_XPU_PLUGIN "Compile PaddlePaddle with BAIDU XPU plugin" OFF)
 option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF)
 option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF)
 option(WITH_IPU "Compile PaddlePaddle with Graphcore IPU" OFF)
diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake
index 8aed9944fc4..45adc981562 100644
--- a/cmake/external/xpu.cmake
+++ b/cmake/external/xpu.cmake
@@ -170,6 +170,12 @@ if(WITH_XPTI)
   set(XPU_XPTI_LIB "${XPU_LIB_DIR}/${XPU_XPTI_LIB_NAME}")
 endif()
 
+if(WITH_XPU_PLUGIN)
+  message(STATUS "Compile with XPU PLUGIN!")
+  add_definitions(-DPADDLE_WITH_XPU_PLUGIN)
+  include_directories(${CMAKE_SOURCE_DIR}/paddle/phi/kernels/xpu/plugin/include)
+endif()
+
 if(WITH_XPU_BKCL AND WITH_XPU_XFT)
   target_link_libraries(xpulib ${XPU_API_LIB} ${XPU_RT_LIB} ${XPU_BKCL_LIB}
                         ${XPU_XFT_LIB})
diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt
index 40e44bae076..1ed3fac1228 100644
--- a/paddle/phi/CMakeLists.txt
+++ b/paddle/phi/CMakeLists.txt
@@ -86,6 +86,10 @@ endif()
 
 if(WITH_XPU)
   list(APPEND PHI_DEPS xpulib)
+  if(WITH_XPU_PLUGIN)
+    add_subdirectory(kernels/xpu/plugin)
+    list(APPEND PHI_DEPS xpuplugin)
+  endif()
 endif()
 
 set(PHI_SRCS
diff --git a/paddle/phi/backends/xpu/xpu_header.h b/paddle/phi/backends/xpu/xpu_header.h
index c787874b22f..36caaf00f5e 100644
--- a/paddle/phi/backends/xpu/xpu_header.h
+++ b/paddle/phi/backends/xpu/xpu_header.h
@@ -24,6 +24,9 @@ limitations under the License. */
 #include "xpu/runtime.h"
 #include "xpu/runtime_ex.h"
 #include "xpu/xdnn.h"
+#ifdef PADDLE_WITH_XPU_PLUGIN
+#include "xpu/plugin.h"
+#endif
 
 namespace xpu = baidu::xpu::api;
 
diff --git a/paddle/phi/kernels/xpu/plugin/CMakeLists.txt b/paddle/phi/kernels/xpu/plugin/CMakeLists.txt
new file mode 100644
index 00000000000..18151fc8e09
--- /dev/null
+++ b/paddle/phi/kernels/xpu/plugin/CMakeLists.txt
@@ -0,0 +1,379 @@
+cmake_minimum_required(VERSION 3.16)
+
+project(xpuplugin LANGUAGES CXX)
+
+if(NOT DEFINED BUILD_STANDALONE)
+  if(NOT DEFINED XPU_INC_DIR)
+    message(
+      FATAL_ERROR
+        "XPU_INC_DIR not set, or directory ${XPU_INC_DIR} not found, please compile with PaddlePaddle."
+    )
+  endif()
+  if(NOT DEFINED XPU_LIB_DIR)
+    message(
+      FATAL_ERROR
+        "XPU_LIB_DIR not set, or directory ${XPU_LIB_DIR} not found, please compile with PaddlePaddle."
+    )
+  endif()
+  set(XDNN_INC_DIR ${XPU_INC_DIR})
+  set(XDNN_LIB_DIR ${XPU_LIB_DIR})
+  set(XRE_INC_DIR ${XPU_INC_DIR})
+  set(XRE_LIB_DIR ${XPU_LIB_DIR})
+  set(XPU_DEPS xpulib) # Depends cmake/external/xpu.cmake
+else()
+  if(NOT DEFINED XDNN_PATH)
+    set(XDNN_PATH $ENV{XDNN_PATH})
+  endif()
+  if(NOT DEFINED XRE_PATH)
+    set(XRE_PATH $ENV{XRE_PATH})
+  endif()
+  if(NOT IS_DIRECTORY ${XDNN_PATH})
+    message(
+      FATAL_ERROR
+        "XDNN_PATH not set, or directory ${XDNN_PATH} not found, please export XDNN_PATH=<path_to_xdnn>."
+    )
+  endif()
+  if(NOT IS_DIRECTORY ${XRE_PATH})
+    message(
+      FATAL_ERROR
+        "XRE_PATH not set, or directory ${XRE_PATH} not found, please export XRE_PATH=<path_to_xre>."
+    )
+  endif()
+  set(XDNN_INC_DIR ${XDNN_PATH}/include)
+  set(XDNN_LIB_DIR ${XDNN_PATH}/so)
+  set(XRE_INC_DIR ${XRE_PATH}/include)
+  set(XRE_LIB_DIR ${XRE_PATH}/so)
+endif()
+
+if(NOT DEFINED CLANG_PATH) # a -DCLANG_PATH=... value wins over the environment
+  set(CLANG_PATH "$ENV{CLANG_PATH}")
+endif()
+if(NOT IS_DIRECTORY "${CLANG_PATH}") # quoted so an empty value is still tested
+  message(
+    FATAL_ERROR
+      "CLANG_PATH not set, or directory ${CLANG_PATH} not found, please export CLANG_PATH=<path_to_xtdk>."
+  )
+endif()
+
+message(STATUS "Build with CLANG_PATH=" ${CLANG_PATH})
+set(XPU_CLANG ${CLANG_PATH}/bin/clang++)
+message(STATUS "Build with XPU_CLANG=" ${XPU_CLANG})
+
+if(NOT DEFINED HOST_SYSROOT)
+  set(HOST_SYSROOT $ENV{HOST_SYSROOT})
+endif()
+if(NOT HOST_SYSROOT)
+  set(HOST_SYSROOT /opt/compiler/gcc-8.2)
+endif()
+if(NOT IS_DIRECTORY ${HOST_SYSROOT})
+  message(
+    FATAL_ERROR
+      "Directory ${HOST_SYSROOT} not found, please export HOST_SYSROOT=<path_to_gcc>."
+  )
+endif()
+
+if(NOT DEFINED HOST_ARCH)
+  set(HOST_ARCH $ENV{HOST_ARCH})
+endif()
+if(NOT HOST_ARCH)
+  set(HOST_ARCH x86_64-baidu-linux-gnu)
+endif()
+
+if(NOT DEFINED TARGET_ARCH)
+  set(TARGET_ARCH $ENV{TARGET_ARCH})
+endif()
+if(NOT TARGET_ARCH)
+  set(TARGET_ARCH x86_64-baidu-linux-gnu)
+endif()
+
+if(NOT DEFINED TOOLCHAIN_ARGS AND NOT "$ENV{TOOLCHAIN_ARGS}" STREQUAL "")
+  separate_arguments(TOOLCHAIN_ARGS UNIX_COMMAND "$ENV{TOOLCHAIN_ARGS}") # -> list
+elseif(NOT TOOLCHAIN_ARGS) # nothing supplied: fall back to the gcc-8.2 defaults
+  set(TOOLCHAIN_ARGS -isystem ${HOST_SYSROOT}/include/c++/8.2.0 -isystem /usr/include/ -isystem /usr/include/x86_64-linux-gnu)
+endif()
+if(HOST_ARCH MATCHES "x86_64")
+  if(TARGET_ARCH MATCHES "x86_64")
+    if(EXISTS ${HOST_SYSROOT}/bin/g++)
+      set(HOST_CXX ${HOST_SYSROOT}/bin/g++)
+      set(HOST_AR ${HOST_SYSROOT}/bin/ar) # always set; used by the archive rule
+      if(NOT EXISTS ${HOST_AR})
+        set(HOST_AR ${HOST_SYSROOT}/bin/gcc-ar) # ar missing: try gcc-ar
+      endif()
+    else()
+      set(HOST_CXX /usr/bin/g++)
+      set(HOST_AR /usr/bin/ar)
+    endif()
+  endif()
+  if(TARGET_ARCH MATCHES "aarch64")
+    set(TOOLCHAIN_ARGS --gcc-toolchain=${HOST_SYSROOT})
+    set(HOST_SYSROOT ${HOST_SYSROOT}/aarch64-linux-gnu/libc)
+    set(HOST_CXX ${CMAKE_CXX_COMPILER})
+    set(HOST_AR ${CMAKE_AR})
+  endif()
+endif()
+if(HOST_ARCH MATCHES "aarch64")
+  if(TARGET_ARCH MATCHES "aarch64")
+    if(EXISTS ${HOST_SYSROOT}/bin/g++)
+      set(HOST_CXX ${HOST_SYSROOT}/bin/g++)
+      set(HOST_AR ${HOST_SYSROOT}/bin/ar)
+    else()
+      set(HOST_CXX /usr/bin/g++)
+      set(HOST_AR /usr/bin/ar)
+    endif()
+  endif()
+endif()
+
+set(OPT_LEVEL "-O2")
+message(STATUS "Build with TARGET_ARCH=" ${TARGET_ARCH})
+message(STATUS "Build with TOOLCHAIN_ARGS=" ${TOOLCHAIN_ARGS})
+message(STATUS "Build with HOST_SYSROOT=" ${HOST_SYSROOT})
+message(STATUS "Build with HOST_CXX=" ${HOST_CXX})
+message(STATUS "Build with HOST_AR=" ${HOST_AR})
+
+# compile_kernel: add a build rule that runs ${XPU_CLANG} on one .xpu source.
+macro(
+  compile_kernel
+  kernel_path # .xpu source file; also the rule's only DEPENDS entry
+  kernel_name # passed to --basename and used to name the two declared outputs
+  xpu_n # value passed to --xpu-arch=
+  rule # extra compiler arguments as one whitespace-separated string
+  device_o_extra_flags # whitespace-separated flags added to the compile command
+  host_o_extra_flags # NOTE(review): split below but never used in the command
+  xpu_n_macro) # preprocessor symbol passed after -D
+  set(arg_rule ${rule})
+  separate_arguments(arg_rule)
+  set(arg_device_o_extra_flags ${device_o_extra_flags})
+  separate_arguments(arg_device_o_extra_flags)
+  set(arg_host_o_extra_flags ${host_o_extra_flags})
+  separate_arguments(arg_host_o_extra_flags)
+  # Declared outputs: <kernel_name>.device.bin.o and <kernel_name>.o.
+  add_custom_command(
+    OUTPUT ${kernel_name}.device.bin.o ${kernel_name}.o
+    COMMAND
+      ${XPU_CLANG} -std=c++11 ${OPT_LEVEL} ${arg_device_o_extra_flags} -c
+      ${kernel_path} -D ${xpu_n_macro} --target=${TARGET_ARCH} ${HOST_XPU_FLAGS}
+      --basename ${kernel_name} -fno-builtin --xpu-arch=${xpu_n} -fPIC
+      -Wno-int-to-void-pointer-cast -Wno-int-to-pointer-cast -Werror -mllvm
+      --xpu-inline-cost -mllvm --xpu-inline-hot-call
+      -I${CMAKE_CURRENT_SOURCE_DIR}/include -I${CMAKE_CURRENT_SOURCE_DIR}/src
+      -I${CMAKE_CURRENT_SOURCE_DIR}/src/kernel
+      -I${CMAKE_CURRENT_SOURCE_DIR}/src/kernel/include ${arg_rule}
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+    DEPENDS ${kernel_path}
+    COMMENT ${kernel_name}.device.bin.o ${kernel_name}.o
+    VERBATIM)
+
+  list(APPEND xpuplugin_kernels_depends ${kernel_name}.device.bin.o
+       ${kernel_name}.o) # a macro, so the list grows in the caller's scope
+endmacro()
+
+macro(
+  __compile_kernel_with_rules # one compile_kernel() call per line of rules_path
+  kernel_path
+  kernel_name
+  xpu_n
+  rules_path # text file; each line is one set of extra compiler arguments
+  device_o_extra_flags
+  host_o_extra_flags
+  xpu_n_macro)
+  file(STRINGS ${rules_path} rules)
+
+  foreach(rule IN LISTS rules)
+    message(STATUS "  Instantiate with '${rule}'")
+    # First 6 hex digits of the rule's MD5 keep the output names unique per rule;
+    # hashed inside CMake, so no bash/md5sum and no shell expansion of the rule.
+    string(MD5 rule_md5 "${rule}")
+    string(SUBSTRING "${rule_md5}" 0 6 rule_md5)
+    set(kernel_name_md5 ${kernel_name}_${rule_md5})
+    compile_kernel(
+      ${kernel_path}
+      ${kernel_name_md5}
+      ${xpu_n}
+      ${rule}
+      ${device_o_extra_flags}
+      ${host_o_extra_flags}
+      ${xpu_n_macro})
+  endforeach()
+endmacro()
+
+macro(
+  compile_kernel_with_rules
+  kernel_path
+  kernel_name
+  xpu_n
+  rules_path
+  device_o_extra_flags
+  host_o_extra_flags
+  xpu_n_macro)
+  # reconfigure if file |rules_path| was modified
+  set_property(
+    DIRECTORY
+    APPEND
+    PROPERTY CMAKE_CONFIGURE_DEPENDS ${rules_path})
+  __compile_kernel_with_rules(
+    ${kernel_path}
+    ${kernel_name}
+    ${xpu_n}
+    ${rules_path}
+    ${device_o_extra_flags}
+    ${host_o_extra_flags}
+    ${xpu_n_macro})
+endmacro()
+
+macro(search_and_compile_kernel xpu_n) # xpu_n: one of xpu1, xpu2, xpu3
+  if("${xpu_n}" STREQUAL "xpu1") # quoted: compare the text, never a variable
+    set(XPU_DEVICE_O_EXTRA_FLAGS " ") # " " not "": must still expand to an arg
+    set(XPU_HOST_O_EXTRA_FLAGS " ")
+    set(XPU_KERNEL_PATH "src/kernel/cpp/*.xpu")
+    set(xpu_n_macro "__XPU1__")
+  elseif("${xpu_n}" STREQUAL "xpu2")
+    set(XPU_DEVICE_O_EXTRA_FLAGS "--xpu-arch=xpu2")
+    set(XPU_HOST_O_EXTRA_FLAGS "--xpu-arch=xpu2")
+    set(XPU_KERNEL_PATH "src/kernel/kunlun2cpp/*.xpu")
+    set(xpu_n_macro "__XPU2__")
+  elseif("${xpu_n}" STREQUAL "xpu3")
+    set(XPU_DEVICE_O_EXTRA_FLAGS "--xpu-arch=xpu3")
+    set(XPU_HOST_O_EXTRA_FLAGS "--xpu-arch=xpu3")
+    set(XPU_KERNEL_PATH "src/kernel/kunlun3cpp/*.xpu")
+    set(xpu_n_macro "__XPU3__")
+  else()
+    message(FATAL_ERROR "Unsupported XPU arch '${xpu_n}', expected xpu1, xpu2 or xpu3.")
+  endif()
+  file(GLOB_RECURSE xpu_kernels ${XPU_KERNEL_PATH})
+  list(LENGTH xpu_kernels xpu_kernels_num)
+  message(STATUS "Found ${xpu_kernels_num} ${xpu_n} kernels")
+
+  foreach(xpu_kernel IN LISTS xpu_kernels)
+    message(STATUS "Process ${xpu_kernel}")
+    get_filename_component(kernel_name ${xpu_kernel} NAME_WE)
+    get_filename_component(kernel_dir ${xpu_kernel} DIRECTORY)
+    set(kernel_rules ${kernel_dir}/${kernel_name}.rules) # optional sibling file
+    set(kernel_name ${xpu_n}_${kernel_name}) # prefix outputs with the arch
+    if(EXISTS ${kernel_rules})
+      compile_kernel_with_rules(
+        ${xpu_kernel}
+        ${kernel_name}
+        ${xpu_n}
+        ${kernel_rules}
+        ${XPU_DEVICE_O_EXTRA_FLAGS}
+        ${XPU_HOST_O_EXTRA_FLAGS}
+        ${xpu_n_macro})
+    else()
+      compile_kernel(
+        ${xpu_kernel}
+        ${kernel_name}
+        ${xpu_n}
+        " "
+        ${XPU_DEVICE_O_EXTRA_FLAGS}
+        ${XPU_HOST_O_EXTRA_FLAGS}
+        ${xpu_n_macro})
+    endif()
+  endforeach()
+endmacro()
+
+# compile xpu kernels
+search_and_compile_kernel("xpu1")
+search_and_compile_kernel("xpu2")
+search_and_compile_kernel("xpu3")
+
+# compile xpu wrappers
+file(GLOB_RECURSE xpu_wrappers src/wrapper/*.cpp)
+list(LENGTH xpu_wrappers xpu_wrappers_num)
+message(STATUS "Found ${xpu_wrappers_num} XPU wrappers")
+
+foreach(xpu_wrapper IN LISTS xpu_wrappers)
+  message(STATUS "Process ${xpu_wrapper}")
+  get_filename_component(wrapper_name ${xpu_wrapper} NAME_WE)
+  set(wrapper_target ${wrapper_name}_wrapper)
+
+  add_custom_target(
+    ${wrapper_target}
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+    DEPENDS wrapper_build/${wrapper_name}.wrapper.d
+            wrapper_build/${wrapper_name}.wrapper.o
+    COMMENT ${wrapper_target}
+    VERBATIM)
+
+  add_custom_command(
+    OUTPUT wrapper_build/${wrapper_name}.wrapper.d
+    COMMAND ${CMAKE_COMMAND} -E make_directory wrapper_build
+    COMMAND
+      ${XPU_CLANG} -M -MQ wrapper_build/${wrapper_name}.wrapper.o -MF
+      wrapper_build/${wrapper_name}.wrapper.d -std=c++11 -x xpu -c
+      ${xpu_wrapper} -I${XDNN_INC_DIR} -I${XRE_INC_DIR}
+      -I${CMAKE_CURRENT_SOURCE_DIR}/include -I${CMAKE_CURRENT_SOURCE_DIR}/src
+      -I${CMAKE_CURRENT_SOURCE_DIR}/src/wrapper -D_GNU_SOURCE
+      -D__STDC_LIMIT_MACROS -DNDEBUG --sysroot=${HOST_SYSROOT} ${TOOLCHAIN_ARGS}
+      --target=${TARGET_ARCH} -fPIC -Werror -Wreorder -fvisibility=hidden
+      --xpu-host-only ${XPU_MF_FLAGS}
+    COMMAND
+      ${CMAKE_COMMAND} -E cmake_depends "Unix Makefiles" ${CMAKE_SOURCE_DIR}
+      ${CMAKE_SOURCE_DIR} ${CMAKE_BINARY_DIR} ${CMAKE_BINARY_DIR}
+      ${CMAKE_BINARY_DIR}/CMakeFiles/${wrapper_target}.dir/DependInfo.cmake
+      --color=$(COLOR)
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+    DEPENDS ${xpu_wrapper} ${XPU_DEPS}
+    COMMENT wrapper_build/${wrapper_name}.wrapper.d
+    VERBATIM)
+
+  add_custom_command(
+    OUTPUT wrapper_build/${wrapper_name}.wrapper.o
+    COMMAND ${CMAKE_COMMAND} -E make_directory wrapper_build
+    COMMAND
+      ${XPU_CLANG} -std=c++11 ${EXTRA_FLAGS} ${OPT_LEVEL} -x xpu -c
+      ${xpu_wrapper} -o wrapper_build/${wrapper_name}.wrapper.o
+      -I${XDNN_INC_DIR} -I${XRE_INC_DIR} -I${CMAKE_CURRENT_SOURCE_DIR}/include
+      -I${CMAKE_CURRENT_SOURCE_DIR}/src
+      -I${CMAKE_CURRENT_SOURCE_DIR}/src/wrapper -D_GNU_SOURCE
+      -D__STDC_LIMIT_MACROS -DNDEBUG --sysroot=${HOST_SYSROOT} ${TOOLCHAIN_ARGS}
+      --target=${TARGET_ARCH} -fPIC -Wunused-variable -Werror -Wreorder
+      -fvisibility=hidden --xpu-host-only ${HOST_XPU_FLAGS}
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+    DEPENDS wrapper_build/${wrapper_name}.wrapper.d
+    COMMENT wrapper_build/${wrapper_name}.wrapper.o
+    VERBATIM)
+  list(APPEND xpuplugin_wrapper_depends wrapper_build/${wrapper_name}.wrapper.o)
+endforeach()
+
+add_custom_command(
+  OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libxpuplugin.a
+  COMMAND ${HOST_AR} rcs ${CMAKE_CURRENT_BINARY_DIR}/libxpuplugin.a
+          ${xpuplugin_kernels_depends} ${xpuplugin_wrapper_depends}
+  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+  DEPENDS ${xpuplugin_kernels_depends} ${xpuplugin_wrapper_depends}
+  COMMENT ${CMAKE_CURRENT_BINARY_DIR}/libxpuplugin.a
+  VERBATIM)
+
+add_custom_target(
+  xpuplugin_a
+  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+  DEPENDS ${xpuplugin_kernels_depends} ${xpuplugin_wrapper_depends}
+          ${CMAKE_CURRENT_BINARY_DIR}/libxpuplugin.a
+  COMMENT xpuplugin_a
+  VERBATIM)
+
+add_custom_target(
+  xpuplugin_so ALL
+  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+  DEPENDS xpuplugin_a ${CMAKE_CURRENT_BINARY_DIR}/libxpuplugin.so
+  COMMENT xpuplugin_so)
+
+add_custom_command(
+  OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libxpuplugin.so
+  COMMAND
+    ${HOST_CXX} -shared -o ${CMAKE_CURRENT_BINARY_DIR}/libxpuplugin.so -Xlinker
+    \"-\(\" -Wl,--whole-archive ${CMAKE_CURRENT_BINARY_DIR}/libxpuplugin.a
+    -Wl,--no-whole-archive -L${XDNN_LIB_DIR} -L${XRE_LIB_DIR} -lxpurt -lxpuapi
+    -Wl,--no-undefined -Wl,-soname,libxpuplugin.so -lstdc++ -ldl -lm -lpthread
+    -specs=${CMAKE_CURRENT_SOURCE_DIR}/src/linker.specs -Xlinker \"-\)\"\;
+  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+  DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libxpuplugin.a
+  COMMENT ${CMAKE_CURRENT_BINARY_DIR}/libxpuplugin.so)
+
+if(NOT DEFINED BUILD_STANDALONE)
+  add_library(xpuplugin STATIC IMPORTED GLOBAL)
+  add_dependencies(xpuplugin xpuplugin_a)
+  set_target_properties(
+    xpuplugin PROPERTIES IMPORTED_LOCATION
+                         ${CMAKE_CURRENT_BINARY_DIR}/libxpuplugin.a)
+endif()
diff --git a/paddle/phi/kernels/xpu/plugin/README.md b/paddle/phi/kernels/xpu/plugin/README.md
new file mode 100644
index 00000000000..e0783d50e6f
--- /dev/null
+++ b/paddle/phi/kernels/xpu/plugin/README.md
@@ -0,0 +1,21 @@
+# XPU PLUGIN
+## Standalone build and test.
+```
+$ cd plugin
+Modify ./build.sh to set the path of XDNN, XRE and XTDK.
+$ ./build.sh
+
+$ cd example
+Modify ./build.sh to set the path of XDNN and XRE.
+$ ./build.sh
+$ ./run.sh
+```
+## Build with PaddlePaddle.
+### Copy to the source code of PaddlePaddle.
+```
+$ cp -rf plugin <path_to_paddle_source_code>/paddle/phi/kernels/xpu
+```
+### Add -DWITH_XPU_PLUGIN=ON as extra cmake arguments.
+```
+$ cmake .. <other_cmake_args> -DWITH_XPU_PLUGIN=ON
+```
diff --git a/paddle/phi/kernels/xpu/plugin/build.sh b/paddle/phi/kernels/xpu/plugin/build.sh
new file mode 100755
index 00000000000..9e446436ab0
--- /dev/null
+++ b/paddle/phi/kernels/xpu/plugin/build.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+
+export XDNN_PATH=/opt/xdnn # <path_to_xdnn>
+export XRE_PATH=/opt/xre # <path_to_xre>
+export CLANG_PATH=/opt/xtdk # <path_to_xtdk>
+export HOST_SYSROOT=/opt/compiler/gcc-8.2 # <path_to_gcc>
+
+rm -rf build
+mkdir build
+cd build
+cmake -DCMAKE_VERBOSE_MAKEFILE=ON -DBUILD_STANDALONE=ON  ..
+make
diff --git a/paddle/phi/kernels/xpu/plugin/example/CMakeLists.txt b/paddle/phi/kernels/xpu/plugin/example/CMakeLists.txt
new file mode 100644
index 00000000000..f9b714ade3a
--- /dev/null
+++ b/paddle/phi/kernels/xpu/plugin/example/CMakeLists.txt
@@ -0,0 +1,58 @@
+cmake_minimum_required(VERSION 3.16)
+
+project(example LANGUAGES CXX)
+
+set(CMAKE_SYSTEM_NAME Linux)
+set(CMAKE_SYSTEM_PROCESSOR x86_64)
+set(CMAKE_C_COMPILER "gcc")
+set(CMAKE_CXX_COMPILER "g++")
+
+if(NOT DEFINED XDNN_PATH)
+  set(XDNN_PATH $ENV{XDNN_PATH})
+endif()
+if(NOT DEFINED XRE_PATH)
+  set(XRE_PATH $ENV{XRE_PATH})
+endif()
+if(NOT IS_DIRECTORY ${XDNN_PATH})
+  message(
+    FATAL_ERROR
+      "XDNN_PATH not set, or directory ${XDNN_PATH} not found, please export XDNN_PATH=<path_to_xdnn>."
+  )
+endif()
+if(NOT IS_DIRECTORY ${XRE_PATH})
+  message(
+    FATAL_ERROR
+      "XRE_PATH not set, or directory ${XRE_PATH} not found, please export XRE_PATH=<path_to_xre>."
+  )
+endif()
+set(XDNN_INC_DIR ${XDNN_PATH}/include)
+set(XDNN_LIB_DIR ${XDNN_PATH}/so)
+set(XRE_INC_DIR ${XRE_PATH}/include)
+set(XRE_LIB_DIR ${XRE_PATH}/so)
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wl,--allow-shlib-undefined")
+
+include_directories(${XDNN_INC_DIR})
+include_directories(${XRE_INC_DIR})
+link_directories(${XDNN_LIB_DIR})
+link_directories(${XRE_LIB_DIR})
+set(DEPS ${DEPS} xpurt xpuapi)
+
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../include)
+if(NOT DEFINED LINK_TYPE)
+  set(LINK_TYPE $ENV{LINK_TYPE})
+endif()
+if(LINK_TYPE STREQUAL "static")
+  set(DEPS ${DEPS} ${CMAKE_CURRENT_SOURCE_DIR}/../build/libxpuplugin.a)
+elseif(LINK_TYPE STREQUAL "shared")
+  link_directories(${CMAKE_CURRENT_SOURCE_DIR}/../build)
+  set(DEPS ${DEPS} xpuplugin)
+else()
+  message(
+    FATAL_ERROR
+      "Unknown LINK_TYPE ${LINK_TYPE}, only supports static or shared.")
+  return()
+endif()
+
+add_executable(example example.cc)
+target_link_libraries(example ${DEPS})
diff --git a/paddle/phi/kernels/xpu/plugin/example/build.sh b/paddle/phi/kernels/xpu/plugin/example/build.sh
new file mode 100755
index 00000000000..e5d74dcafc8
--- /dev/null
+++ b/paddle/phi/kernels/xpu/plugin/example/build.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+
+export XDNN_PATH=/opt/xdnn # <path_to_xdnn>
+export XRE_PATH=/opt/xre # <path_to_xre>
+export LINK_TYPE=static # shared/static
+
+rm -rf build
+mkdir build
+cd build
+cmake -DCMAKE_VERBOSE_MAKEFILE=ON  ..
+make
diff --git a/paddle/phi/kernels/xpu/plugin/example/example.cc b/paddle/phi/kernels/xpu/plugin/example/example.cc
new file mode 100644
index 00000000000..1a669842d0b
--- /dev/null
+++ b/paddle/phi/kernels/xpu/plugin/example/example.cc
@@ -0,0 +1,71 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+/*
+ * copyright (C) 2022 KUNLUNXIN, Inc
+ */
+
+#include <assert.h>
+#include "xpu/plugin.h"
+#include "xpu/xdnn.h"
+
+namespace xdnn = baidu::xpu::api;
+
+int main() {
+  const int num = 5;
+  auto ctx = xdnn::create_context();
+  assert(ctx != nullptr);
+  float* A = nullptr;
+  int errcode = xpu_malloc(reinterpret_cast<void**>(&A), num * sizeof(float));
+  assert(errcode == 0);
+  float* B = nullptr;
+  errcode = xpu_malloc(reinterpret_cast<void**>(&B), num * sizeof(float));
+  assert(errcode == 0);
+
+  std::vector<float> A_cpu = {1, 2, 3, 4, 5};
+  std::vector<float> B_cpu(num, 0.0f);
+  std::vector<float> B_ref = {3, 4, 5, 6, 7};  // expected result: A + 2
+  errcode = xpu_memcpy(reinterpret_cast<void*>(A),
+                       reinterpret_cast<void*>(&(A_cpu[0])),
+                       num * sizeof(float),
+                       XPUMemcpyKind::XPU_HOST_TO_DEVICE);
+  assert(errcode == 0);
+  errcode = xdnn::plugin::add2(ctx, A, B, num);
+  assert(errcode == 0);
+  errcode = xpu_memcpy(reinterpret_cast<void*>(&(B_cpu[0])),
+                       reinterpret_cast<void*>(B),
+                       num * sizeof(float),
+                       XPUMemcpyKind::XPU_DEVICE_TO_HOST);
+  assert(errcode == 0);
+  printf("A(%p):\n", A);
+  for (int i = 0; i < num; i++) {
+    printf("%f ", A_cpu[i]);
+  }
+  printf("\nB(%p):\n", B);
+  for (int i = 0; i < num; i++) {
+    printf("%f ", B_cpu[i]);
+  }
+  bool pass = true;
+  for (int i = 0; i < num && pass; i++) {
+    pass = fabs(B_cpu[i] - B_ref[i]) <= 1e-5f;  // a NaN result also fails
+  }
+  printf("\nCheck %s! \n", pass ? "pass" : "fail");
+
+  xdnn::destroy_context(ctx);
+  errcode = xpu_free(A);
+  assert(errcode == 0);
+  errcode = xpu_free(B);
+  assert(errcode == 0);
+  // Report a failed comparison through the exit status so scripts can see it.
+  return pass ? 0 : 1;
+}
diff --git a/paddle/phi/kernels/xpu/plugin/example/run.sh b/paddle/phi/kernels/xpu/plugin/example/run.sh
new file mode 100755
index 00000000000..d3936a51a68
--- /dev/null
+++ b/paddle/phi/kernels/xpu/plugin/example/run.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+
+XDNN_PATH=/opt/xdnn # <path_to_xdnn>
+XRE_PATH=/opt/xre # <path_to_xre>
+
+:<<!
+export GLOG_v=0
+export XPU_VISIBLE_DEVICES=0;
+export XPUAPI_DEBUG=1;
+export LD_LIBRARY_PATH=$XDNN_PATH/so:$XRE_PATH/so:$LD_LIBRARY_PATH
+
+chmod +x ./build/example
+./build/example
+!
+
+#:<<!
+SSH_IP_ADDR=localhost
+SSH_PORT=9031
+SSH_USR_ID=root
+SSH_USR_PWD=root
+
+WORK_SPACE="/var/tmp/example"
+EXPORT_ENVIRONMENT_VARIABLES="export GLOG_v=0;export XPU_VISIBLE_DEVICES=0;export XPUAPI_DEBUG=1;"
+EXPORT_ENVIRONMENT_VARIABLES="${EXPORT_ENVIRONMENT_VARIABLES}export LD_LIBRARY_PATH=.:\$LD_LIBRARY_PATH;"
+
+sshpass -p $SSH_USR_PWD ssh -v -o ConnectTimeout=60 -o StrictHostKeyChecking=no -p $SSH_PORT $SSH_USR_ID@$SSH_IP_ADDR "rm -rf $WORK_SPACE"
+sshpass -p $SSH_USR_PWD ssh -v -o ConnectTimeout=60 -o StrictHostKeyChecking=no -p $SSH_PORT $SSH_USR_ID@$SSH_IP_ADDR "mkdir -p $WORK_SPACE"
+sshpass -p $SSH_USR_PWD scp -v -r -o ConnectTimeout=60 -o StrictHostKeyChecking=no -P $SSH_PORT $XDNN_PATH/so/* $SSH_USR_ID@$SSH_IP_ADDR:$WORK_SPACE
+sshpass -p $SSH_USR_PWD scp -v -r -o ConnectTimeout=60 -o StrictHostKeyChecking=no -P $SSH_PORT $XRE_PATH/so/* $SSH_USR_ID@$SSH_IP_ADDR:$WORK_SPACE
+sshpass -p $SSH_USR_PWD scp -v -r -o ConnectTimeout=60 -o StrictHostKeyChecking=no -P $SSH_PORT ../build/libxpuplugin.so $SSH_USR_ID@$SSH_IP_ADDR:$WORK_SPACE
+sshpass -p $SSH_USR_PWD scp -v -r -o ConnectTimeout=60 -o StrictHostKeyChecking=no -P $SSH_PORT build/example $SSH_USR_ID@$SSH_IP_ADDR:$WORK_SPACE
+sshpass -p $SSH_USR_PWD ssh -v -o ConnectTimeout=60 -o StrictHostKeyChecking=no -p $SSH_PORT $SSH_USR_ID@$SSH_IP_ADDR "cd $WORK_SPACE; ${EXPORT_ENVIRONMENT_VARIABLES} chmod +x ./example; ./example"
+#!
diff --git a/paddle/phi/kernels/xpu/plugin/include/xpu/plugin.h b/paddle/phi/kernels/xpu/plugin/include/xpu/plugin.h
new file mode 100644
index 00000000000..1aa7a822e81
--- /dev/null
+++ b/paddle/phi/kernels/xpu/plugin/include/xpu/plugin.h
@@ -0,0 +1,31 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+/*
+ * copyright (C) 2022 KUNLUNXIN, Inc
+ */
+
+#pragma once
+#include "xpu/xdnn.h"
+
+namespace baidu {
+namespace xpu {
+namespace api {
+namespace plugin {
+
+DLL_EXPORT int add2(Context* ctx, const float* x, float* y, int len);
+
+}  // namespace plugin
+}  // namespace api
+}  // namespace xpu
+}  // namespace baidu
diff --git a/paddle/phi/kernels/xpu/plugin/src/kernel/kunlun2cpp/add1.xpu b/paddle/phi/kernels/xpu/plugin/src/kernel/kunlun2cpp/add1.xpu
new file mode 100644
index 00000000000..d5149d80975
--- /dev/null
+++ b/paddle/phi/kernels/xpu/plugin/src/kernel/kunlun2cpp/add1.xpu
@@ -0,0 +1,57 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+/*
+ * copyright (C) 2022 KUNLUNXIN, Inc
+ */
+
+#include "xpu/kernel/xtdk.h"
+#include "xpu/kernel/xtdk_math.h"
+#include "xpu/kernel/xtdk_simd.h"
+
+namespace xpu2 {
+namespace plugin {
+
+__global__ void add1(const float* x, float* y, int len) {
+  int cid = core_id();
+  int ncores = core_num();
+  if (cid >= ncores) {
+    return;
+  }
+
+  int thread_id = ncores * cluster_id() + cid;
+  int nthreads = ncores * cluster_num();
+
+  const int buf_size = 128;
+  __simd__ float local_x[buf_size];
+  __simd__ float local_y[buf_size];
+
+  float32x16_t v_x;
+  float32x16_t v_y;
+  int len_per_loop = 128;
+  for (int i = thread_id * len_per_loop; i < len;
+       i += nthreads * len_per_loop) {
+    int read_len = min(len_per_loop, len - i);
+    GM2LM(x + i, local_x, read_len * sizeof(float));
+    for (int k = 0; k < read_len; k += 16) {
+      v_x = vload_lm_float32x16(local_x + k);
+      v_y = svadd_float32x16(1.0f, v_x);
+      vstore_lm_float32x16((local_y + k), v_y);
+    }
+    mfence();
+    LM2GM(local_y, y + i, read_len * sizeof(float));
+  }
+}
+
+}  // namespace plugin
+}  // namespace xpu2
diff --git a/paddle/phi/kernels/xpu/plugin/src/linker.specs b/paddle/phi/kernels/xpu/plugin/src/linker.specs
new file mode 100644
index 00000000000..55f6f783707
--- /dev/null
+++ b/paddle/phi/kernels/xpu/plugin/src/linker.specs
@@ -0,0 +1,6 @@
+# overwrite incorrect rpath arguments
+# its original value is:
+# -rpath $ORIGIN:$ORIGIN/lib:$ORIGIN/lib64:$ORIGIN/../lib:$ORIGIN/../lib64:/opt/compiler/gcc-4.8.2/lib:/opt/compiler/gcc-4.8.2/lib64
+# specify your own rpath if needed.
+*linker:
+collect2 -rpath $ORIGIN
diff --git a/paddle/phi/kernels/xpu/plugin/src/wrapper/add2.cpp b/paddle/phi/kernels/xpu/plugin/src/wrapper/add2.cpp
new file mode 100644
index 00000000000..31728befc68
--- /dev/null
+++ b/paddle/phi/kernels/xpu/plugin/src/wrapper/add2.cpp
@@ -0,0 +1,70 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+/*
+ * copyright (C) 2022 KUNLUNXIN, Inc
+ */
+
+#include "xpu/plugin.h"
+#include "xpu/refactor/impl_public/wrapper_check.h"
+
+namespace xpu2 {
+namespace plugin {
+__attribute__((global)) void add1(const float* x, float* y, int len);
+}
+}  // namespace xpu2
+
+namespace baidu {
+namespace xpu {
+namespace api {
+namespace plugin {
+
+static int cpu_wrapper(Context* ctx, const float* x, float* y, int len) {
+  for (int i = 0; i < len; i++) {
+    y[i] = x[i] + 2.0f;
+  }
+  return SUCCESS;
+}
+
+static int xpu2_wrapper(Context* ctx, const float* x, float* y, int len) {
+  ctx_guard RAII_GUARD(ctx);
+  float* tensor_one = RAII_GUARD.alloc<float>(len);
+  WRAPPER_ASSERT_WORKSPACE(ctx, tensor_one);
+  int ret = constant<float>(ctx, tensor_one, len, 1.0f);
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  ret = add<float>(ctx, x, tensor_one, y, len);
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  xpu2::plugin::add1<<<ctx->ncluster(), 64, ctx->xpu_stream>>>(y, y, len);
+  return api::SUCCESS;
+}
+
+int add2(Context* ctx, const float* x, float* y, int len) {
+  WRAPPER_CHECK_CTX(ctx);
+  WRAPPER_DUMP_FUNCTION_T1(ctx, "add2", float);
+  WRAPPER_DUMP_PARAM3(ctx, x, y, len);
+  WRAPPER_DUMP(ctx);
+  WRAPPER_ASSERT_GT(ctx, len, 0);
+  WRAPPER_CHECK_2PTRS(ctx, float, len, x, y);
+  if (ctx->dev().type() == api::kCPU) {
+    return cpu_wrapper(ctx, x, y, len);
+  }
+  if (ctx->dev().type() == api::kXPU2) {
+    return xpu2_wrapper(ctx, x, y, len);
+  }
+  return NOT_IMPLEMENT;
+}
+
+}  // namespace plugin
+}  // namespace api
+}  // namespace xpu
+}  // namespace baidu
-- 
GitLab