diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index df2e59b7647bf0231362a4220e8610f50243f1c5..2684529930e7ce2b1dba0bbfb3fb95968e0eadc7 100755
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -25,7 +25,7 @@ repos:
         description: Format files with ClangFormat.
         entry: bash ./tools/codestyle/clang_format.hook -i
         language: system
-        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
+        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|xpu|kps)$
 -   repo: local
     hooks:
     -   id: cpplint-cpp-source
@@ -48,7 +48,7 @@ repos:
         name: copyright_checker
         entry: python ./tools/codestyle/copyright.hook
         language: system
-        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py|sh)$
+        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|xpu|kps|py|sh)$
         exclude: |
             (?x)^(
                 paddle/utils/.*
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index 8469dc4c02ee37b333254d6d35b0eb48354d4b86..8843dd2628767e8cac167db0ff115d0b63aca53a 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -125,6 +125,9 @@ function(op_library TARGET)
       if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.xpu)
         list(APPEND xpu_kp_cc_srcs ${TARGET}.xpu)
       endif()
+      if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.kps)
+        list(APPEND xpu_kp_cc_srcs ${TARGET}.kps)
+      endif()
     endif()
     if(WITH_ASCEND_CL)
       string(REPLACE "_op" "_op_npu" NPU_FILE "${TARGET}")
@@ -162,6 +165,8 @@ function(op_library TARGET)
         list(APPEND xpu_cc_srcs ${src})
       elseif(WITH_XPU_KP AND ${src} MATCHES ".*\\.xpu$")
         list(APPEND xpu_kp_cc_srcs ${src})
+      elseif(WITH_XPU_KP AND ${src} MATCHES ".*\\.kps$")
+        list(APPEND xpu_kp_cc_srcs ${src})
       elseif(WITH_ASCEND_CL AND ${src} MATCHES ".*_op_npu.cc$")
         list(APPEND npu_cc_srcs ${src})
       elseif(WITH_MLU AND ${src} MATCHES ".*_op_mlu.cc$")
@@ -384,7 +389,15 @@ function(op_library TARGET)

   # pybind USE_OP_DEVICE_KERNEL for XPU KP
   if (WITH_XPU_KP AND ${xpu_kp_cc_srcs_len} GREATER 0)
-    file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, KP);\n")
+    foreach(xpu_kp_src ${xpu_kp_cc_srcs})
+      set(op_name "")
+      find_register(${xpu_kp_src} "REGISTER_OP_KERNEL" op_name)
+      if(NOT ${op_name} EQUAL "")
+        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, KP);\n")
+        message(STATUS "Building KP Target: ${op_name}")
+        set(pybind_flag 1)
+      endif()
+    endforeach()
   endif()

   # pybind USE_OP_DEVICE_KERNEL for NPU
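Note: op_library previously emitted a single USE_OP_DEVICE_KERNEL(${TARGET}, KP) line per target. The foreach/find_register loop above instead scans each .xpu/.kps source for REGISTER_OP_KERNEL and emits one declaration per op actually registered. For the elementwise_add_op.kps file added later in this patch, the fragment appended to the generated pybind file would read (illustrative output, derived from the file(APPEND ...) line above):

    USE_OP_DEVICE_KERNEL(elementwise_add, KP);
    USE_OP_DEVICE_KERNEL(elementwise_add_grad, KP);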
diff --git a/cmake/xpu_kp.cmake b/cmake/xpu_kp.cmake
index f8ab9693db0c9ebe845ae7d77a562fd005f5130d..adab3e1423c91522092dac5503d8c58dcc8370db 100644
--- a/cmake/xpu_kp.cmake
+++ b/cmake/xpu_kp.cmake
@@ -17,7 +17,7 @@ if(NOT WITH_XPU_KP)
 endif()

 if(NOT XPU_TOOLCHAIN)
-  set(XPU_TOOLCHAIN /workspace/paddle/xpu-demo/XTDK)
+  set(XPU_TOOLCHAIN /workspace/output/XTDK-ubuntu_x86_64)
   get_filename_component(XPU_TOOLCHAIN ${XPU_TOOLCHAIN} REALPATH)
 endif()
 if(NOT IS_DIRECTORY ${XPU_TOOLCHAIN})
@@ -102,7 +102,7 @@ macro(compile_kernel COMPILE_ARGS)
   set(XTDK_DIR ${XPU_TOOLCHAIN})
   set(CXX_DIR ${HOST_SYSROOT})

-  set(XPU_CXX_FLAGS -Wno-error=pessimizing-move -Wno-error=constant-conversion -Wno-error=c++11-narrowing -Wno-error=shift-count-overflow -Wno-error=unused-local-typedef -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer -Wall -Wno-inconsistent-missing-override -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes -Wno-error=int-in-bool-context -Wno-error=parentheses -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -DNDEBUG )
+  set(XPU_CXX_FLAGS -fforce-enable-int128 -Wno-error=pessimizing-move -Wno-error=constant-conversion -Wno-error=c++11-narrowing -Wno-error=shift-count-overflow -Wno-error=unused-local-typedef -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer -Wall -Wno-inconsistent-missing-override -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes -Wno-error=int-in-bool-context -Wno-error=parentheses -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -DNDEBUG )

   #include path
   get_property(dirs DIRECTORY ${CMAKE_SOURCE_DIR} PROPERTY INCLUDE_DIRECTORIES)
@@ -127,9 +127,11 @@ macro(compile_kernel COMPILE_ARGS)
       kernel_build/${kernel_name}.bin.o
     COMMAND
       ${CMAKE_COMMAND} -E make_directory kernel_build
+    COMMAND
+      cp ${kernel_path}/${kernel_name}.kps kernel_build/${kernel_name}.xpu
     COMMAND
       ${XPU_CLANG} --sysroot=${CXX_DIR} -std=c++11 -D_GLIBCXX_USE_CXX11_ABI=1 ${OPT_LEVEL} -fno-builtin -mcpu=xpu2 -fPIC ${XPU_CXX_DEFINES} ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES}
-      -I. -o kernel_build/${kernel_name}.bin.o.sec ${kernel_path}/${kernel_name}.xpu
+      -I. -o kernel_build/${kernel_name}.bin.o.sec kernel_build/${kernel_name}.xpu
       --xpu-device-only -c -v
     COMMAND
       ${XTDK_DIR}/bin/xpu2-elfconv kernel_build/${kernel_name}.bin.o.sec kernel_build/${kernel_name}.bin.o ${XPU_CLANG} --sysroot=${CXX_DIR}
@@ -148,9 +150,11 @@ macro(compile_kernel COMPILE_ARGS)
       kernel_build/${kernel_name}.host.o
     COMMAND
       ${CMAKE_COMMAND} -E make_directory kernel_build
+    COMMAND
+      cp ${kernel_path}/${kernel_name}.kps kernel_build/${kernel_name}.xpu
     COMMAND
       ${XPU_CLANG} --sysroot=${CXX_DIR} -std=c++11 -D_GLIBCXX_USE_CXX11_ABI=1 ${OPT_LEVEL} -fno-builtin -mcpu=xpu2 -fPIC ${XPU_CXX_DEFINES} ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES}
-      -I. -o kernel_build/${kernel_name}.host.o ${kernel_path}/${kernel_name}.xpu
+      -I. -o kernel_build/${kernel_name}.host.o kernel_build/${kernel_name}.xpu
       --xpu-host-only -c -v
     WORKING_DIRECTORY
       ${CMAKE_CURRENT_BINARY_DIR}
@@ -185,7 +189,7 @@ macro(xpu_add_library TARGET_NAME)
   # Distinguish .xpu file from other files
   foreach(cur_xpu_src IN LISTS xpu_srcs_lists)
     get_filename_component(language_type_name ${cur_xpu_src} EXT)
-    if(${language_type_name} STREQUAL ".xpu")
+    if(${language_type_name} STREQUAL ".kps")
       list(APPEND xpu_kernel_lists ${cur_xpu_src})
     else()
       list(APPEND cc_kernel_lists ${cur_xpu_src})
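Note: compile_kernel now copies foo.kps to kernel_build/foo.xpu and compiles that copy twice, once with --xpu-device-only (then xpu2-elfconv packs the device binary) and once with --xpu-host-only (the host-side object). A minimal C++ sketch of how one .kps translation unit can branch per pass; it assumes the XPU clang driver defines __xpu__ in both passes and __xpu_on_host__ only in the host pass (both macros appear in guards elsewhere in this patch, but the exact driver behavior is an assumption here):

    // sketch.kps (hypothetical), compiled twice by the build rules above
    #if defined(__xpu__) && !defined(__xpu_on_host__)
    // device pass: this branch is compiled for the XPU2 core
    #else
    // host pass: this branch lands in the host object that launches the kernel
    #endif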
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.kps b/paddle/fluid/operators/elementwise/elementwise_add_op.kps
new file mode 100644
index 0000000000000000000000000000000000000000..a3fea0d7b3dbf91cbe19c299edea3ffee77d3cbe
--- /dev/null
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.kps
@@ -0,0 +1,188 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+// Please do not modify the following code
+#if defined(__CUDA_ARCH__)
+#undef __CUDA_ARCH__
+#endif
+
+#if defined(__CUDACC__)
+#undef __CUDACC__
+#endif
+
+#if defined(__CUDA__)
+#undef __CUDA__
+#endif
+
+#if defined(__NVCC__)
+#undef __NVCC__
+#endif
+
+#ifdef PADDLE_WITH_XPU_KP
+#include <xpu/runtime.h>                // NOLINT
+#include "xpu/kernel/cluster_header.h"  // NOLINT
+#include "xpu/kernel/debug.h"           // NOLINT
+#include "xpu/kernel/math.h"            // NOLINT
+
+#include <memory>
+#include <string>
+#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
+#include "paddle/fluid/operators/elementwise/elementwise_xpu.h"
+#include "paddle/fluid/platform/device/device_wrapper.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class ElementwiseAddXPUKPKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    std::vector<const framework::Tensor*> ins;
+    std::vector<framework::Tensor*> outs;
+    int axis = PackTensorsIntoVector<T>(ctx, &ins, &outs);
+    const auto& xpu_ctx =
+        ctx.template device_context<paddle::platform::XPUDeviceContext>();
+    paddle::operators::LaunchElementwiseCudaKernel<ElementwiseType::kBinary,
+                                                   T, T, kps::AddFunctor<T>, 1>(
+        xpu_ctx, ins, &outs, axis, kps::AddFunctor<T>());
+  }
+};
+
+static std::vector<int> get_rdims(const std::vector<int>& xdims,
+                                  const std::vector<int>& ydims) {
+  std::vector<int> rdims;
+  for (size_t i = 0; i < xdims.size(); i++) {
+    if (xdims[i] != ydims[i]) {
+      rdims.push_back(i);
+    }
+  }
+  return rdims;
+}
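+
+// Example: get_rdims({1, 3, 4}, {2, 3, 4}) returns {0}. Dimension 0 of the
+// input was broadcast in the forward pass, so the gradient kernel below must
+// reduce_sum the out-grad over that dimension to recover the input's shape.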
+
+template <typename T>
+class ElementwiseAddGradXPUKPKernel : public ElemwiseGradKernel<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElemwiseGradKernel<T>::Compute(ctx);
+    auto* x = ctx.Input<framework::Tensor>("X");
+    auto* y = ctx.Input<framework::Tensor>("Y");
+    auto* dz = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* dx = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto* dy = ctx.Output<framework::Tensor>(framework::GradVarName("Y"));
+    const framework::DDim& x_dims = x->dims();
+    const framework::DDim& y_dims = y->dims();
+    const framework::DDim& dz_dims = dz->dims();
+    int axis = ctx.Attr<int>("axis");
+    axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
+    int max_dim = std::max(x_dims.size(), y_dims.size());
+    PADDLE_ENFORCE_GE(
+        axis, 0,
+        platform::errors::InvalidArgument(
+            "Axis should be greater than or equal to 0, but received axis is %d.",
+            axis));
+    PADDLE_ENFORCE_LT(
+        axis, max_dim,
+        platform::errors::InvalidArgument(
+            "Axis should be less than %d, but received axis is %d.", max_dim,
+            axis));
+
+    std::vector<int> x_dims_vec(max_dim, 1);
+    std::vector<int> y_dims_vec(max_dim, 1);
+    std::vector<int> z_dims_vec(max_dim, 1);
+    if (x_dims.size() == max_dim) {
+      for (int i = 0; i < max_dim; i++) {
+        x_dims_vec[i] = x_dims[i];
+      }
+    } else {
+      for (int i = 0; i < x_dims.size(); i++) {
+        x_dims_vec[i + axis] = x_dims[i];
+      }
+    }
+
+    if (y_dims.size() == max_dim) {
+      for (int i = 0; i < max_dim; i++) {
+        y_dims_vec[i] = y_dims[i];
+      }
+    } else {
+      for (int i = 0; i < y_dims.size(); i++) {
+        y_dims_vec[i + axis] = y_dims[i];
+      }
+    }
+
+    for (int i = 0; i < max_dim; i++) {
+      z_dims_vec[i] = dz_dims[i];
+    }
+    std::vector<int> rdims_for_x;
+    std::vector<int> rdims_for_y;
+    rdims_for_x = get_rdims(x_dims_vec, z_dims_vec);
+    rdims_for_y = get_rdims(y_dims_vec, z_dims_vec);
+    const T* dz_data = dz->data<T>();
+    auto& dev_ctx =
+        ctx.template device_context<paddle::platform::XPUDeviceContext>();
+
+    if (dx != nullptr) {
+      T* dx_data = dx->mutable_data<T>(ctx.GetPlace());
+      if (rdims_for_x.size() == 0) {
+        if (dx_data != dz_data) {
+          framework::TensorCopy(
+              *dz, ctx.GetPlace(),
+              ctx.template device_context<platform::DeviceContext>(), dx);
+        }
+      } else {
+        // For inplace strategy, dx will be stored in addr of dz, which makes
+        // the result of dy wrong.
+        if (dx->IsSharedBufferWith(*dz)) {
+          dx->clear();
+          dx->mutable_data<T>(x->dims(), ctx.GetPlace());
+        }
+
+        int ret = xpu::reduce_sum<XPUType>(
+            dev_ctx.x_context(), reinterpret_cast<const XPUType*>(dz_data),
+            reinterpret_cast<XPUType*>(dx_data), z_dims_vec, rdims_for_x);
+        PADDLE_ENFORCE_XDNN_SUCCESS(ret, "reduce_sum");
+      }
+    }
+
+    if (dy != nullptr) {
+      T* dy_data = dy->mutable_data<T>(ctx.GetPlace());
+      if (rdims_for_y.size() == 0) {
+        if (dy_data != dz_data) {
+          framework::TensorCopy(
+              *dz, ctx.GetPlace(),
+              ctx.template device_context<platform::DeviceContext>(), dy);
+        }
+      } else {
+        int ret = xpu::reduce_sum<XPUType>(
+            dev_ctx.x_context(), reinterpret_cast<const XPUType*>(dz_data),
+            reinterpret_cast<XPUType*>(dy_data), z_dims_vec, rdims_for_y);
+        PADDLE_ENFORCE_XDNN_SUCCESS(ret, "reduce_sum");
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_KERNEL(elementwise_add, KP, plat::XPUPlace,
+                   ops::ElementwiseAddXPUKPKernel<float>);
+
+REGISTER_OP_KERNEL(elementwise_add_grad, KP, plat::XPUPlace,
+                   ops::ElementwiseAddGradXPUKPKernel<float>);
+
+#endif  // PADDLE_WITH_XPU_KP
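Note: the grad kernel pads the lower-rank operand's shape with ones starting at axis, then uses get_rdims to find which output dimensions must be reduce-summed. A standalone C++ sketch of that alignment logic (ReduceDimsFor is an illustrative name, not a Paddle API):

    #include <vector>

    // Pad `small` to the output rank starting at `axis`, then collect the
    // output dimensions that were broadcast and therefore need a reduce_sum.
    std::vector<int> ReduceDimsFor(const std::vector<int>& small,
                                   const std::vector<int>& out, int axis) {
      std::vector<int> aligned(out.size(), 1);
      for (size_t i = 0; i < small.size(); ++i) aligned[i + axis] = small[i];
      std::vector<int> rdims;
      for (size_t i = 0; i < out.size(); ++i) {
        if (aligned[i] != out[i]) rdims.push_back(static_cast<int>(i));
      }
      return rdims;  // e.g. out = {2,3,4,5}, small = {3,4}, axis = 1 -> {0, 3}
    }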
*/ #include "paddle/fluid/platform/device/device_ext.h" #include "paddle/fluid/platform/device/stream.h" + +#if !defined(PADDLE_WITH_XPU_KP) || defined(__xpu_on_host__) #include "unsupported/Eigen/CXX11/Tensor" +#endif namespace Eigen { struct DefaultDevice; diff --git a/paddle/phi/core/hostdevice.h b/paddle/phi/core/hostdevice.h index 08fe3125287d76654173324e42a2d0773aab444c..0869df143235fcd937d75e7dba908c4efbd7ee95 100644 --- a/paddle/phi/core/hostdevice.h +++ b/paddle/phi/core/hostdevice.h @@ -18,14 +18,14 @@ #include #endif -#ifdef __xpu_kp__ +#if defined(__xpu__) #include #include "xpu/kernel/cluster_header.h" #include "xpu/kernel/debug.h" #include "xpu/kernel/math.h" #endif -#if (defined(__CUDACC__) || defined(__HIPCC__) || defined(__xpu_kp__)) +#if (defined(__CUDACC__) || defined(__HIPCC__) || defined(__xpu__)) #define HOSTDEVICE __host__ __device__ #define DEVICE __device__ #define HOST __host__ diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index be57b8630f89578e8de48f6dc581cb6fc37a1048..84a36b849afa1c4cdcc1a0f4d4ada598944a1faa 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/elementwise_base.h" -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) namespace kps = phi::kps; @@ -122,7 +122,7 @@ struct DimensionsTransform { explicit DimensionsTransform(const std::vector &ins, const phi::DDim &dims, int axis) { - const int N = max(static_cast(ins.size()), 2); + const int N = std::max(static_cast(ins.size()), 2); dim_size = dims.size(); out_dims = phi::vectorize(dims); in_dims.resize(N); @@ -183,7 +183,7 @@ struct DimensionsTransform { } }; -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) template __device__ __forceinline__ void LoadData( @@ -268,7 +268,7 @@ __global__ void VectorizedBroadcastKernel( int block_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize; int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize; -#ifdef PADDLE_WITH_XPU2 +#ifdef PADDLE_WITH_XPU_KP for (; block_offset < main_offset; block_offset += stride) { VectorizedBroadcastKernelImpl outs_data; for (int i = 0; i < NumOuts; ++i) { - outs_data[i] = ctx.Alloc((*outs)[i]); + outs_data[i] = (_ptr_ OutT *)(ctx.Alloc((*outs)[i])); } for (int i = 0; i < Arity; i++) { use_broadcast[i] = (ins[i]->numel() != numel); - ins_data[i] = (_ptr_ InT *)(ins[i]->data()); + ins_data[i] = (const _ptr_ InT *)(ins[i]->data()); if (use_broadcast[i]) { // get the broadcast config, // if data shape is[m, n], then you should set data_dim = {n, m} @@ -363,7 +363,7 @@ void LaunchBroadcastKernel(const KPDevice &ctx, } } -#ifdef PADDLE_WITH_XPU2 +#ifdef PADDLE_WITH_XPU_KP const int threads = 64; const int blocks = 8; int main_offset = (numel / (VecSize * threads)) * VecSize * threads; diff --git a/paddle/phi/kernels/funcs/eigen/extensions.h b/paddle/phi/kernels/funcs/eigen/extensions.h index 5fc8f76d988d1449cc41a89a1740ffeb9a3b05df..fbb9d8e3d2ef552750fc98d10a63d230661adf49 100644 --- a/paddle/phi/kernels/funcs/eigen/extensions.h +++ b/paddle/phi/kernels/funcs/eigen/extensions.h @@ -14,6 +14,8 @@ #pragma once +#ifndef __xpu__ + #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/common/float16.h" @@ -435,3 +437,5 @@ HOSTDEVICE inline float16 maxi(const float16& a, const float16& b) { } // 
diff --git a/paddle/phi/kernels/funcs/eigen/extensions.h b/paddle/phi/kernels/funcs/eigen/extensions.h
index 5fc8f76d988d1449cc41a89a1740ffeb9a3b05df..fbb9d8e3d2ef552750fc98d10a63d230661adf49 100644
--- a/paddle/phi/kernels/funcs/eigen/extensions.h
+++ b/paddle/phi/kernels/funcs/eigen/extensions.h
@@ -14,6 +14,8 @@

 #pragma once

+#ifndef __xpu__
+
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/complex.h"
 #include "paddle/phi/common/float16.h"
@@ -435,3 +437,5 @@ HOSTDEVICE inline float16 maxi(const float16& a, const float16& b) {
 }  // namespace numext

 }  // namespace Eigen
+
+#endif  // __xpu__
diff --git a/paddle/phi/kernels/funcs/elementwise_base.h b/paddle/phi/kernels/funcs/elementwise_base.h
index 9a429dfaaf957785ab0108fe19ff63244659df11..47f1593a11eb9e29cc90b7db36650826734ac27f 100644
--- a/paddle/phi/kernels/funcs/elementwise_base.h
+++ b/paddle/phi/kernels/funcs/elementwise_base.h
@@ -21,12 +21,13 @@ limitations under the License. */
 #include "paddle/phi/kernels/empty_kernel.h"
 #include "paddle/phi/kernels/funcs/math_function.h"

-#if defined(__NVCC__) || defined(__HIPCC__)
+#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__)

 #include "paddle/fluid/platform/aligned_vector.h"
 #include "paddle/fluid/platform/function_traits.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/kernels/primitive/kernel_primitives.h"

+#define HOSTDEVICE __host__ __device__
 namespace kps = phi::kps;
 #endif
@@ -436,7 +437,7 @@ inline void ElementwiseGradPreProcess(const DenseTensor &dout,
   }
 }

-#if defined(__NVCC__) || defined(__HIPCC__)
+#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__)

 // static unroller
 template
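With these guards, HOSTDEVICE expands to __host__ __device__ under NVCC, HIPCC, and the XPU KP compiler alike, so one functor definition can serve all three backends. In rough shape, the kps::AddFunctor used by the new kernel would look like this (a sketch under that assumption, not the verbatim Paddle definition):

    template <typename T>
    struct AddFunctor {
      inline HOSTDEVICE T operator()(const T a, const T b) const {
        return a + b;
      }
    };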