From 3eaf8d2cead9fc3d7b82c5c928c331917ea687b6 Mon Sep 17 00:00:00 2001
From: niuliling123 <51102941+niuliling123@users.noreply.github.com>
Date: Tue, 11 Jan 2022 19:49:01 +0800
Subject: [PATCH] Modified Kernel Primitive API and elementwise for xpu2 #38688

---
 .../elementwise/elementwise_op_broadcast.cu.h |   8 +-
 .../elementwise/elementwise_op_impl.cu.h      |   3 +-
 .../datamover_primitives_xpu2.h               | 172 +++++++++---------
 .../kernel_primitives/kernel_primitives.h     |  15 +-
 paddle/fluid/platform/hostdevice.h            |   9 +-
 paddle/pten/kernels/gpu/elementwise.h         | 104 +++++------
 6 files changed, 164 insertions(+), 147 deletions(-)

diff --git a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h
index 25c983566b3..e3d4607b713 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h
@@ -25,8 +25,7 @@ namespace kps = paddle::operators::kernel_primitives;
 template <ElementwiseType ET, typename InT, typename OutT, typename Functor,
           int NumOuts = 1>
 void LaunchBroadcastElementwiseCudaKernel(
-    const platform::CUDADeviceContext &ctx,
-    const std::vector<const framework::Tensor *> &ins,
+    const KPDevice &ctx, const std::vector<const framework::Tensor *> &ins,
     std::vector<framework::Tensor *> *outs, int axis, Functor func) {
   std::vector<const pten::DenseTensor *> pt_inputs;
   std::vector<pten::DenseTensor *> pt_outputs;
@@ -58,8 +57,7 @@ void LaunchBroadcastElementwiseCudaKernel(
 template <ElementwiseType ET, typename InT, typename OutT, typename Functor,
           int NumOuts = 1>
 void LaunchElementwiseCudaKernel(
-    const platform::CUDADeviceContext &cuda_ctx,
-    const std::vector<const framework::Tensor *> &ins,
+    const KPDevice &ctx, const std::vector<const framework::Tensor *> &ins,
     std::vector<framework::Tensor *> *outs, int axis, Functor func) {
   std::vector<const pten::DenseTensor *> pt_inputs;
   std::vector<pten::DenseTensor *> pt_outputs;
@@ -85,7 +83,7 @@ void LaunchElementwiseCudaKernel(
     pt_outputs.push_back(pt_outputs_tmp[i].get());
   }
   pten::LaunchElementwiseCudaKernel<ET, InT, OutT, Functor, NumOuts>(
-      cuda_ctx, pt_inputs, &pt_outputs, axis, func);
+      ctx, pt_inputs, &pt_outputs, axis, func);
 }

 }  // namespace operators

diff --git a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h
index 1d8acd5eca5..36ff1ae254d 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h
@@ -35,8 +35,7 @@ using ElementwiseType = pten::ElementwiseType;
 template <ElementwiseType ET, typename InT, typename OutT, typename Functor,
           int NumOuts = 1>
 void LaunchSameDimsElementwiseCudaKernel(
-    const platform::CUDADeviceContext &ctx,
-    const std::vector<const framework::Tensor *> &ins,
+    const KPDevice &ctx, const std::vector<const framework::Tensor *> &ins,
     std::vector<framework::Tensor *> *outs, Functor func) {
   std::vector<const pten::DenseTensor *> pt_inputs;
   std::vector<pten::DenseTensor *> pt_outputs;
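The two fluid wrappers above only repackage `framework::Tensor` objects as `pten::DenseTensor` and forward a caller-supplied functor, so after this patch a single call site serves both CUDA and XPU2 builds through the `KPDevice` alias. A minimal host-only sketch of the functor contract these entry points expect; `AddFunctor` and the commented call are illustrative, not part of the patch:

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

#ifndef HOSTDEVICE  // host-only stand-in so this sketch builds without nvcc
#define HOSTDEVICE
#endif

template <typename T>
struct AddFunctor {
  HOSTDEVICE T operator()(const T a, const T b) const { return a + b; }
};

int main() {
  std::vector<float> x = {1, 2, 3}, y = {4, 5, 6}, z(3);
  // Host model of what the kernel does per element; in Paddle the same
  // functor object would be passed as `Functor func` to, e.g.,
  // LaunchElementwiseCudaKernel(ctx, ins, &outs, /*axis=*/-1, AddFunctor<T>()).
  std::transform(x.begin(), x.end(), y.begin(), z.begin(), AddFunctor<float>());
  std::printf("%g %g %g\n", z[0], z[1], z[2]);  // prints: 5 7 9
}
```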
diff --git a/paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h b/paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h
index b27ba27b3c6..33389953589 100644
--- a/paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h
+++ b/paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h
@@ -32,42 +32,50 @@ struct alignas(sizeof(T) * VecSize) VectorType {
 * index of the output data. if input or output shape is [dim0, dim1] then dims
 * must be [dim1, dim0].
 */
+#pragma pack(4)
 template <int kDims>
 struct BroadcastConfig {
-  uint32_t stride_in[framework::DDim::kMaxRank];
-  uint32_t stride_out[framework::DDim::kMaxRank];
-  uint32_t shape_in[framework::DDim::kMaxRank];
+  int strides_in[framework::DDim::kMaxRank];
+  int strides_out[framework::DDim::kMaxRank];
+  int in_dim[framework::DDim::kMaxRank];

   HOSTDEVICE BroadcastConfig() {}

   HOSTDEVICE BroadcastConfig(const std::vector<int64_t>& out_dims,
                              const std::vector<int64_t>& in_dims,
                              int dim_size) {
-    std::vector<uint32_t> strides_in;
-    std::vector<uint32_t> strides_out;
-    std::vector<uint32_t> shapes_in;
-
-    strides_out.resize(dim_size, 1);
-    strides_in.resize(dim_size, 1);
-    shapes_in.resize(dim_size, 1);
-
-    for (int i = 0; i < dim_size; ++i) {
-      shape_in[i] = in_dims[dim_size - i - 1];
+    std::vector<int> strides_in_tmp;
+    std::vector<int> strides_out_tmp;
+    std::vector<int> dim_tmp;
+    strides_in_tmp.resize(dim_size, 1);
+    strides_out_tmp.resize(dim_size, 1);
+    dim_tmp.resize(dim_size, 1);
+    for (int i = 1; i < dim_size; i++) {
+      strides_in_tmp[i] = strides_in_tmp[i - 1] * in_dims[i - 1];
+      strides_out_tmp[i] = strides_out_tmp[i - 1] * out_dims[i - 1];
     }
-    for (int i = 1; i < dim_size - 1; ++i) {
-      strides_out[dim_size - i - 1] = std::accumulate(
-          out_dims.begin(), out_dims.begin() + i, 1, std::multiplies<int64_t>());
-      strides_in[dim_size - i - 1] =
-          std::accumulate(in_dims.begin(), in_dims.begin() + i, 1,
-                          std::multiplies<int64_t>());
+    for (int i = 0; i < dim_size; i++) {
+      dim_tmp[i] = in_dims[i];
     }
-    memcpy(stride_in, strides_in.data(), kDims * sizeof(uint32_t));
-    memcpy(stride_out, strides_out.data(), kDims * sizeof(uint32_t));
-    memcpy(shape_in, shapes_in.data(), kDims * sizeof(uint32_t));
+    memcpy(strides_in, strides_in_tmp.data(), kDims * sizeof(int));
+    memcpy(strides_out, strides_out_tmp.data(), kDims * sizeof(int));
+    memcpy(in_dim, dim_tmp.data(), kDims * sizeof(int));
+  }
+
+  __device__ inline int operator()(int index_output) const {
+    int index_src = 0;
+#pragma unroll
+    for (int i = kDims - 1; i >= 0; --i) {
+      int tmp_index = (index_output / strides_out[i]);
+      index_output = index_output - tmp_index * strides_out[i];
+      index_src += (tmp_index % in_dim[i]) * strides_in[i];
+    }
+    return index_src;
   }
 };
+#pragma pack()

 }  // namespace details
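`BroadcastConfig` now precomputes row-major strides once on the host and maps each output index back to its source index on the device through `operator()`. A standalone host model of that mapping; `SrcIndex` is a hypothetical name, and dims are fastest-varying first, as the comment above the struct requires:

```cpp
#include <cstdio>
#include <vector>

// Mirrors BroadcastConfig::operator(): peel off the output coordinate of each
// dimension (largest stride first), wrap it with `% in_dim[i]` so that size-1
// input dimensions repeat, and accumulate the input strides.
int SrcIndex(int index_output, const std::vector<int>& strides_out,
             const std::vector<int>& strides_in, const std::vector<int>& in_dim) {
  int index_src = 0;
  for (int i = static_cast<int>(in_dim.size()) - 1; i >= 0; --i) {
    int tmp_index = index_output / strides_out[i];
    index_output -= tmp_index * strides_out[i];
    index_src += (tmp_index % in_dim[i]) * strides_in[i];
  }
  return index_src;
}

int main() {
  // Input shape [4, 1] broadcast to output shape [4, 3], fastest dim first.
  // Strides are the cumulative products built by the constructor:
  // strides[0] = 1, strides[i] = strides[i - 1] * dims[i - 1].
  std::vector<int> in_dim = {4, 1};
  std::vector<int> strides_in = {1, 4};
  std::vector<int> strides_out = {1, 4};
  for (int i = 0; i < 12; ++i) {
    std::printf("%d ", SrcIndex(i, strides_out, strides_in, in_dim));
  }
  std::printf("\n");  // prints: 0 1 2 3 0 1 2 3 0 1 2 3
}
```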
@@ -99,12 +107,12 @@ struct BroadcastConfig {
 */
 template <typename Tx, typename Ty, int NX, int NY, int BlockSize,
           bool IsBoundary = false>
-__device__ __forceinline__ void ReadData(Ty* dst, const Tx _global_ptr_* src,
-                                         int size_nx, int size_ny,
-                                         int stride_nx, int stride_ny) {
+__device__ __inline__ void ReadData(Ty* dst, const Tx _global_ptr_* src,
+                                    int size_nx, int size_ny, int stride_nx,
+                                    int stride_ny) {
   int thread_offset = core_id();
   int left_size_nx = size_nx - thread_offset;
-  __local__ T in_temp[1];
+  __local__ Tx in_temp[1];
   // Each branch is added for better performance
   if (NX == 1 && NY == 1) {  // for NX == 1 and NY == 1
     if (IsBoundary) {
@@ -168,7 +176,7 @@ __device__ __forceinline__ void ReadData(Ty* dst, const Tx _global_ptr_* src,
 * init_data: Initial value.
 */
 template <typename T, int NX>
-__device__ __forceinline__ void Init(T* dst, T init_data) {
+__device__ __inline__ void Init(T* dst, T init_data) {
 #pragma unroll
   for (int i = 0; i < NX; i++) {
     dst[i] = init_data;
   }
 }
@@ -197,8 +205,8 @@ __device__ __forceinline__ void Init(T* dst, T init_data) {
 * size: The current block needs to load size data continuously.
 */
 template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
-__device__ __forceinline__ void ReadData(T* dst, const T _global_ptr_* src,
-                                         int num) {
+__device__ __inline__ void ReadData(T* dst, const T _global_ptr_* src,
+                                    int num) {
   int thread_offset = core_id() * NX;
   __local__ T in_temp[1];
   if (IsBoundary) {  // core_num() * NX > num
@@ -241,10 +249,11 @@ __device__ __forceinline__ void ReadData(T* dst, const T _global_ptr_* src,
 */
 template <typename T, int NX, int NY, int BlockSize, int Rank,
           bool IsBoundary = false>
-__device__ __forceinline__ void ReadDataBc(
-    T* dst, const T _global_ptr_* src, uint32_t block_offset,
-    details::BroadcastConfig<Rank> config, int total_num_output, int stride_nx,
-    int stride_ny) {
+__device__ __inline__ void ReadDataBc(T* dst, const T _global_ptr_* src,
+                                      uint32_t block_offset,
+                                      details::BroadcastConfig<Rank> config,
+                                      int total_num_output, int stride_nx,
+                                      int stride_ny) {
   uint32_t thread_offset = block_offset + core_id();
   uint32_t index_src = 0;
   __local__ T in_temp[1];
@@ -256,16 +265,11 @@ __device__ __forceinline__ void ReadDataBc(
       uint32_t index_output = thread_offset + ny * stride_ny + nx * stride_nx;
       index_src = 0;
       if (IsBoundary) {
-        if (index_output >= total_num_output) {
+        if (index_output >= (uint32_t)total_num_output) {
           break;
         }
       }
-#pragma unroll
-      for (int i = 0; i < Rank; ++i) {
-        uint32_t tmp = index_output / config.stride_out[i];
-        index_output = index_output - tmp * config.stride_out[i];
-        index_src += (tmp % config.shape_in[i]) * config.stride_in[i];
-      }
+      index_src = config(index_output);
       GM2LM(src + index_src, in_temp, sizeof(T));
       dst[nx + ny * NX] = in_temp[0];
     }
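Throughout these primitives the `IsBoundary` template flag decides whether the per-element bounds check is compiled in at all: the main loop instantiates with `false` and only the final tail block pays for the guard. A host-side model of the pattern; `ReadModel` is illustrative, not the XPU code:

```cpp
#include <cstdio>

// The guard is a compile-time constant, so the IsBoundary = false
// instantiation carries no branch in its unrolled loop body.
template <int NX, bool IsBoundary>
void ReadModel(int* dst, const int* src, int thread_offset, int num) {
  for (int i = 0; i < NX; ++i) {
    if (IsBoundary && thread_offset + i >= num) break;  // tail guard only
    dst[i] = src[thread_offset + i];
  }
}

int main() {
  int src[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
  int dst[4] = {-1, -1, -1, -1};
  ReadModel<4, true>(dst, src, /*thread_offset=*/8, /*num=*/10);  // tail block
  std::printf("%d %d %d %d\n", dst[0], dst[1], dst[2], dst[3]);  // 8 9 -1 -1
}
```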
@@ -305,33 +309,34 @@ __device__ __forceinline__ void ReadDataBc(
 */
-template <typename T, int NX, int NY, int BlockSize, int Rank,
-          typename IndexCal, bool IsBoundary = false>
-__device__ __forceinline__ void ReadDataReduce(
-    T* dst, const T _global_ptr_* src, int block_offset,
-    const IndexCal& index_cal, int size_nx, int size_ny, int stride_nx,
-    int stride_ny, bool reduce_last_dim) {
-  __local__ T in_temp[1];
+template <typename Tx, typename Ty, int NX, int NY, int BlockSize, int Rank,
+          typename IndexCal, typename Functor, bool IsBoundary = false>
+__device__ __inline__ void ReadDataReduce(Ty* dst, const Tx _global_ptr_* src,
+                                          int block_offset,
+                                          const IndexCal& index_cal,
+                                          const Functor& func, int size_nx,
+                                          int size_ny, int stride_nx,
+                                          int stride_ny, bool reduce_last_dim) {
+  __local__ Tx in_temp[1];
   int thread_offset = 0;
-  int left_size_nx = size_nx;
-  int left_size_ny = size_ny;
+  int left_idx = 0;
   if (reduce_last_dim) {
-    thread_offset = block_offset + core_id();
-    left_size_nx -= thread_offset;
+    thread_offset = core_id();
+    left_idx = 0;
   } else {
-    thread_offset = block_offset + core_id();
-    left_size_ny -= thread_offset;
+    thread_offset = 0;
+    left_idx = 0;
   }

   if (NX == 1) {
 #pragma unroll
     for (int ny = 0; ny < NY; ++ny) {
       if (IsBoundary) {
-        if (ny * stride_ny >= left_size_ny) {
+        if (thread_offset >= size_ny) {
           break;
         }
       }
-      uint32_t index_src = index_cal(thread_offset);
-      GM2LM(src + index_src, in_temp, sizeof(T));
-      dst[ny] = in_temp[0];
+      uint32_t index_src = index_cal(thread_offset + block_offset);
+      GM2LM(src + index_src, in_temp, sizeof(Tx));
+      dst[ny] = static_cast<Ty>(func(in_temp[0]));
       thread_offset += stride_ny;
     }
   } else {
@@ -340,17 +345,16 @@ __device__ __forceinline__ void ReadDataReduce(
 #pragma unroll
     for (int nx = 0; nx < NX; ++nx) {
 #pragma unroll
       for (int ny = 0; ny < NY; ++ny) {
         if (IsBoundary) {
-          if ((ny * stride_ny >= left_size_ny) ||
-              (nx * stride_nx >= left_size_nx)) {
+          if ((thread_offset >= size_ny) ||
+              (left_idx + nx * stride_nx >= size_nx)) {
             break;
           }
         }
-        uint32_t index_src = index_cal(thread_offset);
-        GM2LM(src + index_src, in_temp, sizeof(T));
-        dst[nx + ny * NX] = in_temp[0];
+        uint32_t index_src = index_cal(thread_offset + block_offset);
+        GM2LM(src + index_src, in_temp, sizeof(Tx));
+        dst[nx + ny * NX] = static_cast<Ty>(func(in_temp[0]));
         thread_offset += stride_ny;
       }
-      thread_offset += stride_nx;
     }
   }
 }
@@ -421,9 +425,9 @@ __device__ void WriteData(T _global_ptr_* dst, const T* src, int num) {
 */
 template <typename Tx, typename Ty, int NX, int NY, int BlockSize,
           bool IsBoundary = false>
-__device__ __forceinline__ void WriteData(Ty _global_ptr_* dst, const Tx* src,
-                                          int size_nx, int size_ny,
-                                          int stride_nx, int stride_ny) {
+__device__ __inline__ void WriteData(Ty _global_ptr_* dst, const Tx* src,
+                                     int size_nx, int size_ny, int stride_nx,
+                                     int stride_ny) {
   int thread_offset = core_id();
   int left_size_nx = size_nx - thread_offset;
   __local__ Ty in_temp[1];
@@ -433,11 +437,11 @@ __device__ __forceinline__ void WriteData(Ty _global_ptr_* dst, const Tx* src,
     if (IsBoundary) {
       if (left_size_nx > 0) {
         in_temp[0] = static_cast<Ty>(src[0]);
-        LM2GM(in_temp, dst + thread_offset, sizeof(T));
+        LM2GM(in_temp, dst + thread_offset, sizeof(Ty));
       }
     } else {
       in_temp[0] = static_cast<Ty>(src[0]);
-      LM2GM(in_temp, dst + thread_offset, sizeof(T));
+      LM2GM(in_temp, dst + thread_offset, sizeof(Ty));
     }
   } else if (NX == 1) {
 #pragma unroll
@@ -449,7 +453,7 @@ __device__ __forceinline__ void WriteData(Ty _global_ptr_* dst, const Tx* src,
       }

       in_temp[0] = static_cast<Ty>(src[idy]);
-      LM2GM(in_temp, dst + thread_offset + idy * stride_ny, sizeof(T));
+      LM2GM(in_temp, dst + thread_offset + idy * stride_ny, sizeof(Ty));
     }
   } else if (NY == 1) {  // for NY == 1 and NX != 1
 #pragma unroll
@@ -461,7 +465,7 @@ __device__ __forceinline__ void WriteData(Ty _global_ptr_* dst, const Tx* src,
       }

       in_temp[0] = static_cast<Ty>(src[idx]);
-      LM2GM(in_temp, dst + thread_offset + idx * stride_nx, sizeof(T));
+      LM2GM(in_temp, dst + thread_offset + idx * stride_nx, sizeof(Ty));
     }
   } else {  // for NX != 1 and NY != 1
 #pragma unroll
@@ -480,7 +484,7 @@ __device__ __forceinline__ void WriteData(Ty _global_ptr_* dst, const Tx* src,
       }
       in_temp[0] = static_cast<Ty>(src[idx + idy * NX]);
       LM2GM(in_temp, dst + thread_offset + idx * stride_nx + idy * stride_ny,
-            sizeof(T));
+            sizeof(Ty));
     }
   }
 }
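`WriteData`'s general case scatters an NX x NY register tile with an independent stride per direction. A host model of just the index algebra; `WriteModel` is illustrative and boundary checks are elided:

```cpp
#include <cstdio>

// The register tile src[idx + idy * NX] lands at global offset
// thread_offset + idx * stride_nx + idy * stride_ny.
void WriteModel(int* dst, const int* src, int thread_offset, int NX, int NY,
                int stride_nx, int stride_ny) {
  for (int idy = 0; idy < NY; ++idy) {
    for (int idx = 0; idx < NX; ++idx) {
      dst[thread_offset + idx * stride_nx + idy * stride_ny] =
          src[idx + idy * NX];
    }
  }
}

int main() {
  int dst[16] = {0};
  const int src[4] = {1, 2, 3, 4};  // a 2 x 2 register tile
  WriteModel(dst, src, /*thread_offset=*/1, 2, 2, /*stride_nx=*/2,
             /*stride_ny=*/8);
  for (int i = 0; i < 16; ++i) std::printf("%d ", dst[i]);
  std::printf("\n");  // 0 1 0 2 0 0 0 0 0 3 0 4 0 0 0 0
}
```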
@@ -498,7 +502,7 @@ __device__ __forceinline__ void WriteData(Ty _global_ptr_* dst, const Tx* src,
 * init_data: The register pointer of init data, the size is NX.
 */
 template <typename T, int NX, bool IsBoundary = false>
-__device__ __forceinline__ void Init(T* dst, T* init_data, int num) {
+__device__ __inline__ void Init(T* dst, T* init_data, int num) {
 #pragma unroll
   for (int i = 0; i < NX; i++) {
     if (IsBoundary) {
@@ -535,30 +539,26 @@ __device__ __forceinline__ void Init(T* dst, T* init_data, int num) {
 */
 template <typename T, int NX, int NY, int BlockSize, int Rank,
           bool IsBoundary = false>
-__device__ __forceinline__ void ReadDataBc(
-    T* dst, const T _global_ptr_* src, uint32_t block_offset,
-    details::BroadcastConfig<Rank> config, int total_num_output) {
-  uint32_t thread_offset = block_offset + core_id() * NX;
-  uint32_t index_src = 0;
-  __local__ T in_temp[1];
+__device__ __inline__ void ReadDataBc(T* dst, const T _global_ptr_* src,
+                                      uint32_t block_offset,
+                                      details::BroadcastConfig<Rank> config,
+                                      int total_num_output) {
+  int thread_offset = block_offset + core_id() * NX;
+  int index_src = 0;
+  __local__ T in_temp;

 #pragma unroll
-  for (uint32_t nx = 0; nx < NX; ++nx) {
-    uint32_t index_output = thread_offset + nx;
+  for (int nx = 0; nx < NX; ++nx) {
+    int index_output = thread_offset + nx;
     index_src = 0;
     if (IsBoundary) {
       if (index_output >= total_num_output) {
         break;
       }
     }
-#pragma unroll
-    for (int i = 0; i < Rank; ++i) {
-      uint32_t tmp = index_output / config.stride_out[i];
-      index_output = index_output - tmp * config.stride_out[i];
-      index_src += (tmp % config.shape_in[i]) * config.stride_in[i];
-    }
-    GM2LM(src + index_src, in_temp, sizeof(T));
-    dst[nx + ny * NX] = in_temp[0];
+    index_src = config(index_output);
+    GM2LM(src + index_src, &in_temp, sizeof(T));
+    dst[nx] = in_temp;
   }
 }
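Unlike the CUDA primitives, the XPU2 versions never dereference a global pointer directly: every element is staged through a `__local__` buffer with `GM2LM` and written back with `LM2GM`. Host stand-ins for the two transfers, only to make the data flow concrete; these are not the real XPU intrinsics:

```cpp
#include <cstdio>
#include <cstring>

// Host models: GM2LM copies `size` bytes from "global" into core-"local"
// memory, LM2GM writes them back, matching how the primitives above use them.
static void GM2LM(const void* gm, void* lm, size_t size) { std::memcpy(lm, gm, size); }
static void LM2GM(const void* lm, void* gm, size_t size) { std::memcpy(gm, lm, size); }

int main() {
  float gm_src[3] = {1.5f, 2.5f, 3.5f}, gm_dst[3] = {0};
  float in_temp;  // plays the role of `__local__ T in_temp`
  for (int i = 0; i < 3; ++i) {
    GM2LM(gm_src + i, &in_temp, sizeof(float));
    in_temp *= 2.0f;  // compute on the local copy
    LM2GM(&in_temp, gm_dst + i, sizeof(float));
  }
  std::printf("%g %g %g\n", gm_dst[0], gm_dst[1], gm_dst[2]);  // 3 5 7
}
```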
diff --git a/paddle/fluid/operators/kernel_primitives/kernel_primitives.h b/paddle/fluid/operators/kernel_primitives/kernel_primitives.h
index e20e77ae26a..558f8c81c66 100644
--- a/paddle/fluid/operators/kernel_primitives/kernel_primitives.h
+++ b/paddle/fluid/operators/kernel_primitives/kernel_primitives.h
@@ -13,11 +13,18 @@
 // limitations under the License.

 #pragma once
-#include "paddle/fluid/operators/kernel_primitives/functor_primitives.h"
 #include "paddle/fluid/operators/kernel_primitives/helper_primitives.h"
 #ifdef PADDLE_WITH_XPU2
 #include "paddle/fluid/operators/kernel_primitives/compute_primitives_xpu2.h"
 #include "paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h"
+#include "paddle/fluid/operators/kernel_primitives/functor_primitives_xpu2.h"
+
+#define KPStream XPUStream
+#define KPDevice paddle::platform::XPUDeviceContext
+#define _ptr_ _global_ptr_
+#define __forceinline__ __inline__
+#define __restrict__
+
 #define THREAD_ID_X core_id()
 #define THREAD_ID_Y 0
 #define THREAD_ID_Z 0
@@ -36,6 +43,12 @@
 #else
 #include "paddle/fluid/operators/kernel_primitives/compute_primitives.h"
 #include "paddle/fluid/operators/kernel_primitives/datamover_primitives.h"
+#include "paddle/fluid/operators/kernel_primitives/functor_primitives.h"
+
+#define KPStream gpuStream_t
+#define KPDevice paddle::platform::CUDADeviceContext
+#define _ptr_
+
 #define THREAD_ID_X threadIdx.x
 #define THREAD_ID_Y threadIdx.y
 #define THREAD_ID_Z threadIdx.z

diff --git a/paddle/fluid/platform/hostdevice.h b/paddle/fluid/platform/hostdevice.h
index 1ffbbc217e2..65005a5adbb 100644
--- a/paddle/fluid/platform/hostdevice.h
+++ b/paddle/fluid/platform/hostdevice.h
@@ -17,7 +17,14 @@
 #include <hip/hip_runtime.h>
 #endif

-#if (defined(__CUDACC__) || defined(__HIPCC__))
+#ifdef __xpu_kp__
+#include <xpu/runtime.h>
+#include "xpu/kernel/cluster_header.h"
+#include "xpu/kernel/debug.h"
+#include "xpu/kernel/math.h"
+#endif
+
+#if (defined(__CUDACC__) || defined(__HIPCC__) || defined(__xpu_kp__))
 #define HOSTDEVICE __host__ __device__
 #define DEVICE __device__
 #define HOST __host__
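hostdevice.h now treats the XPU KP compiler (`__xpu_kp__`) like nvcc and hipcc, so one `HOSTDEVICE`-qualified functor source builds for host, CUDA, HIP, and XPU2. A sketch of how the macro degrades to a plain host build; `SquareFunctor` is illustrative, not part of the patch:

```cpp
#include <cstdio>

// With no device compiler defined, the qualifier expands to nothing and the
// functor compiles host-only; under nvcc/hipcc/xpu-kp the same source gets
// __host__ __device__.
#if (defined(__CUDACC__) || defined(__HIPCC__) || defined(__xpu_kp__))
#define HOSTDEVICE __host__ __device__
#else
#define HOSTDEVICE
#endif

template <typename T>
struct SquareFunctor {
  HOSTDEVICE T operator()(const T x) const { return x * x; }
};

int main() {
  SquareFunctor<double> f;
  std::printf("%g\n", f(3.0));  // prints: 9
}
```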
diff --git a/paddle/pten/kernels/gpu/elementwise.h b/paddle/pten/kernels/gpu/elementwise.h
index f78328c01a3..e4cc894e483 100644
--- a/paddle/pten/kernels/gpu/elementwise.h
+++ b/paddle/pten/kernels/gpu/elementwise.h
@@ -86,7 +86,7 @@ struct ElementwisePrimitiveCaller {
 template <typename OutT, int VecSize, bool IsBoundary, int NumOuts>
 struct ElementwiseWriteDataCaller {
   __device__ __forceinline__ void operator()(
-      paddle::framework::Array<OutT *, NumOuts> outs,
+      paddle::framework::Array<_ptr_ OutT *, NumOuts> outs,
       ConditionalT<OutT, NumOuts> src[VecSize],
       int block_offset,
       int num) {
@@ -109,7 +109,7 @@ struct ElementwiseWriteDataCaller {
 template <typename OutT, int VecSize, bool IsBoundary>
 struct ElementwiseWriteDataCaller<OutT, VecSize, IsBoundary, 1> {
   __device__ __forceinline__ void operator()(
-      paddle::framework::Array<OutT *, 1> outs,
+      paddle::framework::Array<_ptr_ OutT *, 1> outs,
       OutT src[VecSize],
       int block_offset,
       int num) {
@@ -126,8 +126,8 @@ template <typename InT,
           int VecSize,
           bool IsBoundary>
 __device__ void VectorizedElementwiseKernelImpl(
-    const paddle::framework::Array<const InT *__restrict__, Arity> &in,
-    paddle::framework::Array<OutT *, NumOuts> outs,
+    const paddle::framework::Array<const _ptr_ InT *__restrict__, Arity> &in,
+    paddle::framework::Array<_ptr_ OutT *, NumOuts> outs,
     int num,
     int data_offset,
     Functor func) {
@@ -161,8 +161,8 @@ template <typename InT,
           int NumOuts,
           int VecSize>
 __global__ void VectorizedElementwiseKernel(
-    paddle::framework::Array<const InT *__restrict__, Arity> ins,
-    paddle::framework::Array<OutT *, NumOuts> outs,
+    paddle::framework::Array<const _ptr_ InT *__restrict__, Arity> ins,
+    paddle::framework::Array<_ptr_ OutT *, NumOuts> outs,
     int size,
     int main_offset,
     Functor func) {
@@ -212,17 +212,13 @@ template <typename InT,
           int NumOuts,
           int VecSize>
-void ElementwiseCudaKernel(const paddle::platform::CUDADeviceContext &ctx,
+void ElementwiseCudaKernel(const KPDevice &ctx,
                            const std::vector<const DenseTensor *> &ins,
                            std::vector<DenseTensor *> *outs,
                            Functor func) {
   auto numel = ins[0]->numel();
-  int block_size = funcs::GetThreadsConfig(ctx, numel, VecSize);
-  int grid_size =
-      ((numel + VecSize - 1) / VecSize + block_size - 1) / block_size;
-  auto stream = ctx.stream();
-  paddle::framework::Array<const InT *__restrict__, Arity> ins_data;
-  paddle::framework::Array<OutT *, NumOuts> outs_data;
+  paddle::framework::Array<const _ptr_ InT *__restrict__, Arity> ins_data;
+  paddle::framework::Array<_ptr_ OutT *, NumOuts> outs_data;

   for (int i = 0; i < Arity; ++i) {
     ins_data[i] = ins[i]->data<InT>();
@@ -231,8 +227,9 @@ void ElementwiseCudaKernel(const paddle::platform::CUDADeviceContext &ctx,
     outs_data[i] = (*outs)[i]->mutable_data<OutT>();
   }
 #ifdef PADDLE_WITH_XPU2
-  block_size = 128;
-  grid_size = 8;
+  int block_size = 64;
+  int grid_size = 8;
+  auto stream = ctx.x_context()->xpu_stream;
   int main_offset = (numel / (VecSize * block_size)) * VecSize * block_size;
   VectorizedElementwiseKernel<InT, OutT, Functor, Arity, NumOuts, VecSize><<<
       grid_size, block_size, 0, stream>>>(
       ins_data, outs_data, numel, main_offset, func);
 #else
+  int block_size = funcs::GetThreadsConfig(ctx, numel, VecSize);
+  int grid_size =
+      ((numel + VecSize - 1) / VecSize + block_size - 1) / block_size;
   int main_offset = (numel / (VecSize * block_size)) * VecSize * block_size;
+  auto stream = ctx.stream();
   VectorizedElementwiseKernel<InT, OutT, Functor, Arity, NumOuts, VecSize><<<
       grid_size, block_size, 0, stream>>>(
       ins_data, outs_data, numel, main_offset, func);
 #endif
 }

 template <ElementwiseType ET, typename InT, typename OutT, typename Functor,
           int NumOuts = 1>
 void LaunchSameDimsElementwiseCudaKernel(
-    const paddle::platform::CUDADeviceContext &ctx,
+    const KPDevice &ctx,
     const std::vector<const DenseTensor *> &ins,
     std::vector<DenseTensor *> *outs,
     Functor func) {
@@ -471,12 +472,12 @@ struct DimensionsTransform {
 template <typename T, int VecSize, int Rank, bool IsBoundary = false>
 __device__ __forceinline__ void LoadData(
     T *dst,
-    const T *__restrict__ src,
+    const _ptr_ T *src,
     uint32_t block_offset,
     const kps::details::BroadcastConfig<Rank> &config,
     int numel,
     int num,
-    bool need_broadcast) {
+    int need_broadcast) {
   // numel : whole num of output
   // num: how many data will be deal with in this time
   if (need_broadcast) {
@@ -496,9 +497,9 @@ template <typename InT,
           int Rank,
           bool IsBoundary = false>
 __device__ void ElementwiseBroadcastKernelImpl(
-    const paddle::framework::Array<const InT *__restrict__, Arity> &ins,
-    paddle::framework::Array<OutT *, NumOuts> outs,
-    const paddle::framework::Array<bool, Arity> &use_broadcast,
+    const paddle::framework::Array<const _ptr_ InT *__restrict__, Arity> &ins,
+    paddle::framework::Array<_ptr_ OutT *, NumOuts> outs,
+    const paddle::framework::Array<int, Arity> &use_broadcast,
     uint32_t numel,
     const paddle::framework::Array<kps::details::BroadcastConfig<Rank>, Arity>
         &configs,
@@ -540,9 +541,9 @@ template <typename InT,
           int VecSize,
           int Rank>
 __global__ void ElementwiseBroadcastKernel(
-    paddle::framework::Array<const InT *__restrict__, Arity> ins,
-    paddle::framework::Array<OutT *, NumOuts> outs,
-    paddle::framework::Array<bool, Arity> use_broadcast,
+    paddle::framework::Array<const _ptr_ InT *__restrict__, Arity> ins,
+    paddle::framework::Array<_ptr_ OutT *, NumOuts> outs,
+    paddle::framework::Array<int, Arity> use_broadcast,
     uint32_t numel,
     paddle::framework::Array<kps::details::BroadcastConfig<Rank>, Arity>
         configs,
@@ -570,7 +571,8 @@ __global__ void ElementwiseBroadcastKernel(
         block_offset,
         func);
   }
-  if (block_offset < numel) {
+  int num = numel - block_offset;
+  if (num > 0) {
     ElementwiseBroadcastKernelImpl<InT, OutT, Functor, Arity, NumOuts,
                                    VecSize, Rank, true>(
-        ins, outs, use_broadcast, numel, configs, tail_tid, block_offset, func);
+        ins, outs, use_broadcast, numel, configs, num, block_offset, func);
   }
 #else
   if (block_offset < main_offset) {
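Both elementwise paths split the work at `main_offset`, the largest multiple of `VecSize * block_size` that fits in `numel`: everything below it runs the vectorized `IsBoundary = false` path, and the remainder takes the guarded tail path. A worked example using the XPU2 defaults from this patch:

```cpp
#include <cstdio>

int main() {
  const int numel = 1000, VecSize = 2, block_size = 64;
  // main_offset rounds numel down to a multiple of VecSize * block_size.
  int main_offset = (numel / (VecSize * block_size)) * VecSize * block_size;
  int tail = numel - main_offset;  // handled by the IsBoundary = true path
  std::printf("main_offset=%d tail=%d\n", main_offset, tail);  // 896 and 104
}
```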
@@ -619,23 +621,16 @@ template <typename InT,
           int NumOuts,
           int VecSize,
           int Rank>
-void LaunchKernel(const paddle::platform::CUDADeviceContext &ctx,
+void LaunchKernel(const KPDevice &ctx,
                   const std::vector<const DenseTensor *> &ins,
                   std::vector<DenseTensor *> *outs,
                   Functor func,
                   DimensionsTransform merge_dims) {
   int numel = (*outs)[0]->numel();
-  const int threads = 256;
-  int blocks = ((numel + VecSize - 1) / VecSize + threads - 1) / threads;
-
-  int main_offset = (numel / (VecSize * threads)) * VecSize * threads;
-  int tail_tid = numel % (VecSize * threads);
-  auto stream = ctx.stream();
-
   paddle::framework::Array<kps::details::BroadcastConfig<Rank>, Arity> configs;
-  paddle::framework::Array<bool, Arity> use_broadcast;
-  paddle::framework::Array<const InT *__restrict__, Arity> ins_data;
-  paddle::framework::Array<OutT *, NumOuts> outs_data;
+  paddle::framework::Array<int, Arity> use_broadcast;
+  paddle::framework::Array<const _ptr_ InT *__restrict__, Arity> ins_data;
+  paddle::framework::Array<_ptr_ OutT *, NumOuts> outs_data;

   for (int i = 0; i < NumOuts; ++i) {
     outs_data[i] = (*outs)[i]->mutable_data<OutT>();
@@ -643,7 +638,7 @@ void LaunchKernel(const KPDevice &ctx,

   for (int i = 0; i < Arity; i++) {
     use_broadcast[i] = (ins[i]->numel() != numel);
-    ins_data[i] = ins[i]->data<InT>();
+    ins_data[i] = (_ptr_ InT *)(ins[i]->data<InT>());
     if (use_broadcast[i]) {
       // get the broadcast config,
       // if data shape is[m, n], then you should set data_dim = {n, m}
@@ -654,10 +649,11 @@ void LaunchKernel(const KPDevice &ctx,
   }

 #ifdef PADDLE_WITH_XPU2
-  threads = 128;
-  blocks = 8;
-  main_offset = (numel / (VecSize * threads)) * VecSize * threads;
-  tail_tid = numel % (VecSize * threads);
+  const int threads = 64;
+  const int blocks = 8;
+  int main_offset = (numel / (VecSize * threads)) * VecSize * threads;
+  int tail_tid = numel % (VecSize * threads);
+  auto stream = ctx.x_context()->xpu_stream;
   ElementwiseBroadcastKernel<InT, OutT, Functor, Arity, NumOuts, VecSize,
                              Rank><<<blocks, threads, 0, stream>>>(
       ins_data, outs_data, use_broadcast, numel, configs, main_offset,
       tail_tid, func);
 #else
+  const int threads = 256;
+  int blocks = ((numel + VecSize - 1) / VecSize + threads - 1) / threads;
+  int main_offset = (numel / (VecSize * threads)) * VecSize * threads;
+  int tail_tid = numel % (VecSize * threads);
+  auto stream = ctx.stream();
   ElementwiseBroadcastKernel<InT, OutT, Functor, Arity, NumOuts, VecSize,
                              Rank><<<blocks, threads, 0, stream>>>(
       ins_data, outs_data, use_broadcast, numel, configs, main_offset,
       tail_tid, func);
 #endif
 }

 template <ElementwiseType ET, typename InT, typename OutT, typename Functor,
           int NumOuts = 1>
 void LaunchBroadcastKernelForDifferentVecSize(
-    const paddle::platform::CUDADeviceContext &ctx,
+    const KPDevice &ctx,
     const std::vector<const DenseTensor *> &ins,
     std::vector<DenseTensor *> *outs,
     int axis,
@@ -737,7 +738,7 @@ template <ElementwiseType ET,
           typename Functor,
           int NumOuts = 1>
 void LaunchBroadcastElementwiseCudaKernel(
-    const paddle::platform::CUDADeviceContext &ctx,
+    const KPDevice &ctx,
     const std::vector<const DenseTensor *> &ins,
     std::vector<DenseTensor *> *outs,
     int axis,
@@ -835,12 +836,11 @@ template <ElementwiseType ET,
           typename OutT,
           typename Functor,
           int NumOuts = 1>
-void LaunchElementwiseCudaKernel(
-    const paddle::platform::CUDADeviceContext &cuda_ctx,
-    const std::vector<const DenseTensor *> &ins,
-    std::vector<DenseTensor *> *outs,
-    int axis,
-    Functor func) {
+void LaunchElementwiseCudaKernel(const KPDevice &ctx,
+                                 const std::vector<const DenseTensor *> &ins,
+                                 std::vector<DenseTensor *> *outs,
+                                 int axis,
+                                 Functor func) {
   std::vector<int> dims_size;
   bool no_broadcast_flag = true;
   for (auto *in : ins) {
@@ -849,14 +849,14 @@ void LaunchElementwiseCudaKernel(const KPDevice &ctx,
   }
   if (no_broadcast_flag) {
     LaunchSameDimsElementwiseCudaKernel<ET, InT, OutT, Functor, NumOuts>(
-        cuda_ctx, ins, outs, func);
+        ctx, ins, outs, func);
   } else {
     axis = axis == -1
                ? *std::max_element(dims_size.begin(), dims_size.end()) -
                      *std::min_element(dims_size.begin(), dims_size.end())
                : axis;
     LaunchBroadcastElementwiseCudaKernel<ET, InT, OutT, Functor, NumOuts>(
-        cuda_ctx, ins, outs, axis, func);
+        ctx, ins, outs, axis, func);
   }
 }
-- 
GitLab
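The `axis == -1` case at the end infers the broadcast axis as the difference between the largest and smallest input rank, which aligns the lower-rank operand with the trailing dimensions. A worked example:

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  // Ranks of the two inputs, e.g. x: [2, 3, 4] and y: [3, 4].
  std::vector<int> dims_size = {3, 2};
  int axis = *std::max_element(dims_size.begin(), dims_size.end()) -
             *std::min_element(dims_size.begin(), dims_size.end());
  // prints: 1, so y lines up with x's dims starting at index 1 and is
  // repeated along x's leading dimension.
  std::printf("axis=%d\n", axis);
}
```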