From 255bf609e5d9289dfc6d5122e7fda746c933b6e2 Mon Sep 17 00:00:00 2001
From: niuliling123 <51102941+niuliling123@users.noreply.github.com>
Date: Tue, 1 Mar 2022 18:48:02 +0800
Subject: [PATCH] Add function description for Kernel Primitive API (#39884)

* Add function description for Kernel Primitive API

1. Set Cumsum and Sort shared memory size = 1024.
2. Sort and Cumsum API limitation: blockDim.x must not exceed 512
   (blockDim.x <= 512).
---
 .../kernels/primitive/compute_primitives.h    | 284 +++++++++++++-----
 .../primitive/compute_primitives_xpu2.h       |  23 ++
 .../kernels/primitive/datamover_primitives.h  |  32 ++
 .../primitive/datamover_primitives_xpu2.h     |  41 +++
 4 files changed, 311 insertions(+), 69 deletions(-)

diff --git a/paddle/phi/kernels/primitive/compute_primitives.h b/paddle/phi/kernels/primitive/compute_primitives.h
index 4f3c069f3b..19427551fb 100644
--- a/paddle/phi/kernels/primitive/compute_primitives.h
+++ b/paddle/phi/kernels/primitive/compute_primitives.h
@@ -136,7 +136,9 @@ __device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) {
   return shared_memory[threadIdx.x];
 }
 
-// Swap data
+/**
+ * @brief Swap data.
+ */
 template <typename T>
 __device__ __forceinline__ void Swap(T* first_value, T* second_value) {
   T t_value;
@@ -145,7 +147,9 @@ __device__ __forceinline__ void Swap(T* first_value, T* second_value) {
   (*second_value) = t_value;
 }
 
-// swap with monotonic_type
+/**
+ * @brief Swap data according to monotonic_type.
+ */
 template <typename T>
 __device__ __forceinline__ void Comparator(T* first_value,
                                            T* second_value,
@@ -155,6 +159,9 @@ __device__ __forceinline__ void Comparator(T* first_value,
   }
 }
 
+/**
+ * @brief Swap data and data index according to monotonic_type.
+ */
 template <typename T, typename IndexType>
 __device__ __forceinline__ void ComparatorWithIndex(T* first_value,
@@ -170,6 +177,18 @@ __device__ __forceinline__ void ComparatorWithIndex(T* first_value,
   }
 }
 
+/**
+ * @brief Get the largest power of 2 that is no greater than n.
+ */
+__device__ inline int GetLastPow2(int n) {
+  n |= (n >> 1);
+  n |= (n >> 2);
+  n |= (n >> 4);
+  n |= (n >> 8);
+  n |= (n >> 16);
+  return std::max(1, n - (n >> 1));
+}
+
 }  // namespace details
 
 /**
@@ -453,6 +472,29 @@ __device__ __forceinline__ void Reduce(T* out,
   }
 }
 
+/*
+* @brief Fill register with a constant according to OpFunc.
+*
+* @template parameters
+* InT: The data type of the constant returned by compute.
+* OutT: The data type of out.
+* NX: The number of data columns loaded by each thread.
+* NY: The number of data rows loaded by each thread.
+* BlockSize: Identifies the current device thread index method. Currently only
+* GPU is supported.
+* OpFunc: Compute functor which has an operator() as follows:
+*     template <typename InT>
+*     struct XxxFunctor {
+*       HOSTDEVICE InT operator()()
+*           const {
+*         return a;
+*       }
+*     };
+*
+* @param
+* out: The register pointer of out, the size is NX * NY.
+* compute: Compute function which was declared like OpFunc<InT>().
+*/
 template <typename InT,
           typename OutT,
           int NX,
           int NY,
           int BlockSize,
           class OpFunc>

+/*
+* OpFunc: Compute functor which has an operator() as follows:
+*     template <typename T>
+*     struct XxxFunctor {
+*       HOSTDEVICE InT operator()(StateType state)
+*           const {
+*         return random(state);  // Returns ReturnsCount random numbers with
+*                                // data type T
+*       }
+*     };
+*
+* @param
+* out: The register pointer of out, the size is NX * NY.
+* compute: Compute function which was declared like OpFunc<T>().
+*/
+
 template <typename T,
           typename OutT,
           int ReturnsCount,
           int BlockSize,
           class OpFunc>
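For reference, the nullary OpFunc shape required by the constant-fill primitive documented above is easy to satisfy. A minimal sketch of such a functor (the functor name is illustrative only and not part of this patch; HOSTDEVICE is Paddle's host/device qualifier macro):

template <typename T>
struct ConstantFunctor {
  T value;
  HOSTDEVICE explicit ConstantFunctor(T v) : value(v) {}
  // Nullary operator(): every register element receives the same constant.
  HOSTDEVICE T operator()() const { return value; }
};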
+/*
+* @brief Compute the prefix sum (cumsum) within the block. Each thread
+* processes 2 elements, the size of out and in is 2, and blockDim.x must not
+* exceed 512 (blockDim.x <= 512).
+*
+* @template parameters
+* InT: The type of the input register.
+* OutT: The type of the out register.
+* BlockSize: Identifies the current device thread index method. Currently only
+* GPU is supported.
+* OpFunc: Compute functor which has an operator() as follows:
+*     template <typename InT>
+*     struct XxxFunctor {
+*       HOSTDEVICE InT operator()(InT a, InT b)
+*           const {
+*         return a + b;
+*       }
+*     };
+*
+* @param
+* out: The register pointer of out, the size is 2.
+* in: The register pointer of input, the size is 2.
+* compute: Compute function which was declared like OpFunc<InT>().
+*/
+
+#define SHARED_SIZE_LIMIT 512
+template <typename InT, typename OutT, int BlockSize, class OpFunc>
 __device__ __forceinline__ void Cumsum(OutT* out,
                                        const InT* in,
                                        OpFunc compute) {
-  __shared__ InT temp[shared_size * 2 + (shared_size * 2) / 32];
+  constexpr int kSize = SHARED_SIZE_LIMIT * 2 + (SHARED_SIZE_LIMIT * 2) / 32;
+  __shared__ InT temp[kSize];
+  int stride_size = blockDim.x;
   int tidx = threadIdx.x;
   temp[tidx + tidx / 32] = in[0];
-  temp[shared_size + tidx + (shared_size + tidx) / 32] = in[1];
-  for (int stride = 1; stride <= blockDim.x; stride *= 2) {
+  temp[stride_size + tidx + (stride_size + tidx) / 32] = in[1];
+  for (int stride = 1; stride <= stride_size; stride *= 2) {
     __syncthreads();
     int index = (tidx + 1) * 2 * stride - 1;
     if (index < (blockDim.x * 2)) {
-      temp[index + index / 32] += temp[index - stride + (index - stride) / 32];
+      temp[index + index / 32] =
+          compute(temp[index + index / 32],
+                  temp[index - stride + (index - stride) / 32]);
     }
   }
   for (int stride = (blockDim.x * 2) / 4; stride > 0; stride /= 2) {
     __syncthreads();
     int index = (tidx + 1) * 2 * stride - 1;
     if ((index + stride) < (blockDim.x * 2)) {
-      temp[index + stride + (stride + index) / 32] +=
-          temp[index + (index) / 32];
+      temp[index + stride + (stride + index) / 32] =
+          compute(temp[index + stride + (stride + index) / 32],
+                  temp[index + (index) / 32]);
     }
   }
   __syncthreads();
   out[0] = static_cast<OutT>(temp[tidx + tidx / 32]);
   out[1] =
-      static_cast<OutT>(temp[tidx + shared_size + (tidx + shared_size) / 32]);
+      static_cast<OutT>(temp[tidx + stride_size + (tidx + stride_size) / 32]);
 }
-
-#define SHARED_SIZE_LIMIT \
-  1024  // each thread load 2 data from global memory so SHARED_SIZE_LIMIT must
-        // larger than blockDim.x * 2
-// if monotonic_type = 1 then increase
-// if gridDim.x > 1 please set monotonic_type = blockIdx.x & 1; blockIdx.x % 2
-// == 1 the increase
-template <typename T>
-__device__ __forceinline__ void Sort(T* dst,
-                                     const T* src_data,
+#undef SHARED_SIZE_LIMIT
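To make the register-level calling convention of Cumsum concrete, here is a minimal caller sketch. The kernel name, the AddFunctor, and the kps namespace alias are assumptions for illustration; it presumes one block whose blockDim.x is a power of 2 no larger than 512, with thread t owning elements t and blockDim.x + t of the block's 2 * blockDim.x inputs:

namespace kps = phi::kps;

struct AddFunctor {
  __device__ float operator()(float a, float b) const { return a + b; }
};

// Each block scans 2 * blockDim.x elements held in registers.
__global__ void BlockCumsum(const float* in, float* out) {
  float v_in[2], v_out[2];
  v_in[0] = in[threadIdx.x];               // first half of the block's data
  v_in[1] = in[blockDim.x + threadIdx.x];  // second half
  kps::Cumsum<float, float, 1, AddFunctor>(v_out, v_in, AddFunctor());
  out[threadIdx.x] = v_out[0];
  out[blockDim.x + threadIdx.x] = v_out[1];
}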
+
+/*
+* @brief Sort data in this block. Each thread processes 2 elements, the size
+* of out and in is 2, and blockDim.x must not exceed 512 (blockDim.x <= 512).
+*
+* @template parameters
+* InT: The type of the input register.
+* OutT: The type of the out register.
+* BlockSize: Identifies the current device thread index method. Currently only
+* GPU is supported.
+*
+* @param
+* out: The register pointer of out, the size is 2.
+* in: The register pointer of input, the size is 2.
+* num: The number of elements this block sorts.
+* monotonic_type: if monotonic_type = 1, sort in ascending order, else sort in
+* descending order.
+*/
+#define SHARED_SIZE_LIMIT 1024
+// each thread loads 2 elements from global memory, so SHARED_SIZE_LIMIT must
+// be no smaller than blockDim.x * 2
+template <typename InT, typename OutT, int BlockSize>
+__device__ __forceinline__ void Sort(OutT* out,
+                                     const InT* in,
                                      int num,
                                      int monotonic_type) {
-  // todo: set num = Pow2(num)
+  int upper_bound = blockDim.x;
+  // update upper_bound to the last power of 2 no larger than num
+  upper_bound = std::min(details::GetLastPow2(num), upper_bound);
   // shareMem for value and index num must smaller than SHARED_SIZE_LIMIT / 2
-  __shared__ T value[SHARED_SIZE_LIMIT];  // shareMem's size must larger than
-                                          // blockDim * 2
-  // Copy value and index from src and src_index
-  value[threadIdx.x] = src_data[0];
-  value[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = src_data[1];
+  __shared__ InT value[SHARED_SIZE_LIMIT];
+  int stride_size = blockDim.x;
+  // shareMem's size must be larger than blockDim * 2
+  // Copy value from in
+  value[threadIdx.x] = in[0];
+  value[threadIdx.x + stride_size] = in[1];
   // make bitonicSort
-  for (int size = 2; size < num; size <<= 1) {
+  for (int size = 2; size < upper_bound; size <<= 1) {
     int bitonic_type = (threadIdx.x & (size / 2)) != 0;
     for (int stride = size / 2; stride > 0; stride >>= 1) {
       __syncthreads();
       int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
-      details::Comparator(&value[pos], &value[pos + stride], bitonic_type);
+      details::Comparator<InT>(&value[pos], &value[pos + stride], bitonic_type);
     }
   }
   // last sort
-  for (int stride = SHARED_SIZE_LIMIT / 2; stride > 0; stride >>= 1) {
+  for (int stride = stride_size; stride > 0; stride >>= 1) {
     __syncthreads();
     int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
     // last sort when monotonic_type = 1 then increase
-    details::Comparator(&value[pos], &value[pos + stride], monotonic_type);
+    details::Comparator<InT>(&value[pos], &value[pos + stride], monotonic_type);
   }
   __syncthreads();
-  dst[0] = value[threadIdx.x];
-  dst[1] = value[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
+  out[0] = static_cast<OutT>(value[threadIdx.x]);
+  out[1] = static_cast<OutT>(value[threadIdx.x + stride_size]);
 }
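A matching caller sketch for Sort, under the same assumptions as the Cumsum example above (illustrative names only). monotonic_type = 1 requests ascending order; per the removed comment, with gridDim.x > 1 one would instead pass monotonic_type = blockIdx.x & 1 so adjacent blocks sort in opposite directions for a later bitonic merge:

__global__ void BlockSort(const float* in, float* out, int num) {
  float v_in[2], v_out[2];
  v_in[0] = in[threadIdx.x];
  v_in[1] = in[blockDim.x + threadIdx.x];
  kps::Sort<float, float, 1>(v_out, v_in, num, /*monotonic_type=*/1);
  out[threadIdx.x] = v_out[0];
  out[blockDim.x + threadIdx.x] = v_out[1];
}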
-template <typename T, typename IndexType>
-__device__ __forceinline__ void Sort(T* dst,
-                                     IndexType* dst_index,
-                                     const T* src_data,
-                                     IndexType* src_index,
+
+/*
+* @brief Sort data with data_index in this block. Each thread processes 2
+* elements, the size of out and in is 2, and blockDim.x must not exceed 512
+* (blockDim.x <= 512).
+*
+* @template parameters
+* InT: The type of the input register.
+* OutT: The type of the out register.
+* IndexType: The type of the index.
+* BlockSize: Identifies the current device thread index method. Currently only
+* GPU is supported.
+*
+* @param
+* out: The register pointer of out, the size is 2.
+* out_index: The register pointer of out_index, the size is 2.
+* in: The register pointer of input, the size is 2.
+* in_index: The register pointer of in_index, the size is 2.
+* num: The number of elements this block sorts.
+* monotonic_type: if monotonic_type = 1, sort in ascending order, else sort in
+* descending order.
+*/
+template <typename InT, typename OutT, typename IndexType, int BlockSize>
+__device__ __forceinline__ void Sort(OutT* out,
+                                     IndexType* out_index,
+                                     const InT* in,
+                                     IndexType* in_index,
                                      int num,
                                      int monotonic_type) {
-  // todo: set num = Pow2(num)
+  int upper_bound = blockDim.x;
+  // update upper_bound to the last power of 2 no larger than num
+  upper_bound = std::min(details::GetLastPow2(num), upper_bound);
   // shareMem for value and index num must smaller than SHARED_SIZE_LIMIT / 2
-  __shared__ T value[SHARED_SIZE_LIMIT];  // shareMem's size must larger than
-                                          // blockDim * 2
+  __shared__ InT value[SHARED_SIZE_LIMIT];
+  // shareMem's size must be larger than blockDim * 2
   __shared__ IndexType index[SHARED_SIZE_LIMIT];
-  // Copy value and index from src and src_index
-  value[threadIdx.x] = src_data[0];
-  value[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = src_data[1];
+  // Copy value and index from in and in_index
+  int stride_size = blockDim.x;
+  value[threadIdx.x] = in[0];
+  value[threadIdx.x + stride_size] = in[1];
   // index
-  index[threadIdx.x] = src_index[0];
-  index[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = src_index[1];
+  index[threadIdx.x] = in_index[0];
+  index[threadIdx.x + stride_size] = in_index[1];
   // make bitonicSort
-  for (int size = 2; size < num; size <<= 1) {
+  for (int size = 2; size < upper_bound; size <<= 1) {
     int bitonic_type = (threadIdx.x & (size / 2)) != 0;
     for (int stride = size / 2; stride > 0; stride >>= 1) {
       __syncthreads();
       int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
-      details::ComparatorWithIndex(&value[pos],
-                                   &value[pos + stride],
-                                   &index[pos],
-                                   &index[pos + stride],
-                                   bitonic_type);
+      details::ComparatorWithIndex<InT, IndexType>(&value[pos],
+                                                   &value[pos + stride],
+                                                   &index[pos],
+                                                   &index[pos + stride],
+                                                   bitonic_type);
     }
   }
-  for (int stride = SHARED_SIZE_LIMIT / 2; stride > 0; stride >>= 1) {
+  for (int stride = stride_size; stride > 0; stride >>= 1) {
     __syncthreads();
     int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
     // last sort when monotonic_type = 1 then increase
-    details::ComparatorWithIndex(&value[pos],
-                                 &value[pos + stride],
-                                 &index[pos],
-                                 &index[pos + stride],
-                                 monotonic_type);
+    details::ComparatorWithIndex<InT, IndexType>(&value[pos],
+                                                 &value[pos + stride],
+                                                 &index[pos],
+                                                 &index[pos + stride],
+                                                 monotonic_type);
   }
   __syncthreads();
-  dst[0] = value[threadIdx.x];
-  dst[1] = value[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
-  dst_index[0] = index[threadIdx.x];
-  dst_index[1] = index[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
+  out[0] = static_cast<OutT>(value[threadIdx.x]);
+  out[1] = static_cast<OutT>(value[threadIdx.x + stride_size]);
+  out_index[0] = index[threadIdx.x];
+  out_index[1] = index[threadIdx.x + stride_size];
+}
+
+template <typename T1, typename T2, typename OutT, typename OpFunc>
+HOSTDEVICE __forceinline__ void OperatorTernary(
+    OutT* out, const T1* in1, const T2* in2, OpFunc func, int num) {
+  func(out, in1, in2, num);
+}
+
+template <typename InT, typename OutT, typename OpFunc>
+HOSTDEVICE __forceinline__ void OperatorBinary(OutT* out,
+                                               const InT* in,
+                                               OpFunc func,
+                                               int num) {
+  func(out, in, num);
 }
 
 }  // namespace kps
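The index-carrying Sort overload supports argsort-style kernels. A hedged sketch under the same assumptions as the examples above (kernel name and layout are illustrative, not part of this patch):

__global__ void BlockArgSort(const float* in, float* out, int* out_idx, int num) {
  float v_in[2], v_out[2];
  int i_in[2], i_out[2];
  v_in[0] = in[threadIdx.x];
  v_in[1] = in[blockDim.x + threadIdx.x];
  i_in[0] = threadIdx.x;  // carry the original positions through the sort
  i_in[1] = blockDim.x + threadIdx.x;
  kps::Sort<float, float, int, 1>(v_out, i_out, v_in, i_in, num, 1);
  out[threadIdx.x] = v_out[0];
  out[blockDim.x + threadIdx.x] = v_out[1];
  out_idx[threadIdx.x] = i_out[0];
  out_idx[blockDim.x + threadIdx.x] = i_out[1];
}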
diff --git a/paddle/phi/kernels/primitive/compute_primitives_xpu2.h b/paddle/phi/kernels/primitive/compute_primitives_xpu2.h
index a445f4a02e..1f4ef2ed93 100644
--- a/paddle/phi/kernels/primitive/compute_primitives_xpu2.h
+++ b/paddle/phi/kernels/primitive/compute_primitives_xpu2.h
@@ -348,6 +348,29 @@ __device__ __forceinline__ void Reduce(T* out,
   }
 }
 
+/*
+* @brief Fill register with a constant according to OpFunc.
+*
+* @template parameters
+* InT: The data type of the constant returned by compute.
+* OutT: The data type of out.
+* NX: The number of data columns loaded by each thread.
+* NY: The number of data rows loaded by each thread.
+* BlockSize: Identifies the current device thread index method. For xpu,
+* core_id() is used as the index.
+* OpFunc: Compute functor which has an operator() as follows:
+*     template <typename InT>
+*     struct XxxFunctor {
+*       HOSTDEVICE InT operator()()
+*           const {
+*         return a;
+*       }
+*     };
+*
+* @param
+* out: The register pointer of out, the size is NX * NY.
+* compute: Compute function which was declared like OpFunc<InT>().
+*/
 template <typename InT,
           typename OutT,
           int NX,
           int NY,
           int BlockSize,
           class OpFunc>

diff --git a/paddle/phi/kernels/primitive/datamover_primitives.h b/paddle/phi/kernels/primitive/datamover_primitives.h
--- a/paddle/phi/kernels/primitive/datamover_primitives.h
+++ b/paddle/phi/kernels/primitive/datamover_primitives.h
@@ ... @@
 /**
  * @brief Read 1D data from global memory to register. The difference
  * from the above function is that it supports different data types of inputs.
+ *
+ * @template parameters
+ * T: The type of data.
+ * NX: Each thread loads NX data from global memory continuously.
+ * NY: Each thread needs to load NY rows; only NY = 1 is supported.
+ * ArgsT: The type of dst; ArgsT can be std::tuple<T> or std::tuple<Args>.
+ * Index: The index of data stored in dst.
+ * BlockSize: Identifies the current device thread index method. For GPU,
+ * threadIdx.x is used as the thread index. Currently only GPU is supported.
+ * IsBoundary: Whether to make an out-of-bounds judgment on access to memory.
+ * When the number of data processed by this block is less than
+ * NX x NY x blockDim.x, boundary judgment is required to avoid memory access
+ * crossing the boundary.
+ *
+ * @param:
+ * dst: The register pointer of the thread, the size is NX * NY.
+ * src: The data pointer of the current block.
+ * size: The current block needs to load size data continuously.
  */
 template <typename T, int NX, int NY, int BlockSize>
 __device__ __forceinline__ void InitWithDataIndex(T* dst, int block_offset) {
   int thread_offset = block_offset + threadIdx.x * NX;

diff --git a/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h b/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h
index 75b2dbaf7e..53a8b7d0c9 100644
--- a/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h
+++ b/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h
@@ -244,6 +244,24 @@ __device__ __inline__ void ReadData(T* dst,
 /**
  * @brief Read 1D data from global memory to register. The difference
  * from the above function is that it supports different data types of inputs.
+ *
+ * @template parameters
+ * T: The type of data.
+ * NX: Each thread loads NX data from global memory continuously.
+ * NY: Each thread needs to load NY rows; only NY = 1 is supported.
+ * ArgsT: The type of dst; ArgsT can be std::tuple<T> or std::tuple<Args>.
+ * Index: The index of data stored in dst.
+ * BlockSize: Identifies the current device thread index method. For xpu,
+ * core_id() is used as the index.
+ * IsBoundary: Whether to make an out-of-bounds judgment on access to memory.
+ * When the number of data processed by this block is less than
+ * NX x NY x blockDim.x, boundary judgment is required to avoid memory access
+ * crossing the boundary.
+ *
+ * @param:
+ * dst: The register pointer of the thread, the size is NX * NY.
+ * src: The data pointer of the current block.
+ * size: The current block needs to load size data continuously.
  */
+template <typename T, int NX, int NY, int BlockSize>
+__device__ __forceinline__ void InitWithDataIndex(T* dst, int block_offset) {
+  int thread_offset = block_offset + core_id() * NX;
+#pragma unroll
+  for (int nx = 0; nx < NX; ++nx) {
+    dst[nx] = static_cast<T>(thread_offset + nx);
+  }
+}
+
 }  // namespace kps
 }  // namespace phi
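InitWithDataIndex fills each thread's registers with the global indices of the elements it owns, which is useful for index-emitting kernels such as arange or argmax-style ops. A minimal caller sketch for the GPU variant (kernel name and bounds handling are assumptions for illustration, with the kps alias from the earlier examples):

__global__ void WriteIndices(int64_t* out, int n) {
  constexpr int kNX = 4;  // indices generated per thread
  int64_t idx[kNX];
  int block_offset = blockIdx.x * blockDim.x * kNX;
  kps::InitWithDataIndex<int64_t, kNX, 1, 1>(idx, block_offset);
  int thread_offset = block_offset + threadIdx.x * kNX;
#pragma unroll
  for (int i = 0; i < kNX; ++i) {
    if (thread_offset + i < n) {
      out[thread_offset + i] = idx[i];
    }
  }
}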