PaddlePaddle / Paddle — commit 452bcbe2 (unverified)

[Pten] Move kernel_primitives lib to Pten directory (#39169)

* move kernel_primitives
* use pten's errors

Authored by YuanRisheng on Jan 26, 2022; committed via GitHub on Jan 26, 2022.
Parent: bd5c962d
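The compatibility trick that makes this move non-breaking is a namespace alias: the old paddle::operators::kernel_primitives namespace becomes an alias for pten::kps, so fluid-era call sites keep compiling unchanged. A minimal, self-contained sketch of the pattern (not part of the commit; AddFunctor mirrors a real functor name from this commit, everything else is illustrative):

#include <iostream>

namespace pten {
namespace kps {
// Stand-in for a functor that now lives in pten::kps.
template <typename T>
struct AddFunctor {
  T operator()(const T a, const T b) const { return b + a; }
};
}  // namespace kps
}  // namespace pten

namespace paddle {
namespace operators {
// The old namespace now just forwards to the new home.
namespace kernel_primitives = pten::kps;
}  // namespace operators
}  // namespace paddle

int main() {
  // An unmodified fluid-era call site still resolves through the alias.
  paddle::operators::kernel_primitives::AddFunctor<int> add;
  std::cout << add(2, 3) << std::endl;  // prints 5
  return 0;
}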
Changes: 11 — showing 11 changed files with 578 additions and 403 deletions (+578 −403).
paddle/fluid/operators/kernel_primitives/functor_primitives.h    +2    −233
paddle/fluid/operators/kernel_primitives/kernel_primitives.h     +2    −53
paddle/pten/kernels/funcs/elementwise_base.h                     +2    −2
paddle/pten/kernels/gpu/reduce.h                                 +2    −2
paddle/pten/kernels/primitive/compute_primitives.h               +51   −24
paddle/pten/kernels/primitive/compute_primitives_xpu2.h          +51   −23
paddle/pten/kernels/primitive/datamover_primitives.h             +76   −33
paddle/pten/kernels/primitive/datamover_primitives_xpu2.h        +57   −24
paddle/pten/kernels/primitive/functor_primitives.h               +255  −0
paddle/pten/kernels/primitive/helper_primitives.h                +11   −9
paddle/pten/kernels/primitive/kernel_primitives.h                +69   −0
paddle/fluid/operators/kernel_primitives/functor_primitives.h — view file @ 452bcbe2

@@ -13,241 +13,10 @@
 // limitations under the License.
 
 #pragma once
-#include "paddle/fluid/operators/amp/fp16_type_traits.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/float16.h"
-#include "paddle/pten/kernels/funcs/eigen/extensions.h"
+#include "paddle/pten/kernels/primitive/functor_primitives.h"
 
 namespace paddle {
 namespace operators {
-namespace kernel_primitives {
-namespace details {
-
-static __device__ __forceinline__ platform::float16 Exp(platform::float16 x) {
-  return ::Eigen::numext::exp(x);
-}
-
-static __device__ __forceinline__ float Exp(float x) { return expf(x); }
-
-static __device__ __forceinline__ double Exp(double x) { return exp(x); }
-
-static __device__ __forceinline__ platform::float16 Log(platform::float16 x) {
-  return ::Eigen::numext::log(x);
-}
-
-static __device__ __forceinline__ float Log(float x) { return logf(x); }
-
-static __device__ __forceinline__ double Log(double x) { return log(x); }
-
-}  // namespace details
-
-/******************************** Unary Functor *******************************/
-
-/**
- * @brief Default unary exp functor
- */
-template <typename Tx, typename Ty = Tx>
-struct ExpFunctor {
-  HOSTDEVICE inline ExpFunctor() {}
-  HOSTDEVICE explicit inline ExpFunctor(int n) {}
-  HOSTDEVICE inline Ty operator()(const Tx x) const {
-    return static_cast<Ty>(details::Exp(x));
-  }
-};
-
-/**
- * @brief Default unary identity functor
- */
-template <typename Tx, typename Ty = Tx>
-struct IdentityFunctor {
-  HOSTDEVICE inline IdentityFunctor() {}
-  HOSTDEVICE explicit inline IdentityFunctor(int n) {}
-  HOSTDEVICE inline Ty operator()(const Tx x) const {
-    return static_cast<Ty>(x);
-  }
-};
-
-/**
- * @brief Default unary div functor. Divide by a constant
- */
-template <typename Tx, typename Ty = Tx>
-struct DivideFunctor {
- private:
-  using MPType = typename ::paddle::operators::details::MPTypeTrait<Tx>::Type;
-
- public:
-  HOSTDEVICE inline DivideFunctor() { n_inv = static_cast<MPType>(1.0f); }
-  HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((MPType)(1.0 / n)) {}
-  HOSTDEVICE inline Ty operator()(const Tx x) const {
-    return static_cast<Ty>(static_cast<MPType>(x) * n_inv);
-  }
-
- private:
-  MPType n_inv;
-};
-
-/**
- * @brief Default inverse functor
- */
-template <typename Tx, typename Ty = Tx>
-struct InverseFunctor {
-  HOSTDEVICE inline InverseFunctor() {}
-  HOSTDEVICE explicit inline InverseFunctor(int n) {}
-  HOSTDEVICE inline Ty operator()(const Tx x) const {
-    return static_cast<Ty>(-x);
-  }
-};
-
-/**
- * @brief Default unary square functor
- */
-template <typename Tx, typename Ty = Tx>
-struct SquareFunctor {
-  HOSTDEVICE inline SquareFunctor() {}
-  HOSTDEVICE explicit inline SquareFunctor(int n) {}
-  HOSTDEVICE inline Ty operator()(const Tx x) const {
-    return static_cast<Ty>(x) * static_cast<Ty>(x);
-  }
-};
-
-/****************************** Binary Functor ********************************/
-
-/**
- * @brief Default binary min functor
- */
-template <typename T>
-struct MinFunctor {
-  inline T initial() { return static_cast<T>(std::numeric_limits<T>::max()); }
-  __device__ __forceinline__ T operator()(const T a, const T b) const {
-    return (b < a) ? b : a;
-  }
-};
-
-/**
- * @brief Default binary max functor
- */
-template <typename T>
-struct MaxFunctor {
-  inline T initial() {
-    return static_cast<T>(std::numeric_limits<T>::lowest());
-  }
-  __device__ __forceinline__ T operator()(const T a, const T b) const {
-    return (b > a) ? b : a;
-  }
-};
-
-/**
- * @brief Default binary add functor
- */
-template <typename T>
-struct AddFunctor {
-  inline T initial() { return static_cast<T>(0.0f); }
-  __device__ __forceinline__ T operator()(const T a, const T b) const {
-    return b + a;
-  }
-};
-
-/**
- * @brief Default binary add functor
- */
-template <typename T>
-struct MulFunctor {
-  inline T initial() { return static_cast<T>(1.0f); }
-  __device__ __forceinline__ T operator()(const T a, const T b) const {
-    return b * a;
-  }
-};
-
-/**
- * @brief Default binary logic or functor
- */
-template <typename T>
-struct LogicalOrFunctor {
-  inline T initial() { return static_cast<T>(false); }
-  __device__ __forceinline__ T operator()(const T a, const T b) const {
-    return b || a;
-  }
-};
-
-/**
- * @brief Default binary logic and functor
- */
-template <typename T>
-struct LogicalAndFunctor {
-  inline T initial() { return static_cast<T>(true); }
-  __device__ __forceinline__ T operator()(const T a, const T b) const {
-    return b && a;
-  }
-};
-
-/**
- * @brief Default binary sub functor
- */
-template <typename T>
-struct SubFunctor {
-  inline T initial() { return static_cast<T>(0.0f); }
-  inline HOSTDEVICE T operator()(const T a, const T b) const { return a - b; }
-};
-
-/**
- * @brief Default binary div functor
- */
-template <typename T, typename Enable = void>
-struct DivFunctor {
-  inline T initial() { return static_cast<T>(1.0f); }
-  inline HOSTDEVICE T operator()(const T a, const T b) const { return a / b; }
-};
-
-template <typename T>
-struct DivFunctor<T,
-                  typename std::enable_if<std::is_integral<T>::value>::type> {
-  inline T initial() { return static_cast<T>(1.0f); }
-  inline HOSTDEVICE T operator()(const T a, const T b) const {
-    // For int32/int64, need to check whether the divison is zero.
-    PADDLE_ENFORCE_NE(b, 0,
-                      platform::errors::InvalidArgument(
-                          "Integer division by zero encountered "
-                          "in (floor) divide. Please check the input value."));
-    return a / b;
-  }
-};
-
-/**
- * @brief Default binary floor divide functor
- */
-template <typename T>
-struct FloorDivFunctor {
-  inline T initial() { return static_cast<T>(1.0f); }
-  inline HOSTDEVICE T operator()(const T a, const T b) const {
-    PADDLE_ENFORCE_NE(b, 0,
-                      platform::errors::InvalidArgument(
-                          "Integer division by zero encountered "
-                          "in (floor) divide. Please check the input value."));
-    return static_cast<T>(std::trunc(a / b));
-  }
-};
-
-}  // namespace kernel_primitives
+namespace kernel_primitives = pten::kps;
 }  // namespace operators
 }  // namespace paddle
paddle/fluid/operators/kernel_primitives/kernel_primitives.h — view file @ 452bcbe2

@@ -13,61 +13,10 @@
 // limitations under the License.
 
 #pragma once
-#include "paddle/fluid/operators/kernel_primitives/helper_primitives.h"
-#ifdef PADDLE_WITH_XPU2
-#include "paddle/fluid/operators/kernel_primitives/compute_primitives_xpu2.h"
-#include "paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h"
-#include "paddle/fluid/operators/kernel_primitives/functor_primitives_xpu2.h"
-
-#define KPStream XPUStream
-#define KPDevice paddle::platform::XPUDeviceContext
-#define _ptr_ _global_ptr_
-#define __forceinline__ __inline__
-#define __restrict__
-
-#define THREAD_ID_X core_id()
-#define THREAD_ID_Y 0
-#define THREAD_ID_Z 0
-
-#define BLOCK_NUM_X core_num()
-#define BLOCK_NUM_Y 0
-#define BLOCK_NUM_Z 0
-
-#define BLOCK_ID_X cluster_id()
-#define BLOCK_ID_Y 0
-#define BLOCK_ID_Z 0
-
-#define GRID_NUM_X cluster_num()
-#define GRID_NUM_Y 0
-#define GRID_NUM_Z 0
-#else
-#include "paddle/fluid/operators/kernel_primitives/compute_primitives.h"
-#include "paddle/fluid/operators/kernel_primitives/datamover_primitives.h"
-#include "paddle/fluid/operators/kernel_primitives/functor_primitives.h"
-
-#define KPStream gpuStream_t
-#define KPDevice paddle::platform::CUDADeviceContext
-#define _ptr_
-
-#define THREAD_ID_X threadIdx.x
-#define THREAD_ID_Y threadIdx.y
-#define THREAD_ID_Z threadIdx.z
-
-#define BLOCK_NUM_X blockDim.x
-#define BLOCK_NUM_Y blockDim.y
-#define BLOCK_NUM_Z blockDim.z
-
-#define BLOCK_ID_X blockIdx.x
-#define BLOCK_ID_Y blockIdx.y
-#define BLOCK_ID_Z blockIdx.z
-
-#define GRID_NUM_X gridDim.x
-#define GRID_NUM_Y gridDim.y
-#define GRID_NUM_Z gridDim.z
-#endif
+#include "paddle/pten/kernels/primitive/kernel_primitives.h"
 
 namespace paddle {
 namespace operators {
-namespace kernel_primitives {}
+namespace kernel_primitives = pten::kps;
 }
 }
paddle/pten/kernels/funcs/elementwise_base.h — view file @ 452bcbe2

@@ -22,12 +22,12 @@ limitations under the License. */
 #include "paddle/pten/kernels/empty_kernel.h"
 
 #if defined(__NVCC__) || defined(__HIPCC__)
-#include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h"
 #include "paddle/fluid/platform/aligned_vector.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/function_traits.h"
+#include "paddle/pten/kernels/primitive/kernel_primitives.h"
 
-namespace kps = paddle::operators::kernel_primitives;
+namespace kps = pten::kps;
 
 #endif
paddle/pten/kernels/gpu/reduce.h — view file @ 452bcbe2

@@ -34,13 +34,13 @@ namespace cub = hipcub;
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
-#include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h"
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/fast_divmod.h"
 #include "paddle/fluid/string/string_helper.h"
 #include "paddle/pten/core/array.h"
+#include "paddle/pten/core/enforce.h"
+#include "paddle/pten/kernels/primitive/kernel_primitives.h"
 
 #include "paddle/pten/api/ext/dispatch.h"
 #include "paddle/pten/backends/gpu/gpu_context.h"

@@ -51,7 +51,7 @@ namespace cub = hipcub;
 #define REDUCE_SPLIT_BOUNDARY 512
 #define REDUCE_VEC_SIZE 4
 
-namespace kps = paddle::operators::kernel_primitives;
+namespace kps = pten::kps;
 
 namespace pten {
 namespace kernels {
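The reduce path relies on a small protocol shared by the binary functors in this commit: initial() returns the reduction's identity value and operator() folds two elements. A host-side sketch of a reducer consuming that protocol (not part of the commit; MaxFunctor follows the definition shown above, while ReduceAll is a hypothetical serial stand-in for the device-side reduction):

#include <iostream>
#include <limits>
#include <vector>

// Same shape as the functors in functor_primitives.h: an initial() that
// returns the reduction identity, plus a binary operator().
template <typename T>
struct MaxFunctor {
  inline T initial() { return std::numeric_limits<T>::lowest(); }
  T operator()(const T a, const T b) const { return (b > a) ? b : a; }
};

// Hypothetical serial stand-in for the device-side reduce: seed the
// accumulator with the functor's identity, then fold every element in.
template <typename T, typename ReduceOp>
T ReduceAll(const std::vector<T>& data, ReduceOp reducer) {
  T acc = reducer.initial();
  for (const T& v : data) acc = reducer(acc, v);
  return acc;
}

int main() {
  std::vector<float> xs = {3.0f, -1.5f, 7.25f, 0.0f};
  std::cout << ReduceAll(xs, MaxFunctor<float>()) << std::endl;  // 7.25
  return 0;
}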
paddle/{fluid/operators/kernel_primitives → pten/kernels/primitive}/compute_primitives.h — view file @ 452bcbe2

@@ -22,11 +22,10 @@
 #endif
 
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
-#include "paddle/fluid/platform/float16.h"
+#include "paddle/pten/common/float16.h"
 
-namespace paddle {
-namespace operators {
-namespace kernel_primitives {
+namespace pten {
+namespace kps {
 namespace details {
 
 #ifdef __HIPCC__

@@ -48,7 +47,7 @@ class MPTypeTrait {
 };
 
 template <>
-class MPTypeTrait<platform::float16> {
+class MPTypeTrait<pten::dtype::float16> {
  public:
   using Type = float;
 };

@@ -158,9 +157,14 @@ __device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) {
 * in: The register pointer of in, the size is NX * NY.
 * compute: Compute function which was declared like OpFunc<InT, OutT>().
 */
-template <typename InT, typename OutT, int NX, int NY, int BlockSize,
-          class OpFunc>
-__device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in,
-                                                 OpFunc compute) {
+template <typename InT,
+          typename OutT,
+          int NX,
+          int NY,
+          int BlockSize,
+          class OpFunc>
+__device__ __forceinline__ void ElementwiseUnary(OutT* out,
+                                                 const InT* in,
+                                                 OpFunc compute) {
 #pragma unroll
   for (int idx = 0; idx < NX * NY; idx++) {

@@ -193,9 +197,14 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in,
 * in2: The register pointer of second input, size is NX * NY.
 * compute: Compute function which was declared like OpFunc<InT>().
 */
-template <typename InT, typename OutT, int NX, int NY, int BlockSize,
-          class OpFunc>
-__device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1,
-                                                  const InT* in2,
-                                                  OpFunc compute) {
+template <typename InT,
+          typename OutT,
+          int NX,
+          int NY,
+          int BlockSize,
+          class OpFunc>
+__device__ __forceinline__ void ElementwiseBinary(OutT* out,
+                                                  const InT* in1,
+                                                  const InT* in2,
+                                                  OpFunc compute) {
 #pragma unroll

@@ -231,12 +240,14 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1,
 * in3: The register pointer of third input, size is NX * NY.
 * compute: Compute function which was declared like OpFunc<InT>().
 */
-template <typename InT, typename OutT, int NX, int NY, int BlockSize,
-          class OpFunc>
-__device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1,
-                                                   const InT* in2,
-                                                   const InT* in3,
-                                                   OpFunc compute) {
+template <typename InT,
+          typename OutT,
+          int NX,
+          int NY,
+          int BlockSize,
+          class OpFunc>
+__device__ __forceinline__ void ElementwiseTernary(
+    OutT* out, const InT* in1, const InT* in2, const InT* in3, OpFunc compute) {
 #pragma unroll
   for (int idx = 0; idx < NX * NY; ++idx) {
     out[idx] = static_cast<OutT>(compute(in1[idx], in2[idx], in3[idx]));

@@ -268,9 +279,15 @@ __device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1,
 * ins: A pointers of array consisting of multiple inputs.
 * compute: Compute function which was declared like OpFunc<InT>().
 */
-template <typename InT, typename OutT, int NX, int NY, int BlockSize,
-          int Arity, class OpFunc>
-__device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY],
-                                               OpFunc compute) {
+template <typename InT,
+          typename OutT,
+          int NX,
+          int NY,
+          int BlockSize,
+          int Arity,
+          class OpFunc>
+__device__ __forceinline__ void ElementwiseAny(OutT* out,
+                                               InT (*ins)[NX * NY],
+                                               OpFunc compute) {
   InT args[Arity];
 #pragma unroll

@@ -309,10 +326,16 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY],
 * in2: The register pointer of second input, size is NX * NY.
 * compute: Compute function which was declared like OpFunc<InT, OutT>().
 */
-template <typename InT, typename OutT, int NX, int NY, int BlockSize,
-          class OpFunc>
-__device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1,
-                                            const InT* in2, OpFunc compute) {
+template <typename InT,
+          typename OutT,
+          int NX,
+          int NY,
+          int BlockSize,
+          class OpFunc>
+__device__ __forceinline__ void CycleBinary(OutT* out,
+                                            const InT* in1,
+                                            const InT* in2,
+                                            OpFunc compute) {
 #pragma unroll
   for (int idx = 0; idx < NX; idx++) {
 #pragma unroll

@@ -350,9 +373,14 @@ __device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1,
 * reducer: Compute function which was declared like ReduceFunctor<InT>().
 * reduce_last_dim: if the last dim gets involved in reduction.
 */
-template <typename T, int NX, int NY, int BlockSize, class ReduceFunctor,
-          details::ReduceMode Mode>
-__device__ __forceinline__ void Reduce(T* out, const T* in,
-                                       ReduceFunctor reducer,
-                                       bool reduce_last_dim) {
+template <typename T,
+          int NX,
+          int NY,
+          int BlockSize,
+          class ReduceFunctor,
+          details::ReduceMode Mode>
+__device__ __forceinline__ void Reduce(T* out,
+                                       const T* in,
+                                       ReduceFunctor reducer,
+                                       bool reduce_last_dim) {
   int block_index = blockDim.y;

@@ -386,6 +414,5 @@ __device__ __forceinline__ void Reduce(T* out, const T* in,
     }
   }
 }
 
-}  // namespace kernel_primitives
-}  // namespace operators
-}  // namespace paddle
+}  // namespace kps
+}  // namespace pten
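The ElementwiseUnary hunks above only reflow the signatures; the underlying pattern is unchanged: each thread owns an NX * NY tile held in registers and applies the functor element by element under an unrolled loop. A host-side sketch of that pattern (not the commit's device code; plain arrays stand in for registers):

#include <cmath>
#include <cstdio>

// Host-side sketch of the ElementwiseUnary pattern: apply the functor to an
// NX * NY tile. On device this loop carries #pragma unroll; here it is plain C++.
template <typename InT, typename OutT, int NX, int NY, class OpFunc>
void ElementwiseUnary(OutT* out, const InT* in, OpFunc compute) {
  for (int idx = 0; idx < NX * NY; idx++) {
    out[idx] = static_cast<OutT>(compute(in[idx]));
  }
}

struct SquareFunctor {  // mirrors the unary-functor shape from this commit
  float operator()(const float x) const { return x * x; }
};

int main() {
  float in[4] = {1.f, 2.f, 3.f, 4.f};
  float out[4];
  ElementwiseUnary<float, float, 4, 1>(out, in, SquareFunctor());
  for (float v : out) std::printf("%g ", v);  // 1 4 9 16
  return 0;
}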
paddle/{fluid/operators/kernel_primitives → pten/kernels/primitive}/compute_primitives_xpu2.h — view file @ 452bcbe2

@@ -13,13 +13,13 @@
 // limitations under the License.
 
 #pragma once
+#include "paddle/pten/common/float16.h"
 #include "xpu/kernel/cluster_header.h"
 #include "xpu/kernel/debug.h"
 #include "xpu/kernel/math.h"
 
-namespace paddle {
-namespace operators {
-namespace kernel_primitives {
+namespace pten {
+namespace kps {
 namespace details {
 
 // kGlobalMode: block reduce, each block gets an output;

@@ -33,7 +33,7 @@ class MPTypeTrait {
 };
 
 template <>
-class MPTypeTrait<platform::float16> {
+class MPTypeTrait<pten::dtype::float16> {
  public:
   using Type = float;
 };

@@ -102,9 +102,14 @@ __device__ void BlockXReduce(T* data, OpFunc reducer) {
 * in: The register pointer of in, the size is NX * NY.
 * compute: Compute function which was declared like OpFunc<InT, OutT>().
 */
-template <typename InT, typename OutT, int NX, int NY, int BlockSize,
-          class OpFunc>
-__device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in,
-                                                 OpFunc compute) {
+template <typename InT,
+          typename OutT,
+          int NX,
+          int NY,
+          int BlockSize,
+          class OpFunc>
+__device__ __forceinline__ void ElementwiseUnary(OutT* out,
+                                                 const InT* in,
+                                                 OpFunc compute) {
 #pragma unroll
   for (int idx = 0; idx < NX * NY; idx++) {

@@ -137,9 +142,14 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in,
 * in2: The register pointer of second input, size is NX * NY.
 * compute: Compute function which was declared like OpFunc<InT>().
 */
-template <typename InT, typename OutT, int NX, int NY, int BlockSize,
-          class OpFunc>
-__device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1,
-                                                  const InT* in2,
-                                                  OpFunc compute) {
+template <typename InT,
+          typename OutT,
+          int NX,
+          int NY,
+          int BlockSize,
+          class OpFunc>
+__device__ __forceinline__ void ElementwiseBinary(OutT* out,
+                                                  const InT* in1,
+                                                  const InT* in2,
+                                                  OpFunc compute) {
 #pragma unroll

@@ -175,12 +185,14 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1,
 * in3: The register pointer of third input, size is NX * NY.
 * compute: Compute function which was declared like OpFunc<InT>().
 */
-template <typename InT, typename OutT, int NX, int NY, int BlockSize,
-          class OpFunc>
-__device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1,
-                                                   const InT* in2,
-                                                   const InT* in3,
-                                                   OpFunc compute) {
+template <typename InT,
+          typename OutT,
+          int NX,
+          int NY,
+          int BlockSize,
+          class OpFunc>
+__device__ __forceinline__ void ElementwiseTernary(
+    OutT* out, const InT* in1, const InT* in2, const InT* in3, OpFunc compute) {
 #pragma unroll
   for (int idx = 0; idx < NX * NY; ++idx) {
     out[idx] = static_cast<OutT>(compute(in1[idx], in2[idx], in3[idx]));

@@ -212,9 +224,15 @@ __device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1,
 * ins: A pointers of array consisting of multiple inputs.
 * compute: Compute function which was declared like OpFunc<InT>().
 */
-template <typename InT, typename OutT, int NX, int NY, int BlockSize,
-          int Arity, class OpFunc>
-__device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY],
-                                               OpFunc compute) {
+template <typename InT,
+          typename OutT,
+          int NX,
+          int NY,
+          int BlockSize,
+          int Arity,
+          class OpFunc>
+__device__ __forceinline__ void ElementwiseAny(OutT* out,
+                                               InT (*ins)[NX * NY],
+                                               OpFunc compute) {
   __local__ InT args[Arity];
 #pragma unroll

@@ -253,10 +271,16 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY],
 * in2: The register pointer of second input, size is NX * NY.
 * compute: Compute function which was declared like OpFunc<InT, OutT>().
 */
-template <typename InT, typename OutT, int NX, int NY, int BlockSize,
-          class OpFunc>
-__device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1,
-                                            const InT* in2, OpFunc compute) {
+template <typename InT,
+          typename OutT,
+          int NX,
+          int NY,
+          int BlockSize,
+          class OpFunc>
+__device__ __forceinline__ void CycleBinary(OutT* out,
+                                            const InT* in1,
+                                            const InT* in2,
+                                            OpFunc compute) {
 #pragma unroll
   for (int idx = 0; idx < NX; idx++) {
 #pragma unroll

@@ -294,9 +318,14 @@ __device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1,
 * reducer: Compute function which was declared like ReduceFunctor<InT>().
 * reduce_last_dim: if the last dim gets involved in reduction.
 */
-template <typename T, int NX, int NY, int BlockSize, class ReduceFunctor,
-          details::ReduceMode Mode>
-__device__ __forceinline__ void Reduce(T* out, const T* in,
-                                       ReduceFunctor reducer,
-                                       bool reduce_last_dim) {
+template <typename T,
+          int NX,
+          int NY,
+          int BlockSize,
+          class ReduceFunctor,
+          details::ReduceMode Mode>
+__device__ __forceinline__ void Reduce(T* out,
+                                       const T* in,
+                                       ReduceFunctor reducer,
+                                       bool reduce_last_dim) {
   if (Mode == kGlobalMode) {

@@ -319,6 +348,5 @@ __device__ __forceinline__ void Reduce(T* out, const T* in,
     }
   }
 }
 
-}  // namespace kernel_primitives
-}  // namespace operators
-}  // namespace paddle
+}  // namespace kps
+}  // namespace pten
paddle/{fluid/operators/kernel_primitives → pten/kernels/primitive}/datamover_primitives.h — view file @ 452bcbe2

@@ -22,9 +22,8 @@
 #endif
 
 #include "paddle/pten/core/ddim.h"
 
-namespace paddle {
-namespace operators {
-namespace kernel_primitives {
+namespace pten {
+namespace kps {
 namespace details {
 
 #define INT_BITS 32

@@ -103,11 +102,12 @@ struct BroadcastConfig {
     strides_in.resize(dim_size, 1);
     for (int i = 0; i < dim_size; ++i) {
       strides_in[i] = in_dims[i] == 1 ? 0 : strides_in[i];
-      strides_in[i] = (i != 0 && strides_in[i] != 0)
-                          ? std::accumulate(in_dims.begin(), in_dims.begin() + i,
-                                            1, std::multiplies<int64_t>())
-                          : strides_in[i];
+      strides_in[i] =
+          (i != 0 && strides_in[i] != 0)
+              ? std::accumulate(in_dims.begin(),
+                                in_dims.begin() + i,
+                                1,
+                                std::multiplies<int64_t>())
+              : strides_in[i];
     }
 
     memcpy(strides, strides_in.data(), kDims * sizeof(uint32_t));

@@ -144,11 +144,18 @@ struct BroadcastConfig {
 * stride_nx: Each read one element stride stride_nx elements in the last dim.
 * stride_ny: Each read one element stride stride_ny elements in the first dim.
 */
-template <typename Tx, typename Ty, int NX, int NY, int BlockSize,
-          bool IsBoundary = false>
-__device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src,
-                                         int size_nx, int size_ny,
-                                         int stride_nx, int stride_ny) {
+template <typename Tx,
+          typename Ty,
+          int NX,
+          int NY,
+          int BlockSize,
+          bool IsBoundary = false>
+__device__ __forceinline__ void ReadData(Ty* dst,
+                                         const Tx* __restrict__ src,
+                                         int size_nx,
+                                         int size_ny,
+                                         int stride_nx,
+                                         int stride_ny) {
   int thread_offset = threadIdx.x;
   int left_size_nx = size_nx - thread_offset;

@@ -244,7 +251,8 @@ __device__ __forceinline__ void Init(T* dst, T init_data) {
 * size: The current block needs to load size data continuously.
 */
 template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
-__device__ __forceinline__ void ReadData(T* dst, const T* __restrict__ src,
-                                         int num) {
+__device__ __forceinline__ void ReadData(T* dst,
+                                         const T* __restrict__ src,
+                                         int num) {
   if (IsBoundary) {  // blockDim.x * NX > num
     int thread_offset = threadIdx.x * NX;

@@ -299,11 +307,19 @@ __device__ __forceinline__ void ReadData(T* dst, const T* __restrict__ src,
 * stride_nx: Each read one element stride stride_nx elements in the last dim.
 * stride_ny: Each read one element stride stride_ny elements in the first dim.
 */
-template <typename T, int NX, int NY, int BlockSize, int Rank,
-          bool IsBoundary = false>
-__device__ __forceinline__ void ReadDataBc(
-    T* dst, const T* __restrict__ src, uint32_t block_offset,
-    details::BroadcastConfig<Rank> config, int total_num_output, int stride_nx,
-    int stride_ny) {
+template <typename T,
+          int NX,
+          int NY,
+          int BlockSize,
+          int Rank,
+          bool IsBoundary = false>
+__device__ __forceinline__ void ReadDataBc(T* dst,
+                                           const T* __restrict__ src,
+                                           uint32_t block_offset,
+                                           details::BroadcastConfig<Rank> config,
+                                           int total_num_output,
+                                           int stride_nx,
+                                           int stride_ny) {
   uint32_t thread_offset = block_offset + threadIdx.x;
   uint32_t index_src = 0;

@@ -361,12 +377,25 @@ __device__ __forceinline__ void ReadDataBc(
 * reduce_last_dim: Used to indicate whether the dimension of reduce contains
 * the lowest dimension.
 */
-template <typename Tx, typename Ty, int NX, int NY, int BlockSize, int Rank,
-          typename IndexCal, typename Functor, bool IsBoundary = false>
-__device__ __forceinline__ void ReadDataReduce(
-    Ty* dst, const Tx* __restrict__ src, int block_offset,
-    const IndexCal& index_cal, int size_nx, int size_ny, int stride_nx,
-    int stride_ny, Functor func, bool reduce_last_dim) {
+template <typename Tx,
+          typename Ty,
+          int NX,
+          int NY,
+          int BlockSize,
+          int Rank,
+          typename IndexCal,
+          typename Functor,
+          bool IsBoundary = false>
+__device__ __forceinline__ void ReadDataReduce(Ty* dst,
+                                               const Tx* __restrict__ src,
+                                               int block_offset,
+                                               const IndexCal& index_cal,
+                                               int size_nx,
+                                               int size_ny,
+                                               int stride_nx,
+                                               int stride_ny,
+                                               Functor func,
+                                               bool reduce_last_dim) {
   int thread_offset = 0;
   int left_idx = 0;
   if (reduce_last_dim) {

@@ -430,7 +459,8 @@ __device__ __forceinline__ void ReadDataReduce(
 * size: The current block needs to load size elements continuously.
 */
 template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
-__device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src,
-                                          int num) {
+__device__ __forceinline__ void WriteData(T* dst,
+                                          T* __restrict__ src,
+                                          int num) {
   if (IsBoundary) {
     int thread_offset = threadIdx.x * NX;

@@ -483,11 +513,18 @@ __device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src,
 * stride_nx: Each read one element stride stride_nx elements in the last dim.
 * stride_ny: Each read one element stride stride_ny elements in the first dim.
 */
-template <typename Tx, typename Ty, int NX, int NY, int BlockSize,
-          bool IsBoundary = false>
-__device__ __forceinline__ void WriteData(Ty* dst, const Tx* __restrict__ src,
-                                          int size_nx, int size_ny,
-                                          int stride_nx, int stride_ny) {
+template <typename Tx,
+          typename Ty,
+          int NX,
+          int NY,
+          int BlockSize,
+          bool IsBoundary = false>
+__device__ __forceinline__ void WriteData(Ty* dst,
+                                          const Tx* __restrict__ src,
+                                          int size_nx,
+                                          int size_ny,
+                                          int stride_nx,
+                                          int stride_ny) {
   int thread_offset = threadIdx.x;
   int left_size_nx = size_nx - thread_offset;

@@ -589,11 +626,18 @@ __device__ __forceinline__ void Init(T* dst, T* init_data, int num) {
 * coordinate mapping relationship between output data and input data.
 * total_num_output: Total number of original output.
 */
-template <typename T, int NX, int NY, int BlockSize, int Rank,
-          bool IsBoundary = false>
-__device__ __forceinline__ void ReadDataBc(
-    T* dst, const T* __restrict__ src, uint32_t block_offset,
-    details::BroadcastConfig<Rank> config, int total_num_output) {
+template <typename T,
+          int NX,
+          int NY,
+          int BlockSize,
+          int Rank,
+          bool IsBoundary = false>
+__device__ __forceinline__ void ReadDataBc(T* dst,
+                                           const T* __restrict__ src,
+                                           uint32_t block_offset,
+                                           details::BroadcastConfig<Rank> config,
+                                           int total_num_output) {
   uint32_t thread_offset = block_offset + threadIdx.x * NX;
   uint32_t index_src = 0;

@@ -616,6 +660,5 @@ __device__ __forceinline__ void ReadDataBc(
 }
 }
 
-}  // namespace kernel_primitives
-}  // namespace operators
-}  // namespace paddle
+}  // namespace kps
+}  // namespace pten
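The BroadcastConfig hunk above only reflows the stride computation, but the rule it encodes is worth seeing in isolation: a size-1 input dimension is broadcast and so advances nothing (stride 0), while any other dimension i strides by the product of dimensions 0..i-1, exactly the std::accumulate call in the hunk. A host-side sketch of that rule (not part of the commit; the function name is illustrative):

#include <cstdio>
#include <functional>
#include <numeric>
#include <vector>

// Compute per-dimension input strides for broadcasting, mirroring the
// loop inside BroadcastConfig's constructor shown in the hunk above.
std::vector<int64_t> BroadcastStrides(const std::vector<int>& in_dims) {
  int dim_size = static_cast<int>(in_dims.size());
  std::vector<int64_t> strides_in(dim_size, 1);
  for (int i = 0; i < dim_size; ++i) {
    strides_in[i] = in_dims[i] == 1 ? 0 : strides_in[i];  // broadcast dim
    strides_in[i] = (i != 0 && strides_in[i] != 0)
                        ? std::accumulate(in_dims.begin(),
                                          in_dims.begin() + i,
                                          int64_t{1},
                                          std::multiplies<int64_t>())
                        : strides_in[i];
  }
  return strides_in;
}

int main() {
  // Broadcasting a [4, 1, 3] input: the middle dim repeats, so stride 0.
  for (int64_t s : BroadcastStrides({4, 1, 3}))
    std::printf("%lld ", static_cast<long long>(s));  // prints: 1 0 4
  return 0;
}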
paddle/{fluid/operators/kernel_primitives → pten/kernels/primitive}/datamover_primitives_xpu2.h — view file @ 452bcbe2

@@ -17,9 +17,8 @@
 #include "xpu/kernel/debug.h"
 #include "xpu/kernel/math.h"
 
-namespace paddle {
-namespace operators {
-namespace kernel_primitives {
+namespace pten {
+namespace kps {
 namespace details {
 
 template <typename T, int VecSize>

@@ -105,10 +104,17 @@ struct BroadcastConfig {
 * stride_nx: Each read one element stride stride_nx elements in the last dim.
 * stride_ny: Each read one element stride stride_ny elements in the first dim.
 */
-template <typename Tx, typename Ty, int NX, int NY, int BlockSize,
-          bool IsBoundary = false>
-__device__ __inline__ void ReadData(Ty* dst, const Tx _global_ptr_* src,
-                                    int size_nx, int size_ny, int stride_nx,
-                                    int stride_ny) {
+template <typename Tx,
+          typename Ty,
+          int NX,
+          int NY,
+          int BlockSize,
+          bool IsBoundary = false>
+__device__ __inline__ void ReadData(Ty* dst,
+                                    const Tx _global_ptr_* src,
+                                    int size_nx,
+                                    int size_ny,
+                                    int stride_nx,
+                                    int stride_ny) {
   int thread_offset = core_id();
   int left_size_nx = size_nx - thread_offset;

@@ -205,7 +211,8 @@ __device__ __inline__ void Init(T* dst, T init_data) {
 * size: The current block needs to load size data continuously.
 */
 template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
-__device__ __inline__ void ReadData(T* dst, const T _global_ptr_* src,
-                                    int num) {
+__device__ __inline__ void ReadData(T* dst,
+                                    const T _global_ptr_* src,
+                                    int num) {
   int thread_offset = core_id() * NX;
   __local__ T in_temp[1];

@@ -247,12 +254,18 @@ __device__ __inline__ void ReadData(T* dst, const T _global_ptr_* src,
 * stride_nx: Each read one element stride stride_nx elements in the last dim.
 * stride_ny: Each read one element stride stride_ny elements in the first dim.
 */
-template <typename T, int NX, int NY, int BlockSize, int Rank,
-          bool IsBoundary = false>
-__device__ __inline__ void ReadDataBc(T* dst, const T _global_ptr_* src,
-                                      uint32_t block_offset,
-                                      details::BroadcastConfig<Rank> config,
-                                      int total_num_output, int stride_nx,
-                                      int stride_ny) {
+template <typename T,
+          int NX,
+          int NY,
+          int BlockSize,
+          int Rank,
+          bool IsBoundary = false>
+__device__ __inline__ void ReadDataBc(T* dst,
+                                      const T _global_ptr_* src,
+                                      uint32_t block_offset,
+                                      details::BroadcastConfig<Rank> config,
+                                      int total_num_output,
+                                      int stride_nx,
+                                      int stride_ny) {
   uint32_t thread_offset = block_offset + core_id();
   uint32_t index_src = 0;

@@ -307,13 +320,21 @@ __device__ __inline__ void ReadDataBc(T* dst, const T _global_ptr_* src,
 * reduce_last_dim: Used to indicate whether the dimension of reduce contains
 * the lowest dimension.
 */
-template <typename T, int NX, int NY, int BlockSize, int Rank,
-          typename IndexCal, bool IsBoundary = false>
-__device__ __inline__ void ReadDataReduce(T* dst, const T _global_ptr_* src,
-                                          int block_offset,
-                                          const IndexCal& index_cal,
-                                          int size_nx, int size_ny,
-                                          int stride_nx, int stride_ny,
-                                          bool reduce_last_dim) {
+template <typename T,
+          int NX,
+          int NY,
+          int BlockSize,
+          int Rank,
+          typename IndexCal,
+          bool IsBoundary = false>
+__device__ __inline__ void ReadDataReduce(T* dst,
+                                          const T _global_ptr_* src,
+                                          int block_offset,
+                                          const IndexCal& index_cal,
+                                          int size_nx,
+                                          int size_ny,
+                                          int stride_nx,
+                                          int stride_ny,
+                                          bool reduce_last_dim) {
   __local__ Tx in_temp[1];
   int thread_offset = 0;

@@ -423,10 +444,17 @@ __device__ void WriteData(T _global_ptr_* dst, const T* src, int num) {
 * stride_nx: Each read one element stride stride_nx elements in the last dim.
 * stride_ny: Each read one element stride stride_ny elements in the first dim.
 */
-template <typename Tx, typename Ty, int NX, int NY, int BlockSize,
-          bool IsBoundary = false>
-__device__ __inline__ void WriteData(Ty _global_ptr_* dst, const Tx* src,
-                                     int size_nx, int size_ny, int stride_nx,
-                                     int stride_ny) {
+template <typename Tx,
+          typename Ty,
+          int NX,
+          int NY,
+          int BlockSize,
+          bool IsBoundary = false>
+__device__ __inline__ void WriteData(Ty _global_ptr_* dst,
+                                     const Tx* src,
+                                     int size_nx,
+                                     int size_ny,
+                                     int stride_nx,
+                                     int stride_ny) {
   int thread_offset = core_id();
   int left_size_nx = size_nx - thread_offset;

@@ -483,7 +511,8 @@ __device__ __inline__ void WriteData(Ty _global_ptr_* dst, const Tx* src,
       }
     }
     in_temp[0] = static_cast<Ty>(src[idx + idy * NX]);
-    LM2GM(in_temp, dst + thread_offset + idx * stride_nx + idy * stride_ny,
-          sizeof(Ty));
+    LM2GM(in_temp,
+          dst + thread_offset + idx * stride_nx + idy * stride_ny,
+          sizeof(Ty));
   }
 }

@@ -537,9 +566,14 @@ __device__ __inline__ void Init(T* dst, T* init_data, int num) {
 * coordinate mapping relationship between output data and input data.
 * total_num_output: Total number of original output.
 */
-template <typename T, int NX, int NY, int BlockSize, int Rank,
-          bool IsBoundary = false>
-__device__ __inline__ void ReadDataBc(T* dst, const T _global_ptr_* src,
-                                      uint32_t block_offset,
-                                      details::BroadcastConfig<Rank> config,
-                                      int total_num_output) {
+template <typename T,
+          int NX,
+          int NY,
+          int BlockSize,
+          int Rank,
+          bool IsBoundary = false>
+__device__ __inline__ void ReadDataBc(T* dst,
+                                      const T _global_ptr_* src,
+                                      uint32_t block_offset,
+                                      details::BroadcastConfig<Rank> config,
+                                      int total_num_output) {

@@ -562,6 +596,5 @@ __device__ __inline__ void ReadDataBc(T* dst, const T _global_ptr_* src,
 }
 }
 
-}  // namespace kernel_primitives
-}  // namespace operators
-}  // namespace paddle
+}  // namespace kps
+}  // namespace pten
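The WriteData hunk above shows the XPU staging idiom: each converted element is written into a one-element __local__ buffer, then copied out to global memory with LM2GM (local-memory-to-global-memory). A host-side sketch of that pattern (not part of the commit; the LM2GM stand-in and WriteOne helper are hypothetical, with memcpy faking the hardware copy):

#include <cstdio>
#include <cstring>

// Host-side stand-in for XPU's LM2GM(local_src, global_dst, bytes).
void LM2GM(const void* src, void* dst, size_t bytes) {
  std::memcpy(dst, src, bytes);
}

// Sketch of the staging pattern: cast one element into a local temp,
// then push exactly sizeof(Ty) bytes out to "global" memory.
template <typename Tx, typename Ty>
void WriteOne(Ty* dst, const Tx* src, int idx) {
  Ty in_temp[1];
  in_temp[0] = static_cast<Ty>(src[idx]);
  LM2GM(in_temp, dst + idx, sizeof(Ty));
}

int main() {
  double src[3] = {1.5, 2.5, 3.5};
  float dst[3] = {};
  for (int i = 0; i < 3; ++i) WriteOne(dst, src, i);
  std::printf("%g %g %g\n", dst[0], dst[1], dst[2]);  // 1.5 2.5 3.5
  return 0;
}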
paddle/pten/kernels/primitive/functor_primitives.h — new file (0 → 100644), view file @ 452bcbe2

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/pten/common/float16.h"
#include "paddle/pten/core/enforce.h"
#include "paddle/pten/kernels/funcs/eigen/extensions.h"

namespace pten {
namespace kps {
namespace details {

static __device__ __forceinline__ pten::dtype::float16 Exp(
    pten::dtype::float16 x) {
  return ::Eigen::numext::exp(x);
}

static __device__ __forceinline__ float Exp(float x) { return expf(x); }

static __device__ __forceinline__ double Exp(double x) { return exp(x); }

static __device__ __forceinline__ pten::dtype::float16 Log(
    pten::dtype::float16 x) {
  return ::Eigen::numext::log(x);
}

static __device__ __forceinline__ float Log(float x) { return logf(x); }

static __device__ __forceinline__ double Log(double x) { return log(x); }

}  // namespace details

/******************************** Unary Functor *******************************/

/**
 * @brief Default unary exp functor
 */
template <typename Tx, typename Ty = Tx>
struct ExpFunctor {
  HOSTDEVICE inline ExpFunctor() {}
  HOSTDEVICE explicit inline ExpFunctor(int n) {}
  HOSTDEVICE inline Ty operator()(const Tx x) const {
    return static_cast<Ty>(details::Exp(x));
  }
};

/**
 * @brief Default unary identity functor
 */
template <typename Tx, typename Ty = Tx>
struct IdentityFunctor {
  HOSTDEVICE inline IdentityFunctor() {}
  HOSTDEVICE explicit inline IdentityFunctor(int n) {}
  HOSTDEVICE inline Ty operator()(const Tx x) const {
    return static_cast<Ty>(x);
  }
};

/**
 * @brief Default unary div functor. Divide by a constant
 */
template <typename Tx, typename Ty = Tx>
struct DivideFunctor {
 private:
  using MPType = typename ::paddle::operators::details::MPTypeTrait<Tx>::Type;

 public:
  HOSTDEVICE inline DivideFunctor() { n_inv = static_cast<MPType>(1.0f); }
  HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((MPType)(1.0 / n)) {}
  HOSTDEVICE inline Ty operator()(const Tx x) const {
    return static_cast<Ty>(static_cast<MPType>(x) * n_inv);
  }

 private:
  MPType n_inv;
};

/**
 * @brief Default inverse functor
 */
template <typename Tx, typename Ty = Tx>
struct InverseFunctor {
  HOSTDEVICE inline InverseFunctor() {}
  HOSTDEVICE explicit inline InverseFunctor(int n) {}
  HOSTDEVICE inline Ty operator()(const Tx x) const {
    return static_cast<Ty>(-x);
  }
};

/**
 * @brief Default unary square functor
 */
template <typename Tx, typename Ty = Tx>
struct SquareFunctor {
  HOSTDEVICE inline SquareFunctor() {}
  HOSTDEVICE explicit inline SquareFunctor(int n) {}
  HOSTDEVICE inline Ty operator()(const Tx x) const {
    return static_cast<Ty>(x) * static_cast<Ty>(x);
  }
};

/****************************** Binary Functor ********************************/

/**
 * @brief Default binary min functor
 */
template <typename T>
struct MinFunctor {
  inline T initial() { return static_cast<T>(std::numeric_limits<T>::max()); }
  __device__ __forceinline__ T operator()(const T a, const T b) const {
    return (b < a) ? b : a;
  }
};

/**
 * @brief Default binary max functor
 */
template <typename T>
struct MaxFunctor {
  inline T initial() {
    return static_cast<T>(std::numeric_limits<T>::lowest());
  }
  __device__ __forceinline__ T operator()(const T a, const T b) const {
    return (b > a) ? b : a;
  }
};

/**
 * @brief Default binary add functor
 */
template <typename T>
struct AddFunctor {
  inline T initial() { return static_cast<T>(0.0f); }
  __device__ __forceinline__ T operator()(const T a, const T b) const {
    return b + a;
  }
};

/**
 * @brief Default binary add functor
 */
template <typename T>
struct MulFunctor {
  inline T initial() { return static_cast<T>(1.0f); }
  __device__ __forceinline__ T operator()(const T a, const T b) const {
    return b * a;
  }
};

/**
 * @brief Default binary logic or functor
 */
template <typename T>
struct LogicalOrFunctor {
  inline T initial() { return static_cast<T>(false); }
  __device__ __forceinline__ T operator()(const T a, const T b) const {
    return b || a;
  }
};

/**
 * @brief Default binary logic and functor
 */
template <typename T>
struct LogicalAndFunctor {
  inline T initial() { return static_cast<T>(true); }
  __device__ __forceinline__ T operator()(const T a, const T b) const {
    return b && a;
  }
};

/**
 * @brief Default binary sub functor
 */
template <typename T>
struct SubFunctor {
  inline T initial() { return static_cast<T>(0.0f); }
  inline HOSTDEVICE T operator()(const T a, const T b) const { return a - b; }
};

/**
 * @brief Default binary div functor
 */
template <typename T, typename Enable = void>
struct DivFunctor {
  inline T initial() { return static_cast<T>(1.0f); }
  inline HOSTDEVICE T operator()(const T a, const T b) const { return a / b; }
};

template <typename T>
struct DivFunctor<T,
                  typename std::enable_if<std::is_integral<T>::value>::type> {
  inline T initial() { return static_cast<T>(1.0f); }
  inline HOSTDEVICE T operator()(const T a, const T b) const {
    // For int32/int64, need to check whether the divison is zero.
    PADDLE_ENFORCE_NE(b,
                      0,
                      pten::errors::InvalidArgument(
                          "Integer division by zero encountered "
                          "in (floor) divide. Please check the input value."));
    return a / b;
  }
};

/**
 * @brief Default binary floor divide functor
 */
template <typename T>
struct FloorDivFunctor {
  inline T initial() { return static_cast<T>(1.0f); }
  inline HOSTDEVICE T operator()(const T a, const T b) const {
    PADDLE_ENFORCE_NE(b,
                      0,
                      pten::errors::InvalidArgument(
                          "Integer division by zero encountered "
                          "in (floor) divide. Please check the input value."));
    return static_cast<T>(std::trunc(a / b));
  }
};

}  // namespace kps
}  // namespace pten
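The new file keeps the enable_if dispatch that the commit message alludes to with "use pten's errors": the integral DivFunctor specialization adds a divide-by-zero enforcement (now through pten::errors) while the generic template divides directly. A self-contained sketch of the same SFINAE dispatch (not the commit's code; a plain throw stands in for PADDLE_ENFORCE_NE):

#include <cstdio>
#include <stdexcept>
#include <type_traits>

// Generic version: floating-point division needs no zero check here.
template <typename T, typename Enable = void>
struct DivFunctor {
  T operator()(const T a, const T b) const { return a / b; }
};

// enable_if selects this specialization for integral T, adding the
// divide-by-zero check (PADDLE_ENFORCE_NE in the real file; a throw here).
template <typename T>
struct DivFunctor<T,
                  typename std::enable_if<std::is_integral<T>::value>::type> {
  T operator()(const T a, const T b) const {
    if (b == 0) throw std::invalid_argument("Integer division by zero");
    return a / b;
  }
};

int main() {
  std::printf("%g\n", DivFunctor<double>()(1.0, 4.0));  // 0.25, generic path
  std::printf("%d\n", DivFunctor<int>()(7, 2));         // 3, checked path
  return 0;
}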
paddle/{fluid/operators/kernel_primitives → pten/kernels/primitive}/helper_primitives.h — view file @ 452bcbe2

-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.

@@ -14,9 +14,8 @@
 #pragma once
 
-namespace paddle {
-namespace operators {
-namespace kernel_primitives {
+namespace pten {
+namespace kps {
 
 #ifdef PADDLE_WITH_XPU2
 struct dim3 {

@@ -43,8 +42,12 @@ struct DimConfig {
   int rem_y;
   int rem_z;
 
-  HOSTDEVICE explicit inline DimConfig(int split_x, int split_y, int split_z,
-                                       int size_x, int size_y, int size_z) {
+  HOSTDEVICE explicit inline DimConfig(int split_x,
+                                       int split_y,
+                                       int split_z,
+                                       int size_x,
+                                       int size_y,
+                                       int size_z) {
     split_num_x = split_x;
     split_num_y = split_y;
     split_num_z = split_z;

@@ -60,6 +63,5 @@ struct DimConfig {
   }
 };
 
-}  // namespace kernel_primitives
-}  // namespace operators
-}  // namespace paddle
+}  // namespace kps
+}  // namespace pten
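DimConfig bundles the split counts, problem sizes, and remainder fields (rem_y, rem_z above) that the reduce kernels use to carve up a three-dimensional launch. A sketch of the shape visible in the hunk (not the commit's code; field names beyond split_num_* and rem_* are assumptions, since part of the struct is collapsed out of the diff):

#include <cstdio>

// Sketch of the DimConfig shape shown above: how many partitions to split
// each axis into, plus the per-axis problem sizes and remainders.
struct DimConfig {
  int split_num_x, split_num_y, split_num_z;
  int deal_size_x, deal_size_y, deal_size_z;  // assumed names for the sizes
  int rem_x, rem_y, rem_z;

  explicit DimConfig(int split_x, int split_y, int split_z,
                     int size_x, int size_y, int size_z) {
    split_num_x = split_x;
    split_num_y = split_y;
    split_num_z = split_z;
    deal_size_x = size_x;
    deal_size_y = size_y;
    deal_size_z = size_z;
    rem_x = rem_y = rem_z = 0;  // set separately in the real struct
  }
};

int main() {
  // Split a 1024 x 256 x 1 problem across 8 x 4 x 1 partitions.
  DimConfig dim(8, 4, 1, 1024, 256, 1);
  std::printf("splits: %d %d %d\n",
              dim.split_num_x, dim.split_num_y, dim.split_num_z);
  return 0;
}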
paddle/pten/kernels/primitive/kernel_primitives.h — new file (0 → 100644), view file @ 452bcbe2

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/pten/kernels/primitive/helper_primitives.h"
#ifdef PADDLE_WITH_XPU2
#include "paddle/pten/backends/xpu/xpu_context.h"
#include "paddle/pten/kernels/primitive/compute_primitives_xpu2.h"
#include "paddle/pten/kernels/primitive/datamover_primitives_xpu2.h"
#include "paddle/pten/kernels/primitive/functor_primitives_xpu2.h"

#define KPStream XPUStream
#define KPDevice pten::XPUContext
#define _ptr_ _global_ptr_
#define __forceinline__ __inline__
#define __restrict__

#define THREAD_ID_X core_id()
#define THREAD_ID_Y 0
#define THREAD_ID_Z 0

#define BLOCK_NUM_X core_num()
#define BLOCK_NUM_Y 0
#define BLOCK_NUM_Z 0

#define BLOCK_ID_X cluster_id()
#define BLOCK_ID_Y 0
#define BLOCK_ID_Z 0

#define GRID_NUM_X cluster_num()
#define GRID_NUM_Y 0
#define GRID_NUM_Z 0
#else
#include "paddle/pten/backends/gpu/gpu_context.h"
#include "paddle/pten/kernels/primitive/compute_primitives.h"
#include "paddle/pten/kernels/primitive/datamover_primitives.h"
#include "paddle/pten/kernels/primitive/functor_primitives.h"

#define KPStream gpuStream_t
#define KPDevice pten::GPUContext
#define _ptr_

#define THREAD_ID_X threadIdx.x
#define THREAD_ID_Y threadIdx.y
#define THREAD_ID_Z threadIdx.z

#define BLOCK_NUM_X blockDim.x
#define BLOCK_NUM_Y blockDim.y
#define BLOCK_NUM_Z blockDim.z

#define BLOCK_ID_X blockIdx.x
#define BLOCK_ID_Y blockIdx.y
#define BLOCK_ID_Z blockIdx.z

#define GRID_NUM_X gridDim.x
#define GRID_NUM_Y gridDim.y
#define GRID_NUM_Z gridDim.z
#endif
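This macro layer is what lets a single kernel body target both CUDA and XPU2: under PADDLE_WITH_XPU2 the thread/block indices map to core_id()/cluster_id(), otherwise to threadIdx/blockDim and friends. A host-side sketch of the indirection (not part of the commit; both "backends" are faked with stub functions so the example runs on any compiler):

#include <cstdio>

// Fake index sources standing in for core_id()/core_num() on XPU2 and for
// threadIdx.x/blockDim.x on GPU, so this sketch runs on a plain host compiler.
static int fake_core_id() { return 3; }
static int fake_core_num() { return 64; }

#define THREAD_ID_X fake_core_id()   // stand-in for threadIdx.x / core_id()
#define BLOCK_NUM_X fake_core_num()  // stand-in for blockDim.x / core_num()

// A "kernel body" written only against the macros compiles for either target.
void GridStrideInit(float* dst, float value, int n) {
  for (int i = THREAD_ID_X; i < n; i += BLOCK_NUM_X) dst[i] = value;
}

int main() {
  float buf[8] = {};
  GridStrideInit(buf, 1.0f, 8);
  std::printf("%g %g\n", buf[3], buf[0]);  // 1 0: only this "thread"'s slot set
  return 0;
}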