Unverified commit 809a10b6, authored by F Feiyu Chan, committed by GitHub

move math_cuda_utils.h to pten/kernels/funcs (#39246)

Parent 3e6950d5
......@@ -12,7 +12,6 @@ limitations under the License. */
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
#include "paddle/fluid/operators/math/math_cuda_utils.h"
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
......
......@@ -12,11 +12,11 @@
#include <algorithm>
#include <string>
#include "paddle/fluid/operators/interpolate_v2_op.h"
#include "paddle/fluid/operators/math/math_cuda_utils.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/fast_divmod.h"
#include "paddle/pten/kernels/funcs/math_cuda_utils.h"
namespace paddle {
namespace operators {
......@@ -522,7 +522,7 @@ __inline__ __device__ T PartialBlockMin(T val, size_t threads_num_in_block,
if (threadIdx.x < threshold) {
shared_last_idx = (threshold >> 5) - 1;
val = math::warpReduceMin(val, mask);
val = pten::funcs::warpReduceMin(val, mask);
if (lane == 0) {
shared[wid] = val;
}
......@@ -537,7 +537,7 @@ __inline__ __device__ T PartialBlockMin(T val, size_t threads_num_in_block,
if (threadIdx.x < threshold) {
val = (lane <= shared_last_idx) ? shared[lane]
: std::numeric_limits<T>::max();
val = math::warpReduceMin(val, mask);
val = pten::funcs::warpReduceMin(val, mask);
shared_last_val = val;
}
__syncthreads();
......@@ -589,12 +589,15 @@ __global__ void KeBilinearInterpBwShareMemory(
s_data[0][threadIdx.x] = 0.f;
s_data[1][threadIdx.x] = 0.f;
int remain = nthreads - (tid & (-blockDim.x));
int in_top_max_index = math::blockReduceMax(top_right_index, FINAL_MASK);
int in_bot_max_index = math::blockReduceMax(bot_right_index, FINAL_MASK);
int in_top_max_index =
pten::funcs::blockReduceMax(top_right_index, FINAL_MASK);
int in_bot_max_index =
pten::funcs::blockReduceMax(bot_right_index, FINAL_MASK);
if (remain > blockDim.x) {
in_top_min_index = math::blockReduceMin(input_index, FINAL_MASK);
in_bot_min_index = math::blockReduceMin(bot_left_index, FINAL_MASK);
in_top_min_index = pten::funcs::blockReduceMin(input_index, FINAL_MASK);
in_bot_min_index =
pten::funcs::blockReduceMin(bot_left_index, FINAL_MASK);
} else {
in_top_min_index = PartialBlockMin(input_index, remain, FINAL_MASK);
in_bot_min_index = PartialBlockMin(bot_left_index, remain, FINAL_MASK);
......
......@@ -18,13 +18,17 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/bert_encoder_functor.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_cuda_utils.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/pten/kernels/funcs/math_cuda_utils.h"
namespace paddle {
namespace operators {
namespace math {
// NOTE(chenfeiyu): explicitly use operator+ for float2
// since float2 is not in namespace pten::funcs, ADL won't help
using pten::funcs::operator+;
template <typename T>
__device__ __forceinline__ T local_rsqrt(T num) {
return rsqrt(static_cast<float>(num));
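
The NOTE above about ADL is worth unpacking: float2 is a CUDA vector type declared in the global namespace, so argument-dependent lookup for `a + b` never searches pten::funcs, where the operator+ overload now lives after this move; the using-declaration makes it visible at the call sites in this file. Below is a minimal illustrative sketch of the issue; the operator+ shown is a hypothetical stand-in for the overload actually provided by paddle/pten/kernels/funcs/math_cuda_utils.h.

// Illustrative sketch only: float2 lives in the global namespace, so ADL for
// `a + b` does not look inside pten::funcs. Without the using-declaration,
// the kernel below would fail to resolve operator+.
#include <cuda_runtime.h>  // float2, make_float2

namespace pten {
namespace funcs {
// Hypothetical stand-in for the overload provided by math_cuda_utils.h.
__device__ __forceinline__ float2 operator+(const float2 &a, const float2 &b) {
  return make_float2(a.x + b.x, a.y + b.y);
}
}  // namespace funcs
}  // namespace pten

// Bring the operator into scope explicitly, as the patch does.
using pten::funcs::operator+;

__global__ void AddFloat2(const float2 *x, const float2 *y, float2 *out,
                          int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    out[i] = x[i] + y[i];  // resolved via the using-declaration, not ADL
  }
}
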
......@@ -34,11 +38,12 @@ __device__ __forceinline__ half local_rsqrt(half num) { return hrsqrt(num); }
#endif
template <typename T, int TPB>
__device__ inline void LayerNormSmall(T val, const kvp<T> &thread_data,
__device__ inline void LayerNormSmall(T val,
const pten::funcs::kvp<T> &thread_data,
const int ld, const int idx,
const float *bias, const float *scale,
T *output, T eps) {
using BlockReduce = cub::BlockReduce<kvp<T>, TPB>;
using BlockReduce = cub::BlockReduce<pten::funcs::kvp<T>, TPB>;
__shared__ typename BlockReduce::TempStorage temp_storage;
__shared__ T mu; // mean
__shared__ T rsigma; // 1 / std.dev.
......@@ -59,10 +64,11 @@ __device__ inline void LayerNormSmall(T val, const kvp<T> &thread_data,
}
template <typename T, int TPB>
__device__ inline void LayerNorm(const kvp<T> &thread_data, const int ld,
const int offset, const float *bias,
const float *scale, T *output, T eps) {
using BlockReduce = cub::BlockReduce<kvp<T>, TPB>;
__device__ inline void LayerNorm(const pten::funcs::kvp<T> &thread_data,
const int ld, const int offset,
const float *bias, const float *scale,
T *output, T eps) {
using BlockReduce = cub::BlockReduce<pten::funcs::kvp<T>, TPB>;
__shared__ typename BlockReduce::TempStorage temp_storage;
__shared__ T mu; // mean
__shared__ T rsigma; // 1 / std.dev.
......@@ -85,10 +91,11 @@ __device__ inline void LayerNorm(const kvp<T> &thread_data, const int ld,
}
template <typename T, typename T2, int TPB>
__device__ inline void LayerNorm2(const kvp<T> &thread_data, const int ld,
const int offset, const float2 *bias,
const float2 *scale, T2 *output, T eps) {
using BlockReduce = cub::BlockReduce<kvp<T>, TPB>;
__device__ inline void LayerNorm2(const pten::funcs::kvp<T> &thread_data,
const int ld, const int offset,
const float2 *bias, const float2 *scale,
T2 *output, T eps) {
using BlockReduce = cub::BlockReduce<pten::funcs::kvp<T>, TPB>;
__shared__ typename BlockReduce::TempStorage temp_storage;
__shared__ T mu; // mean
__shared__ T rsigma; // 1 / std.dev.
......@@ -137,7 +144,7 @@ __global__ void EmbEltwiseLayernormKernel(int hidden, const int64_t *ids,
const int64_t out_offset = seq_pos * hidden;
kvp<T> thread_data(0, 0);
pten::funcs::kvp<T> thread_data(0, 0);
#pragma unroll
for (int it = threadIdx.x; it < hidden; it += TPB) {
......@@ -148,7 +155,8 @@ __global__ void EmbEltwiseLayernormKernel(int hidden, const int64_t *ids,
output[out_offset + it] = val;
const T rhiddenval = rhidden * val;
thread_data = pair_sum(thread_data, kvp<T>(rhiddenval, rhiddenval * val));
thread_data = pair_sum(thread_data,
pten::funcs::kvp<T>(rhiddenval, rhiddenval * val));
}
LayerNorm<T, TPB>(thread_data, hidden, out_offset, bias, scale, output, eps);
}
......@@ -180,7 +188,7 @@ __global__ void EmbEltwiseLayernormKernel<half, 256>(
const int64_t out_offset = seq_pos * hidden;
kvp<half> thread_data(0, 0);
pten::funcs::kvp<half> thread_data(0, 0);
#pragma unroll
for (int it = threadIdx.x; it < hidden; it += 256) {
......@@ -191,8 +199,8 @@ __global__ void EmbEltwiseLayernormKernel<half, 256>(
output[out_offset + it] = val;
const half rhiddenval = rhidden * val;
thread_data =
pair_sum(thread_data, kvp<half>(rhiddenval, rhiddenval * val));
thread_data = pair_sum(
thread_data, pten::funcs::kvp<half>(rhiddenval, rhiddenval * val));
}
LayerNorm<half, 256>(thread_data, hidden, out_offset, bias, scale, output,
eps);
......@@ -233,10 +241,10 @@ __global__ void SoftmaxKernelWithEltadd(T *qk_buf_, const T *bias_qk_,
? static_cast<float>(qk_buf_[threadIdx.x + qk_offset] +
bias_qk_[threadIdx.x + qk_offset])
: -1e20f;
float max_val = blockReduceMax<float>(tmp, mask);
float max_val = pten::funcs::blockReduceMax<float>(tmp, mask);
float qk_tmp = threadIdx.x < seq_len ? __expf(tmp - max_val) : 0.0f;
float sum_val = blockReduceSum<float>(qk_tmp, mask);
float sum_val = pten::funcs::blockReduceSum<float>(qk_tmp, mask);
if (threadIdx.x < seq_len)
qk_buf_[threadIdx.x + qk_offset] = (T)(qk_tmp / sum_val);
......@@ -256,10 +264,10 @@ __global__ void SoftmaxKernelWithEltadd<half>(
? static_cast<float>(qk_buf_[threadIdx.x + qk_offset] +
bias_qk_[threadIdx.x + qk_offset])
: -1e20f;
float max_val = blockReduceMax<float>(tmp, mask);
float max_val = pten::funcs::blockReduceMax<float>(tmp, mask);
float qk_tmp = threadIdx.x < seq_len ? __expf(tmp - max_val) : 0.0f;
float sum_val = blockReduceSum<float>(qk_tmp, mask);
float sum_val = pten::funcs::blockReduceSum<float>(qk_tmp, mask);
if (threadIdx.x < seq_len)
qk_buf_[threadIdx.x + qk_offset] = (half)(qk_tmp / sum_val);
......@@ -276,19 +284,20 @@ __global__ void SoftmaxKernelWithEltadd2(T *qk_buf_, const T *bias_qk_,
int idx = threadIdx.x;
assert(blockDim.x % 32 == 0);
float2 tmp =
idx < seq_len
? ToFloat2<T>(qk_buf_[idx + qk_offset] + bias_qk_[idx + qk_offset])
: make_float2(-1e20f, -1e20f);
float max_val = blockReduceMax<float>(max(tmp.x, tmp.y), mask);
float2 tmp = idx < seq_len
? pten::funcs::ToFloat2<T>(qk_buf_[idx + qk_offset] +
bias_qk_[idx + qk_offset])
: make_float2(-1e20f, -1e20f);
float max_val = pten::funcs::blockReduceMax<float>(max(tmp.x, tmp.y), mask);
float2 qk_tmp = idx < seq_len ? make_float2(__expf(tmp.x - max_val),
__expf(tmp.y - max_val))
: make_float2(0.f, 0.f);
float sum_val = blockReduceSum<float>(qk_tmp.x + qk_tmp.y, mask) + 1e-6f;
float sum_val =
pten::funcs::blockReduceSum<float>(qk_tmp.x + qk_tmp.y, mask) + 1e-6f;
if (idx < seq_len) {
qk_buf_[idx + qk_offset] =
FloatsToPair<T>(qk_tmp.x / sum_val, qk_tmp.y / sum_val);
pten::funcs::FloatsToPair<T>(qk_tmp.x / sum_val, qk_tmp.y / sum_val);
}
}
......@@ -304,18 +313,20 @@ __global__ void SoftmaxKernelWithEltadd2<half2>(
int idx = threadIdx.x;
assert(blockDim.x % 32 == 0);
float2 tmp = idx < seq_len ? ToFloat2<half2>(qk_buf_[idx + qk_offset] +
bias_qk_[idx + qk_offset])
: make_float2(-1e20f, -1e20f);
float max_val = blockReduceMax<float>(max(tmp.x, tmp.y), mask);
float2 tmp = idx < seq_len
? pten::funcs::ToFloat2<half2>(qk_buf_[idx + qk_offset] +
bias_qk_[idx + qk_offset])
: make_float2(-1e20f, -1e20f);
float max_val = pten::funcs::blockReduceMax<float>(max(tmp.x, tmp.y), mask);
float2 qk_tmp = idx < seq_len ? make_float2(__expf(tmp.x - max_val),
__expf(tmp.y - max_val))
: make_float2(0.f, 0.f);
float sum_val = blockReduceSum<float>(qk_tmp.x + qk_tmp.y, mask) + 1e-6f;
float sum_val =
pten::funcs::blockReduceSum<float>(qk_tmp.x + qk_tmp.y, mask) + 1e-6f;
if (idx < seq_len) {
qk_buf_[idx + qk_offset] =
FloatsToPair<half2>(qk_tmp.x / sum_val, qk_tmp.y / sum_val);
qk_buf_[idx + qk_offset] = pten::funcs::FloatsToPair<half2>(
qk_tmp.x / sum_val, qk_tmp.y / sum_val);
}
#endif
}
......@@ -338,14 +349,14 @@ __global__ void SoftmaxKernelWithEltaddForLarge(T *qk_buf, const T *bias_qk,
bias_qk[threadIdx.x + i + qk_offset]
: stride_max;
}
T max_val = blockReduceMax<T>(stride_max, mask);
T max_val = pten::funcs::blockReduceMax<T>(stride_max, mask);
T stride_sum = 0.f;
for (int i = 0; i < seq_len; i += blockDim.x) {
stride_sum += __expf(qk_buf[threadIdx.x + i + qk_offset] +
bias_qk[threadIdx.x + i + qk_offset] - max_val);
}
T sum_val = blockReduceSum<T>(stride_sum, mask);
T sum_val = pten::funcs::blockReduceSum<T>(stride_sum, mask);
for (int i = 0; i < seq_len; i += blockDim.x) {
qk_buf[threadIdx.x + i + qk_offset] =
......@@ -371,7 +382,7 @@ __global__ void SoftmaxKernelWithEltaddForLarge(
bias_qk[threadIdx.x + i + qk_offset]);
stride_max = tmp > stride_max ? tmp : stride_max;
}
float max_val = blockReduceMax<float>(stride_max, mask);
float max_val = pten::funcs::blockReduceMax<float>(stride_max, mask);
float stride_sum = 0.f;
for (int i = 0; i < seq_len; i += blockDim.x) {
......@@ -379,7 +390,7 @@ __global__ void SoftmaxKernelWithEltaddForLarge(
bias_qk[threadIdx.x + i + qk_offset]);
stride_sum += __expf(tmp - max_val);
}
float sum_val = blockReduceSum<float>(stride_sum, mask);
float sum_val = pten::funcs::blockReduceSum<float>(stride_sum, mask);
for (int i = 0; i < seq_len; i += blockDim.x) {
float tmp =
......@@ -403,28 +414,33 @@ __global__ void SoftmaxKernelWithEltaddForLarge2(T *qk_buf_, const T *bias_qk_,
float2 stride_max = make_float2(-1e20f, -1e20f);
for (int i = 0; i < seq_len; i += blockDim.x) {
float2 cur = ToFloat2<T>(qk_buf_[threadIdx.x + i + qk_offset] +
bias_qk_[threadIdx.x + i + qk_offset]);
float2 cur =
pten::funcs::ToFloat2<T>(qk_buf_[threadIdx.x + i + qk_offset] +
bias_qk_[threadIdx.x + i + qk_offset]);
stride_max.x = max(stride_max.x, cur.x);
stride_max.y = max(stride_max.y, cur.y);
}
float max_val = blockReduceMax<float>(max(stride_max.x, stride_max.y), mask);
float max_val =
pten::funcs::blockReduceMax<float>(max(stride_max.x, stride_max.y), mask);
float2 stride_sum = make_float2(0.f, 0.f);
for (int i = 0; i < seq_len; i += blockDim.x) {
float2 cur = ToFloat2<T>(qk_buf_[threadIdx.x + i + qk_offset] +
bias_qk_[threadIdx.x + i + qk_offset]);
float2 cur =
pten::funcs::ToFloat2<T>(qk_buf_[threadIdx.x + i + qk_offset] +
bias_qk_[threadIdx.x + i + qk_offset]);
stride_sum.x += __expf(cur.x - max_val);
stride_sum.y += __expf(cur.y - max_val);
}
float sum_val =
blockReduceSum<float>(stride_sum.x + stride_sum.y, mask) + 1e-6f;
pten::funcs::blockReduceSum<float>(stride_sum.x + stride_sum.y, mask) +
1e-6f;
for (int i = 0; i < seq_len; i += blockDim.x) {
float2 cur = ToFloat2<T>(qk_buf_[threadIdx.x + i + qk_offset] +
bias_qk_[threadIdx.x + i + qk_offset]);
qk_buf_[threadIdx.x + i + qk_offset] = FloatsToPair<T>(
float2 cur =
pten::funcs::ToFloat2<T>(qk_buf_[threadIdx.x + i + qk_offset] +
bias_qk_[threadIdx.x + i + qk_offset]);
qk_buf_[threadIdx.x + i + qk_offset] = pten::funcs::FloatsToPair<T>(
__expf(cur.x - max_val) / sum_val, __expf(cur.y - max_val) / sum_val);
}
}
......@@ -443,28 +459,33 @@ __global__ void SoftmaxKernelWithEltaddForLarge2(
float2 stride_max = make_float2(-1e20f, -1e20f);
for (int i = 0; i < seq_len; i += blockDim.x) {
float2 cur = ToFloat2<half2>(qk_buf_[threadIdx.x + i + qk_offset] +
bias_qk_[threadIdx.x + i + qk_offset]);
float2 cur =
pten::funcs::ToFloat2<half2>(qk_buf_[threadIdx.x + i + qk_offset] +
bias_qk_[threadIdx.x + i + qk_offset]);
stride_max.x = max(stride_max.x, cur.x);
stride_max.y = max(stride_max.y, cur.y);
}
float max_val = blockReduceMax<float>(max(stride_max.x, stride_max.y), mask);
float max_val =
pten::funcs::blockReduceMax<float>(max(stride_max.x, stride_max.y), mask);
float2 stride_sum = make_float2(0.f, 0.f);
for (int i = 0; i < seq_len; i += blockDim.x) {
float2 cur = ToFloat2<half2>(qk_buf_[threadIdx.x + i + qk_offset] +
bias_qk_[threadIdx.x + i + qk_offset]);
float2 cur =
pten::funcs::ToFloat2<half2>(qk_buf_[threadIdx.x + i + qk_offset] +
bias_qk_[threadIdx.x + i + qk_offset]);
stride_sum.x += __expf(cur.x - max_val);
stride_sum.y += __expf(cur.y - max_val);
}
float sum_val =
blockReduceSum<float>(stride_sum.x + stride_sum.y, mask) + 1e-6f;
pten::funcs::blockReduceSum<float>(stride_sum.x + stride_sum.y, mask) +
1e-6f;
for (int i = 0; i < seq_len; i += blockDim.x) {
float2 cur = ToFloat2<half2>(qk_buf_[threadIdx.x + i + qk_offset] +
bias_qk_[threadIdx.x + i + qk_offset]);
qk_buf_[threadIdx.x + i + qk_offset] = FloatsToPair<half2>(
float2 cur =
pten::funcs::ToFloat2<half2>(qk_buf_[threadIdx.x + i + qk_offset] +
bias_qk_[threadIdx.x + i + qk_offset]);
qk_buf_[threadIdx.x + i + qk_offset] = pten::funcs::FloatsToPair<half2>(
__expf(cur.x - max_val) / sum_val, __expf(cur.y - max_val) / sum_val);
}
#endif
......@@ -595,13 +616,14 @@ __global__ void SkipLayerNormSmallKernel(int num, int hidden, const T *input1,
const T rld = T(1) / T(hidden);
const int offset = blockIdx.x * hidden;
cub::Sum pair_sum;
kvp<T> thread_data(0, 0);
pten::funcs::kvp<T> thread_data(0, 0);
const int idx = offset + threadIdx.x;
T val = 0;
if (threadIdx.x < hidden) {
val = input1[idx] + input2[idx];
const T rldval = rld * val;
thread_data = pair_sum(thread_data, kvp<T>(rldval, rldval * val));
thread_data =
pair_sum(thread_data, pten::funcs::kvp<T>(rldval, rldval * val));
}
LayerNormSmall<T, TPB>(val, thread_data, hidden, idx, bias, scale, output,
eps);
......@@ -617,13 +639,14 @@ __global__ void SkipLayerNormSmallKernel<half, 32>(
const half rld = half(1) / half(hidden);
const int offset = blockIdx.x * hidden;
cub::Sum pair_sum;
kvp<half> thread_data(0, 0);
pten::funcs::kvp<half> thread_data(0, 0);
const int idx = offset + threadIdx.x;
half val = 0;
if (threadIdx.x < hidden) {
val = input1[idx] + input2[idx];
const half rldval = rld * val;
thread_data = pair_sum(thread_data, kvp<half>(rldval, rldval * val));
thread_data =
pair_sum(thread_data, pten::funcs::kvp<half>(rldval, rldval * val));
}
LayerNormSmall<half, 32>(val, thread_data, hidden, idx, bias, scale, output,
eps);
......@@ -638,13 +661,14 @@ __global__ void SkipLayerNormSmallKernel<half, 128>(
const half rld = half(1) / half(hidden);
const int offset = blockIdx.x * hidden;
cub::Sum pair_sum;
kvp<half> thread_data(0, 0);
pten::funcs::kvp<half> thread_data(0, 0);
const int idx = offset + threadIdx.x;
half val = 0;
if (threadIdx.x < hidden) {
val = input1[idx] + input2[idx];
const half rldval = rld * val;
thread_data = pair_sum(thread_data, kvp<half>(rldval, rldval * val));
thread_data =
pair_sum(thread_data, pten::funcs::kvp<half>(rldval, rldval * val));
}
LayerNormSmall<half, 128>(val, thread_data, hidden, idx, bias, scale, output,
eps);
......@@ -659,13 +683,14 @@ __global__ void SkipLayerNormSmallKernel<half, 384>(
const half rld = half(1) / half(hidden);
const int offset = blockIdx.x * hidden;
cub::Sum pair_sum;
kvp<half> thread_data(0, 0);
pten::funcs::kvp<half> thread_data(0, 0);
const int idx = offset + threadIdx.x;
half val = 0;
if (threadIdx.x < hidden) {
val = input1[idx] + input2[idx];
const half rldval = rld * val;
thread_data = pair_sum(thread_data, kvp<half>(rldval, rldval * val));
thread_data =
pair_sum(thread_data, pten::funcs::kvp<half>(rldval, rldval * val));
}
LayerNormSmall<half, 384>(val, thread_data, hidden, idx, bias, scale, output,
eps);
......@@ -681,13 +706,14 @@ __global__ void SkipLayerNormKernel(int num, int hidden, const T *input1,
const T rld = T(1) / T(hidden);
const int offset = blockIdx.x * hidden;
cub::Sum pair_sum;
kvp<T> thread_data(0, 0);
pten::funcs::kvp<T> thread_data(0, 0);
for (int it = threadIdx.x; it < hidden; it += TPB) {
const int idx = offset + it;
const T val = input1[idx] + input2[idx];
const T rldval = rld * val;
thread_data = pair_sum(thread_data, kvp<T>(rldval, rldval * val));
thread_data =
pair_sum(thread_data, pten::funcs::kvp<T>(rldval, rldval * val));
output[idx] = val;
}
LayerNorm<T, TPB>(thread_data, hidden, offset, bias, scale, output, eps);
......@@ -705,13 +731,14 @@ __global__ void SkipLayerNormKernel<half, 256>(int num, int hidden,
const half rld = half(1) / half(hidden);
const int offset = blockIdx.x * hidden;
cub::Sum pair_sum;
kvp<half> thread_data(0, 0);
pten::funcs::kvp<half> thread_data(0, 0);
for (int it = threadIdx.x; it < hidden; it += 256) {
const int idx = offset + it;
const half val = input1[idx] + input2[idx];
const half rldval = rld * val;
thread_data = pair_sum(thread_data, kvp<half>(rldval, rldval * val));
thread_data =
pair_sum(thread_data, pten::funcs::kvp<half>(rldval, rldval * val));
output[idx] = val;
}
LayerNorm<half, 256>(thread_data, hidden, offset, bias, scale, output, eps);
......@@ -727,13 +754,14 @@ __global__ void SkipLayerNormKernel2(int num, int hidden, const T2 *input1,
const T rld = T(0.5f / hidden); // because hidden is hidden/2
const int offset = blockIdx.x * hidden;
cub::Sum pair_sum;
kvp<T> thread_data(0, 0);
pten::funcs::kvp<T> thread_data(0, 0);
for (int it = threadIdx.x; it < hidden; it += TPB) {
const int idx = offset + it;
const T2 val2 = input1[idx] + input2[idx];
thread_data = pair_sum(
thread_data, kvp<T>(rld * (val2.x + val2.y),
thread_data,
pten::funcs::kvp<T>(rld * (val2.x + val2.y),
rld * val2.x * val2.x + rld * val2.y * val2.y));
output[idx] = val2;
}
......@@ -751,13 +779,14 @@ __global__ void SkipLayerNormKernel2<half, half2, 256>(
const half rld = half(0.5f / hidden); // because hidden is hidden/2
const int offset = blockIdx.x * hidden;
cub::Sum pair_sum;
kvp<half> thread_data(0, 0);
pten::funcs::kvp<half> thread_data(0, 0);
for (int it = threadIdx.x; it < hidden; it += 256) {
const int idx = offset + it;
const half2 val2 = input1[idx] + input2[idx];
thread_data = pair_sum(
thread_data, kvp<half>(rld * (val2.x + val2.y),
thread_data,
pten::funcs::kvp<half>(rld * (val2.x + val2.y),
rld * val2.x * val2.x + rld * val2.y * val2.y));
output[idx] = val2;
}
......
......@@ -14,9 +14,9 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/math/math_cuda_utils.h"
#include "paddle/fluid/operators/optimizers/lars_momentum_op.h"
#include "paddle/fluid/platform/fast_divmod.h"
#include "paddle/pten/kernels/funcs/math_cuda_utils.h"
#if CUDA_VERSION >= 11000
#include <cooperative_groups.h>
......@@ -170,8 +170,8 @@ __global__ void L2NormKernel(
g_tmp += (tmp1 * tmp1);
tid += grid_stride;
}
p_tmp = math::blockReduceSum<MT>(p_tmp, FINAL_MASK);
g_tmp = math::blockReduceSum<MT>(g_tmp, FINAL_MASK);
p_tmp = pten::funcs::blockReduceSum<MT>(p_tmp, FINAL_MASK);
g_tmp = pten::funcs::blockReduceSum<MT>(g_tmp, FINAL_MASK);
if (threadIdx.x == 0) {
p_buffer[blockIdx.x] = p_tmp;
......@@ -181,8 +181,8 @@ __global__ void L2NormKernel(
cg->sync(); // Grid sync for writing partial result to global memory
MT p_part_sum = threadIdx.x < gridDim.x ? p_buffer[threadIdx.x] : 0;
MT g_part_sum = threadIdx.x < gridDim.x ? g_buffer[threadIdx.x] : 0;
MT tmp0 = math::blockReduceSum<MT>(p_part_sum, FINAL_MASK);
MT tmp1 = math::blockReduceSum<MT>(g_part_sum, FINAL_MASK);
MT tmp0 = pten::funcs::blockReduceSum<MT>(p_part_sum, FINAL_MASK);
MT tmp1 = pten::funcs::blockReduceSum<MT>(g_part_sum, FINAL_MASK);
if (threadIdx.x == 0) {
s_buffer[0] = tmp0;
s_buffer[1] = tmp1;
......@@ -294,9 +294,10 @@ __global__ void MomentumLarsKernel(
MT param_part_norm = threadIdx.x < thresh ? p_buffer[threadIdx.x] : 0;
MT grad_part_norm = threadIdx.x < thresh ? g_buffer[threadIdx.x] : 0;
__syncthreads();
MT param_norm = Sqrt(math::blockReduceSum<MT>(param_part_norm, FINAL_MASK));
MT grad_norm = Sqrt(rescale_grad_pow *
math::blockReduceSum<MT>(grad_part_norm, FINAL_MASK));
MT param_norm =
Sqrt(pten::funcs::blockReduceSum<MT>(param_part_norm, FINAL_MASK));
MT grad_norm = Sqrt(rescale_grad_pow * pten::funcs::blockReduceSum<MT>(
grad_part_norm, FINAL_MASK));
#endif
MomentumUpdate<T, MT>(param, grad, velocity, param_out, velocity_out,
master_param, master_param_out, learning_rate, mu,
......
......@@ -16,7 +16,6 @@ limitations under the License. */
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h"
#include "paddle/fluid/operators/math/math_cuda_utils.h"
#include "paddle/fluid/operators/softmax_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
......
......@@ -23,9 +23,8 @@ limitations under the License. */
#include <algorithm>
namespace paddle {
namespace operators {
namespace math {
namespace pten {
namespace funcs {
template <typename T>
__device__ __forceinline__ T FromFloat(float a);
......@@ -315,6 +314,5 @@ __inline__ __device__ T PartialBlockReduceMin(T val, unsigned mask) {
return val;
}
} // namespace math
} // namespace operators
} // namespace paddle
} // namespace funcs
} // namespace pten
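
For code outside this commit that still uses the old header, the migration pattern implied here is mechanical: switch the include from paddle/fluid/operators/math/math_cuda_utils.h to paddle/pten/kernels/funcs/math_cuda_utils.h and qualify the helpers with pten::funcs instead of math. A hedged sketch follows; it assumes the reduction helpers keep the (value, mask) signature seen in the hunks above and that FINAL_MASK is the full-warp mask 0xffffffffu.

#include "paddle/pten/kernels/funcs/math_cuda_utils.h"  // was paddle/fluid/operators/math/math_cuda_utils.h

__global__ void BlockMaxExample(const float *in, float *out, int n) {
  float val = threadIdx.x < n ? in[threadIdx.x] : -1e20f;
  // was: math::blockReduceMax<float>(val, FINAL_MASK);
  float block_max = pten::funcs::blockReduceMax<float>(val, 0xffffffffu);
  if (threadIdx.x == 0) {
    *out = block_max;
  }
}
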