Unverified commit b9672a1e authored by Leo Chen, committed by GitHub

clean distribution_helper, index_impl, aligned_vector code in fluid (#40071)

* clean distribution_helper, index_impl, aligned_vector code in fluid

* fix conflicts
Parent e2e2d531
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef __NVCC__
#include <curand_kernel.h>
#endif
#ifdef __HIPCC__
#include <hiprand_kernel.h>
#endif
#include "paddle/fluid/framework/generator.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/phi/core/hostdevice.h"
#if defined(__NVCC__) || defined(__HIPCC__)
#include "paddle/phi/kernels/primitive/kernel_primitives.h"
#endif
#if !defined(_WIN32)
#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
#else
// there are no equivalent intrinsics in MSVC.
#define UNLIKELY(condition) (condition)
#endif
namespace paddle {
namespace distribution {
using Tensor = framework::Tensor;
/********************* Transformation Function **********************/
template <typename T>
struct exponential_transform {
  explicit exponential_transform(T lambda) : lambda_(lambda) {}

  HOSTDEVICE inline T operator()(T val) const {
#if defined(__NVCC__) || defined(__HIPCC__)
    if (std::is_same<T, double>::value) {
      return static_cast<T>(-1.0) / lambda_ * log(val);
    } else {
      return static_cast<T>(-1.0) / lambda_ * __logf(val);
    }
#else
    return static_cast<T>(-1.0) / lambda_ * std::log(static_cast<T>(1.0) - val);
#endif
  }

 private:
  T lambda_;
};

template <typename T>
struct uniform_transform {
  explicit uniform_transform(T min, T max) : range_(max - min), min_(min) {}

  HOSTDEVICE inline T operator()(T val) const {
    if (UNLIKELY(val == static_cast<T>(1.0))) {
      return min_;
    } else {
      return val * range_ + min_;
    }
  }

 private:
  T range_;
  T min_;
};

template <typename T>
struct normal_transform {
  explicit normal_transform(T mean, T std) : mean_(mean), std_(std) {}

  HOSTDEVICE inline T operator()(T val) const { return val * std_ + mean_; }

 private:
  T mean_;
  T std_;
};
#if defined(__NVCC__) || defined(__HIPCC__)
namespace kps = phi::kps;
/*********************** Distribution Function *************************/
template <typename T>
struct uniform_distribution;
template <typename T>
struct normal_distribution;
#if defined(__NVCC__)
template <>
struct uniform_distribution<float> {
  __device__ inline float4 operator()(curandStatePhilox4_32_10_t *state) const {
    return curand_uniform4(state);
  }
  static constexpr int kReturnsCount = 4;
};

template <>
struct uniform_distribution<double> {
  __device__ inline double2 operator()(
      curandStatePhilox4_32_10_t *state) const {
    return curand_uniform2_double(state);
  }
  static constexpr int kReturnsCount = 2;
};

template <>
struct normal_distribution<float> {
  __device__ inline float4 operator()(curandStatePhilox4_32_10_t *state) const {
    return curand_normal4(state);
  }
  static constexpr int kReturnsCount = 4;
};

template <>
struct normal_distribution<double> {
  __device__ inline double2 operator()(
      curandStatePhilox4_32_10_t *state) const {
    return curand_normal2_double(state);
  }
  static constexpr int kReturnsCount = 2;
};
#else
template <>
struct uniform_distribution<float> {
  __device__ inline float4 operator()(
      hiprandStatePhilox4_32_10_t *state) const {
    return hiprand_uniform4(state);
  }
  static constexpr int kReturnsCount = 4;
};

template <>
struct uniform_distribution<double> {
  __device__ inline double2 operator()(
      hiprandStatePhilox4_32_10_t *state) const {
    return hiprand_uniform2_double(state);
  }
  static constexpr int kReturnsCount = 2;
};

template <>
struct normal_distribution<float> {
  __device__ inline float4 operator()(
      hiprandStatePhilox4_32_10_t *state) const {
    return hiprand_normal4(state);
  }
  static constexpr int kReturnsCount = 4;
};

template <>
struct normal_distribution<double> {
  __device__ inline double2 operator()(
      hiprandStatePhilox4_32_10_t *state) const {
    return hiprand_normal2_double(state);
  }
  static constexpr int kReturnsCount = 2;
};
#endif
/******** Launch GPU function of distribution and transformation *********/
template <typename T, typename DistOp, typename TransformOp>
__global__ void DistributionKernel(size_t size, uint64_t seed, uint64_t offset,
                                   DistOp dist, TransformOp trans, T *out_data,
                                   size_t stride) {
  size_t idx = static_cast<size_t>(BLOCK_ID_X * BLOCK_NUM_X);
  static constexpr int kCount = DistOp::kReturnsCount;
#if defined(__NVCC__)
  curandStatePhilox4_32_10_t state;
  curand_init(seed, idx + THREAD_ID_X, offset, &state);
  using SType = curandStatePhilox4_32_10_t;
#else
  hiprandStatePhilox4_32_10_t state;
  hiprand_init(seed, idx + THREAD_ID_X, offset, &state);
  using SType = hiprandStatePhilox4_32_10_t;
#endif
  size_t total_thread = GRID_NUM_X * BLOCK_NUM_X;
  T args[kCount];
  T result[kCount];
  for (size_t i = idx; i < size; i += total_thread * kCount) {
    kps::ElementwiseRandom<SType, T, kCount, 1, DistOp>(&args[0], dist, &state);
    kps::ElementwiseUnary<T, T, kCount, 1, 1, TransformOp>(&result[0], &args[0],
                                                           trans);
    kps::WriteData<T, T, kCount, 1, 1, true>(out_data + i, &result[0], size - i,
                                             1, stride, 1);
    __syncthreads();
  }
}
template <typename T, typename DistOp, typename TransformOp>
void distribution_and_transform(const platform::CUDADeviceContext &dev_ctx,
                                Tensor *out, DistOp dist, TransformOp trans) {
  T *out_data = out->mutable_data<T>(dev_ctx.GetPlace());
  auto size = out->numel();

  int64_t device_id = dev_ctx.GetPlace().GetDeviceId();
  auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);

  size_t block_size = 256;
  size_t expect_grid_size = (size + block_size - 1) / block_size;
  const auto &prop = platform::GetDeviceProperties(device_id);
  size_t max_grid_size = (prop.maxThreadsPerMultiProcessor / block_size) *
                         prop.multiProcessorCount;
  size_t grid_size =
      expect_grid_size > max_grid_size ? max_grid_size : expect_grid_size;

  size_t total_thread = block_size * grid_size;
  size_t curand4_loop_times =
      (size + 4 * total_thread - 1) / (4 * total_thread);
  // 'increment' should be a multiple of 4
  uint64_t increment = curand4_loop_times * 4;

  auto seed_offset = gen_cuda->IncrementOffset(increment);
  uint64_t seed = seed_offset.first;
  uint64_t offset = seed_offset.second;

  DistributionKernel<
      T, DistOp, TransformOp><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
      size, seed, offset, dist, trans, out_data, total_thread);
}
#endif
} // namespace distribution
} // namespace paddle
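
For context, a minimal sketch of how the helpers in this header are invoked from an operator kernel. It mirrors the pre-patch call sites shown in the exponential-kernel hunk further down (after this patch the same entry points live under phi::funcs); the wrapper function name and the way the tensor and lambda reach it are illustrative assumptions, not part of the patch.

// Hypothetical wrapper, modeled on the exponential-kernel usage below.
template <typename T>
void FillExponential(const platform::CUDADeviceContext &dev_ctx,
                     framework::Tensor *out, T lambda) {
  // Raw Philox uniforms: 4 values per call for float, 2 for double.
  distribution::uniform_distribution<T> dist;
  // Maps each uniform sample to Exp(lambda) via the inverse CDF.
  distribution::exponential_transform<T> trans(lambda);
  // Launches DistributionKernel on dev_ctx.stream() and fills `out`.
  distribution::distribution_and_transform<T>(dev_ctx, out, dist, trans);
}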
......@@ -34,8 +34,8 @@ limitations under the License. */
#include "paddle/fluid/operators/dropout_impl_util.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
#include "paddle/fluid/platform/aligned_vector.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/phi/kernels/funcs/aligned_vector.h"
#include "paddle/phi/kernels/funcs/functors.h"
namespace paddle {
......@@ -86,8 +86,8 @@ __global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed,
bool is_upscale_in_train,
uint64_t increment) {
using MT = typename details::MPTypeTrait<T>::Type;
using LoadT = platform::AlignedVector<T, VecSize>;
using MaskLoadT = platform::AlignedVector<MaskType, VecSize>;
using LoadT = phi::AlignedVector<T, VecSize>;
using MaskLoadT = phi::AlignedVector<MaskType, VecSize>;
#ifdef PADDLE_WITH_HIP
int64_t idx = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
......@@ -102,7 +102,7 @@ __global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed,
MT factor = static_cast<MT>(1.0f / (1.0f - dropout_prob));
for (int i = idx * VecSize; i < n; i += blockDim.x * gridDim.x * VecSize) {
LoadT src_val;
platform::Load<T, VecSize>(&src[i], &src_val);
phi::Load<T, VecSize>(&src[i], &src_val);
#ifdef PADDLE_WITH_HIP
float4 rand = hiprand_uniform4(&state);
......@@ -126,8 +126,8 @@ __global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed,
}
}
platform::Store<T, VecSize>(dst_val, &dst[i]);
platform::Store<MaskType, VecSize>(mask_val, &mask[i]);
phi::Store<T, VecSize>(dst_val, &dst[i]);
phi::Store<MaskType, VecSize>(mask_val, &mask[i]);
}
}
......@@ -153,16 +153,16 @@ __global__ void DropoutGradCUDAKernel(
const typename details::MPTypeTrait<T>::Type factor, const int64_t size,
T* dx) {
using MT = typename details::MPTypeTrait<T>::Type;
using LoadT = platform::AlignedVector<T, VecSize>;
using MaskLoadT = platform::AlignedVector<MaskType, VecSize>;
using LoadT = phi::AlignedVector<T, VecSize>;
using MaskLoadT = phi::AlignedVector<MaskType, VecSize>;
int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
for (int i = idx * VecSize; i < size; i += blockDim.x * gridDim.x * VecSize) {
LoadT dout_val;
platform::Load<T, VecSize>(&dout[i], &dout_val);
phi::Load<T, VecSize>(&dout[i], &dout_val);
MaskLoadT mask_val;
platform::Load<MaskType, VecSize>(&mask[i], &mask_val);
phi::Load<MaskType, VecSize>(&mask[i], &mask_val);
LoadT dx_val;
......@@ -172,7 +172,7 @@ __global__ void DropoutGradCUDAKernel(
static_cast<MT>(mask_val[j]) * factor);
}
platform::Store<T, VecSize>(dx_val, &dx[i]);
phi::Store<T, VecSize>(dx_val, &dx[i]);
}
}
......@@ -219,7 +219,7 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
uint64_t increment;
// VectorizedRandomGenerator uses curand_uniform4, so we only support
// vec_size == 4;
int vec_size = (platform::GetVectorizedSize<T>(x_data) == 4) ? 4 : 1;
int vec_size = (phi::GetVectorizedSize<T>(x_data) == 4) ? 4 : 1;
auto gpu_config = GetGpuLaunchConfig1D(dev_ctx, x_numel, vec_size);
auto offset =
((x_numel - 1) / (gpu_config.GetThreadNum() * vec_size) + 1) * vec_size;
......
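
The dropout hunks above all follow the same vectorized load/compute/store idiom; the patch only moves it from platform:: to phi::. Below is a minimal standalone sketch of that idiom using a hypothetical kernel, shown purely to make the pattern explicit; it is not code from the patch.

// Hypothetical scaling kernel illustrating the phi::AlignedVector idiom.
// Assumes n % VecSize == 0, as the real callers guarantee by how they
// choose vec_size.
template <typename T, int VecSize>
__global__ void VectorizedScale(const T *src, T *dst, int64_t n, T factor) {
  using LoadT = phi::AlignedVector<T, VecSize>;
  int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
  for (int64_t i = idx * VecSize; i < n;
       i += blockDim.x * gridDim.x * VecSize) {
    LoadT vec;
    phi::Load<T, VecSize>(&src[i], &vec);  // one vectorized global load
#pragma unroll
    for (int j = 0; j < VecSize; ++j) {
      vec[j] = vec[j] * factor;
    }
    phi::Store<T, VecSize>(vec, &dst[i]);  // one vectorized global store
  }
}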
......@@ -76,7 +76,7 @@ class ExponentialKernel<platform::CPUDeviceContext, T>
auto engine = gen->GetCPUEngine();
std::uniform_real_distribution<T> uniform(0.0, 1.0);
distribution::exponential_transform<T> trans(lambda);
phi::funcs::exponential_transform<T> trans(lambda);
for (int64_t i = 0; i < size; ++i) {
out_data[i] = trans(uniform(*engine));
}
......
......@@ -26,9 +26,9 @@ class ExponentialKernel<platform::CUDADeviceContext, T>
auto& dev_cxt = ctx.template device_context<platform::CUDADeviceContext>();
T lambda = static_cast<T>(ctx.Attr<float>("lambda"));
distribution::uniform_distribution<T> dist;
distribution::exponential_transform<T> trans(lambda);
distribution::distribution_and_transform<T>(dev_cxt, out, dist, trans);
phi::funcs::uniform_distribution<T> dist;
phi::funcs::exponential_transform<T> trans(lambda);
phi::funcs::distribution_and_transform<T>(dev_cxt, out, dist, trans);
}
};
......
......@@ -17,7 +17,7 @@
#include "paddle/fluid/framework/generator.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/distribution_helper.h"
#include "paddle/phi/kernels/funcs/distribution_helper.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
......
......@@ -89,9 +89,9 @@ __global__ void BroadcastKernelBinary(
template <typename T>
void LaunchBiasAddFwKernel(const platform::CUDADeviceContext& ctx, int m, int n,
const T* in0, const T* in1, T* out) {
int in_vec_size = std::min(platform::GetVectorizedSize<T>(in0),
platform::GetVectorizedSize<T>(in1));
int out_vec_size = std::min(4, platform::GetVectorizedSize<T>(out));
int in_vec_size =
std::min(phi::GetVectorizedSize<T>(in0), phi::GetVectorizedSize<T>(in1));
int out_vec_size = std::min(4, phi::GetVectorizedSize<T>(out));
int vec_size = std::min(out_vec_size, in_vec_size);
int numel = m * n;
......
......@@ -130,17 +130,17 @@ __global__ void FusedDropoutActGrad(Functor act_grad, const T *dout,
const T factor, const int64_t size, T *dx) {
int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
using LoadT = platform::AlignedVector<T, VecSize>;
using StoreT = platform::AlignedVector<T, VecSize>;
using MaskLoadT = platform::AlignedVector<MaskType, VecSize>;
using LoadT = phi::AlignedVector<T, VecSize>;
using StoreT = phi::AlignedVector<T, VecSize>;
using MaskLoadT = phi::AlignedVector<MaskType, VecSize>;
for (int i = idx * VecSize; i < size; i += blockDim.x * gridDim.x * VecSize) {
LoadT dout_vec;
LoadT src_vec;
MaskLoadT mask_vec;
platform::Load<T, VecSize>(&dout[i], &dout_vec);
platform::Load<MaskType, VecSize>(&mask[i], &mask_vec);
platform::Load<T, VecSize>(&src[i], &src_vec);
phi::Load<T, VecSize>(&dout[i], &dout_vec);
phi::Load<MaskType, VecSize>(&mask[i], &mask_vec);
phi::Load<T, VecSize>(&src[i], &src_vec);
StoreT dx_vec;
#pragma unroll
......@@ -148,7 +148,7 @@ __global__ void FusedDropoutActGrad(Functor act_grad, const T *dout,
T tmp = dout_vec[ii] * static_cast<T>(mask_vec[ii]) * factor;
dx_vec[ii] = tmp * act_grad.UseOut(src_vec[ii]);
}
platform::Store<T, VecSize>(dx_vec, &dx[i]);
phi::Store<T, VecSize>(dx_vec, &dx[i]);
}
}
......@@ -167,9 +167,9 @@ __global__ void FusedDropoutActBiasGrad(Functor act_grad, const T *dout,
T *dx, T *dbias) {
int64_t col_id = blockIdx.x * blockDim.x + threadIdx.x;
using LoadT = platform::AlignedVector<T, VecSize>;
using StoreT = platform::AlignedVector<T, VecSize>;
using MaskLoadT = platform::AlignedVector<MaskType, VecSize>;
using LoadT = phi::AlignedVector<T, VecSize>;
using StoreT = phi::AlignedVector<T, VecSize>;
using MaskLoadT = phi::AlignedVector<MaskType, VecSize>;
T tmp_sum[VecSize] = {static_cast<T>(0)};
// calculate the dx and temporary sum
if (col_id * VecSize < cols) {
......@@ -180,10 +180,10 @@ __global__ void FusedDropoutActBiasGrad(Functor act_grad, const T *dout,
LoadT bias_vec;
MaskLoadT mask_vec;
platform::Load<T, VecSize>(&dout[index], &dout_vec);
platform::Load<T, VecSize>(&src[index], &src_vec);
platform::Load<MaskType, VecSize>(&mask[index], &mask_vec);
platform::Load<T, VecSize>(&bias[col_id * VecSize], &bias_vec);
phi::Load<T, VecSize>(&dout[index], &dout_vec);
phi::Load<T, VecSize>(&src[index], &src_vec);
phi::Load<MaskType, VecSize>(&mask[index], &mask_vec);
phi::Load<T, VecSize>(&bias[col_id * VecSize], &bias_vec);
StoreT dx_vec;
#pragma unroll
......@@ -194,7 +194,7 @@ __global__ void FusedDropoutActBiasGrad(Functor act_grad, const T *dout,
dx_vec[i] = val;
tmp_sum[i] += val;
}
platform::Store<T, VecSize>(dx_vec, &dx[index]);
phi::Store<T, VecSize>(dx_vec, &dx[index]);
}
}
......
......@@ -21,11 +21,11 @@ limitations under the License. */
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/layer_norm_kernel.cu.h"
#include "paddle/fluid/platform/aligned_vector.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/phi/kernels/funcs/aligned_vector.h"
#include "paddle/phi/kernels/funcs/functors.h"
namespace paddle {
......
......@@ -42,12 +42,12 @@ __device__ void CalcLayernormY(
const LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX> *bias, const T *x,
T *y, const int row_id, const int col_id, const int cols,
const LayerNormParamType<T> mean_val, const LayerNormParamType<T> invvar) {
using LoadT = platform::AlignedVector<T, VecSize>;
using StoreT = platform::AlignedVector<T, VecSize>;
using LoadU = platform::AlignedVector<U, VecSize>;
using LoadT = phi::AlignedVector<T, VecSize>;
using StoreT = phi::AlignedVector<T, VecSize>;
using LoadU = phi::AlignedVector<U, VecSize>;
using LoadScaleOrBias =
platform::AlignedVector<LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX>,
VecSize>;
phi::AlignedVector<LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX>,
VecSize>;
for (int i = col_id * VecSize; i < cols; i += blockDim.x * VecSize) {
LoadScaleOrBias scale_vec;
LoadScaleOrBias bias_vec;
......@@ -60,15 +60,15 @@ __device__ void CalcLayernormY(
static_cast<LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX>>(0);
}
// vectorize load data from global
platform::Load<T, VecSize>(&x[row_id * cols + i], &x_vec);
phi::Load<T, VecSize>(&x[row_id * cols + i], &x_vec);
if (scale != nullptr) {
platform::Load<LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX>,
VecSize>(&scale[i], &scale_vec);
phi::Load<LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX>, VecSize>(
&scale[i], &scale_vec);
}
if (bias != nullptr) {
platform::Load<LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX>,
VecSize>(&bias[i], &bias_vec);
phi::Load<LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX>, VecSize>(
&bias[i], &bias_vec);
}
StoreT y_vec;
......@@ -78,7 +78,7 @@ __device__ void CalcLayernormY(
(static_cast<U>(x_vec[ii]) - mean_val) * invvar +
static_cast<U>(bias_vec[ii]));
}
platform::Store<T, VecSize>(y_vec, &y[row_id * cols + i]);
phi::Store<T, VecSize>(y_vec, &y[row_id * cols + i]);
}
}
......@@ -190,9 +190,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel(
const ScaleT *__restrict__ beta_ptr, MaskType *__restrict__ mask_out_ptr,
U *__restrict__ mean_out_ptr, U *__restrict__ var_out_ptr,
T *__restrict__ residual_out_ptr, T *__restrict__ y_ptr) {
using Vec = platform::AlignedVector<T, VecSize>;
using Vec_scale = platform::AlignedVector<ScaleT, VecSize>;
using MaskStoreT = platform::AlignedVector<MaskType, VecSize>;
using Vec = phi::AlignedVector<T, VecSize>;
using Vec_scale = phi::AlignedVector<ScaleT, VecSize>;
using MaskStoreT = phi::AlignedVector<MaskType, VecSize>;
const int tidx = threadIdx.x;
const int bidx = blockIdx.x;
......@@ -214,8 +214,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel(
Vec_scale beta[LDGS];
#pragma unroll
for (int it = 0, col = c; it < LDGS; it++) {
platform::Load<ScaleT, VecSize>(gamma_ptr + col * VecSize, &gamma[it]);
platform::Load<ScaleT, VecSize>(beta_ptr + col * VecSize, &beta[it]);
phi::Load<ScaleT, VecSize>(gamma_ptr + col * VecSize, &gamma[it]);
phi::Load<ScaleT, VecSize>(beta_ptr + col * VecSize, &beta[it]);
col += THREADS_PER_ROW;
}
......@@ -225,10 +225,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel(
Vec residual[LDGS];
#pragma unroll
for (int it = 0, col = c; it < LDGS; it++) {
platform::Load<T, VecSize>(x_ptr + row * LN_NUM_COLS + col * VecSize,
&x[it]);
platform::Load<T, VecSize>(
residual_ptr + row * LN_NUM_COLS + col * VecSize, &residual[it]);
phi::Load<T, VecSize>(x_ptr + row * LN_NUM_COLS + col * VecSize, &x[it]);
phi::Load<T, VecSize>(residual_ptr + row * LN_NUM_COLS + col * VecSize,
&residual[it]);
col += THREADS_PER_ROW;
}
......@@ -270,9 +269,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel(
// store dropout_residual_out and mask_out
#pragma unroll
for (int it = 0, col = c; it < LDGS; it++) {
platform::Store<T, VecSize>(
phi::Store<T, VecSize>(
x[it], residual_out_ptr + row * LN_NUM_COLS + col * VecSize);
platform::Store<MaskType, VecSize>(
phi::Store<MaskType, VecSize>(
mask_vec[it], mask_out_ptr + row * LN_NUM_COLS + col * VecSize);
col += THREADS_PER_ROW;
}
......@@ -333,8 +332,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel(
#pragma unroll
for (int it = 0, col = c; it < LDGS; it++) {
platform::Store<T, VecSize>(x[it],
y_ptr + row * LN_NUM_COLS + col * VecSize);
phi::Store<T, VecSize>(x[it], y_ptr + row * LN_NUM_COLS + col * VecSize);
col += THREADS_PER_ROW;
}
}
......
......@@ -32,9 +32,9 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread(
const T *__restrict__ bias, T *dst, MaskType *mask, const bool is_test,
typename details::MPTypeTrait<T>::Type *mean_val,
typename details::MPTypeTrait<T>::Type *var_val, Functor act_func) {
using LoadT = platform::AlignedVector<T, VecSize>;
using StoreT = platform::AlignedVector<T, VecSize>;
using MaskStoreT = platform::AlignedVector<MaskType, VecSize>;
using LoadT = phi::AlignedVector<T, VecSize>;
using StoreT = phi::AlignedVector<T, VecSize>;
using MaskStoreT = phi::AlignedVector<MaskType, VecSize>;
using U = typename details::MPTypeTrait<T>::Type;
LoadT src_vec;
......@@ -46,14 +46,13 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread(
residual_vec[ii] = static_cast<T>(0);
}
// vectorize load data from global
platform::Load<T, VecSize>(&src[row_id * cols + col_id], &src_vec);
phi::Load<T, VecSize>(&src[row_id * cols + col_id], &src_vec);
if (residual) {
platform::Load<T, VecSize>(&residual[row_id * cols + col_id],
&residual_vec);
phi::Load<T, VecSize>(&residual[row_id * cols + col_id], &residual_vec);
}
if (bias) {
platform::Load<T, VecSize>(&bias[col_id], &bias_vec);
phi::Load<T, VecSize>(&bias[col_id], &bias_vec);
}
MaskStoreT mask_vec;
......@@ -89,9 +88,9 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread(
}
// store result to global
platform::Store<T, VecSize>(dest_vec, &dst[row_id * cols + col_id]);
phi::Store<T, VecSize>(dest_vec, &dst[row_id * cols + col_id]);
if (!is_test) {
platform::Store<MaskType, VecSize>(mask_vec, &mask[row_id * cols + col_id]);
phi::Store<MaskType, VecSize>(mask_vec, &mask[row_id * cols + col_id]);
}
}
......@@ -176,21 +175,21 @@ __global__ void FusedResidualDropoutGrad(const T *dout, const MaskType *mask,
T *dx) {
int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
using LoadT = platform::AlignedVector<T, VecSize>;
using StoreT = platform::AlignedVector<T, VecSize>;
using MaskLoadT = platform::AlignedVector<MaskType, VecSize>;
using LoadT = phi::AlignedVector<T, VecSize>;
using StoreT = phi::AlignedVector<T, VecSize>;
using MaskLoadT = phi::AlignedVector<MaskType, VecSize>;
for (int i = idx * VecSize; i < size; i += blockDim.x * gridDim.x * VecSize) {
LoadT dout_vec;
MaskLoadT mask_vec;
platform::Load<T, VecSize>(&dout[i], &dout_vec);
platform::Load<MaskType, VecSize>(&mask[i], &mask_vec);
phi::Load<T, VecSize>(&dout[i], &dout_vec);
phi::Load<MaskType, VecSize>(&mask[i], &mask_vec);
StoreT dx_vec;
#pragma unroll
for (int ii = 0; ii < VecSize; ii++) {
dx_vec[ii] = dout_vec[ii] * static_cast<T>(mask_vec[ii]) * factor;
}
platform::Store<T, VecSize>(dx_vec, &dx[i]);
phi::Store<T, VecSize>(dx_vec, &dx[i]);
}
}
......@@ -209,9 +208,9 @@ __global__ void FusedResidualDropoutBiasGrad(const T *dout,
T *dbias) {
int64_t col_id = blockIdx.x * blockDim.x + threadIdx.x;
using LoadT = platform::AlignedVector<T, VecSize>;
using StoreT = platform::AlignedVector<T, VecSize>;
using MaskLoadT = platform::AlignedVector<MaskType, VecSize>;
using LoadT = phi::AlignedVector<T, VecSize>;
using StoreT = phi::AlignedVector<T, VecSize>;
using MaskLoadT = phi::AlignedVector<MaskType, VecSize>;
T tmp_sum[VecSize] = {static_cast<T>(0)};
// calculate the dx and temporary sum
......@@ -221,8 +220,8 @@ __global__ void FusedResidualDropoutBiasGrad(const T *dout,
LoadT out_vec;
MaskLoadT mask_vec;
StoreT dx_vec;
platform::Load<T, VecSize>(&dout[index], &out_vec);
platform::Load<MaskType, VecSize>(&mask[index], &mask_vec);
phi::Load<T, VecSize>(&dout[index], &out_vec);
phi::Load<MaskType, VecSize>(&mask[index], &mask_vec);
#pragma unroll
for (int i = 0; i < VecSize; i++) {
......@@ -230,7 +229,7 @@ __global__ void FusedResidualDropoutBiasGrad(const T *dout,
tmp_sum[i] += out_vec[i];
}
platform::Store<T, VecSize>(dx_vec, &dx[index]);
phi::Store<T, VecSize>(dx_vec, &dx[index]);
}
}
......
......@@ -19,9 +19,10 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/distribution_helper.h"
#include "paddle/fluid/operators/fill_constant_op.h"
#include "paddle/fluid/operators/index_impl.cu.h"
#include "paddle/phi/kernels/funcs/distribution_helper.h"
#include "paddle/phi/kernels/funcs/index_impl.cu.h"
DECLARE_bool(use_curand);
......@@ -79,10 +80,10 @@ class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel<T> {
int64_t gen_offset = size * seed_offset.second;
auto func = GaussianGenerator<T>(mean, std, seed_offset.first,
seed_offset.second);
IndexKernel<T, GaussianGenerator<T>>(dev_cxt, tensor, func);
phi::IndexKernel<T, GaussianGenerator<T>>(dev_cxt, tensor, func);
} else {
auto func = GaussianGenerator<T>(mean, std, seed);
IndexKernel<T, GaussianGenerator<T>>(dev_cxt, tensor, func);
phi::IndexKernel<T, GaussianGenerator<T>>(dev_cxt, tensor, func);
}
}
};
......
......@@ -58,7 +58,7 @@ static __global__ void FP16FastGeluFwdCUDAKernel(const __half* x, __half* y,
static_cast<size_t>(threadIdx.x + blockIdx.x * blockDim.x) * VecSize;
size_t stride = static_cast<size_t>(blockDim.x * gridDim.x) * VecSize;
for (; offset < n; offset += stride) {
using ArrT = platform::AlignedVector<__half, VecSize>;
using ArrT = phi::AlignedVector<__half, VecSize>;
ArrT in_arr = *reinterpret_cast<const ArrT*>(x + offset);
#pragma unroll
for (int i = 0; i < VecSize; ++i) {
......@@ -77,7 +77,7 @@ static __global__ void FP16FastGeluBwdCUDAKernel(const __half* x,
static_cast<size_t>(threadIdx.x + blockIdx.x * blockDim.x) * VecSize;
size_t stride = static_cast<size_t>(blockDim.x * gridDim.x) * VecSize;
for (; offset < n; offset += stride) {
using ArrT = platform::AlignedVector<__half, VecSize>;
using ArrT = phi::AlignedVector<__half, VecSize>;
ArrT x_in_arr = *reinterpret_cast<const ArrT*>(x + offset);
ArrT y_g_in_arr = *reinterpret_cast<const ArrT*>(y_g + offset);
#pragma unroll
......@@ -103,7 +103,7 @@ static bool TryLaunchFP16FastGeluFwdVectorizeCUDAKernel(
#define PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(__vec_size, __use_fast_math) \
do { \
constexpr auto kAlignment = \
alignof(platform::AlignedVector<__half, __vec_size>); \
alignof(phi::AlignedVector<__half, __vec_size>); \
if (n % __vec_size == 0 && is_aligned(x, kAlignment) && \
is_aligned(y, kAlignment)) { \
size_t thread = std::min<size_t>(512, dev_ctx.GetMaxThreadsPerBlock()); \
......@@ -138,7 +138,7 @@ static bool TryLaunchFP16FastGeluBwdVectorizeCUDAKernel(
#define PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(__vec_size, __use_fast_math) \
do { \
constexpr auto kAlignment = \
alignof(platform::AlignedVector<__half, __vec_size>); \
alignof(phi::AlignedVector<__half, __vec_size>); \
if (n % __vec_size == 0 && is_aligned(x, kAlignment) && \
is_aligned(x, kAlignment) && is_aligned(y_g, kAlignment) && \
is_aligned(x_g, kAlignment)) { \
......
......@@ -19,11 +19,11 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/distribution_helper.h"
#include "paddle/fluid/operators/fill_constant_op.h"
#include "paddle/fluid/platform/aligned_vector.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/hostdevice.h"
#include "paddle/phi/kernels/funcs/aligned_vector.h"
#include "paddle/phi/kernels/funcs/distribution_helper.h"
#include "paddle/phi/kernels/primitive/kernel_primitives.h"
namespace paddle {
......@@ -58,7 +58,7 @@ void IndexKernel(const KPDevice &dev_ctx, Tensor *out, Functor func) {
int numel = out->numel();
T *out_data = out->mutable_data<T>(dev_ctx.GetPlace());
if (numel <= 0) return;
int vec_size = paddle::platform::GetVectorizedSize(out_data);
int vec_size = phi::GetVectorizedSize(out_data);
#ifdef PADDLE_WITH_XPU_KP
int block = 64;
int grid = 8;
......
......@@ -22,10 +22,10 @@ limitations under the License. */
namespace cub = hipcub;
#endif
#include "paddle/fluid/platform/aligned_vector.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/kernels/funcs/aligned_vector.h"
namespace paddle {
namespace operators {
......@@ -186,8 +186,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_fwd_1024_kernel(
const ScaleT *__restrict__ gamma_ptr, const ScaleT *__restrict__ beta_ptr,
U *__restrict__ mean_out_ptr, U *__restrict__ var_out_ptr,
T *__restrict__ y_ptr) {
using Vec = platform::AlignedVector<T, VecSize>;
using Vec_scale = platform::AlignedVector<ScaleT, VecSize>;
using Vec = phi::AlignedVector<T, VecSize>;
using Vec_scale = phi::AlignedVector<ScaleT, VecSize>;
const int tidx = threadIdx.x;
const int bidx = blockIdx.x;
......@@ -203,8 +203,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_fwd_1024_kernel(
Vec_scale beta[LDGS];
#pragma unroll
for (int it = 0, col = c; it < LDGS; it++) {
platform::Load<ScaleT, VecSize>(gamma_ptr + col * VecSize, &gamma[it]);
platform::Load<ScaleT, VecSize>(beta_ptr + col * VecSize, &beta[it]);
phi::Load<ScaleT, VecSize>(gamma_ptr + col * VecSize, &gamma[it]);
phi::Load<ScaleT, VecSize>(beta_ptr + col * VecSize, &beta[it]);
col += THREADS_PER_ROW;
}
......@@ -213,8 +213,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_fwd_1024_kernel(
Vec x[LDGS];
#pragma unroll
for (int it = 0, col = c; it < LDGS; it++) {
platform::Load<T, VecSize>(x_ptr + row * LN_NUM_COLS + col * VecSize,
&x[it]);
phi::Load<T, VecSize>(x_ptr + row * LN_NUM_COLS + col * VecSize, &x[it]);
col += THREADS_PER_ROW;
}
U xf[LDGS * VecSize];
......@@ -276,8 +275,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_fwd_1024_kernel(
#pragma unroll
for (int it = 0, col = c; it < LDGS; it++) {
platform::Store<T, VecSize>(x[it],
y_ptr + row * LN_NUM_COLS + col * VecSize);
phi::Store<T, VecSize>(x[it], y_ptr + row * LN_NUM_COLS + col * VecSize);
col += THREADS_PER_ROW;
}
}
......@@ -401,9 +399,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel(
U *__restrict__ dgamma_temp_ptr, U *__restrict__ dbeta_temp_ptr,
T *__restrict__ dx_ptr, const MaskType *mask_ptr = nullptr,
T factor = static_cast<T>(0), T *d_dropout_src_ptr = nullptr) {
using Vec = platform::AlignedVector<T, VecSize>;
using Vec_scale = platform::AlignedVector<ScaleT, VecSize>;
using MaskLoadT = platform::AlignedVector<MaskType, VecSize>;
using Vec = phi::AlignedVector<T, VecSize>;
using Vec_scale = phi::AlignedVector<ScaleT, VecSize>;
using MaskLoadT = phi::AlignedVector<MaskType, VecSize>;
const int tidx = threadIdx.x;
const int bidx = blockIdx.x;
......@@ -439,7 +437,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel(
int col = c;
#pragma unroll
for (int it = 0; it < LDGS; it++) {
platform::Load<ScaleT, VecSize>(gamma_ptr + col * VecSize, &gamma[it]);
phi::Load<ScaleT, VecSize>(gamma_ptr + col * VecSize, &gamma[it]);
col += THREADS_PER_ROW;
}
......@@ -452,12 +450,11 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel(
int col = c;
#pragma unroll
for (int it = 0; it < LDGS; it++) {
platform::Load<T, VecSize>(dout_ptr + row * LN_NUM_COLS + col * VecSize,
&dout[it]);
platform::Load<T, VecSize>(x_ptr + row * LN_NUM_COLS + col * VecSize,
&x[it]);
phi::Load<T, VecSize>(dout_ptr + row * LN_NUM_COLS + col * VecSize,
&dout[it]);
phi::Load<T, VecSize>(x_ptr + row * LN_NUM_COLS + col * VecSize, &x[it]);
if (isFusedDropoutResidualLn) {
platform::Load<MaskType, VecSize>(
phi::Load<MaskType, VecSize>(
mask_ptr + row * LN_NUM_COLS + col * VecSize, &mask_vec[it]);
}
......@@ -552,10 +549,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel(
col = c;
#pragma unroll
for (int it = 0; it < LDGS; it++) {
platform::Store<T, VecSize>(x[it],
dx_ptr + row * LN_NUM_COLS + col * VecSize);
phi::Store<T, VecSize>(x[it], dx_ptr + row * LN_NUM_COLS + col * VecSize);
if (isFusedDropoutResidualLn) {
platform::Store<T, VecSize>(
phi::Store<T, VecSize>(
dout[it], d_dropout_src_ptr + row * LN_NUM_COLS + col * VecSize);
}
col += THREADS_PER_ROW;
......@@ -641,7 +637,7 @@ template <
__global__ __launch_bounds__(THREADS_PER_CTA) void ln_bwd_1024_final_kernel(
const int rows, U *__restrict__ dg_part_, U *__restrict__ db_part_,
ScaleT *__restrict__ dg_, ScaleT *__restrict__ db_) {
using Vec = platform::AlignedVector<U, VecSize>;
using Vec = phi::AlignedVector<U, VecSize>;
static_assert(VEC_COLS == LN_NUM_COLS / VecSize, "");
const int tidx = threadIdx.x;
......@@ -669,8 +665,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_bwd_1024_final_kernel(
for (int row = r; row < rows; row += ROWS_PER_CTA) {
Vec dg;
Vec db;
platform::Load<U, VecSize>(dg_part_ptr, &dg);
platform::Load<U, VecSize>(db_part_ptr, &db);
phi::Load<U, VecSize>(dg_part_ptr, &dg);
phi::Load<U, VecSize>(db_part_ptr, &db);
dg_part_ptr += ROWS_PER_CTA * LN_NUM_COLS;
db_part_ptr += ROWS_PER_CTA * LN_NUM_COLS;
......
......@@ -57,8 +57,7 @@ static void LaunchCastKernel(const platform::CUDADeviceContext &ctx,
PADDLE_ENFORCE_NE(
static_cast<const void *>(x), static_cast<void *>(y),
platform::errors::InvalidArgument("Inplace cast is not supported yet."));
int vec_size =
std::min(platform::GetVectorizedSize(x), platform::GetVectorizedSize(y));
int vec_size = std::min(phi::GetVectorizedSize(x), phi::GetVectorizedSize(y));
switch (vec_size) {
case 4:
return details::VecCastKernel<InT, OutT, 4>(ctx, x, y, n);
......
......@@ -19,11 +19,11 @@
#include "paddle/fluid/operators/optimizers/distributed_fused_lamb_op.h"
#include "paddle/fluid/operators/optimizers/multi_tensor_apply.h"
#include "paddle/fluid/operators/tensor_to_string.h"
#include "paddle/fluid/platform/aligned_vector.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/fluid/string/string_helper.h"
#include "paddle/phi/core/utils/data_type.h"
#include "paddle/phi/kernels/funcs/aligned_vector.h"
#ifdef __NVCC__
#include "cub/cub.cuh"
......@@ -66,8 +66,8 @@ struct L2NormFunctor {
int i;
for (i = threadIdx.x * VecSize; i + VecSize <= size;
i += (BlockDim * VecSize)) {
platform::AlignedVector<T, VecSize> tmp_vec;
platform::Load(ptr + i, &tmp_vec);
phi::AlignedVector<T, VecSize> tmp_vec;
phi::Load(ptr + i, &tmp_vec);
#pragma unroll
for (int j = 0; j < VecSize; ++j) {
auto tmp = static_cast<MT>(tmp_vec[j]);
......@@ -111,9 +111,9 @@ static int GetChunkedVecSize(const T *ptr, int chunk_size) {
constexpr int max_load_bits = 128;
int valid_vec_size = max_load_bits / CHAR_BIT / sizeof(T);
auto address = reinterpret_cast<uintptr_t>(ptr);
constexpr int vec8 = alignof(platform::AlignedVector<T, 8>);
constexpr int vec4 = alignof(platform::AlignedVector<T, 4>);
constexpr int vec2 = alignof(platform::AlignedVector<T, 2>);
constexpr int vec8 = alignof(phi::AlignedVector<T, 8>);
constexpr int vec4 = alignof(phi::AlignedVector<T, 4>);
constexpr int vec2 = alignof(phi::AlignedVector<T, 2>);
chunk_size *= sizeof(T);
if (address % vec8 == 0 && chunk_size % vec8 == 0) {
return std::min(8, valid_vec_size);
......@@ -316,15 +316,15 @@ static __global__ void ScaleCUDAKernel(const T1 *__restrict__ x,
int stride = blockDim.x * gridDim.x * VecSize;
for (; i + VecSize <= num; i += stride) {
platform::AlignedVector<T1, VecSize> x_vec;
platform::AlignedVector<T1, VecSize> y_vec;
phi::AlignedVector<T1, VecSize> x_vec;
phi::AlignedVector<T1, VecSize> y_vec;
platform::Load(x + i, &x_vec);
phi::Load(x + i, &x_vec);
#pragma unroll
for (int j = 0; j < VecSize; ++j) {
y_vec[j] = static_cast<T1>(static_cast<T2>(x_vec[j]) * s);
}
platform::Store(y_vec, y + i);
phi::Store(y_vec, y + i);
}
for (; i < num; ++i) {
......@@ -410,24 +410,24 @@ static __global__ void UpdateLambMomentAndTrustRatioDivCUDAKernel(
int stride = blockDim.x * gridDim.x * VecSize;
for (; i + VecSize <= num; i += stride) {
platform::AlignedVector<T, VecSize> param_vec;
platform::AlignedVector<GradT, VecSize> grad_vec;
platform::AlignedVector<T, VecSize> mom1_vec;
platform::AlignedVector<T, VecSize> mom2_vec;
platform::AlignedVector<T, VecSize> trust_ratio_div_vec;
phi::AlignedVector<T, VecSize> param_vec;
phi::AlignedVector<GradT, VecSize> grad_vec;
phi::AlignedVector<T, VecSize> mom1_vec;
phi::AlignedVector<T, VecSize> mom2_vec;
phi::AlignedVector<T, VecSize> trust_ratio_div_vec;
T cur_weight_decay = (i < weight_decay_end_numel) * weight_decay;
if (cur_weight_decay != static_cast<T>(0.0)) {
platform::Load(param_p + i, &param_vec);
phi::Load(param_p + i, &param_vec);
} else {
#pragma unroll
for (int j = 0; j < VecSize; ++j) {
param_vec[j] = static_cast<T>(0);
}
}
platform::Load(grad_p + i, &grad_vec);
platform::Load(mom1_p + i, &mom1_vec);
platform::Load(mom2_p + i, &mom2_vec);
phi::Load(grad_p + i, &grad_vec);
phi::Load(mom1_p + i, &mom1_vec);
phi::Load(mom2_p + i, &mom2_vec);
#define PD_LAMB_MOM_TRUST_RATIO_DIV_UPDATE(__param, __grad, __mom1, __mom2, \
__trust_ratio_div, __idx) \
......@@ -450,9 +450,9 @@ static __global__ void UpdateLambMomentAndTrustRatioDivCUDAKernel(
mom2_vec, trust_ratio_div_vec, j);
}
platform::Store(mom1_vec, mom1_p + i);
platform::Store(mom2_vec, mom2_p + i);
platform::Store(trust_ratio_div_vec, trust_ratio_div_p + i);
phi::Store(mom1_vec, mom1_p + i);
phi::Store(mom2_vec, mom2_p + i);
phi::Store(trust_ratio_div_vec, trust_ratio_div_p + i);
}
for (; i < num; ++i) {
......@@ -632,29 +632,29 @@ struct LambUpdateParamAndBetaPowsFunctor {
trust_ratio_div += offset;
for (i = threadIdx.x * VecSize; i + VecSize <= size; i += stride) {
platform::AlignedVector<MT, VecSize> trust_ratio_div_vec;
platform::Load(trust_ratio_div + i, &trust_ratio_div_vec);
phi::AlignedVector<MT, VecSize> trust_ratio_div_vec;
phi::Load(trust_ratio_div + i, &trust_ratio_div_vec);
if (HasMasterParam) {
platform::AlignedVector<MT, VecSize> master_param_vec;
platform::Load(master_param + i, &master_param_vec);
platform::AlignedVector<ParamT, VecSize> param_vec;
phi::AlignedVector<MT, VecSize> master_param_vec;
phi::Load(master_param + i, &master_param_vec);
phi::AlignedVector<ParamT, VecSize> param_vec;
#pragma unroll
for (int j = 0; j < VecSize; ++j) {
MT p = master_param_vec[j] - ratio * trust_ratio_div_vec[j];
master_param_vec[j] = p;
param_vec[j] = static_cast<ParamT>(p);
}
platform::Store(master_param_vec, master_param + i);
platform::Store(param_vec, param + i);
phi::Store(master_param_vec, master_param + i);
phi::Store(param_vec, param + i);
} else {
platform::AlignedVector<ParamT, VecSize> param_vec;
platform::Load(param + i, &param_vec);
phi::AlignedVector<ParamT, VecSize> param_vec;
phi::Load(param + i, &param_vec);
#pragma unroll
for (int j = 0; j < VecSize; ++j) {
MT p = static_cast<MT>(param_vec[j]) - ratio * trust_ratio_div_vec[j];
param_vec[j] = static_cast<ParamT>(p);
}
platform::Store(param_vec, param + i);
phi::Store(param_vec, param + i);
}
}
......
......@@ -88,8 +88,8 @@ __device__ inline void VectorizeLarsUpdate(
T* param_out, MT* velocity_out, const MT mu, MT local_lr,
const MT lars_weight_decay, const MT rescale_grad, const int tid,
const int grid_stride, const int numel, MT* master_param_out = nullptr) {
using VecType = paddle::platform::AlignedVector<T, VecSize>;
using VecMType = paddle::platform::AlignedVector<MT, VecSize>;
using VecType = phi::AlignedVector<T, VecSize>;
using VecMType = phi::AlignedVector<MT, VecSize>;
int main = numel >> (VecSize >> 1);
int tail_offset = main * VecSize;
......
......@@ -25,8 +25,9 @@ DECLARE_bool(use_curand);
#include <thrust/random.h>
#include <thrust/transform.h>
#include "paddle/fluid/framework/generator.h"
#include "paddle/fluid/operators/index_impl.cu.h"
#include "paddle/phi/kernels/full_kernel.h"
#include "paddle/phi/kernels/funcs/distribution_helper.h"
#include "paddle/phi/kernels/funcs/index_impl.cu.h"
#endif
namespace paddle {
......@@ -206,21 +207,21 @@ void UniformRandom(const framework::ExecutionContext& context,
if (gen_cuda->GetIsInitPy() && seed_flag) {
if (FLAGS_use_curand) {
using MT = typename details::MPTypeTrait<T>::Type;
distribution::uniform_distribution<MT> dist;
distribution::uniform_transform<MT> trans(min, max);
distribution::distribution_and_transform<T>(dev_cxt, tensor, dist, trans);
phi::funcs::uniform_distribution<MT> dist;
phi::funcs::uniform_real_transform<MT> trans(min, max);
phi::funcs::distribution_and_transform<T>(dev_cxt, tensor, dist, trans);
} else {
auto seed_offset = gen_cuda->IncrementOffset(1);
int64_t gen_offset = size * seed_offset.second;
auto func =
UniformGeneratorOffset<T>(min, max, seed_offset.first, diag_num,
diag_step, diag_val, gen_offset);
IndexKernel<T, UniformGeneratorOffset<T>>(dev_cxt, tensor, func);
phi::IndexKernel<T, UniformGeneratorOffset<T>>(dev_cxt, tensor, func);
}
} else {
auto func =
UniformGenerator<T>(min, max, seed, diag_num, diag_step, diag_val);
IndexKernel<T, UniformGenerator<T>>(dev_cxt, tensor, func);
phi::IndexKernel<T, UniformGenerator<T>>(dev_cxt, tensor, func);
}
}
#endif
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#pragma once
#include <cstdint>
#include "paddle/fluid/platform/aligned_vector.h"
#include "paddle/phi/kernels/funcs/aligned_vector.h"
#define INT_BITS 32
......@@ -25,7 +25,7 @@ namespace platform {
struct FastDivMod {
// 1st value represents the result of the input number divided by the recorded divisor
// 2nd value represents the result of the input number modulo the recorded divisor
using DivModT = AlignedVector<uint32_t, 2>;
using DivModT = phi::AlignedVector<uint32_t, 2>;
FastDivMod() {}
HOSTDEVICE FastDivMod(uint32_t d) : divisor(d) {
......
......@@ -493,16 +493,14 @@ void BroadcastKernelForDifferentVecSize(
"%d-th output tensor`s shape is not.",
i));
out_vec_size = std::min(
paddle::platform::GetVectorizedSize<OutT>((*outs)[i]->data<OutT>()),
out_vec_size);
phi::GetVectorizedSize<OutT>((*outs)[i]->data<OutT>()), out_vec_size);
}
} else {
out_vec_size =
paddle::platform::GetVectorizedSize<OutT>((*outs)[0]->data<OutT>());
out_vec_size = phi::GetVectorizedSize<OutT>((*outs)[0]->data<OutT>());
}
for (auto *in : ins) {
auto temp_size = paddle::platform::GetVectorizedSize<InT>(in->data<InT>());
auto temp_size = phi::GetVectorizedSize<InT>(in->data<InT>());
in_vec_size = in->dims() == (*outs)[0]->dims()
? std::min(temp_size, in_vec_size)
: in_vec_size;
......
......@@ -28,6 +28,7 @@ limitations under the License. */
#include "paddle/phi/core/hostdevice.h"
#if defined(__NVCC__) || defined(__HIPCC__)
#include "paddle/phi/kernels/funcs/index_impl.cu.h"
#include "paddle/phi/kernels/primitive/kernel_primitives.h"
#endif
......
......@@ -23,9 +23,9 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/math_function.h"
#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__)
#include "paddle/fluid/platform/aligned_vector.h"
#include "paddle/fluid/platform/function_traits.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/kernels/funcs/aligned_vector.h"
#include "paddle/phi/kernels/primitive/kernel_primitives.h"
#define HOSTDEVICE __host__ __device__
......@@ -546,9 +546,8 @@ struct VecSizeGetter {
const ArgsT &args,
int *vec_size) {
using Type = std::tuple_element_t<Index, ArgsT>;
*vec_size = std::min<int>(
*vec_size,
paddle::platform::GetVectorizedSize(ins[Index]->data<Type>()));
*vec_size = std::min<int>(*vec_size,
phi::GetVectorizedSize(ins[Index]->data<Type>()));
}
};
......@@ -563,8 +562,8 @@ int GetVectorizedSizeForTensors(const std::vector<const DenseTensor *> &ins,
// The Arg VecSize=1 is to match the Unroller template.
Unroller<VecSizeGetter, 1, Arity>::step(ins, arg, &vec_size);
for (auto iter = outs.begin(); iter != outs.end(); ++iter) {
vec_size = std::min<int>(
vec_size, paddle::platform::GetVectorizedSize((*iter)->data<OutT>()));
vec_size =
std::min<int>(vec_size, phi::GetVectorizedSize((*iter)->data<OutT>()));
}
return vec_size;
}
......
......@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/bernoulli_kernel.h"
#include <thrust/random.h>
#include <thrust/transform.h>
#ifdef __NVCC__
......@@ -28,7 +30,6 @@
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/bernoulli_kernel.h"
#include "paddle/phi/kernels/funcs/distribution_helper.h"
// See Note [ Why still include the fluid headers? ]
......
......@@ -20,11 +20,11 @@
#include "paddle/phi/kernels/funcs/elementwise_base.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/platform/aligned_vector.h"
#include "paddle/fluid/platform/device/gpu/gpu_helper.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/kernels/funcs/aligned_vector.h"
namespace phi {
......
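
Taken together, the hunks in this commit apply one mechanical substitution. A representative before/after, condensed from the changes above as an editorial recap (the surrounding kernels are elided):

// before (fluid)
#include "paddle/fluid/platform/aligned_vector.h"
using LoadT = platform::AlignedVector<T, VecSize>;
platform::Load<T, VecSize>(&src[i], &src_val);
platform::Store<T, VecSize>(dst_val, &dst[i]);
int vec_size = (platform::GetVectorizedSize<T>(x_data) == 4) ? 4 : 1;

// after (phi)
#include "paddle/phi/kernels/funcs/aligned_vector.h"
using LoadT = phi::AlignedVector<T, VecSize>;
phi::Load<T, VecSize>(&src[i], &src_val);
phi::Store<T, VecSize>(dst_val, &dst[i]);
int vec_size = (phi::GetVectorizedSize<T>(x_data) == 4) ? 4 : 1;

// Likewise in the hunks above: distribution::* -> phi::funcs::*
// (distribution_helper.h), IndexKernel -> phi::IndexKernel
// (index_impl.cu.h), and distribution::uniform_transform ->
// phi::funcs::uniform_real_transform.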