Unverified commit 728c0624 authored by xiongkun, committed by GitHub

change Vector to std::vector and provide MixVector class as a helper … (#39559)

* change Vector to std::vector and provide MixVector class as a helper wrapper class

* solve the multi-gpu hang problem

* remove the duplicate template instantiation

* Copy vector to cpu

* add CopyToCPU

* xxx

* final version: fix the problem of all reduce

* remove mixvector dependence

* fix

* merge

* fix code

* fix by CI
Parent d56a0a1b
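The change applied throughout the diff below follows one pattern: keep the data in a plain std::vector, wrap it in a paddle::framework::MixVector whenever a device pointer is needed, and call CopyToCPU() after the kernel finishes so the host copy is synchronized again. A minimal sketch of that pattern, based on the updated tests below (the kernel name scale_by_two is illustrative only, not part of this commit):

std::vector<size_t> lod0 = {0, 1, 6, 8, 10, 11};
// Wrap the host vector; MixVector manages the GPU mirror and the dirty flags.
paddle::framework::MixVector<size_t> mix_lod(&lod0);
paddle::platform::CUDAPlace gpu(0);
// Copies the host data to the device on demand and returns a device pointer.
size_t* dev_ptr = mix_lod.CUDAMutableData(gpu);
scale_by_two<<<1, 1>>>(dev_ptr, lod0.size());  // hypothetical kernel
cudaDeviceSynchronize();
// Copy the device-side result back into lod0 before reading it on the host.
mix_lod.CopyToCPU();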
......@@ -31,15 +31,17 @@ TEST(LoD, data) {
lod.push_back(std::vector<size_t>({0, 1, 6, 8, 10, 11}));
auto& v = lod[0];
paddle::framework::MixVector<size_t> mix_vector_v(&v);
paddle::platform::CUDAPlace gpu(0);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(test, dim3(1), dim3(1), 0, 0, v.CUDAMutableData(gpu),
v.size());
hipLaunchKernelGGL(test, dim3(1), dim3(1), 0, 0,
mix_vector_v.CUDAMutableData(gpu), v.size());
hipDeviceSynchronize();
#else
test<<<1, 1>>>(v.CUDAMutableData(gpu), v.size());
test<<<1, 1>>>(mix_vector_v.CUDAMutableData(gpu), v.size());
cudaDeviceSynchronize();
#endif
mix_vector_v.CopyToCPU();
for (size_t i = 0; i < v.size(); ++i) {
EXPECT_EQ(v[i], i * 2);
}
......@@ -62,15 +64,17 @@ TEST(LoDTensor, LoDInGPU) {
EXPECT_EQ(lod_tensor.lod_element(0, 4).first, 8UL);
auto lod = lod_tensor.lod();
paddle::framework::MixVector<size_t> mix_vector(&(lod[0]));
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(test, dim3(1), dim3(8), 0, 0,
lod[0].CUDAMutableData(place), lod[0].size());
mix_vector.CUDAMutableData(place), lod[0].size());
hipDeviceSynchronize();
#else
test<<<1, 8>>>(lod[0].CUDAMutableData(place), lod[0].size());
test<<<1, 8>>>(mix_vector.CUDAMutableData(place), lod[0].size());
cudaDeviceSynchronize();
#endif
mix_vector.CopyToCPU();
for (size_t i = 0; i < src_lod[0].size(); ++i) {
EXPECT_EQ(lod[0].data()[i], src_lod[0].data()[i] * 2);
......
......@@ -64,19 +64,20 @@ void CopyCPUDataToCUDAHelper(std::vector<T> *cpu_,
auto stream = dev_ctx->stream();
paddle::memory::Copy(OptionalCUDAPlace(*gpu_).get(), dst,
platform::CPUPlace(), src, *gpu_memory_size_, stream);
dev_ctx->Wait();
#endif
}
#define INSTANTIATE_VECTOR_FOR_TYPE(__TYPE__) \
template <> \
void Vector<__TYPE__>::VectorData::CopyToCPU() const { \
CopyToCPUHelper<__TYPE__>(&cpu_, &gpu_, &gpu_memory_size_); \
} \
\
template <> \
void Vector<__TYPE__>::VectorData::CopyCPUDataToCUDA( \
const platform::Place &place) const { \
CopyCPUDataToCUDAHelper<__TYPE__>(&cpu_, &gpu_, &gpu_memory_size_, place); \
#define INSTANTIATE_VECTOR_FOR_TYPE(__TYPE__) \
template <> \
void MixVector<__TYPE__>::VectorData::CopyToCPU() const { \
CopyToCPUHelper<__TYPE__>(cpu_, &gpu_, &gpu_memory_size_); \
} \
\
template <> \
void MixVector<__TYPE__>::VectorData::CopyCPUDataToCUDA( \
const platform::Place &place) const { \
CopyCPUDataToCUDAHelper<__TYPE__>(cpu_, &gpu_, &gpu_memory_size_, place); \
}
INSTANTIATE_VECTOR_FOR_TYPE(size_t)
......
......@@ -22,7 +22,6 @@ limitations under the License. */
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/framework/details/cow_ptr.h"
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/utils/none.h"
#include "paddle/utils/optional.h"
......@@ -30,6 +29,9 @@ limitations under the License. */
namespace paddle {
namespace framework {
template <class T>
using Vector = std::vector<T>;
inline paddle::optional<platform::CUDAPlace> OptionalCUDAPlace(
const paddle::memory::allocation::AllocationPtr &gpu_) {
return gpu_ == nullptr ? paddle::none
......@@ -39,7 +41,7 @@ inline paddle::optional<platform::CUDAPlace> OptionalCUDAPlace(
// Vector<T> implements the std::vector interface, and can get Data or
// MutableData from any place. The data will be synced implicitly inside.
template <typename T>
class Vector {
class MixVector {
public:
using value_type = T;
using iterator = typename std::vector<T>::iterator;
......@@ -49,82 +51,68 @@ class Vector {
// The actual class to implement vector logic
class VectorData {
public:
VectorData() : flag_(kDataInCPU) {}
VectorData(size_t count, const T &value)
: cpu_(count, value), flag_(kDataInCPU) {}
VectorData(std::initializer_list<T> init) : cpu_(init), flag_(kDataInCPU) {}
template <typename U>
explicit VectorData(const std::vector<U> &dat)
: cpu_(dat), flag_(kDataInCPU) {}
explicit VectorData(std::vector<U> *dat) : cpu_(dat), flag_(kDataInCPU) {}
~VectorData() {}
VectorData(const VectorData &o) {
o.ImmutableCPU();
cpu_ = o.cpu_;
flag_ = kDataInCPU;
}
VectorData(const VectorData &o) = delete;
VectorData &operator=(const VectorData &o) {
o.ImmutableCPU();
cpu_ = o.cpu_;
flag_ = kDataInCPU;
return *this;
}
VectorData &operator=(const VectorData &o) = delete;
T &operator[](size_t i) {
MutableCPU();
return cpu_[i];
return (*cpu_)[i];
}
const T &operator[](size_t i) const {
ImmutableCPU();
return cpu_[i];
return (*cpu_)[i];
}
size_t size() const { return cpu_.size(); }
size_t size() const { return (*cpu_).size(); }
iterator begin() {
MutableCPU();
return cpu_.begin();
return (*cpu_).begin();
}
iterator end() {
MutableCPU();
return cpu_.end();
return (*cpu_).end();
}
T &front() {
MutableCPU();
return cpu_.front();
return (*cpu_).front();
}
T &back() {
MutableCPU();
return cpu_.back();
return (*cpu_).back();
}
const_iterator begin() const {
ImmutableCPU();
return cpu_.begin();
return (*cpu_).begin();
}
const_iterator end() const {
ImmutableCPU();
return cpu_.end();
return (*cpu_).end();
}
const T &back() const {
ImmutableCPU();
return cpu_.back();
return (*cpu_).back();
}
T *data() { return &(*this)[0]; }
T *data() { return cpu_->data(); }
const T *data() const { return &(*this)[0]; }
const T *data() const { return cpu_->data(); }
const T &front() const {
ImmutableCPU();
return cpu_.front();
return (*cpu_).front();
}
// assign this from iterator.
......@@ -132,14 +120,14 @@ class Vector {
template <typename Iter>
void assign(Iter begin, Iter end) {
MutableCPU();
cpu_.assign(begin, end);
(*cpu_).assign(begin, end);
}
// push_back. If the previous capacity is not enough, the memory will
// double.
void push_back(T elem) {
MutableCPU();
cpu_.push_back(elem);
(*cpu_).push_back(elem);
}
// extend a vector by iterator.
......@@ -147,14 +135,14 @@ class Vector {
template <typename It>
void Extend(It begin, It end) {
MutableCPU();
auto out_it = std::back_inserter<std::vector<T>>(this->cpu_);
auto out_it = std::back_inserter<std::vector<T>>(*(this->cpu_));
std::copy(begin, end, out_it);
}
// resize the vector
void resize(size_t size) {
MutableCPU();
cpu_.resize(size);
(*cpu_).resize(size);
}
// get cuda ptr. immutable
......@@ -176,26 +164,16 @@ class Vector {
// clear
void clear() {
cpu_.clear();
(*cpu_).clear();
flag_ = kDirty | kDataInCPU;
}
size_t capacity() const { return cpu_.capacity(); }
// reserve data
void reserve(size_t size) const { cpu_.reserve(size); }
std::vector<T> *get_vector() { return cpu_; }
// implicit cast operator. Vector can be cast to std::vector implicitly.
operator std::vector<T>() const {
ImmutableCPU();
return cpu_;
}
size_t capacity() const { return (*cpu_).capacity(); }
bool operator==(const VectorData &other) const {
ImmutableCPU();
other.ImmutableCPU();
return cpu_ == other.cpu_;
}
// reserve data
void reserve(size_t size) const { (*cpu_).reserve(size); }
std::mutex &Mutex() const { return mtx_; }
......@@ -203,6 +181,13 @@ class Vector {
return OptionalCUDAPlace(gpu_);
}
void MutableCPU() {
if (IsInCUDA() && IsDirty()) {
CopyToCPU();
}
flag_ = kDirty | kDataInCPU;
}
private:
enum DataFlag {
kDataInCPU = 0x01,
......@@ -213,13 +198,6 @@ class Vector {
void CopyToCPU() const;
void MutableCPU() {
if (IsInCUDA() && IsDirty()) {
CopyToCPU();
}
flag_ = kDirty | kDataInCPU;
}
void ImmutableCUDA(platform::Place place) const {
if (IsDirty()) {
if (IsInCPU()) {
......@@ -269,7 +247,7 @@ class Vector {
bool IsInCPU() const { return flag_ & kDataInCPU; }
mutable std::vector<T> cpu_;
std::vector<T> *cpu_;
mutable paddle::memory::allocation::AllocationPtr gpu_;
mutable size_t gpu_memory_size_{0};
mutable int flag_;
......@@ -278,89 +256,77 @@ class Vector {
};
public:
// Default ctor. Create empty Vector
Vector() : m_(new VectorData()) {}
// Fill vector with value. The vector size is `count`.
explicit Vector(size_t count, const T &value = T())
: m_(new VectorData(count, value)) {}
// Ctor with init_list
Vector(std::initializer_list<T> init) : m_(new VectorData(init)) {}
// implicit cast from std::vector.
template <typename U>
Vector(const std::vector<U> &dat) : m_(new VectorData(dat)) { // NOLINT
MixVector(const std::vector<U> *dat) { // NOLINT
m_.reset(new VectorData(const_cast<std::vector<U> *>(dat)));
}
// Copy ctor
Vector(const Vector<T> &other) { m_ = other.m_; }
MixVector(const MixVector<T> &other) = delete;
// Copy operator
Vector<T> &operator=(const Vector<T> &other) {
m_ = other.m_;
return *this;
}
MixVector<T> &operator=(const MixVector<T> &other) = delete;
// Move ctor
Vector(Vector<T> &&other) { m_ = std::move(other.m_); }
MixVector(MixVector<T> &&other) = delete;
// CPU data access method. Mutable.
T &operator[](size_t i) { return (*m_.MutableData())[i]; }
T &operator[](size_t i) { return (*m_)[i]; }
// CPU data access method. Immutable.
const T &operator[](size_t i) const { return m_.Data()[i]; }
const T &operator[](size_t i) const { return (*m_)[i]; }
// std::vector iterator methods. Based on CPU data access method
size_t size() const { return m_.Data().size(); }
size_t size() const { return m_->size(); }
iterator begin() { return m_.MutableData()->begin(); }
iterator begin() { return m_->begin(); }
iterator end() { return m_.MutableData()->end(); }
iterator end() { return m_->end(); }
T &front() { return m_.MutableData()->front(); }
T &front() { return m_->front(); }
T &back() { return m_.MutableData()->back(); }
T &back() { return m_->back(); }
const_iterator begin() const { return m_.Data().begin(); }
const_iterator begin() const { return m_->begin(); }
const_iterator end() const { return m_.Data().end(); }
const_iterator end() const { return m_->end(); }
const_iterator cbegin() const { return begin(); }
const_iterator cend() const { return end(); }
const T &back() const { return m_.Data().back(); }
const T &back() const { return m_->back(); }
T *data() { return m_.MutableData()->data(); }
T *data() { return m_->data(); }
const T *data() const { return m_.Data().data(); }
const T *data() const { return m_->data(); }
const T &front() const { return m_.Data().front(); }
const T &front() const { return m_->front(); }
// end of std::vector iterator methods
// assign this from iterator.
// NOTE: the iterator must support `end-begin`
template <typename Iter>
void assign(Iter begin, Iter end) {
m_.MutableData()->assign(begin, end);
m_->assign(begin, end);
}
// push_back. If the previous capacity is not enough, the memory will
// double.
void push_back(T elem) { m_.MutableData()->push_back(elem); }
void push_back(T elem) { m_->push_back(elem); }
// extend a vector by iterator.
// NOTE: the iterator must support end-begin
template <typename It>
void Extend(It begin, It end) {
m_.MutableData()->Extend(begin, end);
m_->Extend(begin, end);
}
// resize the vector
void resize(size_t size) {
if (m_.Data().size() != size) {
m_.MutableData()->resize(size);
if (m_->size() != size) {
m_->resize(size);
}
}
......@@ -368,15 +334,15 @@ class Vector {
const T *CUDAData(platform::Place place) const {
{
platform::CUDAPlace p(place.GetDeviceId());
auto &mtx = m_.Data().Mutex();
auto &mtx = m_->Mutex();
std::lock_guard<std::mutex> guard(mtx);
auto cuda_place = m_.Data().CUDAPlace();
auto cuda_place = m_->CUDAPlace();
if (cuda_place == paddle::none || cuda_place == p) {
return m_.Data().CUDAData(place);
return m_->CUDAData(place);
}
}
// If m_ contains CUDAData in a different place. Detach manually.
m_.Detach();
m_->MutableCPU();
m_.reset(new VectorData(m_->get_vector()));
return CUDAData(place);
}
......@@ -384,25 +350,25 @@ class Vector {
T *CUDAMutableData(platform::Place place) {
{
platform::CUDAPlace p(place.GetDeviceId());
auto &mtx = m_.Data().Mutex();
auto &mtx = m_->Mutex();
std::lock_guard<std::mutex> guard(mtx);
auto cuda_place = m_.Data().CUDAPlace();
auto cuda_place = m_->CUDAPlace();
if (cuda_place == paddle::none || cuda_place == p) {
return m_.MutableData()->CUDAMutableData(place);
return m_->CUDAMutableData(place);
}
}
// If m_ contains CUDAData in a different place. Detach manually.
m_.Detach();
m_->MutableCPU();
m_.reset(new VectorData(m_->get_vector()));
return CUDAMutableData(place);
}
// clear
void clear() { m_.MutableData()->clear(); }
void clear() { m_->clear(); }
size_t capacity() const { return m_.Data().capacity(); }
size_t capacity() const { return m_->capacity(); }
// reserve data
void reserve(size_t size) { m_.Data().reserve(size); }
void reserve(size_t size) { m_->reserve(size); }
// the unify method to access CPU or CUDA data. immutable.
const T *Data(platform::Place place) const {
......@@ -422,26 +388,12 @@ class Vector {
}
}
// implicit cast operator. Vector can be cast to std::vector implicitly.
operator std::vector<T>() const { return m_.Data(); }
bool operator==(const Vector<T> &other) const {
if (size() != other.size()) return false;
auto it1 = cbegin();
auto it2 = other.cbegin();
for (; it1 < cend(); ++it1, ++it2) {
if (*it1 != *it2) {
return false;
}
}
return true;
}
void CopyToCPU() { m_->MutableCPU(); }
const void *Handle() const { return &m_.Data(); }
const void *Handle() const { return m_.get(); }
private:
// Vector is an COW object.
mutable details::COWPtr<VectorData> m_;
mutable std::unique_ptr<VectorData> m_;
};
}; // namespace framework
......
......@@ -28,7 +28,7 @@
#include "paddle/fluid/platform/device_context.h"
template <typename T>
using vec = paddle::framework::Vector<T>;
using vec = paddle::framework::MixVector<T>;
using gpuStream_t = paddle::gpuStream_t;
static __global__ void multiply_10(int* ptr) {
......@@ -44,10 +44,11 @@ gpuStream_t GetCUDAStream(paddle::platform::CUDAPlace place) {
}
TEST(mixed_vector, GPU_VECTOR) {
vec<int> tmp;
std::vector<int> x;
for (int i = 0; i < 10; ++i) {
tmp.push_back(i);
x.push_back(i);
}
vec<int> tmp(&x);
ASSERT_EQ(tmp.size(), 10UL);
paddle::platform::CUDAPlace gpu(0);
......@@ -70,10 +71,11 @@ TEST(mixed_vector, MultiGPU) {
return;
}
vec<int> tmp;
std::vector<int> x;
for (int i = 0; i < 10; ++i) {
tmp.push_back(i);
x.push_back(i);
}
vec<int> tmp(&x);
ASSERT_EQ(tmp.size(), 10UL);
paddle::platform::CUDAPlace gpu0(0);
paddle::platform::SetDeviceId(0);
......
......@@ -30,6 +30,7 @@ limitations under the License. */
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/stream.h"
#include "paddle/fluid/framework/mixed_vector.h"
#include "paddle/phi/core/dense_tensor.h"
namespace paddle {
......
......@@ -1455,22 +1455,10 @@ std::ostream& print_tensor<paddle::platform::complex<double>>(
}
std::ostream& operator<<(std::ostream& os, const LoD& lod) {
os << "{";
for (auto& v : lod) {
os << "{";
bool is_first = true;
for (auto& i : v) {
if (is_first) {
os << i;
is_first = false;
} else {
os << ", " << i;
}
}
os << "}";
}
os << "}";
// NOTE(xiongkun):
// https://stackoverflow.com/questions/5195512/namespaces-and-operator-resolution
// if we don't redefine, the operator << of pten / framework LoD is not found.
paddle::string::operator<<(os, lod);
return os;
}
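The NOTE above concerns operator lookup across namespaces: an operator<< defined in one namespace is not found by code in another namespace that streams a type declared elsewhere, which is why a phi::operator<< for LoD is added in the next hunk. A generic sketch of the pitfall (the names ns_a, ns_b, and Widget are made up, not Paddle code):

#include <iostream>
namespace ns_a { struct Widget {}; }
namespace ns_b {
// For "std::cout << w" with w of type ns_a::Widget, lookup searches std and
// ns_a but never ns_b, so this overload is invisible from other namespaces.
std::ostream& operator<<(std::ostream& os, const ns_a::Widget&) {
  return os << "widget";
}
}  // namespace ns_b
// Remedy: declare the operator in the namespace of the operand type itself
// (here ns_a), analogous to the phi::operator<< defined below.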
......@@ -1479,6 +1467,11 @@ std::ostream& operator<<(std::ostream& os, const LoD& lod) {
namespace phi {
std::ostream& operator<<(std::ostream& os, const LoD& lod) {
paddle::string::operator<<(os, lod);
return os;
}
std::ostream& operator<<(std::ostream& os, const phi::DenseTensor& t) {
if (t.lod().size() > 0) {
os << " - lod: " << t.lod() << "\n";
......
......@@ -90,6 +90,7 @@ static void AllReduce(const phi::SelectedRows &src, phi::SelectedRows *dst,
platform::DeviceContextPool::Instance().Get(place));
bool use_calc_stream = (dev_ctx->stream() == stream);
VLOG(4) << "Is use calculate stream: " << use_calc_stream;
// 1. Gather row numbers from all workers. Here we use ncclAllGather to do this,
// but we can use other ways to implement it in the future
......@@ -97,7 +98,9 @@ static void AllReduce(const phi::SelectedRows &src, phi::SelectedRows *dst,
framework::Vector<int64_t> rows_num_vector(strategy.nranks_);
rows_num_vector[strategy.local_rank_] = static_cast<int64_t>(src_rows.size());
// CUDAMutableData use CalStream
auto *gpu_rows_num_ptr = rows_num_vector.CUDAMutableData(place);
paddle::framework::MixVector<int64_t> mixv_rows_num_vector(&rows_num_vector);
auto *gpu_rows_num_ptr = mixv_rows_num_vector.CUDAMutableData(place);
VLOG(4) << "start dev_ctx->wait";
if (!use_calc_stream) {
dev_ctx->Wait();
}
......@@ -109,6 +112,7 @@ static void AllReduce(const phi::SelectedRows &src, phi::SelectedRows *dst,
platform::GpuStreamSync(stream);
}
mixv_rows_num_vector.CopyToCPU();
const auto *cpu_rows_num_ptr = rows_num_vector.data();
auto rows_num =
std::accumulate(cpu_rows_num_ptr, cpu_rows_num_ptr + strategy.nranks_,
......@@ -121,8 +125,10 @@ static void AllReduce(const phi::SelectedRows &src, phi::SelectedRows *dst,
auto *dst_rows = dst->mutable_rows();
dst_rows->resize(rows_num);
auto *dst_rows_ptr = dst_rows->CUDAMutableData(place);
const auto *src_rows_ptr = src_rows.CUDAData(place);
paddle::framework::MixVector<int64_t> mixv_dst_rows(dst_rows);
auto *dst_rows_ptr = mixv_dst_rows.CUDAMutableData(place);
paddle::framework::MixVector<int64_t> mixv_src_rows(&src_rows);
const auto *src_rows_ptr = mixv_src_rows.CUDAData(place);
auto *dst_tensor = dst->mutable_value();
auto dims = src_tensor.dims();
......@@ -150,24 +156,28 @@ static void AllReduce(const phi::SelectedRows &src, phi::SelectedRows *dst,
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather(
src_tensor_ptr, dst_tensor_ptr, value_sendcount, nccl_dtype,
comm->comm(), stream));
return;
}
for (int i = 0; i < strategy.nranks_; ++i) {
if (cpu_rows_num_ptr[i] > 0) {
// 2. Broadcast the rows of SelectedRows
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBroadcast(
src_rows_ptr, dst_rows_ptr + row_offset, cpu_rows_num_ptr[i],
ncclInt64, i, comm->comm(), stream));
// 3. Broadcast the tensor data of SelectedRows
auto *dst_tensor_ptr_i = reinterpret_cast<uint8_t *>(dst_tensor_ptr) +
row_offset * feature_size * sizeof_dtype;
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBroadcast(
src_tensor_ptr, dst_tensor_ptr_i, cpu_rows_num_ptr[i] * feature_size,
nccl_dtype, i, comm->comm(), stream));
row_offset += cpu_rows_num_ptr[i];
} else {
for (int i = 0; i < strategy.nranks_; ++i) {
if (cpu_rows_num_ptr[i] > 0) {
// 2. Broadcast the rows of SelectedRows
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBroadcast(
src_rows_ptr, dst_rows_ptr + row_offset, cpu_rows_num_ptr[i],
ncclInt64, i, comm->comm(), stream));
// 3. Broadcast the tensor data of SelectedRows
auto *dst_tensor_ptr_i = reinterpret_cast<uint8_t *>(dst_tensor_ptr) +
row_offset * feature_size * sizeof_dtype;
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBroadcast(
src_tensor_ptr, dst_tensor_ptr_i,
cpu_rows_num_ptr[i] * feature_size, nccl_dtype, i, comm->comm(),
stream));
row_offset += cpu_rows_num_ptr[i];
}
}
}
if (!use_calc_stream) {
platform::GpuStreamSync(stream);
}
mixv_dst_rows.CopyToCPU();
VLOG(3) << "Original SelectedRows rows: "
<< string::join_strings(src_rows, ',');
VLOG(3) << "Result SelectedRows rows: "
......
......@@ -143,7 +143,7 @@ void GLOOParallelContext::AllReduce(const phi::SelectedRows &src,
auto dtype = framework::TransToProtoVarType(src_tensor.dtype());
// 1. Gather row numbers from all workers. Here we use ncclAllGather to do this,
// but we can use other ways to implement it in the future
const auto &src_rows = src.rows();
auto &src_rows = src.rows();
auto gloo_wrapper = framework::GlooWrapper::GetInstance();
size_t local_row_num = src_rows.size();
std::vector<size_t> rows_num_vector =
......@@ -157,8 +157,10 @@ void GLOOParallelContext::AllReduce(const phi::SelectedRows &src,
<< ", height: " << src.height();
auto *dst_rows = dst->mutable_rows();
dst_rows->resize(rows_num);
auto *dst_rows_ptr = dst_rows->MutableData(place);
const int64_t *src_rows_ptr = src_rows.Data(place);
paddle::framework::MixVector<int64_t> mixv_dst_rows(dst_rows);
auto *dst_rows_ptr = mixv_dst_rows.MutableData(place);
paddle::framework::MixVector<int64_t> mixv_src_rows(&src_rows);
const int64_t *src_rows_ptr = mixv_src_rows.Data(place);
auto *dst_tensor = dst->mutable_value();
auto dims = src_tensor.dims();
......
......@@ -38,8 +38,6 @@ void SetLoD(DstLoD* dst, const SrcLoD& src) {
dst->emplace_back(v);
}
}
template void SetLoD<paddle::lite::LoD, framework::LoD>(
paddle::lite::LoD* dst, const framework::LoD& src);
template void SetLoD<framework::LoD, paddle::lite::LoD>(
framework::LoD* dst, const paddle::lite::LoD& src);
......
......@@ -110,10 +110,12 @@ class CTCAlignOpCUDAKernel : public framework::OpKernel<T> {
// merge elements and delete blank
T* output_data = output->mutable_data<T>({num_tokens, 1}, ctx.GetPlace());
paddle::framework::MixVector<size_t> mixv_input_lod(&input_lod[level]);
MergeAndDelCudaKernel<T><<<1, 1, 0, stream>>>(
num_tokens, tokens, num_seq,
input_lod[level].CUDAMutableData(ctx.GetPlace()), blank,
merge_repeated, dev_out_lod0_ptr, output_data);
mixv_input_lod.CUDAMutableData(ctx.GetPlace()), blank, merge_repeated,
dev_out_lod0_ptr, output_data);
mixv_input_lod.CopyToCPU();
// set output lod
std::vector<size_t> host_out_lod0(dev_out_lod0.begin(),
......
......@@ -149,11 +149,12 @@ class CVMGradCUDAKernel : public framework::OpKernel<T> {
batch_size, lod[lod.size() - 1],
platform::errors::PreconditionNotMet(
"Output(X@GRAD)'s dim[0] must be equal to last element of lod"));
paddle::framework::MixVector<size_t> mixv_lod(&lod);
CvmGradComputeKernel<<<(dx_numel + PADDLE_CUDA_NUM_THREADS - 1) /
PADDLE_CUDA_NUM_THREADS,
PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
use_cvm, item_size, cvm_data, dout_data, dx_data, true,
lod.CUDAData(context.GetPlace()), lod.size(), dx_numel);
mixv_lod.CUDAData(context.GetPlace()), lod.size(), dx_numel);
}
}
};
......
......@@ -57,9 +57,11 @@ class GPUBoxClipKernel : public framework::OpKernel<T> {
auto stream = dev_ctx.stream();
const size_t batch_size = lod.back().size() - 1;
T *output_data = output->mutable_data<T>(dev_ctx.GetPlace());
paddle::framework::MixVector<size_t> mix_vector(&abs_offset_lod[0]);
GPUBoxClip<T, 512><<<batch_size, 512, 0, stream>>>(
input->data<T>(), abs_offset_lod[0].CUDAMutableData(dev_ctx.GetPlace()),
input->data<T>(), mix_vector.CUDAMutableData(dev_ctx.GetPlace()),
bbox_width, im_info->data<T>(), output_data);
mix_vector.CopyToCPU();
}
};
......
......@@ -108,7 +108,8 @@ class TargetAssignKernel : public framework::OpKernel<T> {
auto x_lod = x->lod().back();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
size_t* x_lod_data = x_lod.MutableData(ctx.GetPlace());
paddle::framework::MixVector<size_t> mixv_x_lod(&x_lod);
size_t* x_lod_data = mixv_x_lod.MutableData(ctx.GetPlace());
#else
size_t* x_lod_data = x_lod.data();
#endif
......@@ -116,6 +117,9 @@ class TargetAssignKernel : public framework::OpKernel<T> {
TargetAssignFunctor<T, WT> functor(x_data, match_idx_data, x_lod_data,
mismatch_value, n, m, p, k, out_data,
out_wt_data);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
mixv_x_lod.CopyToCPU();
#endif
auto& device_ctx = ctx.template device_context<DeviceContext>();
platform::ForRange<DeviceContext> for_range(device_ctx, n * m);
......@@ -130,13 +134,17 @@ class TargetAssignKernel : public framework::OpKernel<T> {
const int* neg_idx_data = neg_indices->data<int>();
auto neg_lod = neg_indices->lod().back();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
size_t* neg_lod_data = neg_lod.MutableData(ctx.GetPlace());
paddle::framework::MixVector<size_t> mixv_neg_lod(&neg_lod);
size_t* neg_lod_data = mixv_neg_lod.MutableData(ctx.GetPlace());
#else
size_t* neg_lod_data = neg_lod.data();
#endif
NegTargetAssignFunctor<DeviceContext, T, WT> neg_trg_functor;
neg_trg_functor(device_ctx, neg_idx_data, neg_lod_data, n, m, k,
mismatch_value, out_data, out_wt_data);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
mixv_neg_lod.CopyToCPU();
#endif
}
}
};
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include <iostream>
#include <memory>
#include "dnnl.hpp"
#include "paddle/fluid/framework/mixed_vector.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/fused/multi_gru_op.h"
#include "paddle/fluid/platform/errors.h"
......
......@@ -164,8 +164,10 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
auto gpu_place = context.GetPlace();
// TODO(yuyang18): Strange code here.
memory::Copy(gpu_place, new_rows.CUDAMutableData(context.GetPlace()),
paddle::framework::MixVector<int64_t> mixv_new_rows(&new_rows);
memory::Copy(gpu_place, mixv_new_rows.CUDAMutableData(context.GetPlace()),
gpu_place, ids_data, ids_num * sizeof(int64_t), stream);
mixv_new_rows.CopyToCPU();
d_table->set_rows(new_rows);
auto *d_table_value = d_table->mutable_value();
......
......@@ -152,14 +152,16 @@ struct LookupTableV2GradCUDAFunctor {
new_rows.resize(ids_num);
auto gpu_place = context_.GetPlace();
paddle::framework::MixVector<int64_t> mixv_new_rows(&new_rows);
if (!std::is_same<IdT, int64_t>::value) {
InputTypeConvert<<<grids, threads, 0, stream>>>(
ids_data, ids_num, new_rows.MutableData(gpu_place));
ids_data, ids_num, mixv_new_rows.MutableData(gpu_place));
} else {
memory::Copy(gpu_place, new_rows.CUDAMutableData(gpu_place), gpu_place,
ids_data, ids_num * sizeof(int64_t), stream);
memory::Copy(gpu_place, mixv_new_rows.CUDAMutableData(gpu_place),
gpu_place, ids_data, ids_num * sizeof(int64_t), stream);
}
mixv_new_rows.CopyToCPU();
d_table->set_rows(new_rows);
auto *d_table_value = d_table->mutable_value();
......
......@@ -357,8 +357,9 @@ class BeamSearchFunctor<platform::CUDADeviceContext, T> {
framework::LoD selected_lod(2);
selected_lod[0].assign(abs_lod[level].begin(), abs_lod[level].end());
selected_lod[1].resize(scores->dims()[0] + 1);
size_t* selected_offsets =
selected_lod[1].CUDAMutableData(context.GetPlace());
paddle::framework::MixVector<size_t> mix_vector(&selected_lod[1]);
paddle::framework::MixVector<size_t> mixv_abs(&abs_lod[level]);
size_t* selected_offsets = mix_vector.CUDAMutableData(context.GetPlace());
if (num_seqs == 1) {
const int seq_length = static_cast<int>(abs_lod[level][1]);
......@@ -377,7 +378,7 @@ class BeamSearchFunctor<platform::CUDADeviceContext, T> {
is_accumulated, num_used_threads));
}
} else if (num_seqs <= 4) {
const size_t* seq_offsets = abs_lod[level].CUDAData(context.GetPlace());
const size_t* seq_offsets = mixv_abs.CUDAData(context.GetPlace());
// Use only 1 block
const int kMaxThreadsPerSeq = 32;
const int kMaxSeqs = 4;
......@@ -400,6 +401,7 @@ class BeamSearchFunctor<platform::CUDADeviceContext, T> {
}
context.Wait();
mix_vector.CopyToCPU();
if (!framework::CheckLoD(selected_lod)) {
PADDLE_THROW(platform::errors::InvalidArgument(
"lod %s is not right in"
......
......@@ -170,7 +170,8 @@ struct SelectedRowsAddTo<platform::CPUDeviceContext, T> {
auto* in2_value = input2->mutable_value();
// concat rows
in2_rows.Extend(in1_rows.begin(), in1_rows.end());
paddle::framework::MixVector<int64_t> mixv_in2_rows(&in2_rows);
mixv_in2_rows.Extend(in1_rows.begin(), in1_rows.end());
auto in1_place = input1.place();
PADDLE_ENFORCE_EQ(platform::is_cpu_place(in1_place), true,
......
......@@ -161,9 +161,10 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
const int block_size = 256;
dim3 threads(block_size, 1);
dim3 grid(in1_rows.size(), 1);
paddle::framework::MixVector<int64_t> mixv_in1_rows(&in1_rows);
SelectedRowsAddTensorKernel<
T, block_size><<<grid, threads, 0, context.stream()>>>(
in1_data, in1_rows.CUDAData(context.GetPlace()), out_data,
in1_data, mixv_in1_rows.CUDAData(context.GetPlace()), out_data,
in1_row_numel);
auto out_eigen = framework::EigenVector<T>::Flatten(*output);
......@@ -198,8 +199,9 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
auto* in2_value = input2->mutable_value();
// concat rows
paddle::framework::MixVector<int64_t> mixv_in2_rows(&in2_rows);
if (in1_rows.size()) {
in2_rows.Extend(in1_rows.begin(), in1_rows.end());
mixv_in2_rows.Extend(in1_rows.begin(), in1_rows.end());
}
auto in1_place = input1.place();
......@@ -274,9 +276,10 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
const int block_size = 256;
dim3 threads(block_size, 1);
dim3 grid(in1_rows.size(), 1);
paddle::framework::MixVector<int64_t> mixv_in1_rows(&in1_rows);
SelectedRowsAddToTensorKernel<
T, block_size><<<grid, threads, 0, context.stream()>>>(
in1_data, in1_rows.CUDAData(context.GetPlace()), in2_data,
in1_data, mixv_in1_rows.CUDAData(context.GetPlace()), in2_data,
in1_row_numel);
}
};
......@@ -356,10 +359,13 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
dim3 threads(block_size, 1);
dim3 grid1(input_rows.size(), 1);
paddle::framework::MixVector<int64_t> mix_vector_input(&input_rows);
paddle::framework::MixVector<int64_t> mix_vector_out(out.mutable_rows());
MergeAddKernel<T, 256><<<grid1, threads, 0, context.stream()>>>(
input_data, input_rows.CUDAData(context.GetPlace()), out_data,
out.mutable_rows()->CUDAMutableData(context.GetPlace()),
out.rows().size(), input_width);
input_data, mix_vector_input.CUDAData(context.GetPlace()), out_data,
mix_vector_out.CUDAMutableData(context.GetPlace()), out.rows().size(),
input_width);
mix_vector_out.CopyToCPU();
}
void operator()(const platform::CUDADeviceContext& context,
......@@ -423,10 +429,13 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
auto& input_rows = input->rows();
dim3 grid1(input_rows.size(), 1);
paddle::framework::MixVector<int64_t> mix_vector_input(&input_rows);
paddle::framework::MixVector<int64_t> mix_vector_out(out.mutable_rows());
MergeAddKernel<T, 256><<<grid1, threads, 0, context.stream()>>>(
input_data, input_rows.CUDAData(context.GetPlace()), out_data,
out.mutable_rows()->CUDAMutableData(context.GetPlace()),
out.rows().size(), input_width);
input_data, mix_vector_input.CUDAData(context.GetPlace()), out_data,
mix_vector_out.CUDAMutableData(context.GetPlace()), out.rows().size(),
input_width);
mix_vector_out.CopyToCPU();
}
}
};
......
......@@ -72,8 +72,9 @@ class CopyMatrixRowsFunctor<platform::CUDADeviceContext, T> {
dim3 threads(128, 8);
dim3 grid(8, 1);
auto stream = context.stream();
paddle::framework::MixVector<size_t> mix_index_lod(&index_lod);
CopyMatrixRowsKernel<T, 128, 8, 8><<<grid, threads, 0, stream>>>(
src_data, dst_data, index_lod.CUDAData(context.GetPlace()), height,
src_data, dst_data, mix_index_lod.CUDAData(context.GetPlace()), height,
width, is_src_index);
}
};
......
......@@ -59,7 +59,7 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
int lod_level = 0, bool norm_by_times = false,
const PadLayout layout = kBatchLengthWidth) {
auto seq_lod = seq_tensor.lod();
const auto seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level];
auto seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level];
const auto& seq_tensor_dims = seq_tensor.dims();
const auto& pad_tensor_dims = pad_tensor->dims();
int max_seq_len = MaximumSequenceLength(seq_offsets);
......@@ -104,10 +104,11 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
T* pad_data = pad_tensor->data<T>();
const T* pad_value_data = pad_value.data<T>();
paddle::framework::MixVector<size_t> mix_vector_seq_offsets(&seq_offsets);
SequencePaddingKernel<T, kSeqToPad><<<grid, threads, 0, context.stream()>>>(
pad_data, seq_data, pad_value_data, pad_value.numel() == 1,
seq_offsets.CUDAData(context.GetPlace()), seq_num, pad_seq_len,
step_width, norm_by_times, layout);
mix_vector_seq_offsets.CUDAData(context.GetPlace()), seq_num,
pad_seq_len, step_width, norm_by_times, layout);
}
};
......@@ -157,9 +158,10 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
const T* pad_data = pad_tensor.data<T>();
T* seq_data = seq_tensor->data<T>();
paddle::framework::MixVector<size_t> mixv_seq_offsets(&seq_offsets);
SequencePaddingKernel<T, kPadToSeq><<<grid, threads, 0, context.stream()>>>(
seq_data, pad_data, nullptr, false,
seq_offsets.CUDAData(context.GetPlace()), seq_num, pad_seq_len,
mixv_seq_offsets.CUDAData(context.GetPlace()), seq_num, pad_seq_len,
step_width, norm_by_times, layout);
}
};
......
......@@ -168,41 +168,42 @@ class SequencePoolFunctor<platform::CUDADeviceContext, T> {
const size_t item_dim = output->numel() / output->dims()[0];
dim3 threads(1024, 1);
dim3 grid(std::max(static_cast<int>(lod.size()) - 1, 1), 1);
paddle::framework::MixVector<size_t> mix_vector(&lod);
if (pooltype == "MAX") {
sequence_pool_kernel<
T, MaxPoolFunctor<T>><<<grid, threads, 0, context.stream()>>>(
MaxPoolFunctor<T>(), input.data<T>(), pad_value,
lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim,
output->mutable_data<T>(context.GetPlace()), index->data<int>());
} else if (pooltype == "AVERAGE") {
sequence_pool_kernel<
T, AvgPoolFunctor<T>><<<grid, threads, 0, context.stream()>>>(
AvgPoolFunctor<T>(), input.data<T>(), pad_value,
lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim,
output->mutable_data<T>(context.GetPlace()), nullptr);
} else if (pooltype == "SUM") {
sequence_pool_kernel<
T, SumPoolFunctor<T>><<<grid, threads, 0, context.stream()>>>(
SumPoolFunctor<T>(), input.data<T>(), pad_value,
lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim,
output->mutable_data<T>(context.GetPlace()), nullptr);
} else if (pooltype == "SQRT") {
sequence_pool_kernel<
T, SqrtPoolFunctor<T>><<<grid, threads, 0, context.stream()>>>(
SqrtPoolFunctor<T>(), input.data<T>(), pad_value,
lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim,
output->mutable_data<T>(context.GetPlace()), nullptr);
} else if (pooltype == "LAST") {
sequence_pool_kernel<
T, LastPoolFunctor<T>><<<grid, threads, 0, context.stream()>>>(
LastPoolFunctor<T>(), input.data<T>(), pad_value,
lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim,
output->mutable_data<T>(context.GetPlace()), nullptr);
} else if (pooltype == "FIRST") {
sequence_pool_kernel<
T, FirstPoolFunctor<T>><<<grid, threads, 0, context.stream()>>>(
FirstPoolFunctor<T>(), input.data<T>(), pad_value,
lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim,
output->mutable_data<T>(context.GetPlace()), nullptr);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
......@@ -335,41 +336,42 @@ class SequencePoolGradFunctor<platform::CUDADeviceContext, T> {
const size_t item_dim = in_grad->numel() / in_grad->dims()[0];
dim3 threads(1024, 1);
dim3 grid(std::max(static_cast<int>(lod.size()) - 1, 1), 1);
paddle::framework::MixVector<size_t> mix_vector(&lod);
if (pooltype == "MAX") {
sequence_pool_grad_kernel<
T, MaxPoolGradFunctor<T>><<<grid, threads, 0, context.stream()>>>(
MaxPoolGradFunctor<T>(), out_grad.data<T>(),
lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim,
in_grad->mutable_data<T>(context.GetPlace()), index->data<int>());
} else if (pooltype == "AVERAGE") {
sequence_pool_grad_kernel<
T, AvgPoolGradFunctor<T>><<<grid, threads, 0, context.stream()>>>(
AvgPoolGradFunctor<T>(), out_grad.data<T>(),
lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim,
in_grad->mutable_data<T>(context.GetPlace()), nullptr);
} else if (pooltype == "SUM") {
sequence_pool_grad_kernel<
T, SumPoolGradFunctor<T>><<<grid, threads, 0, context.stream()>>>(
SumPoolGradFunctor<T>(), out_grad.data<T>(),
lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim,
in_grad->mutable_data<T>(context.GetPlace()), nullptr);
} else if (pooltype == "SQRT") {
sequence_pool_grad_kernel<
T, SqrtPoolGradFunctor<T>><<<grid, threads, 0, context.stream()>>>(
SqrtPoolGradFunctor<T>(), out_grad.data<T>(),
lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim,
in_grad->mutable_data<T>(context.GetPlace()), nullptr);
} else if (pooltype == "LAST") {
sequence_pool_grad_kernel<
T, LastPoolGradFunctor<T>><<<grid, threads, 0, context.stream()>>>(
LastPoolGradFunctor<T>(), out_grad.data<T>(),
lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim,
in_grad->mutable_data<T>(context.GetPlace()), nullptr);
} else if (pooltype == "FIRST") {
sequence_pool_grad_kernel<
T, FirstPoolGradFunctor<T>><<<grid, threads, 0, context.stream()>>>(
FirstPoolGradFunctor<T>(), out_grad.data<T>(),
lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim,
in_grad->mutable_data<T>(context.GetPlace()), nullptr);
} else {
......
......@@ -41,21 +41,23 @@ class ScaleLoDTensorFunctor<platform::CUDADeviceContext, T> {
auto lod = seq->lod();
const size_t num_seq = lod[level].size() - 1;
const size_t seq_width = seq->numel() / seq->dims()[0];
framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
auto abs_offset_lod = framework::ToAbsOffset(lod);
T* seq_data = seq->mutable_data<T>(context.GetPlace());
paddle::framework::MixVector<size_t> mix_vector(&(abs_offset_lod[level]));
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(
HIP_KERNEL_NAME(SequenceScaleKernel<T, PADDLE_CUDA_NUM_THREADS>),
dim3(num_seq), dim3(PADDLE_CUDA_NUM_THREADS), 0, context.stream(),
seq_data, abs_offset_lod[level].CUDAMutableData(context.GetPlace()),
scales, seq_width);
seq_data, mix_vector.CUDAMutableData(context.GetPlace()), scales,
seq_width);
#else
SequenceScaleKernel<T, PADDLE_CUDA_NUM_THREADS><<<
num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>(
seq_data, abs_offset_lod[level].CUDAMutableData(context.GetPlace()),
scales, seq_width);
seq_data, mix_vector.CUDAMutableData(context.GetPlace()), scales,
seq_width);
#endif
mix_vector.CopyToCPU();
}
};
......
......@@ -96,12 +96,14 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> {
const int block_size = 256;
dim3 threads(block_size, 1);
dim3 grid2(1, merge_rows.size());
paddle::framework::MixVector<int64_t> mixv_merge_rows(&merge_rows);
SparseAdagradFunctorKernel<
T, 256><<<grid2, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>(
grad_merge_data, merge_rows.CUDAMutableData(context.GetPlace()), lr,
param_data, moment_data, grad_width, epsilon);
grad_merge_data, mixv_merge_rows.CUDAMutableData(context.GetPlace()),
lr, param_data, moment_data, grad_width, epsilon);
mixv_merge_rows.CopyToCPU();
}
};
......
......@@ -345,7 +345,10 @@ class AdamOpCUDAKernel : public framework::OpKernel<T> {
auto& grad_merge = *grad_merge_ptr;
auto& grad_tensor = grad_merge.value();
const T* grad_data = grad_tensor.template data<T>();
const int64_t* rows = grad_merge.rows().Data(ctx.GetPlace());
auto* grad_merge_rows = &grad_merge.rows();
paddle::framework::MixVector<int64_t> mixv_grad_merge_rows(
grad_merge_rows);
const int64_t* rows = mixv_grad_merge_rows.Data(ctx.GetPlace());
auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
if (beta1_pow->place() == platform::CPUPlace() &&
......
......@@ -592,7 +592,10 @@ class AdamOpKernel : public framework::OpKernel<T> {
auto& grad_merge = *grad_merge_ptr;
auto& grad_tensor = grad_merge.value();
const T* grad_data = grad_tensor.template data<T>();
const int64_t* rows = grad_merge.rows().Data(ctx.GetPlace());
auto* grad_merge_rows = &grad_merge.rows();
paddle::framework::MixVector<int64_t> mixv_grad_merge_rows(
grad_merge_rows);
const int64_t* rows = mixv_grad_merge_rows.Data(ctx.GetPlace());
auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
SparseAdamFunctor<T, CPUAdam> functor(
......
......@@ -368,7 +368,10 @@ class AdamWOpCUDAKernel : public framework::OpKernel<T> {
auto& grad_merge = *grad_merge_ptr;
auto& grad_tensor = grad_merge.value();
const T* grad_data = grad_tensor.template data<T>();
const int64_t* rows = grad_merge.rows().Data(ctx.GetPlace());
auto* grad_merge_rows = &grad_merge.rows();
paddle::framework::MixVector<int64_t> mixv_grad_merge_rows(
grad_merge_rows);
const int64_t* rows = mixv_grad_merge_rows.Data(ctx.GetPlace());
auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
if (beta1_pow->place() == platform::CPUPlace() &&
......
......@@ -189,7 +189,9 @@ class FTRLOpKernel : public framework::OpKernel<T> {
merge_func(ctx.template device_context<DeviceContext>(), *grad,
merged_grad);
const int64_t* rows = merged_grad->rows().Data(ctx.GetPlace());
auto* merged_rows = merged_grad->mutable_rows();
paddle::framework::MixVector<int64_t> mixv_merged_rows(merged_rows);
const int64_t* rows = mixv_merged_rows.Data(ctx.GetPlace());
auto row_numel = static_cast<int64_t>(merged_grad->value().dims()[1]);
auto row_height = static_cast<int64_t>(merged_grad->rows().size());
......
......@@ -594,7 +594,10 @@ class LambOpKernel : public framework::OpKernel<T> {
auto& grad_merge = *grad_merge_ptr;
auto& grad_tensor = grad_merge.value();
const T* grad_data = grad_tensor.template data<T>();
const int64_t* rows = grad_merge.rows().Data(ctx.GetPlace());
auto* grad_merge_rows = &grad_merge.rows();
paddle::framework::MixVector<int64_t> mixv_grad_merge_rows(
grad_merge_rows);
const int64_t* rows = mixv_grad_merge_rows.Data(ctx.GetPlace());
auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
if (platform::is_gpu_place(ctx.GetPlace()) &&
beta1_pow.place() == platform::CPUPlace() &&
......
......@@ -561,7 +561,10 @@ class MomentumOpKernel : public framework::OpKernel<T> {
merge_func(ctx.template device_context<DeviceContext>(), *grad,
merged_grad);
const int64_t* rows = merged_grad->rows().Data(ctx.GetPlace());
auto* grad_merge_rows = merged_grad->mutable_rows();
paddle::framework::MixVector<int64_t> mixv_grad_merge_rows(
grad_merge_rows);
const int64_t* rows = mixv_grad_merge_rows.Data(ctx.GetPlace());
int64_t row_numel =
merged_grad->value().numel() / merged_grad->rows().size();
platform::ForRange<DeviceContext> for_range(
......
......@@ -227,7 +227,10 @@ class RmspropOpKernel : public framework::OpKernel<T> {
merge_func(dev_ctx, grad, merged_grad);
platform::ForRange<DeviceContext> for_range(dev_ctx, limit);
const int64_t *rows = merged_grad->rows().Data(ctx.GetPlace());
auto &grad_merge_rows = merged_grad->rows();
paddle::framework::MixVector<int64_t> mixv_grad_merge_rows(
&grad_merge_rows);
const int64_t *rows = mixv_grad_merge_rows.Data(ctx.GetPlace());
auto &merged_tensor = merged_grad->value();
int64_t row_count = merged_grad->rows().size();
......
......@@ -148,11 +148,11 @@ class SGDOpKernel<platform::CUDADeviceContext, T>
int thread_x = kThreadsPerBlock;
int max_threads = ctx.cuda_device_context().GetMaxPhysicalThreadCount();
int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
paddle::framework::MixVector<int64_t> mixv_in_rows(&in_rows);
SparseSGDFunctorKernel<<<max_blocks, thread_x, 0,
ctx.cuda_device_context().stream()>>>(
in_data, in_rows.CUDAData(ctx.GetPlace()), learning_rate->data<T>(),
out_data, in_row_numel, in_rows.size());
in_data, mixv_in_rows.CUDAData(ctx.GetPlace()),
learning_rate->data<T>(), out_data, in_row_numel, in_rows.size());
} else {
PADDLE_ENFORCE_EQ(false, true,
......
......@@ -336,7 +336,8 @@ class RowConvKernel<platform::CUDADeviceContext, T>
int num_sequence = batch_indices.size() - 1;
int future_context = Filter->dims()[0];
size_t *idx = batch_indices.CUDAMutableData(context.GetPlace());
paddle::framework::MixVector<size_t> mix_vector(&batch_indices);
size_t *idx = mix_vector.CUDAMutableData(context.GetPlace());
auto stream = context.cuda_device_context().stream();
if (future_context <= 32) {
......@@ -352,6 +353,7 @@ class RowConvKernel<platform::CUDADeviceContext, T>
RowConvForward<T><<<grid_dim, block_dim, 0, stream>>>(
in, weight, num_sequence, input_dim, future_context, idx, out);
}
mix_vector.CopyToCPU();
}
};
......@@ -392,7 +394,8 @@ class RowConvGradKernel<platform::CUDADeviceContext, T>
// int input_dim = X->dims()[1];
int num_sequence = batch_indices.size() - 1;
int future_context = Filter->dims()[0];
size_t *idx = batch_indices.CUDAMutableData(context.GetPlace());
paddle::framework::MixVector<size_t> mixv_batch_indices(&batch_indices);
size_t *idx = mixv_batch_indices.CUDAMutableData(context.GetPlace());
auto &device_ctx = context.cuda_device_context();
phi::funcs::SetConstant<platform::CUDADeviceContext, T> zero;
......@@ -444,6 +447,7 @@ class RowConvGradKernel<platform::CUDADeviceContext, T>
dout, weights, num_sequence, input_dim, future_context, idx, din);
}
}
mixv_batch_indices.CopyToCPU();
}
};
} // namespace operators
......
......@@ -71,7 +71,8 @@ class SequenceEnumerateOpCUDAKernel : public framework::OpKernel<T> {
out->Resize({in_dims[0], win_size});
auto out_data = out->mutable_data<T>(context.GetPlace());
// Copy LoD to GPU
const size_t* dev_in_lod_ptr = lod0.CUDAData(context.GetPlace());
paddle::framework::MixVector<size_t> mixv_lod0(&lod0);
const size_t* dev_in_lod_ptr = mixv_lod0.CUDAData(context.GetPlace());
// Calc output tensor
CalcOutPut<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1,
PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
......
......@@ -88,7 +88,8 @@ class SequenceEraseOpCUDAKernel : public framework::OpKernel<T> {
// Copy LoD to GPU
auto last_lod = lod[lod.size() - 1];
auto lod_len = last_lod.size();
const size_t* dev_in_lod_ptr = last_lod.CUDAData(ctx.GetPlace());
paddle::framework::MixVector<size_t> mixv_last_lod(&last_lod);
const size_t* dev_in_lod_ptr = mixv_last_lod.CUDAData(ctx.GetPlace());
// Calc output LoD
thrust::device_vector<size_t> dev_out_lod(lod_len);
size_t* dev_out_lod_ptr = thrust::raw_pointer_cast(dev_out_lod.data());
......
......@@ -81,8 +81,9 @@ struct SequenceExpandAsFunctor<platform::CUDADeviceContext, T> {
dim3 block_size(thread_x);
dim3 grid_size(block_x);
paddle::framework::MixVector<size_t> mixv_ref_lod(&ref_lod);
sequence_expand_as_kernel<<<grid_size, block_size, 0, context.stream()>>>(
x.data<T>(), ref_lod.CUDAData(context.GetPlace()), height, width,
x.data<T>(), mixv_ref_lod.CUDAData(context.GetPlace()), height, width,
out->mutable_data<T>(context.GetPlace()));
}
};
......@@ -107,10 +108,11 @@ struct SequenceExpandAsGradFunctor<platform::CUDADeviceContext, T> {
dim3 block_size(thread_x);
dim3 grid_size(block_x);
paddle::framework::MixVector<size_t> mixv_ref_lod(&ref_lod);
sequence_expand_as_grad_kernel<<<grid_size, block_size, 0,
context.stream()>>>(
dout.data<T>(), ref_lod.CUDAData(context.GetPlace()), height, width,
dx->mutable_data<T>(context.GetPlace()));
dout.data<T>(), mixv_ref_lod.CUDAData(context.GetPlace()), height,
width, dx->mutable_data<T>(context.GetPlace()));
}
};
......
......@@ -157,7 +157,9 @@ struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
out_offset[2 * x_lod_size + i] = ref_lod[i];
}
const size_t* out_offset_data = out_offset.CUDAData(context.GetPlace());
paddle::framework::MixVector<size_t> mixv_out_offset(&out_offset);
const size_t* out_offset_data =
mixv_out_offset.CUDAData(context.GetPlace());
const size_t* x_lod_data = out_offset_data + x_lod_size;
const size_t* ref_lod_data = out_offset_data + 2 * x_lod_size;
......@@ -193,11 +195,14 @@ struct SequenceExpandGradFunctor<platform::CUDADeviceContext, T> {
int block_x = static_cast<int>(ref_lod.size());
dim3 block_size(thread_x, thread_y, thread_z);
dim3 grid_size(block_x, 1);
paddle::framework::MixVector<size_t> mixv_ref_lod(&ref_lod);
paddle::framework::MixVector<size_t> mixv_x_lod(&x_lod);
paddle::framework::MixVector<size_t> mixv_out_offset(&out_offset);
sequence_expand_grad_kernel<<<grid_size, block_size, 0, context.stream()>>>(
dout.data<T>(), ref_lod.CUDAData(context.GetPlace()),
x_lod.CUDAData(context.GetPlace()),
out_offset.CUDAData(context.GetPlace()), ref_lod.size(), x_item_length,
dx->mutable_data<T>(context.GetPlace()));
dout.data<T>(), mixv_ref_lod.CUDAData(context.GetPlace()),
mixv_x_lod.CUDAData(context.GetPlace()),
mixv_out_offset.CUDAData(context.GetPlace()), ref_lod.size(),
x_item_length, dx->mutable_data<T>(context.GetPlace()));
}
};
......
......@@ -132,7 +132,9 @@ class SequenceReverseOpKernel : public framework::OpKernel<T> {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(ctx.GetPlace())) {
lod = x.lod()[0].CUDAData(ctx.GetPlace());
auto xlod = x.lod()[0];
paddle::framework::MixVector<size_t> mixv_xlod(&xlod);
lod = mixv_xlod.CUDAData(ctx.GetPlace());
} else {
#endif
lod = x.lod()[0].data();
......
......@@ -133,9 +133,10 @@ struct SequenceSoftmaxFunctor<platform::CUDADeviceContext, T> {
dim3 block_size(thread_x);
dim3 grid_size(max_blocks);
paddle::framework::MixVector<size_t> mixv_ref_lod(&ref_lod);
sequence_softmax_kernel<
T, kThreadsPerBlock><<<grid_size, block_size, 0, context.stream()>>>(
x.data<T>(), ref_lod.CUDAData(context.GetPlace()), height,
x.data<T>(), mixv_ref_lod.CUDAData(context.GetPlace()), height,
out->mutable_data<T>(context.GetPlace()));
}
};
......@@ -156,10 +157,12 @@ struct SequenceSoftmaxGradFunctor<platform::CUDADeviceContext, T> {
dim3 block_size(thread_x);
dim3 grid_size(max_blocks);
paddle::framework::MixVector<size_t> mixv_ref_lod(&ref_lod);
sequence_softmax_grad_kernel<
T, kThreadsPerBlock><<<grid_size, block_size, 0, context.stream()>>>(
dout.data<T>(), out.data<T>(), ref_lod.CUDAData(context.GetPlace()),
height, dx->mutable_data<T>(context.GetPlace()));
dout.data<T>(), out.data<T>(),
mixv_ref_lod.CUDAData(context.GetPlace()), height,
dx->mutable_data<T>(context.GetPlace()));
}
};
......
......@@ -292,7 +292,7 @@ namespace paddle {
paddle::experimental::complex128, \
__VA_ARGS__) \
default: \
PADDLE_THROW(paddle::platform::errors::InvalidArgument( \
PADDLE_THROW(phi::errors::InvalidArgument( \
"Invalid enum data type `%d`.", static_cast<int>(__dtype__))); \
} \
}()
......
......@@ -19,7 +19,7 @@ namespace experimental {
ExternalStorage::ExternalStorage(void* ptr,
size_t size,
const paddle::platform::Place& place)
const phi::Place& place)
: phi::Storage(std::make_shared<phi::Allocation>(ptr, size, place)),
size_(size) {}
......@@ -29,11 +29,11 @@ ExternalStorage::ExternalStorage(const phi::intrusive_ptr<phi::Storage>& root,
: Storage(std::make_shared<phi::Allocation>(
static_cast<uint8_t*>(root->data()) + delta, size, root->place())),
size_(size) {
PADDLE_ENFORCE_LE(static_cast<size_t>(delta + size),
root->size(),
paddle::platform::errors::InvalidArgument(
"The size of the external storage does "
"not meet the metadata requirements."));
PADDLE_ENFORCE_LE(
static_cast<size_t>(delta + size),
root->size(),
phi::errors::InvalidArgument("The size of the external storage does "
"not meet the metadata requirements."));
}
} // namespace experimental
......
......@@ -30,7 +30,7 @@ class ExternalStorage : public phi::Storage {
static const char* name() { return "ExternalStorage"; }
void Realloc(size_t n) override {
PADDLE_THROW(paddle::platform::errors::Unavailable(
PADDLE_THROW(phi::errors::Unavailable(
"The external shared storage cannot be reallocated."));
}
......@@ -55,7 +55,7 @@ class ExternalStorage : public phi::Storage {
const phi::Place& place() const override {
PADDLE_ENFORCE_NOT_NULL(
data_,
paddle::platform::errors::Unavailable(
phi::errors::Unavailable(
"Unable to visit place as data_ has not been initialized yet."));
return data_->place();
}
......
......@@ -54,7 +54,7 @@ bool HasCUDNN() {
void EnforceCUDNNLoaded(const char* fn_name) {
PADDLE_ENFORCE_NOT_NULL(
cudnn_dso_handle,
paddle::platform::errors::PreconditionNotMet(
phi::errors::PreconditionNotMet(
"Cannot load cudnn shared library. Cannot invoke method %s.",
fn_name));
}
......
......@@ -33,7 +33,7 @@ bool HasCUFFT() {
void EnforceCUFFTLoaded(const char* fn_name) {
PADDLE_ENFORCE_NOT_NULL(
cufft_dso_handle,
paddle::platform::errors::PreconditionNotMet(
phi::errors::PreconditionNotMet(
"Cannot load cufft shared library. Cannot invoke method %s.",
fn_name));
}
......
......@@ -24,7 +24,7 @@ limitations under the License. */
#include <windows.h>
#endif
// TODO(wilber): The pten computing library requires a component to manage flags
// TODO(wilber): The phi computing library requires a component to manage flags
// (maybe not use gflags).
#include "gflags/gflags.h"
#include "glog/logging.h"
......@@ -299,8 +299,8 @@ static inline void* GetDsoHandleFromSearchPath(
#endif // !_WIN32
if (throw_on_error) {
// NOTE: Special error report case, no need to change its format
PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
error_msg, dso_name, errorno));
PADDLE_THROW(
phi::errors::PreconditionNotMet(error_msg, dso_name, errorno));
} else {
LOG(WARNING) << paddle::string::Sprintf(error_msg, dso_name, errorno);
}
......@@ -547,14 +547,11 @@ void* GetOpDsoHandle(const std::string& dso_name) {
void* GetNvtxDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
PADDLE_THROW(
paddle::platform::errors::Unimplemented("Nvtx do not support Apple."));
PADDLE_THROW(phi::errors::Unimplemented("Nvtx do not support Apple."));
#elif defined(_WIN32)
PADDLE_THROW(
paddle::platform::errors::Unimplemented("Nvtx do not support Windows."));
PADDLE_THROW(phi::errors::Unimplemented("Nvtx do not support Windows."));
#elif !defined(PADDLE_WITH_CUDA)
PADDLE_THROW(paddle::platform::errors::Unimplemented(
"Nvtx do not support without CUDA."));
PADDLE_THROW(phi::errors::Unimplemented("Nvtx do not support without CUDA."));
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvToolsExt.so");
#endif
......
......@@ -58,7 +58,7 @@ bool HasCUDNN() {
void EnforceCUDNNLoaded(const char* fn_name) {
PADDLE_ENFORCE_NOT_NULL(
miopen_dso_handle,
paddle::platform::errors::PreconditionNotMet(
phi::errors::PreconditionNotMet(
"Cannot load miopen shared library. Cannot invoke method %s.",
fn_name));
}
......
......@@ -54,21 +54,21 @@ extern void* tensorrt_plugin_dso_handle;
}; \
extern DynLoad__##__name __name
#define DECLARE_DYNAMIC_LOAD_TENSORRT_NON_POINTER_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
std::call_once(tensorrt_dso_flag, []() { \
tensorrt_dso_handle = phi::dynload::GetTensorRtHandle(); \
}); \
static void* p_##__name = dlsym(tensorrt_dso_handle, #__name); \
PADDLE_ENFORCE_NOT_NULL(p_##__name, \
paddle::platform::errors::Unavailable( \
"Load tensorrt api %s failed", #__name)); \
using tensorrt_func = decltype(&::__name); \
return reinterpret_cast<tensorrt_func>(p_##__name)(args...); \
} \
}; \
#define DECLARE_DYNAMIC_LOAD_TENSORRT_NON_POINTER_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
std::call_once(tensorrt_dso_flag, []() { \
tensorrt_dso_handle = phi::dynload::GetTensorRtHandle(); \
}); \
static void* p_##__name = dlsym(tensorrt_dso_handle, #__name); \
PADDLE_ENFORCE_NOT_NULL( \
p_##__name, \
phi::errors::Unavailable("Load tensorrt api %s failed", #__name)); \
using tensorrt_func = decltype(&::__name); \
return reinterpret_cast<tensorrt_func>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#define DECLARE_DYNAMIC_LOAD_TENSORRT_PLUGIN_WRAP(__name) \
......@@ -80,7 +80,7 @@ extern void* tensorrt_plugin_dso_handle;
}); \
static void* p_##__name = dlsym(tensorrt_plugin_dso_handle, #__name); \
PADDLE_ENFORCE_NOT_NULL(p_##__name, \
paddle::platform::errors::Unavailable( \
phi::errors::Unavailable( \
"Load tensorrt plugin %s failed", #__name)); \
using tensorrt_plugin_func = decltype(&::__name); \
return reinterpret_cast<tensorrt_plugin_func>(p_##__name)(args...); \
......
......@@ -14,7 +14,7 @@
#include "paddle/phi/backends/gpu/gpu_info.h"
// TODO(pten): remove fluid headers.
// TODO(phi): remove fluid headers.
#include "paddle/fluid/platform/enforce.h"
static std::once_flag g_device_props_size_init_flag;
......@@ -74,13 +74,13 @@ int GetGPUDeviceCount() {
}
int GetGPUComputeCapability(int id) {
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id,
GetGPUDeviceCount()));
PADDLE_ENFORCE_LT(
id,
GetGPUDeviceCount(),
phi::errors::InvalidArgument("Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id,
GetGPUDeviceCount()));
int major, minor;
auto major_error_code =
cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, id);
......@@ -93,26 +93,26 @@ int GetGPUComputeCapability(int id) {
}
int GetGPURuntimeVersion(int id) {
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id,
GetGPUDeviceCount()));
PADDLE_ENFORCE_LT(
id,
GetGPUDeviceCount(),
phi::errors::InvalidArgument("Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id,
GetGPUDeviceCount()));
int runtime_version = 0;
PADDLE_ENFORCE_GPU_SUCCESS(cudaRuntimeGetVersion(&runtime_version));
return runtime_version;
}
int GetGPUDriverVersion(int id) {
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id,
GetGPUDeviceCount()));
PADDLE_ENFORCE_LT(
id,
GetGPUDeviceCount(),
phi::errors::InvalidArgument("Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id,
GetGPUDeviceCount()));
int driver_version = 0;
PADDLE_ENFORCE_GPU_SUCCESS(cudaDriverGetVersion(&driver_version));
return driver_version;
......@@ -125,13 +125,13 @@ bool TensorCoreAvailable() {
}
int GetGPUMultiProcessors(int id) {
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id,
GetGPUDeviceCount()));
PADDLE_ENFORCE_LT(
id,
GetGPUDeviceCount(),
phi::errors::InvalidArgument("Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id,
GetGPUDeviceCount()));
int count;
PADDLE_ENFORCE_GPU_SUCCESS(
cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id));
......@@ -139,13 +139,13 @@ int GetGPUMultiProcessors(int id) {
}
int GetGPUMaxThreadsPerMultiProcessor(int id) {
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id,
GetGPUDeviceCount()));
PADDLE_ENFORCE_LT(
id,
GetGPUDeviceCount(),
phi::errors::InvalidArgument("Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id,
GetGPUDeviceCount()));
int count;
PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceGetAttribute(
&count, cudaDevAttrMaxThreadsPerMultiProcessor, id));
......@@ -154,13 +154,13 @@ int GetGPUMaxThreadsPerMultiProcessor(int id) {
}
int GetGPUMaxThreadsPerBlock(int id) {
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id,
GetGPUDeviceCount()));
PADDLE_ENFORCE_LT(
id,
GetGPUDeviceCount(),
phi::errors::InvalidArgument("Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id,
GetGPUDeviceCount()));
int count;
PADDLE_ENFORCE_GPU_SUCCESS(
cudaDeviceGetAttribute(&count, cudaDevAttrMaxThreadsPerBlock, id));
......@@ -174,13 +174,13 @@ int GetCurrentDeviceId() {
}
std::array<int, 3> GetGpuMaxGridDimSize(int id) {
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id,
GetGPUDeviceCount()));
PADDLE_ENFORCE_LT(
id,
GetGPUDeviceCount(),
phi::errors::InvalidArgument("Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id,
GetGPUDeviceCount()));
std::array<int, 3> ret;
int size;
auto error_code_x = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimX, id);
......@@ -213,7 +213,7 @@ const gpuDeviceProp &GetDeviceProperties(int id) {
}
if (id < 0 || id >= static_cast<int>(g_device_props.size())) {
PADDLE_THROW(paddle::platform::errors::OutOfRange(
PADDLE_THROW(phi::errors::OutOfRange(
"The device id %d is out of range [0, %d), where %d is the number of "
"devices on this machine. Because the device id should be greater than "
"or equal to zero and smaller than the number of gpus. Please input "
......@@ -233,13 +233,13 @@ const gpuDeviceProp &GetDeviceProperties(int id) {
void SetDeviceId(int id) {
// TODO(qijun): find a better way to cache the cuda device count
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id,
GetGPUDeviceCount()));
PADDLE_ENFORCE_LT(
id,
GetGPUDeviceCount(),
phi::errors::InvalidArgument("Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id,
GetGPUDeviceCount()));
PADDLE_RETRY_CUDA_SUCCESS(cudaSetDevice(id));
}
......@@ -294,13 +294,13 @@ gpuError_t GpuGetLastError() { return cudaGetLastError(); }
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-requirements
// for more detail about managed memory requirements
bool IsGPUManagedMemorySupported(int dev_id) {
PADDLE_ENFORCE_LT(dev_id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
dev_id,
GetGPUDeviceCount()));
PADDLE_ENFORCE_LT(
dev_id,
GetGPUDeviceCount(),
phi::errors::InvalidArgument("Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
dev_id,
GetGPUDeviceCount()));
#if defined(__linux__) || defined(_WIN32)
int ManagedMemoryAttr;
PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceGetAttribute(
......@@ -312,13 +312,13 @@ bool IsGPUManagedMemorySupported(int dev_id) {
}
bool IsGPUManagedMemoryOversubscriptionSupported(int dev_id) {
PADDLE_ENFORCE_LT(dev_id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
dev_id,
GetGPUDeviceCount()));
PADDLE_ENFORCE_LT(
dev_id,
GetGPUDeviceCount(),
phi::errors::InvalidArgument("Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
dev_id,
GetGPUDeviceCount()));
#ifdef __linux__
return IsGPUManagedMemorySupported(dev_id) &&
GetGPUComputeCapability(dev_id) >= 60;
......
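Every accessor in this file starts with the same guard that the hunks above reshuffle: the device id must be non-negative and smaller than the device count reported by the runtime. A hedged sketch using the plain CUDA runtime API, with exceptions standing in for phi::errors (the major*10+minor encoding at the end is only one common convention, not necessarily Paddle's):

#include <cuda_runtime.h>

#include <sstream>
#include <stdexcept>

inline void CheckDeviceId(int id) {
  int count = 0;
  cudaError_t err = cudaGetDeviceCount(&count);
  if (err != cudaSuccess) {
    throw std::runtime_error(cudaGetErrorString(err));
  }
  if (id < 0 || id >= count) {
    std::ostringstream msg;
    msg << "Device id must be less than GPU count, but received id is: " << id
        << ". GPU count is: " << count << ".";
    throw std::invalid_argument(msg.str());
  }
}

inline int ComputeCapabilitySketch(int id) {
  CheckDeviceId(id);
  int major = 0, minor = 0;
  cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, id);
  cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, id);
  return major * 10 + minor;  // e.g. 80 on an Ampere device
}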
......@@ -100,12 +100,12 @@ struct GpuLaunchConfig {
inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context,
int64_t numel,
int vec_size = 1) {
PADDLE_ENFORCE_GT(numel,
0,
paddle::platform::errors::InvalidArgument(
"element quantity should be greater than 0,"
" but received value is: %d.",
numel));
PADDLE_ENFORCE_GT(
numel,
0,
phi::errors::InvalidArgument("element quantity should be greater than 0,"
" but received value is: %d.",
numel));
// Get compute_capability
const int capability = context.GetComputeCapability();
/* If thread number per block is 64/128/256/512, cuda performs better.*/
......@@ -142,18 +142,18 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context,
inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context,
int x_dim,
int y_dim) {
PADDLE_ENFORCE_GT(x_dim,
0,
paddle::platform::errors::InvalidArgument(
"x dim number should greater than 0,"
" but received value is: %d",
x_dim));
PADDLE_ENFORCE_GT(y_dim,
0,
paddle::platform::errors::InvalidArgument(
"y dim number should greater than 0,"
" but received value is: %d",
y_dim));
PADDLE_ENFORCE_GT(
x_dim,
0,
phi::errors::InvalidArgument("x dim number should greater than 0,"
" but received value is: %d",
x_dim));
PADDLE_ENFORCE_GT(
y_dim,
0,
phi::errors::InvalidArgument("y dim number should greater than 0,"
" but received value is: %d",
y_dim));
const int kThreadsPerBlock = 256;
int block_cols = (std::min)(x_dim, kThreadsPerBlock);
......
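As a reference for the GetGpuLaunchConfig1D/2D hunks: the essential arithmetic is a positive-size guard followed by ceil-division of the element count by the block size, clamped to the device limits. A simplified sketch is below; vec_size and the compute-capability-dependent block choice of the real helper are deliberately left out:

#include <algorithm>
#include <cstdint>
#include <stdexcept>

struct LaunchConfig1D {
  int threads_per_block;
  int64_t blocks;
};

inline LaunchConfig1D MakeLaunchConfig1D(int64_t numel,
                                         int max_threads = 256,
                                         int64_t max_blocks = 65535) {
  if (numel <= 0) {
    throw std::invalid_argument("element quantity should be greater than 0");
  }
  // Use at most max_threads threads per block, then ceil-divide for the grid.
  int threads = static_cast<int>(std::min<int64_t>(max_threads, numel));
  int64_t blocks =
      std::min<int64_t>((numel + threads - 1) / threads, max_blocks);
  return {threads, blocks};
}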
......@@ -78,13 +78,13 @@ int GetGPUDeviceCount() {
}
int GetGPUComputeCapability(int id) {
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id,
GetGPUDeviceCount()));
PADDLE_ENFORCE_LT(
id,
GetGPUDeviceCount(),
phi::errors::InvalidArgument("Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id,
GetGPUDeviceCount()));
int major, minor;
auto major_error_code = hipDeviceGetAttribute(
&major, hipDeviceAttributeComputeCapabilityMajor, id);
......@@ -97,26 +97,26 @@ int GetGPUComputeCapability(int id) {
}
int GetGPURuntimeVersion(int id) {
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id,
GetGPUDeviceCount()));
PADDLE_ENFORCE_LT(
id,
GetGPUDeviceCount(),
phi::errors::InvalidArgument("Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id,
GetGPUDeviceCount()));
int runtime_version = 0;
PADDLE_ENFORCE_GPU_SUCCESS(hipRuntimeGetVersion(&runtime_version));
return runtime_version;
}
int GetGPUDriverVersion(int id) {
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id,
GetGPUDeviceCount()));
PADDLE_ENFORCE_LT(
id,
GetGPUDeviceCount(),
phi::errors::InvalidArgument("Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id,
GetGPUDeviceCount()));
int driver_version = 0;
PADDLE_ENFORCE_GPU_SUCCESS(hipDriverGetVersion(&driver_version));
return driver_version;
......@@ -125,13 +125,13 @@ int GetGPUDriverVersion(int id) {
bool TensorCoreAvailable() { return false; }
int GetGPUMultiProcessors(int id) {
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id,
GetGPUDeviceCount()));
PADDLE_ENFORCE_LT(
id,
GetGPUDeviceCount(),
phi::errors::InvalidArgument("Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id,
GetGPUDeviceCount()));
int count;
PADDLE_ENFORCE_GPU_SUCCESS(
hipDeviceGetAttribute(&count, hipDeviceAttributeMultiprocessorCount, id));
......@@ -139,13 +139,13 @@ int GetGPUMultiProcessors(int id) {
}
int GetGPUMaxThreadsPerMultiProcessor(int id) {
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id,
GetGPUDeviceCount()));
PADDLE_ENFORCE_LT(
id,
GetGPUDeviceCount(),
phi::errors::InvalidArgument("Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id,
GetGPUDeviceCount()));
int count;
PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceGetAttribute(
&count, hipDeviceAttributeMaxThreadsPerMultiProcessor, id));
......@@ -154,13 +154,13 @@ int GetGPUMaxThreadsPerMultiProcessor(int id) {
}
int GetGPUMaxThreadsPerBlock(int id) {
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id,
GetGPUDeviceCount()));
PADDLE_ENFORCE_LT(
id,
GetGPUDeviceCount(),
phi::errors::InvalidArgument("Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id,
GetGPUDeviceCount()));
int count;
PADDLE_ENFORCE_GPU_SUCCESS(
hipDeviceGetAttribute(&count, hipDeviceAttributeMaxThreadsPerBlock, id));
......@@ -174,13 +174,13 @@ int GetCurrentDeviceId() {
}
std::array<int, 3> GetGpuMaxGridDimSize(int id) {
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id,
GetGPUDeviceCount()));
PADDLE_ENFORCE_LT(
id,
GetGPUDeviceCount(),
phi::errors::InvalidArgument("Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id,
GetGPUDeviceCount()));
std::array<int, 3> ret;
int size;
auto error_code_x =
......@@ -216,7 +216,7 @@ const gpuDeviceProp &GetDeviceProperties(int id) {
}
if (id < 0 || id >= static_cast<int>(g_device_props.size())) {
PADDLE_THROW(paddle::platform::errors::OutOfRange(
PADDLE_THROW(phi::errors::OutOfRange(
"The device id %d is out of range [0, %d), where %d is the number of "
"devices on this machine. Because the device id should be greater than "
"or equal to zero and smaller than the number of gpus. Please input "
......@@ -235,13 +235,13 @@ const gpuDeviceProp &GetDeviceProperties(int id) {
void SetDeviceId(int id) {
// TODO(qijun): find a better way to cache the cuda device count
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id,
GetGPUDeviceCount()));
PADDLE_ENFORCE_LT(
id,
GetGPUDeviceCount(),
phi::errors::InvalidArgument("Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id,
GetGPUDeviceCount()));
PADDLE_RETRY_CUDA_SUCCESS(hipSetDevice(id));
}
......@@ -293,13 +293,13 @@ void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); }
gpuError_t GpuGetLastError() { return hipGetLastError(); }
bool IsGPUManagedMemorySupported(int dev_id) {
PADDLE_ENFORCE_LT(dev_id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
dev_id,
GetGPUDeviceCount()));
PADDLE_ENFORCE_LT(
dev_id,
GetGPUDeviceCount(),
phi::errors::InvalidArgument("Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
dev_id,
GetGPUDeviceCount()));
#if defined(__linux__) || defined(_WIN32)
int ManagedMemoryAttr;
PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceGetAttribute(
......@@ -311,13 +311,13 @@ bool IsGPUManagedMemorySupported(int dev_id) {
}
bool IsGPUManagedMemoryOversubscriptionSupported(int dev_id) {
PADDLE_ENFORCE_LT(dev_id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
dev_id,
GetGPUDeviceCount()));
PADDLE_ENFORCE_LT(
dev_id,
GetGPUDeviceCount(),
phi::errors::InvalidArgument("Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
dev_id,
GetGPUDeviceCount()));
#ifdef __linux__
return IsGPUManagedMemorySupported(dev_id) &&
GetGPUComputeCapability(dev_id) >= 60;
......
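The ROCm file mirrors the CUDA one call for call; the managed-memory probe truncated in the hunk above reduces to a single attribute query. A hedged sketch, where hipDeviceAttributeManagedMemory is assumed to be the attribute behind the elided argument:

#include <hip/hip_runtime.h>

// Returns true if the device reports managed-memory support; errors map to false.
inline bool SupportsManagedMemorySketch(int dev_id) {
  int attr = 0;
  hipError_t err =
      hipDeviceGetAttribute(&attr, hipDeviceAttributeManagedMemory, dev_id);
  return err == hipSuccess && attr == 1;
}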
......@@ -173,7 +173,7 @@ DEFINE_EXTERNAL_API_TYPE(BKCLResult_t, BKCL_SUCCESS);
::phi::backends::xpu::details::ExternalApiType< \
__XPU_STATUS_TYPE__>::kSuccess; \
if (UNLIKELY(__cond__ != __success_type__)) { \
auto __summary__ = paddle::platform::errors::External( \
auto __summary__ = phi::errors::External( \
::phi::backends::xpu::build_xpu_error_msg(__cond__)); \
__THROW_ERROR_INTERNAL__(__summary__); \
} \
......@@ -183,7 +183,7 @@ DEFINE_EXTERNAL_API_TYPE(BKCLResult_t, BKCL_SUCCESS);
do { \
auto __cond__ = (COND); \
if (UNLIKELY(__cond__ != baidu::xpu::api::Error_t::SUCCESS)) { \
auto __summary__ = paddle::platform::errors::External( \
auto __summary__ = phi::errors::External( \
::phi::backends::xpu::build_xpu_xdnn_error_msg(__cond__, MSG)); \
__THROW_ERROR_INTERNAL__(__summary__); \
} \
......@@ -192,7 +192,7 @@ DEFINE_EXTERNAL_API_TYPE(BKCLResult_t, BKCL_SUCCESS);
#define PADDLE_ENFORCE_XDNN_NOT_NULL(ptr) \
do { \
if (UNLIKELY(ptr == nullptr)) { \
auto __summary__ = paddle::platform::errors::External( \
auto __summary__ = phi::errors::External( \
::phi::backends::xpu::build_xpu_xdnn_error_msg( \
baidu::xpu::api::Error_t::NO_ENOUGH_WORKSPACE, \
"XPU memory is not enough")); \
......
......@@ -100,7 +100,7 @@ void SetXPUDeviceId(int id) {
PADDLE_ENFORCE_LT(
id,
GetXPUDeviceCount(),
paddle::platform::errors::InvalidArgument("id must less than XPU count"));
phi::errors::InvalidArgument("id must less than XPU count"));
PADDLE_ENFORCE_XPU_SUCCESS(xpu_set_device(id));
}
......
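The XPU macros touched above follow one template: evaluate the call once, compare the result against the backend's success code, and wrap any failure into an External error. A generic standalone sketch with exceptions in place of Paddle's error types; it assumes the status value is integral or an unscoped enum:

#include <sstream>
#include <stdexcept>

#define CHECK_EXTERNAL_STATUS(expr, success_value, what)               \
  do {                                                                 \
    auto __status__ = (expr);                                          \
    if (__status__ != (success_value)) {                               \
      std::ostringstream __msg__;                                      \
      __msg__ << (what) << " failed with status "                      \
              << static_cast<long long>(__status__);                   \
      throw std::runtime_error(__msg__.str());                         \
    }                                                                  \
  } while (0)

// Usage, e.g. (assuming 0 is the success code of the call):
//   CHECK_EXTERNAL_STATUS(xpu_set_device(id), 0, "xpu_set_device");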
......@@ -13,8 +13,8 @@ cc_library(kernel_context SRCS kernel_context.cc DEPS pten_enforce pten_context)
cc_library(ddim SRCS ddim.cc DEPS pten_enforce)
cc_library(tensor_base SRCS tensor_base.cc allocator.cc DEPS pten_enforce)
cc_library(tensor_meta SRCS tensor_meta.cc DEPS pten_enforce mixed_vector)
cc_library(lod_utils SRCS lod_utils.cc DEPS pten_enforce mixed_vector)
cc_library(tensor_meta SRCS tensor_meta.cc DEPS pten_enforce)
cc_library(lod_utils SRCS lod_utils.cc DEPS pten_enforce)
cc_library(pten_device_context SRCS device_context.cc DEPS tensor_base)
cc_library(dense_tensor SRCS dense_tensor.cc dense_tensor_impl.cc DEPS fluid_convert_utils tensor_meta tensor_base)
......@@ -23,7 +23,7 @@ cc_library(sparse_csr_tensor SRCS sparse_csr_tensor.cc DEPS dense_tensor tensor_
cc_library(meta_tensor SRCS meta_tensor.cc DEPS tensor_base tensor_meta dense_tensor)
cc_library(infermeta_utils SRCS infermeta_utils.cc DEPS meta_tensor)
cc_library(selected_rows SRCS selected_rows_impl.cc DEPS dense_tensor mixed_vector pten_enforce ddim)
cc_library(selected_rows SRCS selected_rows_impl.cc DEPS dense_tensor pten_enforce ddim)
cc_library(pten_custom_kernel SRCS custom_kernel.cc DEPS kernel_factory convert_utils)
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#pragma once
#include <initializer_list>
#include <numeric>
#include <stdexcept>
#include <string>
#include <vector>
......
......@@ -73,7 +73,7 @@ void* DenseTensor::AllocateFrom(Allocator* allocator,
size_t requested_size) {
PADDLE_ENFORCE_NOT_NULL(
allocator,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"Required allocator shall not be nullptr, but received nullptr."));
if (this->dtype() != dtype) {
VLOG(10) << "change data type in mutbale_data, target dtype - " << dtype;
......@@ -81,13 +81,13 @@ void* DenseTensor::AllocateFrom(Allocator* allocator,
}
PADDLE_ENFORCE(
valid(),
paddle::platform::errors::PreconditionNotMet(
phi::errors::PreconditionNotMet(
"The meta data must be valid when call the mutable data function."));
size_t bytes = numel() * SizeOf(this->dtype());
if (requested_size) {
PADDLE_ENFORCE_GE(requested_size,
bytes,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"The reserved size %d should be enough to meet the "
"volume required by metadata %d.",
requested_size,
......@@ -112,7 +112,7 @@ const T* DenseTensor::data() const {
check_memory_size();
PADDLE_ENFORCE(
(dtype() == paddle::experimental::CppTypeToDataType<T>::Type()),
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"The type of data we are trying to retrieve does not match the "
"type of data currently contained in the container."));
return static_cast<const T*>(data());
......@@ -123,7 +123,7 @@ T* DenseTensor::data() {
check_memory_size();
PADDLE_ENFORCE(
(dtype() == paddle::experimental::CppTypeToDataType<T>::Type()),
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"The type of data we are trying to retrieve does not match the "
"type of data currently contained in the container."));
return static_cast<T*>(data());
......@@ -133,7 +133,7 @@ void* DenseTensor::data() {
check_memory_size();
PADDLE_ENFORCE_NOT_NULL(
holder_,
paddle::platform::errors::PreconditionNotMet(
phi::errors::PreconditionNotMet(
"The storage must be valid when call the data function."));
return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
meta_.offset);
......@@ -143,7 +143,7 @@ const void* DenseTensor::data() const {
check_memory_size();
PADDLE_ENFORCE_NOT_NULL(
holder_,
paddle::platform::errors::PreconditionNotMet(
phi::errors::PreconditionNotMet(
"The storage must be valid when call the data function."));
return reinterpret_cast<const void*>(
reinterpret_cast<uintptr_t>(holder_->ptr()) + meta_.offset);
......@@ -151,7 +151,7 @@ const void* DenseTensor::data() const {
void DenseTensor::set_meta(DenseTensorMeta&& meta) {
PADDLE_ENFORCE(!meta_.valid(),
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"Only when the original attribute of Tensor is "
"incomplete, can it be reset."));
meta_ = std::move(meta);
......@@ -160,7 +160,7 @@ void DenseTensor::set_meta(DenseTensorMeta&& meta) {
void DenseTensor::set_meta(const DenseTensorMeta& meta) {
PADDLE_ENFORCE(
meta.valid(),
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"Input meta is invalid, please check the meta attribute."));
meta_.dims = meta.dims;
meta_.dtype = meta.dtype;
......
......@@ -54,22 +54,22 @@ DenseTensor(intrusive_ptr<Storage> storage, DenseTensorMeta&& meta);
inline bool IsInitialized() const { return holder_ != nullptr; }
template <typename T>
T* mutable_data(const paddle::platform::Place& place,
T* mutable_data(const phi::Place& place,
size_t requested_size = 0);
template <typename T>
T* mutable_data(const DDim& dims,
const paddle::platform::Place& place,
const phi::Place& place,
size_t requested_size = 0);
void* mutable_data(const paddle::platform::Place& place,
void* mutable_data(const phi::Place& place,
paddle::experimental::DataType type,
size_t requested_size = 0);
void* mutable_data(const paddle::platform::Place& place,
void* mutable_data(const phi::Place& place,
size_t requested_size = 0);
void* mutable_data(const paddle::platform::Place& place,
void* mutable_data(const phi::Place& place,
paddle::experimental::DataType type,
const phi::Stream& stream);
......
......@@ -25,6 +25,7 @@ limitations under the License. */
#include "paddle/phi/core/macros.h"
#include "paddle/phi/core/meta_tensor.h"
#include "paddle/phi/core/type_defs.h"
#include "paddle/utils/any.h"
#include "paddle/utils/flat_hash_map.h"
#include "paddle/utils/small_vector.h"
......
......@@ -69,7 +69,7 @@ void KernelContext::AssignInputRange(std::pair<int, int>&& range, size_t idx) {
} else if (idx == input_range_.size()) {
input_range_.emplace_back(range);
} else {
PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
PADDLE_THROW(phi::errors::PreconditionNotMet(
"Invalid idx when trying to set InputRange, "
"index is `%d`, it is greater than the size(%d) of InputRange.",
idx,
......@@ -83,7 +83,7 @@ void KernelContext::AssignOutputRange(std::pair<int, int>&& range, size_t idx) {
} else if (idx == output_range_.size()) {
output_range_.emplace_back(range);
} else {
PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
PADDLE_THROW(phi::errors::PreconditionNotMet(
"Invalid idx when trying to set InputRange, "
"index is `%d`, it is greater than the size(%d) of InputRange.",
idx,
......
......@@ -13,18 +13,11 @@
// limitations under the License.
#pragma once
// See Note [ Why still include the fluid headers? ]
#ifndef PADDLE_WITH_CUSTOM_KERNEL
#include "paddle/fluid/framework/mixed_vector.h"
#endif
#include <cstddef>
#include <vector>
namespace phi {
#ifndef PADDLE_WITH_CUSTOM_KERNEL
using LoD = std::vector<paddle::framework::Vector<size_t>>;
#else
using LoD = std::vector<std::vector<size_t>>;
#endif
using LoD = std::vector<std::vector<std::size_t>>;
void AppendLoD(LoD* lod, const LoD& lod_length);
......@@ -40,4 +33,4 @@ void AppendLoD(LoD* lod, const LoD& lod_length);
*/
LoD ConvertToLengthBasedLoD(const LoD& offset_lod);
} // namespace pten
} // namespace phi
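With LoD now a plain std::vector<std::vector<std::size_t>>, the helpers declared here stay purely STL. Below is a plausible reference version of the offset-to-length conversion, assuming the usual convention that each level stores cumulative offsets starting at 0; it is a sketch, not the exact phi implementation:

#include <cstddef>
#include <utility>
#include <vector>

using LoD = std::vector<std::vector<std::size_t>>;

// Offsets {0, 2, 5, 6} become lengths {2, 3, 1}.
inline LoD ToLengthBasedLoD(const LoD& offset_lod) {
  LoD length_lod;
  length_lod.reserve(offset_lod.size());
  for (const auto& level : offset_lod) {
    std::vector<std::size_t> lengths;
    for (std::size_t i = 1; i < level.size(); ++i) {
      lengths.push_back(level[i] - level[i - 1]);
    }
    length_lod.push_back(std::move(lengths));
  }
  return length_lod;
}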
......@@ -55,25 +55,17 @@ class SelectedRows : public TensorBase,
void set_height(int64_t height) { impl_->set_height(height); }
const paddle::framework::Vector<int64_t>& rows() const {
return impl_->rows();
}
const std::vector<int64_t>& rows() const { return impl_->rows(); }
paddle::framework::Vector<int64_t>* mutable_rows() {
return impl_->mutable_rows();
}
void set_rows(const paddle::framework::Vector<int64_t>& rows) {
impl_->set_rows(rows);
}
std::vector<int64_t>* mutable_rows() { return impl_->mutable_rows(); }
void set_rows(const std::vector<int64_t>& rows) { impl_->set_rows(rows); }
/*
* @brief Get the index of key in rows
*
* @return -1 if the key does not exist.
*/
int64_t Index(int64_t key) const { return impl_->Index(key); }
/*
* @brief Whether the table contains the specified key.
*
......
......@@ -28,7 +28,7 @@ struct ReAllocateVisitor {
template <typename T>
void operator()() const {
phi::DenseTensor cpu_tensor;
paddle::platform::CPUPlace cpu;
phi::CPUPlace cpu;
T* ptr = cpu_tensor.mutable_data<T>(dims_, cpu);
const T* old_ptr =
tensor_->memory_size() == 0 ? nullptr : tensor_->data<T>();
......@@ -57,7 +57,7 @@ struct TensorCopyVisitor {
template <typename T>
void apply() const {
// TODO(Yancey1989): support other place
paddle::platform::CPUPlace cpu;
phi::CPUPlace cpu;
paddle::memory::Copy(cpu,
dst_->mutable_data<T>(cpu) + dst_offset_,
cpu,
......@@ -82,7 +82,7 @@ struct TensorFillVisitor {
template <typename T>
void apply() const {
// TODO(qiao): support other place
paddle::platform::CPUPlace cpu;
phi::CPUPlace cpu;
auto* tensor_data = dst_->mutable_data<T>(cpu);
auto* start = tensor_data + dst_offset_;
auto* end = start + size_;
......@@ -121,16 +121,16 @@ int64_t SelectedRowsImpl::AutoGrownIndex(int64_t key,
auto iter = id_to_index_.find(key);
if (iter == id_to_index_.end()) {
rwlock_->UNLock();
PADDLE_ENFORCE_EQ(auto_grown,
true,
paddle::platform::errors::NotFound(
"Input key(%lld) is not found.", key));
PADDLE_ENFORCE_EQ(
auto_grown,
true,
phi::errors::NotFound("Input key(%lld) is not found.", key));
rwlock_->WRLock();
auto map_size = id_to_index_.size();
auto vector_size = rows_.size();
if (map_size != vector_size) {
rwlock_->UNLock();
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
PADDLE_THROW(phi::errors::InvalidArgument(
"Row map size(%zu) should be equal to rows size(%zu).",
map_size,
vector_size));
......@@ -140,7 +140,7 @@ int64_t SelectedRowsImpl::AutoGrownIndex(int64_t key,
int row_num = rows_.size();
if (row_num == value_->dims()[0]) {
rwlock_->UNLock();
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
PADDLE_THROW(phi::errors::InvalidArgument(
"Selected rows is full, then length exceed the length of first "
"dimension (%d).",
row_num));
......@@ -187,7 +187,7 @@ void SelectedRowsImpl::Get(const phi::DenseTensor& ids,
PADDLE_ENFORCE_EQ(
value_width,
value->numel() / value->dims()[0],
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"Output tensor should have the same shape with table "
"except the first dimmension, excepted value width not counting "
"the first dimension is %d, actual value width is %d.",
......
......@@ -27,8 +27,6 @@ limitations under the License. */
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/utils/rw_lock.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/framework/mixed_vector.h"
namespace phi {
class SelectedRowsImpl {
/*
......@@ -68,13 +66,11 @@ class SelectedRowsImpl {
void set_height(int64_t height) { height_ = height; }
const paddle::framework::Vector<int64_t>& rows() const { return rows_; }
const std::vector<int64_t>& rows() const { return rows_; }
paddle::framework::Vector<int64_t>* mutable_rows() { return &rows_; }
std::vector<int64_t>* mutable_rows() { return &rows_; }
void set_rows(const paddle::framework::Vector<int64_t>& rows) {
rows_ = rows;
}
void set_rows(const std::vector<int64_t>& rows) { rows_ = rows; }
/*
* @brief Get the index of key in rows
......@@ -84,7 +80,7 @@ class SelectedRowsImpl {
int64_t Index(int64_t key) const {
auto it = std::find(rows_.begin(), rows_.end(), key);
if (it == rows_.end()) {
PADDLE_THROW(paddle::platform::errors::NotFound(
PADDLE_THROW(phi::errors::NotFound(
"Input id (%lld) is not in current rows table.", key));
}
return static_cast<int64_t>(std::distance(rows_.begin(), it));
......@@ -156,10 +152,7 @@ class SelectedRowsImpl {
/// \brief Returns the dims of the tensor.
/// \return The dims of the tensor.
const DDim& dims() const noexcept {
return value_->dims();
// return phi::make_ddim(dims);
}
const DDim& dims() const noexcept { return value_->dims(); }
/// \brief Returns the data type of the tensor.
/// \return The data type of the tensor.
......@@ -185,7 +178,7 @@ class SelectedRowsImpl {
// Notice: rows can be duplicate. We can have {0, 4, 7, 0, 5, 7, 9} here.
// SelectedRowsImpl are simply concated when adding together. Until a
// SelectedRowsImpl add a Tensor, will the duplicate rows be handled.
paddle::framework::Vector<int64_t> rows_;
std::vector<int64_t> rows_;
std::unordered_map<int64_t, int64_t>
id_to_index_; // should not be used when rows_ has duplicate member
std::unique_ptr<DenseTensor> value_{nullptr};
......
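Since rows_ is now an ordinary std::vector<int64_t>, the Index lookup shown in this hunk is just a linear std::find. The same logic as a standalone helper, with std::runtime_error standing in for phi::errors::NotFound:

#include <algorithm>
#include <cstdint>
#include <stdexcept>
#include <vector>

inline int64_t IndexOfRow(const std::vector<int64_t>& rows, int64_t key) {
  auto it = std::find(rows.begin(), rows.end(), key);
  if (it == rows.end()) {
    throw std::runtime_error("Input id is not in current rows table.");
  }
  return static_cast<int64_t>(std::distance(rows.begin(), it));
}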
......@@ -69,17 +69,17 @@ void SparseCooTensor::Resize(const DDim& dense_dims,
const int64_t non_zero_num) {
PADDLE_ENFORCE_GE(non_zero_num,
this->nnz(),
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"the non_zero_num must be greater than or equal to the "
"origin non_zero_num."));
PADDLE_ENFORCE_GE(sparse_dim,
1,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"the sparse_dim must be greater than or equal 1."));
PADDLE_ENFORCE_LE(
sparse_dim,
dense_dims.size(),
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"the sparse_dim must be less than or equal dense_dims."));
DDim indices_dims = phi::make_ddim({sparse_dim, non_zero_num});
......
......@@ -20,7 +20,7 @@ inline void check_shape(const DDim& dims) {
bool valid = dims.size() == 2 || dims.size() == 3;
PADDLE_ENFORCE(valid,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"the SparseCsrTensor only support 2-D Tensor."));
}
#define Check(non_zero_crows, non_zero_cols, non_zero_elements, dims) \
......@@ -29,12 +29,12 @@ inline void check_shape(const DDim& dims) {
PADDLE_ENFORCE_EQ( \
non_zero_cols.place(), \
non_zero_crows.place(), \
paddle::platform::errors::InvalidArgument( \
phi::errors::InvalidArgument( \
"non_zero_crows and non_zero_cols must have the same place.")); \
PADDLE_ENFORCE_EQ( \
non_zero_cols.place(), \
non_zero_elements.place(), \
paddle::platform::errors::InvalidArgument( \
phi::errors::InvalidArgument( \
"non_zero_cols and non_zero_elements must have the same place.")); \
}
......@@ -77,7 +77,7 @@ void* SparseCsrTensor::AllocateFrom(Allocator* allocator,
void SparseCsrTensor::Resize(const DDim& dense_dims,
const int64_t non_zero_num) {
PADDLE_ENFORCE(this->initialized(),
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"the SparseCsrTensor must be initialized when call Resize "
"function."));
check_shape(dense_dims);
......
......@@ -20,6 +20,8 @@ limitations under the License. */
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/layout.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/utils/any.h"
#include "paddle/utils/optional.h"
// Note: mixed_vector include many header now, LoD will be
// used on CUDA device? Can we use small_vector here?
......@@ -31,11 +33,7 @@ limitations under the License. */
namespace phi {
using DDim = phi::DDim;
#ifndef PADDLE_WITH_CUSTOM_KERNEL
using LoD = std::vector<paddle::framework::Vector<size_t>>;
#else
using LoD = std::vector<std::vector<size_t>>;
#endif
/// \brief The meta data of dense tensor. Take the structure type
/// and use all default operations.
///
......
......@@ -23,7 +23,7 @@ void DotInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) {
auto x_rank = static_cast<size_t>(x_dims.size());
PADDLE_ENFORCE_EQ(true,
1 == x_rank || 2 == x_rank,
paddle::platform::errors::PreconditionNotMet(
phi::errors::PreconditionNotMet(
"ShapeError: The dimensions of input tensor X (%s) "
"should be 1 or 2",
x_dims.to_str()));
......@@ -32,7 +32,7 @@ void DotInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) {
PADDLE_ENFORCE_EQ(
true,
x_rank == static_cast<size_t>(y_dims.size()),
paddle::platform::errors::PreconditionNotMet(
phi::errors::PreconditionNotMet(
"ShapeError: The shape of input tensor Y: %s should match with "
"input tenosr X: %s",
y_dims.to_str(),
......@@ -47,7 +47,7 @@ void DotInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) {
PADDLE_ENFORCE_EQ(true,
shape_match,
paddle::platform::errors::PreconditionNotMet(
phi::errors::PreconditionNotMet(
"ShapeError: The shape of input tensor X: %s should "
"be exactly the same "
"with input tensor Y: %s",
......@@ -71,12 +71,12 @@ void MatmulInferMeta(const MetaTensor& x,
auto ndims_y = dims_y.size();
PADDLE_ENFORCE_GT(ndims_x,
0UL,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"The Input(x) dims size must be greater than 0,"
" but reviced dims size is 0. "));
PADDLE_ENFORCE_GT(ndims_y,
0UL,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"The Input(y) dims size must be greater than 0,"
" but reviced dims size is 0. "));
......@@ -150,7 +150,7 @@ void ElementwiseRawInferMeta(const MetaTensor& x,
if (x_dims.size() == y_dims.size()) {
PADDLE_ENFORCE_EQ((axis == -1) || (axis == 0),
true,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"axis should be -1 or 0 while the dimension of "
"tensor X (%s) is equal to the dimension of "
"tensor Y (%s), but received axis: %s",
......@@ -160,7 +160,7 @@ void ElementwiseRawInferMeta(const MetaTensor& x,
}
PADDLE_ENFORCE_EQ((axis >= (-1 * max_dim)) && (axis < max_dim),
true,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"The axis range must be [%s, %s), but axis is %s. "
"Please set the axis again.",
-1 * max_dim,
......
......@@ -24,7 +24,7 @@ void ConcatInferMeta(const std::vector<MetaTensor>& x,
MetaConfig config) {
PADDLE_ENFORCE_GE(x.size(),
0UL,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"The size of input meta vector should be greater"
"than 0."));
......@@ -34,7 +34,7 @@ void ConcatInferMeta(const std::vector<MetaTensor>& x,
PADDLE_ENFORCE_EQ(
axis >= -rank && axis < rank,
true,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"The axis is expected to be in range of [%d, %d), but got %d",
-rank,
rank,
......
......@@ -38,11 +38,11 @@ void FlattenInferMeta(const MetaTensor& x,
if (stop_axis < 0) {
stop_axis = stop_axis + in_dims_size;
}
PADDLE_ENFORCE_GE(stop_axis,
start_axis,
paddle::platform::errors::InvalidArgument(
"The stop_axis should be greater"
"than or equal to start_axis."));
PADDLE_ENFORCE_GE(
stop_axis,
start_axis,
phi::errors::InvalidArgument("The stop_axis should be greater"
"than or equal to start_axis."));
int64_t outer = 1;
std::vector<int32_t> out_shape;
......@@ -113,7 +113,7 @@ static phi::DDim ValidateShape(const std::vector<int64_t> shape,
PADDLE_ENFORCE_EQ(
unk_dim_idx,
-1,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"Only one dimension value of 'shape' in ReshapeOp can "
"be -1. But received shape = [%s], shape[%d] is also -1.",
phi::make_ddim(shape),
......@@ -123,7 +123,7 @@ static phi::DDim ValidateShape(const std::vector<int64_t> shape,
PADDLE_ENFORCE_LT(
static_cast<int>(i),
in_dims.size(),
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"The index of 0 in `shape` must be less than "
"the input tensor X's dimensions. "
"But received shape = [%s], shape[%d] = 0, X's shape = [%s], "
......@@ -136,7 +136,7 @@ static phi::DDim ValidateShape(const std::vector<int64_t> shape,
PADDLE_ENFORCE_GT(
shape[i],
0,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"Each dimension value of 'shape' in ReshapeOp must not "
"be negative except one unknown dimension. "
"But received shape = [%s], shape[%d] = %d.",
......@@ -161,7 +161,7 @@ static phi::DDim ValidateShape(const std::vector<int64_t> shape,
PADDLE_ENFORCE_EQ(
output_shape[unk_dim_idx] * capacity,
-in_size,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"The 'shape' attribute in ReshapeOp is invalid. "
"The input tensor X'size must be divisible by known "
"capacity of 'shape'. "
......@@ -179,7 +179,7 @@ static phi::DDim ValidateShape(const std::vector<int64_t> shape,
PADDLE_ENFORCE_EQ(
capacity,
in_size,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"The 'shape' in ReshapeOp is invalid. "
"The input tensor X'size must be equal to the capacity of "
"'shape'. "
......@@ -199,7 +199,7 @@ static phi::DDim ValidateShape(const std::vector<int64_t> shape,
PADDLE_ENFORCE_LE(
capacity,
in_size,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"The 'shape' in ReshapeOp is invalid. "
"The input tensor X's shape = [%s], X's capacity = %d."
"But the target shape of Out is [%s], the "
......@@ -364,7 +364,7 @@ void SplitInferMeta(const MetaTensor& x,
PADDLE_ENFORCE_EQ(
axis_value >= -rank && axis_value < rank,
true,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"The axis is expected to be in range of [%d, %d), but got %d",
-rank,
rank,
......@@ -383,7 +383,7 @@ void SplitInferMeta(const MetaTensor& x,
PADDLE_ENFORCE_EQ(input_axis_dim % num,
0,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"The input's size along the split dimension "
"must be evenly divisible by Attr(num_or_sections). "
"But received Attr(num_or_sections) "
......@@ -416,7 +416,7 @@ void SplitInferMeta(const MetaTensor& x,
if (config.is_runtime) {
PADDLE_ENFORCE_LE(num_of_unknow,
1,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"Only one dimension value of Attr(num_or_sections) "
"in SplitOp can be -1. "
"But received Attr(num_or_sections) = [%s].",
......@@ -430,7 +430,7 @@ void SplitInferMeta(const MetaTensor& x,
PADDLE_ENFORCE_LT(
sum_of_section,
input_axis_dim,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"Sum of Attr(num_or_sections) other than unknown section "
"must be less than the input's "
"size "
......@@ -447,7 +447,7 @@ void SplitInferMeta(const MetaTensor& x,
PADDLE_ENFORCE_EQ(
sum_of_section,
input_axis_dim,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"Sum of Attr(num_or_sections) must be equal to the input's "
"size "
"along the split dimension. But received Attr(num_or_sections)"
......
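The ValidateShape checks above implement the familiar reshape contract: at most one -1 entry, whose value is inferred so that the product of all entries equals the input size. A compressed sketch of just that rule (the special handling of 0 entries, which copy the corresponding input dimension, is omitted here):

#include <cstddef>
#include <cstdint>
#include <stdexcept>
#include <vector>

inline std::vector<int64_t> InferShapeWithMinusOne(std::vector<int64_t> shape,
                                                   int64_t in_size) {
  int unk_idx = -1;
  int64_t capacity = 1;
  for (std::size_t i = 0; i < shape.size(); ++i) {
    if (shape[i] == -1) {
      if (unk_idx != -1)
        throw std::invalid_argument("Only one dimension value can be -1.");
      unk_idx = static_cast<int>(i);
    } else {
      capacity *= shape[i];
    }
  }
  if (unk_idx >= 0) {
    if (capacity <= 0 || in_size % capacity != 0)
      throw std::invalid_argument("Known capacity must divide the input size.");
    shape[unk_idx] = in_size / capacity;  // infer the -1 slot
  } else if (capacity != in_size) {
    throw std::invalid_argument("Capacity must equal the input size.");
  }
  return shape;
}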
......@@ -54,7 +54,7 @@ void ConcatKernel(const Context& dev_ctx,
PADDLE_ENFORCE_EQ(
x[i].lod().size(),
lod_size_0,
paddle::platform::errors::Unimplemented(
phi::errors::Unimplemented(
"The lod level of all input LoDTensors should be same. "
"Maybe different lod level of input LoDTensors can concat,"
"it is not supported currently. The lod level of %dth input "
......
......@@ -127,7 +127,7 @@ struct SameDimsDivideFunctor<
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* z) {
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"If use SameDimsDivideFunctor, template args(T) must be floating "
"point. ");
}
......@@ -278,12 +278,10 @@ void CommonForwardBroadcastCPU(const DenseTensor& x,
std::vector<int> index_array(max_dim, 0);
const T* x_data = x.data<T>();
const T* y_data = y.data<T>();
PADDLE_ENFORCE_NOT_NULL(x_data,
paddle::platform::errors::InvalidArgument(
"The input X should not be empty."));
PADDLE_ENFORCE_NOT_NULL(y_data,
paddle::platform::errors::InvalidArgument(
"The input Y should not be empty."));
PADDLE_ENFORCE_NOT_NULL(
x_data, phi::errors::InvalidArgument("The input X should not be empty."));
PADDLE_ENFORCE_NOT_NULL(
y_data, phi::errors::InvalidArgument("The input Y should not be empty."));
OutType* out_data = ctx.Alloc<OutType>(z);
const int out_size = std::accumulate(
......@@ -317,12 +315,12 @@ void CommonElementwiseBroadcastForward(const CPUContext& dev_ctx,
PADDLE_ENFORCE_GE(
axis,
0,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"Axis should be great than or equal to 0, but received axis is %d.",
axis));
PADDLE_ENFORCE_LT(axis,
max_dim,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"Axis should be less than %d, but received axis is %d.",
max_dim,
axis));
......@@ -385,12 +383,12 @@ void ElementwiseCompute(const CPUContext& dev_ctx,
PADDLE_ENFORCE_GE(
axis,
0,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"Axis should be great than or equal to 0, but received axis is %d.",
axis));
PADDLE_ENFORCE_LT(axis,
max_dim,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"Axis should be less than %d, but received axis is %d.",
max_dim,
axis));
......@@ -630,12 +628,12 @@ void ElemwiseGradComputeWithBroadcast(const CPUContext& ctx,
PADDLE_ENFORCE_GE(
axis,
0,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"Axis should be great than or equal to 0, but received axis is %d.",
axis));
PADDLE_ENFORCE_LT(axis,
max_dim,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"Axis should be less than %d, but received axis is %d.",
max_dim,
axis));
......
......@@ -48,7 +48,7 @@ void MaskedSelectKernel(const Context& dev_ctx,
DDim out_dim{out_size};
out->Resize(out_dim);
auto out_data = out->mutable_data<T>(paddle::platform::CPUPlace());
auto out_data = out->mutable_data<T>(phi::CPUPlace());
int index = 0;
for (int i = 0; i < mask_size; i++) {
......
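The loop this hunk truncates is the whole kernel on CPU: count the true mask entries, resize the output, and copy the selected elements in input order. An equivalent host-side reference over plain containers, with no DenseTensor involved:

#include <cstddef>
#include <vector>

template <typename T>
std::vector<T> MaskedSelectRef(const std::vector<T>& x,
                               const std::vector<bool>& mask) {
  std::vector<T> out;
  for (std::size_t i = 0; i < x.size() && i < mask.size(); ++i) {
    if (mask[i]) out.push_back(x[i]);
  }
  return out;
}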
......@@ -42,12 +42,12 @@ inline void GetBroadcastDimsArrays(const DDim &x_dims,
PADDLE_ENFORCE_GE(
axis,
0,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"Axis should be great than or equal to 0, but received axis is %d.",
axis));
PADDLE_ENFORCE_LT(axis,
max_dim,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"Axis should be less than %d, but received axis is %d.",
max_dim,
axis));
......@@ -72,7 +72,7 @@ inline void GetBroadcastDimsArrays(const DDim &x_dims,
x_dims_array[i] == y_dims_array[i] || x_dims_array[i] <= 1 ||
y_dims_array[i] <= 1,
true,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"Broadcast dimension mismatch. Operands could "
"not be broadcast together with the shape of X = [%s] and "
"the shape of Y = [%s]. Received [%d] in X is not equal to "
......
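GetBroadcastDimsArrays enforces the rule quoted in the message above: after aligning Y at `axis`, each dimension pair must match or contain a 1, and the output takes the larger value. A trimmed sketch assuming x is the higher-rank operand and axis >= 0 (the real helper handles either operand being longer):

#include <algorithm>
#include <cstddef>
#include <stdexcept>
#include <vector>

inline std::vector<int> BroadcastDims(const std::vector<int>& x,
                                      const std::vector<int>& y,
                                      int axis) {
  std::vector<int> out(x.size(), 1);
  for (std::size_t i = 0; i < x.size(); ++i) {
    // y only contributes inside the window [axis, axis + y.size()).
    const bool in_window =
        static_cast<int>(i) >= axis && i < axis + y.size();
    const int yd = in_window ? y[i - axis] : 1;
    if (x[i] != yd && x[i] != 1 && yd != 1) {
      throw std::invalid_argument("Broadcast dimension mismatch.");
    }
    out[i] = std::max(x[i], yd);
  }
  return out;
}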
......@@ -23,7 +23,7 @@ static inline int64_t ComputeAxis(int64_t axis, int64_t rank) {
PADDLE_ENFORCE_EQ(
axis >= -rank && axis < rank,
true,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"The axis is expected to be in range of [%d, %d), but got %d",
-rank,
rank,
......@@ -42,17 +42,17 @@ static inline phi::DDim ComputeAndCheckShape(
auto out_dims = inputs_dims[0];
size_t in_zero_dims_size = out_dims.size();
for (size_t i = 1; i < n; i++) {
PADDLE_ENFORCE_EQ(inputs_dims[i].size(),
out_dims.size(),
paddle::platform::errors::InvalidArgument(
"The shape of input[0] and input[%d] "
"is expected to be equal."
"But received input[0]'s shape = "
"[%s], input[%d]'s shape = [%s].",
i,
inputs_dims[0],
i,
inputs_dims[i]));
PADDLE_ENFORCE_EQ(
inputs_dims[i].size(),
out_dims.size(),
phi::errors::InvalidArgument("The shape of input[0] and input[%d] "
"is expected to be equal."
"But received input[0]'s shape = "
"[%s], input[%d]'s shape = [%s].",
i,
inputs_dims[0],
i,
inputs_dims[i]));
for (size_t j = 0; j < in_zero_dims_size; j++) {
if (j == axis) {
if (is_runtime) {
......@@ -71,7 +71,7 @@ static inline phi::DDim ComputeAndCheckShape(
// check all shape in run time
PADDLE_ENFORCE_EQ(inputs_dims[0][j],
inputs_dims[i][j],
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"The %d-th dimension of input[0] and input[%d] "
"is expected to be equal."
"But received input[0]'s shape = "
......@@ -92,4 +92,4 @@ static inline phi::DDim ComputeAndCheckShape(
}
} // namespace funcs
} // namespace pten
} // namespace phi
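ComputeAxis above is the whole negative-axis convention in two lines; restated here as a standalone helper for reference:

#include <cstdint>
#include <stdexcept>

inline int64_t NormalizeAxis(int64_t axis, int64_t rank) {
  if (axis < -rank || axis >= rank) {
    throw std::invalid_argument("The axis is expected to be in [-rank, rank).");
  }
  return axis < 0 ? axis + rank : axis;  // e.g. axis -1 maps to rank - 1
}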
......@@ -21,7 +21,7 @@ limitations under the License. */
namespace phi {
// EigenDim converts paddle::platform::DDim into Eigen::DSizes.
// EigenDim converts phi::DDim into Eigen::DSizes.
template <int D>
struct EigenDim {
using Type = Eigen::DSizes<Eigen::DenseIndex, D>;
......@@ -29,7 +29,7 @@ struct EigenDim {
static Type From(const DDim& dims) {
PADDLE_ENFORCE_EQ(arity(dims),
D,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"Input dimension size should be equal to %d, but "
"received dimension size is %d.",
arity(dims),
......@@ -42,7 +42,7 @@ struct EigenDim {
}
};
// Interpret paddle::platform::Tensor as EigenTensor and EigenConstTensor.
// Interpret phi::Tensor as EigenTensor and EigenConstTensor.
template <typename T,
size_t D,
int MajorType = Eigen::RowMajor,
......@@ -86,7 +86,7 @@ struct EigenMatrix : public EigenTensor<T, 2, MajorType, IndexType> {
int rank = tensor.dims().size();
PADDLE_ENFORCE_EQ((num_col_dims > 0 && num_col_dims < rank),
true,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"Input dimension number(num_col_dims) must be "
"between 0 and %d, but received number is %d.",
rank,
......@@ -100,7 +100,7 @@ struct EigenMatrix : public EigenTensor<T, 2, MajorType, IndexType> {
int rank = tensor.dims().size();
PADDLE_ENFORCE_EQ((num_col_dims > 0 && num_col_dims < rank),
true,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"Input dimension number(num_col_dims) must be "
"between 0 and %d, but received number is %d.",
rank,
......
......@@ -343,7 +343,7 @@ inline void get_mid_dims(const DDim &x_dims,
if (x_dims[i + axis] != y_dims[i]) {
PADDLE_ENFORCE_EQ(y_dims[i] == 1 || x_dims[i + axis] == 1,
true,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"Broadcast dimension mismatch. Operands "
"could not be broadcast together with the shape of "
"X = [%s] and the shape of Y = [%s]. Received [%d] "
......@@ -754,7 +754,7 @@ void ElementwiseKernel(const KPDevice &ctx,
const int kArity = Traits::arity;
PADDLE_ENFORCE_EQ(ins.size(),
kArity,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"The number of inputs is expected to be equal to the "
"arity of functor. But recieved: the number of inputs "
"is %d, the arity of functor is %d.",
......@@ -762,7 +762,7 @@ void ElementwiseKernel(const KPDevice &ctx,
kArity));
PADDLE_ENFORCE_EQ(outs->size(),
NumOuts,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"Number of outputs shall equal to number of functions, "
"but number of outputs is %d, of functions is %d.",
outs->size(),
......@@ -773,7 +773,7 @@ void ElementwiseKernel(const KPDevice &ctx,
PADDLE_ENFORCE_EQ(
(*outs)[i]->dims(),
(*outs)[0]->dims(),
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"The shape of each output tensor shall be identical yet, "
"but %dth output tensor`s shape is not.",
i));
......@@ -796,7 +796,7 @@ void ElementwiseKernel(const KPDevice &ctx,
ctx, ins, outs, func);
break;
default: {
PADDLE_THROW(paddle::platform::errors::Unimplemented(
PADDLE_THROW(phi::errors::Unimplemented(
"Unsupported vectorized size: %d !", vec_size));
break;
}
......
......@@ -184,7 +184,7 @@ struct TensorSetConstantCPU {
: tensor_(tensor), value_(value) {}
template <typename T>
void apply() const {
auto cpu = paddle::platform::CPUPlace();
auto cpu = phi::CPUPlace();
auto* begin = tensor_->mutable_data<T>(cpu);
std::fill(begin, begin + tensor_->numel(), static_cast<T>(value_));
}
......@@ -197,8 +197,7 @@ void set_constant_with_place<paddle::platform::XPUPlace>(
const paddle::platform::DeviceContext& context,
paddle::framework::Tensor* tensor,
float value) {
PADDLE_THROW(
paddle::platform::errors::Unimplemented("XPUPlace is not supported"));
PADDLE_THROW(phi::errors::Unimplemented("XPUPlace is not supported"));
}
template <>
......@@ -206,8 +205,7 @@ void set_constant_with_place<paddle::platform::NPUPlace>(
const paddle::platform::DeviceContext& context,
paddle::framework::Tensor* tensor,
float value) {
PADDLE_THROW(
paddle::platform::errors::Unimplemented("NPUPlace is not supported"));
PADDLE_THROW(phi::errors::Unimplemented("NPUPlace is not supported"));
}
template <>
......@@ -215,8 +213,7 @@ void set_constant_with_place<paddle::platform::NPUPinnedPlace>(
const paddle::platform::DeviceContext& context,
paddle::framework::Tensor* tensor,
float value) {
PADDLE_THROW(paddle::platform::errors::Unimplemented(
"NPUPinnedPlace is not supported"));
PADDLE_THROW(phi::errors::Unimplemented("NPUPinnedPlace is not supported"));
}
template <>
......@@ -224,8 +221,7 @@ void set_constant_with_place<paddle::platform::IPUPlace>(
const paddle::platform::DeviceContext& context,
paddle::framework::Tensor* tensor,
float value) {
PADDLE_THROW(
paddle::platform::errors::Unimplemented("IPUPlace is not supported"));
PADDLE_THROW(phi::errors::Unimplemented("IPUPlace is not supported"));
}
template <>
......@@ -233,12 +229,11 @@ void set_constant_with_place<paddle::platform::CustomPlace>(
const paddle::platform::DeviceContext& context,
paddle::framework::Tensor* tensor,
float value) {
PADDLE_THROW(
paddle::platform::errors::Unimplemented("CustomPlace is not supported"));
PADDLE_THROW(phi::errors::Unimplemented("CustomPlace is not supported"));
}
template <>
void set_constant_with_place<paddle::platform::CPUPlace>(
void set_constant_with_place<phi::CPUPlace>(
const paddle::platform::DeviceContext& context,
paddle::framework::Tensor* tensor,
float value) {
......@@ -250,8 +245,7 @@ void set_constant_with_place<paddle::platform::MLUPlace>(
const paddle::platform::DeviceContext& context,
paddle::framework::Tensor* tensor,
float value) {
PADDLE_THROW(
paddle::platform::errors::Unimplemented("MLUPlace is not supported"));
PADDLE_THROW(phi::errors::Unimplemented("MLUPlace is not supported"));
}
template <>
......@@ -286,7 +280,7 @@ void set_constant(const paddle::platform::DeviceContext& context,
// tensor->place().apply_visitor(func);
paddle::platform::VisitPlace(tensor->place(), func);
#else
func(paddle::platform::CPUPlace());
func(phi::CPUPlace());
#endif
}
......@@ -302,7 +296,7 @@ struct RowwiseAdd<paddle::platform::CPUDeviceContext, T> {
PADDLE_ENFORCE_EQ(
vector.numel(),
size,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"The input vector size"
" should be equal to the size of each row of input tensor."
" Expected vector size=%d, but received %d",
......@@ -312,7 +306,7 @@ struct RowwiseAdd<paddle::platform::CPUDeviceContext, T> {
const char* out_dims_cstr = out_dims.to_str().c_str();
PADDLE_ENFORCE_EQ(out_dims,
in_dims,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"The output tensor shape should be same as the input"
" tensor shape. Expected output tensor shape: %s,"
" but received %s",
......
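For the RowwiseAdd hunks above, the computation being guarded is simply adding one bias vector to every row of a row-major matrix; the two enforced preconditions (vector size equals the row width, output shape equals input shape) are exactly what this reference loop relies on. Sketch over plain buffers, float only for brevity:

#include <cstdint>
#include <vector>

inline void RowwiseAddRef(const std::vector<float>& in,   // rows x width
                          const std::vector<float>& vec,  // width
                          int64_t rows, int64_t width,
                          std::vector<float>* out) {      // rows x width
  out->resize(rows * width);
  for (int64_t r = 0; r < rows; ++r) {
    for (int64_t c = 0; c < width; ++c) {
      (*out)[r * width + c] = in[r * width + c] + vec[c];
    }
  }
}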
......@@ -257,7 +257,7 @@ struct RowwiseAdd<paddle::platform::CUDADeviceContext, T> {
PADDLE_ENFORCE_EQ(
vector.numel(),
size,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"The input vector size"
" should be equal to the size of each row of input tensor."
" Expected vector size=%d, but received %d",
......@@ -268,7 +268,7 @@ struct RowwiseAdd<paddle::platform::CUDADeviceContext, T> {
PADDLE_ENFORCE_EQ(
out_dims,
in_dims,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"The output tensor shape should be same as the input tensor"
" shape. Expected output tensor shape: %s,"
" but received %s",
......@@ -303,7 +303,7 @@ void ColwiseSum<paddle::platform::CUDADeviceContext, double>::operator()(
auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(vector->numel(),
size,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"The size of input vector"
" should be equal to the size of input tensor column"
" dimension. Expected vector size=%d, but received %d",
......@@ -339,7 +339,7 @@ void RowwiseSum<paddle::platform::CUDADeviceContext, double>::operator()(
auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(vector->numel(),
in_dims[0],
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"The size of input vector"
" should be equal to the size of input tensor row"
" dimension. Expected vector size=%d, but received %d",
......
......@@ -115,7 +115,7 @@ struct TensorSetConstantXPU {
std::fill(data_cpu.get(), data_cpu.get() + numel, static_cast<T>(value_));
paddle::memory::Copy(place_,
begin,
paddle::platform::CPUPlace(),
phi::CPUPlace(),
static_cast<void*>(data_cpu.get()),
numel * sizeof(T));
}
......
......@@ -74,7 +74,7 @@ void ColwiseSum<DeviceContext, T>::operator()(
auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(out->numel(),
size,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"The size of output tensor "
"should be equal to the size of input tensor column"
" dimension. Expected output size=%d, but received %d",
......@@ -102,7 +102,7 @@ class ColwiseSum<paddle::platform::CPUDeviceContext, T> {
PADDLE_ENFORCE_EQ(
out->numel(),
size,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"The size of output tensor "
"should be equal to the size of input tensor column"
" dimension. Expected output size=%d, but received %d",
......@@ -130,15 +130,14 @@ void RowwiseMean<DeviceContext, T>::operator()(
const paddle::framework::Tensor& input,
paddle::framework::Tensor* out) {
auto in_dims = input.dims();
PADDLE_ENFORCE_EQ(
in_dims.size(),
2U,
paddle::platform::errors::InvalidArgument("The rank of input tensor "
"should be 2, but received %d",
in_dims.size()));
PADDLE_ENFORCE_EQ(in_dims.size(),
2U,
phi::errors::InvalidArgument("The rank of input tensor "
"should be 2, but received %d",
in_dims.size()));
PADDLE_ENFORCE_EQ(out->numel(),
in_dims[0],
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"The size of output tensor "
"should be equal to the size of input tensor row"
" dimension. Expected output size=%d, but received %d",
......@@ -161,18 +160,18 @@ class RowwiseMean<paddle::platform::CPUDeviceContext, T> {
const paddle::framework::Tensor& input,
paddle::framework::Tensor* out) {
auto& in_dims = input.dims();
PADDLE_ENFORCE_EQ(in_dims.size(),
2U,
paddle::platform::errors::InvalidArgument(
"The rank of input tensor "
"should be 2, but received %d",
in_dims.size()));
PADDLE_ENFORCE_EQ(
in_dims.size(),
2U,
phi::errors::InvalidArgument("The rank of input tensor "
"should be 2, but received %d",
in_dims.size()));
auto height = in_dims[0];
auto size = in_dims[1];
PADDLE_ENFORCE_EQ(
out->numel(),
height,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"The size of output tensor "
"should be equal to the size of input tensor row"
" dimension. Expected output size=%d, but received %d",
......@@ -198,15 +197,14 @@ void RowwiseSum<DeviceContext, T>::operator()(
const paddle::framework::Tensor& input,
paddle::framework::Tensor* out) {
auto in_dims = input.dims();
PADDLE_ENFORCE_EQ(
in_dims.size(),
2U,
paddle::platform::errors::InvalidArgument("The rank of input tensor "
"should be 2, but received %d",
in_dims.size()));
PADDLE_ENFORCE_EQ(in_dims.size(),
2U,
phi::errors::InvalidArgument("The rank of input tensor "
"should be 2, but received %d",
in_dims.size()));
PADDLE_ENFORCE_EQ(out->numel(),
in_dims[0],
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"The size of output tensor "
"should be equal to the size of input tensor row"
" dimension. Expected output size=%d, but received %d",
......@@ -229,18 +227,18 @@ class RowwiseSum<paddle::platform::CPUDeviceContext, T> {
const paddle::framework::Tensor& input,
paddle::framework::Tensor* out) {
auto& in_dims = input.dims();
PADDLE_ENFORCE_EQ(in_dims.size(),
2U,
paddle::platform::errors::InvalidArgument(
"The rank of input tensor "
"should be 2, but received %d",
in_dims.size()));
PADDLE_ENFORCE_EQ(
in_dims.size(),
2U,
phi::errors::InvalidArgument("The rank of input tensor "
"should be 2, but received %d",
in_dims.size()));
auto height = in_dims[0];
auto size = in_dims[1];
PADDLE_ENFORCE_EQ(
out->numel(),
height,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"The size of output tensor "
"should be equal to the size of input tensor row"
" dimension. Expected output size=%d, but received %d",
......
......@@ -16,7 +16,6 @@
#include <algorithm>
#include <vector>
#include "gflags/gflags.h"
#include "paddle/fluid/framework/mixed_vector.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
......@@ -329,7 +328,7 @@ void ConcatImpl(const Context& context,
inputs_data, in_num);
paddle::memory::Copy(context.GetPlace(),
tmp_dev_ins_data->ptr(),
paddle::platform::CPUPlace(),
phi::CPUPlace(),
restored,
in_num * sizeof(T*),
context.stream());
......@@ -376,7 +375,7 @@ void ConcatImpl(const Context& context,
inputs_col, inputs_col_num);
paddle::memory::Copy(context.GetPlace(),
tmp_dev_ins_col_data->ptr(),
paddle::platform::CPUPlace(),
phi::CPUPlace(),
restored,
inputs_col_num * sizeof(int64_t),
context.stream());
......@@ -488,7 +487,7 @@ void SplitImpl(const Context& context,
outputs_data, o_num);
paddle::memory::Copy(context.GetPlace(),
tmp_dev_outs_data->ptr(),
paddle::platform::CPUPlace(),
phi::CPUPlace(),
restored,
o_num * sizeof(T*),
context.stream());
......@@ -535,7 +534,7 @@ void SplitImpl(const Context& context,
outputs_cols, outputs_cols_num);
paddle::memory::Copy(context.GetPlace(),
tmp_dev_ins_col_data->ptr(),
paddle::platform::CPUPlace(),
phi::CPUPlace(),
restored,
outputs_cols_num * sizeof(int64_t),
context.stream());
......
......@@ -54,7 +54,7 @@ void ConcatKernel(const Context& dev_ctx,
PADDLE_ENFORCE_EQ(
x[i].lod().size(),
lod_size_0,
paddle::platform::errors::Unimplemented(
phi::errors::Unimplemented(
"The lod level of all input LoDTensors should be same. "
"Maybe different lod level of input LoDTensors can concat,"
"it is not supported currently. The lod level of %dth input "
......
......@@ -35,7 +35,7 @@ void Copy(const Context& dev_ctx,
auto dst_place = dst->place();
if (src_place == dst_place && paddle::platform::is_cpu_place(src_place)) {
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
PADDLE_THROW(phi::errors::InvalidArgument(
"The src and dst tensor are all CPU tensor, you should call copy "
"function in CPU mode."));
}
......@@ -74,13 +74,13 @@ void Copy(const Context& dev_ctx,
PADDLE_ENFORCE_EQ(
paddle::platform::is_gpu_place(ctx_place),
true,
paddle::platform::errors::PreconditionNotMet(
phi::errors::PreconditionNotMet(
"Context place error, excepted GPUPlace, but actually %s.",
ctx_place));
auto ctx_gpu_place = ctx_place;
PADDLE_ENFORCE_EQ(src_gpu_place,
ctx_gpu_place,
paddle::platform::errors::Unavailable(
phi::errors::Unavailable(
"Source place and context place do not match, source "
"place is %s, context place is %s.",
src_gpu_place,
......@@ -98,13 +98,13 @@ void Copy(const Context& dev_ctx,
PADDLE_ENFORCE_EQ(
paddle::platform::is_gpu_place(ctx_place),
true,
paddle::platform::errors::PreconditionNotMet(
phi::errors::PreconditionNotMet(
"Context place error, excepted GPUPlace, but actually %s.",
ctx_place));
auto ctx_gpu_place = ctx_place;
PADDLE_ENFORCE_EQ(dst_gpu_place,
ctx_gpu_place,
paddle::platform::errors::Unavailable(
phi::errors::Unavailable(
"Destination place and context place do not match, "
"destination place is %s, context place is %s.",
dst_gpu_place,
......@@ -121,14 +121,14 @@ void Copy(const Context& dev_ctx,
auto ctx_place = dev_ctx.GetPlace();
PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(ctx_place),
true,
paddle::platform::errors::PreconditionNotMet(
phi::errors::PreconditionNotMet(
"Device context place mismatch. When copying Tensor "
"data from GPU memory to CUDA Pinned memory, current "
"device context place should be GPU."));
auto ctx_gpu_place = ctx_place;
PADDLE_ENFORCE_EQ(src_gpu_place,
ctx_gpu_place,
paddle::platform::errors::PreconditionNotMet(
phi::errors::PreconditionNotMet(
"The source GPU device and current device context do "
"not match. The source GPU device number is %d, but "
"device context GPU number is %d.",
......@@ -146,14 +146,14 @@ void Copy(const Context& dev_ctx,
auto ctx_place = dev_ctx.GetPlace();
PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(ctx_place),
true,
paddle::platform::errors::PreconditionNotMet(
phi::errors::PreconditionNotMet(
"Device context place mismatch. When copying Tensor "
"data from CUDA Pinned memory to GPU memory, current "
"device context place should be GPU."));
auto ctx_gpu_place = ctx_place;
PADDLE_ENFORCE_EQ(dst_gpu_place,
ctx_gpu_place,
paddle::platform::errors::PreconditionNotMet(
phi::errors::PreconditionNotMet(
"The target GPU device and current device context do "
"not match. The target GPU device number is %d, but "
"device context GPU number is %d.",
......@@ -172,7 +172,7 @@ void Copy(const Context& dev_ctx,
PADDLE_ENFORCE_EQ(
paddle::platform::is_gpu_place(ctx_place),
true,
paddle::platform::errors::PreconditionNotMet(
phi::errors::PreconditionNotMet(
"Context place error, excepted GPUPlace, but actually %s.",
ctx_place));
auto stream =
......@@ -195,12 +195,12 @@ void Copy(const Context& dev_ctx,
paddle::memory::Copy(
dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
} else {
PADDLE_THROW(paddle::platform::errors::Unavailable(
PADDLE_THROW(phi::errors::Unavailable(
"Context place dose not match the source and destination place."));
}
}
} else {
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
PADDLE_THROW(phi::errors::InvalidArgument(
"Place type error. Please check the place of src and dst Tensor."));
}
}
......
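
The long Copy hunk above mostly renames error namespaces, but the function's structure is a dispatch on the (source place, destination place) pair, with a precondition that the device context sits on the GPU side of the transfer. A condensed, standalone sketch of that dispatch; the enum, names, and return strings are illustrative, not phi's real types:

#include <stdexcept>
#include <string>

enum class Place { kCPU, kGPU, kGPUPinned };

std::string Name(Place p) {
  switch (p) {
    case Place::kCPU: return "CPU";
    case Place::kGPU: return "GPU";
    default: return "GPUPinned";
  }
}

// Chooses the copy route the way the kernel's if/else ladder does.
std::string ChooseCopyRoute(Place src, Place dst, Place ctx) {
  if (src == Place::kCPU && dst == Place::kCPU) {
    throw std::invalid_argument(
        "Both tensors are CPU tensors; call the CPU copy path instead.");
  }
  if (ctx != Place::kGPU) {
    throw std::runtime_error(
        "Context place error, expected GPUPlace, but actually " + Name(ctx) + ".");
  }
  if (src == Place::kGPU && dst == Place::kCPU) return "device-to-host copy";
  if (src == Place::kCPU && dst == Place::kGPU) return "host-to-device copy";
  if (src == Place::kGPU && dst == Place::kGPU) return "device-to-device copy";
  return "pinned-memory transfer";
}

int main() {
  return ChooseCopyRoute(Place::kCPU, Place::kGPU, Place::kGPU).empty() ? 1 : 0;
}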
......@@ -714,7 +714,7 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
DX_OP dx_op,
DY_OP dy_op) {
const auto gplace = ctx.GetPlace();
auto cplace = paddle::platform::CPUPlace();
auto cplace = phi::CPUPlace();
const T *x_data = x.data<T>();
const T *y_data = y.data<T>();
const Tout *out_data = out.data<Tout>();
......@@ -1339,12 +1339,12 @@ void ElemwiseGradComputeWithBroadcast(const GPUContext &ctx,
PADDLE_ENFORCE_GE(
axis,
0,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"Axis should be great than or equal to 0, but received axis is %d.",
axis));
PADDLE_ENFORCE_LT(axis,
max_dim,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"Axis should be less than %d, but received axis is %d.",
max_dim,
axis));
......
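
The ElemwiseGradComputeWithBroadcast hunk keeps the precondition 0 <= axis < max_dim. The axis says where the smaller operand's dimensions line up inside the larger one's; a small sketch of how the smaller shape gets padded with ones under that rule (plain C++, illustrative names):

#include <stdexcept>
#include <vector>

// Pads small_dims with 1s so it has big_rank dimensions, aligned at `axis`.
// Mirrors the broadcast rule guarded by the 0 <= axis < max_dim checks.
std::vector<int> AlignDimsAtAxis(const std::vector<int>& small_dims,
                                 int big_rank, int axis) {
  if (axis < 0 || axis >= big_rank) {
    throw std::invalid_argument("Axis must satisfy 0 <= axis < max_dim.");
  }
  if (axis + small_dims.size() > static_cast<size_t>(big_rank)) {
    throw std::invalid_argument("small_dims does not fit at this axis.");
  }
  std::vector<int> padded(big_rank, 1);
  for (size_t i = 0; i < small_dims.size(); ++i) {
    padded[axis + i] = small_dims[i];
  }
  return padded;
}

int main() {
  // x: [2, 3, 4, 5], y: [3, 4], axis = 1  ->  y is treated as [1, 3, 4, 1].
  std::vector<int> y = AlignDimsAtAxis({3, 4}, 4, 1);
  return (y == std::vector<int>{1, 3, 4, 1}) ? 0 : 1;
}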
......@@ -111,9 +111,9 @@ void HistogramKernel(const Context& dev_ctx,
DenseTensor input_min_cpu, input_max_cpu;
paddle::framework::TensorCopySync(
input_min_t, paddle::platform::CPUPlace(), &input_min_cpu);
input_min_t, phi::CPUPlace(), &input_min_cpu);
paddle::framework::TensorCopySync(
input_max_t, paddle::platform::CPUPlace(), &input_max_cpu);
input_max_t, phi::CPUPlace(), &input_max_cpu);
output_min = input_min_cpu.data<T>()[0];
output_max = input_max_cpu.data<T>()[0];
......
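
The HistogramKernel hunk copies the device-side min/max reductions back to the host (TensorCopySync to phi::CPUPlace) before sizing the bins. As a reference for what the kernel computes, here is a plain CPU sketch of the bucketing, assuming `bins` equal-width intervals over [min, max]; it is illustrative, not the CUDA implementation:

#include <algorithm>
#include <cstdint>
#include <vector>

// CPU reference: count how many values fall into each of `bins` equal-width
// buckets spanning [output_min, output_max]; the last bucket is closed.
std::vector<int64_t> Histogram(const std::vector<float>& input, int bins,
                               float output_min, float output_max) {
  std::vector<int64_t> hist(bins, 0);
  const float width = (output_max - output_min) / bins;
  for (float v : input) {
    if (v < output_min || v > output_max) continue;
    int idx = static_cast<int>((v - output_min) / width);
    idx = std::min(idx, bins - 1);  // put v == output_max into the last bucket
    ++hist[idx];
  }
  return hist;
}

int main() {
  auto h = Histogram({0.f, 0.5f, 1.f, 2.f, 3.f}, 3, 0.f, 3.f);
  return (h == std::vector<int64_t>{2, 1, 2}) ? 0 : 1;
}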
......@@ -59,7 +59,7 @@ void FullLikeKernel(const Context& dev_ctx,
(common_type_value <=
static_cast<CommonType>(std::numeric_limits<T>::max())),
true,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"The filled value is out of range for target type, "
"current kernel type is %s, the range should between %f "
"and %f, but now value is %f.",
......
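
The FullLikeKernel hunk keeps the guard that the fill value, promoted to a common type, must lie inside the target dtype's representable range before the cast. A standalone sketch of that guard, assuming the common type is double and the target is int8_t (the helper name is illustrative):

#include <cstdint>
#include <limits>
#include <stdexcept>

// Rejects fill values that would overflow the target type after casting,
// mirroring the std::numeric_limits range check in FullLikeKernel.
template <typename T>
T CheckedFillValue(double value) {
  const double lo = static_cast<double>(std::numeric_limits<T>::lowest());
  const double hi = static_cast<double>(std::numeric_limits<T>::max());
  if (value < lo || value > hi) {
    throw std::invalid_argument(
        "The filled value is out of range for the target type.");
  }
  return static_cast<T>(value);
}

int main() {
  int8_t ok = CheckedFillValue<int8_t>(42.0);  // fine
  (void)ok;
  // CheckedFillValue<int8_t>(300.0);          // would throw: 300 > 127
  return 0;
}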
......@@ -38,7 +38,7 @@ static void GetBroadcastFromDims(const int x_ndim,
PADDLE_ENFORCE_EQ(
x_bd_dims[i] == y_bd_dims[i] || x_bd_dims[i] <= 1 || y_bd_dims[i] <= 1,
true,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"Input(X) and Input(Y) has error dim."
"X_broadcast's shape[%s] must be equal to Y_broadcast's shape[%s],"
"or X_broadcast's shape[%s] <= 1, or Y_broadcast's shape[%s] <= 1,"
......@@ -110,7 +110,7 @@ void MatMulFunction(const Context& dev_ctx,
PADDLE_ENFORCE_EQ(
M,
N,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"X's numbers must be equal to Y's numbers,"
"when X/Y's dims =1. But received X has [%d] elements,"
"received Y has [%d] elements",
......@@ -135,27 +135,27 @@ void MatMulFunction(const Context& dev_ctx,
if (x_ndim == 1) {
const int N = X.numel();
if (trans_y) {
PADDLE_ENFORCE_EQ(y_dims[y_ndim - 1],
N,
paddle::platform::errors::InvalidArgument(
"Input(Y) has error dim."
"Y'dims[%d] must be equal to %d"
"But received Y'dims[%d] is %d",
y_ndim - 1,
N,
y_ndim - 1,
y_dims[y_ndim - 1]));
PADDLE_ENFORCE_EQ(
y_dims[y_ndim - 1],
N,
phi::errors::InvalidArgument("Input(Y) has error dim."
"Y'dims[%d] must be equal to %d"
"But received Y'dims[%d] is %d",
y_ndim - 1,
N,
y_ndim - 1,
y_dims[y_ndim - 1]));
} else {
PADDLE_ENFORCE_EQ(y_dims[y_ndim - 2],
N,
paddle::platform::errors::InvalidArgument(
"Input(Y) has error dim."
"Y'dims[%d] must be equal to %d"
"But received Y'dims[%d] is %d",
y_ndim - 2,
N,
y_ndim - 2,
y_dims[y_ndim - 2]));
PADDLE_ENFORCE_EQ(
y_dims[y_ndim - 2],
N,
phi::errors::InvalidArgument("Input(Y) has error dim."
"Y'dims[%d] must be equal to %d"
"But received Y'dims[%d] is %d",
y_ndim - 2,
N,
y_ndim - 2,
y_dims[y_ndim - 2]));
}
std::vector<std::int64_t> out_dims(y_ndim - 1);
if (trans_y) {
......@@ -213,27 +213,27 @@ void MatMulFunction(const Context& dev_ctx,
if (y_ndim == 1) {
const int N = Y.numel();
if (trans_x) {
PADDLE_ENFORCE_EQ(x_dims[x_ndim - 2],
N,
paddle::platform::errors::InvalidArgument(
"Input(X) has error dim."
"X'dims[%d] must be equal to %d"
"But received X'dims[%d] is %d",
x_ndim - 2,
N,
x_ndim - 2,
x_dims[x_ndim - 2]));
PADDLE_ENFORCE_EQ(
x_dims[x_ndim - 2],
N,
phi::errors::InvalidArgument("Input(X) has error dim."
"X'dims[%d] must be equal to %d"
"But received X'dims[%d] is %d",
x_ndim - 2,
N,
x_ndim - 2,
x_dims[x_ndim - 2]));
} else {
PADDLE_ENFORCE_EQ(x_dims[x_ndim - 1],
N,
paddle::platform::errors::InvalidArgument(
"Input(X) has error dim."
"X'dims[%d] must be equal to %d"
"But received X'dims[%d] is %d",
x_ndim - 1,
N,
x_ndim - 1,
x_dims[x_ndim - 1]));
PADDLE_ENFORCE_EQ(
x_dims[x_ndim - 1],
N,
phi::errors::InvalidArgument("Input(X) has error dim."
"X'dims[%d] must be equal to %d"
"But received X'dims[%d] is %d",
x_ndim - 1,
N,
x_ndim - 1,
x_dims[x_ndim - 1]));
}
std::vector<std::int64_t> out_dims(x_ndim - 1);
if (trans_x) {
......@@ -292,27 +292,27 @@ void MatMulFunction(const Context& dev_ctx,
const int M = trans_x ? x_dims[x_ndim - 1] : x_dims[x_ndim - 2];
const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1];
if (trans_y) {
PADDLE_ENFORCE_EQ(y_dims[y_ndim - 1],
K,
paddle::platform::errors::InvalidArgument(
"Input(Y) has error dim."
"Y'dims[%d] must be equal to %d"
"But received Y'dims[%d] is %d",
y_ndim - 1,
K,
y_ndim - 1,
y_dims[y_ndim - 1]));
PADDLE_ENFORCE_EQ(
y_dims[y_ndim - 1],
K,
phi::errors::InvalidArgument("Input(Y) has error dim."
"Y'dims[%d] must be equal to %d"
"But received Y'dims[%d] is %d",
y_ndim - 1,
K,
y_ndim - 1,
y_dims[y_ndim - 1]));
} else {
PADDLE_ENFORCE_EQ(y_dims[y_ndim - 2],
K,
paddle::platform::errors::InvalidArgument(
"Input(Y) has error dim."
"Y'dims[%d] must be equal to %d"
"But received Y'dims[%d] is %d",
y_ndim - 2,
K,
y_ndim - 2,
y_dims[y_ndim - 2]));
PADDLE_ENFORCE_EQ(
y_dims[y_ndim - 2],
K,
phi::errors::InvalidArgument("Input(Y) has error dim."
"Y'dims[%d] must be equal to %d"
"But received Y'dims[%d] is %d",
y_ndim - 2,
K,
y_ndim - 2,
y_dims[y_ndim - 2]));
}
const int N = trans_y ? y_dims[y_ndim - 2] : y_dims[y_ndim - 1];
const int ndim = (std::max)(x_ndim, y_ndim);
......@@ -493,16 +493,16 @@ void MatmulKernel(const Context& dev_ctx,
bool transpose_x,
bool transpose_y,
DenseTensor* out) {
PADDLE_ENFORCE_NE(phi::product(x.dims()),
0,
paddle::platform::errors::InvalidArgument(
"The Input(X) dims size must not be equal 0,"
" but reviced dims size is 0. "));
PADDLE_ENFORCE_NE(phi::product(y.dims()),
0,
paddle::platform::errors::InvalidArgument(
"The Input(Y) dims size must not be equal 0,"
" but reviced dims size is 0. "));
PADDLE_ENFORCE_NE(
phi::product(x.dims()),
0,
phi::errors::InvalidArgument("The Input(X) dims size must not be equal 0,"
" but reviced dims size is 0. "));
PADDLE_ENFORCE_NE(
phi::product(y.dims()),
0,
phi::errors::InvalidArgument("The Input(Y) dims size must not be equal 0,"
" but reviced dims size is 0. "));
MatMulFunction<Context, T>(dev_ctx, x, y, out, transpose_x, transpose_y);
}
......
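
The MatMulFunction hunks reindent a series of dimension checks: when X is 1-D its length must match Y's second-to-last dim (last dim if Y is transposed), and otherwise M, K, N are read off the trailing two dims of each operand. A compact sketch of how M, K, N are derived and validated; it is plain C++ with illustrative names, assumes both ranks are at least 2, and omits the batch broadcasting of the leading dims:

#include <stdexcept>
#include <vector>

struct MatMulShape { int M, K, N; };

// Derives the GEMM sizes from the trailing dims of x and y, enforcing that
// the contracted dimensions agree, as the MatMulFunction checks do.
MatMulShape InferMatMulShape(const std::vector<int>& x_dims,
                             const std::vector<int>& y_dims,
                             bool trans_x, bool trans_y) {
  const int xr = static_cast<int>(x_dims.size());
  const int yr = static_cast<int>(y_dims.size());
  const int M = trans_x ? x_dims[xr - 1] : x_dims[xr - 2];
  const int K = trans_x ? x_dims[xr - 2] : x_dims[xr - 1];
  const int Ky = trans_y ? y_dims[yr - 1] : y_dims[yr - 2];
  const int N = trans_y ? y_dims[yr - 2] : y_dims[yr - 1];
  if (K != Ky) {
    throw std::invalid_argument(
        "Input(Y) has error dim: its contracted dimension must equal K of Input(X).");
  }
  return {M, K, N};
}

int main() {
  // x: [8, 2, 3] (trans_x = false), y: [4, 3] (trans_y = true) -> M=2, K=3, N=4.
  MatMulShape s = InferMatMulShape({8, 2, 3}, {4, 3}, false, true);
  return (s.M == 2 && s.K == 3 && s.N == 4) ? 0 : 1;
}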
......@@ -41,7 +41,7 @@ inline int64_t GetNonZeroNum(const DenseTensor& dense,
PADDLE_ENFORCE_GE(
dims.size(),
sparse_dim,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"sparse_dim(%d) should be less than or equal to dense.dim(%d)",
sparse_dim,
dims.size()));
......@@ -161,7 +161,7 @@ void SparseCooToCsrKernel(const Context& dev_ctx,
bool valid = x_dims.size() == 2 || x_dims.size() == 3;
PADDLE_ENFORCE_EQ(valid,
true,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"SparseCsrTensor only support 2-D or 3-D matrix"));
const int64_t non_zero_num = x.nnz();
if (non_zero_num <= 0) return;
......
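
The SparseCooToCsr hunks validate that the tensor is a 2-D or 3-D matrix before converting. The conversion itself turns per-element row indices into compressed row offsets; a CPU sketch for the 2-D case with sorted row indices (illustrative names, not the CUDA kernel):

#include <cstdint>
#include <vector>

// Builds CSR crows from sorted COO row indices for a 2-D matrix:
// crows[i+1] - crows[i] is the number of non-zeros in row i.
std::vector<int64_t> CooRowsToCsrCrows(const std::vector<int64_t>& coo_rows,
                                       int64_t num_rows) {
  std::vector<int64_t> crows(num_rows + 1, 0);
  for (int64_t r : coo_rows) {
    ++crows[r + 1];                 // count non-zeros per row
  }
  for (int64_t i = 0; i < num_rows; ++i) {
    crows[i + 1] += crows[i];       // prefix sum -> row offsets
  }
  return crows;
}

int main() {
  // Non-zeros in rows 0, 0, 2 of a 4-row matrix -> crows = {0, 2, 2, 3, 3}.
  auto crows = CooRowsToCsrCrows({0, 0, 2}, 4);
  return (crows == std::vector<int64_t>{0, 2, 2, 3, 3}) ? 0 : 1;
}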
......@@ -379,7 +379,7 @@ void SparseCooToCsrKernel(const Context& dev_ctx,
bool valid = x_dims.size() == 2 || x_dims.size() == 3;
PADDLE_ENFORCE_EQ(valid,
true,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"SparseCsrTensor only support 2-D or 3-D matrix"));
const int64_t non_zero_num = x.nnz();
if (non_zero_num <= 0) return;
......
......@@ -97,7 +97,7 @@ void DenseToSparseCsrKernel(const Context& dev_ctx,
bool valid = x_dims.size() == 2 || x_dims.size() == 3;
PADDLE_ENFORCE_EQ(valid,
true,
paddle::platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"SparseCsrTensor only support 2-D or 3-D Tensor."));
const int64_t sparse_dim = x_dims.size() == 2 ? 2 : 3;
DenseTensor indices = phi::Empty<T, Context>(dev_ctx);
......
......@@ -62,7 +62,7 @@ void Copy(const Context& dev_ctx,
}
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
} else {
PADDLE_THROW(paddle::platform::errors::Unimplemented(
PADDLE_THROW(phi::errors::Unimplemented(
"Copy from %s to %s is not supported.", src_place, dst_place));
}
}
......
......@@ -32,13 +32,13 @@ void ScaleKernel(const Context& dev_ctx,
DenseTensor* out) {
out->mutable_data<T>(dev_ctx.GetPlace());
PADDLE_ENFORCE_EQ(x.dims(),
out->dims(),
paddle::platform::errors::InvalidArgument(
"In and out should have the same dim,"
" expected %s, but got %s.",
x.dims().to_str().c_str(),
out->dims().to_str().c_str()));
PADDLE_ENFORCE_EQ(
x.dims(),
out->dims(),
phi::errors::InvalidArgument("In and out should have the same dim,"
" expected %s, but got %s.",
x.dims().to_str().c_str(),
out->dims().to_str().c_str()));
using XPUType = typename XPUTypeTrait<T>::Type;
int r = xpu::scale(dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(x.data<T>()),
......@@ -50,7 +50,7 @@ void ScaleKernel(const Context& dev_ctx,
PADDLE_ENFORCE_EQ(
r,
XPU_SUCCESS,
paddle::platform::errors::External(
phi::errors::External(
"XPU scale kernel return wrong value[%d %s]", r, XPUAPIErrorMsg[r]));
}
......
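
The XPU ScaleKernel hunk reformats two checks: the in/out dims must match, and the xpu::scale call must return XPU_SUCCESS. Functionally the scale op is assumed here to compute out = scale * x + bias (or scale * (x + bias) when the bias is applied before the scale); a plain CPU sketch of that contract, not the XPU call:

#include <vector>

// CPU reference for the scale op: out = scale * x + bias when
// bias_after_scale is true, otherwise out = scale * (x + bias).
std::vector<float> Scale(const std::vector<float>& x, float scale, float bias,
                         bool bias_after_scale) {
  std::vector<float> out(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    out[i] = bias_after_scale ? scale * x[i] + bias : scale * (x[i] + bias);
  }
  return out;
}

int main() {
  auto out = Scale({1.f, 2.f}, 3.f, 0.5f, /*bias_after_scale=*/true);
  return (out == std::vector<float>{3.5f, 6.5f}) ? 0 : 1;
}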
......@@ -29,8 +29,7 @@ class FancyAllocator : public phi::Allocator {
AllocationPtr Allocate(size_t bytes_size) override {
void* data = ::operator new(bytes_size);
auto* allocation =
new phi::Allocation(data, bytes_size, paddle::platform::CPUPlace());
auto* allocation = new phi::Allocation(data, bytes_size, phi::CPUPlace());
return AllocationPtr(allocation, Delete);
}
};
......
......@@ -85,7 +85,7 @@ TEST(dense_tensor, ctor) {
r = r && (t.dims() == m.dims);
r = r && (t.dtype() == m.dtype);
r = r && (t.layout() == m.layout);
r = r && (t.place() == paddle::platform::CPUPlace());
r = r && (t.place() == phi::CPUPlace());
r = r && t.initialized();
r = r && t.IsSharedWith(t);
return r;
......
......@@ -53,7 +53,7 @@ TEST(sparse_coo_tensor, construct) {
CHECK(sparse.dims() == dense_dims);
CHECK(sparse.dtype() == DataType::FLOAT32);
CHECK(sparse.layout() == DataLayout::SPARSE_COO);
CHECK(sparse.place() == paddle::platform::CPUPlace());
CHECK(sparse.place() == phi::CPUPlace());
}
TEST(sparse_coo_tensor, other_function) {
......
......@@ -133,6 +133,8 @@
#include <iostream>
#include <sstream>
#include "paddle/utils/string/to_string.h"
namespace paddle {
namespace string {
namespace tinyformat {
......
......@@ -56,5 +56,26 @@ inline std::string to_string(const char* v) {
return std::string(v);
}
inline std::ostream& operator<<(std::ostream& os,
const std::vector<std::vector<size_t>>& lod) {
os << "{";
for (auto& v : lod) {
os << "{";
bool is_first = true;
for (auto& i : v) {
if (is_first) {
os << i;
is_first = false;
} else {
os << ", " << i;
}
}
os << "}";
}
os << "}";
return os;
}
} // namespace string
} // namespace paddle
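
The new operator<< added to to_string.h prints a LoD as nested braces with no separator between the inner levels. Since it lives in namespace paddle::string, a caller outside that namespace has to pull the operator in explicitly; a usage sketch, assuming the Paddle headers are on the include path:

#include <iostream>
#include <vector>
#include "paddle/utils/string/to_string.h"

int main() {
  std::vector<std::vector<size_t>> lod = {{0, 2, 5}, {0, 1, 3, 5}};
  using paddle::string::operator<<;  // ADL will not find the operator otherwise
  std::cout << lod << std::endl;     // prints: {{0, 2, 5}{0, 1, 3, 5}}
  return 0;
}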