未验证 提交 dd990981 编写于 作者: C Chen Weihang 提交者: GitHub

[PTen] Tidy pten core headers (#39188)

* open header for custom kernel

* add core utils

* tidy core code

* tidy header

* tidy include

* tidy namespace

* resolve conflict

* fix unittest and coverage

* remove platform using

* resolve conflict

* resolve conflict

* fix digamma namespace error

* fix xpu full kernel error

* fix xpu full kernel error

* polish details

* add place for lib storage
上级 64e7c715
......@@ -21,7 +21,7 @@
#include "paddle/pten/api/all.h"
#include "paddle/pten/api/lib/api_declare.h"
#include "paddle/pten/api/lib/utils/tensor_utils.h"
#include "paddle/pten/core/convert_utils.h"
#include "paddle/pten/core/compat/convert_utils.h"
/**
* This class is used by Eager mode for now. It's painful to do this in Eager
* Mode, the better
......
......@@ -25,7 +25,7 @@ limitations under the License. */
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/pten/api/ext/op_kernel_info.h"
#include "paddle/pten/core/convert_utils.h"
#include "paddle/pten/core/compat/convert_utils.h"
#include "paddle/pten/core/kernel_context.h"
#include "paddle/pten/core/kernel_registry.h"
......
......@@ -37,7 +37,7 @@ limitations under the License. */
#include "paddle/pten/api/lib/api_declare.h"
#include "paddle/pten/api/lib/ext_compat_utils.h"
#include "paddle/pten/api/lib/utils/tensor_utils.h"
#include "paddle/pten/core/convert_utils.h"
#include "paddle/pten/core/compat/convert_utils.h"
#include "paddle/utils/any.h"
namespace paddle {
......
......@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/pten/core/dim.h"
#include "paddle/pten/core/utils/dim.h"
namespace paddle {
namespace framework {
......
......@@ -18,12 +18,12 @@ limitations under the License. */
#include "paddle/fluid/framework/pten_utils.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/pten/core/compat/arg_map_context.h"
#include "paddle/pten/core/compat/convert_utils.h"
#include "paddle/pten/core/compat/op_utils.h"
#include "paddle/pten/core/compat_utils.h"
#include "paddle/pten/core/convert_utils.h"
#include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/core/infermeta_utils.h"
#include "paddle/pten/core/meta_tensor.h"
#include "paddle/pten/core/tensor_utils.h"
namespace paddle {
namespace framework {
......@@ -126,8 +126,9 @@ class CompatMetaTensor : public pten::MetaTensor {
auto* var = BOOST_GET_CONST(Variable*, var_);
return var->Get<LoDTensor>().layout();
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported get layout for VarDesc now."));
// NOTE(chenweihang): do nothing
// Unsupported get layout for VarDesc now
return DataLayout::UNDEFINED;
}
}
......@@ -135,7 +136,7 @@ class CompatMetaTensor : public pten::MetaTensor {
if (is_runtime_) {
auto* var = BOOST_GET(Variable*, var_);
LoDTensor* tensor = var->GetMutable<LoDTensor>();
pten::CompatibleDenseTensorUtils::GetMutableMeta(
pten::DenseTensorUtils::GetMutableMeta(
static_cast<pten::DenseTensor*>(tensor))
->dims = dims;
} else {
......@@ -148,7 +149,7 @@ class CompatMetaTensor : public pten::MetaTensor {
if (is_runtime_) {
auto* var = BOOST_GET(Variable*, var_);
LoDTensor* tensor = var->GetMutable<LoDTensor>();
pten::CompatibleDenseTensorUtils::GetMutableMeta(
pten::DenseTensorUtils::GetMutableMeta(
static_cast<pten::DenseTensor*>(tensor))
->dtype = dtype;
} else {
......@@ -161,12 +162,12 @@ class CompatMetaTensor : public pten::MetaTensor {
if (is_runtime_) {
auto* var = BOOST_GET(Variable*, var_);
LoDTensor* tensor = var->GetMutable<LoDTensor>();
pten::CompatibleDenseTensorUtils::GetMutableMeta(
pten::DenseTensorUtils::GetMutableMeta(
static_cast<pten::DenseTensor*>(tensor))
->layout = layout;
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported set layout for VarDesc now."));
// NOTE(chenweihang): do nothing
// Unsupported set layout for VarDesc now
}
}
......@@ -174,7 +175,7 @@ class CompatMetaTensor : public pten::MetaTensor {
if (is_runtime_) {
auto* var = BOOST_GET(Variable*, var_);
LoDTensor* tensor = var->GetMutable<LoDTensor>();
pten::CompatibleDenseTensorUtils::GetMutableMeta(
pten::DenseTensorUtils::GetMutableMeta(
static_cast<pten::DenseTensor*>(tensor))
->lod =
static_cast<const CompatMetaTensor&>(meta_tensor).GetRuntimeLoD();
......
......@@ -15,8 +15,8 @@ limitations under the License. */
#include <sstream>
#include "paddle/fluid/framework/pten_utils.h"
#include "paddle/pten/core/compat/convert_utils.h"
#include "paddle/pten/core/compat/op_utils.h"
#include "paddle/pten/core/convert_utils.h"
#include "paddle/pten/core/kernel_factory.h"
#include "paddle/fluid/framework/lod_tensor.h"
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/tensor.h"
#include "paddle/pten/api/lib/utils/storage.h"
DECLARE_bool(use_stream_safe_cuda_allocator);
......
......@@ -22,7 +22,7 @@ limitations under the License. */
#include <unordered_set>
#include <vector>
#include "paddle/fluid/platform/variant.h"
#include "paddle/pten/core/type_defs.h"
#include "paddle/pten/core/compat/type_defs.h"
#include "paddle/utils/small_vector.h"
namespace paddle {
......
......@@ -13,4 +13,4 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/pten/core/type_defs.h"
#include "paddle/pten/core/compat/type_defs.h"
......@@ -15,7 +15,7 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/platform/complex.h"
#include "paddle/pten/core/array.h"
#include "paddle/pten/core/utils/array.h"
#include "paddle/pten/kernels/funcs/elementwise_functor.h"
namespace paddle {
......
......@@ -17,7 +17,7 @@
#include "paddle/fluid/operators/roll_op.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/pten/core/array.h"
#include "paddle/pten/core/utils/array.h"
namespace paddle {
namespace operators {
......
......@@ -24,7 +24,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/eager.h"
#include "paddle/fluid/pybind/eager_utils.h"
#include "paddle/pten/common/data_type.h"
#include "paddle/pten/core/convert_utils.h"
#include "paddle/pten/core/compat/convert_utils.h"
#include "paddle/pten/core/dense_tensor.h"
#include "pybind11/detail/internals.h"
#include "pybind11/numpy.h"
......
......@@ -32,7 +32,7 @@ limitations under the License. */
#include "paddle/pten/api/lib/utils/storage.h"
#include "paddle/pten/api/lib/utils/tensor_utils.h"
#include "paddle/pten/common/data_type.h"
#include "paddle/pten/core/convert_utils.h"
#include "paddle/pten/core/compat/convert_utils.h"
#include "paddle/pten/core/dense_tensor.h"
namespace paddle {
......
......@@ -29,7 +29,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/exception.h"
#include "paddle/pten/api/include/api.h"
#include "paddle/pten/common/data_type.h"
#include "paddle/pten/core/convert_utils.h"
#include "paddle/pten/core/compat/convert_utils.h"
#include "paddle/pten/core/dense_tensor.h"
namespace paddle {
namespace pybind {
......
......@@ -26,7 +26,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/eager_utils.h"
#include "paddle/fluid/pybind/exception.h"
#include "paddle/pten/common/data_type.h"
#include "paddle/pten/core/convert_utils.h"
#include "paddle/pten/core/compat/convert_utils.h"
#include "paddle/pten/core/dense_tensor.h"
#pragma GCC diagnostic ignored "-Wwrite-strings"
......
......@@ -24,7 +24,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/op_function_common.h"
#include "paddle/fluid/pybind/tensor_py.h"
#include "paddle/pten/common/data_type.h"
#include "paddle/pten/core/convert_utils.h"
#include "paddle/pten/core/compat/convert_utils.h"
#include "paddle/pten/core/dense_tensor.h"
namespace paddle {
......
......@@ -16,7 +16,7 @@ limitations under the License. */
#include "paddle/pten/api/include/tensor.h"
#include "paddle/pten/api/lib/utils/storage.h"
#include "paddle/pten/core/convert_utils.h"
#include "paddle/pten/core/compat/convert_utils.h"
#include "paddle/pten/core/dense_tensor.h"
namespace paddle {
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#include "paddle/pten/api/lib/kernel_dispatch.h"
#include "paddle/pten/core/convert_utils.h"
#include "paddle/pten/core/compat/convert_utils.h"
namespace paddle {
namespace experimental {
......
......@@ -23,11 +23,11 @@ limitations under the License. */
#include "paddle/pten/api/lib/ext_compat_utils.h"
#include "paddle/pten/api/lib/utils/allocator.h"
#include "paddle/pten/api/lib/utils/storage.h"
#include "paddle/pten/core/compat_utils.h"
#include "paddle/pten/core/convert_utils.h"
#include "paddle/pten/core/compat/convert_utils.h"
#include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/core/tensor_base.h"
#include "paddle/pten/core/tensor_meta.h"
#include "paddle/pten/core/tensor_utils.h"
/**
* [ Why still include the fluid headers? ]
......@@ -77,7 +77,7 @@ Tensor::Tensor(const PlaceType &place)
std::move(pten::make_intrusive<SharedStorage>(
ConvertExtPlaceToInnerPlace(place))),
std::move(pten::DenseTensorMeta(pten::DataType::UNDEFINED,
framework::make_ddim({}),
pten::framework::make_ddim({}),
pten::DataLayout::NCHW))))),
place_{place} {}
......@@ -86,7 +86,7 @@ Tensor::Tensor(const PlaceType &place, const std::vector<int64_t> &shape)
std::move(pten::make_intrusive<SharedStorage>(
ConvertExtPlaceToInnerPlace(place))),
std::move(pten::DenseTensorMeta(pten::DataType::UNDEFINED,
framework::make_ddim(shape),
pten::framework::make_ddim(shape),
pten::DataLayout::NCHW))))),
place_{place} {}
......@@ -113,7 +113,7 @@ void Tensor::reshape(const std::vector<int64_t> &shape) {
"the tensor to remain constant.";
if (is_dense_tensor()) {
std::dynamic_pointer_cast<pten::DenseTensor>(impl_)->set_meta(
pten::DenseTensorMeta(dtype(), framework::make_ddim(shape)));
pten::DenseTensorMeta(dtype(), pten::framework::make_ddim(shape)));
} else {
PADDLE_THROW(pten::errors::Unimplemented(
"Only support reshape operation on DenseTensor now."));
......@@ -270,7 +270,7 @@ Tensor::data<paddle::platform::float16>();
Tensor Tensor::slice(int64_t begin_idx, int64_t end_idx) const {
if (is_dense_tensor()) {
return Tensor(std::make_shared<pten::DenseTensor>(
std::move(pten::CompatibleDenseTensorUtils::Slice(
std::move(pten::DenseTensorUtils::Slice(
*(std::dynamic_pointer_cast<pten::DenseTensor>(impl_).get()),
begin_idx,
end_idx))));
......
......@@ -22,7 +22,7 @@ namespace experimental {
class ExternalStorage : public pten::Storage {
public:
ExternalStorage(void* ptr, size_t size, const paddle::platform::Place& place);
ExternalStorage(void* ptr, size_t size, const pten::Place& place);
ExternalStorage(const pten::intrusive_ptr<pten::Storage>& root,
size_t delta,
size_t size);
......@@ -52,7 +52,7 @@ class ExternalStorage : public pten::Storage {
}
size_t size() const noexcept override { return size_; }
const paddle::platform::Place& place() const override {
const pten::Place& place() const override {
PADDLE_ENFORCE_NOT_NULL(
data_,
paddle::platform::errors::Unavailable(
......@@ -78,9 +78,7 @@ class SharedStorage : public pten::Storage {
// In order to be compatible with the original Tensor design and execution
// system, we need to allow the uninitialized SharedStorage to exist,
// and it can be removed after the compatibility phase is over in the future
explicit SharedStorage(const paddle::platform::Place& place) {
place_ = place;
}
explicit SharedStorage(const pten::Place& place) { place_ = place; }
void Realloc(size_t n) override {
this->Clear();
......@@ -106,14 +104,14 @@ class SharedStorage : public pten::Storage {
std::shared_ptr<paddle::memory::Allocation>&& move_data_shared() override {
size_ = 0;
place_ = Place();
place_ = pten::Place();
return std::move(data_);
}
size_t size() const noexcept override {
return data_ ? data_->size() : size_;
}
const paddle::platform::Place& place() const override {
const pten::Place& place() const override {
return data_ ? data_->place() : place_;
}
bool OwnsMemory() const noexcept override { return false; }
......@@ -130,15 +128,13 @@ class SharedStorage : public pten::Storage {
}
// Temporary method: For compatible with fluid Tensor and improve performance
void ResetAllocationPlace(const paddle::platform::Place& place) {
place_ = place;
}
void ResetAllocationPlace(const pten::Place& place) { place_ = place; }
// Temporary method: For compatible with fluid Tensor and improve performance
void Reset() { this->Clear(); }
private:
Place place_;
pten::Place place_;
int64_t size_{0};
};
......
......@@ -17,7 +17,7 @@ limitations under the License. */
#include <utility>
#include <vector>
#include "paddle/pten/core/compat_utils.h"
#include "paddle/pten/core/tensor_utils.h"
namespace paddle {
namespace experimental {
......@@ -201,7 +201,7 @@ pten::ScalarArray MakePtenScalarArrayFromVarList(
void ResetTensorByArgDef(pten::DenseTensor* dst,
const pten::TensorArgDef& arg_def) {
VLOG(5) << "ResetTensor by TensorArgDef.";
auto* meta = pten::CompatibleDenseTensorUtils::GetMutableMeta(dst);
auto* meta = pten::DenseTensorUtils::GetMutableMeta(dst);
meta->dtype = arg_def.dtype;
meta->layout = arg_def.layout;
}
......
......@@ -23,7 +23,7 @@ limitations under the License. */
#include "paddle/pten/api/lib/utils/storage.h"
#include "paddle/pten/common/scalar.h"
#include "paddle/pten/common/scalar_array.h"
#include "paddle/pten/core/convert_utils.h"
#include "paddle/pten/core/compat/convert_utils.h"
#include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/core/kernel_factory.h"
......
# utils used for compatible for fluid op system
# compatible utils used for fluid op system
add_subdirectory(compat)
if(WITH_GPU)
cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place gpu_info)
elseif(WITH_ROCM)
cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place gpu_info)
else()
cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place)
endif()
cc_library(errors SRCS errors.cc)
set(pten_enforce_deps errors flags)
if(WITH_GPU)
......@@ -19,21 +11,20 @@ cc_library(pten_enforce INTERFACE SRCS enforce.cc DEPS ${pten_enforce_deps})
cc_library(kernel_factory SRCS kernel_factory.cc DEPS pten_enforce convert_utils)
cc_library(kernel_context SRCS kernel_context.cc DEPS pten_enforce pten_context)
cc_library(tensor_base SRCS tensor_base.cc allocator.cc storage.cc DEPS pten_enforce)
cc_library(ddim SRCS ddim.cc DEPS pten_enforce)
cc_library(tensor_base SRCS tensor_base.cc allocator.cc DEPS pten_enforce)
cc_library(tensor_meta SRCS tensor_meta.cc DEPS pten_enforce mixed_vector)
cc_library(lod_utils SRCS lod_utils.cc DEPS pten_enforce mixed_vector)
cc_library(pten_device_context SRCS device_context.cc DEPS tensor_base)
cc_library(dense_tensor SRCS dense_tensor.cc dense_tensor_impl.cc DEPS convert_utils tensor_meta tensor_base)
cc_library(sparse_coo_tensor SRCS sparse_coo_tensor.cc DEPS tensor_meta tensor_base)
cc_library(sparse_csr_tensor SRCS sparse_csr_tensor.cc DEPS dense_tensor tensor_base)
cc_library(dense_tensor SRCS dense_tensor.cc dense_tensor_impl.cc DEPS convert_utils tensor_meta tensor_base)
cc_library(pten_device_context SRCS device_context.cc DEPS tensor_base )
cc_library(meta_tensor SRCS meta_tensor.cc DEPS tensor_base tensor_meta dense_tensor)
cc_library(infermeta_utils SRCS infermeta_utils.cc DEPS meta_tensor)
cc_library(selected_rows SRCS selected_rows.cc DEPS dense_tensor mixed_vector pten_enforce ddim)
cc_test(unroll_array_ops_test SRCS unroll_array_ops_test.cc)
cc_library(ddim SRCS ddim.cc DEPS eigen3 boost enforce)
# Will remove once we implemented MKLDNN_Tensor
if(WITH_MKLDNN)
add_dependencies(dense_tensor mkldnn)
......
......@@ -16,7 +16,9 @@ limitations under the License. */
#include <cstdint>
#include <functional>
#include "paddle/fluid/platform/place.h"
#include <memory>
#include "paddle/pten/common/place.h"
namespace pten {
......@@ -26,7 +28,7 @@ namespace pten {
/// support being inherited.
class Allocation {
public:
using Place = paddle::platform::Place;
using Place = pten::Place;
using DeleterFnPtr = void (*)(Allocation*);
Allocation() = default;
......
cc_library(arg_map_context SRCS arg_map_context.cc DEPS pten_enforce)
if(WITH_GPU)
cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place gpu_info)
elseif(WITH_ROCM)
cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place gpu_info)
else()
cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place)
endif()
cc_library(op_utils SRCS op_utils.cc DEPS arg_map_context enforce convert_utils)
......@@ -14,8 +14,8 @@ limitations under the License. */
#include "paddle/pten/core/compat/arg_map_context.h"
#include "paddle/fluid/string/string_helper.h"
#include "paddle/pten/core/enforce.h"
#include "paddle/utils/string/string_helper.h"
namespace pten {
std::ostream& operator<<(std::ostream& os, KernelSignature signature) {
......
......@@ -11,8 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/core/convert_utils.h"
#include "paddle/pten/core/kernel_alias_name.h"
#include "paddle/pten/core/compat/convert_utils.h"
#include "paddle/pten/core/compat/kernel_alias_name.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
......@@ -126,26 +129,6 @@ paddle::framework::proto::VarType::Type TransToProtoVarType(
}
}
paddle::framework::LoD TransToFluidLoD(const pten::LoD& lod) {
paddle::framework::LoD out;
out.reserve(lod.size());
for (auto& elem : lod) {
out.emplace_back(elem);
}
return out;
}
pten::LoD TransToPtenLoD(const paddle::framework::LoD& lod) {
pten::LoD out;
out.reserve(lod.size());
for (auto& elem : lod) {
out.emplace_back(elem);
}
return out;
}
size_t DataTypeSize(DataType dtype) {
switch (dtype) {
case DataType::UNDEFINED:
......
......@@ -17,33 +17,26 @@ limitations under the License. */
#include "paddle/pten/common/backend.h"
#include "paddle/pten/common/data_type.h"
#include "paddle/pten/common/layout.h"
#include "paddle/pten/common/place.h"
#include "paddle/pten/core/tensor_meta.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/platform/place.h"
// TODO(chenweihang): this file may need to be removed
namespace pten {
using DataType = paddle::experimental::DataType;
using DataLayout = paddle::experimental::DataLayout;
const std::string& TransToPtenKernelName(const std::string& fluid_op_name);
Backend TransToPtenBackend(const paddle::platform::Place& place);
Backend TransToPtenBackend(const pten::Place& place);
DataType TransToPtenDataType(
const paddle::framework::proto::VarType::Type& dtype);
paddle::platform::Place TransToFluidPlace(const Backend& backend);
pten::Place TransToFluidPlace(const Backend& backend);
paddle::framework::proto::VarType::Type TransToProtoVarType(
const DataType& dtype);
paddle::framework::LoD TransToFluidLoD(const pten::LoD& lod);
pten::LoD TransToPtenLoD(const paddle::framework::LoD& lod);
size_t DataTypeSize(DataType dtype);
DataType String2DataType(const std::string& str);
std::string DataType2String(DataType dtype);
......
......@@ -17,13 +17,12 @@ limitations under the License. */
#include <mutex>
#include "paddle/pten/core/compat/arg_map_context.h"
#include "paddle/pten/core/enforce.h"
#include "paddle/pten/core/infermeta_utils.h"
#include "paddle/pten/core/kernel_def.h"
#include "paddle/pten/core/macros.h"
#include "paddle/pten/core/type_defs.h"
#include "paddle/utils/flat_hash_map.h"
#include "paddle/fluid/platform/enforce.h"
namespace pten {
class DefaultKernelSignatureMap {
......@@ -37,7 +36,7 @@ class DefaultKernelSignatureMap {
PADDLE_ENFORCE_NE(
it,
map_.end(),
paddle::platform::errors::NotFound(
pten::errors::NotFound(
"Operator `%s`'s kernel signature is not registered.", op_type));
return it->second;
}
......@@ -46,7 +45,7 @@ class DefaultKernelSignatureMap {
PADDLE_ENFORCE_NE(
Has(op_type),
true,
paddle::platform::errors::AlreadyExists(
pten::errors::AlreadyExists(
"Operator (%s)'s Kernel Siginature has been registered.", op_type));
map_.insert({std::move(op_type), std::move(signature)});
}
......@@ -71,7 +70,7 @@ class OpUtilsMap {
PADDLE_ENFORCE_EQ(
name_map_.count(op_type),
0UL,
paddle::platform::errors::AlreadyExists(
pten::errors::AlreadyExists(
"Operator (%s)'s api name has been registered.", op_type));
name_map_.insert({std::move(op_type), std::move(api_name)});
}
......@@ -80,7 +79,7 @@ class OpUtilsMap {
PADDLE_ENFORCE_EQ(
arg_mapping_fn_map_.count(op_type),
0UL,
paddle::platform::errors::AlreadyExists(
pten::errors::AlreadyExists(
"Operator (%s)'s argu,emt mapping function has been registered.",
op_type));
arg_mapping_fn_map_.insert({std::move(op_type), std::move(fn)});
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
// Shared type definitions for the fluid op attribute system and the
// imperative (dygraph) variable maps. Header-only: forward declarations
// and alias templates, no function definitions.
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include <boost/variant.hpp>
// Forward declaration so the imperative name->var maps below can be keyed
// on eager-mode tensors without depending on the eager headers.
namespace egr {
class EagerTensor;
}
namespace paddle {
namespace framework {
// The order should be as same as framework.proto
// NOTE(xiongkun): we extract from framework/typedef.h to ensure we can transfer
// enforce.h
class BlockDesc;
// Variant over every value type an operator attribute may hold; the
// alternative order must stay in sync with framework.proto (see note above).
using Attribute = boost::variant<boost::blank,
int,
float,
std::string,
std::vector<int>,
std::vector<float>,
std::vector<std::string>,
bool,
std::vector<bool>,
BlockDesc*,
int64_t,
std::vector<BlockDesc*>,
std::vector<int64_t>,
std::vector<double>>;
// Attribute-name -> attribute-value map carried by each op.
using AttributeMap = std::unordered_map<std::string, Attribute>;
} // namespace framework
namespace imperative {
// Forward declarations of the imperative-mode variable/op types; their
// full definitions live in paddle/fluid/imperative/.
class VariableWrapper;
class SavedVariableWrapperList;
class VarBase;
class OpBase;
class GradOpNode;
class Tracer;
// Non-owning (weak_ptr) variant of the name -> VarBase-list map.
using WeakNameVarBaseMap =
std::map<std::string, std::vector<std::weak_ptr<VarBase>>>;
namespace details {
// Trait mapping a variable type T to its name -> variable-container map
// type; specialized per supported variable kind below.
template <typename T>
struct NameVarMapTrait {};
template <>
struct NameVarMapTrait<VarBase> {
using Type = std::map<std::string, std::vector<std::shared_ptr<VarBase>>>;
};
template <>
struct NameVarMapTrait<VariableWrapper> {
// NOTE(review): uses SavedVariableWrapperList rather than a plain vector
// of shared_ptr — presumably to control ownership during grad saving.
using Type = std::map<std::string, SavedVariableWrapperList>;
};
template <>
struct NameVarMapTrait<egr::EagerTensor> {
using Type =
std::map<std::string, std::vector<std::shared_ptr<egr::EagerTensor>>>;
};
} // namespace details
// Generic alias: NameVarMap<T> resolves to the trait-selected map type.
template <typename T>
using NameVarMap = typename details::NameVarMapTrait<T>::Type;
using NameVarBaseMap = NameVarMap<VarBase>;
using NameVariableWrapperMap = NameVarMap<VariableWrapper>;
using NameTensorMap = NameVarMap<egr::EagerTensor>;
using VariableWrapperList = std::vector<std::shared_ptr<VariableWrapper>>;
} // namespace imperative
} // namespace paddle
......@@ -13,10 +13,10 @@
// limitations under the License.
#include "paddle/pten/core/ddim.h"
#include <set>
namespace pten {
namespace platform = paddle::platform;
namespace framework {
DDim make_ddim(std::initializer_list<int64_t> dims) {
......@@ -84,7 +84,7 @@ DDim slice_ddim(const DDim& dim, int begin, int end) {
PADDLE_ENFORCE_EQ(
(begin >= 0 && end <= dim.size()),
true,
platform::errors::InvalidArgument(
pten::errors::InvalidArgument(
"[begin(%d), end(%d)) must be inside [0, %d) in ddim slice.",
begin,
end,
......@@ -111,30 +111,30 @@ std::ostream& operator<<(std::ostream& os, const DDim& ddim) {
}
DDim flatten_to_3d(const DDim& src, int num_row_dims, int num_col_dims) {
PADDLE_ENFORCE_GE(src.size(),
3,
platform::errors::InvalidArgument(
"The rank of src dim should be at least 3 "
"in flatten_to_3d, but received %d.",
src.size()));
PADDLE_ENFORCE_EQ((num_row_dims >= 1 && num_row_dims < src.size()),
true,
platform::errors::InvalidArgument(
"The num_row_dims should be inside [1, %d] "
"in flatten_to_3d, but received %d.",
src.size() - 1,
num_row_dims));
PADDLE_ENFORCE_EQ((num_col_dims >= 2 && num_col_dims <= src.size()),
true,
platform::errors::InvalidArgument(
"The num_col_dims should be inside [2, %d] "
"in flatten_to_3d, but received %d.",
src.size(),
num_col_dims));
PADDLE_ENFORCE_GE(
src.size(),
3,
pten::errors::InvalidArgument("The rank of src dim should be at least 3 "
"in flatten_to_3d, but received %d.",
src.size()));
PADDLE_ENFORCE_EQ(
(num_row_dims >= 1 && num_row_dims < src.size()),
true,
pten::errors::InvalidArgument("The num_row_dims should be inside [1, %d] "
"in flatten_to_3d, but received %d.",
src.size() - 1,
num_row_dims));
PADDLE_ENFORCE_EQ(
(num_col_dims >= 2 && num_col_dims <= src.size()),
true,
pten::errors::InvalidArgument("The num_col_dims should be inside [2, %d] "
"in flatten_to_3d, but received %d.",
src.size(),
num_col_dims));
PADDLE_ENFORCE_GE(
num_col_dims,
num_row_dims,
platform::errors::InvalidArgument(
pten::errors::InvalidArgument(
"The num_row_dims should be less than num_col_dims in flatten_to_3d,"
"but received num_row_dims = %d, num_col_dims = %d.",
num_row_dims,
......@@ -181,7 +181,7 @@ DDim DDim::reshape(const std::vector<int>& shape) const {
if (shape[i] == copy_dim_val) {
PADDLE_ENFORCE_LT(static_cast<int>(i),
in_dims.size(),
platform::errors::InvalidArgument(
pten::errors::InvalidArgument(
"Index %d of shape under which the value of 0 "
"is stored, must be lower than the number of "
"old dimensions. But received shape[%d] = 0, "
......@@ -205,22 +205,22 @@ DDim DDim::transpose(const std::vector<int>& axis) const {
auto axis_set = std::set<int>(axis.begin(), axis.end());
PADDLE_ENFORCE_EQ(axis_set.size(),
axis_size,
platform::errors::InvalidArgument(
pten::errors::InvalidArgument(
"In an axis array, elements must be unique."));
PADDLE_ENFORCE_EQ(
in_rank,
axis_size,
platform::errors::InvalidArgument("The input dimension's size "
"should be equal to the axis's size. "
"But received dimension is %d, "
"axis's size is %d",
in_rank,
axis_size));
pten::errors::InvalidArgument("The input dimension's size "
"should be equal to the axis's size. "
"But received dimension is %d, "
"axis's size is %d",
in_rank,
axis_size));
PADDLE_ENFORCE_LT(*std::max_element(axis.begin(), axis.end()),
axis_size,
platform::errors::InvalidArgument(
pten::errors::InvalidArgument(
"Axis values must be ranging from 0 to (dims - 1)."));
DDim out_dims(in_dims);
......@@ -231,4 +231,4 @@ DDim DDim::transpose(const std::vector<int>& axis) const {
}
} // namespace framework
} // namespace pten
\ No newline at end of file
} // namespace pten
......@@ -17,10 +17,10 @@
#include <string>
#include <vector>
#include "paddle/pten/core/dim.h"
#include "paddle/pten/core/enforce.h"
#include "paddle/pten/core/utils/dim.h"
namespace pten {
namespace platform = paddle::platform;
namespace framework {
#define PADDLE_VISIT_DDIM_BASE(rank, callback) \
......@@ -42,7 +42,7 @@ namespace framework {
PADDLE_VISIT_DDIM_BASE(8, callback); \
PADDLE_VISIT_DDIM_BASE(9, callback); \
default: \
PADDLE_THROW(platform::errors::Unimplemented( \
PADDLE_THROW(pten::errors::Unimplemented( \
"Invalid dimension to be accessed. Now only supports access to " \
"dimension 0 to 9, but received dimension is %d.", \
rank)); \
......@@ -98,14 +98,14 @@ class DDim {
int64_t& at(int idx) {
PADDLE_ENFORCE_GE(idx,
0,
platform::errors::InvalidArgument(
pten::errors::InvalidArgument(
"Invalid DDim index to be accessed. The valid index "
"is between 0 and %d, but received index is %d.",
rank_,
idx));
PADDLE_ENFORCE_LT(idx,
rank_,
platform::errors::InvalidArgument(
pten::errors::InvalidArgument(
"Invalid DDim index to be accessed. The valid index "
"is between 0 and %d, but received index is %d.",
rank_,
......@@ -116,14 +116,14 @@ class DDim {
int64_t at(int idx) const {
PADDLE_ENFORCE_GE(idx,
0,
platform::errors::InvalidArgument(
pten::errors::InvalidArgument(
"Invalid DDim index to be accessed. The valid index "
"is between 0 and %d, but received index is %d.",
rank_,
idx));
PADDLE_ENFORCE_LT(idx,
rank_,
platform::errors::InvalidArgument(
pten::errors::InvalidArgument(
"Invalid DDim index to be accessed. The valid index "
"is between 0 and %d, but received index is %d.",
rank_,
......
......@@ -14,13 +14,13 @@ limitations under the License. */
#include "paddle/pten/core/dense_tensor.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/pten/common/bfloat16.h"
#include "paddle/pten/common/complex.h"
#include "paddle/pten/common/float16.h"
#include "paddle/pten/core/compat/convert_utils.h"
#include "paddle/pten/api/lib/utils/storage.h"
#include "paddle/pten/core/convert_utils.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/malloc.h"
namespace pten {
......
......@@ -14,15 +14,15 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/stream/stream.h"
#include "paddle/pten/core/allocator.h"
#include "paddle/pten/core/storage.h"
#include "paddle/pten/core/tensor_base.h"
#include "paddle/pten/core/tensor_meta.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/stream/stream.h"
/* @jim19930609: Move to MKLDNN_Tensor in the future
*/
#ifdef PADDLE_WITH_MKLDNN
......@@ -31,7 +31,7 @@ limitations under the License. */
namespace pten {
class CompatibleDenseTensorUtils;
class DenseTensorUtils;
/// \brief The Dense tensor store values in a contiguous sequential block
/// of memory where all values are represented. Tensors or multi-dimensional
......@@ -120,8 +120,8 @@ class DenseTensor : public TensorBase,
/// \return Whether the metadata is valid.
bool valid() const noexcept override { return meta_.valid(); }
/// \brief Test whether the storage is allocated.
/// return Whether the storage is allocated.
/// \brief Test whether the allocation is allocated.
/// return Whether the allocation is allocated.
bool initialized() const override { return holder_ && holder_->ptr(); }
/// \brief Allocate memory with requested size from allocator.
......@@ -130,12 +130,12 @@ class DenseTensor : public TensorBase,
DataType dtype,
size_t requested_size = 0) override;
/// \brief Check if storage is shared with other objects.
/// \return Whether the storage is shared with other objects.
/// \brief Check if allocation is shared with other objects.
/// \return Whether the allocation is shared with other objects.
bool IsSharedWith(const DenseTensor& b) const;
/// \brief Change the shape information in the metadata. If the new size is
/// larger than the original value, the storage area will be reallocated.
/// larger than the original value, the allocation area will be reallocated.
/// \param dims The new dims of the dense tensor.
/// \param lod The new lod of the dense tensor.
// void Resize(const DDim& dims);
......@@ -147,9 +147,10 @@ class DenseTensor : public TensorBase,
/// \param lod The new lod of the dense tensor.
void ResetLoD(const LoD& lod);
/// \brief Returns the actual storage size occupied by tensor, may be larger
/// \brief Returns the actual allocation size occupied by tensor, may be
/// larger
/// than its shape dims.
/// \return The actual storage size occupied by tensor.
/// \return The actual allocation size occupied by tensor.
size_t capacity() const { return holder_->size(); }
/// \brief Get the const data pointer value of type T.
......@@ -162,7 +163,7 @@ class DenseTensor : public TensorBase,
const void* data() const;
private:
friend class CompatibleDenseTensorUtils;
friend class DenseTensorUtils;
protected:
DenseTensorMeta meta_;
......
......@@ -14,13 +14,12 @@ limitations under the License. */
#include "paddle/pten/core/dense_tensor.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/pten/common/bfloat16.h"
#include "paddle/pten/common/complex.h"
#include "paddle/pten/common/float16.h"
#include "paddle/pten/api/lib/utils/storage.h"
#include "paddle/pten/core/convert_utils.h"
#include "paddle/pten/core/compat/convert_utils.h"
namespace pten {
/* --------------------------- */
......
......@@ -49,7 +49,7 @@ limitations under the License. */
#include "paddle/utils/string/to_string.h"
// Note: these headers for simplify demangle type string
#include "paddle/pten/core/type_defs.h"
#include "paddle/pten/core/compat/type_defs.h"
namespace pten {
class ErrorSummary;
......
......@@ -18,9 +18,9 @@ limitations under the License. */
#include <utility>
#include "paddle/pten/core/enforce.h"
#include "paddle/pten/core/kernel_def.h"
#include "paddle/pten/core/macros.h"
#include "paddle/pten/core/meta_tensor.h"
#include "paddle/pten/core/type_defs.h"
#include "paddle/utils/flat_hash_map.h"
#include "paddle/utils/small_vector.h"
......
......@@ -17,20 +17,16 @@
#include <iterator>
#include <utility>
#include "paddle/pten/core/compat_utils.h"
#include "paddle/pten/core/device_context.h"
#include "paddle/pten/core/enforce.h"
#include "paddle/pten/core/tensor_base.h"
#include "paddle/pten/core/tensor_utils.h"
#include "paddle/utils/any.h"
#include "paddle/utils/small_vector.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/platform/device_context.h"
#include "paddle/pten/core/enforce.h"
namespace pten {
using DeviceContext = paddle::platform::DeviceContext;
using DataType = paddle::experimental::DataType;
using DataLayout = paddle::experimental::DataLayout;
using DeviceContext = pten::DeviceContext;
/**
* Note: KernelContext doesn't manage the life if DeviceContext and Tensor
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Lightweight header of forward declarations and function-type aliases for
// the pten kernel machinery. Kept dependency-free (only <functional>) so it
// can be included widely without pulling in the full kernel headers.
#pragma once
#include <functional>
namespace pten {
// Forward declarations only: these types are defined in other pten/core
// headers; this header needs just their names to form the aliases below.
class Kernel;
class KernelKey;
class KernelArgsDef;
class KernelContext;
class KernelSignature;
class ArgumentMappingContext;
class InferMetaContext;
// A kernel's computation entry point; all inputs/outputs/attributes are
// carried by the per-call KernelContext.
using KernelFn = std::function<void(KernelContext* ctx)>;
// Fills in a Kernel's argument definitions (presumably invoked during kernel
// registration — confirm against the registry implementation).
using KernelArgsDefFn = void (*)(Kernel* kernel);
// Parses/derives a kernel's argument definitions from its default KernelKey,
// writing the result into *args_def.
using KernelArgsParseFn = void (*)(const KernelKey& default_key,
                                   KernelArgsDef* args_def);
// Maps an op's argument context to the matching kernel signature, used to
// bridge op definitions to pten kernels.
using ArgumentMappingFn =
    std::function<KernelSignature(const ArgumentMappingContext&)>;
// Infers output metadata (dims/dtype/layout) for a call, operating on the
// given InferMetaContext.
using InferMetaFn = void (*)(InferMetaContext* ctx);
} // namespace pten
......@@ -23,11 +23,9 @@
#include "paddle/pten/common/backend.h"
#include "paddle/pten/common/data_type.h"
#include "paddle/pten/common/layout.h"
#include "paddle/pten/core/convert_utils.h"
#include "paddle/pten/core/kernel_def.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/pten/core/compat/convert_utils.h"
#include "paddle/pten/core/enforce.h"
#include "paddle/pten/core/type_defs.h"
#include "paddle/utils/flat_hash_map.h"
#include "paddle/utils/small_vector.h"
......
......@@ -21,10 +21,10 @@
#include <typeinfo>
#include <vector>
#include "paddle/pten/core/kernel_def.h"
#include "paddle/pten/core/kernel_factory.h"
#include "paddle/pten/core/kernel_utils.h"
#include "paddle/pten/core/macros.h"
#include "paddle/pten/core/type_defs.h"
#include "paddle/pten/core/enforce.h"
......
......@@ -18,14 +18,12 @@
#include "paddle/pten/common/scalar.h"
#include "paddle/pten/common/scalar_array.h"
#include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/core/enforce.h"
#include "paddle/pten/core/kernel_context.h"
#include "paddle/pten/core/kernel_def.h"
#include "paddle/pten/core/selected_rows.h"
#include "paddle/pten/core/sparse_coo_tensor.h"
#include "paddle/pten/core/sparse_csr_tensor.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/pten/core/enforce.h"
#include "paddle/pten/core/type_defs.h"
namespace pten {
......
......@@ -14,14 +14,14 @@
#include "paddle/pten/core/lod_utils.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/pten/core/enforce.h"
namespace pten {
void AppendLoD(LoD *lod, const LoD &lod_length) {
PADDLE_ENFORCE(
lod->empty() || lod->size() == lod_length.size(),
paddle::platform::errors::InvalidArgument(
pten::errors::InvalidArgument(
"The input LoD length should be equal to the appended LoD size, but "
"received input LoD length is %d, actual LoD size is %d.",
lod_length.size(),
......
......@@ -14,10 +14,9 @@ limitations under the License. */
#include "paddle/pten/core/meta_tensor.h"
#include "paddle/pten/core/compat_utils.h"
#include "paddle/pten/core/dense_tensor.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/pten/core/enforce.h"
#include "paddle/pten/core/tensor_utils.h"
namespace pten {
......@@ -31,9 +30,8 @@ DataLayout MetaTensor::layout() const { return tensor_->layout(); }
void MetaTensor::set_dims(const DDim& dims) {
if (pten::DenseTensor::classof(tensor_)) {
CompatibleDenseTensorUtils::GetMutableMeta(
static_cast<DenseTensor*>(tensor_))
->dims = dims;
DenseTensorUtils::GetMutableMeta(static_cast<DenseTensor*>(tensor_))->dims =
dims;
} else {
PADDLE_THROW(paddle::platform::errors::Unimplemented(
"Unsupported setting dims for `%s`.", tensor_->type_info().name()));
......@@ -42,8 +40,7 @@ void MetaTensor::set_dims(const DDim& dims) {
void MetaTensor::set_dtype(DataType dtype) {
if (pten::DenseTensor::classof(tensor_)) {
CompatibleDenseTensorUtils::GetMutableMeta(
static_cast<DenseTensor*>(tensor_))
DenseTensorUtils::GetMutableMeta(static_cast<DenseTensor*>(tensor_))
->dtype = dtype;
} else {
PADDLE_THROW(paddle::platform::errors::Unimplemented(
......@@ -53,8 +50,7 @@ void MetaTensor::set_dtype(DataType dtype) {
void MetaTensor::set_layout(DataLayout layout) {
if (pten::DenseTensor::classof(tensor_)) {
CompatibleDenseTensorUtils::GetMutableMeta(
static_cast<DenseTensor*>(tensor_))
DenseTensorUtils::GetMutableMeta(static_cast<DenseTensor*>(tensor_))
->layout = layout;
} else {
PADDLE_THROW(paddle::platform::errors::Unimplemented(
......@@ -64,9 +60,8 @@ void MetaTensor::set_layout(DataLayout layout) {
void MetaTensor::share_lod(const MetaTensor& meta_tensor) {
if (pten::DenseTensor::classof(tensor_)) {
CompatibleDenseTensorUtils::GetMutableMeta(
static_cast<DenseTensor*>(tensor_))
->lod = meta_tensor.lod();
DenseTensorUtils::GetMutableMeta(static_cast<DenseTensor*>(tensor_))->lod =
meta_tensor.lod();
} else {
PADDLE_THROW(paddle::platform::errors::Unimplemented(
"Unsupported share lod inplace for `%s`.",
......
......@@ -16,13 +16,11 @@ limitations under the License. */
#include "paddle/pten/common/data_type.h"
#include "paddle/pten/common/layout.h"
#include "paddle/pten/core/ddim.h"
#include "paddle/pten/core/macros.h"
#include "paddle/pten/core/tensor_base.h"
#include "paddle/pten/core/tensor_meta.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/framework/ddim.h"
namespace pten {
class MetaTensor {
......
......@@ -17,14 +17,12 @@ limitations under the License. */
#include <cstddef>
#include "boost/intrusive_ptr.hpp"
#include "paddle/pten/common/place.h"
#include "paddle/pten/core/allocator.h"
#include "paddle/pten/core/utils/intrusive_ptr.h"
#include "paddle/pten/core/utils/intrusive_ref_counter.h"
#include "paddle/pten/core/utils/type_info.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/pten/core/allocator.h"
namespace pten {
/// \brief The interface of contiguous storage used for the dense tensor.
......@@ -32,7 +30,6 @@ namespace pten {
/// all default copy operations to ensure the integrity of the package.
class Storage : public intrusive_ref_counter<Storage> {
public:
using Place = paddle::platform::Place;
Storage() = default;
Storage(const Storage&) = delete;
......@@ -43,11 +40,11 @@ class Storage : public intrusive_ref_counter<Storage> {
/* --------- shared_ptr<Allocation> -------- */
// Initialize a Storage with unique Allocation
explicit Storage(std::shared_ptr<paddle::memory::Allocation>&& data)
explicit Storage(std::shared_ptr<pten::Allocation>&& data)
: data_(std::move(data)) {}
// Initialize a Storage shareing Allocation with another storage
explicit Storage(const std::shared_ptr<paddle::memory::Allocation>& data)
explicit Storage(const std::shared_ptr<pten::Allocation>& data)
: data_(data) {}
void* data() const {
......@@ -56,17 +53,15 @@ class Storage : public intrusive_ref_counter<Storage> {
: nullptr;
}
const std::shared_ptr<paddle::memory::Allocation>& data_shared() const {
return data_;
}
const std::shared_ptr<pten::Allocation>& data_shared() const { return data_; }
virtual void set_data_shared(
const std::shared_ptr<paddle::memory::Allocation>& holder) = 0;
const std::shared_ptr<pten::Allocation>& holder) = 0;
virtual std::shared_ptr<paddle::memory::Allocation>&& move_data_shared() = 0;
virtual std::shared_ptr<pten::Allocation>&& move_data_shared() = 0;
virtual void ReallocShared(size_t n) {
PADDLE_THROW(paddle::platform::errors::Unimplemented(
PADDLE_THROW(pten::errors::Unimplemented(
"ReallocShared has not been overrided by the current Storage"));
}
/* --------- shared_ptr<Allocation> -------- */
......@@ -81,13 +76,11 @@ class Storage : public intrusive_ref_counter<Storage> {
virtual void Realloc(size_t n) = 0;
protected:
std::shared_ptr<paddle::memory::Allocation> data_;
std::shared_ptr<pten::Allocation> data_;
};
class TensorStorage : public Storage {
public:
using Place = paddle::platform::Place;
explicit TensorStorage(Allocator* a) : alloc_(a) {}
TensorStorage(Allocator* a, size_t size)
......@@ -110,7 +103,7 @@ class TensorStorage : public Storage {
const Place& place() const override {
if (!data_) {
PADDLE_THROW(paddle::platform::errors::Unimplemented(
PADDLE_THROW(pten::errors::Unimplemented(
"Unable to visit place: either data_ or alloc_ has to be initialized "
"first."));
}
......@@ -120,13 +113,13 @@ class TensorStorage : public Storage {
bool OwnsMemory() const noexcept override { return true; }
void set_data_shared(
const std::shared_ptr<paddle::memory::Allocation>& holder) override {
const std::shared_ptr<pten::Allocation>& holder) override {
CHECK(holder);
data_ = holder;
size_ = holder->size();
}
std::shared_ptr<paddle::memory::Allocation>&& move_data_shared() override {
std::shared_ptr<pten::Allocation>&& move_data_shared() override {
size_ = 0;
return std::move(data_);
}
......
......@@ -14,23 +14,19 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/platform/place.h"
#include "paddle/pten/common/backend.h"
#include "paddle/pten/common/data_type.h"
#include "paddle/pten/common/layout.h"
#include "paddle/pten/common/place.h"
#include "paddle/pten/core/allocator.h"
#include "paddle/pten/core/ddim.h"
#include "paddle/pten/core/storage.h"
#include "paddle/pten/core/utils/type_registry.h"
namespace pten {
class TensorBase {
public:
using DataType = paddle::experimental::DataType;
using DataLayout = paddle::experimental::DataLayout;
using DDim = pten::framework::DDim;
using Place = paddle::platform::Place;
virtual ~TensorBase() = default;
......
......@@ -19,8 +19,6 @@ limitations under the License. */
#include "paddle/pten/common/backend.h"
#include "paddle/pten/common/data_type.h"
#include "paddle/pten/common/layout.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/pten/core/ddim.h"
// Note: mixed_vector include many header now, LoD will be
......
......@@ -14,33 +14,17 @@ limitations under the License. */
#pragma once
#include "paddle/pten/api/lib/utils/storage.h"
#include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/core/storage.h"
#include "paddle/pten/core/tensor_meta.h"
namespace pten {
/**
* In order to meet some adaptation requirements of the compatible state,
* these class is added to provide some tool functions.
*
* These utility functions may be deleted in the future, It is not recommended
* to be widely used in the framework
*/
class CompatibleDenseTensorUtils {
class DenseTensorUtils {
public:
static DenseTensorMeta* GetMutableMeta(DenseTensor* tensor) {
return &(tensor->meta_);
}
// only can deal with SharedStorage now
static void ClearStorage(DenseTensor* tensor) {
// use static_cast to improve performance, replace by dynamic_cast later
tensor->MoveMemoryHolder();
}
static DenseTensor Slice(const DenseTensor& tensor,
int64_t begin_idx,
int64_t end_idx) {
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include <boost/variant.hpp>
namespace egr {
class EagerTensor;
}
namespace paddle {
namespace framework {
// The order should be as same as framework.proto
// NOTE(xiongkun): we extract from framework/typedef.h to ensure we can transfer
// enforce.h
class BlockDesc;
using Attribute = boost::variant<boost::blank,
int,
float,
std::string,
std::vector<int>,
std::vector<float>,
std::vector<std::string>,
bool,
std::vector<bool>,
BlockDesc*,
int64_t,
std::vector<BlockDesc*>,
std::vector<int64_t>,
std::vector<double>>;
using AttributeMap = std::unordered_map<std::string, Attribute>;
} // namespace framework
namespace imperative {
class VariableWrapper;
class SavedVariableWrapperList;
class VarBase;
class OpBase;
class GradOpNode;
class Tracer;
using WeakNameVarBaseMap =
std::map<std::string, std::vector<std::weak_ptr<VarBase>>>;
namespace details {
template <typename T>
struct NameVarMapTrait {};
template <>
struct NameVarMapTrait<VarBase> {
using Type = std::map<std::string, std::vector<std::shared_ptr<VarBase>>>;
};
template <>
struct NameVarMapTrait<VariableWrapper> {
using Type = std::map<std::string, SavedVariableWrapperList>;
};
template <>
struct NameVarMapTrait<egr::EagerTensor> {
using Type =
std::map<std::string, std::vector<std::shared_ptr<egr::EagerTensor>>>;
};
#include <functional>
} // namespace details
namespace pten {
template <typename T>
using NameVarMap = typename details::NameVarMapTrait<T>::Type;
class Kernel;
class KernelKey;
class KernelArgsDef;
class KernelContext;
class KernelSignature;
class ArgumentMappingContext;
class InferMetaContext;
using NameVarBaseMap = NameVarMap<VarBase>;
using NameVariableWrapperMap = NameVarMap<VariableWrapper>;
using NameTensorMap = NameVarMap<egr::EagerTensor>;
using KernelFn = std::function<void(KernelContext* ctx)>;
using KernelArgsDefFn = void (*)(Kernel* kernel);
using KernelArgsParseFn = void (*)(const KernelKey& default_key,
KernelArgsDef* args_def);
using VariableWrapperList = std::vector<std::shared_ptr<VariableWrapper>>;
using ArgumentMappingFn =
std::function<KernelSignature(const ArgumentMappingContext&)>;
using InferMetaFn = void (*)(InferMetaContext* ctx);
} // namespace imperative
} // namespace paddle
} // namespace pten
......@@ -15,12 +15,11 @@
#pragma once
#include <cstdint>
#include "paddle/pten/core/unroll_array_ops.h"
// TODO(paddle-dev): Need to modify into pten/core/enforce.h
#include "paddle/fluid/platform/enforce.h"
#include "paddle/pten/core/enforce.h"
#include "paddle/pten/core/utils/unroll_array_ops.h"
namespace pten {
namespace platform = paddle::platform;
namespace framework {
template <typename T, size_t N>
......@@ -58,7 +57,7 @@ class Array {
HOSTDEVICE inline T &at(size_t i) {
#if !defined(__CUDA_ARCH__) && !defined(__HIPCC__)
PADDLE_ENFORCE_LT(
i, N, platform::errors::OutOfRange("Array index out of bounds."));
i, N, pten::errors::OutOfRange("Array index out of bounds."));
#endif
return (*this)[i];
}
......@@ -66,7 +65,7 @@ class Array {
HOSTDEVICE inline const T &at(size_t i) const {
#if !defined(__CUDA_ARCH__) && !defined(__HIPCC__)
PADDLE_ENFORCE_LT(
i, N, platform::errors::OutOfRange("Array index out of bounds."));
i, N, pten::errors::OutOfRange("Array index out of bounds."));
#endif
return (*this)[i];
}
......@@ -114,7 +113,7 @@ class Array<T, 0> {
static T obj();
return obj;
#else
PADDLE_THROW(platform::errors::Unavailable("Array<T, 0> has no element."));
PADDLE_THROW(pten::errors::Unavailable("Array<T, 0> has no element."));
#endif
}
......@@ -128,7 +127,7 @@ class Array<T, 0> {
static const T obj();
return obj;
#else
PADDLE_THROW(platform::errors::Unavailable("Array<T, 0> has no element."));
PADDLE_THROW(pten::errors::Unavailable("Array<T, 0> has no element."));
#endif
}
......
......@@ -20,8 +20,8 @@
#include <string>
#include <type_traits>
#include "paddle/pten/core/array.h"
#include "paddle/pten/core/hostdevice.h"
#include "paddle/pten/core/utils/array.h"
namespace pten {
namespace framework {
......
......@@ -20,8 +20,7 @@ limitations under the License. */
#include <mutex> // NOLINT
#endif // !_WIN32
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/platform/enforce.h"
#include "paddle/pten/core/enforce.h"
namespace pten {
......@@ -32,24 +31,23 @@ struct RWLock {
~RWLock() { pthread_rwlock_destroy(&lock_); }
inline void RDLock() {
PADDLE_ENFORCE_EQ(pthread_rwlock_rdlock(&lock_),
0,
paddle::platform::errors::External(
"The pthread failed to acquire read lock."));
PADDLE_ENFORCE_EQ(
pthread_rwlock_rdlock(&lock_),
0,
pten::errors::External("The pthread failed to acquire read lock."));
}
inline void WRLock() {
PADDLE_ENFORCE_EQ(pthread_rwlock_wrlock(&lock_),
0,
paddle::platform::errors::External(
"The pthread failed to acquire write lock."));
PADDLE_ENFORCE_EQ(
pthread_rwlock_wrlock(&lock_),
0,
pten::errors::External("The pthread failed to acquire write lock."));
}
inline void UNLock() {
PADDLE_ENFORCE_EQ(
pthread_rwlock_unlock(&lock_),
0,
paddle::platform::errors::External("The pthread failed to unlock."));
PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_),
0,
pten::errors::External("The pthread failed to unlock."));
}
private:
......
......@@ -24,6 +24,8 @@ void UnchangedInferMetaNew(MetaConfig config,
const MetaTensor& x,
MetaTensor* out) {
out->set_dims(x.dims());
out->set_dtype(x.dtype());
out->set_layout(x.layout());
out->share_lod(x);
}
......
......@@ -16,7 +16,7 @@ limitations under the License. */
#include "paddle/pten/backends/cpu/cpu_context.h"
#include "paddle/pten/common/data_type.h"
#include "paddle/pten/core/convert_utils.h"
#include "paddle/pten/core/compat/convert_utils.h"
#include "paddle/pten/core/kernel_registry.h"
// See Note [ Why still include the fluid headers? ]
......
......@@ -16,7 +16,7 @@ limitations under the License. */
#include "paddle/pten/backends/gpu/gpu_context.h"
#include "paddle/pten/common/data_type.h"
#include "paddle/pten/core/convert_utils.h"
#include "paddle/pten/core/compat/convert_utils.h"
#include "paddle/pten/core/kernel_registry.h"
// See Note [ Why still include the fluid headers? ]
......
......@@ -29,7 +29,7 @@ namespace cub = hipcub;
#include "paddle/pten/common/complex.h"
#include "paddle/pten/common/float16.h"
#include "paddle/pten/core/convert_utils.h"
#include "paddle/pten/core/compat/convert_utils.h"
#include "paddle/pten/core/enforce.h"
#include "paddle/pten/core/kernel_registry.h"
......
......@@ -38,14 +38,13 @@ namespace cub = hipcub;
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/fast_divmod.h"
#include "paddle/fluid/string/string_helper.h"
#include "paddle/pten/core/array.h"
#include "paddle/pten/core/enforce.h"
#include "paddle/pten/kernels/primitive/kernel_primitives.h"
#include "paddle/pten/api/ext/dispatch.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/core/enforce.h"
#include "paddle/pten/core/utils/array.h"
#include "paddle/pten/kernels/funcs/elementwise_base.h"
#include "paddle/pten/kernels/primitive/kernel_primitives.h"
// Reduce split or not, Whether to use ReduceHigherDim
#define REDUCE_SPLIT_BOUNDARY 512
......
......@@ -47,7 +47,7 @@ void DigammaGradKernel(const Context& ctx,
auto* x_data = x.data<T>();
auto* dx_data = x_grad->data<T>();
auto numel = out_grad.numel();
platform::ForRange<Context> for_range(ctx, numel);
paddle::platform::ForRange<Context> for_range(ctx, numel);
DigammaGradFunctor<T> functor(dout_data, x_data, dx_data, numel);
for_range(functor);
}
......
......@@ -41,7 +41,7 @@ void DigammaKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) {
auto* x_data = x.data<T>();
auto* out_data = out->data<T>();
auto numel = x.numel();
platform::ForRange<Context> for_range(ctx, numel);
paddle::platform::ForRange<Context> for_range(ctx, numel);
DigammaFunctor<T> functor(x_data, out_data, numel);
for_range(functor);
}
......
......@@ -14,7 +14,6 @@ limitations under the License. */
#pragma once
#include "paddle/pten/api/lib/utils/storage.h"
#include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/infermeta/binary.h"
#include "paddle/pten/infermeta/unary.h"
......
......@@ -14,11 +14,13 @@
#include "paddle/pten/kernels/cast_kernel.h"
#include "paddle/fluid/platform/device/xpu/xpu_header.h"
#include "paddle/pten/backends/xpu/xpu_context.h"
#include "paddle/pten/common/float16.h"
#include "paddle/pten/core/enforce.h"
#include "paddle/pten/core/kernel_registry.h"
#include "paddle/pten/core/enforce.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/platform/device/xpu/xpu_header.h"
namespace pten {
......@@ -28,7 +30,7 @@ void CastKernel(const Context& dev_ctx,
DataType out_dtype,
DenseTensor* out) {
using XPUInTDType = typename XPUTypeTrait<T>::Type;
using float16 = typename XPUTypeTrait<pten::platform::float16>::Type;
using float16 = typename XPUTypeTrait<pten::dtype::float16>::Type;
auto* in_data = x.data<T>();
auto numel = x.numel();
......@@ -47,7 +49,7 @@ void CastKernel(const Context& dev_ctx,
dev_ctx.x_context(),
reinterpret_cast<const XPUInTDType*>(in_data),
reinterpret_cast<float16*>(
out->mutable_data<pten::platform::float16>(dev_ctx.GetPlace())),
out->mutable_data<pten::dtype::float16>(dev_ctx.GetPlace())),
numel);
break;
case pten::DataType::INT64:
......@@ -72,7 +74,7 @@ void CastKernel(const Context& dev_ctx,
numel);
break;
default:
PADDLE_THROW(platform::errors::Unavailable(
PADDLE_THROW(pten::errors::Unavailable(
"Not supported cast %d -> %d", x.dtype(), out_dtype));
}
......@@ -90,7 +92,7 @@ PT_REGISTER_KERNEL(cast,
pten::CastKernel,
int32_t,
float,
pten::platform::float16,
pten::dtype::float16,
int64_t,
bool) {
kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
......
......@@ -16,7 +16,7 @@ limitations under the License. */
#include "paddle/pten/backends/xpu/xpu_context.h"
#include "paddle/pten/common/data_type.h"
#include "paddle/pten/core/convert_utils.h"
#include "paddle/pten/core/compat/convert_utils.h"
#include "paddle/pten/core/kernel_registry.h"
// See Note [ Why still include the fluid headers? ]
......
......@@ -16,9 +16,15 @@
#include "paddle/pten/api/ext/dispatch.h"
#include "paddle/pten/backends/xpu/xpu_context.h"
#include "paddle/pten/common/bfloat16.h"
#include "paddle/pten/common/complex.h"
#include "paddle/pten/common/float16.h"
#include "paddle/pten/common/scalar.h"
#include "paddle/pten/core/kernel_registry.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memcpy.h"
namespace pten {
template <typename InType, typename OutType>
......@@ -64,7 +70,7 @@ void FullLikeKernel(const Context& dev_ctx,
using XPUInTDType = typename XPUTypeTrait<T>::Type;
using CommonType = typename std::common_type<
float,
typename std::conditional<std::is_same<T, pten::platform::float16>::value,
typename std::conditional<std::is_same<T, pten::dtype::float16>::value,
float,
T>::type>::type;
......@@ -118,10 +124,10 @@ PT_REGISTER_KERNEL(full,
int,
int64_t,
bool,
pten::platform::float16,
pten::platform::bfloat16,
pten::platform::complex<float>,
pten::platform::complex<double>) {}
pten::dtype::float16,
pten::dtype::bfloat16,
pten::dtype::complex<float>,
pten::dtype::complex<double>) {}
PT_REGISTER_KERNEL(full_like,
XPU,
......@@ -130,4 +136,4 @@ PT_REGISTER_KERNEL(full_like,
float,
int,
int64_t,
pten::platform::float16) {}
pten::dtype::float16) {}
......@@ -18,7 +18,7 @@
#include "paddle/pten/backends/xpu/xpu_context.h"
#include "paddle/pten/common/data_type.h"
#include "paddle/pten/common/float16.h"
#include "paddle/pten/core/convert_utils.h"
#include "paddle/pten/core/compat/convert_utils.h"
#include "paddle/pten/core/kernel_registry.h"
namespace pten {
......
......@@ -35,7 +35,7 @@ TEST(API, cast) {
auto dense_x = std::make_shared<pten::DenseTensor>(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({3, 4}),
pten::framework::make_ddim({3, 4}),
pten::DataLayout::NCHW));
auto* dense_x_data =
dense_x->mutable_data<float>(paddle::platform::CPUPlace());
......
......@@ -24,8 +24,7 @@ limitations under the License. */
namespace paddle {
namespace tests {
namespace framework = paddle::framework;
using DDim = paddle::framework::DDim;
using DDim = pten::framework::DDim;
// TODO(chentianyu03): Remove this test after the API is used in the dygraph
TEST(API, concat) {
......@@ -35,7 +34,7 @@ TEST(API, concat) {
auto dense_x = std::make_shared<pten::DenseTensor>(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({3, 10}),
pten::framework::make_ddim({3, 10}),
pten::DataLayout::NCHW));
auto* dense_x_data =
dense_x->mutable_data<float>(paddle::platform::CPUPlace());
......@@ -43,7 +42,7 @@ TEST(API, concat) {
auto dense_y = std::make_shared<pten::DenseTensor>(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({3, 10}),
pten::framework::make_ddim({3, 10}),
pten::DataLayout::NCHW));
auto* dense_y_data =
dense_y->mutable_data<float>(paddle::platform::CPUPlace());
......
......@@ -35,7 +35,7 @@ TEST(API, conj) {
auto dense_x = std::make_shared<pten::DenseTensor>(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::COMPLEX64,
framework::make_ddim({3, 10}),
pten::framework::make_ddim({3, 10}),
pten::DataLayout::NCHW));
auto* dense_x_data =
dense_x->mutable_data<paddle::complex64>(paddle::platform::CPUPlace());
......
......@@ -35,7 +35,7 @@ TEST(API, dot) {
auto dense_x = std::make_shared<pten::DenseTensor>(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({3, 10}),
pten::framework::make_ddim({3, 10}),
pten::DataLayout::NCHW));
auto* dense_x_data =
dense_x->mutable_data<float>(paddle::platform::CPUPlace());
......@@ -43,7 +43,7 @@ TEST(API, dot) {
auto dense_y = std::make_shared<pten::DenseTensor>(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({3, 10}),
pten::framework::make_ddim({3, 10}),
pten::DataLayout::NCHW));
auto* dense_y_data =
dense_y->mutable_data<float>(paddle::platform::CPUPlace());
......
......@@ -35,7 +35,7 @@ TEST(API, add) {
auto dense_x = std::make_shared<pten::DenseTensor>(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({3, 10}),
pten::framework::make_ddim({3, 10}),
pten::DataLayout::NCHW));
auto* dense_x_data =
dense_x->mutable_data<float>(paddle::platform::CPUPlace());
......@@ -43,7 +43,7 @@ TEST(API, add) {
auto dense_y = std::make_shared<pten::DenseTensor>(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({10}),
pten::framework::make_ddim({10}),
pten::DataLayout::NCHW));
auto* dense_y_data =
dense_y->mutable_data<float>(paddle::platform::CPUPlace());
......@@ -91,7 +91,7 @@ TEST(API, subtract) {
auto dense_x = std::make_shared<pten::DenseTensor>(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({3, 10}),
pten::framework::make_ddim({3, 10}),
pten::DataLayout::NCHW));
auto* dense_x_data =
dense_x->mutable_data<float>(paddle::platform::CPUPlace());
......@@ -99,7 +99,7 @@ TEST(API, subtract) {
auto dense_y = std::make_shared<pten::DenseTensor>(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({10}),
pten::framework::make_ddim({10}),
pten::DataLayout::NCHW));
auto* dense_y_data =
dense_y->mutable_data<float>(paddle::platform::CPUPlace());
......@@ -147,7 +147,7 @@ TEST(API, divide) {
auto dense_x = std::make_shared<pten::DenseTensor>(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({3, 10}),
pten::framework::make_ddim({3, 10}),
pten::DataLayout::NCHW));
auto* dense_x_data =
dense_x->mutable_data<float>(paddle::platform::CPUPlace());
......@@ -155,7 +155,7 @@ TEST(API, divide) {
auto dense_y = std::make_shared<pten::DenseTensor>(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({10}),
pten::framework::make_ddim({10}),
pten::DataLayout::NCHW));
auto* dense_y_data =
dense_y->mutable_data<float>(paddle::platform::CPUPlace());
......@@ -203,7 +203,7 @@ TEST(API, multiply) {
auto dense_x = std::make_shared<pten::DenseTensor>(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({3, 10}),
pten::framework::make_ddim({3, 10}),
pten::DataLayout::NCHW));
auto* dense_x_data =
dense_x->mutable_data<float>(paddle::platform::CPUPlace());
......@@ -211,7 +211,7 @@ TEST(API, multiply) {
auto dense_y = std::make_shared<pten::DenseTensor>(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({10}),
pten::framework::make_ddim({10}),
pten::DataLayout::NCHW));
auto* dense_y_data =
dense_y->mutable_data<float>(paddle::platform::CPUPlace());
......
......@@ -35,7 +35,7 @@ TEST(API, empty_like) {
auto dense_x = std::make_shared<pten::DenseTensor>(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({3, 2}),
pten::framework::make_ddim({3, 2}),
pten::DataLayout::NCHW));
paddle::experimental::Tensor x(dense_x);
......@@ -59,7 +59,7 @@ TEST(API, empty1) {
auto dense_shape = std::make_shared<pten::DenseTensor>(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::INT64,
framework::make_ddim({2}),
pten::framework::make_ddim({2}),
pten::DataLayout::NCHW));
auto* shape_data =
dense_shape->mutable_data<int64_t>(paddle::platform::CPUPlace());
......@@ -86,7 +86,7 @@ TEST(API, empty2) {
auto dense_scalar = std::make_shared<pten::DenseTensor>(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::INT32,
framework::make_ddim({1}),
pten::framework::make_ddim({1}),
pten::DataLayout::NCHW));
dense_scalar->mutable_data<int32_t>(paddle::platform::CPUPlace())[0] = 2;
......
......@@ -35,7 +35,7 @@ TEST(API, full_like) {
auto dense_x = std::make_shared<pten::DenseTensor>(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({3, 2}),
pten::framework::make_ddim({3, 2}),
pten::DataLayout::NCHW));
auto* dense_x_data =
dense_x->mutable_data<float>(paddle::platform::CPUPlace());
......@@ -71,7 +71,7 @@ TEST(API, zeros_like) {
auto dense_x = std::make_shared<pten::DenseTensor>(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({3, 2}),
pten::framework::make_ddim({3, 2}),
pten::DataLayout::NCHW));
auto* dense_x_data =
dense_x->mutable_data<float>(paddle::platform::CPUPlace());
......@@ -105,7 +105,7 @@ TEST(API, ones_like) {
auto dense_x = std::make_shared<pten::DenseTensor>(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::INT32,
framework::make_ddim({3, 2}),
pten::framework::make_ddim({3, 2}),
pten::DataLayout::NCHW));
auto* dense_x_data =
dense_x->mutable_data<int32_t>(paddle::platform::CPUPlace());
......@@ -140,7 +140,7 @@ TEST(API, full1) {
auto dense_shape = std::make_shared<pten::DenseTensor>(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::INT64,
framework::make_ddim({2}),
pten::framework::make_ddim({2}),
pten::DataLayout::NCHW));
auto* shape_data =
dense_shape->mutable_data<int64_t>(paddle::platform::CPUPlace());
......@@ -150,7 +150,7 @@ TEST(API, full1) {
auto dense_scalar = std::make_shared<pten::DenseTensor>(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({1}),
pten::framework::make_ddim({1}),
pten::DataLayout::NCHW));
dense_scalar->mutable_data<float>(paddle::platform::CPUPlace())[0] = 1.0;
......@@ -187,7 +187,7 @@ TEST(API, full2) {
auto dense_scalar = std::make_shared<pten::DenseTensor>(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::INT32,
framework::make_ddim({1}),
pten::framework::make_ddim({1}),
pten::DataLayout::NCHW));
dense_scalar->mutable_data<int>(paddle::platform::CPUPlace())[0] = 2;
......
......@@ -35,7 +35,7 @@ TEST(API, flatten) {
auto dense_x = std::make_shared<pten::DenseTensor>(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({3, 2, 2, 3}),
pten::framework::make_ddim({3, 2, 2, 3}),
pten::DataLayout::NCHW));
auto* dense_x_data =
dense_x->mutable_data<float>(paddle::platform::CPUPlace());
......
......@@ -35,7 +35,7 @@ TEST(API, matmul_cpu) {
auto dense_x = std::make_shared<pten::DenseTensor>(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({3, 3}),
pten::framework::make_ddim({3, 3}),
pten::DataLayout::NCHW));
auto* dense_x_data =
......@@ -44,7 +44,7 @@ TEST(API, matmul_cpu) {
auto dense_y = std::make_shared<pten::DenseTensor>(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({3, 3}),
pten::framework::make_ddim({3, 3}),
pten::DataLayout::NCHW));
auto* dense_y_data =
dense_y->mutable_data<float>(paddle::platform::CPUPlace());
......@@ -86,7 +86,7 @@ TEST(API, matmul_cuda) {
auto ref_x = std::make_shared<pten::DenseTensor>(
alloc_cpu.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({3, 3}),
pten::framework::make_ddim({3, 3}),
pten::DataLayout::NCHW));
auto* ref_x_data = ref_x->mutable_data<float>(paddle::platform::CPUPlace());
......@@ -94,7 +94,7 @@ TEST(API, matmul_cuda) {
auto ref_y = std::make_shared<pten::DenseTensor>(
alloc_cpu.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({3, 3}),
pten::framework::make_ddim({3, 3}),
pten::DataLayout::NCHW));
auto* ref_y_data = ref_y->mutable_data<float>(paddle::platform::CPUPlace());
......@@ -111,13 +111,13 @@ TEST(API, matmul_cuda) {
auto dense_x = std::make_shared<pten::DenseTensor>(
alloc_cuda.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({3, 3}),
pten::framework::make_ddim({3, 3}),
pten::DataLayout::NCHW));
auto dense_y = std::make_shared<pten::DenseTensor>(
alloc_cuda.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({3, 3}),
pten::framework::make_ddim({3, 3}),
pten::DataLayout::NCHW));
auto& pool = paddle::platform::DeviceContextPool::Instance();
......
......@@ -35,7 +35,7 @@ TEST(API, mean) {
auto dense_x = std::make_shared<pten::DenseTensor>(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({3, 4}),
pten::framework::make_ddim({3, 4}),
pten::DataLayout::NCHW));
auto* dense_x_data =
dense_x->mutable_data<float>(paddle::platform::CPUPlace());
......
......@@ -35,7 +35,7 @@ TEST(API, reshape) {
auto dense_x = std::make_shared<pten::DenseTensor>(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({3, 2, 2, 3}),
pten::framework::make_ddim({3, 2, 2, 3}),
pten::DataLayout::NCHW));
auto* dense_x_data =
dense_x->mutable_data<float>(paddle::platform::CPUPlace());
......
......@@ -35,7 +35,7 @@ TEST(API, sum) {
auto dense_x = std::make_shared<pten::DenseTensor>(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({3, 4}),
pten::framework::make_ddim({3, 4}),
pten::DataLayout::NCHW));
auto* dense_x_data =
dense_x->mutable_data<float>(paddle::platform::CPUPlace());
......
......@@ -33,7 +33,7 @@ paddle::experimental::Tensor CreateInputTensor() {
auto dense_x = std::make_shared<pten::DenseTensor>(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::INT64,
framework::make_ddim({3, 4}),
pten::framework::make_ddim({3, 4}),
pten::DataLayout::NCHW));
auto* dense_x_data =
dense_x->mutable_data<int64_t>(paddle::platform::CPUPlace());
......
......@@ -23,3 +23,5 @@ endif()
if (NOT WIN32)
cc_test(test_rw_lock SRCS test_rw_lock.cc)
endif (NOT WIN32)
cc_test(unroll_array_ops_test SRCS unroll_array_ops_test.cc)
......@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "gtest/gtest.h"
#include "paddle/pten/core/convert_utils.h"
#include "paddle/pten/core/compat/convert_utils.h"
namespace pten {
namespace tests {
......
......@@ -29,7 +29,7 @@ TEST(DeviceContext, cpu_context) {
std::cout << "test training scenarios" << std::endl;
{
pten::CPUContext ctx;
CHECK(ctx.eigen_device() != nullptr);
EXPECT_TRUE(ctx.eigen_device() != nullptr);
}
std::cout << "test inference scenarios" << std::endl;
......@@ -37,13 +37,13 @@ TEST(DeviceContext, cpu_context) {
{
pten::CPUContextResource ctx_res{device};
pten::CPUContext ctx(ctx_res);
CHECK(ctx.eigen_device() != nullptr);
EXPECT_TRUE(ctx.eigen_device() != nullptr);
}
{
pten::CPUContextResource ctx_res{nullptr};
pten::CPUContext ctx(ctx_res);
ctx.SetEigenDevice(device);
CHECK(ctx.eigen_device() != nullptr);
EXPECT_TRUE(ctx.eigen_device() != nullptr);
}
delete device;
......@@ -51,7 +51,7 @@ TEST(DeviceContext, cpu_context) {
{
pten::CPUContext ctx1;
pten::CPUContext ctx2(ctx1);
CHECK_EQ(ctx1.eigen_device(), ctx2.eigen_device());
EXPECT_EQ(ctx1.eigen_device(), ctx2.eigen_device());
}
std::cout << "test move constructor" << std::endl;
......@@ -60,7 +60,7 @@ TEST(DeviceContext, cpu_context) {
auto* eigen_device1 = ctx1.eigen_device();
pten::CPUContext ctx2(std::move(ctx1));
auto* eigen_device2 = ctx2.eigen_device();
CHECK_EQ(eigen_device1, eigen_device2);
EXPECT_EQ(eigen_device1, eigen_device2);
}
}
......
......@@ -16,7 +16,7 @@
#include <sstream>
#include "gtest/gtest.h"
#include "paddle/pten/core/dim.h"
#include "paddle/pten/core/utils/dim.h"
namespace pten {
namespace tests {
......@@ -100,4 +100,4 @@ TEST(Dim, Print) {
}
} // namespace tests
} // namespace pten
\ No newline at end of file
} // namespace pten
......@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/pten/core/unroll_array_ops.h"
#include "paddle/pten/core/utils/unroll_array_ops.h"
#include <gtest/gtest.h>
#include <array>
......@@ -79,4 +79,4 @@ TEST(unroll_ops, product) {
}
} // namespace framework
} // namespace pten
\ No newline at end of file
} // namespace pten
......@@ -35,10 +35,11 @@ TEST(DEV_API, cast) {
// 1. create tensor
const auto alloc = std::make_unique<paddle::experimental::DefaultAllocator>(
paddle::platform::CPUPlace());
pten::DenseTensor dense_x(alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({3, 4}),
pten::DataLayout::NCHW));
pten::DenseTensor dense_x(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
pten::framework::make_ddim({3, 4}),
pten::DataLayout::NCHW));
auto* dense_x_data =
dense_x.mutable_data<float>(paddle::platform::CPUPlace());
......
......@@ -32,17 +32,19 @@ TEST(DEV_API, concat) {
// 1. create tensor
const auto alloc = std::make_unique<paddle::experimental::DefaultAllocator>(
paddle::platform::CPUPlace());
pten::DenseTensor dense_x(alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({3, 10}),
pten::DataLayout::NCHW));
pten::DenseTensor dense_x(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
pten::framework::make_ddim({3, 10}),
pten::DataLayout::NCHW));
auto* dense_x_data =
dense_x.mutable_data<float>(paddle::platform::CPUPlace());
pten::DenseTensor dense_y(alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({3, 10}),
pten::DataLayout::NCHW));
pten::DenseTensor dense_y(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
pten::framework::make_ddim({3, 10}),
pten::DataLayout::NCHW));
auto* dense_y_data =
dense_y.mutable_data<float>(paddle::platform::CPUPlace());
......
......@@ -33,10 +33,11 @@ TEST(DEV_API, conj) {
// 1. create tensor
const auto alloc = std::make_unique<paddle::experimental::DefaultAllocator>(
paddle::platform::CPUPlace());
pten::DenseTensor dense_x(alloc.get(),
pten::DenseTensorMeta(pten::DataType::COMPLEX64,
framework::make_ddim({3, 4}),
pten::DataLayout::NCHW));
pten::DenseTensor dense_x(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::COMPLEX64,
pten::framework::make_ddim({3, 4}),
pten::DataLayout::NCHW));
auto* dense_x_data =
dense_x.mutable_data<paddle::complex64>(paddle::platform::CPUPlace());
......
......@@ -38,7 +38,7 @@ TEST(DEV_API, copy) {
auto dense_src = std::make_shared<pten::DenseTensor>(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({2, 3}),
pten::framework::make_ddim({2, 3}),
pten::DataLayout::NCHW));
auto* dense_x_data =
dense_src->mutable_data<float>(paddle::platform::CPUPlace());
......@@ -46,7 +46,7 @@ TEST(DEV_API, copy) {
auto dense_dst = std::make_shared<pten::DenseTensor>(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({2, 3}),
pten::framework::make_ddim({2, 3}),
pten::DataLayout::NCHW));
for (size_t i = 0; i < 2; ++i) {
......
......@@ -53,10 +53,11 @@ TEST(DEV_API, empty_like) {
// 1. create tensor
const auto alloc = std::make_unique<paddle::experimental::DefaultAllocator>(
paddle::platform::CPUPlace());
pten::DenseTensor dense_x(alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({3, 2}),
pten::DataLayout::NCHW));
pten::DenseTensor dense_x(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
pten::framework::make_ddim({3, 2}),
pten::DataLayout::NCHW));
auto* dense_x_data =
dense_x.mutable_data<float>(paddle::platform::CPUPlace());
dense_x_data[0] = 0;
......@@ -106,10 +107,11 @@ TEST(DEV_API, full_like) {
// 1. create tensor
const auto alloc = std::make_unique<paddle::experimental::DefaultAllocator>(
paddle::platform::CPUPlace());
pten::DenseTensor dense_x(alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({3, 2}),
pten::DataLayout::NCHW));
pten::DenseTensor dense_x(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
pten::framework::make_ddim({3, 2}),
pten::DataLayout::NCHW));
auto* dense_x_data =
dense_x.mutable_data<float>(paddle::platform::CPUPlace());
dense_x_data[0] = 0;
......
......@@ -33,17 +33,19 @@ TEST(DEV_API, dot) {
// 1. create tensor
const auto alloc = std::make_unique<paddle::experimental::DefaultAllocator>(
paddle::platform::CPUPlace());
pten::DenseTensor dense_x(alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({3, 10}),
pten::DataLayout::NCHW));
pten::DenseTensor dense_x(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
pten::framework::make_ddim({3, 10}),
pten::DataLayout::NCHW));
auto* dense_x_data =
dense_x.mutable_data<float>(paddle::platform::CPUPlace());
pten::DenseTensor dense_y(alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({3, 10}),
pten::DataLayout::NCHW));
pten::DenseTensor dense_y(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
pten::framework::make_ddim({3, 10}),
pten::DataLayout::NCHW));
auto* dense_y_data =
dense_y.mutable_data<float>(paddle::platform::CPUPlace());
......
......@@ -33,17 +33,19 @@ TEST(DEV_API, add) {
// 1. create tensor
const auto alloc = std::make_unique<paddle::experimental::DefaultAllocator>(
paddle::platform::CPUPlace());
pten::DenseTensor dense_x(alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({3, 10}),
pten::DataLayout::NCHW));
pten::DenseTensor dense_x(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
pten::framework::make_ddim({3, 10}),
pten::DataLayout::NCHW));
auto* dense_x_data =
dense_x.mutable_data<float>(paddle::platform::CPUPlace());
pten::DenseTensor dense_y(alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({10}),
pten::DataLayout::NCHW));
pten::DenseTensor dense_y(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
pten::framework::make_ddim({10}),
pten::DataLayout::NCHW));
auto* dense_y_data =
dense_y.mutable_data<float>(paddle::platform::CPUPlace());
......@@ -85,17 +87,19 @@ TEST(DEV_API, subtract) {
// 1. create tensor
const auto alloc = std::make_unique<paddle::experimental::DefaultAllocator>(
paddle::platform::CPUPlace());
pten::DenseTensor dense_x(alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({3, 10}),
pten::DataLayout::NCHW));
pten::DenseTensor dense_x(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
pten::framework::make_ddim({3, 10}),
pten::DataLayout::NCHW));
auto* dense_x_data =
dense_x.mutable_data<float>(paddle::platform::CPUPlace());
pten::DenseTensor dense_y(alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({10}),
pten::DataLayout::NCHW));
pten::DenseTensor dense_y(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
pten::framework::make_ddim({10}),
pten::DataLayout::NCHW));
auto* dense_y_data =
dense_y.mutable_data<float>(paddle::platform::CPUPlace());
......@@ -137,17 +141,19 @@ TEST(DEV_API, divide) {
// 1. create tensor
const auto alloc = std::make_unique<paddle::experimental::DefaultAllocator>(
paddle::platform::CPUPlace());
pten::DenseTensor dense_x(alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({3, 10}),
pten::DataLayout::NCHW));
pten::DenseTensor dense_x(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
pten::framework::make_ddim({3, 10}),
pten::DataLayout::NCHW));
auto* dense_x_data =
dense_x.mutable_data<float>(paddle::platform::CPUPlace());
pten::DenseTensor dense_y(alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({10}),
pten::DataLayout::NCHW));
pten::DenseTensor dense_y(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
pten::framework::make_ddim({10}),
pten::DataLayout::NCHW));
auto* dense_y_data =
dense_y.mutable_data<float>(paddle::platform::CPUPlace());
......@@ -189,17 +195,19 @@ TEST(DEV_API, multiply) {
// 1. create tensor
const auto alloc = std::make_unique<paddle::experimental::DefaultAllocator>(
paddle::platform::CPUPlace());
pten::DenseTensor dense_x(alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({3, 10}),
pten::DataLayout::NCHW));
pten::DenseTensor dense_x(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
pten::framework::make_ddim({3, 10}),
pten::DataLayout::NCHW));
auto* dense_x_data =
dense_x.mutable_data<float>(paddle::platform::CPUPlace());
pten::DenseTensor dense_y(alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({10}),
pten::DataLayout::NCHW));
pten::DenseTensor dense_y(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
pten::framework::make_ddim({10}),
pten::DataLayout::NCHW));
auto* dense_y_data =
dense_y.mutable_data<float>(paddle::platform::CPUPlace());
......
......@@ -46,7 +46,7 @@ TEST(DEV_API, flatten) {
pten::DenseTensor dense_x(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({3, 2, 2, 3}),
pten::framework::make_ddim({3, 2, 2, 3}),
pten::DataLayout::NCHW));
auto* dense_x_data =
dense_x.mutable_data<float>(paddle::platform::CPUPlace());
......
......@@ -34,7 +34,7 @@ TEST(DEV_API, dot) {
paddle::platform::CPUPlace());
DenseTensor dense_x(alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({3, 3}),
pten::framework::make_ddim({3, 3}),
pten::DataLayout::NCHW));
auto* dense_x_data =
......@@ -42,7 +42,7 @@ TEST(DEV_API, dot) {
DenseTensor dense_y(alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({3, 3}),
pten::framework::make_ddim({3, 3}),
pten::DataLayout::NCHW));
auto* dense_y_data =
dense_y.mutable_data<float>(paddle::platform::CPUPlace());
......
......@@ -32,10 +32,11 @@ TEST(DEV_API, mean) {
// 1. create tensor
const auto alloc = std::make_unique<paddle::experimental::DefaultAllocator>(
paddle::platform::CPUPlace());
pten::DenseTensor dense_x(alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({3, 4}),
pten::DataLayout::NCHW));
pten::DenseTensor dense_x(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
pten::framework::make_ddim({3, 4}),
pten::DataLayout::NCHW));
auto* dense_x_data =
dense_x.mutable_data<float>(paddle::platform::CPUPlace());
......
......@@ -36,7 +36,7 @@ TEST(DEV_API, reshape) {
pten::DenseTensor dense_x(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({3, 2, 2, 3}),
pten::framework::make_ddim({3, 2, 2, 3}),
pten::DataLayout::NCHW));
auto* dense_x_data =
dense_x.mutable_data<float>(paddle::platform::CPUPlace());
......
......@@ -32,10 +32,11 @@ TEST(DEV_API, scale) {
// 1. create tensor
const auto alloc = std::make_unique<paddle::experimental::DefaultAllocator>(
paddle::platform::CPUPlace());
pten::DenseTensor dense_x(alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({3, 4}),
pten::DataLayout::NCHW));
pten::DenseTensor dense_x(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
pten::framework::make_ddim({3, 4}),
pten::DataLayout::NCHW));
auto* dense_x_data =
dense_x.mutable_data<float>(paddle::platform::CPUPlace());
......@@ -70,10 +71,11 @@ TEST(DEV_API, scale_host) {
// 1. create tensor
const auto alloc = std::make_unique<paddle::experimental::DefaultAllocator>(
paddle::platform::CPUPlace());
pten::DenseTensor dense_x(alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({3, 4}),
pten::DataLayout::NCHW));
pten::DenseTensor dense_x(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
pten::framework::make_ddim({3, 4}),
pten::DataLayout::NCHW));
auto* dense_x_data =
dense_x.mutable_data<float>(paddle::platform::CPUPlace());
for (size_t i = 0; i < 12; ++i) {
......@@ -82,7 +84,7 @@ TEST(DEV_API, scale_host) {
pten::DenseTensor scale(alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({1}),
pten::framework::make_ddim({1}),
pten::DataLayout::NCHW));
scale.data<float>()[0] = 2;
float bias = 1;
......
......@@ -31,10 +31,11 @@ TEST(DEV_API, sum) {
// 1. create tensor
const auto alloc = std::make_unique<paddle::experimental::DefaultAllocator>(
paddle::platform::CPUPlace());
pten::DenseTensor dense_x(alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({3, 4}),
pten::DataLayout::NCHW));
pten::DenseTensor dense_x(
alloc.get(),
pten::DenseTensorMeta(pten::DataType::FLOAT32,
pten::framework::make_ddim({3, 4}),
pten::DataLayout::NCHW));
auto* dense_x_data =
dense_x.mutable_data<float>(paddle::platform::CPUPlace());
......
......@@ -567,13 +567,15 @@ def find_files(pattern, root, recursive=False):
break
headers = (
# paddle level api headers
list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle')) +
list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/pten/api')) + # pten unify api header
list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/pten/api/ext')) + # custom op api
list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/pten/api/include')) + # pten api
list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/pten/common')) + # pten common headers
# For paddle new custom op, only copy data type headers from `paddle/fluid/platform`
# to `paddle/pten/api/ext`,
# pten level api headers (low level api)
list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/pten/core', recursive=True)) + # pten core headers
# utils api headers
['@PADDLE_SOURCE_DIR@/paddle/utils/any.h'] +
['@PADDLE_SOURCE_DIR@/paddle/utils/small_vector.h'])
......@@ -620,8 +622,6 @@ class InstallHeaders(Command):
elif 'third_party' not in header:
# paddle headers
install_dir = re.sub('@PADDLE_SOURCE_DIR@/', '', header)
if 'fluid' in install_dir:
install_dir = "paddle/pten/common/"
else:
# third_party
install_dir = re.sub('${THIRD_PARTY_PATH}', 'third_party', header)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册