diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index dfd93f49e73404427dbeaedd87c8c094068cb76f..e5a7f0d2bef5403e49330afd539ee0702cb528c5 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -216,18 +216,36 @@ copy(inference_lib_dist
         DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/crypto/)
 include_directories(${CMAKE_BINARY_DIR}/../paddle/fluid/framework/io)
 
+# TODO(chenweihang, before 11.27) Here, the header file of pten is copied to
+# the experimental directory, the include path needs to be changed, so the
+# header file path needs to be processed here
+# copy api headers for custom op
 copy(inference_lib_dist
-        SRCS  ${PADDLE_SOURCE_DIR}/paddle/fluid/extension/include/*
-        DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/)
+        SRCS  ${PADDLE_SOURCE_DIR}/paddle/pten/api/ext/*
+        DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/pten/api/ext/)
+copy(inference_lib_dist
+        SRCS  ${PADDLE_SOURCE_DIR}/paddle/pten/api/include/*
+        DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/pten/api/include/)
+copy(inference_lib_dist
+        SRCS  ${PADDLE_SOURCE_DIR}/paddle/pten/api/all.h
+        DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/pten/api/)
+copy(inference_lib_dist
+        SRCS  ${PADDLE_SOURCE_DIR}/paddle/pten/common/*
+        DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/pten/common/)
 copy(inference_lib_dist
         SRCS  ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/complex.h
-        DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/)
+        DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/pten/common/)
 copy(inference_lib_dist
         SRCS  ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/float16.h
-        DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/)
+        DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/pten/common/)
 copy(inference_lib_dist
         SRCS  ${PADDLE_SOURCE_DIR}/paddle/utils/any.h
-        DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/)
+        DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/)
+# In order to be compatible with the original behavior, the header file name needs to be changed
+copy(inference_lib_dist
+        SRCS  ${PADDLE_SOURCE_DIR}/paddle/extension.h
+        DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/ext_all.h)
+
 
 # CAPI inference library for only inference
 set(PADDLE_INFERENCE_C_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_c_install_dir" CACHE STRING
diff --git a/paddle/extension.h b/paddle/extension.h
index 98d4bfd0326c5c524fcac9129f58d0ae99fc8afe..8394f87c31d1bdeaad75e94ad00e5f8c8d3433dd 100644
--- a/paddle/extension.h
+++ b/paddle/extension.h
@@ -15,4 +15,4 @@ limitations under the License. */
 #pragma once
 
 // All paddle apis in C++ frontend
-#include "paddle/extension/include/ext_all.h"
+#include "paddle/pten/api/all.h"
diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt
index dcff02a662e2734bc66d4cf219fce527fd0961aa..369aa64016951ac4429c963ff35e9403dfa1f3ce 100644
--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
@@ -1,3 +1,8 @@
+# Adapt to custom op mechanism: Include the header files related to the data type
+# to avoid exposing the path of the underlying file, remove it after moving
+# float16.h/complex.h/bfloat16.h into pten
+include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform)
+
 add_subdirectory(memory)
 add_subdirectory(platform)
 add_subdirectory(distributed)
diff --git a/paddle/fluid/extension/include/ext_dtype.h b/paddle/fluid/extension/include/ext_dtype.h
deleted file mode 100644
index a0816b65a3d15c9cf1384d1b6f18fa79f9199a83..0000000000000000000000000000000000000000
--- a/paddle/fluid/extension/include/ext_dtype.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-
-#include <cstdint>
-#include <string>
-
-#include "complex.h"        // NOLINT
-#include "ext_exception.h"  // NOLINT
-#include "float16.h"        // NOLINT
-
-namespace paddle {
-
-using complex64 = paddle::platform::complex<float>;
-using complex128 = paddle::platform::complex<double>;
-using float16 = paddle::platform::float16;
-
-enum class DataType {
-  BOOL,
-  INT8,
-  UINT8,
-  INT16,
-  INT32,
-  INT64,
-  FLOAT16,
-  FLOAT32,
-  FLOAT64,
-  COMPLEX64,
-  COMPLEX128,
-  // TODO(JiabinYang) support more data types if needed.
-};
-
-inline std::string ToString(DataType dtype) {
-  switch (dtype) {
-    case DataType::BOOL:
-      return "bool";
-    case DataType::INT8:
-      return "int8_t";
-    case DataType::UINT8:
-      return "uint8_t";
-    case DataType::INT16:
-      return "int16_t";
-    case DataType::INT32:
-      return "int32_t";
-    case DataType::INT64:
-      return "int64_t";
-    case DataType::FLOAT16:
-      return "float16";
-    case DataType::FLOAT32:
-      return "float";
-    case DataType::FLOAT64:
-      return "double";
-    case DataType::COMPLEX64:
-      return "complex64";
-    case DataType::COMPLEX128:
-      return "complex128";
-    default:
-      PD_THROW("Unsupported paddle enum data type.");
-  }
-}
-
-#define PD_FOR_EACH_DATA_TYPE(_)    \
-  _(bool, DataType::BOOL)           \
-  _(int8_t, DataType::INT8)         \
-  _(uint8_t, DataType::UINT8)       \
-  _(int16_t, DataType::INT16)       \
-  _(int, DataType::INT32)           \
-  _(int64_t, DataType::INT64)       \
-  _(float16, DataType::FLOAT16)     \
-  _(float, DataType::FLOAT32)       \
-  _(double, DataType::FLOAT64)      \
-  _(complex64, DataType::COMPLEX64) \
-  _(complex128, DataType::COMPLEX128)
-
-template <paddle::DataType T>
-struct DataTypeToCPPType;
-
-#define PD_SPECIALIZE_DataTypeToCPPType(cpp_type, data_type) \
-  template <>                                                \
-  struct DataTypeToCPPType<data_type> {                      \
-    using type = cpp_type;                                   \
-  };
-
-PD_FOR_EACH_DATA_TYPE(PD_SPECIALIZE_DataTypeToCPPType)
-
-#undef PD_SPECIALIZE_DataTypeToCPPType
-
-}  // namespace paddle
diff --git a/paddle/fluid/extension/include/ext_tensor.h b/paddle/fluid/extension/include/ext_tensor.h
deleted file mode 100644
index 970be905cc2566f7de3bac1c593e237cf6c7dac0..0000000000000000000000000000000000000000
--- a/paddle/fluid/extension/include/ext_tensor.h
+++ /dev/null
@@ -1,148 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <vector>
-
-#ifdef PADDLE_WITH_CUDA
-#include <cuda_runtime.h>
-using gpuStream_t = cudaStream_t;
-#endif
-
-#ifdef PADDLE_WITH_HIP
-#include <hip/hip_runtime.h>
-using gpuStream_t = hipStream_t;
-#endif
-
-#include "ext_dll_decl.h"  // NOLINT
-#include "ext_dtype.h"     // NOLINT
-#include "ext_place.h"     // NOLINT
-
-namespace paddle {
-namespace framework {
-class CustomTensorUtils;
-}  // namespace framework
-
-class StreamWrapper {
- public:
-  StreamWrapper() : stream_(nullptr), is_stream_set_(false) {}
-  void SetStream(void* stream) {
-    stream_ = stream;
-    is_stream_set_ = true;
-  }
-
-  void* GetStream() const { return stream_; }
-
-  bool IsStreamSet() const { return is_stream_set_; }
-
- private:
-  //  cudaStream_t stream_;
-  void* stream_;
-  bool is_stream_set_;
-};
-
-class PD_DLL_DECL Tensor {
- public:
-  /// \brief Construct a Tensor on target Place for CustomOp.
-  /// Generally it's only used for user to create Tensor.
-  explicit Tensor(const PlaceType& place);
-  /// \brief Construct a Tensor on target Place with shape for CustomOp.
-  /// Generally it's only used for user to create Tensor.
-  Tensor(const PlaceType& place, const std::vector<int64_t>& shape);
-  /// \brief Reset the shape of the tensor.
-  /// Generally it's only used for the input tensor.
-  /// Reshape must be called before calling
-  /// mutable_data() or copy_to(const PlaceType& place)
-  /// \param shape The shape to set.
-  void reshape(const std::vector<int64_t>& shape);
-
-  /// \brief Get the memory pointer in CPU or GPU with
-  /// specific data type.
-  /// Please Reshape the tensor first before call this.
-  /// It's usually used to get input data pointer.
-  /// \param place The place of the tensor this will
-  /// override the original place of current tensor.
-  template <typename T>
-  T* mutable_data(const PlaceType& place);
-
-  /// \brief Get the memory pointer in CPU or GPU with
-  /// specific data type. Please Reshape the tensor
-  /// first before call this.It's usually used to get
-  /// input data pointer.
-  template <typename T>
-  T* mutable_data();
-
-  /// \brief Get the memory pointer directly.
-  /// It's usually used to get the output data pointer.
-  /// \return The tensor data buffer pointer.
-  template <typename T>
-  T* data() const;
-
-  /// \brief Copy the host memory to tensor data.
-  /// It's usually used to set the input tensor data.
-  /// \param PlaceType of target place, of which
-  /// the tensor will copy to.
-  template <typename T>
-  Tensor copy_to(const PlaceType& place) const;
-
-  /// \brief Return a sub-tensor of the given tensor.
-  /// It is usually used to extract a sub-tensor (which supports
-  /// modifying the data of the original tensor) to perform further
-  /// operations.
-  /// \param begin_idx The index of the start row (inclusive) to slice.
-  ///                  The index number begins from 0.
-  /// \param end_idx  The index of the end row (exclusive) to slice.
-  ///                 The index number begins from begin_idx + 1.
-  /// \return The sliced tensor.
-  Tensor slice(const int64_t begin_idx, const int64_t end_idx) const;
-
-  /// \brief Return the shape of the Tensor.
-  std::vector<int64_t> shape() const;
-
-  /// \brief Return the data type of the tensor.
-  /// It's usually used to get the output tensor data type.
-  /// \return The data type of the tensor.
-  DataType type() const;
-
-  /// \brief Get the size of current tensor.
-  /// Use this method to get the size of tensor
-  /// \return int64_t.
-  int64_t size() const;
-
-  /// \brief Get the place of current tensor.
-  /// Use this method to get the place of tensor
-  /// \return Place.
-  const PlaceType& place() const;
-
-  /// \brief Cast datatype from one to another
-  Tensor cast(const DataType& target_type) const;
-
-  /// \brief Check Tensor is initialized
-  bool is_initialized() const;
-
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  /// \bref Get current stream of Tensor
-  gpuStream_t stream() const;
-#endif
-
- private:
-  friend class framework::CustomTensorUtils;
-  mutable std::shared_ptr<void> tensor_;
-  mutable PlaceType place_;
-  StreamWrapper stream_;
-};
-
-}  // namespace paddle
diff --git a/paddle/fluid/extension/src/ext_tensor.cc b/paddle/fluid/extension/src/ext_tensor.cc
deleted file mode 100644
index b5cd9e0b5c0e1590f6455c317f7056317f46fab9..0000000000000000000000000000000000000000
--- a/paddle/fluid/extension/src/ext_tensor.cc
+++ /dev/null
@@ -1,456 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/extension/include/ext_tensor.h"
-
-#include <utility>
-
-#include "paddle/fluid/framework/custom_tensor_utils.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/memory/memcpy.h"
-#include "paddle/fluid/platform/complex.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/float16.h"
-#include "paddle/fluid/platform/transform.h"
-
-namespace paddle {
-
-template <typename InType, typename OutType>
-struct CastDataTypeFunctor {
-  HOSTDEVICE inline OutType operator()(InType in) const {
-    return static_cast<OutType>(in);
-  }
-};
-
-template <typename InType>
-struct CastDataType {
-  CastDataType(const framework::Tensor &in, framework::Tensor *out,
-               const platform::DeviceContext *ctx)
-      : in_(in), out_(out), ctx_(ctx) {}
-  const framework::Tensor in_;
-  framework::Tensor *out_;
-  const platform::DeviceContext *ctx_;
-
-  template <typename OutType>
-  void apply() {
-    auto *in_begin = in_.data<InType>();
-    auto *in_end = in_begin + in_.numel();
-    auto *out_begin = out_->mutable_data<OutType>(in_.place());
-
-    if (platform::is_cpu_place(in_.place())) {
-      platform::Transform<platform::CPUDeviceContext> trans;
-      auto *context = static_cast<const platform::CPUDeviceContext *>(ctx_);
-      trans(*context, in_begin, in_end, out_begin,
-            CastDataTypeFunctor<InType, OutType>());
-#if defined(__NVCC__) || defined(__HIPCC__)
-    } else if (platform::is_gpu_place(in_.place())) {
-      platform::Transform<platform::CUDADeviceContext> trans;
-      auto *context = static_cast<const platform::CUDADeviceContext *>(ctx_);
-      trans(*context, in_begin, in_end, out_begin,
-            CastDataTypeFunctor<InType, OutType>());
-      context->Wait();
-#endif
-    } else {
-      PADDLE_THROW(platform::errors::Unimplemented(
-          "Place type is not supported when casting data type."));
-    }
-  }
-};
-
-template <typename T>
-void GpuCopy(T *src, T *dst, PlaceType src_plc, PlaceType dst_plc,
-             int64_t ele_size) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-  int device_num = paddle::platform::GetCurrentDeviceId();
-  platform::CUDAPlace gpu_place(device_num);
-  auto *dev_ctx =
-      static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));
-  if ((src_plc == PlaceType::kGPU) && (dst_plc == PlaceType::kCPU)) {
-    memory::Copy(platform::CPUPlace(), static_cast<void *>(dst), gpu_place, src,
-                 ele_size, dev_ctx->stream());
-  } else if ((src_plc == PlaceType::kGPU) && (dst_plc == PlaceType::kGPU)) {
-    memory::Copy(gpu_place, static_cast<void *>(dst), gpu_place, src, ele_size,
-                 dev_ctx->stream());
-  } else if ((src_plc == PlaceType::kCPU) && (dst_plc == PlaceType::kGPU)) {
-    memory::Copy(gpu_place, static_cast<void *>(dst), platform::CPUPlace(), src,
-                 ele_size, dev_ctx->stream());
-  } else {
-    PADDLE_THROW(platform::errors::Unavailable(
-        "Only GPU related Copy can reach this func."));
-  }
-#ifdef PADDLE_WITH_HIP
-  hipStreamSynchronize(dev_ctx->stream());
-#else
-  cudaStreamSynchronize(dev_ctx->stream());
-#endif
-#endif
-}
-
-#define GET_CASTED_TENSOR                               \
-  if (!tensor_) {                                       \
-    tensor_ = std::make_shared<framework::LoDTensor>(); \
-  }                                                     \
-  auto *tensor = static_cast<framework::LoDTensor *>(tensor_.get());
-
-#define GET_INNER_PLACE                               \
-  platform::Place place;                              \
-  switch (place_) {                                   \
-    case PlaceType::kCPU:                             \
-      place = platform::CPUPlace();                   \
-      break;                                          \
-    case PlaceType::kGPU:                             \
-      place = platform::CUDAPlace();                  \
-      break;                                          \
-    default:                                          \
-      PADDLE_THROW(platform::errors::Unavailable(     \
-          "Custom operator unsupported place id(%d)", \
-          static_cast<int>(place_)));                 \
-  }
-
-void Tensor::reshape(const std::vector<int64_t> &shape) {
-  GET_CASTED_TENSOR
-  auto new_dim = framework::make_ddim(shape);
-  tensor->Resize(new_dim);
-}
-
-Tensor::Tensor(const PlaceType &place)
-    : tensor_(std::make_shared<framework::LoDTensor>()),
-      place_(place),
-      stream_(StreamWrapper()) {}
-
-Tensor::Tensor(const PlaceType &place, const std::vector<int64_t> &shape)
-    : tensor_(std::make_shared<framework::LoDTensor>()),
-      place_(place),
-      stream_(StreamWrapper()) {
-  GET_CASTED_TENSOR
-  tensor->Resize(framework::make_ddim(shape));
-}
-
-template <typename T>
-T *Tensor::mutable_data(const PlaceType &place) {
-  place_ = place;
-  return mutable_data<T>();
-}
-
-template <typename T>
-T *Tensor::mutable_data() {
-  GET_CASTED_TENSOR
-  PADDLE_ENFORCE_GT(
-      tensor->numel(), 0,
-      platform::errors::PreconditionNotMet(
-          "You should call Tensor::Reshape(const std::vector<int> "
-          "&shape)"
-          "function before retrieving mutable_data from input tensor."));
-  switch (static_cast<int>(place_)) {
-    case static_cast<int>(PlaceType::kCPU): {
-      return tensor->mutable_data<T>(platform::CPUPlace());
-    }
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-    case static_cast<int>(PlaceType::kGPU): {
-      int device_num = platform::GetCurrentDeviceId();
-      return tensor->mutable_data<T>(platform::CUDAPlace(device_num));
-    }
-#endif
-    default:
-      PADDLE_THROW(platform::errors::Unavailable(
-          "Custom operator unsupported place id(%d)",
-          static_cast<int>(place_)));
-  }
-}
-
-template <typename T>
-T *Tensor::data() const {
-  GET_CASTED_TENSOR;
-  auto *res = tensor->data<T>();
-  return res;
-}
-
-DataType Tensor::type() const {
-  GET_CASTED_TENSOR;
-  auto type = tensor->type();
-  if (type == framework::proto::VarType::FP32) {
-    return DataType::FLOAT32;
-  } else if (type == framework::proto::VarType::INT64) {
-    return DataType::INT64;
-  } else if (type == framework::proto::VarType::INT32) {
-    return DataType::INT32;
-  } else if (type == framework::proto::VarType::INT16) {
-    return DataType::INT16;
-  } else if (type == framework::proto::VarType::INT8) {
-    return DataType::INT8;
-  } else if (type == framework::proto::VarType::UINT8) {
-    return DataType::UINT8;
-  } else if (type == framework::proto::VarType::FP64) {
-    return DataType::FLOAT64;
-  } else if (type == framework::proto::VarType::BOOL) {
-    return DataType::BOOL;
-  } else if (type == framework::proto::VarType::COMPLEX64) {
-    return DataType::COMPLEX64;
-  } else if (type == framework::proto::VarType::COMPLEX128) {
-    return DataType::COMPLEX128;
-  } else if (type == framework::proto::VarType::FP16) {
-    return DataType::FLOAT16;
-  }
-  // TODO(JiabinYang) Support more dtype here
-  return DataType::FLOAT32;
-}
-
-template <typename T>
-Tensor Tensor::copy_to(const PlaceType &target_place) const {
-  GET_CASTED_TENSOR;
-  PADDLE_ENFORCE_GE(tensor->numel(), 0,
-                    platform::errors::PreconditionNotMet(
-                        "You should call Tensor::Reshape(const "
-                        "std::vector<int> &shape)"
-                        "function before copying data from cpu."));
-  size_t ele_size = tensor->numel() * sizeof(T);
-  auto *p_src_data = tensor->data<T>();
-  auto src_place = place();
-  Tensor target = Tensor(target_place);
-  target.reshape(shape());
-  auto *p_target_data = target.template mutable_data<T>();
-
-  if ((src_place == PlaceType::kCPU) && (target_place == PlaceType::kCPU)) {
-    std::memcpy(static_cast<void *>(p_target_data), p_src_data, ele_size);
-  } else if ((src_place == PlaceType::kGPU) &&
-             (target_place == PlaceType::kCPU)) {
-    GpuCopy<T>(p_src_data, p_target_data, src_place, target_place, ele_size);
-  } else if ((src_place == PlaceType::kCPU) &&
-             (target_place == PlaceType::kGPU)) {
-    GpuCopy<T>(p_src_data, p_target_data, src_place, target_place, ele_size);
-  } else if ((src_place == PlaceType::kGPU) &&
-             (target_place == PlaceType::kGPU)) {
-    GpuCopy<T>(p_src_data, p_target_data, src_place, target_place, ele_size);
-  } else {
-    PADDLE_THROW(platform::errors::Unavailable(
-        "Not supported place transform of place: %d to place: %d",
-        static_cast<int>(src_place), static_cast<int>(target_place)));
-  }
-  return target;
-}
-
-Tensor Tensor::slice(const int64_t begin_idx, const int64_t end_idx) const {
-  GET_CASTED_TENSOR
-  GET_INNER_PLACE
-  framework::Tensor intermediate = tensor->Slice(begin_idx, end_idx);
-  Tensor target = Tensor(place_);
-  framework::CustomTensorUtils::ShareDataFrom(
-      static_cast<const void *>(&intermediate), target);
-  return target;
-}
-
-template PD_DLL_DECL Tensor
-Tensor::copy_to<float>(const PlaceType &target_place) const;
-template PD_DLL_DECL Tensor
-Tensor::copy_to<double>(const PlaceType &target_place) const;
-template PD_DLL_DECL Tensor
-Tensor::copy_to<int64_t>(const PlaceType &target_place) const;
-template PD_DLL_DECL Tensor
-Tensor::copy_to<int32_t>(const PlaceType &target_place) const;
-template PD_DLL_DECL Tensor
-Tensor::copy_to<uint8_t>(const PlaceType &target_place) const;
-template PD_DLL_DECL Tensor
-Tensor::copy_to<int8_t>(const PlaceType &target_place) const;
-template PD_DLL_DECL Tensor
-Tensor::copy_to<int16_t>(const PlaceType &target_place) const;
-template PD_DLL_DECL Tensor
-Tensor::copy_to<bool>(const PlaceType &target_place) const;
-template PD_DLL_DECL Tensor Tensor::copy_to<paddle::platform::complex<float>>(
-    const PlaceType &target_place) const;
-template PD_DLL_DECL Tensor Tensor::copy_to<paddle::platform::complex<double>>(
-    const PlaceType &target_place) const;
-template PD_DLL_DECL Tensor
-Tensor::copy_to<paddle::platform::float16>(const PlaceType &target_place) const;
-
-template PD_DLL_DECL float *Tensor::data<float>() const;
-template PD_DLL_DECL double *Tensor::data<double>() const;
-template PD_DLL_DECL int64_t *Tensor::data<int64_t>() const;
-template PD_DLL_DECL int32_t *Tensor::data<int32_t>() const;
-template PD_DLL_DECL uint8_t *Tensor::data<uint8_t>() const;
-template PD_DLL_DECL int8_t *Tensor::data<int8_t>() const;
-template PD_DLL_DECL int16_t *Tensor::data<int16_t>() const;
-template PD_DLL_DECL bool *Tensor::data<bool>() const;
-template PD_DLL_DECL paddle::platform::complex<float>
-    *Tensor::data<paddle::platform::complex<float>>() const;
-template PD_DLL_DECL paddle::platform::complex<double>
-    *Tensor::data<paddle::platform::complex<double>>() const;
-template PD_DLL_DECL paddle::platform::float16 *
-Tensor::data<paddle::platform::float16>() const;
-
-template PD_DLL_DECL float *Tensor::mutable_data<float>();
-template PD_DLL_DECL double *Tensor::mutable_data<double>();
-template PD_DLL_DECL int64_t *Tensor::mutable_data<int64_t>();
-template PD_DLL_DECL int32_t *Tensor::mutable_data<int32_t>();
-template PD_DLL_DECL uint8_t *Tensor::mutable_data<uint8_t>();
-template PD_DLL_DECL int8_t *Tensor::mutable_data<int8_t>();
-template PD_DLL_DECL int16_t *Tensor::mutable_data<int16_t>();
-template PD_DLL_DECL bool *Tensor::mutable_data<bool>();
-template PD_DLL_DECL paddle::platform::complex<float>
-    *Tensor::mutable_data<paddle::platform::complex<float>>();
-template PD_DLL_DECL paddle::platform::complex<double>
-    *Tensor::mutable_data<paddle::platform::complex<double>>();
-template PD_DLL_DECL paddle::platform::float16 *
-Tensor::mutable_data<paddle::platform::float16>();
-
-template PD_DLL_DECL float *Tensor::mutable_data<float>(const PlaceType &place);
-template PD_DLL_DECL double *Tensor::mutable_data<double>(
-    const PlaceType &place);
-template PD_DLL_DECL int64_t *Tensor::mutable_data<int64_t>(
-    const PlaceType &place);
-template PD_DLL_DECL int32_t *Tensor::mutable_data<int32_t>(
-    const PlaceType &place);
-template PD_DLL_DECL uint8_t *Tensor::mutable_data<uint8_t>(
-    const PlaceType &place);
-template PD_DLL_DECL int8_t *Tensor::mutable_data<int8_t>(
-    const PlaceType &place);
-template PD_DLL_DECL int16_t *Tensor::mutable_data<int16_t>(
-    const PlaceType &place);
-template PD_DLL_DECL bool *Tensor::mutable_data<bool>(const PlaceType &place);
-template PD_DLL_DECL paddle::platform::complex<float> *
-Tensor::mutable_data<paddle::platform::complex<float>>(const PlaceType &place);
-template PD_DLL_DECL paddle::platform::complex<double> *
-Tensor::mutable_data<paddle::platform::complex<double>>(const PlaceType &place);
-template PD_DLL_DECL paddle::platform::float16 *
-Tensor::mutable_data<paddle::platform::float16>(const PlaceType &place);
-
-std::vector<int64_t> Tensor::shape() const {
-  GET_CASTED_TENSOR
-  return framework::vectorize<int64_t>(tensor->dims());
-}
-
-const PlaceType &Tensor::place() const {
-  GET_CASTED_TENSOR;
-  if (platform::is_cpu_place(tensor->place())) {
-    place_ = PlaceType::kCPU;
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  } else if (platform::is_gpu_place(tensor->place())) {
-    place_ = PlaceType::kGPU;
-#endif
-  } else {
-    PADDLE_THROW(platform::errors::Unimplemented(
-        "Current Tensor hold unsupported Place Type, Please Init it"
-        "using Tensor::mutable_data<T>(PaddlePlace) with T among:"
-        "Place::kCPU or Place::kGPU"));
-  }
-  return place_;
-}
-
-Tensor Tensor::cast(const DataType &target_type) const {
-  GET_CASTED_TENSOR;
-  Tensor rlt = Tensor(place());
-  rlt.reshape(this->shape());
-  auto rlt_tensor_ = static_cast<framework::LoDTensor *>(rlt.tensor_.get());
-  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-  auto ctx = pool.Get(tensor->place());
-  auto src_type = tensor->type();
-  auto dst_type =
-      framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType(target_type);
-  switch (src_type) {
-    case framework::proto::VarType::FP32:
-      framework::VisitDataType(dst_type,
-                               CastDataType<float>(*tensor, rlt_tensor_, ctx));
-      break;
-    case framework::proto::VarType::FP64:
-      framework::VisitDataType(dst_type,
-                               CastDataType<double>(*tensor, rlt_tensor_, ctx));
-      break;
-    case framework::proto::VarType::INT32:
-      framework::VisitDataType(dst_type,
-                               CastDataType<int>(*tensor, rlt_tensor_, ctx));
-      break;
-    case framework::proto::VarType::INT64:
-      framework::VisitDataType(
-          dst_type, CastDataType<int64_t>(*tensor, rlt_tensor_, ctx));
-      break;
-    case framework::proto::VarType::BOOL:
-      framework::VisitDataType(dst_type,
-                               CastDataType<bool>(*tensor, rlt_tensor_, ctx));
-      break;
-    case framework::proto::VarType::INT16:
-      framework::VisitDataType(
-          dst_type, CastDataType<int16_t>(*tensor, rlt_tensor_, ctx));
-      break;
-    case framework::proto::VarType::UINT8:
-      framework::VisitDataType(
-          dst_type, CastDataType<uint8_t>(*tensor, rlt_tensor_, ctx));
-      break;
-    case framework::proto::VarType::COMPLEX64:
-      framework::VisitDataType(dst_type,
-                               CastDataType<paddle::platform::complex<float>>(
-                                   *tensor, rlt_tensor_, ctx));
-      break;
-    case framework::proto::VarType::COMPLEX128:
-      framework::VisitDataType(dst_type,
-                               CastDataType<paddle::platform::complex<double>>(
-                                   *tensor, rlt_tensor_, ctx));
-      break;
-    case framework::proto::VarType::FP16:
-      framework::VisitDataType(
-          dst_type,
-          CastDataType<paddle::platform::float16>(*tensor, rlt_tensor_, ctx));
-      break;
-    // TODO(JiabinYang) Support more dtype here
-    default:
-      PADDLE_THROW(platform::errors::Unimplemented(
-          "Data type (%s) is not supported when casting data type.",
-          framework::DataTypeToString(src_type)));
-  }
-  return rlt;
-}
-
-int64_t Tensor::size() const {
-  GET_CASTED_TENSOR;
-  return tensor->numel();
-}
-
-bool Tensor::is_initialized() const {
-  GET_CASTED_TENSOR;
-  if (tensor->IsInitialized()) {
-    return true;
-  } else {
-    return false;
-  }
-}
-
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-gpuStream_t Tensor::stream() const {
-  if (!stream_.IsStreamSet()) {
-    PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "Stream is not Set, only input tensor will have "
-        "stream which is set by framework "));
-  } else {
-    return reinterpret_cast<gpuStream_t>(stream_.GetStream());
-  }
-}
-#endif
-
-namespace framework {
-
-void CustomTensorUtils::ShareDataTo(const paddle::Tensor &src, void *dst) {
-  static_cast<framework::LoDTensor *>(dst)->ShareDataWith(
-      *static_cast<framework::LoDTensor *>(src.tensor_.get()));
-}
-
-void CustomTensorUtils::ShareDataFrom(const void *src,
-                                      const paddle::Tensor &dst) {
-  if (!dst.tensor_) {
-    dst.tensor_ = std::make_shared<framework::LoDTensor>();
-  }
-  auto *tensor = static_cast<framework::LoDTensor *>(dst.tensor_.get());
-  tensor->ShareDataWith(*static_cast<const framework::LoDTensor *>(src));
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/extension/src/ext_tensor.cu b/paddle/fluid/extension/src/ext_tensor.cu
deleted file mode 120000
index 23ae523331877f1ee4818dba4618259e0914aeb4..0000000000000000000000000000000000000000
--- a/paddle/fluid/extension/src/ext_tensor.cu
+++ /dev/null
@@ -1 +0,0 @@
-ext_tensor.cc
\ No newline at end of file
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 1fcb9d2ca6607a2b1e826d8450f32f8ee61c0bd2..1b45193e3d0a3455d81e7056f3790a6ff26c2d25 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -431,34 +431,7 @@ message(STATUS "branch: ${PADDLE_BRANCH}")
 
 configure_file(commit.h.in commit.h)
 
-# Adapt to custom op mechanism: Include the header files related to the data type
-# to avoid exposing the path of the underlying file
-include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform)
-include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/extension/include)
-include_directories(${PADDLE_SOURCE_DIR}/paddle/utils)
-
-if (WITH_GPU)
-  if (WIN32)
-    windows_symbolic(ext_tensor_cu SRCS ext_tensor.cu PATH ../extension/src)
-    nv_library(custom_tensor SRCS ../extension/src/.ext_tensor.cu DEPS lod_tensor memory enforce)
-    add_dependencies(custom_tensor ext_tensor_cu)
-  else()
-    nv_library(custom_tensor SRCS ../extension/src/ext_tensor.cu DEPS lod_tensor memory enforce)
-  endif(WIN32)
-elseif (WITH_ROCM)
-  hip_library(custom_tensor SRCS ../extension/src/ext_tensor.cu DEPS lod_tensor memory enforce)
-else()
-  cc_library(custom_tensor SRCS ../extension/src/ext_tensor.cc DEPS lod_tensor memory enforce)
-endif()
-
-cc_library(op_meta_info SRCS ../extension/src/ext_op_meta_info.cc DEPS custom_tensor)
-cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framework_proto op_registry operator dynamic_loader string_helper custom_tensor op_meta_info)
-
-if(WITH_ROCM)
-  hip_test(custom_tensor_test SRCS custom_tensor_test.cc DEPS custom_tensor glog)
-else()
-  cc_test(custom_tensor_test SRCS custom_tensor_test.cc DEPS custom_tensor glog)
-endif()
+cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framework_proto op_registry operator dynamic_loader string_helper pten_tensor op_meta_info pten_api)
 
 #cc_binary(test_executor SRCS test_executor.cc DEPS executor op_registry ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} )
 #cc_binary(new_executor SRCS new_exec_test.cc DEPS operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler)
diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc
index bb8258dcd9228f53e106642de19ae8ad96ab7eec..88cfd74cdaf99228238457f9d2f16f57677c468a 100644
--- a/paddle/fluid/framework/custom_operator.cc
+++ b/paddle/fluid/framework/custom_operator.cc
@@ -25,15 +25,18 @@ limitations under the License. */
 #include <utility>
 #include <vector>
 
-#include "paddle/fluid/extension/include/ext_tensor.h"
 #include "paddle/fluid/framework/attribute.h"
-#include "paddle/fluid/framework/custom_tensor_utils.h"
 #include "paddle/fluid/framework/op_meta_info_helper.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/pten_utils.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
 #include "paddle/fluid/string/string_helper.h"
+#include "paddle/pten/api/all.h"
+#include "paddle/pten/api/lib/ext_compat_utils.h"
+#include "paddle/pten/api/lib/utils/tensor_utils.h"
+#include "paddle/pten/core/convert_utils.h"
 #include "paddle/utils/any.h"
 
 namespace paddle {
@@ -128,10 +131,8 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx,
                               "The %d-th tensor in input vector<tensor> (%s) "
                               "is not initialized.",
                               i, in_name));
-        auto custom_t = paddle::Tensor(
-            CustomTensorUtils::ConvertInnerPlaceToEnumPlace(x->place()));
-        CustomTensorUtils::ShareDataFrom(static_cast<const void*>(x), custom_t);
-        CustomTensorUtils::SetTensorCurrentStream(&custom_t, ctx.GetPlace());
+        paddle::Tensor custom_t;
+        custom_t.set_impl(std::move(experimental::MakePtenDenseTensor(*x)));
         custom_vec_in.emplace_back(custom_t);
       }
       custom_vec_ins.emplace_back(custom_vec_in);
@@ -142,10 +143,8 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx,
       PADDLE_ENFORCE_EQ(x->IsInitialized(), true,
                         platform::errors::InvalidArgument(
                             "Input tensor (%s) is not initialized.", in_name));
-      auto custom_in = paddle::Tensor(
-          CustomTensorUtils::ConvertInnerPlaceToEnumPlace(x->place()));
-      CustomTensorUtils::ShareDataFrom(static_cast<const void*>(x), custom_in);
-      CustomTensorUtils::SetTensorCurrentStream(&custom_in, ctx.GetPlace());
+      paddle::Tensor custom_in;
+      custom_in.set_impl(std::move(experimental::MakePtenDenseTensor(*x)));
       custom_ins.emplace_back(custom_in);
     }
   }
@@ -207,11 +206,17 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx,
                 "Tensors.",
                 vec_true_outs.size(), outs.size()));
         for (size_t j = 0; j < vec_true_outs.size(); ++j) {
-          CustomTensorUtils::ShareDataTo(outs.at(j), vec_true_outs.at(j));
+          experimental::MovesStorage(
+              std::dynamic_pointer_cast<pten::DenseTensor>(outs.at(j).impl())
+                  .get(),
+              vec_true_outs.at(j));
         }
       } else {
         auto* true_out = ctx.Output<Tensor>(out_name);
-        CustomTensorUtils::ShareDataTo(outs.at(i), true_out);
+        experimental::MovesStorage(
+            std::dynamic_pointer_cast<pten::DenseTensor>(outs.at(i).impl())
+                .get(),
+            true_out);
       }
     }
   } catch (platform::EnforceNotMet& exception) {
@@ -479,8 +484,7 @@ void RegisterOperatorKernelWithPlace(const std::string& name,
                                      const std::vector<std::string>& inputs,
                                      const std::vector<std::string>& outputs,
                                      const std::vector<std::string>& attrs) {
-  OpKernelType key(type,
-                   CustomTensorUtils::ConvertEnumPlaceToInnerPlace(place));
+  OpKernelType key(type, experimental::ConvertExtPlaceToInnerPlace(place));
   VLOG(1) << "Custom Operator: op kernel key: " << key;
   OperatorWithKernel::AllOpKernels()[name][key] =
       [kernel_func, inputs, outputs,
@@ -717,14 +721,12 @@ void RegisterOperatorWithMetaInfo(
           std::vector<DataType> vec_custom_dtype;
           for (size_t i = 0; i < ctx->InputSize(in_name); ++i) {
             auto dtype = ctx->GetInputDataType(in_name, i);
-            vec_custom_dtype.emplace_back(
-                CustomTensorUtils::ConvertInnerDTypeToEnumDType(dtype));
+            vec_custom_dtype.emplace_back(pten::TransToPtenDataType(dtype));
           }
           vec_input_dtypes.emplace_back(vec_custom_dtype);
         } else {
           auto dtype = ctx->GetInputDataType(in_name);
-          input_dtypes.emplace_back(
-              CustomTensorUtils::ConvertInnerDTypeToEnumDType(dtype));
+          input_dtypes.emplace_back(pten::TransToPtenDataType(dtype));
         }
       }
 
@@ -736,14 +738,12 @@ void RegisterOperatorWithMetaInfo(
         auto out_name = op_outputs[i];
         if (detail::IsDuplicableVar(out_name)) {
           for (size_t j = 0; j < output_dtypes.size(); ++j) {
-            auto dtype = CustomTensorUtils::ConvertEnumDTypeToInnerDType(
-                output_dtypes[i]);
+            auto dtype = pten::TransToProtoVarType(output_dtypes[i]);
             ctx->SetOutputDataType(out_name, dtype, j);
           }
         } else {
-          ctx->SetOutputDataType(
-              out_name, CustomTensorUtils::ConvertEnumDTypeToInnerDType(
-                            output_dtypes[i]));
+          ctx->SetOutputDataType(out_name,
+                                 pten::TransToProtoVarType(output_dtypes[i]));
         }
       }
     };
diff --git a/paddle/fluid/framework/custom_operator.h b/paddle/fluid/framework/custom_operator.h
index 259901c09f3e00729876d7bea062237ad5bad94a..d8712e60d24ba5d440eaa35d8ea15d1d87df549d 100644
--- a/paddle/fluid/framework/custom_operator.h
+++ b/paddle/fluid/framework/custom_operator.h
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #include <string>
 
-#include "paddle/fluid/extension/include/ext_op_meta_info.h"
+#include "paddle/pten/api/ext/op_meta_info.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/custom_tensor_utils.h b/paddle/fluid/framework/custom_tensor_utils.h
deleted file mode 100644
index d7bde04b84b1615d2f7e2f56e4b1218741faf674..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/custom_tensor_utils.h
+++ /dev/null
@@ -1,155 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-
-#include "paddle/fluid/extension/include/ext_tensor.h"
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/gpu_info.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace framework {
-
-class CustomTensorUtils {
- public:
-  /// \brief Share data TO another tensor.
-  /// Use this to pass tensor from op to op
-  /// \return void.
-  static void ShareDataTo(const paddle::Tensor& src, void* dst);
-
-  /// \brief Share data FROM another tensor.
-  /// Use this to pass tensor from op to op
-  /// \return void.
-  static void ShareDataFrom(const void* src, const paddle::Tensor& dst);
-
-  static framework::proto::VarType::Type ConvertEnumDTypeToInnerDType(
-      const paddle::DataType& dtype) {
-    switch (dtype) {
-      case paddle::DataType::FLOAT64:
-        return framework::proto::VarType::FP64;
-      case paddle::DataType::FLOAT32:
-        return framework::proto::VarType::FP32;
-      case paddle::DataType::UINT8:
-        return framework::proto::VarType::UINT8;
-      case paddle::DataType::INT8:
-        return framework::proto::VarType::INT8;
-      case paddle::DataType::INT32:
-        return framework::proto::VarType::INT32;
-      case paddle::DataType::INT64:
-        return framework::proto::VarType::INT64;
-      case paddle::DataType::INT16:
-        return framework::proto::VarType::INT16;
-      case paddle::DataType::COMPLEX64:
-        return framework::proto::VarType::COMPLEX64;
-      case paddle::DataType::COMPLEX128:
-        return framework::proto::VarType::COMPLEX128;
-      case paddle::DataType::FLOAT16:
-        return framework::proto::VarType::FP16;
-      case paddle::DataType::BOOL:
-        return framework::proto::VarType::BOOL;
-      default:
-        PADDLE_THROW(platform::errors::Unimplemented(
-            "Unsupported data type code(%d) when casting enum data type into "
-            "paddle data type.",
-            static_cast<int>(dtype)));
-    }
-  }
-
-  static paddle::DataType ConvertInnerDTypeToEnumDType(
-      const framework::proto::VarType::Type& dtype) {
-    switch (dtype) {
-      case framework::proto::VarType::FP64:
-        return paddle::DataType::FLOAT64;
-      case framework::proto::VarType::FP32:
-        return paddle::DataType::FLOAT32;
-      case framework::proto::VarType::INT64:
-        return paddle::DataType::INT64;
-      case framework::proto::VarType::INT32:
-        return paddle::DataType::INT32;
-      case framework::proto::VarType::INT8:
-        return paddle::DataType::INT8;
-      case framework::proto::VarType::UINT8:
-        return paddle::DataType::UINT8;
-      case framework::proto::VarType::INT16:
-        return paddle::DataType::INT16;
-      case framework::proto::VarType::COMPLEX64:
-        return paddle::DataType::COMPLEX64;
-      case framework::proto::VarType::COMPLEX128:
-        return paddle::DataType::COMPLEX128;
-      case framework::proto::VarType::FP16:
-        return paddle::DataType::FLOAT16;
-      case framework::proto::VarType::BOOL:
-        return paddle::DataType::BOOL;
-      default:
-        PADDLE_THROW(platform::errors::Unimplemented(
-            "Unsupported data type `%s` when casting paddle data type into "
-            "enum data type.",
-            DataTypeToString(dtype)));
-    }
-  }
-
-  // PaddlePlace <-> platform::Place
-  static platform::Place ConvertEnumPlaceToInnerPlace(const PlaceType& pc) {
-    if (pc == PlaceType::kCPU) {
-      return platform::Place(platform::CPUPlace());
-    } else if (pc == PlaceType::kGPU) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-      return platform::Place(
-          platform::CUDAPlace(platform::GetCurrentDeviceId()));
-#endif
-    } else {
-      PADDLE_THROW(platform::errors::Unimplemented(
-          "Unsupported place type code(%d) when "
-          "casting enum place to paddle place.",
-          static_cast<int>(pc)));
-    }
-    return platform::Place();
-  }
-
-  static PlaceType ConvertInnerPlaceToEnumPlace(const platform::Place& pc) {
-    if (platform::is_cpu_place(pc)) {
-      return PlaceType::kCPU;
-    } else if (platform::is_gpu_place(pc)) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-      return PlaceType::kGPU;
-#endif
-    } else {
-      PADDLE_THROW(
-          platform::errors::Unimplemented("Unsupported place type `%s` when "
-                                          "casting paddle place to enum place.",
-                                          pc));
-    }
-    return PlaceType::kUNK;
-  }
-
-  static void SetTensorCurrentStream(paddle::Tensor* src,
-                                     const platform::Place& pc) {
-    if (platform::is_gpu_place(pc)) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-      auto* dev_ctx = static_cast<platform::CUDADeviceContext*>(
-          platform::DeviceContextPool::Instance().Get(pc));
-      src->stream_.SetStream(reinterpret_cast<void*>(dev_ctx->stream()));
-#endif
-    } else {
-      return;
-    }
-  }
-};
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/op_meta_info_helper.h b/paddle/fluid/framework/op_meta_info_helper.h
index c70fe2f38ab63d7ddb4581f55cb121fa514870fe..b5b84dc5768e4d5757a84538b9b12b2f65de0c38 100644
--- a/paddle/fluid/framework/op_meta_info_helper.h
+++ b/paddle/fluid/framework/op_meta_info_helper.h
@@ -17,7 +17,7 @@ limitations under the License. */
 #include <string>
 #include <vector>
 
-#include "paddle/fluid/extension/include/ext_op_meta_info.h"
+#include "paddle/pten/api/ext/op_meta_info.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index 09c72cb13b8033bdbd7e81d23707eb8f243ee7a3..3c66e35abf1d59a43e86541614a56ad78f957416 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -37,11 +37,6 @@ endif()
 get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
 get_property(pten_modules GLOBAL PROPERTY PTEN_MODULES)
 
-# Adapt to custom op mechanism: Include the header files related to the data type
-# to avoid exposing the path of the underlying file
-include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform)
-include_directories(${PADDLE_SOURCE_DIR}/paddle/utils)
-
 add_subdirectory(api)
 
 # Create static inference library if needed
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index ad0647236acb969dd32b7564a943db14ea83ee65..b1408995fa157bbbf5c52eae928d3226138de2e7 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -24,7 +24,6 @@
 #include <utility>
 #include <vector>
 
-#include "paddle/fluid/extension/include/ext_op_meta_info.h"
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
@@ -46,6 +45,7 @@
 #include "paddle/fluid/platform/gpu_info.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
+#include "paddle/pten/api/ext/op_meta_info.h"
 
 #ifdef PADDLE_WITH_MKLML
 #include "paddle/fluid/platform/dynload/mklml.h"
diff --git a/paddle/fluid/inference/api/helper.cc b/paddle/fluid/inference/api/helper.cc
index d78560239de50eb224641583d62b55bac75be465..630e85778c3307c7f24719be8f8102b72278c91d 100644
--- a/paddle/fluid/inference/api/helper.cc
+++ b/paddle/fluid/inference/api/helper.cc
@@ -13,9 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/extension/include/ext_op_meta_info.h"
 #include "paddle/fluid/framework/custom_operator.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/pten/api/ext/op_meta_info.h"
 
 namespace paddle {
 namespace inference {
diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc
index 28c209018d662d054c27fa68d6b87f3eb58a7e31..ad726f2bf19b6ab48e86ef9894f322695b6449ab 100644
--- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc
+++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc
@@ -22,6 +22,7 @@ template <typename T>
 class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel<T> {
   using MPDType = typename details::MPTypeTrait<T>::Type;
   using XPUTyp = typename XPUTypeTrait<T>::Type;
+  using float16 = typename XPUTypeTrait<paddle::platform::float16>::Type;
 
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
diff --git a/paddle/fluid/operators/cast_op_xpu.cc b/paddle/fluid/operators/cast_op_xpu.cc
index c1a296f2b2788d9c1a32c36b2688b11b8751ded2..c3e23831e65286d3d65e8ae80933e0725ad85f27 100644
--- a/paddle/fluid/operators/cast_op_xpu.cc
+++ b/paddle/fluid/operators/cast_op_xpu.cc
@@ -29,6 +29,7 @@ namespace plat = paddle::platform;
 template <typename DeviceContext, typename InT>
 class CastXPUKernel : public framework::OpKernel<InT> {
   using XPUInTDType = typename XPUTypeTrait<InT>::Type;
+  using float16 = typename XPUTypeTrait<paddle::platform::float16>::Type;
 
  public:
   void Compute(const framework::ExecutionContext& context) const override {
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 595c833cbfa8a065bc7ee4d5c97fcf407186aec3..47027d69f82e00b741a67a6406ac01c0b268cf27 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -1,8 +1,3 @@
-# Adapt to custom op mechanism: Include the header files related to the data type
-# to avoid exposing the path of the underlying file
-include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform)
-include_directories(${PADDLE_SOURCE_DIR}/paddle/utils)
-
 set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapper prune
   feed_fetch_method pass generate_pass pass_builder parallel_executor profiler layer tracer engine scope_pool
   analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context
diff --git a/paddle/pten/CMakeLists.txt b/paddle/pten/CMakeLists.txt
index 8fc63e8317e9e57c71c6f39ea405bfada7974991..2c2004162452c5da06a1a413d33737b236ea4516 100644
--- a/paddle/pten/CMakeLists.txt
+++ b/paddle/pten/CMakeLists.txt
@@ -1,3 +1,8 @@
+# Adapt to custom op mechanism: Include the header files related to the data type
+# to avoid exposing the path of the underlying file, remove it after moving
+# float16.h/complex.h/bfloat16.h into pten
+include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform)
+
 # pten (low level) api headers: include
 # pten (high level) api
 add_subdirectory(api)
@@ -20,4 +25,5 @@ endif()
 if(WITH_XPU)
   set(PTEN_DEPS ${PTEN_DEPS} manipulation_xpu)
 endif()
+
 cc_library(pten SRCS all.cc DEPS ${PTEN_DEPS})
diff --git a/paddle/pten/all.h b/paddle/pten/all.h
index e8e41a8c3eabc860799dfd63b33ea8dc2a4c232b..f9532eb33d897998e2521d04daf83d680630c4ce 100644
--- a/paddle/pten/all.h
+++ b/paddle/pten/all.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 
-// develop apis
+// developer apis
 #include "paddle/pten/include/core.h"
 #include "paddle/pten/include/creation.h"
 #include "paddle/pten/include/infershape.h"
diff --git a/paddle/pten/api/all.h b/paddle/pten/api/all.h
index 3a2c7b3fa98fe4adb813e7178bb6ebfad84fbec6..e4d079de9d41aa1c9a417a32a9855ec92f40f34c 100644
--- a/paddle/pten/api/all.h
+++ b/paddle/pten/api/all.h
@@ -14,9 +14,40 @@ limitations under the License. */
 
 #pragma once
 
-// user apis
+#if !defined(_MSC_VER) && __cplusplus < 201402L
+#error C++14 or later compatible compiler is required to use Paddle.
+#endif
+
+#ifdef _WIN32
+#ifndef NOMINMAX
+#define NOMINMAX  // msvc max/min macro conflict with std::min/max
+#endif
+#endif
+
+// new pten apis
 #include "paddle/pten/api/include/creation.h"
 #include "paddle/pten/api/include/linalg.h"
 #include "paddle/pten/api/include/manipulation.h"
 #include "paddle/pten/api/include/math.h"
 #include "paddle/pten/api/include/tensor.h"
+
+// pten common headers
+#include "paddle/pten/common/backend.h"
+#include "paddle/pten/common/data_type.h"
+#include "paddle/pten/common/layout.h"
+#include "paddle/pten/common/scalar.h"
+
+// original custom op headers
+#include "paddle/pten/api/ext/dispatch.h"
+#include "paddle/pten/api/ext/dll_decl.h"
+#include "paddle/pten/api/ext/exception.h"
+#include "paddle/pten/api/ext/op_meta_info.h"
+#include "paddle/pten/api/ext/place.h"
+
+// api symbols declare, remove in the future
+#include "paddle/pten/api/include/registry.h"
+
+PT_DECLARE_API(Creation);
+PT_DECLARE_API(Linalg);
+PT_DECLARE_API(Manipulation);
+PT_DECLARE_API(Math);
diff --git a/paddle/fluid/extension/include/ext_dispatch.h b/paddle/pten/api/ext/dispatch.h
similarity index 51%
rename from paddle/fluid/extension/include/ext_dispatch.h
rename to paddle/pten/api/ext/dispatch.h
index 9b3e199708adc93356c214df3be217f67d2e8949..2b90bd77943f5d289d1057f388aa5f41c7d25fad 100644
--- a/paddle/fluid/extension/include/ext_dispatch.h
+++ b/paddle/pten/api/ext/dispatch.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#include "ext_dtype.h"      // NOLINT
-#include "ext_exception.h"  // NOLINT
+#include "paddle/pten/api/ext/exception.h"
+#include "paddle/pten/common/data_type.h"
 
 namespace paddle {
 
@@ -37,30 +37,32 @@ namespace paddle {
   [&] {                                                                   \
     const auto& __dtype__ = TYPE;                                         \
     switch (__dtype__) {                                                  \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT32, float,      \
-                           __VA_ARGS__)                                   \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT64, double,     \
-                           __VA_ARGS__)                                   \
+      PD_PRIVATE_CASE_TYPE(                                               \
+          NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__)          \
+      PD_PRIVATE_CASE_TYPE(                                               \
+          NAME, ::paddle::DataType::FLOAT64, double, __VA_ARGS__)         \
       default:                                                            \
         PD_THROW("function " #NAME " is not implemented for data type `", \
-                 ::paddle::ToString(__dtype__), "`");                     \
+                 __dtype__,                                               \
+                 "`");                                                    \
     }                                                                     \
   }()
 
-#define PD_DISPATCH_FLOATING_AND_HALF_TYPES(TYPE, NAME, ...)                   \
-  [&] {                                                                        \
-    const auto& __dtype__ = TYPE;                                              \
-    switch (__dtype__) {                                                       \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT32, float,           \
-                           __VA_ARGS__)                                        \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT64, double,          \
-                           __VA_ARGS__)                                        \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT16, paddle::float16, \
-                           __VA_ARGS__)                                        \
-      default:                                                                 \
-        PD_THROW("function " #NAME " is not implemented for data type `",      \
-                 ::paddle::ToString(__dtype__), "`");                          \
-    }                                                                          \
+#define PD_DISPATCH_FLOATING_AND_HALF_TYPES(TYPE, NAME, ...)               \
+  [&] {                                                                    \
+    const auto& __dtype__ = TYPE;                                          \
+    switch (__dtype__) {                                                   \
+      PD_PRIVATE_CASE_TYPE(                                                \
+          NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__)           \
+      PD_PRIVATE_CASE_TYPE(                                                \
+          NAME, ::paddle::DataType::FLOAT64, double, __VA_ARGS__)          \
+      PD_PRIVATE_CASE_TYPE(                                                \
+          NAME, ::paddle::DataType::FLOAT16, paddle::float16, __VA_ARGS__) \
+      default:                                                             \
+        PD_THROW("function " #NAME " is not implemented for data type `",  \
+                 __dtype__,                                                \
+                 "`");                                                     \
+    }                                                                      \
   }()
 
 ///////// Integral Dispatch Marco ///////////
@@ -70,34 +72,40 @@ namespace paddle {
     const auto& __dtype__ = TYPE;                                             \
     switch (__dtype__) {                                                      \
       PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT32, int, __VA_ARGS__) \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT64, int64_t,          \
-                           __VA_ARGS__)                                       \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT8, int8_t,            \
-                           __VA_ARGS__)                                       \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::UINT8, uint8_t,          \
-                           __VA_ARGS__)                                       \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT16, int16_t,          \
-                           __VA_ARGS__)                                       \
+      PD_PRIVATE_CASE_TYPE(                                                   \
+          NAME, ::paddle::DataType::INT64, int64_t, __VA_ARGS__)              \
+      PD_PRIVATE_CASE_TYPE(                                                   \
+          NAME, ::paddle::DataType::INT8, int8_t, __VA_ARGS__)                \
+      PD_PRIVATE_CASE_TYPE(                                                   \
+          NAME, ::paddle::DataType::UINT8, uint8_t, __VA_ARGS__)              \
+      PD_PRIVATE_CASE_TYPE(                                                   \
+          NAME, ::paddle::DataType::INT16, int16_t, __VA_ARGS__)              \
       default:                                                                \
-        PD_THROW("function " #NAME " is not implemented for data type `" +    \
-                 ::paddle::ToString(__dtype__) + "`");                        \
+        PD_THROW("function " #NAME " is not implemented for data type `",     \
+                 __dtype__,                                                   \
+                 "`");                                                        \
     }                                                                         \
   }()
 
 ///////// Complex Dispatch Marco ///////////
 
-#define PD_DISPATCH_COMPLEX_TYPES(TYPE, NAME, ...)                         \
-  [&] {                                                                    \
-    const auto& __dtype__ = TYPE;                                          \
-    switch (__dtype__) {                                                   \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX64,            \
-                           ::paddle::complex64, __VA_ARGS__)               \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX128,           \
-                           ::paddle::complex128, __VA_ARGS__)              \
-      default:                                                             \
-        PD_THROW("function " #NAME " is not implemented for data type `" + \
-                 ::paddle::ToString(__dtype__) + "`");                     \
-    }                                                                      \
+#define PD_DISPATCH_COMPLEX_TYPES(TYPE, NAME, ...)                        \
+  [&] {                                                                   \
+    const auto& __dtype__ = TYPE;                                         \
+    switch (__dtype__) {                                                  \
+      PD_PRIVATE_CASE_TYPE(NAME,                                          \
+                           ::paddle::DataType::COMPLEX64,                 \
+                           ::paddle::complex64,                           \
+                           __VA_ARGS__)                                   \
+      PD_PRIVATE_CASE_TYPE(NAME,                                          \
+                           ::paddle::DataType::COMPLEX128,                \
+                           ::paddle::complex128,                          \
+                           __VA_ARGS__)                                   \
+      default:                                                            \
+        PD_THROW("function " #NAME " is not implemented for data type `", \
+                 __dtype__,                                               \
+                 "`");                                                    \
+    }                                                                     \
   }()
 
 ///////// Floating and Integral Dispatch Marco ///////////
@@ -106,43 +114,49 @@ namespace paddle {
   [&] {                                                                       \
     const auto& __dtype__ = TYPE;                                             \
     switch (__dtype__) {                                                      \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT32, float,          \
-                           __VA_ARGS__)                                       \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT64, double,         \
-                           __VA_ARGS__)                                       \
+      PD_PRIVATE_CASE_TYPE(                                                   \
+          NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__)              \
+      PD_PRIVATE_CASE_TYPE(                                                   \
+          NAME, ::paddle::DataType::FLOAT64, double, __VA_ARGS__)             \
       PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT32, int, __VA_ARGS__) \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT64, int64_t,          \
-                           __VA_ARGS__)                                       \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT8, int8_t,            \
-                           __VA_ARGS__)                                       \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::UINT8, uint8_t,          \
-                           __VA_ARGS__)                                       \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT16, int16_t,          \
-                           __VA_ARGS__)                                       \
+      PD_PRIVATE_CASE_TYPE(                                                   \
+          NAME, ::paddle::DataType::INT64, int64_t, __VA_ARGS__)              \
+      PD_PRIVATE_CASE_TYPE(                                                   \
+          NAME, ::paddle::DataType::INT8, int8_t, __VA_ARGS__)                \
+      PD_PRIVATE_CASE_TYPE(                                                   \
+          NAME, ::paddle::DataType::UINT8, uint8_t, __VA_ARGS__)              \
+      PD_PRIVATE_CASE_TYPE(                                                   \
+          NAME, ::paddle::DataType::INT16, int16_t, __VA_ARGS__)              \
       default:                                                                \
-        PD_THROW("function " #NAME " is not implemented for data type `" +    \
-                 ::paddle::ToString(__dtype__) + "`");                        \
+        PD_THROW("function " #NAME " is not implemented for data type `",     \
+                 __dtype__,                                                   \
+                 "`");                                                        \
     }                                                                         \
   }()
 
 ///////// Floating and Complex Dispatch Marco ///////////
 
-#define PD_DISPATCH_FLOATING_AND_COMPLEX_TYPES(TYPE, NAME, ...)            \
-  [&] {                                                                    \
-    const auto& __dtype__ = TYPE;                                          \
-    switch (__dtype__) {                                                   \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT32, float,       \
-                           __VA_ARGS__)                                    \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT64, double,      \
-                           __VA_ARGS__)                                    \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX64,            \
-                           ::paddle::complex64, __VA_ARGS__)               \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX128,           \
-                           ::paddle::complex128, __VA_ARGS__)              \
-      default:                                                             \
-        PD_THROW("function " #NAME " is not implemented for data type `" + \
-                 ::paddle::ToString(__dtype__) + "`");                     \
-    }                                                                      \
+#define PD_DISPATCH_FLOATING_AND_COMPLEX_TYPES(TYPE, NAME, ...)           \
+  [&] {                                                                   \
+    const auto& __dtype__ = TYPE;                                         \
+    switch (__dtype__) {                                                  \
+      PD_PRIVATE_CASE_TYPE(                                               \
+          NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__)          \
+      PD_PRIVATE_CASE_TYPE(                                               \
+          NAME, ::paddle::DataType::FLOAT64, double, __VA_ARGS__)         \
+      PD_PRIVATE_CASE_TYPE(NAME,                                          \
+                           ::paddle::DataType::COMPLEX64,                 \
+                           ::paddle::complex64,                           \
+                           __VA_ARGS__)                                   \
+      PD_PRIVATE_CASE_TYPE(NAME,                                          \
+                           ::paddle::DataType::COMPLEX128,                \
+                           ::paddle::complex128,                          \
+                           __VA_ARGS__)                                   \
+      default:                                                            \
+        PD_THROW("function " #NAME " is not implemented for data type `", \
+                 __dtype__,                                               \
+                 "`");                                                    \
+    }                                                                     \
   }()
 
 ///////// Floating, Integral and Complex Dispatch Marco ///////////
@@ -151,26 +165,31 @@ namespace paddle {
   [&] {                                                                       \
     const auto& __dtype__ = TYPE;                                             \
     switch (__dtype__) {                                                      \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT32, float,          \
-                           __VA_ARGS__)                                       \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT64, double,         \
-                           __VA_ARGS__)                                       \
+      PD_PRIVATE_CASE_TYPE(                                                   \
+          NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__)              \
+      PD_PRIVATE_CASE_TYPE(                                                   \
+          NAME, ::paddle::DataType::FLOAT64, double, __VA_ARGS__)             \
       PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT32, int, __VA_ARGS__) \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT64, int64_t,          \
-                           __VA_ARGS__)                                       \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT8, int8_t,            \
-                           __VA_ARGS__)                                       \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::UINT8, uint8_t,          \
-                           __VA_ARGS__)                                       \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT16, int16_t,          \
+      PD_PRIVATE_CASE_TYPE(                                                   \
+          NAME, ::paddle::DataType::INT64, int64_t, __VA_ARGS__)              \
+      PD_PRIVATE_CASE_TYPE(                                                   \
+          NAME, ::paddle::DataType::INT8, int8_t, __VA_ARGS__)                \
+      PD_PRIVATE_CASE_TYPE(                                                   \
+          NAME, ::paddle::DataType::UINT8, uint8_t, __VA_ARGS__)              \
+      PD_PRIVATE_CASE_TYPE(                                                   \
+          NAME, ::paddle::DataType::INT16, int16_t, __VA_ARGS__)              \
+      PD_PRIVATE_CASE_TYPE(NAME,                                              \
+                           ::paddle::DataType::COMPLEX64,                     \
+                           ::paddle::complex64,                               \
+                           __VA_ARGS__)                                       \
+      PD_PRIVATE_CASE_TYPE(NAME,                                              \
+                           ::paddle::DataType::COMPLEX128,                    \
+                           ::paddle::complex128,                              \
                            __VA_ARGS__)                                       \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX64,               \
-                           ::paddle::complex64, __VA_ARGS__)                  \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX128,              \
-                           ::paddle::complex128, __VA_ARGS__)                 \
       default:                                                                \
-        PD_THROW("function " #NAME " is not implemented for data type `" +    \
-                 ::paddle::ToString(__dtype__) + "`");                        \
+        PD_THROW("function " #NAME " is not implemented for data type `",     \
+                 __dtype__,                                                   \
+                 "`");                                                        \
     }                                                                         \
   }()
 
diff --git a/paddle/fluid/extension/include/ext_dll_decl.h b/paddle/pten/api/ext/dll_decl.h
similarity index 100%
rename from paddle/fluid/extension/include/ext_dll_decl.h
rename to paddle/pten/api/ext/dll_decl.h
diff --git a/paddle/fluid/extension/include/ext_exception.h b/paddle/pten/api/ext/exception.h
similarity index 81%
rename from paddle/fluid/extension/include/ext_exception.h
rename to paddle/pten/api/ext/exception.h
index 632d91d5285c22ac9143f49cfae133d1d1e67bd3..92b17b4898d3f7f9af26a0ba7172fc1a1ad36fe1 100644
--- a/paddle/fluid/extension/include/ext_exception.h
+++ b/paddle/pten/api/ext/exception.h
@@ -32,7 +32,9 @@ namespace paddle {
 struct PD_Exception : public std::exception {
  public:
   template <typename... Args>
-  explicit PD_Exception(const std::string& msg, const char* file, int line,
+  explicit PD_Exception(const std::string& msg,
+                        const char* file,
+                        int line,
                         const char* default_msg) {
     std::ostringstream sout;
     if (msg.empty()) {
@@ -75,24 +77,13 @@ class ErrorMessage {
   std::ostringstream oss;
 };
 
-#if defined _WIN32
-#define HANDLE_THE_ERROR try {
-#define END_HANDLE_THE_ERROR            \
-  }                                     \
-  catch (const std::exception& e) {     \
-    std::cerr << e.what() << std::endl; \
-    throw e;                            \
-  }
-#else
-#define HANDLE_THE_ERROR
-#define END_HANDLE_THE_ERROR
-#endif
-
 #define PD_CHECK(COND, ...)                                               \
   do {                                                                    \
     if (PD_UNLIKELY(!(COND))) {                                           \
       auto __message__ = ::paddle::ErrorMessage(__VA_ARGS__).to_string(); \
-      throw ::paddle::PD_Exception(__message__, __FILE__, __LINE__,       \
+      throw ::paddle::PD_Exception(__message__,                           \
+                                   __FILE__,                              \
+                                   __LINE__,                              \
                                    "Expected " #COND                      \
                                    ", but it's not satisfied.");          \
     }                                                                     \
@@ -101,8 +92,8 @@ class ErrorMessage {
 #define PD_THROW(...)                                                   \
   do {                                                                  \
     auto __message__ = ::paddle::ErrorMessage(__VA_ARGS__).to_string(); \
-    throw ::paddle::PD_Exception(__message__, __FILE__, __LINE__,       \
-                                 "An error occurred.");                 \
+    throw ::paddle::PD_Exception(                                       \
+        __message__, __FILE__, __LINE__, "An error occurred.");         \
   } while (0)
 
 }  // namespace paddle
diff --git a/paddle/fluid/extension/include/ext_op_meta_info.h b/paddle/pten/api/ext/op_meta_info.h
similarity index 83%
rename from paddle/fluid/extension/include/ext_op_meta_info.h
rename to paddle/pten/api/ext/op_meta_info.h
index 6f2528030e603de37dbfb09c3e07539812f2c818..140874f93aed72ac1a4a56937e0b8a6fe908cdf4 100644
--- a/paddle/fluid/extension/include/ext_op_meta_info.h
+++ b/paddle/pten/api/ext/op_meta_info.h
@@ -19,10 +19,10 @@ limitations under the License. */
 #include <unordered_map>
 #include <vector>
 
-#include "any.h"
-#include "ext_dll_decl.h"   // NOLINT
-#include "ext_exception.h"  // NOLINT
-#include "ext_tensor.h"     // NOLINT
+#include "paddle/pten/api/ext/dll_decl.h"
+#include "paddle/pten/api/ext/exception.h"
+#include "paddle/pten/api/include/tensor.h"
+#include "paddle/utils/any.h"
 
 /**
  * Op Meta Info Related Define.
@@ -87,7 +87,9 @@ using KernelFunc =
 #define PD_SPECIALIZE_ComputeCallHelper(attr_type)                            \
   template <typename... Tail>                                                 \
   struct ComputeCallHelper<attr_type, Tail...> {                              \
-    template <int in_idx, int vec_in_idx, int attr_idx,                       \
+    template <int in_idx,                                                     \
+              int vec_in_idx,                                                 \
+              int attr_idx,                                                   \
               typename... PreviousArgs>                                       \
     static Return Compute(const std::vector<Tensor>& inputs,                  \
                           const std::vector<std::vector<Tensor>>& vec_inputs, \
@@ -95,9 +97,10 @@ using KernelFunc =
                           const PreviousArgs&... pargs) {                     \
       try {                                                                   \
         attr_type arg = paddle::any_cast<attr_type>(attrs[attr_idx]);         \
-        return ComputeCallHelper<Tail...>::template Compute<                  \
-            in_idx, vec_in_idx, attr_idx + 1>(inputs, vec_inputs, attrs,      \
-                                              pargs..., arg);                 \
+        return ComputeCallHelper<Tail...>::template Compute<in_idx,           \
+                                                            vec_in_idx,       \
+                                                            attr_idx + 1>(    \
+            inputs, vec_inputs, attrs, pargs..., arg);                        \
       } catch (paddle::bad_any_cast&) {                                       \
         PD_THROW(                                                             \
             "Attribute cast error in custom operator. Expected " #attr_type   \
@@ -127,7 +130,9 @@ struct KernelFuncImpl<Return (*)(Args...), impl_fn> {
 
   template <typename... Tail>
   struct ComputeCallHelper<const Tensor&, Tail...> {
-    template <int in_idx, int vec_in_idx, int attr_idx,
+    template <int in_idx,
+              int vec_in_idx,
+              int attr_idx,
               typename... PreviousArgs>
     static Return Compute(const std::vector<Tensor>& inputs,
                           const std::vector<std::vector<Tensor>>& vec_inputs,
@@ -135,23 +140,27 @@ struct KernelFuncImpl<Return (*)(Args...), impl_fn> {
                           const PreviousArgs&... pargs) {
       const Tensor& arg = inputs[in_idx];
       return ComputeCallHelper<Tail...>::template Compute<in_idx + 1,
-                                                          vec_in_idx, attr_idx>(
+                                                          vec_in_idx,
+                                                          attr_idx>(
           inputs, vec_inputs, attrs, pargs..., arg);
     }
   };
 
   template <typename... Tail>
   struct ComputeCallHelper<const std::vector<Tensor>&, Tail...> {
-    template <int in_idx, int vec_in_idx, int attr_idx,
+    template <int in_idx,
+              int vec_in_idx,
+              int attr_idx,
               typename... PreviousArgs>
     static Return Compute(const std::vector<Tensor>& inputs,
                           const std::vector<std::vector<Tensor>>& vec_inputs,
                           const std::vector<paddle::any>& attrs,
                           const PreviousArgs&... pargs) {
       const std::vector<Tensor>& arg = vec_inputs[vec_in_idx];
-      return ComputeCallHelper<Tail...>::template Compute<
-          in_idx, vec_in_idx + 1, attr_idx>(inputs, vec_inputs, attrs, pargs...,
-                                            arg);
+      return ComputeCallHelper<Tail...>::template Compute<in_idx,
+                                                          vec_in_idx + 1,
+                                                          attr_idx>(
+          inputs, vec_inputs, attrs, pargs..., arg);
     }
   };
 
@@ -206,65 +215,75 @@ using InferShapeFunc = std::vector<std::vector<int64_t>> (*)(
     const std::vector<std::vector<std::vector<int64_t>>>& vec_input_shapes,
     const std::vector<paddle::any>& attrs);
 
-#define PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPE(input_type)               \
-  template <typename... Tail>                                                  \
-  struct InferShapeCallHelper<input_type, Tail...> {                           \
-    template <int in_idx, int vec_in_idx, int attr_idx,                        \
-              typename... PreviousArgs>                                        \
-    static Return InferShape(                                                  \
-        const std::vector<std::vector<int64_t>>& input_shapes,                 \
-        const std::vector<std::vector<std::vector<int64_t>>>&                  \
-            vec_input_shapes,                                                  \
-        const std::vector<paddle::any>& attrs, const PreviousArgs&... pargs) { \
-      input_type arg = input_shapes[in_idx];                                   \
-      return InferShapeCallHelper<Tail...>::template InferShape<               \
-          in_idx + 1, vec_in_idx, attr_idx>(input_shapes, vec_input_shapes,    \
-                                            attrs, pargs..., arg);             \
-    }                                                                          \
+#define PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPE(input_type)            \
+  template <typename... Tail>                                               \
+  struct InferShapeCallHelper<input_type, Tail...> {                        \
+    template <int in_idx,                                                   \
+              int vec_in_idx,                                               \
+              int attr_idx,                                                 \
+              typename... PreviousArgs>                                     \
+    static Return InferShape(                                               \
+        const std::vector<std::vector<int64_t>>& input_shapes,              \
+        const std::vector<std::vector<std::vector<int64_t>>>&               \
+            vec_input_shapes,                                               \
+        const std::vector<paddle::any>& attrs,                              \
+        const PreviousArgs&... pargs) {                                     \
+      input_type arg = input_shapes[in_idx];                                \
+      return InferShapeCallHelper<Tail...>::template InferShape<in_idx + 1, \
+                                                                vec_in_idx, \
+                                                                attr_idx>(  \
+          input_shapes, vec_input_shapes, attrs, pargs..., arg);            \
+    }                                                                       \
   }
 
-#define PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPES(input_type)              \
-  template <typename... Tail>                                                  \
-  struct InferShapeCallHelper<input_type, Tail...> {                           \
-    template <int in_idx, int vec_in_idx, int attr_idx,                        \
-              typename... PreviousArgs>                                        \
-    static Return InferShape(                                                  \
-        const std::vector<std::vector<int64_t>>& input_shapes,                 \
-        const std::vector<std::vector<std::vector<int64_t>>>&                  \
-            vec_input_shapes,                                                  \
-        const std::vector<paddle::any>& attrs, const PreviousArgs&... pargs) { \
-      input_type arg = vec_input_shapes[vec_in_idx];                           \
-      return InferShapeCallHelper<Tail...>::template InferShape<               \
-          in_idx, vec_in_idx + 1, attr_idx>(input_shapes, vec_input_shapes,    \
-                                            attrs, pargs..., arg);             \
-    }                                                                          \
+#define PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPES(input_type)    \
+  template <typename... Tail>                                        \
+  struct InferShapeCallHelper<input_type, Tail...> {                 \
+    template <int in_idx,                                            \
+              int vec_in_idx,                                        \
+              int attr_idx,                                          \
+              typename... PreviousArgs>                              \
+    static Return InferShape(                                        \
+        const std::vector<std::vector<int64_t>>& input_shapes,       \
+        const std::vector<std::vector<std::vector<int64_t>>>&        \
+            vec_input_shapes,                                        \
+        const std::vector<paddle::any>& attrs,                       \
+        const PreviousArgs&... pargs) {                              \
+      input_type arg = vec_input_shapes[vec_in_idx];                 \
+      return InferShapeCallHelper<Tail...>::                         \
+          template InferShape<in_idx, vec_in_idx + 1, attr_idx>(     \
+              input_shapes, vec_input_shapes, attrs, pargs..., arg); \
+    }                                                                \
   }
 
-#define PD_SPECIALIZE_InferShapeCallHelper_FOR_ATTR(attr_type)                 \
-  template <typename... Tail>                                                  \
-  struct InferShapeCallHelper<attr_type, Tail...> {                            \
-    template <int in_idx, int vec_in_idx, int attr_idx,                        \
-              typename... PreviousArgs>                                        \
-    static Return InferShape(                                                  \
-        const std::vector<std::vector<int64_t>>& input_shapes,                 \
-        const std::vector<std::vector<std::vector<int64_t>>>&                  \
-            vec_input_shapes,                                                  \
-        const std::vector<paddle::any>& attrs, const PreviousArgs&... pargs) { \
-      try {                                                                    \
-        attr_type arg = paddle::any_cast<attr_type>(attrs[attr_idx]);          \
-        return InferShapeCallHelper<Tail...>::template InferShape<             \
-            in_idx, vec_in_idx, attr_idx + 1>(input_shapes, vec_input_shapes,  \
-                                              attrs, pargs..., arg);           \
-      } catch (paddle::bad_any_cast&) {                                        \
-        PD_THROW(                                                              \
-            "Attribute cast error in custom operator InferShapeFn. "           \
-            "Expected " #attr_type                                             \
-            " value. InferShapeFn's attribute list must be exactly same as "   \
-            "Forward "                                                         \
-            "KernelFn's attribute list except std::vector<int64_t> "           \
-            "attribute.");                                                     \
-      }                                                                        \
-    }                                                                          \
+#define PD_SPECIALIZE_InferShapeCallHelper_FOR_ATTR(attr_type)               \
+  template <typename... Tail>                                                \
+  struct InferShapeCallHelper<attr_type, Tail...> {                          \
+    template <int in_idx,                                                    \
+              int vec_in_idx,                                                \
+              int attr_idx,                                                  \
+              typename... PreviousArgs>                                      \
+    static Return InferShape(                                                \
+        const std::vector<std::vector<int64_t>>& input_shapes,               \
+        const std::vector<std::vector<std::vector<int64_t>>>&                \
+            vec_input_shapes,                                                \
+        const std::vector<paddle::any>& attrs,                               \
+        const PreviousArgs&... pargs) {                                      \
+      try {                                                                  \
+        attr_type arg = paddle::any_cast<attr_type>(attrs[attr_idx]);        \
+        return InferShapeCallHelper<Tail...>::                               \
+            template InferShape<in_idx, vec_in_idx, attr_idx + 1>(           \
+                input_shapes, vec_input_shapes, attrs, pargs..., arg);       \
+      } catch (paddle::bad_any_cast&) {                                      \
+        PD_THROW(                                                            \
+            "Attribute cast error in custom operator InferShapeFn. "         \
+            "Expected " #attr_type                                           \
+            " value. InferShapeFn's attribute list must be exactly same as " \
+            "Forward "                                                       \
+            "KernelFn's attribute list except std::vector<int64_t> "         \
+            "attribute.");                                                   \
+      }                                                                      \
+    }                                                                        \
   }
 
 template <typename F, F f>
@@ -276,8 +295,10 @@ struct InferShapeFuncImpl<Return (*)(Args...), impl_fn> {
       const std::vector<std::vector<int64_t>>& input_shapes,
       const std::vector<std::vector<std::vector<int64_t>>>& vec_input_shapes,
       const std::vector<paddle::any>& attrs) {
-    return InferShapeCallHelper<Args..., TypeTag<int>>::template InferShape<
-        0, 0, 0>(input_shapes, vec_input_shapes, attrs);
+    return InferShapeCallHelper<Args..., TypeTag<int>>::template InferShape<0,
+                                                                            0,
+                                                                            0>(
+        input_shapes, vec_input_shapes, attrs);
   }
 
  private:
@@ -313,7 +334,8 @@ struct InferShapeFuncImpl<Return (*)(Args...), impl_fn> {
     static Return InferShape(
         const std::vector<std::vector<int64_t>>& input_shapes,
         const std::vector<std::vector<std::vector<int64_t>>>& vec_input_shapes,
-        const std::vector<paddle::any>& attrs, const Args&... args) {
+        const std::vector<paddle::any>& attrs,
+        const Args&... args) {
       return impl_fn(args...);
     }
   };
@@ -344,19 +366,20 @@ using InferDtypeFunc = std::vector<DataType> (*)(
     }                                                                        \
   }
 
-#define PD_SPECIALIZE_InferDtypeCallHelper_FOR_DTYPES(input_type)           \
-  template <typename... Tail>                                               \
-  struct InferDtypeCallHelper<input_type, Tail...> {                        \
-    template <int in_idx, int vec_in_idx, typename... PreviousArgs>         \
-    static Return InferDtype(                                               \
-        const std::vector<DataType>& input_dtypes,                          \
-        const std::vector<std::vector<DataType>>& vec_input_dtypes,         \
-        const PreviousArgs&... pargs) {                                     \
-      input_type arg = vec_input_dtypes[vec_in_idx];                        \
-      return InferDtypeCallHelper<Tail...>::template InferDtype<            \
-          in_idx, vec_in_idx + 1>(input_dtypes, vec_input_dtypes, pargs..., \
-                                  arg);                                     \
-    }                                                                       \
+#define PD_SPECIALIZE_InferDtypeCallHelper_FOR_DTYPES(input_type)            \
+  template <typename... Tail>                                                \
+  struct InferDtypeCallHelper<input_type, Tail...> {                         \
+    template <int in_idx, int vec_in_idx, typename... PreviousArgs>          \
+    static Return InferDtype(                                                \
+        const std::vector<DataType>& input_dtypes,                           \
+        const std::vector<std::vector<DataType>>& vec_input_dtypes,          \
+        const PreviousArgs&... pargs) {                                      \
+      input_type arg = vec_input_dtypes[vec_in_idx];                         \
+      return InferDtypeCallHelper<Tail...>::template InferDtype<in_idx,      \
+                                                                vec_in_idx + \
+                                                                    1>(      \
+          input_dtypes, vec_input_dtypes, pargs..., arg);                    \
+    }                                                                        \
   }
 
 template <typename F, F f>
diff --git a/paddle/fluid/extension/include/ext_place.h b/paddle/pten/api/ext/place.h
similarity index 100%
rename from paddle/fluid/extension/include/ext_place.h
rename to paddle/pten/api/ext/place.h
diff --git a/paddle/pten/api/include/creation.h b/paddle/pten/api/include/creation.h
index 688aa20780bba4bc6fc5229ce25bbcda85345a87..bcd3d4355cf6e9683599143969b136478aa90319 100644
--- a/paddle/pten/api/include/creation.h
+++ b/paddle/pten/api/include/creation.h
@@ -15,33 +15,34 @@
 #pragma once
 
 #include "paddle/pten/api/include/tensor.h"
+#include "paddle/pten/common/backend.h"
 #include "paddle/pten/common/data_type.h"
 #include "paddle/pten/common/scalar.h"
 
 namespace paddle {
 namespace experimental {
 
-Tensor full(const std::vector<int64_t>& shape,
-            const Scalar& value,
-            DataType dtype = DataType::FLOAT32,
-            Backend backend = Backend::CPU,
-            DataLayout layout = DataLayout::NCHW);
-
-Tensor full_like(const Tensor& x,
-                 const Scalar& value,
-                 DataType dtype = DataType::UNDEFINED,
-                 Backend backend = Backend::UNDEFINED,
-                 DataLayout layout = DataLayout::UNDEFINED);
-
-Tensor ones_like(const Tensor& x,
-                 DataType dtype = DataType::UNDEFINED,
-                 Backend backend = Backend::UNDEFINED,
-                 DataLayout layout = DataLayout::UNDEFINED);
-
-Tensor zeros_like(const Tensor& x,
-                  DataType dtype = DataType::UNDEFINED,
-                  Backend backend = Backend::UNDEFINED,
-                  DataLayout layout = DataLayout::UNDEFINED);
+PD_DLL_DECL Tensor full(const std::vector<int64_t>& shape,
+                        const Scalar& value,
+                        DataType dtype = DataType::FLOAT32,
+                        Backend backend = Backend::CPU,
+                        DataLayout layout = DataLayout::NCHW);
+
+PD_DLL_DECL Tensor full_like(const Tensor& x,
+                             const Scalar& value,
+                             DataType dtype = DataType::UNDEFINED,
+                             Backend backend = Backend::UNDEFINED,
+                             DataLayout layout = DataLayout::UNDEFINED);
+
+PD_DLL_DECL Tensor ones_like(const Tensor& x,
+                             DataType dtype = DataType::UNDEFINED,
+                             Backend backend = Backend::UNDEFINED,
+                             DataLayout layout = DataLayout::UNDEFINED);
+
+PD_DLL_DECL Tensor zeros_like(const Tensor& x,
+                              DataType dtype = DataType::UNDEFINED,
+                              Backend backend = Backend::UNDEFINED,
+                              DataLayout layout = DataLayout::UNDEFINED);
 
 }  // namespace experimental
 }  // namespace paddle
diff --git a/paddle/pten/api/include/linalg.h b/paddle/pten/api/include/linalg.h
index c28c133018464b635cd9c6b9037d3b12b8e91a5a..259cf664932038d1abbf4ab1c1ec0ebdba5f00ae 100644
--- a/paddle/pten/api/include/linalg.h
+++ b/paddle/pten/api/include/linalg.h
@@ -19,12 +19,12 @@
 namespace paddle {
 namespace experimental {
 
-Tensor dot(const Tensor& x, const Tensor& y);
+PD_DLL_DECL Tensor dot(const Tensor& x, const Tensor& y);
 
-Tensor matmul(const Tensor& x,
-              const Tensor& y,
-              bool transpose_x,
-              bool transpose_y);
+PD_DLL_DECL Tensor matmul(const Tensor& x,
+                          const Tensor& y,
+                          bool transpose_x = false,
+                          bool transpose_y = false);
 
 }  // namespace experimental
 }  // namespace paddle
diff --git a/paddle/pten/api/include/manipulation.h b/paddle/pten/api/include/manipulation.h
index fe8c01cb74b95fe3c6159413df5e1f0e8b618eb1..8c3bf5ae94d95b8c5cfd7711533990bf1992a876 100644
--- a/paddle/pten/api/include/manipulation.h
+++ b/paddle/pten/api/include/manipulation.h
@@ -19,7 +19,7 @@ limitations under the License. */
 namespace paddle {
 namespace experimental {
 
-Tensor flatten(const Tensor& x, int start_axis, int stop_axis);
+PD_DLL_DECL Tensor flatten(const Tensor& x, int start_axis, int stop_axis);
 
 }  // namespace experimental
 }  // namespace paddle
diff --git a/paddle/pten/api/include/math.h b/paddle/pten/api/include/math.h
index 01d5850a2b286738827d5bcf800de9a3cf03fefb..41b4d2ffb33611347c3aac6bd7f8061742758cd7 100644
--- a/paddle/pten/api/include/math.h
+++ b/paddle/pten/api/include/math.h
@@ -21,9 +21,9 @@ namespace experimental {
 
 // TODO(chenweihang): add scale API
 // TODO(chenweihang): move mean API into stat.h/cc
-Tensor mean(const Tensor& x);
+PD_DLL_DECL Tensor mean(const Tensor& x);
 
-Tensor add(const Tensor& x, const Tensor& y);
+PD_DLL_DECL Tensor add(const Tensor& x, const Tensor& y);
 
 }  // namespace experimental
 }  // namespace paddle
diff --git a/paddle/pten/api/include/registry.h b/paddle/pten/api/include/registry.h
new file mode 100644
index 0000000000000000000000000000000000000000..abb31451d522ebc948ad156c2feb4f50a9dd6d4f
--- /dev/null
+++ b/paddle/pten/api/include/registry.h
@@ -0,0 +1,47 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/pten/api/ext/dll_decl.h"
+
+namespace paddle {
+namespace experimental {
+
+#if defined(_WIN32)
+#define UNUSED
+#define __builtin_expect(EXP, C) (EXP)
+#else
+#define UNUSED __attribute__((unused))
+#endif
+
+/**
+ * Now there is no module to call pten's API. When compiling, the function
+ * implementation will be optimized. Therefore, the symbol will be exposed
+ * manually for the time being.
+ *
+ * After the dynamic graph calls the API in the future, the logic declared
+ * by these macro can be deleted.
+ */
+
+// use to declare symbol
+#define PT_REGISTER_API(name) \
+  PD_DLL_DECL int RegisterSymbolsFor##name() { return 0; }
+
+#define PT_DECLARE_API(name)                         \
+  extern PD_DLL_DECL int RegisterSymbolsFor##name(); \
+  UNUSED static int use_pten_api_##name = RegisterSymbolsFor##name()
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/pten/api/include/tensor.h b/paddle/pten/api/include/tensor.h
index 66ea7853541bdbaa99e7f6899e2740c3bcbb3a3b..31e57b69447a22c1486e41bce7e613f616ba378c 100644
--- a/paddle/pten/api/include/tensor.h
+++ b/paddle/pten/api/include/tensor.h
@@ -17,35 +17,38 @@ limitations under the License. */
 #include <functional>
 #include <memory>
 #include <utility>
+#include <vector>
 
-#include "paddle/pten/core/tensor_base.h"
+#ifdef PADDLE_WITH_CUDA
+#include <cuda_runtime.h>
+using gpuStream_t = cudaStream_t;
+#endif
 
-/**
- * [ Why still include the fluid headers? ]
- *
- * We hope to organize the basic implementation of Tensor and the logic related
- * to Tensor computation into an independent library, which we call
- * [Tensor Operation Library, pten], so we extract or rewrite the original
- * Kernels.
- *
- * In the future, the training library, inference library and custom operators
- * will link to this Tensor Operation library.
- *
- * However, if we directly split the link relation, we need to make too many
- * changes, which will affect the stability of the framework, so here we still
- * rely on the implementation of the framework, which is a intermediate state.
- *
- * In the future, the necessary components will be moved to the this library,
- * or the corresponding components will be re-implemented.
- */
-#include "paddle/fluid/framework/ddim.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/place.h"
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_runtime.h>
+using gpuStream_t = hipStream_t;
+#endif
+
+#include "paddle/pten/api/ext/dll_decl.h"
+#include "paddle/pten/api/ext/place.h"
+#include "paddle/pten/common/data_type.h"
+#include "paddle/pten/common/layout.h"
+
+namespace pten {
+class TensorBase;
+}  // namespace pten
 
 namespace paddle {
+namespace framework {
+class DDim;
+}
+namespace platform {
+class Place;
+}
 namespace experimental {
 
 class Tensor;
+class CompatiblePTenTensorUtils;
 
 class AbstractAutogradMeta {
  public:
@@ -80,138 +83,336 @@ class AbstractAutogradMeta {
  * another simple Tensor design may be required for inference.
  */
 
-class Tensor final {
+class PD_DLL_DECL Tensor final {
  public:
   /* Part 1: Construction and destruction methods */
-  Tensor() {}
+
+  /**
+   * @brief Construct a new Tensor object
+   */
+  Tensor() = default;
+
+  /**
+   * @brief Construct a new Tensor object by copy
+   */
   Tensor(const Tensor&) = default;
+
+  /**
+   * @brief Construct a new Tensor object by move
+   */
   Tensor(Tensor&&) = default;
 
   /**
-   * @description: Use a TensorImpl pointer to construct a Tensor
-   * @param {shared_ptr<TensorBase>} tensor_impl
-   * @return {Tensor}
+   * @brief Construct a new Tensor object by a TensorBase pointer
+   *
+   * @param tensor_impl
+   */
+  explicit Tensor(std::shared_ptr<pten::TensorBase> tensor_impl);
+
+  /**
+   * @brief Construct a new Tensor object on the target place.
+   * This is a deprecated method and may be removed in the future!
+   *
+   * @param place
    */
-  explicit Tensor(std::shared_ptr<pten::TensorBase> tensor_impl)
-      : impl_(std::move(tensor_impl)) {
-    PADDLE_ENFORCE_NOT_NULL(impl_,
-                            platform::errors::InvalidArgument(
-                                "TensorImpl with nullptr is not supported"));
-  }
+  explicit Tensor(const PlaceType& place);
+
+  /**
+   * @brief Construct a new Tensor object on the target place
+   * with specified shape.
+   * This is a deprecated method and may be removed in the future!
+   *
+   * @param place
+   * @param shape
+   */
+  Tensor(const PlaceType& place, const std::vector<int64_t>& shape);
 
   /* Part 2: Dimension, DataType and DataLayout methods */
+
+  /**
+   * @brief Return the number of elements of Tensor.
+   *
+   * @return int64_t
+   */
+  int64_t numel() const;
+
+  /**
+   * @brief Get the size of current tensor.
+   * The compatible method of `Tensor::numel()`.
+   * This is a deprecated method and may be removed in the future!
+   *
+   * @return int64_t
+   */
+  int64_t size() const;
+
   /**
-   * @description: Return the number of elements of current Tensor.
-   * @param None
-   * @return {int64_t}
+   * @brief Return the dimensions of Tensor.
+   *
+   * @return paddle::framework::DDim
    */
-  int64_t numel() const { return impl_->numel(); }
+  paddle::framework::DDim dims() const;
 
   /**
-   * @description: Return the shape (dimensions) of current Tensor.
-   * @param None
-   * @return {DDim}
+   * @brief Return the shape (dimensions) of Tensor.
+   * The compatible method of `Tensor::dims()`.
+   * This is a deprecated method and may be removed in the future!
+   *
+   * @return std::vector<int64_t>
+   */
+  std::vector<int64_t> shape() const;
+
+  /**
+   * @brief Reset the shape of the tensor.
+   * Reshape must be called before calling mutable_data() or
+   * copy_to(const PlaceType& place).
+   * This is a deprecated method and may be removed in the future!
+   *
+   * @param shape
+   */
+  void reshape(const std::vector<int64_t>& shape);
+
+  /**
+   * @brief Return the data type of Tensor.
+   *
+   * @return DataType
    */
-  paddle::framework::DDim shape() const { return impl_->dims(); }
+  DataType dtype() const;
 
   /**
-   * @description: Return the data type of current Tensor.
-   * @param None
-   * @return {DataType}
+   * @brief Return the data type of Tensor.
+   * The compatible method of `Tensor::dtype()`.
+   * This is a deprecated method and may be removed in the future!
+   *
+   * @return DataType
    */
-  paddle::experimental::DataType type() const { return impl_->data_type(); }
+  DataType type() const;
 
   /**
-   * @description: Return the layout of current Tensor.
-   * @param None
-   * @return {DataLayout}
+   * @brief Return the layout of Tensor.
+   *
+   * @return DataLayout
    */
-  paddle::experimental::DataLayout layout() const { return impl_->layout(); }
+  DataLayout layout() const;
 
   /* Part 3: Device and Backend methods */
+
   /**
-   * @description: Return the place (device) of current Tensor.
-   * @param None
-   * @return {Place}
+   * @brief Return the place (device) of Tensor.
+   * This is a deprecated method and may be removed in the future!
+   *
+   * @return PlaceType
    */
-  paddle::platform::Place place() const { return impl_->place(); }
+  PlaceType place() const;
 
   /**
-   * Backend judgment APIs, shield the concept of Backend.
+   * @brief Return the place (device) of Tensor.
+   * Because the `place` method already exists, so we need to use a new name,
+   * here we temporarily use `inner_place`.
+   *
+   * @return paddle::platform::Place
    */
-  bool is_cpu() const { return paddle::platform::is_cpu_place(place()); }
-  bool is_cuda() const { return paddle::platform::is_gpu_place(place()); }
+  paddle::platform::Place inner_place() const;
 
   /**
-   * Backend convert APIs.
+   * @brief Determine whether the tensor device is CPU
+   *
+   * @return true
+   * @return false
    */
-  Tensor cpu() const;
-  Tensor cuda() const;
+  bool is_cpu() const;
+
+  /**
+   * @brief Determine whether the tensor device is CUDA
+   *
+   * @return true
+   * @return false
+   */
+  bool is_cuda() const;
 
   /* Part 4: Data Access methods */
+
+  /**
+   * @brief Get the memory pointer in CPU or GPU with specific data type.
+   * It's usually used to get the output data pointer.
+   *
+   * @tparam T
+   * @return T*
+   */
+  template <typename T>
+  T* mutable_data();
+
   /**
-   * @description: Return the implemention of current Tensor.
-   * @param None
-   * @return {std::shared_ptr<TensorBase>}
+   * @brief Get the memory pointer in CPU or GPU with specific data type.
+   * It's usually used to get the output data pointer.
+   * This is a deprecated method and may be removed in the future!
+   *
+   * @tparam T
+   * @param place
+   * @return T*
+   */
+  template <typename T>
+  T* mutable_data(const PlaceType& place);
+
+  /**
+   * @brief Get the const memory pointer directly.
+   * It's usually used to get the output data pointer.
+   *
+   * @tparam T
+   * @return T*
+   */
+  template <typename T>
+  const T* data() const;
+
+  /**
+   * @brief Get the memory pointer directly.
+   * It's usually used to get the output data pointer.
+   * This is a deprecated method and may be removed in the future!
+   *
+   * @tparam T
+   * @return T*
+   */
+  template <typename T>
+  T* data();
+
+  /**
+   * @brief Return a sub-tensor of the given tensor.
+   * It is usually used to extract a sub-tensor (which supports
+   * modifying the data of the original tensor) to perform further
+   * operations.
+   *
+   * @param begin_idx The index of the start row (inclusive) to slice.
+   *                  The index number begins from 0.
+   * @param end_idx The index of the end row (exclusive) to slice.
+   *                 The index number begins from begin_idx + 1.
+   * @return Tensor
+   */
+  Tensor slice(const int64_t begin_idx, const int64_t end_idx) const;
+
+  /**
+   * @brief Return the implemention of current Tensor.
+   *
+   * @return std::shared_ptr<pten::TensorBase>
+   */
+  std::shared_ptr<pten::TensorBase> impl() const;
+
+  /**
+   * @brief Set the implemention of current Tensor.
+   *
+   * @param impl
+   */
+  void set_impl(const std::shared_ptr<pten::TensorBase>& impl);
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  /**
+   * @brief Get the stream where the tensor is currently located
+   * This is a deprecated method and may be removed in the future!
+   *
+   * @return gpuStream_t
+   */
+  gpuStream_t stream() const;
+#endif
+
+  /* Part 5: Data Transform methods */
+
+  /**
+   * @brief Copy the current Tensor data to the specified device
+   * and return the new Tensor.
+   * It's usually used to set the input tensor data.
+   * This is a deprecated method and may be removed in the future!
+   *
+   * @tparam T
+   * @param target_place, the target place of which the tensor will copy to.
+   * @return Tensor
    */
-  std::shared_ptr<pten::TensorBase> impl() const { return impl_; }
+  template <typename T>
+  Tensor copy_to(const PlaceType& target_place) const;
 
   /**
-   * @description: Set the implemention of current Tensor.
-   * @param {std::shared_ptr<TensorBase>}
-   * @return None
+   * @brief Transfer the current Tensor to the specified device and return.
+   *
+   * @param place, the target place of which the tensor will copy to.
+   * @return Tensor
+   */
+  Tensor to(const PlaceType& place) const;
+
+  /**
+   * @brief Cast datatype from one to another
+   *
+   * @param target_type
+   * @return Tensor
    */
-  void set_impl(const std::shared_ptr<pten::TensorBase>& impl) { impl_ = impl; }
+  Tensor cast(const DataType& target_type) const;
 
-  // TODO(chenweihang): Whether API Tensor need `data` and `mutable_data`?
+  /* Part 6: Status utils methods */
 
-  // TODO(chenweihang): slice and split methods use kernels?
+  /**
+   * @brief Determine whether it is a meaningful Tensor
+   *
+   * @return true
+   * @return false
+   */
+  bool defined() const;
+
+  /**
+   * @brief Determine whether Tensor is initialized.
+   *
+   * @return true
+   * @return false
+   */
+  bool initialized() const;
+
+  /**
+   * @brief Determine whether Tensor is initialized.
+   * This is a deprecated method and may be removed in the future!
+   *
+   * @return true
+   * @return false
+   */
+  bool is_initialized() const;
 
-  /* Part 5: Status utils methods */
   /**
-   * @description: Determine whether it is a meaningful Tensor
-   * @param None
-   * @return {bool}
+   * @brief Reset the Tensor implementation
    */
-  bool defined() const { return impl_ != nullptr; }
+  void reset();
+
+  /* Part 7: Operator overloading */
 
   /**
-   * @description: Determine whether Tensor is initialized
-   * @param None
-   * @return {bool}
+   * @brief Assignment operator
+   *
+   * @param x
+   * @return Tensor&
    */
-  bool initialized() const { return impl_->initialized(); }
+  Tensor& operator=(const Tensor& x) &;
 
   /**
-   * @description: Reset the Tensor implementation
-   * @param None
-   * @return {void}
+   * @brief Move assignment operator
+   *
+   * @param x
+   * @return Tensor&
    */
-  void reset() { impl_.reset(); }
+  Tensor& operator=(Tensor&& x) &;
 
-  /* Part 6: Operator overloading */
-  Tensor& operator=(const Tensor& x) & {
-    impl_ = x.impl_;
-    autograd_meta_ = x.autograd_meta_;
-    return *this;
-  }
-  Tensor& operator=(Tensor&& x) & {
-    impl_ = std::move(x.impl_);
-    autograd_meta_ = std::move(x.autograd_meta_);
-    return *this;
-  }
+  /* Part 8: Autograd methods */
 
-  /* Part 7: Autograd methods */
-  AbstractAutogradMeta* get_autograd_meta() const {
-    return autograd_meta_.get();
-  }
+  /**
+   * @brief Get the autograd meta object
+   *
+   * @return AbstractAutogradMeta*
+   */
+  AbstractAutogradMeta* get_autograd_meta() const;
 
-  void set_autograd_meta(std::shared_ptr<AbstractAutogradMeta> autograd_meta) {
-    autograd_meta_ = std::move(autograd_meta);
-  }
+  /**
+   * @brief Set the autograd meta object
+   *
+   * @param autograd_meta
+   */
+  void set_autograd_meta(std::shared_ptr<AbstractAutogradMeta> autograd_meta);
 
-  /* Part 8: Auto generated Tensor methods */
-  // ...
+  /* Part 9: Auto generated Tensor methods */
+
+ private:
+  friend class CompatiblePTenTensorUtils;
 
  private:
   /**
@@ -249,10 +450,15 @@ class Tensor final {
 
   /**
    * Tensor name: used for adapt original execution mechanism and debug analysis
-   * in the development of new dygraph.
+   * in the development of new dygraph. It may be removed in the future.
    */
   std::string name_;
 };
 
 }  // namespace experimental
 }  // namespace paddle
+
+namespace paddle {
+// In order to be compatible with the original custom operator Tensor interface
+using Tensor = paddle::experimental::Tensor;
+}  // namespace paddle
diff --git a/paddle/pten/api/lib/CMakeLists.txt b/paddle/pten/api/lib/CMakeLists.txt
index a4726b3d426f63bcbb2d2d8e07aa1f286a0bc0bb..2f605e491daa4f68c592ef291f53bc578361d18b 100644
--- a/paddle/pten/api/lib/CMakeLists.txt
+++ b/paddle/pten/api/lib/CMakeLists.txt
@@ -1,6 +1,20 @@
 add_subdirectory(utils)
 
-cc_library(math_api SRCS math.cc DEPS pten)
-cc_library(linalg_api SRCS linalg.cc DEPS pten)
-cc_library(creation_api SRCS creation.cc DEPS pten)
-cc_library(manipulation_api SRCS manipulation.cc DEPS pten)
+cc_library(ext_compat_utils SRCS ext_compat_utils.cc DEPS place)
+
+if (WITH_GPU)
+  nv_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils enforce)
+elseif (WITH_ROCM)
+  hip_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils enforce)
+else()
+  cc_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils enforce)
+endif()
+
+cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS pten_tensor device_context kernel_factory)
+
+cc_library(op_meta_info SRCS op_meta_info.cc DEPS pten_tensor)
+
+cc_library(math_api SRCS math.cc DEPS pten_tensor pten kernel_dispatch)
+cc_library(linalg_api SRCS linalg.cc DEPS pten_tensor pten kernel_dispatch)
+cc_library(creation_api SRCS creation.cc DEPS pten_tensor pten kernel_dispatch)
+cc_library(manipulation_api SRCS manipulation.cc DEPS pten_tensor pten kernel_dispatch)
diff --git a/paddle/pten/api/include/backend_set.h b/paddle/pten/api/lib/backend_set.h
similarity index 90%
rename from paddle/pten/api/include/backend_set.h
rename to paddle/pten/api/lib/backend_set.h
index e01c195e955301326007b71339a785668e56837a..923048cdd2c84dea22d9adf41eb5ddc8f868d63d 100644
--- a/paddle/pten/api/include/backend_set.h
+++ b/paddle/pten/api/lib/backend_set.h
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #include <ostream>
 
-#include "paddle/fluid/platform/enforce.h"
+#include "paddle/pten/api/ext/exception.h"
 #include "paddle/pten/common/backend.h"
 namespace paddle {
 namespace experimental {
@@ -38,10 +38,7 @@ class BackendSet final {
   uint64_t bitset() const { return bitset_; }
 
   bool inline Has(Backend b) const {
-    PADDLE_ENFORCE_NE(b,
-                      Backend::UNDEFINED,
-                      platform::errors::InvalidArgument(
-                          "Backend argument can't be UNDEFINED."));
+    PD_CHECK(b != Backend::UNDEFINED, "Backend argument can't be UNDEFINED.");
     return static_cast<bool>(bitset_ & BackendSet(b).bitset());
   }
   bool IsEmpty() const { return bitset_ == 0; }
diff --git a/paddle/pten/api/lib/creation.cc b/paddle/pten/api/lib/creation.cc
index 450b4d3de7eb3c7920be573291c6404efd2fb562..f39baa0207c73b1d5e9c429748826eba20129224 100644
--- a/paddle/pten/api/lib/creation.cc
+++ b/paddle/pten/api/lib/creation.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 
 #include "glog/logging.h"
 
+#include "paddle/pten/api/include/registry.h"
 #include "paddle/pten/api/lib/kernel_dispatch.h"
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/include/core.h"
@@ -26,11 +27,11 @@ limitations under the License. */
 namespace paddle {
 namespace experimental {
 
-Tensor full(const std::vector<int64_t>& shape,
-            const Scalar& value,
-            DataType dtype,
-            Backend backend,
-            DataLayout layout) {
+PD_DLL_DECL Tensor full(const std::vector<int64_t>& shape,
+                        const Scalar& value,
+                        DataType dtype,
+                        Backend backend,
+                        DataLayout layout) {
   // 1. Get kernel signature and kernel
   pten::KernelKey kernel_key{backend, layout, dtype};
   auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
@@ -61,11 +62,11 @@ Tensor full(const std::vector<int64_t>& shape,
   return out;
 }
 
-Tensor full_like(const Tensor& x,
-                 const Scalar& value,
-                 DataType dtype,
-                 Backend backend,
-                 DataLayout layout) {
+PD_DLL_DECL Tensor full_like(const Tensor& x,
+                             const Scalar& value,
+                             DataType dtype,
+                             Backend backend,
+                             DataLayout layout) {
   // 1. Get kernel signature and kernel
   auto kernel_key_set = ParseKernelKeyByInputArgs(x);
   auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
@@ -106,19 +107,21 @@ Tensor full_like(const Tensor& x,
   return out;
 }
 
-Tensor ones_like(const Tensor& x,
-                 DataType dtype,
-                 Backend backend,
-                 DataLayout layout) {
+PD_DLL_DECL Tensor ones_like(const Tensor& x,
+                             DataType dtype,
+                             Backend backend,
+                             DataLayout layout) {
   return full_like(x, 1, dtype, backend, layout);
 }
 
-Tensor zeros_like(const Tensor& x,
-                  DataType dtype,
-                  Backend backend,
-                  DataLayout layout) {
+PD_DLL_DECL Tensor zeros_like(const Tensor& x,
+                              DataType dtype,
+                              Backend backend,
+                              DataLayout layout) {
   return full_like(x, 0, dtype, backend, layout);
 }
 
 }  // namespace experimental
 }  // namespace paddle
+
+PT_REGISTER_API(Creation);
diff --git a/paddle/pten/api/lib/ext_compat_utils.cc b/paddle/pten/api/lib/ext_compat_utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3aef1257072910347ea517249a84c50c5ca4aec1
--- /dev/null
+++ b/paddle/pten/api/lib/ext_compat_utils.cc
@@ -0,0 +1,54 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/api/lib/ext_compat_utils.h"
+#include "paddle/fluid/platform/gpu_info.h"
+
+namespace paddle {
+namespace experimental {
+
+platform::Place ConvertExtPlaceToInnerPlace(const PlaceType& p) {
+  if (p == PlaceType::kCPU) {
+    return platform::Place(platform::CPUPlace());
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  } else if (p == PlaceType::kGPU) {
+    return platform::Place(platform::CUDAPlace(platform::GetCurrentDeviceId()));
+#endif
+  } else {
+    PADDLE_THROW(
+        platform::errors::Unimplemented("Unsupported place type code(%d) when "
+                                        "casting enum place to paddle place.",
+                                        static_cast<int>(p)));
+  }
+  return platform::Place();
+}
+
+PlaceType ConvertInnerPlaceToExtPlace(const platform::Place& p) {
+  if (platform::is_cpu_place(p)) {
+    return PlaceType::kCPU;
+  } else if (platform::is_gpu_place(p)) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    return PlaceType::kGPU;
+#endif
+  } else {
+    PADDLE_THROW(
+        platform::errors::Unimplemented("Unsupported place type `%s` when "
+                                        "casting paddle place to enum place.",
+                                        p));
+  }
+  return PlaceType::kUNK;
+}
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/fluid/extension/include/ext_all.h b/paddle/pten/api/lib/ext_compat_utils.h
similarity index 55%
rename from paddle/fluid/extension/include/ext_all.h
rename to paddle/pten/api/lib/ext_compat_utils.h
index 6987b33012f64d6e4d473ffc7ae666c432c65967..3a4ea20ff88e09784126d1831ffde91628f67e58 100644
--- a/paddle/fluid/extension/include/ext_all.h
+++ b/paddle/pten/api/lib/ext_compat_utils.h
@@ -14,19 +14,15 @@ limitations under the License. */
 
 #pragma once
 
-#if !defined(_MSC_VER) && __cplusplus < 201402L
-#error C++14 or later compatible compiler is required to use Paddle.
-#endif
-
-#ifdef _WIN32
-#ifndef NOMINMAX
-#define NOMINMAX  // msvc max/min macro conflict with std::min/max
-#endif
-#endif
-
-#include "ext_dispatch.h"      // NOLINT
-#include "ext_dtype.h"         // NOLINT
-#include "ext_exception.h"     // NOLINT
-#include "ext_op_meta_info.h"  // NOLINT
-#include "ext_place.h"         // NOLINT
-#include "ext_tensor.h"        // NOLINT
+#include "paddle/fluid/platform/place.h"
+#include "paddle/pten/api/ext/place.h"
+
+namespace paddle {
+namespace experimental {
+
+platform::Place ConvertExtPlaceToInnerPlace(const PlaceType& p);
+
+PlaceType ConvertInnerPlaceToExtPlace(const platform::Place& p);
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/pten/api/lib/kernel_dispatch.cc b/paddle/pten/api/lib/kernel_dispatch.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0205a0d53c319fdc7c4a809fd0aecf9f3a11210e
--- /dev/null
+++ b/paddle/pten/api/lib/kernel_dispatch.cc
@@ -0,0 +1,61 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/api/lib/kernel_dispatch.h"
+
+#include "paddle/pten/core/convert_utils.h"
+
+namespace paddle {
+namespace experimental {
+namespace detail {
+
+BackendSet GetTensorBackendSet(const Tensor& t) {
+  BackendSet backend_set(pten::TransToPtenBackend(t.inner_place()));
+  switch (t.layout()) {
+    case DataLayout::MKLDNN:
+      backend_set = backend_set | BackendSet(Backend::MKLDNN);
+      break;
+    default:
+      // do nothing
+      break;
+  }
+  return backend_set;
+}
+
+std::size_t CountLeadingZeros(uint64_t val) {
+  if (val == 0) {
+    return 64;
+  }
+  std::size_t zero_bits = 0;
+  for (std::size_t shift = 64 >> 1; shift; shift >>= 1) {
+    uint64_t tmp = val >> shift;
+    if (tmp) {
+      val = tmp;
+    } else {
+      zero_bits |= shift;
+    }
+  }
+  return zero_bits;
+}
+
+}  // namespace detail
+
+paddle::platform::DeviceContext* GetDeviceContextByBackend(
+    pten::Backend backend) {
+  auto& pool = paddle::platform::DeviceContextPool::Instance();
+  return pool.Get(pten::TransToFluidPlace(backend));
+}
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/pten/api/lib/kernel_dispatch.h b/paddle/pten/api/lib/kernel_dispatch.h
index 567c21eeee9e86757db8842d49afd6ad5cfa02f6..46aa9ce992939ac98b3718f693f5c737893768ce 100644
--- a/paddle/pten/api/lib/kernel_dispatch.h
+++ b/paddle/pten/api/lib/kernel_dispatch.h
@@ -18,13 +18,12 @@ limitations under the License. */
 #include <string>
 #include <utility>
 
-#include "paddle/pten/api/include/backend_set.h"
 #include "paddle/pten/api/include/tensor.h"
+#include "paddle/pten/api/lib/backend_set.h"
 #include "paddle/pten/common/data_type.h"
 #include "paddle/pten/common/layout.h"
 
 // TODO(chenweihang): split KernelName, Key, Kernel, Factory into diff files
-#include "paddle/pten/core/convert_utils.h"
 #include "paddle/pten/core/kernel_factory.h"
 
 // See Note [ Why still include the fluid headers? ]
@@ -40,36 +39,13 @@ using CUDAContext = paddle::platform::CUDADeviceContext;
 #endif
 
 namespace detail {
-BackendSet GetTensorBackendSet(const Tensor& t) {
-  BackendSet backend_set(pten::TransToPtenBackend(t.place()));
-  switch (t.layout()) {
-    case DataLayout::MKLDNN:
-      backend_set = backend_set | BackendSet(Backend::MKLDNN);
-      break;
-    default:
-      // do nothing
-      break;
-  }
-  return backend_set;
-}
-
-std::size_t CountLeadingZeros(uint64_t val) {
-  if (val == 0) {
-    return 64;
-  }
-  std::size_t zero_bits = 0;
-  for (std::size_t shift = 64 >> 1; shift; shift >>= 1) {
-    uint64_t tmp = val >> shift;
-    if (tmp) {
-      val = tmp;
-    } else {
-      zero_bits |= shift;
-    }
-  }
-  return zero_bits;
-}
+BackendSet GetTensorBackendSet(const Tensor& t);
+std::size_t CountLeadingZeros(uint64_t val);
 }  // namespace detail
 
+paddle::platform::DeviceContext* GetDeviceContextByBackend(
+    pten::Backend backend);
+
 // TODO(chenweihang): support DataLayout and DataType selected
 struct KernelKeySet {
   BackendSet backend_set{Backend::UNDEFINED};
@@ -144,11 +120,5 @@ KernelKeySet ParseKernelKeyByInputArgs(const Args&... args) {
   return detail::KernelKeyParser().apply(args...).key_set;
 }
 
-paddle::platform::DeviceContext* GetDeviceContextByBackend(
-    pten::Backend backend) {
-  auto& pool = paddle::platform::DeviceContextPool::Instance();
-  return pool.Get(pten::TransToFluidPlace(backend));
-}
-
 }  // namespace experimental
 }  // namespace paddle
diff --git a/paddle/pten/api/lib/linalg.cc b/paddle/pten/api/lib/linalg.cc
index 0ede7b8a68b416c0101579321e4fd507a69dc897..66c7b570b38374360321e46ebd008d6bc9b79047 100644
--- a/paddle/pten/api/lib/linalg.cc
+++ b/paddle/pten/api/lib/linalg.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 
 #include "glog/logging.h"
 
+#include "paddle/pten/api/include/registry.h"
 #include "paddle/pten/api/lib/kernel_dispatch.h"
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/convert_utils.h"
@@ -29,7 +30,7 @@ limitations under the License. */
 namespace paddle {
 namespace experimental {
 
-Tensor dot(const Tensor& x, const Tensor& y) {
+PD_DLL_DECL Tensor dot(const Tensor& x, const Tensor& y) {
   // 1. Get kernel signature and kernel
   auto kernel_key_set = ParseKernelKeyByInputArgs(x);
   auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
@@ -64,10 +65,10 @@ Tensor dot(const Tensor& x, const Tensor& y) {
   return out;
 }
 
-Tensor matmul(const Tensor& x,
-              const Tensor& y,
-              bool transpose_x,
-              bool transpose_y) {
+PD_DLL_DECL Tensor matmul(const Tensor& x,
+                          const Tensor& y,
+                          bool transpose_x,
+                          bool transpose_y) {
   // 1. Get kernel signature and kernel
   auto kernel_key_set = ParseKernelKeyByInputArgs(x, y);
   auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
@@ -108,3 +109,5 @@ Tensor matmul(const Tensor& x,
 
 }  // namespace experimental
 }  // namespace paddle
+
+PT_REGISTER_API(Linalg);
diff --git a/paddle/pten/api/lib/manipulation.cc b/paddle/pten/api/lib/manipulation.cc
index 43752120e37100af3d737e629623bd76c9b23d87..4b80b2010ccc07f5f0c333638aa1702cb683d064 100644
--- a/paddle/pten/api/lib/manipulation.cc
+++ b/paddle/pten/api/lib/manipulation.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <memory>
 
 #include "glog/logging.h"
+#include "paddle/pten/api/include/registry.h"
 #include "paddle/pten/api/lib/kernel_dispatch.h"
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/include/core.h"
@@ -25,7 +26,7 @@ limitations under the License. */
 namespace paddle {
 namespace experimental {
 
-Tensor flatten(const Tensor& x, int start_axis, int stop_axis) {
+PD_DLL_DECL Tensor flatten(const Tensor& x, int start_axis, int stop_axis) {
   // 1. Get kernel signature and kernel
   auto kernel_key_set = ParseKernelKeyByInputArgs(x);
   auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
@@ -60,3 +61,5 @@ Tensor flatten(const Tensor& x, int start_axis, int stop_axis) {
 }
 }  // namespace experimental
 }  // namespace paddle
+
+PT_REGISTER_API(Manipulation);
diff --git a/paddle/pten/api/lib/math.cc b/paddle/pten/api/lib/math.cc
index 3c6536af6afa9aa851c234330f36cb419a846700..be16e83219bc2c5a2a78747c7c6509df65fadee2 100644
--- a/paddle/pten/api/lib/math.cc
+++ b/paddle/pten/api/lib/math.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 
 #include "glog/logging.h"
 
+#include "paddle/pten/api/include/registry.h"
 #include "paddle/pten/api/lib/kernel_dispatch.h"
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/include/core.h"
@@ -27,7 +28,7 @@ limitations under the License. */
 namespace paddle {
 namespace experimental {
 
-Tensor mean(const Tensor& x) {
+PD_DLL_DECL Tensor mean(const Tensor& x) {
   // 1. Get kernel signature and kernel
   auto kernel_key_set = ParseKernelKeyByInputArgs(x);
   auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
@@ -60,7 +61,7 @@ Tensor mean(const Tensor& x) {
   return out;
 }
 
-Tensor add(const Tensor& x, const Tensor& y) {
+PD_DLL_DECL Tensor add(const Tensor& x, const Tensor& y) {
   // 1. Get kernel signature and kernel
   auto kernel_key_set = ParseKernelKeyByInputArgs(x);
   auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
@@ -97,3 +98,5 @@ Tensor add(const Tensor& x, const Tensor& y) {
 
 }  // namespace experimental
 }  // namespace paddle
+
+PT_REGISTER_API(Math);
diff --git a/paddle/fluid/extension/src/ext_op_meta_info.cc b/paddle/pten/api/lib/op_meta_info.cc
similarity index 97%
rename from paddle/fluid/extension/src/ext_op_meta_info.cc
rename to paddle/pten/api/lib/op_meta_info.cc
index 40cad7a155226bfa0d97501ae93540449c86ad4a..586fa0cc05526fc15beff00fca214dbf84682e14 100644
--- a/paddle/fluid/extension/src/ext_op_meta_info.cc
+++ b/paddle/pten/api/lib/op_meta_info.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/extension/include/ext_op_meta_info.h"
+#include "paddle/pten/api/ext/op_meta_info.h"
 
 #include <string>
 #include <unordered_map>
@@ -72,7 +72,8 @@ OpMetaInfoBuilder::OpMetaInfoBuilder(std::string&& name, size_t index) {
   auto& info_vector = OpMetaInfoMap::Instance()[name_];
   // index check
   PADDLE_ENFORCE_EQ(
-      info_vector.size(), index_,
+      info_vector.size(),
+      index_,
       platform::errors::PreconditionNotMet(
           "The operator %s's meta info register failed. "
           "Please make sure you call marcos as order `PD_BUILD_OP`, "
@@ -122,7 +123,8 @@ OpMetaInfoBuilder& OpMetaInfoBuilder::SetKernelFn(KernelFunc func) {
 
 OpMetaInfoBuilder& OpMetaInfoBuilder::SetInferShapeFn(InferShapeFunc func) {
   PADDLE_ENFORCE_EQ(
-      index_, 0UL,
+      index_,
+      0UL,
       platform::errors::Unimplemented(
           "Currently, the InferShapeFn setting of Grad Op is not supported, "
           "And backward Tensor `X@GRAD` will use the shape of forward Tensor "
@@ -133,7 +135,8 @@ OpMetaInfoBuilder& OpMetaInfoBuilder::SetInferShapeFn(InferShapeFunc func) {
 
 OpMetaInfoBuilder& OpMetaInfoBuilder::SetInferDtypeFn(InferDtypeFunc func) {
   PADDLE_ENFORCE_EQ(
-      index_, 0UL,
+      index_,
+      0UL,
       platform::errors::Unimplemented(
           "Currently, the InferDtypeFn setting of Grad Op is not supported, "
           "And backward Tensor `X@GRAD` will use the dtype of forward Tensor "
diff --git a/paddle/pten/api/lib/tensor.cc b/paddle/pten/api/lib/tensor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5d10c7209e2b866a8f3d4c33342b3b6e5bb03694
--- /dev/null
+++ b/paddle/pten/api/lib/tensor.cc
@@ -0,0 +1,343 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/api/include/tensor.h"
+
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "glog/logging.h"
+#include "paddle/pten/api/lib/ext_compat_utils.h"
+#include "paddle/pten/api/lib/utils/allocator.h"
+#include "paddle/pten/api/lib/utils/storage.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/tensor_base.h"
+#include "paddle/pten/core/tensor_meta.h"
+
+/**
+ * [ Why still include the fluid headers? ]
+ *
+ * We hope to organize the basic implementation of Tensor and the logic related
+ * to Tensor computation into an independent library, which we call
+ * [Tensor Operation Library, pten], so we extract or rewrite the original
+ * Kernels.
+ *
+ * In the future, the training library, inference library and custom operators
+ * will link to this Tensor Operation library.
+ *
+ * However, if we directly split the link relation, we need to make too many
+ * changes, which will affect the stability of the framework, so here we still
+ * rely on the implementation of the framework, which is a intermediate state.
+ *
+ * In the future, the necessary components will be moved to the this library,
+ * or the corresponding components will be re-implemented.
+ */
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/memory/memory.h"
+#include "paddle/fluid/platform/complex.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/float16.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/platform/stream/cuda_stream.h"
+
+namespace paddle {
+namespace experimental {
+
+namespace detail {
+
+inline bool IsDenseTensor(
+    const std::shared_ptr<pten::TensorBase> &tensor_impl) {
+  return tensor_impl->type_info().name() == "DenseTensor";
+}
+
+}  // namespace detail
+
+/////// Tensor Methods ////////
+
+/* Part 1: Construction and destruction methods */
+
+Tensor::Tensor(std::shared_ptr<pten::TensorBase> tensor_impl)
+    : impl_(std::move(tensor_impl)) {
+  PADDLE_ENFORCE_NOT_NULL(impl_,
+                          platform::errors::InvalidArgument(
+                              "TensorImpl with nullptr is not supported"));
+}
+
+Tensor::Tensor(const PlaceType &place)
+    : impl_(std::move(std::make_shared<pten::DenseTensor>(
+          std::move(pten::make_intrusive<SharedStorage>(
+              ConvertExtPlaceToInnerPlace(place))),
+          std::move(pten::DenseTensorMeta(pten::DataType::UNDEFINED,
+                                          framework::make_ddim({}),
+                                          pten::DataLayout::NCHW))))) {}
+
+Tensor::Tensor(const PlaceType &place, const std::vector<int64_t> &shape)
+    : impl_(std::move(std::make_shared<pten::DenseTensor>(
+          std::move(pten::make_intrusive<SharedStorage>(
+              ConvertExtPlaceToInnerPlace(place))),
+          std::move(pten::DenseTensorMeta(pten::DataType::UNDEFINED,
+                                          framework::make_ddim(shape),
+                                          pten::DataLayout::NCHW))))) {}
+
+/* Part 2: Dimension, DataType and DataLayout methods */
+
+int64_t Tensor::numel() const { return impl_->numel(); }
+
+int64_t Tensor::size() const { return impl_->numel(); }
+
+paddle::framework::DDim Tensor::dims() const { return impl_->dims(); }
+
+std::vector<int64_t> Tensor::shape() const {
+  return paddle::framework::vectorize<int64_t>(impl_->dims());
+}
+
+void Tensor::reshape(const std::vector<int64_t> &shape) {
+  PADDLE_THROW(platform::errors::Unimplemented(
+      "The reshape operation is not supported now, "
+      "and it will be implemented by calling the reshape kernel later."));
+}
+
+DataType Tensor::dtype() const { return impl_->data_type(); }
+
+DataType Tensor::type() const { return impl_->data_type(); }
+
+DataLayout Tensor::layout() const { return impl_->layout(); }
+
+/* Part 3: Device and Backend methods */
+
+PlaceType Tensor::place() const {
+  return ConvertInnerPlaceToExtPlace(impl_->place());
+}
+
+paddle::platform::Place Tensor::inner_place() const { return impl_->place(); }
+
+bool Tensor::is_cpu() const {
+  return paddle::platform::is_cpu_place(impl_->place());
+}
+
+bool Tensor::is_cuda() const {
+  return paddle::platform::is_gpu_place(impl_->place());
+}
+
+/* Part 4: Data Access methods */
+
+template <typename T>
+T *Tensor::mutable_data() {
+  if (detail::IsDenseTensor(impl_)) {
+    return std::dynamic_pointer_cast<pten::DenseTensor>(impl_)
+        ->mutable_data<T>();
+  }
+  return nullptr;
+}
+
+template PD_DLL_DECL float *Tensor::mutable_data<float>();
+template PD_DLL_DECL double *Tensor::mutable_data<double>();
+template PD_DLL_DECL int64_t *Tensor::mutable_data<int64_t>();
+template PD_DLL_DECL int32_t *Tensor::mutable_data<int32_t>();
+template PD_DLL_DECL uint8_t *Tensor::mutable_data<uint8_t>();
+template PD_DLL_DECL int8_t *Tensor::mutable_data<int8_t>();
+template PD_DLL_DECL int16_t *Tensor::mutable_data<int16_t>();
+template PD_DLL_DECL bool *Tensor::mutable_data<bool>();
+template PD_DLL_DECL paddle::platform::complex<float>
+    *Tensor::mutable_data<paddle::platform::complex<float>>();
+template PD_DLL_DECL paddle::platform::complex<double>
+    *Tensor::mutable_data<paddle::platform::complex<double>>();
+template PD_DLL_DECL paddle::platform::float16 *
+Tensor::mutable_data<paddle::platform::float16>();
+
+template <typename T>
+T *Tensor::mutable_data(const PlaceType &place) {
+  auto inner_place = ConvertExtPlaceToInnerPlace(place);
+  PADDLE_ENFORCE_EQ(
+      platform::is_same_place(inner_place, impl_->place()),
+      true,
+      platform::errors::Unimplemented("Modification of tensor place through "
+                                      "mutable_data is not supported now"));
+  return mutable_data<T>();
+}
+
+template PD_DLL_DECL float *Tensor::mutable_data<float>(const PlaceType &place);
+template PD_DLL_DECL double *Tensor::mutable_data<double>(
+    const PlaceType &place);
+template PD_DLL_DECL int64_t *Tensor::mutable_data<int64_t>(
+    const PlaceType &place);
+template PD_DLL_DECL int32_t *Tensor::mutable_data<int32_t>(
+    const PlaceType &place);
+template PD_DLL_DECL uint8_t *Tensor::mutable_data<uint8_t>(
+    const PlaceType &place);
+template PD_DLL_DECL int8_t *Tensor::mutable_data<int8_t>(
+    const PlaceType &place);
+template PD_DLL_DECL int16_t *Tensor::mutable_data<int16_t>(
+    const PlaceType &place);
+template PD_DLL_DECL bool *Tensor::mutable_data<bool>(const PlaceType &place);
+template PD_DLL_DECL paddle::platform::complex<float> *
+Tensor::mutable_data<paddle::platform::complex<float>>(const PlaceType &place);
+template PD_DLL_DECL paddle::platform::complex<double> *
+Tensor::mutable_data<paddle::platform::complex<double>>(const PlaceType &place);
+template PD_DLL_DECL paddle::platform::float16 *
+Tensor::mutable_data<paddle::platform::float16>(const PlaceType &place);
+
+template <typename T>
+const T *Tensor::data() const {
+  if (detail::IsDenseTensor(impl_)) {
+    return std::dynamic_pointer_cast<pten::DenseTensor>(impl_)->data<T>();
+  }
+  return nullptr;
+}
+
+template PD_DLL_DECL const float *Tensor::data<float>() const;
+template PD_DLL_DECL const double *Tensor::data<double>() const;
+template PD_DLL_DECL const int64_t *Tensor::data<int64_t>() const;
+template PD_DLL_DECL const int32_t *Tensor::data<int32_t>() const;
+template PD_DLL_DECL const uint8_t *Tensor::data<uint8_t>() const;
+template PD_DLL_DECL const int8_t *Tensor::data<int8_t>() const;
+template PD_DLL_DECL const int16_t *Tensor::data<int16_t>() const;
+template PD_DLL_DECL const bool *Tensor::data<bool>() const;
+template PD_DLL_DECL const paddle::platform::complex<float>
+    *Tensor::data<paddle::platform::complex<float>>() const;
+template PD_DLL_DECL const paddle::platform::complex<double>
+    *Tensor::data<paddle::platform::complex<double>>() const;
+template PD_DLL_DECL const paddle::platform::float16 *
+Tensor::data<paddle::platform::float16>() const;
+
+template <typename T>
+T *Tensor::data() {
+  PADDLE_THROW(platform::errors::Unimplemented(
+      "It is not currently supported to directly obtain the modifiable data "
+      "address through the tensor::data<T>() method, please use the "
+      "tensor::mutable_data<T>() method."));
+  return nullptr;
+}
+
+template PD_DLL_DECL float *Tensor::data<float>();
+template PD_DLL_DECL double *Tensor::data<double>();
+template PD_DLL_DECL int64_t *Tensor::data<int64_t>();
+template PD_DLL_DECL int32_t *Tensor::data<int32_t>();
+template PD_DLL_DECL uint8_t *Tensor::data<uint8_t>();
+template PD_DLL_DECL int8_t *Tensor::data<int8_t>();
+template PD_DLL_DECL int16_t *Tensor::data<int16_t>();
+template PD_DLL_DECL bool *Tensor::data<bool>();
+template PD_DLL_DECL paddle::platform::complex<float>
+    *Tensor::data<paddle::platform::complex<float>>();
+template PD_DLL_DECL paddle::platform::complex<double>
+    *Tensor::data<paddle::platform::complex<double>>();
+template PD_DLL_DECL paddle::platform::float16 *
+Tensor::data<paddle::platform::float16>();
+
+Tensor Tensor::slice(const int64_t begin_idx, const int64_t end_idx) const {
+  PADDLE_THROW(platform::errors::Unimplemented(
+      "The slice operation is not supported now, "
+      "and it will be implemented by calling the slice kernel later."));
+  return Tensor();
+}
+
+std::shared_ptr<pten::TensorBase> Tensor::impl() const { return impl_; }
+
+void Tensor::set_impl(const std::shared_ptr<pten::TensorBase> &impl) {
+  impl_ = impl;
+}
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+gpuStream_t Tensor::stream() const {
+  return platform::stream::get_current_stream(-1)->raw_stream();
+}
+#endif
+
+/* Part 5: Data Transform methods */
+
+template <typename T>
+Tensor Tensor::copy_to(const PlaceType &target_place) const {
+  PADDLE_THROW(platform::errors::Unimplemented(
+      "The copy_to operation is not supported now, "
+      "and it will be implemented by calling the copy kernel later."));
+  return Tensor();
+}
+
+template PD_DLL_DECL Tensor
+Tensor::copy_to<float>(const PlaceType &target_place) const;
+template PD_DLL_DECL Tensor
+Tensor::copy_to<double>(const PlaceType &target_place) const;
+template PD_DLL_DECL Tensor
+Tensor::copy_to<int64_t>(const PlaceType &target_place) const;
+template PD_DLL_DECL Tensor
+Tensor::copy_to<int32_t>(const PlaceType &target_place) const;
+template PD_DLL_DECL Tensor
+Tensor::copy_to<uint8_t>(const PlaceType &target_place) const;
+template PD_DLL_DECL Tensor
+Tensor::copy_to<int8_t>(const PlaceType &target_place) const;
+template PD_DLL_DECL Tensor
+Tensor::copy_to<int16_t>(const PlaceType &target_place) const;
+template PD_DLL_DECL Tensor
+Tensor::copy_to<bool>(const PlaceType &target_place) const;
+template PD_DLL_DECL Tensor Tensor::copy_to<paddle::platform::complex<float>>(
+    const PlaceType &target_place) const;
+template PD_DLL_DECL Tensor Tensor::copy_to<paddle::platform::complex<double>>(
+    const PlaceType &target_place) const;
+template PD_DLL_DECL Tensor
+Tensor::copy_to<paddle::platform::float16>(const PlaceType &target_place) const;
+
+Tensor Tensor::to(const PlaceType &target_place) const {
+  PADDLE_THROW(platform::errors::Unimplemented(
+      "The to operation is not supported now, "
+      "and it will be implemented by calling the copy kernel later."));
+  return Tensor();
+}
+
+Tensor Tensor::cast(const DataType &target_type) const {
+  PADDLE_THROW(platform::errors::Unimplemented(
+      "The cast operation is not supported now, "
+      "and it will be implemented by calling the cast kernel later."));
+  return Tensor();
+}
+
+/* Part 6: Status utils methods */
+
+bool Tensor::defined() const { return impl_ != nullptr; }
+
+bool Tensor::initialized() const {
+  return impl_ != nullptr && impl_->initialized();
+}
+
+bool Tensor::is_initialized() const {
+  return impl_ != nullptr && impl_->initialized();
+}
+
+void Tensor::reset() { impl_.reset(); }
+
+/* Part 7: Operator overloading */
+
+Tensor &Tensor::operator=(const Tensor &x) & {
+  impl_ = x.impl_;
+  autograd_meta_ = x.autograd_meta_;
+  return *this;
+}
+
+Tensor &Tensor::operator=(Tensor &&x) & {
+  impl_ = std::move(x.impl_);
+  autograd_meta_ = std::move(x.autograd_meta_);
+  return *this;
+}
+
+AbstractAutogradMeta *Tensor::get_autograd_meta() const {
+  return autograd_meta_.get();
+}
+
+void Tensor::set_autograd_meta(
+    std::shared_ptr<AbstractAutogradMeta> autograd_meta) {
+  autograd_meta_ = std::move(autograd_meta);
+}
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/pten/api/lib/utils/storage.h b/paddle/pten/api/lib/utils/storage.h
index 2ba1e2e90eef214b8701bb8b32dff34481b9d52e..b05ae4cb08494adb930efe5c4ddb5dfab8f3d5f4 100644
--- a/paddle/pten/api/lib/utils/storage.h
+++ b/paddle/pten/api/lib/utils/storage.h
@@ -63,11 +63,24 @@ class SharedStorage : public pten::Storage {
     size_ = allocation->size();
   }
 
+  // In order to be compatible with the original Tensor design and execution
+  // system, we need to allow the uninitialized SharedStorage to exist,
+  // and it can be removed after the compatibility phase is over in the future
+  explicit SharedStorage(const paddle::platform::Place& place) {
+    data_ = pten::Allocation(nullptr, place);
+  }
+
   static const char* name() { return "SharedStorage"; }
 
+  // In order to be compatible with the original Tensor design and execution
+  // system, we need to allow the SharedStorage realloc,
+  // and it can be removed after the compatibility phase is over in the future
   void Realloc(size_t n) override {
-    PADDLE_THROW(paddle::platform::errors::Unavailable(
-        "The external shared storage cannot be reallocated."));
+    if (data() != nullptr) {
+      PADDLE_THROW(paddle::platform::errors::Unavailable(
+          "The external shared storage cannot be reallocated."));
+    }
+    ResetAllocation(paddle::memory::AllocShared(place(), n), 0);
   }
 
   void Clear() override {
diff --git a/paddle/pten/api/lib/utils/tensor_utils.cc b/paddle/pten/api/lib/utils/tensor_utils.cc
index 52554bf7af0cadeff416546ec7c21cfe2988a189..2c362a11b11d9c112ab477bbee40c5edc4723725 100644
--- a/paddle/pten/api/lib/utils/tensor_utils.cc
+++ b/paddle/pten/api/lib/utils/tensor_utils.cc
@@ -113,19 +113,30 @@ std::unique_ptr<pten::TensorBase> MakePtenTensorBaseFromVar(
 }
 
 void MovesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst) {
-  CHECK(src);
-  CHECK(dst);
+  PADDLE_ENFORCE_NOT_NULL(
+      src,
+      platform::errors::InvalidArgument(
+          "The source DenseTensor is nullptr when move storage."));
+  PADDLE_ENFORCE_NOT_NULL(
+      dst,
+      platform::errors::InvalidArgument(
+          "The destination Tensor is nullptr when move storage."));
   dst->Resize(src->dims());
   auto storage = src->release();
-  CHECK(storage->OwnsMemory());
   std::shared_ptr<paddle::memory::allocation::Allocation> holder(
       new TensorStorage(std::move(storage)));
   dst->ResetHolderWithType(holder, pten::TransToProtoVarType(src->data_type()));
 }
 
 void MovesStorage(pten::DenseTensor* src, paddle::framework::LoDTensor* dst) {
-  CHECK(src);
-  CHECK(dst);
+  PADDLE_ENFORCE_NOT_NULL(
+      src,
+      platform::errors::InvalidArgument(
+          "The source DenseTensor is nullptr when move storage."));
+  PADDLE_ENFORCE_NOT_NULL(
+      dst,
+      platform::errors::InvalidArgument(
+          "The destination LoDTensor is nullptr when move storage."));
   SetLoD(dst->mutable_lod(), src->lod());
   MovesStorage(src, static_cast<paddle::framework::Tensor*>(dst));
 }
diff --git a/paddle/pten/common/backend.h b/paddle/pten/common/backend.h
index e0bf746050a672ace718e33f56df1391218954f7..94080701e28166c20952ba09d8eccb1394508923 100644
--- a/paddle/pten/common/backend.h
+++ b/paddle/pten/common/backend.h
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #include <ostream>
 
-#include "paddle/fluid/platform/enforce.h"
+#include "paddle/pten/api/ext/exception.h"
 
 namespace paddle {
 namespace experimental {
@@ -80,8 +80,7 @@ inline std::ostream& operator<<(std::ostream& os, Backend backend) {
       os << "CUDNN";
       break;
     default:
-      PADDLE_THROW(platform::errors::InvalidArgument(
-          "Invalid enum backend type `%d`.", static_cast<int>(backend)));
+      PD_THROW("Invalid enum backend type `", static_cast<int>(backend), "`.");
   }
   return os;
 }
diff --git a/paddle/pten/common/data_type.h b/paddle/pten/common/data_type.h
index 27ca28b2734859612fd9b2a4d1356097e39e5342..1ddee0746d4d16e8d31c52f2442b31955709f3a3 100644
--- a/paddle/pten/common/data_type.h
+++ b/paddle/pten/common/data_type.h
@@ -14,11 +14,11 @@ limitations under the License. */
 
 #pragma once
 
-// See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/platform/bfloat16.h"
-#include "paddle/fluid/platform/complex.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/float16.h"
+#include "bfloat16.h"  // NOLINT
+#include "complex.h"   // NOLINT
+#include "float16.h"   // NOLINT
+
+#include "paddle/pten/api/ext/exception.h"
 
 namespace paddle {
 namespace experimental {
@@ -72,9 +72,9 @@ inline size_t SizeOf(DataType data_type) {
       return 16;
     case DataType::UNDEFINED:
     case DataType::NUM_DATA_TYPES:
-      PADDLE_THROW(platform::errors::Unimplemented(
-          "Data type %d is not supported by tensor.",
-          static_cast<int>(data_type)));
+      PD_THROW("Data type `",
+               static_cast<int>(data_type),
+               "` is not supported by tensor.");
   }
   return 0;
 }
@@ -173,8 +173,7 @@ inline std::ostream& operator<<(std::ostream& os, DataType dtype) {
       os << "complex128";
       break;
     default:
-      PADDLE_THROW(platform::errors::InvalidArgument(
-          "Invalid enum data type `%d`.", static_cast<int>(dtype)));
+      PD_THROW("Invalid enum data type `", static_cast<int>(dtype), "`.");
   }
   return os;
 }
@@ -184,4 +183,13 @@ inline std::ostream& operator<<(std::ostream& os, DataType dtype) {
 
 namespace pten {
 using DataType = paddle::experimental::DataType;
-}
+}  // namespace pten
+
+namespace paddle {
+// In order to be compatible with the original custom operator Tensor interface
+using DataType = paddle::experimental::DataType;
+using bfloat16 = paddle::experimental::bfloat16;
+using complex64 = paddle::experimental::complex64;
+using complex128 = paddle::experimental::complex128;
+using float16 = paddle::experimental::float16;
+}  // namespace paddle
diff --git a/paddle/pten/common/layout.h b/paddle/pten/common/layout.h
index 0da10dff4335b9618fcec6dd19a324b15a519705..b93e2623ee7d0f8cbcfd3ecb410799d2a31cd2f1 100644
--- a/paddle/pten/common/layout.h
+++ b/paddle/pten/common/layout.h
@@ -14,8 +14,7 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/fluid/platform/enforce.h"
-
+#include "paddle/pten/api/ext/exception.h"
 namespace paddle {
 namespace experimental {
 
@@ -46,8 +45,8 @@ inline std::ostream& operator<<(std::ostream& os, DataLayout layout) {
       os << "MKLDNN";
       break;
     default:
-      PADDLE_THROW(platform::errors::InvalidArgument(
-          "Invalid enum data layout type `%d`.", static_cast<int>(layout)));
+      PD_THROW(
+          "Invalid enum data layout type `", static_cast<int>(layout), "`.");
   }
   return os;
 }
diff --git a/paddle/pten/common/scalar.h b/paddle/pten/common/scalar.h
index ef648ba70f33684c9d81295c092a0aeec82a660c..bc2488024f13bf8baabc76fececc1c15a0da8cab 100644
--- a/paddle/pten/common/scalar.h
+++ b/paddle/pten/common/scalar.h
@@ -15,8 +15,9 @@ limitations under the License. */
 #pragma once
 
 #include <cstdint>
+#include <limits>
 
-#include "paddle/fluid/platform/enforce.h"
+#include "paddle/pten/api/ext/exception.h"
 
 namespace paddle {
 namespace experimental {
@@ -60,8 +61,7 @@ class Scalar {
       case Tag::HAS_B:
         return static_cast<T>(data_.b);
       default:
-        PADDLE_THROW(platform::errors::InvalidArgument(
-            "Invalid enum scalar type tag `%d`.", static_cast<int>(tag)));
+        PD_THROW("Invalid enum scalar type tag `", static_cast<int>(tag), "`.");
     }
   }
 
@@ -83,4 +83,4 @@ class Scalar {
 
 namespace pten {
 using Scalar = paddle::experimental::Scalar;
-}
+}  // namespace pten
diff --git a/paddle/pten/core/CMakeLists.txt b/paddle/pten/core/CMakeLists.txt
index a7ccf314674384bc8f3e97a3cb3bfa461b715d50..ae09507193e9cc24f4ec341b58669eda8b651e42 100644
--- a/paddle/pten/core/CMakeLists.txt
+++ b/paddle/pten/core/CMakeLists.txt
@@ -1,9 +1,3 @@
-IF(WITH_MKLDNN)
-    set(MKLDNN_CTX_DEPS mkldnn)
-ELSE()
-    set(MKLDNN_CTX_DEPS)
-ENDIF()
-
 if(WITH_GPU)
   cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place gpu_info)
 elseif(WITH_ROCM)
diff --git a/paddle/pten/core/dense_tensor.cc b/paddle/pten/core/dense_tensor.cc
index 2dfd523579d26e780358c5b8c546785a463855df..fbc2a24312941e27c444509f5985d0c8ac162938 100644
--- a/paddle/pten/core/dense_tensor.cc
+++ b/paddle/pten/core/dense_tensor.cc
@@ -14,6 +14,11 @@ limitations under the License. */
 
 #include "paddle/pten/core/dense_tensor.h"
 
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/bfloat16.h"
+#include "paddle/fluid/platform/complex.h"
+#include "paddle/fluid/platform/float16.h"
+
 namespace pten {
 
 DenseTensor::DenseTensor(const std::shared_ptr<Allocator>& a,
@@ -74,6 +79,13 @@ void* DenseTensor::mutable_data(size_t request_bytes) {
 
 template <typename T>
 T* DenseTensor::mutable_data() {
+  // In order to be compatible with the original Tensor design and
+  // execution system, we have to reset the datatype in mutable_data<T>.
+  // When the compatibility phase is over in the future, we can delete it
+  if (meta_.type == DataType::UNDEFINED) {
+    const_cast<DataType&>(meta_.type) =
+        paddle::experimental::CppTypeToDataType<T>::Type();
+  }
   PADDLE_ENFORCE(
       (data_type() == paddle::experimental::CppTypeToDataType<T>::Type()),
       paddle::platform::errors::InvalidArgument(
diff --git a/paddle/pten/core/dense_tensor.h b/paddle/pten/core/dense_tensor.h
index 8c2b711015c9da2c5daac2005eae47fdfa2e403d..ca313ef9cc31f840e78977d70492118ad0f97fc9 100644
--- a/paddle/pten/core/dense_tensor.h
+++ b/paddle/pten/core/dense_tensor.h
@@ -76,11 +76,11 @@ class DenseTensor : public TensorBase,
 
   /// \brief Returns the number of elements contained in tensor.
   /// \return The number of elements contained in tensor.
-  int64_t numel() const;
+  int64_t numel() const override;
 
   /// \brief Returns the dims of the tensor.
   /// \return The dims of the tensor.
-  const DDim& dims() const noexcept { return meta_.dims; }
+  const DDim& dims() const noexcept override { return meta_.dims; }
 
   /// \brief Returns the lod of the tensor.
   /// \return The lod of the tensor.
@@ -93,15 +93,15 @@ class DenseTensor : public TensorBase,
 
   /// \brief Returns the data type of the tensor.
   /// \return The data type of the tensor.
-  DataType data_type() const noexcept { return meta_.type; }
+  DataType data_type() const noexcept override { return meta_.type; }
 
   /// \brief Returns the data layout of the tensor.
   /// \return The data layout of the tensor.
-  DataLayout layout() const noexcept { return meta_.layout; }
+  DataLayout layout() const noexcept override { return meta_.layout; }
 
   /// \brief Returns the data place of the tensor.
   /// \return The data place of the tensor.
-  const Place& place() const { return storage_->place(); }
+  const Place& place() const override { return storage_->place(); }
 
   /// \brief Returns the meta information of the tensor.
   /// \return The meta information of the tensor.
@@ -109,11 +109,13 @@ class DenseTensor : public TensorBase,
 
   /// \brief Test whether the metadata is valid.
   /// \return Whether the metadata is valid.
-  bool valid() const noexcept { return meta_.valid(); }
+  bool valid() const noexcept override { return meta_.valid(); }
 
   /// \brief Test whether the storage is allocated.
   /// return Whether the storage is allocated.
-  bool initialized() const { return storage_->data(); }
+  bool initialized() const override {
+    return storage_ != nullptr && storage_->data() != nullptr;
+  }
 
   /// \brief Check if storage is shared with other objects.
   /// \return Whether the storage is shared with other objects.
diff --git a/paddle/pten/tests/api/CMakeLists.txt b/paddle/pten/tests/api/CMakeLists.txt
index 00a9496d84b7e30f8039f96034e5ead0c84482b3..0e6349ef085fa1d3248ba2f82888e1d0797f47d2 100644
--- a/paddle/pten/tests/api/CMakeLists.txt
+++ b/paddle/pten/tests/api/CMakeLists.txt
@@ -1,8 +1,16 @@
-cc_test(test_mean_api SRCS test_mean_api.cc DEPS pten_api pten_api_utils)
-cc_test(test_dot_api SRCS test_dot_api.cc DEPS pten_api pten_api_utils)
-cc_test(test_matmul_api SRCS test_matmul_api.cc DEPS pten_api pten_api_utils)
-cc_test(test_fill_api SRCS test_fill_api.cc DEPS pten_api pten_api_utils)
-cc_test(test_flatten_api SRCS test_flatten_api.cc DEPS pten_api pten_api_utils)
+if(WITH_ROCM)
+  hip_test(test_pten_tensor SRCS test_pten_tensor.cc DEPS pten_tensor glog)
+else()
+  cc_test(test_pten_tensor SRCS test_pten_tensor.cc DEPS pten_tensor glog)
+endif()
+
+cc_test(test_pten_exception SRCS test_pten_exception.cc DEPS gtest)
 cc_test(test_framework_storage SRCS test_storage.cc DEPS pten_api_utils)
 cc_test(test_framework_tensor_utils SRCS test_tensor_utils.cc DEPS pten_api_utils)
-cc_test(test_elementwise_api SRCS test_elementwise_api.cc DEPS pten_api pten_api_utils)
+
+cc_test(test_mean_api SRCS test_mean_api.cc DEPS pten_tensor pten_api pten_api_utils)
+cc_test(test_dot_api SRCS test_dot_api.cc DEPS pten_tensor pten_api pten_api_utils)
+cc_test(test_matmul_api SRCS test_matmul_api.cc DEPS pten_tensor pten_api pten_api_utils)
+cc_test(test_fill_api SRCS test_fill_api.cc DEPS pten_tensor pten_api pten_api_utils)
+cc_test(test_flatten_api SRCS test_flatten_api.cc DEPS pten_tensor pten_api pten_api_utils)
+cc_test(test_elementwise_api SRCS test_elementwise_api.cc DEPS pten_tensor pten_api pten_api_utils)
diff --git a/paddle/pten/tests/api/test_dot_api.cc b/paddle/pten/tests/api/test_dot_api.cc
index 2c7b65f98135be8b464badcfa0114d9c98e0d52d..50363a3bf95244a1b3df2af56008a42aba91701a 100644
--- a/paddle/pten/tests/api/test_dot_api.cc
+++ b/paddle/pten/tests/api/test_dot_api.cc
@@ -65,8 +65,8 @@ TEST(API, dot) {
   auto out = paddle::experimental::dot(x, y);
 
   // 3. check result
-  ASSERT_EQ(out.shape().size(), 2);
-  ASSERT_EQ(out.shape()[0], 3);
+  ASSERT_EQ(out.dims().size(), 2);
+  ASSERT_EQ(out.dims()[0], 3);
   ASSERT_EQ(out.numel(), 3);
   ASSERT_EQ(out.is_cpu(), true);
   ASSERT_EQ(out.type(), pten::DataType::FLOAT32);
diff --git a/paddle/pten/tests/api/test_elementwise_api.cc b/paddle/pten/tests/api/test_elementwise_api.cc
index 79f69343149c836deaef70adf540c07a796ca3e4..e9eda8d3b18860c5b3afbe261ceca8384a664011 100644
--- a/paddle/pten/tests/api/test_elementwise_api.cc
+++ b/paddle/pten/tests/api/test_elementwise_api.cc
@@ -66,7 +66,7 @@ TEST(API, add) {
   auto out = paddle::experimental::add(x, y);
 
   // 3. check result
-  ASSERT_EQ(out.shape().size(), 2);
+  ASSERT_EQ(out.shape().size(), 2UL);
   ASSERT_EQ(out.shape()[0], 3);
   ASSERT_EQ(out.numel(), 30);
   ASSERT_EQ(out.is_cpu(), true);
diff --git a/paddle/pten/tests/api/test_fill_api.cc b/paddle/pten/tests/api/test_fill_api.cc
index 897637942547e9435d800fc9561d6a7ce94ea747..0b8d0e2e98d84a4e8b3b17ae225e9ae8258f7490 100644
--- a/paddle/pten/tests/api/test_fill_api.cc
+++ b/paddle/pten/tests/api/test_fill_api.cc
@@ -51,8 +51,8 @@ TEST(API, full_like) {
   auto out = paddle::experimental::full_like(x, val, pten::DataType::FLOAT32);
 
   // 3. check result
-  ASSERT_EQ(out.shape().size(), 2);
-  ASSERT_EQ(out.shape()[0], 3);
+  ASSERT_EQ(out.dims().size(), 2);
+  ASSERT_EQ(out.dims()[0], 3);
   ASSERT_EQ(out.numel(), 6);
   ASSERT_EQ(out.is_cpu(), true);
   ASSERT_EQ(out.type(), pten::DataType::FLOAT32);
@@ -84,8 +84,8 @@ TEST(API, zeros_like) {
   auto out = paddle::experimental::zeros_like(x, pten::DataType::INT32);
 
   // 3. check result
-  ASSERT_EQ(out.shape().size(), 2);
-  ASSERT_EQ(out.shape()[0], 3);
+  ASSERT_EQ(out.dims().size(), 2);
+  ASSERT_EQ(out.dims()[0], 3);
   ASSERT_EQ(out.numel(), 6);
   ASSERT_EQ(out.is_cpu(), true);
   ASSERT_EQ(out.type(), pten::DataType::INT32);
@@ -117,8 +117,8 @@ TEST(API, ones_like) {
   auto out = paddle::experimental::ones_like(x, pten::DataType::INT32);
 
   // 3. check result
-  ASSERT_EQ(out.shape().size(), 2);
-  ASSERT_EQ(out.shape()[0], 3);
+  ASSERT_EQ(out.dims().size(), 2);
+  ASSERT_EQ(out.dims()[0], 3);
   ASSERT_EQ(out.numel(), 6);
   ASSERT_EQ(out.is_cpu(), true);
   ASSERT_EQ(out.type(), pten::DataType::INT32);
@@ -143,7 +143,7 @@ TEST(API, full) {
   auto out = paddle::experimental::full({3, 2}, val, pten::DataType::FLOAT32);
 
   // 3. check result
-  ASSERT_EQ(out.shape().size(), 2);
+  ASSERT_EQ(out.shape().size(), 2UL);
   ASSERT_EQ(out.shape()[0], 3);
   ASSERT_EQ(out.numel(), 6);
   ASSERT_EQ(out.is_cpu(), true);
diff --git a/paddle/pten/tests/api/test_flatten_api.cc b/paddle/pten/tests/api/test_flatten_api.cc
index 3701c358c667e1c96851f5b5d18ac72101411057..5bcf60febff9afb7d9781b969c3bd1f542080b0f 100644
--- a/paddle/pten/tests/api/test_flatten_api.cc
+++ b/paddle/pten/tests/api/test_flatten_api.cc
@@ -53,9 +53,9 @@ TEST(API, flatten) {
 
   // 3. check result
   std::vector<int> expect_shape = {3, 4, 3};
-  ASSERT_EQ(out.shape()[0], expect_shape[0]);
-  ASSERT_EQ(out.shape()[1], expect_shape[1]);
-  ASSERT_EQ(out.shape()[2], expect_shape[2]);
+  ASSERT_EQ(out.dims()[0], expect_shape[0]);
+  ASSERT_EQ(out.dims()[1], expect_shape[1]);
+  ASSERT_EQ(out.dims()[2], expect_shape[2]);
   ASSERT_EQ(out.numel(), 36);
   ASSERT_EQ(out.is_cpu(), true);
   ASSERT_EQ(out.type(), pten::DataType::FLOAT32);
diff --git a/paddle/pten/tests/api/test_matmul_api.cc b/paddle/pten/tests/api/test_matmul_api.cc
index b0b649c6908c98d8eea03deda5e3fefc1db2e890..377173370de7d9b1f68c389d9b47aba763a15c16 100644
--- a/paddle/pten/tests/api/test_matmul_api.cc
+++ b/paddle/pten/tests/api/test_matmul_api.cc
@@ -63,9 +63,9 @@ TEST(API, matmul_cpu) {
   auto out = paddle::experimental::matmul(x, y, false, false);
 
   // 3. check result
-  ASSERT_EQ(out.shape().size(), 2);
-  ASSERT_EQ(out.shape()[0], 3);
-  ASSERT_EQ(out.shape()[1], 3);
+  ASSERT_EQ(out.dims().size(), 2);
+  ASSERT_EQ(out.dims()[0], 3);
+  ASSERT_EQ(out.dims()[1], 3);
   ASSERT_EQ(out.numel(), 9);
   ASSERT_EQ(out.type(), pten::DataType::FLOAT32);
   ASSERT_EQ(out.layout(), pten::DataLayout::NCHW);
@@ -135,9 +135,9 @@ TEST(API, matmul_cuda) {
   auto out = paddle::experimental::matmul(x, y, false, false);
 
   // 3. check result
-  ASSERT_EQ(out.shape().size(), 2);
-  ASSERT_EQ(out.shape()[0], 3);
-  ASSERT_EQ(out.shape()[1], 3);
+  ASSERT_EQ(out.dims().size(), 2);
+  ASSERT_EQ(out.dims()[0], 3);
+  ASSERT_EQ(out.dims()[1], 3);
   ASSERT_EQ(out.numel(), 9);
   ASSERT_EQ(out.type(), pten::DataType::FLOAT32);
   ASSERT_EQ(out.layout(), pten::DataLayout::NCHW);
@@ -148,7 +148,7 @@ TEST(API, matmul_cuda) {
   auto ref_out = std::make_shared<pten::DenseTensor>(
       alloc_cpu,
       pten::DenseTensorMeta(
-          pten::DataType::FLOAT32, out.shape(), pten::DataLayout::NCHW));
+          pten::DataType::FLOAT32, out.dims(), pten::DataLayout::NCHW));
 
   pten::Copy(*dev_ctx, *dense_out.get(), false, ref_out.get());
 
diff --git a/paddle/pten/tests/api/test_mean_api.cc b/paddle/pten/tests/api/test_mean_api.cc
index d772e58659507a7d098fdba2b785a62fab019eac..ee38e9d49cb1e7a35f07a0550fc94f8053ba62e4 100644
--- a/paddle/pten/tests/api/test_mean_api.cc
+++ b/paddle/pten/tests/api/test_mean_api.cc
@@ -54,8 +54,8 @@ TEST(API, mean) {
   auto out = paddle::experimental::mean(x);
 
   // 3. check result
-  ASSERT_EQ(out.shape().size(), 1);
-  ASSERT_EQ(out.shape()[0], 1);
+  ASSERT_EQ(out.dims().size(), 1);
+  ASSERT_EQ(out.dims()[0], 1);
   ASSERT_EQ(out.numel(), 1);
   ASSERT_EQ(out.is_cpu(), true);
   ASSERT_EQ(out.type(), pten::DataType::FLOAT32);
diff --git a/python/paddle/fluid/tests/custom_op/test_check_error.cc b/paddle/pten/tests/api/test_pten_exception.cc
similarity index 74%
rename from python/paddle/fluid/tests/custom_op/test_check_error.cc
rename to paddle/pten/tests/api/test_pten_exception.cc
index eda521a89662f5569db39ffa94dcd9a0f539e9bf..a987e9bc315455922f4fd06c0e61e62171294c33 100644
--- a/python/paddle/fluid/tests/custom_op/test_check_error.cc
+++ b/paddle/pten/tests/api/test_pten_exception.cc
@@ -12,7 +12,7 @@ limitations under the License. */
 #include <iostream>
 #include <string>
 #include "gtest/gtest.h"
-#include "paddle/fluid/extension/include/ext_exception.h"
+#include "paddle/pten/api/ext/exception.h"
 
 TEST(PD_THROW, empty) {
   bool caught_exception = false;
@@ -23,12 +23,11 @@ TEST(PD_THROW, empty) {
     std::string err_msg = e.what();
     EXPECT_TRUE(err_msg.find("An error occurred.") != std::string::npos);
 #if _WIN32
-    EXPECT_TRUE(err_msg.find("tests\\custom_op\\test_check_error.cc:20") !=
+    EXPECT_TRUE(err_msg.find("tests\\api\\test_pten_exception.cc:20") !=
                 std::string::npos);
 #else
     EXPECT_TRUE(
-        err_msg.find(
-            "python/paddle/fluid/tests/custom_op/test_check_error.cc:20") !=
+        err_msg.find("paddle/pten/tests/api/test_pten_exception.cc:20") !=
         std::string::npos);
 #endif
   }
@@ -52,13 +51,11 @@ TEST(PD_THROW, non_empty) {
     EXPECT_TRUE(err_msg.find("PD_THROW returns 0. DataType of 1 is INT. ") !=
                 std::string::npos);
 #if _WIN32
-    EXPECT_TRUE(err_msg.find("tests\\custom_op\\test_check_error.cc") !=
+    EXPECT_TRUE(err_msg.find("tests\\api\\test_pten_exception.cc") !=
                 std::string::npos);
 #else
-    EXPECT_TRUE(
-        err_msg.find(
-            "python/paddle/fluid/tests/custom_op/test_check_error.cc") !=
-        std::string::npos);
+    EXPECT_TRUE(err_msg.find("paddle/pten/tests/api/test_pten_exception.cc") !=
+                std::string::npos);
 #endif
   }
   EXPECT_TRUE(caught_exception);
@@ -84,13 +81,11 @@ TEST(PD_CHECK, FAILED) {
     EXPECT_TRUE(err_msg.find("Expected false, but it's not satisfied.") !=
                 std::string::npos);
 #if _WIN32
-    EXPECT_TRUE(err_msg.find("tests\\custom_op\\test_check_error.cc") !=
+    EXPECT_TRUE(err_msg.find("tests\\api\\test_pten_exception.cc") !=
                 std::string::npos);
 #else
-    EXPECT_TRUE(
-        err_msg.find(
-            "python/paddle/fluid/tests/custom_op/test_check_error.cc") !=
-        std::string::npos);
+    EXPECT_TRUE(err_msg.find("paddle/pten/tests/api/test_pten_exception.cc") !=
+                std::string::npos);
 #endif
   }
   EXPECT_TRUE(caught_exception);
@@ -112,13 +107,11 @@ TEST(PD_CHECK, FAILED) {
     EXPECT_TRUE(err_msg.find("PD_CHECK returns 0. DataType of 1 is INT. ") !=
                 std::string::npos);
 #if _WIN32
-    EXPECT_TRUE(err_msg.find("tests\\custom_op\\test_check_error.cc") !=
+    EXPECT_TRUE(err_msg.find("tests\\api\\test_pten_exception.cc") !=
                 std::string::npos);
 #else
-    EXPECT_TRUE(
-        err_msg.find(
-            "python/paddle/fluid/tests/custom_op/test_check_error.cc") !=
-        std::string::npos);
+    EXPECT_TRUE(err_msg.find("paddle/pten/tests/api/test_pten_exception.cc") !=
+                std::string::npos);
 #endif
   }
   EXPECT_TRUE(caught_exception);
@@ -134,13 +127,11 @@ TEST(PD_CHECK, FAILED) {
     EXPECT_TRUE(err_msg.find("Expected a > b, but it's not satisfied.") !=
                 std::string::npos);
 #if _WIN32
-    EXPECT_TRUE(err_msg.find("tests\\custom_op\\test_check_error.cc") !=
+    EXPECT_TRUE(err_msg.find("tests\\api\\test_pten_exception.cc") !=
                 std::string::npos);
 #else
-    EXPECT_TRUE(
-        err_msg.find(
-            "python/paddle/fluid/tests/custom_op/test_check_error.cc") !=
-        std::string::npos);
+    EXPECT_TRUE(err_msg.find("paddle/pten/tests/api/test_pten_exception.cc") !=
+                std::string::npos);
 #endif
   }
   EXPECT_TRUE(caught_exception);
@@ -156,13 +147,11 @@ TEST(PD_CHECK, FAILED) {
     EXPECT_TRUE(err_msg.find("PD_CHECK returns 0, because 123 > 0.345") !=
                 std::string::npos);
 #if _WIN32
-    EXPECT_TRUE(err_msg.find("tests\\custom_op\\test_check_error.cc") !=
+    EXPECT_TRUE(err_msg.find("tests\\api\\test_pten_exception.cc") !=
                 std::string::npos);
 #else
-    EXPECT_TRUE(
-        err_msg.find(
-            "python/paddle/fluid/tests/custom_op/test_check_error.cc") !=
-        std::string::npos);
+    EXPECT_TRUE(err_msg.find("paddle/pten/tests/api/test_pten_exception.cc") !=
+                std::string::npos);
 #endif
   }
   EXPECT_TRUE(caught_exception);
diff --git a/paddle/fluid/framework/custom_tensor_test.cc b/paddle/pten/tests/api/test_pten_tensor.cc
similarity index 60%
rename from paddle/fluid/framework/custom_tensor_test.cc
rename to paddle/pten/tests/api/test_pten_tensor.cc
index 342be27c896ae9924f967005f38315f70abed6f2..9cadd2664bdf3c6f2ad6db84279752a3add9271d 100644
--- a/paddle/fluid/framework/custom_tensor_test.cc
+++ b/paddle/pten/tests/api/test_pten_tensor.cc
@@ -14,15 +14,16 @@
 
 #include "glog/logging.h"
 #include "gtest/gtest.h"
-#include "paddle/fluid/extension/include/ext_all.h"
-#include "paddle/fluid/framework/custom_tensor_utils.h"
-#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/pten/api/include/tensor.h"
+#include "paddle/pten/api/lib/ext_compat_utils.h"
+
+namespace pten {
+namespace tests {
 
 template <typename T>
 paddle::Tensor InitCPUTensorForTest() {
   std::vector<int64_t> tensor_shape{5, 5};
-  auto t1 = paddle::Tensor(paddle::PlaceType::kCPU);
-  t1.reshape(tensor_shape);
+  auto t1 = paddle::Tensor(paddle::PlaceType::kCPU, tensor_shape);
   auto* p_data_ptr = t1.mutable_data<T>(paddle::PlaceType::kCPU);
   for (int64_t i = 0; i < t1.size(); i++) {
     p_data_ptr[i] = T(5);
@@ -56,21 +57,18 @@ void TestCopyTensor() {
 void TestAPIPlace() {
   std::vector<int64_t> tensor_shape = {5, 5};
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  auto t1 = paddle::Tensor(paddle::PlaceType::kGPU);
-  t1.reshape(tensor_shape);
+  auto t1 = paddle::Tensor(paddle::PlaceType::kGPU, tensor_shape);
   t1.mutable_data<float>();
   CHECK((paddle::PlaceType::kGPU == t1.place()));
 #endif
-  auto t2 = paddle::Tensor(paddle::PlaceType::kCPU);
-  t2.reshape(tensor_shape);
+  auto t2 = paddle::Tensor(paddle::PlaceType::kCPU, tensor_shape);
   t2.mutable_data<float>();
   CHECK((paddle::PlaceType::kCPU == t2.place()));
 }
 
 void TestAPISizeAndShape() {
   std::vector<int64_t> tensor_shape = {5, 5};
-  auto t1 = paddle::Tensor(paddle::PlaceType::kCPU);
-  t1.reshape(tensor_shape);
+  auto t1 = paddle::Tensor(paddle::PlaceType::kCPU, tensor_shape);
   CHECK_EQ(t1.size(), 25);
   CHECK(t1.shape() == tensor_shape);
 }
@@ -113,8 +111,7 @@ void TestAPISlice() {
 template <typename T>
 paddle::DataType TestDtype() {
   std::vector<int64_t> tensor_shape = {5, 5};
-  auto t1 = paddle::Tensor(paddle::PlaceType::kCPU);
-  t1.reshape(tensor_shape);
+  auto t1 = paddle::Tensor(paddle::PlaceType::kCPU, tensor_shape);
   t1.template mutable_data<T>();
   return t1.type();
 }
@@ -122,8 +119,7 @@ paddle::DataType TestDtype() {
 template <typename T>
 void TestCast(paddle::DataType data_type) {
   std::vector<int64_t> tensor_shape = {5, 5};
-  auto t1 = paddle::Tensor(paddle::PlaceType::kCPU);
-  t1.reshape(tensor_shape);
+  auto t1 = paddle::Tensor(paddle::PlaceType::kCPU, tensor_shape);
   t1.template mutable_data<T>();
   auto t2 = t1.cast(data_type);
   CHECK(t2.type() == data_type);
@@ -195,80 +191,12 @@ void GroupTestDtype() {
   CHECK(TestDtype<paddle::float16>() == paddle::DataType::FLOAT16);
 }
 
-void GroupTestDtypeConvert() {
-  // enum -> proto
-  CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType(
-            paddle::DataType::FLOAT64) ==
-        paddle::framework::proto::VarType::FP64);
-  CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType(
-            paddle::DataType::FLOAT32) ==
-        paddle::framework::proto::VarType::FP32);
-  CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType(
-            paddle::DataType::UINT8) ==
-        paddle::framework::proto::VarType::UINT8);
-  CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType(
-            paddle::DataType::INT8) == paddle::framework::proto::VarType::INT8);
-  CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType(
-            paddle::DataType::INT32) ==
-        paddle::framework::proto::VarType::INT32);
-  CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType(
-            paddle::DataType::INT64) ==
-        paddle::framework::proto::VarType::INT64);
-  CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType(
-            paddle::DataType::INT16) ==
-        paddle::framework::proto::VarType::INT16);
-  CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType(
-            paddle::DataType::BOOL) == paddle::framework::proto::VarType::BOOL);
-  CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType(
-            paddle::DataType::COMPLEX64) ==
-        paddle::framework::proto::VarType::COMPLEX64);
-  CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType(
-            paddle::DataType::COMPLEX128) ==
-        paddle::framework::proto::VarType::COMPLEX128);
-  CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType(
-            paddle::DataType::FLOAT16) ==
-        paddle::framework::proto::VarType::FP16);
-  // proto -> enum
-  CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType(
-            paddle::framework::proto::VarType::FP64) ==
-        paddle::DataType::FLOAT64);
-  CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType(
-            paddle::framework::proto::VarType::FP32) ==
-        paddle::DataType::FLOAT32);
-  CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType(
-            paddle::framework::proto::VarType::INT64) ==
-        paddle::DataType::INT64);
-  CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType(
-            paddle::framework::proto::VarType::INT32) ==
-        paddle::DataType::INT32);
-  CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType(
-            paddle::framework::proto::VarType::INT8) == paddle::DataType::INT8);
-  CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType(
-            paddle::framework::proto::VarType::UINT8) ==
-        paddle::DataType::UINT8);
-  CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType(
-            paddle::framework::proto::VarType::INT16) ==
-        paddle::DataType::INT16);
-  CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType(
-            paddle::framework::proto::VarType::BOOL) == paddle::DataType::BOOL);
-  CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType(
-            paddle::framework::proto::VarType::COMPLEX64) ==
-        paddle::DataType::COMPLEX64);
-  CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType(
-            paddle::framework::proto::VarType::COMPLEX128) ==
-        paddle::DataType::COMPLEX128);
-  CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType(
-            paddle::framework::proto::VarType::FP16) ==
-        paddle::DataType::FLOAT16);
-}
-
 void TestInitilized() {
-  paddle::Tensor test_tensor(paddle::PlaceType::kCPU);
+  paddle::Tensor test_tensor(paddle::PlaceType::kCPU, {1, 1});
   CHECK(test_tensor.is_initialized() == false);
-  test_tensor.reshape({1, 1});
   test_tensor.mutable_data<float>();
   CHECK(test_tensor.is_initialized() == true);
-  float* tensor_data = test_tensor.data<float>();
+  float* tensor_data = test_tensor.mutable_data<float>();
   for (int i = 0; i < test_tensor.size(); i++) {
     tensor_data[i] = 0.5;
   }
@@ -277,21 +205,23 @@ void TestInitilized() {
   }
 }
 
-TEST(CustomTensor, copyTest) {
-  VLOG(2) << "TestCopy";
-  GroupTestCopy();
+TEST(PtenTensor, All) {
+  // TODO(chenweihang, before 2021.11.20) support copy, slice and cast methods
+  // VLOG(2) << "TestCopy";
+  // GroupTestCopy();
   VLOG(2) << "TestDtype";
   GroupTestDtype();
   VLOG(2) << "TestShape";
   TestAPISizeAndShape();
   VLOG(2) << "TestPlace";
   TestAPIPlace();
-  VLOG(2) << "TestSlice";
-  TestAPISlice();
-  VLOG(2) << "TestCast";
-  GroupTestCast();
-  VLOG(2) << "TestDtypeConvert";
-  GroupTestDtypeConvert();
+  // VLOG(2) << "TestSlice";
+  // TestAPISlice();
+  // VLOG(2) << "TestCast";
+  // GroupTestCast();
   VLOG(2) << "TestInitilized";
   TestInitilized();
 }
+
+}  // namespace tests
+}  // namespace pten
diff --git a/paddle/pten/tests/common/test_backend.cc b/paddle/pten/tests/common/test_backend.cc
index 1c17c881ed24f23148525072301c5eba3b671125..0aabf22608f69bd4fd661f005e7796b65f51f0d3 100644
--- a/paddle/pten/tests/common/test_backend.cc
+++ b/paddle/pten/tests/common/test_backend.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <iostream>
 
+#include "paddle/pten/api/ext/exception.h"
 #include "paddle/pten/common/backend.h"
 
 TEST(Backend, OStream) {
@@ -42,7 +43,7 @@ TEST(Backend, OStream) {
   oss.str("");
   try {
     oss << pten::Backend::NUM_BACKENDS;
-  } catch (paddle::platform::EnforceNotMet &exception) {
+  } catch (const std::exception& exception) {
     std::string ex_msg = exception.what();
     EXPECT_TRUE(ex_msg.find("Invalid enum backend type") != std::string::npos);
   }
diff --git a/paddle/pten/tests/common/test_data_layout.cc b/paddle/pten/tests/common/test_data_layout.cc
index 7fe1b6c2ffd2be88afa2e6978f6c3e9be10745f1..4de75a7bd71fb7d9f96d6ac94dc6f2abaafefa23 100644
--- a/paddle/pten/tests/common/test_data_layout.cc
+++ b/paddle/pten/tests/common/test_data_layout.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <iostream>
 #include <sstream>
 
+#include "paddle/pten/api/ext/exception.h"
 #include "paddle/pten/common/layout.h"
 
 TEST(DataLayout, OStream) {
@@ -37,7 +38,7 @@ TEST(DataLayout, OStream) {
   oss.str("");
   try {
     oss << pten::DataLayout::NUM_DATA_LAYOUTS;
-  } catch (paddle::platform::EnforceNotMet &exception) {
+  } catch (const std::exception& exception) {
     std::string ex_msg = exception.what();
     EXPECT_TRUE(ex_msg.find("Invalid enum data layout type") !=
                 std::string::npos);
diff --git a/paddle/pten/tests/common/test_data_type.cc b/paddle/pten/tests/common/test_data_type.cc
index 28d58858bb42c8a5d778fa5ce9fa24a0a171bce4..95957260de545733b2dbcde09ba344c52802b803 100644
--- a/paddle/pten/tests/common/test_data_type.cc
+++ b/paddle/pten/tests/common/test_data_type.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <iostream>
 #include <sstream>
 
+#include "paddle/pten/api/ext/exception.h"
 #include "paddle/pten/common/data_type.h"
 
 TEST(DataType, OStream) {
@@ -61,7 +62,7 @@ TEST(DataType, OStream) {
   oss.str("");
   try {
     oss << pten::DataType::NUM_DATA_TYPES;
-  } catch (paddle::platform::EnforceNotMet &exception) {
+  } catch (const std::exception& exception) {
     std::string ex_msg = exception.what();
     EXPECT_TRUE(ex_msg.find("Invalid enum data type") != std::string::npos);
   }
diff --git a/paddle/pten/tests/core/CMakeLists.txt b/paddle/pten/tests/core/CMakeLists.txt
index b25439cfe25279b3db97afe90643624fe014a62a..9a5cfecc2917b8e40d4b941a7339935400e8484a 100644
--- a/paddle/pten/tests/core/CMakeLists.txt
+++ b/paddle/pten/tests/core/CMakeLists.txt
@@ -3,4 +3,5 @@ cc_test(test_storage SRCS test_storage.cc DEPS tensor_base)
 cc_test(test_dense_tensor SRCS test_dense_tensor.cc DEPS dense_tensor)
 cc_test(test_intrusive_ptr SRCS test_intrusive_ptr.cc)
 cc_test(test_type_info SRCS test_type_info.cc)
+cc_test(test_convert_utils SRCS test_convert_utils.cc DEPS convert_utils)
 cc_test(test_kernel_factory SRCS test_kernel_factory.cc DEPS kernel_factory)
diff --git a/paddle/pten/tests/core/test_convert_utils.cc b/paddle/pten/tests/core/test_convert_utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..51fba7cbe0658bc761562e47e4f5c0988d6c9d56
--- /dev/null
+++ b/paddle/pten/tests/core/test_convert_utils.cc
@@ -0,0 +1,73 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "gtest/gtest.h"
+#include "paddle/pten/core/convert_utils.h"
+
+namespace pten {
+namespace tests {
+
+TEST(ConvertUtils, DataType) {
+  // enum -> proto
+  CHECK(pten::TransToProtoVarType(paddle::DataType::FLOAT64) ==
+        paddle::framework::proto::VarType::FP64);
+  CHECK(pten::TransToProtoVarType(paddle::DataType::FLOAT32) ==
+        paddle::framework::proto::VarType::FP32);
+  CHECK(pten::TransToProtoVarType(paddle::DataType::UINT8) ==
+        paddle::framework::proto::VarType::UINT8);
+  CHECK(pten::TransToProtoVarType(paddle::DataType::INT8) ==
+        paddle::framework::proto::VarType::INT8);
+  CHECK(pten::TransToProtoVarType(paddle::DataType::INT32) ==
+        paddle::framework::proto::VarType::INT32);
+  CHECK(pten::TransToProtoVarType(paddle::DataType::INT64) ==
+        paddle::framework::proto::VarType::INT64);
+  CHECK(pten::TransToProtoVarType(paddle::DataType::INT16) ==
+        paddle::framework::proto::VarType::INT16);
+  CHECK(pten::TransToProtoVarType(paddle::DataType::BOOL) ==
+        paddle::framework::proto::VarType::BOOL);
+  CHECK(pten::TransToProtoVarType(paddle::DataType::COMPLEX64) ==
+        paddle::framework::proto::VarType::COMPLEX64);
+  CHECK(pten::TransToProtoVarType(paddle::DataType::COMPLEX128) ==
+        paddle::framework::proto::VarType::COMPLEX128);
+  CHECK(pten::TransToProtoVarType(paddle::DataType::FLOAT16) ==
+        paddle::framework::proto::VarType::FP16);
+  // proto -> enum
+  CHECK(pten::TransToPtenDataType(paddle::framework::proto::VarType::FP64) ==
+        paddle::DataType::FLOAT64);
+  CHECK(pten::TransToPtenDataType(paddle::framework::proto::VarType::FP32) ==
+        paddle::DataType::FLOAT32);
+  CHECK(pten::TransToPtenDataType(paddle::framework::proto::VarType::INT64) ==
+        paddle::DataType::INT64);
+  CHECK(pten::TransToPtenDataType(paddle::framework::proto::VarType::INT32) ==
+        paddle::DataType::INT32);
+  CHECK(pten::TransToPtenDataType(paddle::framework::proto::VarType::INT8) ==
+        paddle::DataType::INT8);
+  CHECK(pten::TransToPtenDataType(paddle::framework::proto::VarType::UINT8) ==
+        paddle::DataType::UINT8);
+  CHECK(pten::TransToPtenDataType(paddle::framework::proto::VarType::INT16) ==
+        paddle::DataType::INT16);
+  CHECK(pten::TransToPtenDataType(paddle::framework::proto::VarType::BOOL) ==
+        paddle::DataType::BOOL);
+  CHECK(
+      pten::TransToPtenDataType(paddle::framework::proto::VarType::COMPLEX64) ==
+      paddle::DataType::COMPLEX64);
+  CHECK(pten::TransToPtenDataType(
+            paddle::framework::proto::VarType::COMPLEX128) ==
+        paddle::DataType::COMPLEX128);
+  CHECK(pten::TransToPtenDataType(paddle::framework::proto::VarType::FP16) ==
+        paddle::DataType::FLOAT16);
+}
+
+}  // namespace tests
+}  // namespace pten
diff --git a/python/paddle/fluid/tests/custom_op/CMakeLists.txt b/python/paddle/fluid/tests/custom_op/CMakeLists.txt
index 2092151b84f454de581601e2de630e9789bfe429..68b79041355fb3d0bf6eec2ec0c59d2ad19506ff 100644
--- a/python/paddle/fluid/tests/custom_op/CMakeLists.txt
+++ b/python/paddle/fluid/tests/custom_op/CMakeLists.txt
@@ -16,8 +16,8 @@ py_test(test_multi_out_jit SRCS test_multi_out_jit.py)
 py_test(test_custom_attrs_jit SRCS test_custom_attrs_jit.py)
 py_test(test_custom_concat SRCS test_custom_concat.py)
 py_test(test_custom_conj SRCS test_custom_conj.py)
+py_test(test_custom_linear SRCS test_custom_linear.py)
 
 # other tests
 py_test(test_sysconfig SRCS test_sysconfig.py)
 py_test(test_check_abi SRCS test_check_abi.py)
-cc_test(test_check_error SRCS test_check_error.cc DEPS gtest)
diff --git a/python/paddle/fluid/tests/custom_op/attr_test_op.cc b/python/paddle/fluid/tests/custom_op/attr_test_op.cc
index 1edc10b8a8a981532cfe61875f3e792380487a76..297acd602086541cbdfc274a9b248b08b5c94758 100644
--- a/python/paddle/fluid/tests/custom_op/attr_test_op.cc
+++ b/python/paddle/fluid/tests/custom_op/attr_test_op.cc
@@ -132,8 +132,7 @@ std::vector<paddle::Tensor> AttrTestForward(
     std::vector<float> float_vec_attr,
     std::vector<int64_t> int64_vec_attr,
     std::vector<std::string> str_vec_attr) {
-  auto out = paddle::Tensor(paddle::PlaceType::kCPU);
-  out.reshape(x.shape());
+  auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape());
 
   PD_DISPATCH_FLOATING_TYPES(
       x.type(), "assign_cpu_kernel", ([&] {
@@ -161,8 +160,7 @@ std::vector<paddle::Tensor> AttrTestBackward(
     int int_attr,
     std::vector<float> float_vec_attr,
     std::vector<std::string> str_vec_attr) {
-  auto grad_x = paddle::Tensor(paddle::PlaceType::kCPU);
-  grad_x.reshape(grad_out.shape());
+  auto grad_x = paddle::Tensor(paddle::PlaceType::kCPU, grad_out.shape());
 
   PD_DISPATCH_FLOATING_TYPES(grad_out.type(), "assign_cpu_kernel", ([&] {
                                assign_cpu_kernel<data_t>(
@@ -187,8 +185,7 @@ std::vector<paddle::Tensor> ConstAttrTestForward(
     const std::vector<float>& float_vec_attr,
     const std::vector<int64_t>& int64_vec_attr,
     const std::vector<std::string>& str_vec_attr) {
-  auto out = paddle::Tensor(paddle::PlaceType::kCPU);
-  out.reshape(x.shape());
+  auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape());
 
   PD_DISPATCH_FLOATING_TYPES(
       x.type(), "assign_cpu_kernel", ([&] {
@@ -216,8 +213,7 @@ std::vector<paddle::Tensor> ConstAttrTestBackward(
     const int& int_attr,
     const std::vector<float>& float_vec_attr,
     const std::vector<std::string>& str_vec_attr) {
-  auto grad_x = paddle::Tensor(paddle::PlaceType::kCPU);
-  grad_x.reshape(grad_out.shape());
+  auto grad_x = paddle::Tensor(paddle::PlaceType::kCPU, grad_out.shape());
 
   PD_DISPATCH_FLOATING_TYPES(grad_out.type(), "assign_cpu_kernel", ([&] {
                                assign_cpu_kernel<data_t>(
diff --git a/python/paddle/fluid/tests/custom_op/custom_concat_op.cc b/python/paddle/fluid/tests/custom_op/custom_concat_op.cc
index a01e01f2bc59239e5ce6aec4a1d9ea9a27bc00d1..66cc36c300e9d6f24652ab5907a71e3a2fbc03ff 100644
--- a/python/paddle/fluid/tests/custom_op/custom_concat_op.cc
+++ b/python/paddle/fluid/tests/custom_op/custom_concat_op.cc
@@ -75,8 +75,7 @@ std::vector<paddle::Tensor> ConcatForwardDynamicAxis(
   auto out_shape = ComputeOutShape(in_shapes, axis);
 
   // create output
-  auto out = paddle::Tensor(paddle::PlaceType::kCPU);
-  out.reshape(out_shape);
+  auto out = paddle::Tensor(paddle::PlaceType::kCPU, out_shape);
 
   // calc
   PD_DISPATCH_FLOATING_AND_INTEGRAL_TYPES(
@@ -107,8 +106,7 @@ std::vector<paddle::Tensor> ConcatBackwardDynamicAxis(
   // create outputs
   std::vector<paddle::Tensor> grad_inputs;
   for (auto& t : inputs) {
-    auto grad = paddle::Tensor(paddle::PlaceType::kCPU);
-    grad.reshape(t.shape());
+    auto grad = paddle::Tensor(paddle::PlaceType::kCPU, t.shape());
     grad_inputs.emplace_back(grad);
   }
 
@@ -163,8 +161,7 @@ std::vector<paddle::Tensor> ConcatForwardStaticAxis(
   auto out_shape = ComputeOutShape(in_shapes, final_axis);
 
   // create output
-  auto out = paddle::Tensor(paddle::PlaceType::kCPU);
-  out.reshape(out_shape);
+  auto out = paddle::Tensor(paddle::PlaceType::kCPU, out_shape);
 
   // calc
   PD_DISPATCH_FLOATING_AND_INTEGRAL_TYPES(
@@ -193,8 +190,7 @@ std::vector<paddle::Tensor> ConcatBackwardStaticAxis(
   // create outputs
   std::vector<paddle::Tensor> grad_inputs;
   for (auto& t : inputs) {
-    auto grad = paddle::Tensor(paddle::PlaceType::kCPU);
-    grad.reshape(t.shape());
+    auto grad = paddle::Tensor(paddle::PlaceType::kCPU, t.shape());
     grad_inputs.emplace_back(grad);
   }
 
diff --git a/python/paddle/fluid/tests/custom_op/custom_conj_op.cc b/python/paddle/fluid/tests/custom_op/custom_conj_op.cc
index 4feb887ca036a1c911b3a2890162a839f836dacf..b9c10f479e0a39eb8e33ffceb30e8eb9cc8efa9e 100644
--- a/python/paddle/fluid/tests/custom_op/custom_conj_op.cc
+++ b/python/paddle/fluid/tests/custom_op/custom_conj_op.cc
@@ -71,8 +71,7 @@ void ConjCPUKernel(const data_t* x_data, int64_t numel, data_t* out_data) {
 std::vector<paddle::Tensor> ConjFunction(const paddle::Tensor& x) {
   CHECK_INPUT(x);
 
-  paddle::Tensor out(x.place());
-  out.reshape(x.shape());
+  paddle::Tensor out(x.place(), x.shape());
 
   PD_DISPATCH_FLOATING_AND_COMPLEX_TYPES(
       x.type(), "ConjCPUKernel", ([&] {
diff --git a/python/paddle/fluid/tests/custom_op/custom_linear_op.cc b/python/paddle/fluid/tests/custom_op/custom_linear_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6e0b44b71f7f87447bd66a052f0a394ab38b2874
--- /dev/null
+++ b/python/paddle/fluid/tests/custom_op/custom_linear_op.cc
@@ -0,0 +1,95 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <iostream>
+#include <vector>
+#include "paddle/extension.h"
+
+// The linear implemented here must be passed in bias
+std::vector<paddle::Tensor> PtenLinearForward(const paddle::Tensor& x,
+                                              const paddle::Tensor& weight,
+                                              const paddle::Tensor& bias) {
+  return {
+      paddle::experimental::add(paddle::experimental::matmul(x, weight), bias)};
+}
+
+std::vector<std::vector<int64_t>> LinearInferShape(
+    const std::vector<int64_t>& x_shape,
+    const std::vector<int64_t>& weight_shape,
+    const std::vector<int64_t>& bias_shape) {
+  auto dims_x = x_shape;
+  auto dims_y = weight_shape;
+  auto ndims_x = x_shape.size();
+  auto ndims_y = weight_shape.size();
+  PD_CHECK(ndims_x > 0,
+           "The Input(x) dims size must be greater than 0,"
+           " but reviced dims size is 0. ");
+  PD_CHECK(ndims_y > 0,
+           "The Input(y) dims size must be greater than 0,"
+           " but reviced dims size is 0. ");
+
+  bool x_broadcasted = false, y_broadcasted = false;
+  if (ndims_x == 1) {
+    dims_x.insert(dims_x.begin(), 1);
+    ndims_x = 2;
+    x_broadcasted = true;
+  }
+
+  if (ndims_y == 1) {
+    dims_y.push_back(1);
+    ndims_y = 2;
+    y_broadcasted = true;
+  }
+
+  size_t M, N;
+  M = dims_x[ndims_x - 2];
+  N = dims_y[ndims_y - 1];
+
+  std::vector<int64_t> new_dims;
+  if (ndims_x > ndims_y) {
+    new_dims.assign(dims_x.begin(), dims_x.end() - 2);
+  } else if (ndims_x < ndims_y) {
+    new_dims.assign(dims_y.begin(), dims_y.end() - 2);
+  } else {
+    new_dims.reserve(ndims_x);
+    for (size_t i = 0; i < ndims_x - 2; ++i) {
+      new_dims.push_back(std::max(dims_x[i], dims_y[i]));
+    }
+  }
+  if (!x_broadcasted) {
+    new_dims.push_back(M);
+  }
+  if (!y_broadcasted) {
+    new_dims.push_back(N);
+  }
+  if (x_broadcasted && y_broadcasted) {
+    new_dims.push_back(1);
+  }
+
+  return {new_dims};
+}
+
+std::vector<paddle::DataType> LinearInferDtype(
+    const paddle::DataType& x_dtype,
+    const paddle::DataType& weight_dtype,
+    const paddle::DataType& bias_dtype) {
+  return {x_dtype};
+}
+
+PD_BUILD_OP(pten_linear)
+    .Inputs({"X", "Weight", "Bias"})
+    .Outputs({"Out"})
+    .SetKernelFn(PD_KERNEL(PtenLinearForward))
+    .SetInferShapeFn(PD_INFER_SHAPE(LinearInferShape))
+    .SetInferDtypeFn(PD_INFER_DTYPE(LinearInferDtype));
diff --git a/python/paddle/fluid/tests/custom_op/custom_relu_op.cc b/python/paddle/fluid/tests/custom_op/custom_relu_op.cc
index c075d27f7b1763babf54cd9d378b28f29fd84566..b2ef90bf87a1af3ebe68a8f10950537978c0e390 100644
--- a/python/paddle/fluid/tests/custom_op/custom_relu_op.cc
+++ b/python/paddle/fluid/tests/custom_op/custom_relu_op.cc
@@ -21,6 +21,8 @@ template <typename data_t>
 void relu_cpu_forward_kernel(const data_t* x_data,
                              data_t* out_data,
                              int64_t x_numel) {
+  PD_CHECK(x_data != nullptr, "x_data is nullptr.");
+  PD_CHECK(out_data != nullptr, "out_data is nullptr.");
   for (int i = 0; i < x_numel; ++i) {
     out_data[i] = std::max(static_cast<data_t>(0.), x_data[i]);
   }
@@ -52,8 +54,7 @@ std::vector<paddle::Tensor> relu_cpu_forward(const paddle::Tensor& x) {
 std::vector<paddle::Tensor> relu_cpu_backward(const paddle::Tensor& x,
                                               const paddle::Tensor& out,
                                               const paddle::Tensor& grad_out) {
-  auto grad_x = paddle::Tensor(paddle::PlaceType::kCPU);
-  grad_x.reshape(x.shape());
+  auto grad_x = paddle::Tensor(paddle::PlaceType::kCPU, x.shape());
 
   PD_DISPATCH_FLOATING_TYPES(out.type(), "relu_cpu_backward", ([&] {
                                relu_cpu_backward_kernel<data_t>(
diff --git a/python/paddle/fluid/tests/custom_op/custom_relu_op.cu b/python/paddle/fluid/tests/custom_op/custom_relu_op.cu
index 38e8e71cf8129b0e3b8ea7d816e6389c52c83a9e..dda42a5c0598478aaf80ac25bdcdea2c2ab8b6e4 100644
--- a/python/paddle/fluid/tests/custom_op/custom_relu_op.cu
+++ b/python/paddle/fluid/tests/custom_op/custom_relu_op.cu
@@ -37,20 +37,15 @@ __global__ void relu_cuda_backward_kernel(const data_t* dy,
 }
 
 std::vector<paddle::Tensor> relu_cuda_forward(const paddle::Tensor& x) {
-  auto out = paddle::Tensor(paddle::PlaceType::kGPU);
+  auto out = paddle::Tensor(paddle::PlaceType::kGPU, x.shape());
 
-  out.reshape(x.shape());
   int numel = x.size();
   int block = 512;
   int grid = (numel + block - 1) / block;
   PD_DISPATCH_FLOATING_AND_HALF_TYPES(
       x.type(), "relu_cuda_forward_kernel", ([&] {
-        auto cpu_input = x.copy_to<data_t>(paddle::PlaceType::kCPU);
-        auto gpu_input = cpu_input.copy_to<data_t>(paddle::PlaceType::kGPU);
         relu_cuda_forward_kernel<data_t><<<grid, block, 0, x.stream()>>>(
-            gpu_input.data<data_t>(),
-            out.mutable_data<data_t>(x.place()),
-            numel);
+            x.data<data_t>(), out.mutable_data<data_t>(x.place()), numel);
       }));
 
   return {out};
@@ -59,8 +54,7 @@ std::vector<paddle::Tensor> relu_cuda_forward(const paddle::Tensor& x) {
 std::vector<paddle::Tensor> relu_cuda_backward(const paddle::Tensor& x,
                                                const paddle::Tensor& out,
                                                const paddle::Tensor& grad_out) {
-  auto grad_x = paddle::Tensor(paddle::PlaceType::kGPU);
-  grad_x.reshape(x.shape());
+  auto grad_x = paddle::Tensor(paddle::PlaceType::kGPU, x.shape());
 
   int numel = out.size();
   int block = 512;
diff --git a/python/paddle/fluid/tests/custom_op/dispatch_test_op.cc b/python/paddle/fluid/tests/custom_op/dispatch_test_op.cc
index 0435f50b7c701e997c9e73ba4bfd9a8c5a998471..0f7d323b5451efba5a503d9039a03531e1773efb 100644
--- a/python/paddle/fluid/tests/custom_op/dispatch_test_op.cc
+++ b/python/paddle/fluid/tests/custom_op/dispatch_test_op.cc
@@ -27,8 +27,7 @@ void assign_cpu_kernel(const data_t* x_data,
 }
 
 std::vector<paddle::Tensor> DispatchTestInterger(const paddle::Tensor& x) {
-  auto out = paddle::Tensor(paddle::PlaceType::kCPU);
-  out.reshape(x.shape());
+  auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape());
 
   PD_DISPATCH_INTEGRAL_TYPES(
       x.type(), "assign_cpu_kernel", ([&] {
@@ -46,8 +45,7 @@ PD_BUILD_OP(dispatch_test_integer)
 
 std::vector<paddle::Tensor> DispatchTestFloatAndInteger(
     const paddle::Tensor& x) {
-  auto out = paddle::Tensor(paddle::PlaceType::kCPU);
-  out.reshape(x.shape());
+  auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape());
 
   PD_DISPATCH_FLOATING_AND_INTEGRAL_TYPES(
       x.type(), "assign_cpu_kernel", ([&] {
@@ -64,8 +62,7 @@ PD_BUILD_OP(dispatch_test_float_and_integer)
     .SetKernelFn(PD_KERNEL(DispatchTestFloatAndInteger));
 
 std::vector<paddle::Tensor> DispatchTestComplex(const paddle::Tensor& x) {
-  auto out = paddle::Tensor(paddle::PlaceType::kCPU);
-  out.reshape(x.shape());
+  auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape());
 
   PD_DISPATCH_COMPLEX_TYPES(
       x.type(), "assign_cpu_kernel", ([&] {
@@ -83,8 +80,7 @@ PD_BUILD_OP(dispatch_test_complex)
 
 std::vector<paddle::Tensor> DispatchTestFloatAndComplex(
     const paddle::Tensor& x) {
-  auto out = paddle::Tensor(paddle::PlaceType::kCPU);
-  out.reshape(x.shape());
+  auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape());
 
   PD_DISPATCH_FLOATING_AND_COMPLEX_TYPES(
       x.type(), "assign_cpu_kernel", ([&] {
@@ -102,8 +98,7 @@ PD_BUILD_OP(dispatch_test_float_and_complex)
 
 std::vector<paddle::Tensor> DispatchTestFloatAndIntegerAndComplex(
     const paddle::Tensor& x) {
-  auto out = paddle::Tensor(paddle::PlaceType::kCPU);
-  out.reshape(x.shape());
+  auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape());
 
   PD_DISPATCH_FLOATING_AND_INTEGRAL_AND_COMPLEX_TYPES(
       x.type(), "assign_cpu_kernel", ([&] {
@@ -120,8 +115,7 @@ PD_BUILD_OP(dispatch_test_float_and_integer_and_complex)
     .SetKernelFn(PD_KERNEL(DispatchTestFloatAndIntegerAndComplex));
 
 std::vector<paddle::Tensor> DispatchTestFloatAndHalf(const paddle::Tensor& x) {
-  auto out = paddle::Tensor(paddle::PlaceType::kCPU);
-  out.reshape(x.shape());
+  auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape());
 
   PD_DISPATCH_FLOATING_AND_HALF_TYPES(
       x.type(), "assign_cpu_kernel", ([&] {
diff --git a/python/paddle/fluid/tests/custom_op/multi_out_test_op.cc b/python/paddle/fluid/tests/custom_op/multi_out_test_op.cc
index 17a36df2cde48598efb3945499516c6c70edecfd..18940f20e7652305d86942b5a67a51fa98638034 100644
--- a/python/paddle/fluid/tests/custom_op/multi_out_test_op.cc
+++ b/python/paddle/fluid/tests/custom_op/multi_out_test_op.cc
@@ -34,8 +34,7 @@ void fill_constant_cpu_kernel(data_t* out_data, int64_t x_numel, data_t value) {
 }
 
 std::vector<paddle::Tensor> MultiOutCPU(const paddle::Tensor& x) {
-  auto out = paddle::Tensor(paddle::PlaceType::kCPU);
-  out.reshape(x.shape());
+  auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape());
 
   PD_DISPATCH_FLOATING_TYPES(
       x.type(), "assign_cpu_kernel", ([&] {
@@ -44,15 +43,13 @@ std::vector<paddle::Tensor> MultiOutCPU(const paddle::Tensor& x) {
       }));
 
   // fake multi output: Fake_float64 with float64 dtype
-  auto fake_float64 = paddle::Tensor(paddle::PlaceType::kCPU);
-  fake_float64.reshape(x.shape());
+  auto fake_float64 = paddle::Tensor(paddle::PlaceType::kCPU, x.shape());
 
   fill_constant_cpu_kernel<double>(
       fake_float64.mutable_data<double>(x.place()), x.size(), 0.);
 
   // fake multi output: ZFake_int32 with int32 dtype
-  auto zfake_int32 = paddle::Tensor(paddle::PlaceType::kCPU);
-  zfake_int32.reshape(x.shape());
+  auto zfake_int32 = paddle::Tensor(paddle::PlaceType::kCPU, x.shape());
 
   fill_constant_cpu_kernel<int32_t>(
       zfake_int32.mutable_data<int32_t>(x.place()), x.size(), 1);
diff --git a/python/paddle/fluid/tests/custom_op/test_custom_concat.py b/python/paddle/fluid/tests/custom_op/test_custom_concat.py
index d796c3b5fbd60b450d7dbb02cbd47905c57f4b98..9049b604c910c80a41afa2c509e2ec3fdb4ffbfc 100644
--- a/python/paddle/fluid/tests/custom_op/test_custom_concat.py
+++ b/python/paddle/fluid/tests/custom_op/test_custom_concat.py
@@ -24,8 +24,7 @@ from utils import paddle_includes, extra_cc_args, extra_nvcc_args
 
 # Because Windows don't use docker, the shared lib already exists in the
 # cache dir, it will not be compiled again unless the shared lib is removed.
-file = '{}\\custom_relu_module_jit\\custom_relu_module_jit.pyd'.format(
-    get_build_directory())
+file = '{}\\custom_concat\\custom_concat.pyd'.format(get_build_directory())
 if os.name == 'nt' and os.path.isfile(file):
     cmd = 'del {}'.format(file)
     run_cmd(cmd, True)
diff --git a/python/paddle/fluid/tests/custom_op/test_custom_conj.py b/python/paddle/fluid/tests/custom_op/test_custom_conj.py
index a8e4019880332d46031ae3847a0a3f611ed37ad2..25c88ee6c6b01daf62553f5f7857bfe06fce25ca 100644
--- a/python/paddle/fluid/tests/custom_op/test_custom_conj.py
+++ b/python/paddle/fluid/tests/custom_op/test_custom_conj.py
@@ -24,8 +24,7 @@ from utils import paddle_includes, extra_cc_args, extra_nvcc_args
 
 # Because Windows don't use docker, the shared lib already exists in the
 # cache dir, it will not be compiled again unless the shared lib is removed.
-file = '{}\\custom_relu_module_jit\\custom_relu_module_jit.pyd'.format(
-    get_build_directory())
+file = '{}\\custom_conj\\custom_conj.pyd'.format(get_build_directory())
 if os.name == 'nt' and os.path.isfile(file):
     cmd = 'del {}'.format(file)
     run_cmd(cmd, True)
diff --git a/python/paddle/fluid/tests/custom_op/test_custom_linear.py b/python/paddle/fluid/tests/custom_op/test_custom_linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ba70eaa3e06cec619b2a9175db4aa1c8bf75a8b
--- /dev/null
+++ b/python/paddle/fluid/tests/custom_op/test_custom_linear.py
@@ -0,0 +1,107 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import unittest
+import numpy as np
+
+import paddle
+import paddle.static as static
+import paddle.nn.functional as F
+from paddle.utils.cpp_extension import load, get_build_directory
+from paddle.utils.cpp_extension.extension_utils import run_cmd
+from utils import paddle_includes, extra_cc_args, extra_nvcc_args
+
+# Because Windows don't use docker, the shared lib already exists in the
+# cache dir, it will not be compiled again unless the shared lib is removed.
+file = '{}\\custom_linear\\custom_linear.pyd'.format(get_build_directory())
+if os.name == 'nt' and os.path.isfile(file):
+    cmd = 'del {}'.format(file)
+    run_cmd(cmd, True)
+
+custom_ops = load(
+    name='custom_linear_jit',
+    sources=['custom_linear_op.cc'],
+    extra_include_paths=paddle_includes,  # add for Coverage CI
+    extra_cxx_cflags=extra_cc_args,  # test for cc flags
+    extra_cuda_cflags=extra_nvcc_args,  # test for nvcc flags
+    verbose=True)
+
+
+def linear_dynamic(func, dtype, np_x, np_weight, np_bias):
+    paddle.set_device("cpu")
+    x = paddle.to_tensor(np_x, dtype=dtype)
+    weight = paddle.to_tensor(np_weight, dtype=dtype)
+    bias = paddle.to_tensor(np_bias, dtype=dtype)
+    out = func(x, weight, bias)
+    return out.numpy()
+
+
+def linear_static(func, dtype, np_x, np_weight, np_bias):
+    paddle.enable_static()
+    paddle.set_device("cpu")
+    with static.scope_guard(static.Scope()):
+        with static.program_guard(static.Program()):
+            x = static.data(name="x", shape=np_x.shape, dtype=dtype)
+            weight = static.data(
+                name="weight", shape=np_weight.shape, dtype=dtype)
+            bias = static.data(name="bias", shape=np_bias.shape, dtype=dtype)
+            out = func(x, weight, bias)
+
+            exe = static.Executor()
+            exe.run(static.default_startup_program())
+
+            out_v, = exe.run(static.default_main_program(),
+                             feed={
+                                 "x": np_x.astype(dtype),
+                                 "weight": np_weight.astype(dtype),
+                                 "bias": np_bias.astype(dtype)
+                             },
+                             fetch_list=[out.name])
+    paddle.disable_static()
+    return out_v
+
+
+class TestCustomLinearJit(unittest.TestCase):
+    def setUp(self):
+        self.dtypes = ['float32', 'float64']
+        self.np_x = np.random.random((3, 2)).astype("float32")
+        self.np_weight = np.full([2, 4], fill_value=0.5, dtype="float32")
+        self.np_bias = np.ones([4], dtype="float32")
+
+    def check_output(self, out, pd_out, name):
+        self.assertTrue(
+            np.array_equal(out, pd_out),
+            "custom op {}: {},\n paddle api {}: {}".format(name, out, name,
+                                                           pd_out))
+
+    def test_static(self):
+        for dtype in self.dtypes:
+            pten_out = linear_static(custom_ops.pten_linear, dtype, self.np_x,
+                                     self.np_weight, self.np_bias)
+            pd_out = linear_static(F.linear, dtype, self.np_x, self.np_weight,
+                                   self.np_bias)
+            self.check_output(pten_out, pd_out, "pten_out")
+
+    def test_dynamic(self):
+        for dtype in self.dtypes:
+            pten_out = linear_dynamic(custom_ops.pten_linear, dtype, self.np_x,
+                                      self.np_weight, self.np_bias)
+            pd_out = linear_dynamic(F.linear, dtype, self.np_x, self.np_weight,
+                                    self.np_bias)
+            self.check_output(pten_out, pd_out, "pten_out")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py
index 052fe8b156a53cec04fc2a216768acc1d4528312..4f075066b9d931c978b40e4b9cafc21018c43710 100644
--- a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py
+++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py
@@ -101,7 +101,7 @@ class TestJITLoad(unittest.TestCase):
         except OSError as e:
             caught_exception = True
             self.assertTrue(
-                "function \"relu_cpu_forward\" is not implemented for data type `int32_t`"
+                "function \"relu_cpu_forward\" is not implemented for data type `int32`"
                 in str(e))
             if IS_WINDOWS:
                 self.assertTrue(
@@ -123,7 +123,7 @@ class TestJITLoad(unittest.TestCase):
         except OSError as e:
             caught_exception = True
             self.assertTrue(
-                "function \"relu_cuda_forward_kernel\" is not implemented for data type `int32_t`"
+                "function \"relu_cuda_forward_kernel\" is not implemented for data type `int32`"
                 in str(e))
             self.assertTrue(
                 "python/paddle/fluid/tests/custom_op/custom_relu_op.cu" in
diff --git a/python/setup.py.in b/python/setup.py.in
index b1643e6a6d7d36c5e9a59d4daefb7e15300a549e..05418e1f46814a423dbbc77106ef2f08543376bd 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -558,9 +558,13 @@ def find_files(pattern, root, recursive=False):
 
 headers = (
     list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle')) +
-    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/extension/include')) +  # extension
+    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/pten/api')) +  # pten unify api header
+    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/pten/api/ext')) +  # custom op api
+    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/pten/api/include')) +  # pten api
+    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/pten/common')) +  # pten common headers
     # For paddle uew custom op, only copy data type headers from `paddle/fluid/platform`
-    # to `extension/incude`,
+    # to `paddle/pten/api/ext`,
+    ['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/bfloat16.h'] +
     ['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/complex.h'] +
     ['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/float16.h'] +
     ['@PADDLE_SOURCE_DIR@/paddle/utils/any.h'])
@@ -577,7 +581,6 @@ class InstallCommand(InstallCommandBase):
         ret = InstallCommandBase.finalize_options(self)
         self.install_lib = self.install_platlib
         self.install_headers = os.path.join(self.install_platlib, 'paddle', 'include')
-        
         return ret
 
 class InstallHeaders(Command):
@@ -609,8 +612,8 @@ class InstallHeaders(Command):
         elif 'third_party' not in header:
             # paddle headers
             install_dir = re.sub('@PADDLE_SOURCE_DIR@/', '', header)
-            if 'fluid' in install_dir or 'utils' in install_dir:
-                install_dir = "paddle/extension/include/"
+            if 'fluid' in install_dir:
+                install_dir = "paddle/pten/common/"
         else:
             # third_party
             install_dir = re.sub('${THIRD_PARTY_PATH}', 'third_party', header)
diff --git a/tools/get_single_test_cov.py b/tools/get_single_test_cov.py
index 421962bb584974ee0eef5140c908af55f5d5fa8c..9232924ddb07d8473e951392fcba163bdb63c9b2 100644
--- a/tools/get_single_test_cov.py
+++ b/tools/get_single_test_cov.py
@@ -87,7 +87,7 @@ def getCovinfo(rootPath, test):
         'cd %s && lcov --capture -d . -o coverage.info --rc lcov_branch_coverage=0 > /dev/null 2>&1'
         % ut_map_path)
     os.system(
-        "cd %s && lcov --extract coverage.info '/paddle/paddle/fluid/framework/*' '/paddle/paddle/fluid/imperative/*' '/paddle/paddle/fluid/inference/*' '/paddle/paddle/fluid/memory/*' '/paddle/paddle/fluid/operators/*' '/paddle/paddle/fluid/string/*' '/paddle/paddle/fluid/distributed/*' '/paddle/paddle/fluid/extension/*' '/paddle/paddle/fluid/platform/*' '/paddle/paddle/fluid/pybind/*' '/paddle/build/*' -o coverage.info.tmp --rc lcov_branch_coverage=0 > /dev/null 2>&1"
+        "cd %s && lcov --extract coverage.info '/paddle/paddle/fluid/framework/*' '/paddle/paddle/fluid/imperative/*' '/paddle/paddle/fluid/inference/*' '/paddle/paddle/fluid/memory/*' '/paddle/paddle/fluid/operators/*' '/paddle/paddle/fluid/string/*' '/paddle/paddle/fluid/distributed/*' '/paddle/paddle/fluid/platform/*' '/paddle/paddle/fluid/pybind/*' '/paddle/build/*' -o coverage.info.tmp --rc lcov_branch_coverage=0 > /dev/null 2>&1"
         % ut_map_path)
     os.system('rm -rf %s/paddle' % ut_map_path)
     os.system('rm -rf %s/coverage.info' % ut_map_path)