Unverified commit 3e7825f3, authored by ronnywang, committed by GitHub

[PluggableDevice] Add custom runtime support (#38740)

* [CustomRuntime] Add DeviceManager

* [CustomRuntime] Add DeviceInterface

* [CustomRuntime] Add Stream, Event, DeviceGuard, CallbackManager

* [CustomRuntime] Add plug-in device

* [CustomRuntime] Memory module support PluggableDevice

* [CustomRuntime] Add WITH_PLUGGABLE_DEVICE cmake option

* update

* [API] update API doc based on comments, test=develop
Co-authored-by: qili93 <qili93@qq.com>
Parent 0d46a108
......@@ -243,6 +243,7 @@ option(NEW_RELEASE_JIT "PaddlePaddle next-level release strategy for backup ji
option(WITH_ASCEND_INT64 "Compile with int64 kernel for ascend NPU" OFF)
option(WITH_POCKETFFT "Compile with pocketfft support" ON)
option(WITH_RECORD_BUILDTIME "Compile PaddlePaddle with record all targets build time" OFF)
option(WITH_CUSTOM_DEVICE "Compile with custom device support" OFF)
if(WITH_RECORD_BUILDTIME)
set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CMAKE_CURRENT_SOURCE_DIR}/tools/get_build_time.sh")
......@@ -265,6 +266,10 @@ if(SANITIZER_TYPE AND NOT "${SANITIZER_TYPE}" MATCHES "^(Address|Leak|Memory|Thr
return()
endif()
if (LINUX AND NOT WITH_CUSTOM_DEVICE AND NOT ON_INFER)
set(WITH_CUSTOM_DEVICE ON)
endif()
if(WIN32)
if(WITH_DISTRIBUTE)
MESSAGE(WARNING
......
......@@ -219,3 +219,7 @@ endif(ON_INFER)
if(WITH_CRYPTO)
add_definitions(-DPADDLE_WITH_CRYPTO)
endif(WITH_CRYPTO)
if(WITH_CUSTOM_DEVICE AND NOT WIN32)
add_definitions(-DPADDLE_WITH_CUSTOM_DEVICE)
endif()
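
The two build-system hunks above wire a WITH_CUSTOM_DEVICE CMake option (switched on by default for non-inference Linux builds) to a PADDLE_WITH_CUSTOM_DEVICE compile definition. A minimal sketch, assuming only that definition, of how the rest of the patch consumes it; the function below is hypothetical:

#include <cstdio>

// Hypothetical helper: reports at runtime whether this binary was configured
// with -DWITH_CUSTOM_DEVICE=ON (which defines PADDLE_WITH_CUSTOM_DEVICE).
void ReportBuildFlavor() {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
  std::puts("custom-device build: plug-in runtimes can be registered");
#else
  std::puts("standard build: only built-in device backends are compiled in");
#endif
}
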
......@@ -100,6 +100,11 @@ struct DLDeviceVisitor : public boost::static_visitor<::DLDevice> {
platform::errors::Unimplemented("platform::MLUPlace is not supported"));
}
inline ::DLDevice operator()(const platform::CustomPlace &place) const {
PADDLE_THROW(platform::errors::Unimplemented(
"platform::CustomPlace is not supported"));
}
inline ::DLDevice operator()(const platform::CUDAPlace &place) const {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
::DLDevice device;
......
......@@ -494,6 +494,20 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
#else
PADDLE_THROW(
platform::errors::Unimplemented("No MLU gc found in CPU/MLU paddle"));
#endif
} else if (platform::is_custom_place(place_)) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
if (IsFastEagerDeletionModeEnabled()) {
VLOG(4) << "Use unsafe fast gc for " << place_ << ".";
gc.reset(new CustomDeviceUnsafeFastGarbageCollector(place_,
max_memory_size));
} else {
VLOG(4) << "Use default stream gc for " << place_ << ".";
gc.reset(
new CustomDefaultStreamGarbageCollector(place_, max_memory_size));
}
#else
PADDLE_THROW(platform::errors::Unimplemented("No CustomDevice gc found"));
#endif
}
}
......
......@@ -18,6 +18,7 @@
#endif
#include "gflags/gflags.h"
#include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
DECLARE_double(eager_delete_tensor_gb);
DECLARE_double(memory_fraction_of_eager_deletion);
......@@ -202,6 +203,58 @@ void MLUStreamGarbageCollector::ClearCallback(
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
CustomDefaultStreamGarbageCollector::CustomDefaultStreamGarbageCollector(
const platform::CustomPlace &place, size_t max_memory_size)
: GarbageCollector(place, max_memory_size) {}
void CustomDefaultStreamGarbageCollector::Wait() const {
static_cast<platform::CustomDeviceContext *>(this->dev_ctx_)
->WaitStreamCallback();
}
void CustomDefaultStreamGarbageCollector::ClearCallback(
const std::function<void()> &callback) {
static_cast<platform::CustomDeviceContext *>(this->dev_ctx_)
->AddStreamCallback(callback);
}
CustomDeviceUnsafeFastGarbageCollector::CustomDeviceUnsafeFastGarbageCollector(
const platform::CustomPlace &place, size_t max_memory_size)
: GarbageCollector(place, max_memory_size) {}
void CustomDeviceUnsafeFastGarbageCollector::ClearCallback(
const std::function<void()> &callback) {
callback();
}
CustomStreamGarbageCollector::CustomStreamGarbageCollector(
const platform::CustomPlace &place, size_t max_memory_size)
: GarbageCollector(place, max_memory_size) {
platform::DeviceGuard guard(place);
stream_.reset(new platform::stream::Stream);
stream_->Init(place);
callback_manager_.reset(new platform::CallbackManager(stream_.get()));
}
CustomStreamGarbageCollector::~CustomStreamGarbageCollector() {
platform::DeviceGuard guard(this->dev_ctx_->GetPlace());
stream_->Synchronize();
stream_->Destroy();
}
platform::stream::Stream *CustomStreamGarbageCollector::stream() const {
return stream_.get();
}
void CustomStreamGarbageCollector::Wait() const { callback_manager_->Wait(); }
void CustomStreamGarbageCollector::ClearCallback(
const std::function<void()> &callback) {
callback_manager_->AddCallback(callback);
}
#endif
int64_t GetEagerDeletionThreshold() {
return FLAGS_eager_delete_tensor_gb < 0
? -1
......
......@@ -200,6 +200,47 @@ class MLUStreamGarbageCollector : public GarbageCollector {
};
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
class CustomDefaultStreamGarbageCollector : public GarbageCollector {
public:
CustomDefaultStreamGarbageCollector(const platform::CustomPlace &place,
size_t max_memory_size);
void Wait() const override;
protected:
void ClearCallback(const std::function<void()> &callback) override;
};
class CustomDeviceUnsafeFastGarbageCollector : public GarbageCollector {
public:
CustomDeviceUnsafeFastGarbageCollector(const platform::CustomPlace &place,
size_t max_memory_size);
protected:
void ClearCallback(const std::function<void()> &callback) override;
};
class CustomStreamGarbageCollector : public GarbageCollector {
public:
CustomStreamGarbageCollector(const platform::CustomPlace &place,
size_t max_memory_size);
~CustomStreamGarbageCollector();
void Wait() const override;
platform::stream::Stream *stream() const;
protected:
void ClearCallback(const std::function<void()> &callback) override;
private:
std::unique_ptr<platform::stream::Stream> stream_;
std::unique_ptr<platform::CallbackManager> callback_manager_;
};
#endif
template <typename Container>
void GarbageCollector::Add(Container &&objs) {
Add(std::forward<Container>(objs), []() {});
......
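
A hedged usage sketch for the three collectors declared above (not part of the patch): the unsafe-fast variant runs callbacks immediately, which is only safe when no stream can still be reading the memory, while the default-stream variant defers them behind stream callbacks. The fast_eager_deletion flag mirrors IsFastEagerDeletionModeEnabled() from the executor hunk.

#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include <memory>
#include "paddle/fluid/framework/garbage_collector.h"

std::unique_ptr<paddle::framework::GarbageCollector> MakeCustomGC(
    const paddle::platform::CustomPlace& place, size_t max_memory_size,
    bool fast_eager_deletion) {
  using namespace paddle::framework;  // NOLINT
  if (fast_eager_deletion) {
    // Frees eagerly, without waiting for in-flight device work.
    return std::make_unique<CustomDeviceUnsafeFastGarbageCollector>(
        place, max_memory_size);
  }
  // Frees only after the default stream reaches the registered callback.
  return std::make_unique<CustomDefaultStreamGarbageCollector>(
      place, max_memory_size);
}
#endif
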
......@@ -47,10 +47,20 @@ size_t OpKernelType::Hash::operator()(const OpKernelType& key) const {
"Too many OpKernel attribute values, expected maximum "
"value is 64, received value is %d.",
cur_loc));
#ifdef PADDLE_WITH_CUSTOM_DEVICE
std::hash<int> hasher;
size_t seed =
hasher(place + data_type + data_layout + library_type + customized_value);
if (platform::is_custom_place(key.place_)) {
seed ^= std::hash<std::string>{}(key.place_.GetDeviceType()) + 0x9e3779b9 +
(seed << 6) + (seed >> 2) + 4;
}
return seed;
#else
std::hash<int> hasher;
return hasher(place + data_type + data_layout + library_type +
customized_value);
#endif
}
bool OpKernelType::operator==(const OpKernelType& o) const {
......
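
The custom-place branch above mixes the device-type string into the hash with the boost-style combine constant; the extra "+ 4" in the hunk appears to be a fixed salt for this branch. A standalone restatement of the idiom:

#include <cstddef>
#include <functional>
#include <string>

// Golden-ratio hash combine: XORing the shifted seed with the new value's
// hash spreads bits so that distinct (seed, value) pairs rarely collide.
inline void HashCombine(std::size_t* seed, const std::string& value) {
  *seed ^= std::hash<std::string>{}(value) + 0x9e3779b9 + (*seed << 6) +
           (*seed >> 2);
}
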
......@@ -29,6 +29,7 @@ limitations under the License. */
#include "paddle/fluid/framework/transfer_scope_cache.h"
#include "paddle/fluid/framework/unused_var_check.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/pten/common/scalar.h"
......@@ -244,6 +245,15 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
#else
auto dev_id = place.device;
platform::SetMLUDeviceId(dev_id);
#endif
} else if (platform::is_custom_place(place)) {
#ifndef PADDLE_WITH_CUSTOM_DEVICE
PADDLE_THROW(platform::errors::Unavailable(
"Cannot run operator on place %s, please recompile paddle or "
"reinstall Paddle with CustomDevice support.",
place));
#else
platform::DeviceManager::SetDevice(place);
#endif
}
......
......@@ -532,6 +532,21 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't use XPU device since it's not compiled with XPU,"
"Please recompile or reinstall Paddle with XPU support."));
#endif
} else if (platform::is_custom_place(place)) {
#if defined(PADDLE_WITH_CUSTOM_DEVICE)
if (IsFastEagerDeletionModeEnabled()) {
gc.reset(
new CustomDeviceUnsafeFastGarbageCollector(place, max_memory_size));
} else {
gc.reset(new CustomStreamGarbageCollector(place, max_memory_size));
}
VLOG(10) << "Created " << i << "-th GarbageCollector at " << place;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't use custom device since it's not compiled with "
"CustomDevice,"
"Please recompile or reinstall Paddle with CustomDevice support."));
#endif
} else if (platform::is_cpu_place(place)) {
gc.reset(new CPUGarbageCollector(place, max_memory_size));
......
......@@ -91,7 +91,29 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place,
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (platform::is_custom_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
auto stream =
reinterpret_cast<const platform::CustomDeviceContext&>(ctx).stream();
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
} else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_custom_place(dst_place)) {
auto stream =
reinterpret_cast<const platform::CustomDeviceContext&>(ctx).stream();
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
} else if (platform::is_custom_place(src_place) && // NOLINT
platform::is_custom_place(dst_place)) {
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
<< dst_place;
return;
}
auto stream =
reinterpret_cast<const platform::CustomDeviceContext&>(ctx).stream();
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
}
#endif
#ifdef PADDLE_WITH_XPU
else if (platform::is_xpu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
......@@ -376,7 +398,8 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place,
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
const platform::DeviceContext* dev_ctx;
if (platform::is_gpu_place(dst_place) || platform::is_npu_place(dst_place) ||
platform::is_mlu_place(dst_place)) {
platform::is_mlu_place(dst_place) ||
platform::is_custom_place(dst_place)) {
dev_ctx = pool.Get(dst_place);
} else {
dev_ctx = pool.Get(src.place());
......@@ -436,6 +459,26 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
"Copy from %s to %s is not supported.", src_place, dst_place));
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (platform::is_custom_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) { /* custom_device -> cpu*/
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
}
else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_custom_place(dst_place)) { /* cpu -> custom_device*/
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
}
else if (platform::is_custom_place(src_place) && // NOLINT
platform::is_custom_place(
dst_place)) { /* custom_device -> custom_device*/
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data sync from " << src_place << " to "
<< dst_place;
return;
}
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
}
#endif
#ifdef PADDLE_WITH_XPU
else if (platform::is_xpu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
......@@ -664,6 +707,13 @@ class AnyVisitor : public boost::static_visitor<bool> {
const platform::CUDAPinnedPlace& cpu) const {
return *out.data<bool>();
}
bool GetResult(const framework::Tensor& out,
const platform::CustomPlace& custom_dev) const {
PADDLE_THROW(platform::errors::Unimplemented("Not supported on place (%s) ",
custom_dev));
return false;
}
};
template <typename Predicate>
......@@ -903,6 +953,11 @@ struct BothFalseVisitor : public boost::static_visitor<> {
out_ptr[i] = lhs && rhs;
}
}
void VisitorImpl(const platform::CustomPlace& custom_dev) const {
PADDLE_THROW(
platform::errors::Unimplemented("CustomPlace is not supported"));
}
};
void TensorIsfinite(const framework::Tensor& tensor, framework::Tensor* out) {
......@@ -1036,6 +1091,29 @@ void TensorToStream(std::ostream& os, const Tensor& tensor,
#else
PADDLE_THROW(platform::errors::Unimplemented(
"NPUPlace is not supported when not compiled with NPU"));
#endif
} else if (platform::is_custom_place(tensor.place())) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB
std::unique_ptr<char[]> buf(new char[kBufSize]);
auto& custom_device_context =
static_cast<const platform::CustomDeviceContext&>(dev_ctx);
platform::CPUPlace cpu;
uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
while (size != 0) {
size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
memory::Copy(cpu, buf.get(), tensor.place(),
reinterpret_cast<const void*>(data), size_to_write,
custom_device_context.stream());
custom_device_context.Wait();
os.write(buf.get(), size_to_write);
data += size_to_write;
size -= size_to_write;
}
#else
PADDLE_THROW(platform::errors::Unimplemented(
"CustomPlace is not supported when not compiled with "
"CustomDevice"));
#endif
} else {
os.write(static_cast<const char*>(data_ptr),
......@@ -1093,10 +1171,11 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
platform::is_xpu_place(dev_ctx.GetPlace()) ||
platform::is_mlu_place(dev_ctx.GetPlace()) ||
platform::is_npu_place(dev_ctx.GetPlace())) {
platform::is_npu_place(dev_ctx.GetPlace()) ||
platform::is_custom_place(dev_ctx.GetPlace())) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \
defined(PADDLE_WITH_ASCEND_CL)
defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CUSTOM_DEVICE)
Tensor cpu_tensor;
cpu_tensor.Resize(framework::make_ddim(shape));
framework::VisitDataType(
......@@ -1105,7 +1184,8 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
is.read(static_cast<char*>(buf), size);
auto dst_place = dev_ctx.GetPlace();
framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor);
if (platform::is_npu_place(dev_ctx.GetPlace())) {
if (platform::is_npu_place(dev_ctx.GetPlace()) ||
platform::is_custom_place(dev_ctx.GetPlace())) {
dev_ctx.Wait();
}
#else
......@@ -1163,10 +1243,11 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
platform::is_xpu_place(dev_ctx.GetPlace()) ||
platform::is_mlu_place(dev_ctx.GetPlace()) ||
platform::is_npu_place(dev_ctx.GetPlace())) {
platform::is_npu_place(dev_ctx.GetPlace()) ||
platform::is_custom_place(dev_ctx.GetPlace())) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \
defined(PADDLE_WITH_ASCEND_CL)
defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CUSTOM_DEVICE)
Tensor cpu_tensor;
cpu_tensor.Resize(framework::make_ddim(dims));
framework::VisitDataType(
......@@ -1175,7 +1256,8 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
is.read(static_cast<char*>(buf), size);
auto dst_place = dev_ctx.GetPlace();
framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor);
if (platform::is_npu_place(dev_ctx.GetPlace())) {
if (platform::is_npu_place(dev_ctx.GetPlace()) ||
platform::is_custom_place(dev_ctx.GetPlace())) {
dev_ctx.Wait();
}
#else
......@@ -1188,9 +1270,12 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
} else if (platform::is_mlu_place(dev_ctx.GetPlace())) {
PADDLE_THROW(platform::errors::Unimplemented(
"MLUPlace is not supported when not compiled with MLU"));
} else {
} else if (platform::is_npu_place(dev_ctx.GetPlace())) {
PADDLE_THROW(platform::errors::Unimplemented(
"NPUPlace is not supported when not compiled with NPU"));
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"CutomPlace is not supported when not compiled with CustomDevice"));
}
#endif
} else {
......
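
The TensorToStream hunk above drains device memory through a fixed 64 MB host staging buffer. A generic, self-contained restatement of that loop, with memcpy standing in for the device-to-host memory::Copy plus Wait() pair:

#include <algorithm>
#include <cstring>
#include <memory>
#include <ostream>

void WriteDeviceBytes(std::ostream& os, const char* device_data, size_t size) {
  constexpr size_t kBufSize = 64 << 20;  // 64 MB staging buffer, as above
  std::unique_ptr<char[]> buf(new char[kBufSize]);
  while (size != 0) {
    size_t chunk = std::min(kBufSize, size);
    std::memcpy(buf.get(), device_data, chunk);  // device -> host stand-in
    os.write(buf.get(), chunk);                  // host -> stream
    device_data += chunk;
    size -= chunk;
  }
}
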
......@@ -180,6 +180,17 @@ void TensorFromArray(const T* src, const size_t& array_size,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (platform::is_custom_place(dst_place)) { // NOLINT
memory::Copy(
dst_place, dst_ptr, src_place, src_ptr, size,
reinterpret_cast<const platform::CustomDeviceContext&>(ctx).stream());
}
#endif
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"TensorFromArray on %s is not supported.", dst_place));
}
}
template <typename T>
......@@ -241,6 +252,17 @@ void TensorFromVector(const std::vector<T>& src,
reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream());
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (platform::is_custom_place(dst_place)) { // NOLINT
memory::Copy(
dst_place, dst_ptr, src_place, src_ptr, size,
reinterpret_cast<const platform::CustomDeviceContext&>(ctx).stream());
}
#endif
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"TensorFromVector on %s is not supported.", dst_place));
}
}
// The fully specialized function should be inline to avoid
......@@ -300,6 +322,17 @@ inline void TensorFromVector(const std::vector<bool>& src,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (platform::is_custom_place(dst_place)) { // NOLINT
auto stream =
reinterpret_cast<const platform::CustomDeviceContext&>(ctx).stream();
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
}
#endif
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"TensorFromVector on %s is not supported.", dst_place));
}
delete[] array;
}
......@@ -369,6 +402,15 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx,
reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream());
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (platform::is_custom_place(src.place())) { // NOLINT
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
}
#endif
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"TensorToVector on %s is not supported.", src.place()));
}
}
template <>
......@@ -410,6 +452,11 @@ inline void TensorToVector(const Tensor& src,
dst_place, dst_ptr, src.place(), src_ptr, size,
reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream());
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (platform::is_custom_place(src.place())) { // NOLINT
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
}
#endif
for (unsigned int i = 0; i < src.numel(); i++) {
(*dst)[i] = static_cast<bool>(array[i]);
......
......@@ -180,6 +180,12 @@ class TensorAddFunctor : public boost::static_visitor<> {
"is not supported in imperative mode",
place));
}
void operator()(const platform::CustomPlace& place) const {
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
private:
int64_t numel_;
......@@ -331,7 +337,14 @@ void TensorAdd(const VarType& src, VarType* dst) {
return;
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
if (platform::is_custom_place(place)) {
PADDLE_THROW(platform::errors::Unimplemented(
"Gradient accumulation of data type (%s) on place (%s) is not "
"supported in imperative mode",
framework::DataTypeToString(data_type), place));
}
#endif
#ifdef PADDLE_WITH_XPU
if (platform::is_xpu_place(place)) {
if (data_type == framework::DataTypeTrait<float>::DataType()) {
......
......@@ -278,6 +278,16 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
if (kernel_iter == kernels.end() &&
paddle::platform::is_custom_place(expected_kernel_key.place_)) {
VLOG(3) << "missing " << place.GetDeviceType() << " kernel: " << op.Type()
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
// TODO(jiabin): Add operator.cc's line 1000 part back when we need that
// case
......
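
The kernel-selection hunk above retries the lookup with the place rewritten to CPUPlace when no custom-device kernel is registered. The same pattern, reduced to a plain map lookup with simplified stand-in types:

#include <map>
#include <string>
#include <utility>

using KernelKey = std::pair<std::string /*op*/, std::string /*place*/>;
using KernelFn = void (*)();

// Returns the kernel for `key`, falling back to the CPU registration when
// the requested place has none (mirroring the expected_kernel_key rewrite).
KernelFn FindKernelWithCpuFallback(const std::map<KernelKey, KernelFn>& kernels,
                                   KernelKey key) {
  auto it = kernels.find(key);
  if (it == kernels.end() && key.second != "CPU") {
    key.second = "CPU";
    it = kernels.find(key);
  }
  return it == kernels.end() ? nullptr : it->second;
}
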
......@@ -20,6 +20,7 @@
#include "paddle/fluid/imperative/amp_auto_cast.h"
#include "paddle/fluid/imperative/op_base.h"
#include "paddle/fluid/platform/denormal.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/string_helper.h"
......@@ -138,6 +139,17 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists(
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't use MLU device since it's not compiled with MLU,"
"Please recompile or reinstall Paddle with MLU support."));
#endif
} else if (platform::is_custom_place(place)) {
#if defined(PADDLE_WITH_CUSTOM_DEVICE)
gc.reset(new framework::CustomDefaultStreamGarbageCollector(place, 0));
VLOG(10) << "Created GarbageCollector at " << place;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't use CustomDevice since it's not compiled with "
"CustomDevice,"
"Please recompile or reinstall Paddle with CustomDevice "
"support."));
#endif
} else {
PADDLE_THROW(platform::errors::PreconditionNotMet(
......@@ -222,6 +234,14 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap<VarType>& ins,
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with MLU if use MLUPlace."));
#endif
} else if (platform::is_custom_place(place)) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
platform::DeviceManager::SetDevice(place);
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with CustomDevice if use "
"CustomPlace."));
#endif
}
if (!override_default_attr_map) {
......
......@@ -58,6 +58,11 @@ else ()
set(AllocatorFacadeDeps)
endif()
if (WITH_CUSTOM_DEVICE)
cc_library(custom_allocator SRCS custom_allocator.cc DEPS allocator device_manager)
set(AllocatorFacadeDeps ${AllocatorFacadeDeps} custom_allocator)
endif()
if (WITH_GPU)
nv_test(best_fit_allocator_test
SRCS best_fit_allocator_test.cc
......
......@@ -62,6 +62,11 @@
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/fluid/memory/allocation/custom_allocator.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#endif
PADDLE_DEFINE_EXPORTED_int64(
gpu_allocator_retry_time, 10000,
"The retry time (milliseconds) when allocator fails "
......@@ -186,6 +191,17 @@ class AllocatorFacadePrivate {
for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) {
InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto device_types = platform::DeviceManager::GetAllCustomDeviceTypes();
for (const auto& dev_type : device_types) {
for (size_t dev_id = 0;
dev_id < platform::DeviceManager::GetDeviceCount(dev_type);
++dev_id) {
InitNaiveBestFitCustomDeviceAllocator(
platform::CustomPlace(dev_type, dev_id));
}
}
#endif
break;
}
......@@ -222,6 +238,17 @@ class AllocatorFacadePrivate {
for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) {
InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto device_types = platform::DeviceManager::GetAllCustomDeviceTypes();
for (const auto& dev_type : device_types) {
for (size_t dev_id = 0;
dev_id < platform::DeviceManager::GetDeviceCount(dev_type);
++dev_id) {
InitAutoGrowthCustomDeviceAllocator(
platform::CustomPlace(dev_type, dev_id), allow_free_idle_chunk);
}
}
#endif
break;
}
......@@ -700,6 +727,21 @@ class AllocatorFacadePrivate {
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
void InitNaiveBestFitCustomDeviceAllocator(platform::CustomPlace p) {
allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
}
void InitAutoGrowthCustomDeviceAllocator(platform::CustomPlace p,
bool allow_free_idle_chunk) {
auto custom_allocator =
std::make_shared<paddle::memory::allocation::CustomAllocator>(p);
allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
custom_allocator, platform::DeviceManager::GetMinChunkSize(p),
allow_free_idle_chunk);
}
#endif
void InitSystemAllocators() {
if (!system_allocators_.empty()) return;
system_allocators_[platform::CPUPlace()] = std::make_shared<CPUAllocator>();
......@@ -770,6 +812,16 @@ class AllocatorFacadePrivate {
places.emplace_back(platform::MLUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto device_types = platform::DeviceManager::GetAllCustomDeviceTypes();
for (const auto& dev_type : device_types) {
for (size_t dev_id = 0;
dev_id < platform::DeviceManager::GetDeviceCount(dev_type);
dev_id++) {
places.emplace_back(platform::CustomPlace(dev_type, dev_id));
}
}
#endif
for (auto& p : places) {
zero_size_allocators_[p] = std::make_shared<ZeroSizeAllocator>(p);
......@@ -1005,7 +1057,6 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
"Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
}
#endif
platform::CUDAPlace p(place.GetDeviceId());
if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) {
return m_->GetAllocator(p, stream, /* create_if_not_found = */ true)
......
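
Both allocator-initialization hunks above enumerate places the same way: every registered custom device type contributes one CustomPlace per visible device. A hedged sketch of that enumeration factored into a helper (the helper itself is hypothetical; DeviceManager and CustomPlace are from the patch):

#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/fluid/platform/device/device_wrapper.h"

template <typename Fn>
void ForEachCustomPlace(Fn&& fn) {
  namespace plat = paddle::platform;
  for (const auto& dev_type : plat::DeviceManager::GetAllCustomDeviceTypes()) {
    size_t n = plat::DeviceManager::GetDeviceCount(dev_type);
    for (size_t dev_id = 0; dev_id < n; ++dev_id) {
      fn(plat::CustomPlace(dev_type, dev_id));
    }
  }
}
#endif
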
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/custom_allocator.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace memory {
namespace allocation {
bool CustomAllocator::IsAllocThreadSafe() const { return true; }
void CustomAllocator::FreeImpl(pten::Allocation* allocation) {
PADDLE_ENFORCE_EQ(
allocation->place(), place_,
platform::errors::PermissionDenied("CustomDevice memory is "
"freed in incorrect device. "
"This may be a bug"));
delete allocation;
}
pten::Allocation* CustomAllocator::AllocateImpl(size_t size) {
std::call_once(once_flag_,
[this] { platform::DeviceManager::SetDevice(place_); });
void* ptr =
platform::DeviceManager::GetDeviceWithPlace(place_)->MemoryAllocate(size);
if (LIKELY(ptr)) {
return new Allocation(ptr, size, place_);
}
size_t avail, total;
platform::DeviceManager::MemoryStats(place_, &total, &avail);
auto dev_type = platform::PlaceHelper::GetDeviceType(place_);
auto dev_id = platform::PlaceHelper::GetDeviceId(place_);
PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
"\n\nOut of memory error on %s:%d. "
"Cannot allocate %s memory on %s:%d, "
"available memory is only %s.\n\n"
"Please check whether there is any other process using %s:%d.\n"
"1. If yes, please stop them, or start PaddlePaddle on another %s.\n"
"2. If no, please decrease the batch size of your model.\n\n",
dev_type, dev_id, string::HumanReadableSize(size), dev_type, dev_id,
string::HumanReadableSize(avail), dev_type, dev_id, dev_type));
}
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <mutex> // NOLINT
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace memory {
namespace allocation {
class CustomAllocator : public Allocator {
public:
explicit CustomAllocator(const platform::CustomPlace& place)
: place_(place) {}
bool IsAllocThreadSafe() const override;
protected:
void FreeImpl(pten::Allocation* allocation) override;
pten::Allocation* AllocateImpl(size_t size) override;
private:
platform::Place place_;
std::once_flag once_flag_;
};
} // namespace allocation
} // namespace memory
} // namespace paddle
......@@ -20,6 +20,7 @@
#include "glog/logging.h"
#include "paddle/fluid/memory/detail/buddy_allocator.h"
#include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -30,7 +31,6 @@
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
#include "paddle/fluid/platform/device/device_wrapper.h"
PADDLE_DEFINE_EXPORTED_bool(
init_allocated_mem, false,
......@@ -733,6 +733,136 @@ uint64_t Release<platform::MLUPlace>(const platform::MLUPlace &place) {
#endif
}
// For CustomDevice
#ifdef PADDLE_WITH_CUSTOM_DEVICE
class BuddyAllocatorList {
private:
explicit BuddyAllocatorList(const std::string &device_type)
: device_type_(device_type) {
auto devices = platform::DeviceManager::GetDeviceList(device_type);
for (auto dev_id : devices) {
init_flags_[dev_id].reset(new std::once_flag());
}
}
static BuddyAllocatorList *CreateNewInstance(const std::string &device_type) {
return new BuddyAllocatorList(device_type);
}
public:
static BuddyAllocatorList *Instance(const std::string &device_type) {
// DeviceType -> AllocatorList
static std::unordered_map<std::string, BuddyAllocatorList *> pool;
if (pool.find(device_type) == pool.end()) {
pool[device_type] = CreateNewInstance(device_type);
}
return pool[device_type];
}
BuddyAllocator *Get(int dev_id) {
PADDLE_ENFORCE_NE(init_flags_.find(dev_id), init_flags_.end(),
platform::errors::OutOfRange(
"Cannot find %s %d, please check visible devices.",
device_type_, dev_id));
std::call_once(*init_flags_[dev_id], [this, dev_id] {
platform::DeviceManager::SetDevice(device_type_, dev_id);
platform::CustomPlace place(device_type_, dev_id);
allocators_[dev_id].reset(new BuddyAllocator(
std::unique_ptr<detail::SystemAllocator>(
new detail::CustomAllocator(device_type_, dev_id)),
platform::DeviceManager::GetMinChunkSize(place),
platform::DeviceManager::GetMaxChunkSize(place),
platform::DeviceManager::GetExtraPaddingSize(place), device_type_));
});
return allocators_[dev_id].get();
}
private:
std::string device_type_;
std::unordered_map<size_t, std::unique_ptr<std::once_flag>> init_flags_;
std::unordered_map<size_t, std::unique_ptr<BuddyAllocator>> allocators_;
};
BuddyAllocator *GetBuddyAllocator(const platform::Place &place) {
VLOG(10) << "GetBuddyAllocator place = " << place;
if (platform::is_custom_place(place)) {
return BuddyAllocatorList::Instance(
platform::PlaceHelper::GetDeviceType(place))
->Get(platform::PlaceHelper::GetDeviceId(place));
} else {
PADDLE_THROW(
platform::errors::InvalidArgument("place must be CustomPlace"));
}
}
#endif
template <>
void *Alloc<platform::CustomPlace>(const platform::CustomPlace &place,
size_t size) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
auto *buddy_allocator = GetBuddyAllocator(place);
auto *ptr = buddy_allocator->Alloc(size);
if (ptr == nullptr) {
platform::DeviceGuard guard(place);
size_t avail, total;
platform::DeviceManager::MemoryStats(place, &total, &avail);
PADDLE_THROW(platform::errors::ResourceExhausted(
"Cannot allocate %s in %s:%d, avaliable %s, total %s, used "
"%s. ",
string::HumanReadableSize(size), place.GetDeviceType(), place.device,
string::HumanReadableSize(avail), string::HumanReadableSize(total),
string::HumanReadableSize(total - avail)));
} else {
if (FLAGS_init_allocated_mem) {
platform::DeviceManager::GetDeviceWithPlace(place)->MemorySet(ptr, 0xEF,
size);
}
}
VLOG(10) << " pointer=" << ptr;
return ptr;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'CustomPlace' is not supported in CPU only device."));
#endif
}
template <>
void Free<platform::CustomPlace>(const platform::CustomPlace &place, void *p,
size_t size) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
GetBuddyAllocator(place)->Free(p);
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'CustomPlace' is not supported in CPU only device."));
#endif
}
template <>
uint64_t Release<platform::CustomPlace>(const platform::CustomPlace &place) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
return GetBuddyAllocator(place)->Release();
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'CustomPlace' is not supported in CPU only device."));
#endif
}
template <>
size_t Used<platform::CustomPlace>(const platform::CustomPlace &place) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
return GetBuddyAllocator(place)->Used();
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'CustomPlace' is not supported in CPU only device."));
#endif
}
struct AllocVisitor : public boost::static_visitor<void *> {
inline explicit AllocVisitor(size_t size) : size_(size) {}
......
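
BuddyAllocatorList above builds one BuddyAllocator per device, lazily and thread-safely, by keeping a std::once_flag per device id. The same pattern in isolation (simplified: the real class also pins the device inside the call_once body):

#include <memory>
#include <mutex>
#include <vector>

template <typename T>
class PerKeyLazy {
 public:
  explicit PerKeyLazy(size_t n) : flags_(n), objs_(n) {}

  // Thread-safe: the object for `key` is constructed exactly once, even
  // under concurrent Get() calls, and the containers are never resized.
  T* Get(size_t key) {
    std::call_once(flags_[key].flag, [&] { objs_[key].reset(new T()); });
    return objs_[key].get();
  }

 private:
  struct Flag { std::once_flag flag; };
  std::vector<Flag> flags_;
  std::vector<std::unique_ptr<T>> objs_;
};
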
......@@ -25,9 +25,7 @@ limitations under the License. */
DECLARE_uint64(reallocate_gpu_memory_in_mb);
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#include "paddle/fluid/platform/device/device_wrapper.h"
namespace paddle {
namespace memory {
......@@ -35,12 +33,37 @@ namespace detail {
BuddyAllocator::BuddyAllocator(
std::unique_ptr<SystemAllocator> system_allocator, size_t min_chunk_size,
size_t max_chunk_size, size_t extra_padding_size)
size_t max_chunk_size, size_t extra_padding_size,
const std::string dev_type)
: min_chunk_size_(min_chunk_size),
max_chunk_size_(max_chunk_size),
extra_padding_size_(extra_padding_size),
cache_(system_allocator->UseGpu()),
system_allocator_(std::move(system_allocator)) {}
system_allocator_(std::move(system_allocator)) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
if (!dev_type.empty()) {
init_allocate_size_func_ = [dev_type]() {
return platform::DeviceManager::GetInitAllocSize(
platform::PlaceHelper::CreatePlace(dev_type));
};
re_allocate_size_func_ = [dev_type]() {
return platform::DeviceManager::GetReallocSize(
platform::PlaceHelper::CreatePlace(dev_type));
};
} else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
init_allocate_size_func_ = &platform::GpuInitAllocSize;
re_allocate_size_func_ = &platform::GpuReallocSize;
#elif defined(PADDLE_WITH_ASCEND_CL)
init_allocate_size_func_ = &platform::NPUInitAllocSize;
re_allocate_size_func_ = &platform::NPUReallocSize;
#elif defined(PADDLE_WITH_MLU)
init_allocate_size_func_ = &platform::MLUInitAllocSize;
re_allocate_size_func_ = &platform::MLUReallocSize;
#endif
}
#endif
}
BuddyAllocator::~BuddyAllocator() {
VLOG(10) << "BuddyAllocator Disconstructor makes sure that all of these "
......@@ -224,6 +247,10 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
size_t allocate_bytes = max_chunk_size_;
size_t index = 0;
#ifdef PADDLE_WITH_CUSTOM_DEVICE
allocate_bytes = DeviceAllocateSize(init_allocate_size_func_,
re_allocate_size_func_, request_bytes);
#else
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
allocate_bytes = DeviceAllocateSize(&platform::GpuInitAllocSize,
&platform::GpuReallocSize, request_bytes);
......@@ -233,6 +260,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
#elif defined(PADDLE_WITH_MLU)
allocate_bytes = DeviceAllocateSize(&platform::MLUInitAllocSize,
&platform::MLUReallocSize, request_bytes);
#endif
#endif
// Allocate a new block
......
......@@ -39,7 +39,8 @@ class BuddyAllocator {
public:
BuddyAllocator(std::unique_ptr<SystemAllocator> system_allocator,
size_t min_chunk_size, size_t max_chunk_size,
size_t extra_padding_size = 0);
size_t extra_padding_size = 0,
const std::string dev_type = "");
~BuddyAllocator();
......@@ -123,6 +124,9 @@ class BuddyAllocator {
/*! Allocate CPU/GPU memory from system */
std::unique_ptr<SystemAllocator> system_allocator_;
std::mutex mutex_;
#ifdef PADDLE_WITH_CUSTOM_DEVICE
std::function<size_t()> init_allocate_size_func_, re_allocate_size_func_;
#endif
};
} // namespace detail
......
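
The constructor change above turns the per-backend #ifdef chains in RefillPool into std::function hooks bound once at construction. A minimal restatement of that strategy choice:

#include <cstddef>
#include <functional>

struct ChunkPolicy {
  std::function<size_t()> init_size;  // first allocation from the system
  std::function<size_t()> grow_size;  // subsequent pool refills
  size_t NextChunk(bool first_refill) const {
    return first_refill ? init_size() : grow_size();
  }
};

// Usage: a custom device could bind DeviceManager::GetInitAllocSize /
// GetReallocSize here, exactly as the constructor above does.
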
......@@ -38,6 +38,8 @@ limitations under the License. */
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
#include "paddle/fluid/platform/device/device_wrapper.h"
DECLARE_bool(use_pinned_memory);
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb);
......@@ -430,6 +432,51 @@ void MLUAllocator::Free(void* p, size_t size, size_t index) {
bool MLUAllocator::UseGpu() const { return true; }
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
void* CustomAllocator::Alloc(size_t* index, size_t size) {
if (size <= 0) return nullptr;
void* p;
auto place = platform::CustomPlace(dev_type_, dev_id_);
auto device = platform::DeviceManager::GetDeviceWithPlace(place);
p = device->MemoryAllocate(size);
if (LIKELY(p)) {
VLOG(4) << "CustomAllocator::Alloc " << p << " size " << size;
*index = 0;
plug_alloc_size += size;
} else {
size_t avail, total;
platform::DeviceManager::MemoryStats(place, &total, &avail);
PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
"\n\nOut of memory error on %s %d. "
"total memory is %s, used memory is %s, "
"available memory is only %s.\n\n",
dev_type_, dev_id_, string::HumanReadableSize(total),
string::HumanReadableSize(total - avail),
string::HumanReadableSize(avail)));
}
return p;
}
void CustomAllocator::Free(void* p, size_t size, size_t index) {
VLOG(4) << "CustomAllocator::Free " << p << " size " << size;
PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument(
"The index should be 0, index is %d", index));
PADDLE_ENFORCE_GE(plug_alloc_size, size,
platform::errors::InvalidArgument(
"The size of memory (%d) to free exceeds the size of "
"allocated gpu memory (%d)",
size, plug_alloc_size));
plug_alloc_size -= size;
auto place = platform::CustomPlace(dev_type_, dev_id_);
auto device = platform::DeviceManager::GetDeviceWithPlace(place);
device->MemoryDeallocate(p, size);
}
bool CustomAllocator::UseGpu() const { return true; }
#endif
} // namespace detail
} // namespace memory
} // namespace paddle
......@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <stddef.h> // for size_t
#include <string>
namespace paddle {
namespace memory {
......@@ -107,6 +108,23 @@ class MLUAllocator : public SystemAllocator {
};
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
class CustomAllocator : public SystemAllocator {
public:
explicit CustomAllocator(const std::string& device_type, size_t dev_id)
: dev_type_(device_type), dev_id_(dev_id) {}
virtual void* Alloc(size_t* index, size_t size);
virtual void Free(void* p, size_t size, size_t index);
virtual bool UseGpu() const;
private:
size_t plug_alloc_size = 0;
std::string dev_type_;
size_t dev_id_;
};
#endif
} // namespace detail
} // namespace memory
} // namespace paddle
This diff has been collapsed.
......@@ -36,66 +36,25 @@ namespace memory {
template <typename DstPlace, typename SrcPlace>
void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
/**
* \brief Copy memory from one place to another place.
*
* \param[in] DstPlace Destination allocation place (CPU or GPU).
* \param[in] dst Destination memory address.
* \param[in] SrcPlace Source allocation place (CPU or GPU).
* \param[in] src Source memory address.
* \param[in] num memory size in bytes to copy.
* \param[in] stream CUDA stream.
*
* \note For GPU memory copy, CUDA stream need to be specified
* for asynchronously memory copy.
*
*/
template <typename DstPlace, typename SrcPlace>
void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num,
gpuStream_t stream);
#endif
#ifdef PADDLE_WITH_ASCEND_CL
/**
* \brief Copy memory from one place to another place.
*
* \param[in] DstPlace Destination allocation place (CPU or NPU).
* \param[in] dst Destination memory address.
* \param[in] SrcPlace Source allocation place (CPU or NPU).
* \param[in] src Source memory address.
* \param[in] num memory size in bytes to copy.
* \param[in] stream NPU stream.
*
* \note For NPU memory copy, NPU stream need to be specified
* for asynchronously memory copy.
*
*/
template <typename DstPlace, typename SrcPlace>
void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num,
aclrtStream stream);
#endif
#ifdef PADDLE_WITH_MLU
/**
* \brief Copy memory from one place to another place.
*
* \param[in] DstPlace Destination allocation place (CPU or MLU).
* \param[in] DstPlace Destination allocation place (CPU or GPU or XPU or
* CustomDevice).
* \param[in] dst Destination memory address.
* \param[in] SrcPlace Source allocation place (CPU or MLU).
* \param[in] SrcPlace Source allocation place (CPU or GPU or XPU or
* CustomDevice).
* \param[in] src Source memory address.
* \param[in] num memory size in bytes to copy.
* \param[in] stream MLU stream.
* \param[in] stream stream for asynchronous memory copy.
*
* \note For MLU memory copy, MLU stream need to be specified
* for asynchronously memory copy.
* \note For GPU/XPU/CustomDevice memory copy, a stream needs to be
* specified for asynchronous memory copy, and its concrete type is
* restored in the implementation.
*
*/
template <typename DstPlace, typename SrcPlace>
void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num,
mluStream stream);
#endif
void* stream);
} // namespace memory
} // namespace paddle
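
The header rewrite above collapses the per-backend Copy overloads (gpuStream_t, aclrtStream, mluStream) into a single signature whose stream argument is type-erased to void*, with nullptr meaning a synchronous copy. A self-contained sketch of that convention, with memcpy standing in for backend copies:

#include <cstring>

// Backend-agnostic copy: each implementation reinterpret_casts `stream`
// back to its own stream type; nullptr requests a synchronous copy (the
// convention the TensorCopySync hunks rely on).
void CopyBytes(void* dst, const void* src, size_t num, void* stream) {
  if (stream == nullptr) {
    std::memcpy(dst, src, num);  // synchronous path
    return;
  }
  // A real backend would enqueue an async copy on the recovered stream.
  std::memcpy(dst, src, num);
}
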
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/math_function.h"
#ifdef PADDLE_WITH_MKLML
#include "paddle/fluid/platform/dynload/mklml.h"
#endif
#ifdef PADDLE_USE_OPENBLAS
#include <cblas.h>
#endif
#include <memory>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/operators/math/math_function_impl.h"
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/pten/backends/cpu/cpu_context.h"
#include "paddle/pten/kernels/funcs/eigen/common.h"
#include "unsupported/Eigen/CXX11/Tensor"
namespace paddle {
namespace operators {
namespace math {
using float16 = paddle::platform::float16;
template struct SetConstant<platform::CPUDeviceContext, platform::float16>;
template struct SetConstant<platform::CPUDeviceContext, platform::bfloat16>;
template struct SetConstant<platform::CPUDeviceContext, float>;
template struct SetConstant<platform::CPUDeviceContext, double>;
template struct SetConstant<platform::CPUDeviceContext, int16_t>;
template struct SetConstant<platform::CPUDeviceContext, int>;
template struct SetConstant<platform::CPUDeviceContext, int64_t>;
template struct SetConstant<platform::CPUDeviceContext, bool>;
template struct SetConstant<platform::CPUDeviceContext, uint8_t>;
template struct SetConstant<platform::CPUDeviceContext,
platform::complex<float>>;
template struct SetConstant<platform::CPUDeviceContext,
platform::complex<double>>;
template struct SetConstant<pten::CPUContext, platform::float16>;
template struct SetConstant<pten::CPUContext, platform::bfloat16>;
template struct SetConstant<pten::CPUContext, float>;
template struct SetConstant<pten::CPUContext, double>;
template struct SetConstant<pten::CPUContext, int16_t>;
template struct SetConstant<pten::CPUContext, int>;
template struct SetConstant<pten::CPUContext, int64_t>;
template struct SetConstant<pten::CPUContext, bool>;
template struct SetConstant<pten::CPUContext, uint8_t>;
template struct SetConstant<pten::CPUContext, platform::complex<float>>;
template struct SetConstant<pten::CPUContext, platform::complex<double>>;
#ifdef PADDLE_WITH_XPU
template struct SetConstant<platform::XPUDeviceContext, platform::float16>;
template struct SetConstant<platform::XPUDeviceContext, platform::bfloat16>;
template struct SetConstant<platform::XPUDeviceContext, float>;
template struct SetConstant<platform::XPUDeviceContext, double>;
template struct SetConstant<platform::XPUDeviceContext, uint8_t>;
template struct SetConstant<platform::XPUDeviceContext, int16_t>;
template struct SetConstant<platform::XPUDeviceContext, int>;
template struct SetConstant<platform::XPUDeviceContext, int64_t>;
template struct SetConstant<platform::XPUDeviceContext, bool>;
template struct SetConstant<platform::XPUDeviceContext,
platform::complex<float>>;
template struct SetConstant<platform::XPUDeviceContext,
platform::complex<double>>;
#endif
#define DEFINE_CPU_TRANS(RANK) \
template struct Transpose<platform::CPUDeviceContext, platform::float16, \
RANK>; \
template struct Transpose<platform::CPUDeviceContext, platform::bfloat16, \
RANK>; \
template struct Transpose<platform::CPUDeviceContext, float, RANK>; \
template struct Transpose<platform::CPUDeviceContext, double, RANK>; \
template struct Transpose<platform::CPUDeviceContext, int, RANK>; \
template struct Transpose<platform::CPUDeviceContext, int64_t, RANK>; \
template struct Transpose<platform::CPUDeviceContext, bool, RANK>; \
template struct Transpose<platform::CPUDeviceContext, int16_t, RANK>; \
template struct Transpose<platform::CPUDeviceContext, uint8_t, RANK>; \
template struct Transpose<platform::CPUDeviceContext, int8_t, RANK>; \
template struct Transpose<platform::CPUDeviceContext, \
platform::complex<float>, RANK>; \
template struct Transpose<platform::CPUDeviceContext, \
platform::complex<double>, RANK>;
DEFINE_CPU_TRANS(1);
DEFINE_CPU_TRANS(2);
DEFINE_CPU_TRANS(3);
DEFINE_CPU_TRANS(4);
DEFINE_CPU_TRANS(5);
DEFINE_CPU_TRANS(6);
template <typename T>
struct TransposeNormal<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& in, framework::Tensor* out,
const std::vector<int>& axis) {
const int rank = axis.size();
auto in_stride = framework::stride(in.dims());
auto out_stride = framework::stride(out->dims());
const T* in_ptr = in.data<T>();
T* out_ptr = out->data<T>();
auto transpose_helper = [&](int64_t beg, int64_t end) {
for (int64_t out_idx = beg; out_idx < end; ++out_idx) {
int64_t in_idx = 0;
int64_t tmp_idx = out_idx;
// calculate the input index
for (int i = 0; i < rank; ++i) {
const int64_t coordinate = tmp_idx / out_stride[i];
tmp_idx -= coordinate * out_stride[i];
in_idx += coordinate * in_stride[axis[i]];
}
out_ptr[out_idx] = in_ptr[in_idx];
}
};
transpose_helper(0, out->numel());
}
};
// define transpose normal
#define DEFINE_CPU_TRANS_NORMAL(TYPE) \
template struct TransposeNormal<platform::CPUDeviceContext, TYPE>
DEFINE_CPU_TRANS_NORMAL(platform::float16);
DEFINE_CPU_TRANS_NORMAL(platform::bfloat16);
DEFINE_CPU_TRANS_NORMAL(float);
DEFINE_CPU_TRANS_NORMAL(double);
DEFINE_CPU_TRANS_NORMAL(int);
DEFINE_CPU_TRANS_NORMAL(int64_t);
DEFINE_CPU_TRANS_NORMAL(bool);
DEFINE_CPU_TRANS_NORMAL(int16_t);
DEFINE_CPU_TRANS_NORMAL(uint8_t);
DEFINE_CPU_TRANS_NORMAL(int8_t);
DEFINE_CPU_TRANS_NORMAL(platform::complex<float>);
DEFINE_CPU_TRANS_NORMAL(platform::complex<double>);
struct TensorSetConstantCPU {
TensorSetConstantCPU(framework::Tensor* tensor, float value)
: tensor_(tensor), value_(value) {}
template <typename T>
void apply() const {
auto cpu = platform::CPUPlace();
auto* begin = tensor_->mutable_data<T>(cpu);
std::fill(begin, begin + tensor_->numel(), static_cast<T>(value_));
}
framework::Tensor* tensor_;
float value_;
};
template <>
void set_constant_with_place<platform::XPUPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
float value) {
PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported"));
}
template <>
void set_constant_with_place<platform::NPUPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
float value) {
PADDLE_THROW(platform::errors::Unimplemented("NPUPlace is not supported"));
}
template <>
void set_constant_with_place<platform::NPUPinnedPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
float value) {
PADDLE_THROW(
platform::errors::Unimplemented("NPUPinnedPlace is not supported"));
}
template <>
void set_constant_with_place<platform::IPUPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
float value) {
PADDLE_THROW(platform::errors::Unimplemented("IPUPlace is not supported"));
}
template <>
void set_constant_with_place<platform::CPUPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
float value) {
framework::VisitDataType(tensor->type(), TensorSetConstantCPU(tensor, value));
}
template <>
void set_constant_with_place<platform::MLUPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
float value) {
PADDLE_THROW(platform::errors::Unimplemented("MLUPlace is not supported"));
}
template <>
void set_constant_with_place<platform::CustomPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
float value) {
PADDLE_THROW(platform::errors::Unimplemented("CustomPlace is not supported"));
}
template <>
void set_constant_with_place<platform::CUDAPinnedPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
float value) {
framework::VisitDataType(tensor->type(), TensorSetConstantCPU(tensor, value));
}
struct TensorSetConstantWithPlace : public boost::static_visitor<void> {
TensorSetConstantWithPlace(const platform::DeviceContext& context,
framework::Tensor* tensor, float value)
: context_(context), tensor_(tensor), value_(value) {}
template <typename Place>
void operator()(Place place) const {
set_constant_with_place<Place>(context_, tensor_, value_);
}
const platform::DeviceContext& context_;
framework::Tensor* tensor_;
float value_;
};
void set_constant(const platform::DeviceContext& context,
framework::Tensor* tensor, float value) {
TensorSetConstantWithPlace func(context, tensor, value);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// tensor->place().apply_visitor(func);
paddle::platform::VisitPlace(tensor->place(), func);
#else
func(platform::CPUPlace());
#endif
}
template <typename T>
struct RowwiseAdd<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input,
const framework::Tensor& vector, framework::Tensor* output) {
auto in_dims = input.dims();
auto out_dims = output->dims();
auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(
vector.numel(), size,
platform::errors::InvalidArgument(
"The input vector size"
" should be equal to the size of each row of input tensor."
" Expected vector size=%d, but received %d",
size, vector.numel()));
const char* in_dims_cstr = in_dims.to_str().c_str();
const char* out_dims_cstr = out_dims.to_str().c_str();
PADDLE_ENFORCE_EQ(out_dims, in_dims,
platform::errors::InvalidArgument(
"The output tensor shape should be same as the input"
" tensor shape. Expected output tensor shape: %s,"
" but received %s",
in_dims_cstr, out_dims_cstr));
auto in = framework::EigenMatrix<T>::From(input);
auto vec = framework::EigenVector<T>::Flatten(vector);
auto out = framework::EigenMatrix<T>::From(*output);
for (int64_t i = 0; i < in_dims[0]; ++i) {
out.chip(i, 0) = in.chip(i, 0) + vec;
}
}
};
template struct RowwiseAdd<platform::CPUDeviceContext, float>;
template struct RowwiseAdd<platform::CPUDeviceContext, double>;
template struct ColwiseSum<platform::CPUDeviceContext, float>;
template struct ColwiseSum<platform::CPUDeviceContext, double>;
template struct ColwiseSum<platform::CPUDeviceContext, int>;
template struct ColwiseSum<platform::CPUDeviceContext, int64_t>;
template struct RowwiseSum<platform::CPUDeviceContext, float>;
template struct RowwiseSum<platform::CPUDeviceContext, double>;
template struct RowwiseMean<platform::CPUDeviceContext, float>;
template struct RowwiseMean<platform::CPUDeviceContext, double>;
template <typename T>
struct ElementwiseAddTo<platform::CPUDeviceContext, T> {
void operator()(platform::CPUDeviceContext* ctx, const framework::Tensor& src,
framework::Tensor* dst) {
auto in = framework::EigenVector<T>::Flatten(src);
auto out = framework::EigenVector<T>::Flatten(*dst);
auto& place = *(ctx->eigen_device());
out.device(place) = out + in;
}
};
template struct ElementwiseAddTo<platform::CPUDeviceContext, platform::float16>;
} // namespace math
} // namespace operators
} // namespace paddle
IF(WITH_CUSTOM_DEVICE)
cc_library(callback_manager SRCS callback_manager.cc DEPS enforce place)
cc_library(device_guard SRCS device_guard.cc DEPS enforce place)
cc_library(stream SRCS stream.cc DEPS callback_manager)
cc_library(event SRCS event.cc DEPS enforce place)
cc_library(device_base SRCS device_base.cc DEPS stream event callback_manager device_guard device_context flags)
ENDIF()
set(DEV_LIBS custom_device)
# GPU
IF(WITH_GPU OR WITH_ROCM)
add_subdirectory(gpu)
......@@ -22,3 +37,11 @@ ENDIF()
IF(WITH_MLU)
add_subdirectory(mlu)
ENDIF()
# CUSTOM
IF(WITH_CUSTOM_DEVICE)
add_subdirectory(custom)
cc_library(device_manager SRCS device_manager.cc DEPS custom_device)
set(GLOB_DEV_LIB device_manager custom_device CACHE INTERNAL "Global DEV library")
ENDIF()
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/callback_manager.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
CallbackManager::CallbackManager(stream::Stream *stream)
: stream_(stream), thread_pool_(1) {}
void CallbackManager::AddCallback(std::function<void()> callback) const {
auto *callback_func = new std::function<void()>(std::move(callback));
auto *func = new std::function<void()>([this, callback_func] {
std::lock_guard<std::mutex> lock(mtx_);
last_future_ = thread_pool_.enqueue([callback_func] {
std::unique_ptr<std::function<void()>> releaser(callback_func);
(*callback_func)();
});
});
platform::DeviceManager::GetDeviceWithPlace(stream_->GetPlace())
->AddCallback(stream_, func);
}
void CallbackManager::Wait() const {
platform::DeviceManager::GetDeviceWithPlace(stream_->GetPlace())
->SynchronizeStream(stream_);
{
std::lock_guard<std::mutex> lock(mtx_);
if (last_future_.valid()) {
last_future_.wait();
}
}
}
} // namespace platform
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <ThreadPool.h>
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#include <cuda_runtime.h>
#endif
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif
#include <functional>
#include <future> // NOLINT
#include <memory>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
namespace stream {
class Stream;
} // namespace stream
// NOTE(zjl): clean CallbackManager to make compilation faster
// Make CallbackManager thread-safe
class CallbackManager {
public:
explicit CallbackManager(stream::Stream* stream);
~CallbackManager() = default;
void AddCallback(std::function<void()> callback) const;
void Wait() const;
private:
stream::Stream* stream_;
mutable ::ThreadPool thread_pool_;
mutable std::mutex mtx_;
mutable std::future<void> last_future_;
};
} // namespace platform
} // namespace paddle
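// A minimal usage sketch for CallbackManager, assuming a custom device is
// already registered for `place` and that stream::Stream exposes an
// Init(place) helper symmetric to Event::Init further below (an assumption
// here, not shown in this file). AddCallback forwards the callback to the
// plugin's stream_add_callback hook via DeviceManager; Wait() synchronizes
// the stream first and then blocks until the last enqueued callback has run.
#include "paddle/fluid/platform/device/callback_manager.h"
#include "paddle/fluid/platform/device/stream.h"
void CallbackManagerSketch(const paddle::platform::Place& place) {
  paddle::platform::stream::Stream stream;
  stream.Init(place);  // creates the underlying plugin stream
  paddle::platform::CallbackManager manager(&stream);
  manager.AddCallback([] { VLOG(3) << "stream work finished"; });
  manager.Wait();  // stream sync, then drain the callback thread pool
}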
IF(WITH_CUSTOM_DEVICE)
cc_library(custom_device SRCS custom_device.cc DEPS device_base device_context)
cc_test(custom_device_test SRCS custom_device_test.cc DEPS device_manager device_context)
ENDIF()
This diff has been collapsed.
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <string>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/device/custom/fake_cpu_device.h"
#include "paddle/fluid/platform/device/device_manager.h"
#include "paddle/fluid/platform/device_context.h"
void RegisterDevice() {
CustomRuntimeParams runtime_params;
runtime_params.size = sizeof(CustomRuntimeParams);
auto device_interface = std::make_unique<C_DeviceInterface>();
runtime_params.interface = device_interface.get();
std::memset(runtime_params.interface, 0, sizeof(C_DeviceInterface));
runtime_params.interface->size = sizeof(C_DeviceInterface);
InitFakeCPUDevice(&runtime_params);
EXPECT_TRUE(paddle::platform::LoadCustomRuntimeLib(
runtime_params, std::move(device_interface), nullptr));
}
void InitDevice() {
RegisterDevice();
EXPECT_GT(static_cast<int>(
paddle::platform::DeviceManager::GetAllDeviceTypes().size()),
0);
auto place = paddle::platform::CustomPlace(DEVICE_TYPE, 0);
auto device = paddle::platform::DeviceManager::GetDeviceWithPlace(place);
EXPECT_NE(device, nullptr);
std::vector<paddle::platform::Place> places;
auto device_types = paddle::platform::DeviceManager::GetAllDeviceTypes();
for (auto dev_type : device_types) {
auto devices = paddle::platform::DeviceManager::GetDeviceList(dev_type);
for (auto dev_id : devices) {
places.push_back(
paddle::platform::PlaceHelper::CreatePlace(dev_type, dev_id));
}
}
EXPECT_GT(static_cast<int>(places.size()), 0);
paddle::platform::DeviceContextPool::Init(places);
}
void TestDeviceInterface(const paddle::platform::Place& place) {
std::cout << "TestDeviceInterface on " << place << std::endl;
if (paddle::platform::is_custom_place(place)) {
auto device = paddle::platform::DeviceManager::GetDeviceWithPlace(place);
auto dev_type = paddle::platform::PlaceHelper::GetDeviceType(place);
auto p1 = device->MemoryAllocate(
paddle::platform::DeviceManager::GetMinChunkSize(place));
EXPECT_NE(p1, nullptr);
paddle::platform::DeviceManager::SetDevice(place);
auto dev_id = paddle::platform::DeviceManager::GetDevice(dev_type);
EXPECT_EQ(dev_id, place.GetDeviceId());
}
}
void TestTensorMutableData(const paddle::platform::Place& place) {
std::cout << "TestTensorInitialization on " << place << std::endl;
paddle::framework::Tensor src_tensor;
float* p1 = nullptr;
float* p2 = nullptr;
// initialization
p1 = src_tensor.mutable_data<float>(paddle::framework::make_ddim({1, 2, 3}),
place);
auto p1_holder = src_tensor.Holder();
EXPECT_NE(p1, nullptr);
// set src_tensor a new dim with large size
  // memory is supposed to be re-allocated
p2 = src_tensor.mutable_data<float>(paddle::framework::make_ddim({3, 1024}),
place);
auto p2_holder = src_tensor.Holder();
EXPECT_NE(p2, nullptr);
EXPECT_NE(p1_holder.get(), p2_holder.get());
// set src_tensor a new dim with same size
  // memory block is supposed to be unchanged
p1 = src_tensor.mutable_data<float>(paddle::framework::make_ddim({2, 2, 3}),
place);
EXPECT_EQ(p1, p2);
// set src_tensor a new dim with smaller size
  // memory block is supposed to be unchanged
p2 = src_tensor.mutable_data<float>(paddle::framework::make_ddim({2, 2}),
place);
EXPECT_EQ(p1, p2);
}
void TestTensorShareDataWith(const paddle::platform::Place& place) {
std::cout << "TestTensorShareDataWith on " << place << std::endl;
paddle::framework::Tensor src_tensor;
paddle::framework::Tensor dst_tensor;
src_tensor.mutable_data<int>(paddle::framework::make_ddim({2, 3, 4}), place);
dst_tensor.ShareDataWith(src_tensor);
ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
}
void TestTensorUtils(const paddle::platform::Place& place) {
if (paddle::platform::is_custom_place(place) == false) {
return;
}
paddle::framework::Tensor src_tensor;
paddle::framework::Tensor gpu_tensor;
paddle::framework::Tensor dst_tensor;
int* src_ptr = src_tensor.mutable_data<int>(
paddle::framework::make_ddim({3, 3}), paddle::platform::CPUPlace());
int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
memcpy(src_ptr, arr, 9 * sizeof(int));
// CPU Tensor to GPU Tensor
paddle::platform::CustomDeviceContext gpu_ctx(place);
paddle::framework::TensorCopy(src_tensor, place, gpu_ctx, &gpu_tensor);
#if 0
// GPU Tensor to CPU Tensor
auto cpu_place = new paddle::platform::CPUPlace();
paddle::framework::TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
// Sync before Compare Tensors
gpu_ctx.Wait();
const int* dst_ptr = dst_tensor.data<int>();
EXPECT_NE(src_ptr, dst_ptr);
for (size_t i = 0; i < 9; ++i) {
EXPECT_EQ(src_ptr[i], dst_ptr[i]);
}
// Copy the same tensor
paddle::framework::TensorCopy(gpu_tensor, place, gpu_ctx, &gpu_tensor);
gpu_ctx.Wait();
const int* dst_ptr_tmp = dst_tensor.data<int>();
EXPECT_NE(src_ptr, dst_ptr_tmp);
for (size_t i = 0; i < 9; ++i) {
EXPECT_EQ(src_ptr[i], dst_ptr_tmp[i]);
}
paddle::framework::Tensor slice_tensor = src_tensor.Slice(1, 2);
// CPU Slice Tensor to GPU Tensor
paddle::framework::TensorCopy(slice_tensor, place, gpu_ctx, &gpu_tensor);
// GPU Tensor to CPU Tensor
paddle::framework::TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
// Sync before Compare Slice Tensors
gpu_ctx.Wait();
const int* slice_ptr = slice_tensor.data<int>();
dst_ptr = dst_tensor.data<int>();
EXPECT_NE(dst_ptr, slice_ptr);
for (size_t i = 0; i < 3; ++i) {
EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
}
EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout());
#endif
}
TEST(CustomDevice, Tensor) {
InitDevice();
auto dev_types = paddle::platform::DeviceManager::GetAllDeviceTypes();
for (const auto& dev_type : dev_types) {
std::cout << "Test on " << dev_type << std::endl;
EXPECT_GT(static_cast<int>(
paddle::platform::DeviceManager::GetDeviceCount(dev_type)),
0);
auto place = paddle::platform::PlaceHelper::CreatePlace(dev_type);
TestDeviceInterface(place);
TestTensorMutableData(place);
TestTensorShareDataWith(place);
TestTensorUtils(place);
}
}
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/fluid/platform/device/device_ext.h"
namespace paddle {
namespace platform {
namespace details {
template <typename T>
struct CustomDeviceStatusType {};
#define DEFINE_CUSTOM_DEVICE_STATUS_TYPE(type, success_value) \
template <> \
struct CustomDeviceStatusType<type> { \
using Type = type; \
static constexpr Type kSuccess = success_value; \
}
DEFINE_CUSTOM_DEVICE_STATUS_TYPE(C_Status, C_SUCCESS);
} // namespace details
inline std::string build_custom_device_error_msg(C_Status stat) {
std::ostringstream sout;
sout << " CustomDevice error, the error code is : " << stat << ". ";
return sout.str();
}
#define PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(COND) \
do { \
auto __cond__ = (COND); \
using __CUSTOM_DEVICE_STATUS_TYPE__ = decltype(__cond__); \
constexpr auto __success_type__ = \
::paddle::platform::details::CustomDeviceStatusType< \
__CUSTOM_DEVICE_STATUS_TYPE__>::kSuccess; \
if (UNLIKELY(__cond__ != __success_type__)) { \
auto __summary__ = ::paddle::platform::errors::External( \
::paddle::platform::build_custom_device_error_msg(__cond__)); \
__THROW_ERROR_INTERNAL__(__summary__); \
} \
} while (0)
} // namespace platform
} // namespace paddle
#endif // PADDLE_WITH_CUSTOM_DEVICE
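// A minimal sketch of how PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS is meant to
// wrap calls into a plugin's C interface; `dev_api` and `dev` stand for a
// filled C_DeviceInterface and a valid C_Device (hypothetical names). On a
// non-C_SUCCESS status the macro throws errors::External carrying the raw
// status code.
#include "paddle/fluid/platform/device/custom/enforce_custom.h"
#include "paddle/fluid/platform/device/device_ext.h"
void EnforceCustomSketch(C_DeviceInterface* dev_api, C_Device dev) {
  size_t total_mem = 0;
  size_t free_mem = 0;
  PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
      dev_api->device_memory_stats(dev, &total_mem, &free_mem));
}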
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/platform/device/device_ext.h"
constexpr size_t global_total_memory = 1024 * 1024UL;
static size_t global_free_memory = global_total_memory;
C_Status Init() { return C_SUCCESS; }
C_Status InitDevice(const C_Device device) { return C_SUCCESS; }
C_Status SetDevice(const C_Device device) { return C_SUCCESS; }
C_Status GetDevice(const C_Device device) {
device->id = 0;
return C_SUCCESS;
}
C_Status DestroyDevice(const C_Device device) { return C_SUCCESS; }
C_Status Finalize() { return C_SUCCESS; }
C_Status GetDevicesCount(size_t *count) {
*count = 1;
return C_SUCCESS;
}
C_Status GetDevicesList(size_t *device) {
*device = 0;
return C_SUCCESS;
}
C_Status MemCpy(const C_Device device, void *dst, const void *src,
size_t size) {
memcpy(dst, src, size);
return C_SUCCESS;
}
C_Status AsyncMemCpy(const C_Device device, C_Stream stream, void *dst,
const void *src, size_t size) {
memcpy(dst, src, size);
return C_SUCCESS;
}
C_Status Allocate(const C_Device device, void **ptr, size_t size) {
if (global_free_memory >= size) {
*ptr = malloc(size);
global_free_memory -= size;
return C_SUCCESS;
} else {
*ptr = nullptr;
return C_FAILED;
}
}
C_Status Deallocate(const C_Device device, void *ptr, size_t size) {
free(ptr);
global_free_memory += size;
return C_SUCCESS;
}
C_Status CreateStream(const C_Device device, C_Stream *stream) {
return C_SUCCESS;
}
C_Status DestroyStream(const C_Device device, C_Stream stream) {
return C_SUCCESS;
}
C_Status CreateEvent(const C_Device device, C_Event *event) {
return C_SUCCESS;
}
C_Status RecordEvent(const C_Device device, C_Stream stream, C_Event event) {
return C_SUCCESS;
}
C_Status DestroyEvent(const C_Device device, C_Event event) {
return C_SUCCESS;
}
C_Status SyncDevice(const C_Device device) { return C_SUCCESS; }
C_Status SyncStream(const C_Device device, C_Stream stream) {
return C_SUCCESS;
}
C_Status SyncEvent(const C_Device device, C_Event event) { return C_SUCCESS; }
C_Status StreamWaitEvent(const C_Device device, C_Stream stream,
C_Event event) {
return C_SUCCESS;
}
C_Status VisibleDevices(size_t *devices) { return C_SUCCESS; }
C_Status DeviceMemStats(const C_Device device, size_t *total_memory,
size_t *free_memory) {
*total_memory = global_total_memory;
*free_memory = global_free_memory;
return C_SUCCESS;
}
C_Status DeviceMinChunkSize(const C_Device device, size_t *size) {
*size = 4 * 1024;
return C_SUCCESS;
}
C_Status DeviceMaxChunkSize(const C_Device device, size_t *size) {
*size = 64 * 1024;
return C_SUCCESS;
}
C_Status DeviceMaxAllocSize(const C_Device device, size_t *size) {
*size = global_total_memory * 0.95;
return C_SUCCESS;
}
#define DEVICE_TYPE "FakeCPU"
#define SUB_DEVICE_TYPE "V100"
void InitFakeCPUDevice(CustomRuntimeParams *params) {
params->device_type = const_cast<char *>(DEVICE_TYPE);
params->sub_device_type = const_cast<char *>(SUB_DEVICE_TYPE);
params->version.major = PADDLE_CUSTOM_RUNTIME_MAJOR_VERSION;
params->version.minor = PADDLE_CUSTOM_RUNTIME_MINOR_VERSION;
params->version.patch = PADDLE_CUSTOM_RUNTIME_PATCH_VERSION;
memset(reinterpret_cast<void *>(params->interface), 0,
sizeof(C_DeviceInterface));
params->interface->initialize = Init;
params->interface->finalize = Finalize;
params->interface->init_device = InitDevice;
params->interface->set_device = SetDevice;
params->interface->get_device = GetDevice;
params->interface->deinit_device = DestroyDevice;
params->interface->create_stream = CreateStream;
params->interface->destroy_stream = DestroyStream;
params->interface->create_event = CreateEvent;
params->interface->destroy_event = DestroyEvent;
params->interface->record_event = RecordEvent;
params->interface->synchronize_device = SyncDevice;
params->interface->synchronize_stream = SyncStream;
params->interface->synchronize_event = SyncEvent;
params->interface->stream_wait_event = StreamWaitEvent;
params->interface->memory_copy_h2d = MemCpy;
params->interface->memory_copy_d2d = MemCpy;
params->interface->memory_copy_d2h = MemCpy;
params->interface->async_memory_copy_h2d = AsyncMemCpy;
params->interface->async_memory_copy_d2d = AsyncMemCpy;
params->interface->async_memory_copy_d2h = AsyncMemCpy;
params->interface->device_memory_allocate = Allocate;
params->interface->host_memory_allocate = Allocate;
params->interface->unified_memory_allocate = Allocate;
params->interface->device_memory_deallocate = Deallocate;
params->interface->host_memory_deallocate = Deallocate;
params->interface->unified_memory_deallocate = Deallocate;
params->interface->get_device_count = GetDevicesCount;
params->interface->get_device_list = GetDevicesList;
params->interface->device_memory_stats = DeviceMemStats;
params->interface->device_max_chunk_size = DeviceMaxChunkSize;
params->interface->device_min_chunk_size = DeviceMinChunkSize;
params->interface->device_max_alloc_size = DeviceMaxAllocSize;
}
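// A real plugin library would export the C entry point declared in
// device_ext.h rather than a helper like InitFakeCPUDevice; a minimal
// sketch reusing the fake device above (the test instead registers the
// runtime directly through LoadCustomRuntimeLib):
extern "C" void InitPlugin(CustomRuntimeParams *params) {
  // Core fills params->size; the plugin checks it before writing any field.
  if (params == nullptr || params->size < sizeof(CustomRuntimeParams)) {
    return;
  }
  InitFakeCPUDevice(params);
}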
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/device_base.h"
#include "gflags/gflags.h"
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb);
DECLARE_uint64(reallocate_gpu_memory_in_mb);
constexpr static float fraction_reserve_gpu_memory = 0.05f;
namespace paddle {
namespace platform {
#define INTERFACE_UNIMPLEMENT \
PADDLE_THROW(platform::errors::Unimplemented( \
"%s is not implemented on %s device.", __func__, Type()));
// info
size_t DeviceInterface::GetComputeCapability() {
VLOG(10) << Type() + " get compute capability " << 0;
return 0;
}
size_t DeviceInterface::GetRuntimeVersion() {
VLOG(10) << Type() + " get runtime version " << 0;
return 0;
}
size_t DeviceInterface::GetDriverVersion() {
VLOG(10) << Type() + " get driver version " << 0;
return 0;
}
// device manage
void DeviceInterface::Initialize() { INTERFACE_UNIMPLEMENT; }
void DeviceInterface::Finalize() { INTERFACE_UNIMPLEMENT; }
void DeviceInterface::SynchronizeDevice(size_t dev_id) {
INTERFACE_UNIMPLEMENT;
}
void DeviceInterface::InitDevice(size_t dev_id) { INTERFACE_UNIMPLEMENT; }
void DeviceInterface::DeInitDevice(size_t dev_id) { INTERFACE_UNIMPLEMENT; }
void DeviceInterface::SetDevice(size_t dev_id) { INTERFACE_UNIMPLEMENT; }
int DeviceInterface::GetDevice() { INTERFACE_UNIMPLEMENT; }
// stream manage
void DeviceInterface::CreateStream(size_t dev_id, stream::Stream* stream,
const stream::Stream::Priority& priority,
const stream::Stream::Flag& flag) {
INTERFACE_UNIMPLEMENT;
}
void DeviceInterface::DestroyStream(size_t dev_id, stream::Stream* stream) {
INTERFACE_UNIMPLEMENT;
}
void DeviceInterface::SynchronizeStream(size_t dev_id,
const stream::Stream* stream) {
INTERFACE_UNIMPLEMENT;
}
bool DeviceInterface::QueryStream(size_t dev_id, const stream::Stream* stream) {
INTERFACE_UNIMPLEMENT;
return true;
}
void DeviceInterface::AddCallback(size_t dev_id, stream::Stream* stream,
stream::Stream::Callback* callback) {
INTERFACE_UNIMPLEMENT;
}
void DeviceInterface::StreamWaitEvent(size_t dev_id,
const stream::Stream* stream,
const event::Event* event) {
INTERFACE_UNIMPLEMENT;
}
// event manage
void DeviceInterface::CreateEvent(size_t dev_id, event::Event* event,
event::Event::Flag flags) {
INTERFACE_UNIMPLEMENT;
}
void DeviceInterface::DestroyEvent(size_t dev_id, event::Event* event) {
INTERFACE_UNIMPLEMENT;
}
void DeviceInterface::RecordEvent(size_t dev_id, const event::Event* event,
const stream::Stream* stream) {
INTERFACE_UNIMPLEMENT;
}
void DeviceInterface::SynchronizeEvent(size_t dev_id,
const event::Event* event) {
INTERFACE_UNIMPLEMENT;
}
bool DeviceInterface::QueryEvent(size_t dev_id, const event::Event* event) {
INTERFACE_UNIMPLEMENT;
return true;
}
// memory manage
void DeviceInterface::MemoryCopyH2D(size_t dev_id, void* dst, const void* src,
size_t size, const stream::Stream* stream) {
INTERFACE_UNIMPLEMENT;
}
void DeviceInterface::MemoryCopyD2H(size_t dev_id, void* dst, const void* src,
size_t size, const stream::Stream* stream) {
INTERFACE_UNIMPLEMENT;
}
void DeviceInterface::MemoryCopyD2D(size_t dev_id, void* dst, const void* src,
size_t size, const stream::Stream* stream) {
INTERFACE_UNIMPLEMENT;
}
void DeviceInterface::MemoryCopyP2P(const Place& dst_place, void* dst,
size_t src_id, const void* src, size_t size,
const stream::Stream* stream) {
INTERFACE_UNIMPLEMENT;
}
void* DeviceInterface::MemoryAllocate(size_t dev_id, size_t size) {
INTERFACE_UNIMPLEMENT;
return nullptr;
}
void DeviceInterface::MemoryDeallocate(size_t dev_id, void* ptr, size_t size) {
INTERFACE_UNIMPLEMENT;
}
void* DeviceInterface::MemoryAllocateHost(size_t dev_id, size_t size) {
INTERFACE_UNIMPLEMENT;
return nullptr;
}
void DeviceInterface::MemoryDeallocateHost(size_t dev_id, void* ptr,
size_t size) {
INTERFACE_UNIMPLEMENT;
}
void* DeviceInterface::MemoryAllocateUnified(size_t dev_id, size_t size) {
INTERFACE_UNIMPLEMENT;
return nullptr;
}
void DeviceInterface::MemoryDeallocateUnified(size_t dev_id, void* ptr,
size_t size) {
INTERFACE_UNIMPLEMENT;
}
void DeviceInterface::MemorySet(size_t dev_id, void* ptr, uint8_t value,
size_t size) {
INTERFACE_UNIMPLEMENT;
}
void DeviceInterface::MemoryStats(size_t dev_id, size_t* total, size_t* free) {
INTERFACE_UNIMPLEMENT;
}
size_t DeviceInterface::GetMinChunkSize(size_t dev_id) {
INTERFACE_UNIMPLEMENT;
}
size_t DeviceInterface::AllocSize(size_t dev_id, bool realloc) {
size_t available_to_alloc = AvailableAllocSize(dev_id);
PADDLE_ENFORCE_GT(available_to_alloc, 0,
platform::errors::ResourceExhausted(
"Not enough available %s memory.", Type()));
  // If FLAGS_initial_gpu_memory_in_mb is 0, the initial memory size is
  // computed as a fraction of the available memory
size_t flag_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb
: FLAGS_initial_gpu_memory_in_mb;
size_t alloc_bytes =
(flag_mb > 0ul ? flag_mb << 20 : available_to_alloc *
FLAGS_fraction_of_gpu_memory_to_use);
PADDLE_ENFORCE_GE(available_to_alloc, alloc_bytes,
platform::errors::ResourceExhausted(
"Not enough available %s memory.", Type()));
return alloc_bytes;
}
size_t DeviceInterface::AvailableAllocSize(size_t dev_id) {
size_t total = 0;
size_t available = 0;
MemoryStats(dev_id, &total, &available);
size_t reserving =
static_cast<size_t>(fraction_reserve_gpu_memory * available);
// If available size is less than minimum chunk size, no usable memory exists
size_t available_to_alloc = available - reserving;
size_t min_chunk_size = GetMinChunkSize(dev_id);
if (available_to_alloc < min_chunk_size) {
available_to_alloc = 0;
}
return available_to_alloc;
}
size_t DeviceInterface::GetInitAllocSize(size_t dev_id) {
size_t init_alloc_size = AllocSize(dev_id, false);
VLOG(10) << Type() + " init alloc size " << (init_alloc_size >> 20) << "M";
return init_alloc_size;
}
size_t DeviceInterface::GetReallocSize(size_t dev_id) {
size_t realloc_size = AllocSize(dev_id, true);
VLOG(10) << Type() + " realloc size " << (realloc_size >> 20) << "M";
return realloc_size;
}
size_t DeviceInterface::GetMaxAllocSize(size_t dev_id) {
size_t max_alloc_size =
std::max(GetInitAllocSize(dev_id), GetReallocSize(dev_id));
VLOG(10) << Type() + " max alloc size " << (max_alloc_size >> 20) << "M";
return max_alloc_size;
}
size_t DeviceInterface::GetMaxChunkSize(size_t dev_id) {
size_t max_chunk_size = GetMaxAllocSize(dev_id);
VLOG(10) << Type() + " max chunk size " << (max_chunk_size >> 20) << "M";
return max_chunk_size;
}
size_t DeviceInterface::GetExtraPaddingSize(size_t dev_id) {
VLOG(10) << Type() + " extra padding size " << 0;
return 0;
}
} // namespace platform
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/fluid/platform/device/event.h"
#include "paddle/fluid/platform/device/stream.h"
namespace paddle {
namespace platform {
class DeviceInterface { // Driver / Runtime
public:
DeviceInterface(const std::string& type, uint8_t priority, bool is_custom)
: type_(type), priority_(priority), is_custom_(is_custom) {}
uint8_t Priority() { return priority_; }
std::string Type() { return type_; }
bool IsCustom() { return is_custom_; }
virtual ~DeviceInterface() {}
// Info
virtual size_t GetComputeCapability();
virtual size_t GetRuntimeVersion();
virtual size_t GetDriverVersion();
// Platform
//! Initialize
virtual void Initialize();
//! Finalize
virtual void Finalize();
// Device
virtual size_t GetDeviceCount() = 0;
virtual std::vector<size_t> GetDeviceList() = 0;
//! Wait for compute device to finish.
virtual void SynchronizeDevice(size_t dev_id);
//! Initialize device.
virtual void InitDevice(size_t dev_id);
//! Deinitialize device.
virtual void DeInitDevice(size_t dev_id);
// ! Set device to be used.
virtual void SetDevice(size_t dev_id);
// ! Returns which device is currently being used.
virtual int GetDevice();
// Stream
// ! Create an asynchronous stream
virtual void CreateStream(
size_t dev_id, stream::Stream* stream,
const stream::Stream::Priority& priority =
stream::Stream::Priority::kNormal,
const stream::Stream::Flag& flag = stream::Stream::Flag::kDefaultFlag);
// ! Destroys an asynchronous stream.
virtual void DestroyStream(size_t dev_id, stream::Stream* stream);
// ! Waits for stream tasks to complete.
virtual void SynchronizeStream(size_t dev_id, const stream::Stream* stream);
// ! Queries an asynchronous stream for completion status.
virtual bool QueryStream(size_t dev_id, const stream::Stream* stream);
// ! Add a callback to a compute stream.
virtual void AddCallback(size_t dev_id, stream::Stream* stream,
stream::Stream::Callback* callback);
// Event
// ! Create an event.
virtual void CreateEvent(size_t dev_id, event::Event* event,
event::Event::Flag flags);
// ! Destroy an event.
virtual void DestroyEvent(size_t dev_id, event::Event* event);
// ! Records an event.
virtual void RecordEvent(size_t dev_id, const event::Event* event,
const stream::Stream* stream);
// ! Waits for event to complete.
virtual void SynchronizeEvent(size_t dev_id, const event::Event* event);
// ! Queries an event for completion status.
virtual bool QueryEvent(size_t dev_id, const event::Event* event);
// ! Make a compute stream wait on an event
virtual void StreamWaitEvent(size_t dev_id, const stream::Stream* stream,
const event::Event* event);
// Memory
virtual void MemoryCopyH2D(size_t dev_id, void* dst, const void* src,
size_t size,
const stream::Stream* stream = nullptr);
virtual void MemoryCopyD2H(size_t dev_id, void* dst, const void* src,
size_t size,
const stream::Stream* stream = nullptr);
virtual void MemoryCopyD2D(size_t dev_id, void* dst, const void* src,
size_t size,
const stream::Stream* stream = nullptr);
virtual void MemoryCopyP2P(const Place& dst_place, void* dst, size_t src_id,
const void* src, size_t size,
const stream::Stream* stream = nullptr);
virtual void* MemoryAllocate(size_t dev_id, size_t size);
virtual void MemoryDeallocate(size_t dev_id, void* ptr, size_t size);
virtual void* MemoryAllocateHost(size_t dev_id, size_t size);
virtual void MemoryDeallocateHost(size_t dev_id, void* ptr, size_t size);
virtual void* MemoryAllocateUnified(size_t dev_id, size_t size);
virtual void MemoryDeallocateUnified(size_t dev_id, void* ptr, size_t size);
virtual void MemorySet(size_t dev_id, void* ptr, uint8_t value, size_t size);
virtual void MemoryStats(size_t dev_id, size_t* total, size_t* free);
virtual size_t GetMinChunkSize(size_t dev_id);
virtual size_t GetInitAllocSize(size_t dev_id);
virtual size_t GetReallocSize(size_t dev_id);
virtual size_t GetMaxAllocSize(size_t dev_id);
virtual size_t GetMaxChunkSize(size_t dev_id);
virtual size_t GetExtraPaddingSize(size_t dev_id);
private:
const std::string type_;
const uint8_t priority_;
const bool is_custom_;
size_t AllocSize(size_t dev_id, bool realloc);
size_t AvailableAllocSize(size_t dev_id);
};
} // namespace platform
} // namespace paddle
#endif
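// A minimal sketch of a DeviceInterface backend (names hypothetical): only
// GetDeviceCount and GetDeviceList are pure virtual, so a backend compiles
// once those are overridden; every other hook falls back to the
// Unimplemented-throwing defaults in device_base.cc.
#include "paddle/fluid/platform/device/device_base.h"
class DummyDeviceInterface : public paddle::platform::DeviceInterface {
 public:
  DummyDeviceInterface()
      : DeviceInterface("dummy", /*priority=*/255, /*is_custom=*/true) {}
  size_t GetDeviceCount() override { return 1; }
  std::vector<size_t> GetDeviceList() override { return {0}; }
};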
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#if !defined(_WIN32) && !defined(__APPLE__)
#include <cstddef>
#include <cstring>
#ifdef __cplusplus
extern "C" {
#endif
#define PADDLE_CUSTOM_RUNTIME_MAJOR_VERSION 0
#define PADDLE_CUSTOM_RUNTIME_MINOR_VERSION 1
#define PADDLE_CUSTOM_RUNTIME_PATCH_VERSION 1
typedef enum {
C_SUCCESS = 0, // success
C_WARNING, // results may not meet expectation (such as an asynchronous
// interface is actually synchronous)
C_FAILED, // resource exhausted/query failed
C_ERROR, // invalid argument/wrong usage/uninitialized
C_INTERNAL_ERROR // plugin error
} C_Status;
typedef struct C_Device_st { int id; } * C_Device;
typedef struct C_Stream_st* C_Stream;
typedef struct C_Event_st* C_Event;
typedef void (*C_Callback)(C_Device device, C_Stream stream, void* user_data,
C_Status* status);
struct C_DeviceInterface {
  // Core fills it and the plugin must check it
size_t size;
///////////////////////
// device manage api //
///////////////////////
/**
* @brief Initialize hardware
*
*/
C_Status (*initialize)();
/**
* @brief Deinitialize hardware
*
*/
C_Status (*finalize)();
/**
* @brief Initialize device
*
* @param[C_Device] device Core fill it with a logical id, and then plugin
* must replace it with a physical id
*/
C_Status (*init_device)(const C_Device device);
/**
* @brief Set current device
*
* @param[C_Device] device Core fill it with a physical id
*/
C_Status (*set_device)(const C_Device device);
/**
* @brief Get current device
*
* @param[C_Device] device Plugin fill it with a physical id
*/
C_Status (*get_device)(const C_Device device);
/**
* @brief Deinitialize device
*
* @param[C_Device] device Core fill it with a physical id
*/
C_Status (*deinit_device)(const C_Device device);
/**
* @brief Create a stream
*
* @param[C_Device] device Core fill it with a physical id
* @param[C_Stream*] stream Plugin create a stream and fill it
*/
C_Status (*create_stream)(const C_Device device, C_Stream* stream);
/**
* @brief Destroy a stream
*
* @param[C_Device] device Core fill it with a physical id
* @param[C_Stream] stream
*/
C_Status (*destroy_stream)(const C_Device device, C_Stream stream);
/**
* @brief Query a stream
*
* @param[C_Device] device Core fill it with a physical id
* @param[C_Stream] stream
*/
C_Status (*query_stream)(const C_Device device, C_Stream stream);
/**
* @brief Add a callback to stream
*
* @param[C_Device] device Core fill it with a physical id
* @param[C_Stream] stream
* @param[C_Callback] callback
* @param[void*] user_data
*/
C_Status (*stream_add_callback)(const C_Device device, C_Stream stream,
C_Callback callback, void* user_data);
/**
* @brief Create an event
*
* @param[C_Device] device Core fill it with a physical id
* @param[C_Event*] event Plugin create an event and fill it
*/
C_Status (*create_event)(const C_Device device, C_Event* event);
/**
* @brief Record an event
*
* @param[C_Device] device Core fill it with a physical id
* @param[C_Stream] stream
* @param[C_Event] event
*/
C_Status (*record_event)(const C_Device device, C_Stream stream,
C_Event event);
/**
* @brief Destroy an event
*
* @param[C_Device] device Core fill it with a physical id
* @param[C_Event] event
*/
C_Status (*destroy_event)(const C_Device device, C_Event event);
/**
* @brief Query an event
*
* @param[C_Device] device Core fill it with a physical id
* @param[C_Event] event
*/
C_Status (*query_event)(const C_Device device, C_Event event);
/**
* @brief Synchronize a device
*
* @param[C_Device] device Core fill it with a physical id
*/
C_Status (*synchronize_device)(const C_Device device);
/**
* @brief Synchronize a stream
*
* @param[C_Device] device Core fill it with a physical id
* @param[C_Stream] stream
*/
C_Status (*synchronize_stream)(const C_Device device, C_Stream stream);
/**
* @brief Synchronize an event
*
* @param[C_Device] device Core fill it with a physical id
* @param[C_Event] event
*/
C_Status (*synchronize_event)(const C_Device device, C_Event event);
/**
* @brief Make a stream wait on an event
*
* @param[C_Device] device Core fill it with a physical id
* @param[C_Stream] stream
* @param[C_Event] event
*/
C_Status (*stream_wait_event)(const C_Device device, C_Stream stream,
C_Event event);
void* reserved_dev_api[8];
///////////////////////
// memory manage api //
///////////////////////
/**
* @brief Device memory allocate
*
* @param[C_Device] device Core fill it with a physical id
* @param[void**] ptr Plugin allocate an address and fill it
* @param[size_t] size
*/
C_Status (*device_memory_allocate)(const C_Device device, void** ptr,
size_t size);
/**
* @brief Device memory deallocate
*
* @param[C_Device] device Core fill it with a physical id
* @param[void*] ptr
* @param[size_t] size
*/
C_Status (*device_memory_deallocate)(const C_Device device, void* ptr,
size_t size);
/**
* @brief Device memory set
*
* @param[C_Device] device Core fill it with a physical id
* @param[void*] ptr
* @param[unsigned char] value
* @param[size_t] size
*/
C_Status (*device_memory_set)(const C_Device device, void* ptr,
unsigned char value, size_t size);
/**
* @brief Host memory allocate
*
* @param[C_Device] device Core fill it with a physical id
* @param[void**] ptr Plugin allocate an address and fill it
* @param[size_t] size
*/
C_Status (*host_memory_allocate)(const C_Device device, void** ptr,
size_t size);
/**
* @brief Host memory deallocate
*
* @param[C_Device] device Core fill it with a physical id
* @param[void*] ptr
* @param[size_t] size
*/
C_Status (*host_memory_deallocate)(const C_Device device, void* ptr,
size_t size);
/**
* @brief Unified memory allocate
*
* @param[C_Device] device Core fill it with a physical id
* @param[void**] ptr Plugin allocate an address and fill it
* @param[size_t] size
*/
C_Status (*unified_memory_allocate)(const C_Device device, void** ptr,
size_t size);
/**
* @brief Unified memory deallocate
*
* @param[C_Device] device Core fill it with a physical id
* @param[void*] ptr
* @param[size_t] size
*/
C_Status (*unified_memory_deallocate)(const C_Device device, void* ptr,
size_t size);
/**
* @brief Memory copy from host to device
*
* @param[C_Device] device Core fill it with a physical id
* @param[void*] dst
* @param[void*] src
* @param[size_t] size
*/
C_Status (*memory_copy_h2d)(const C_Device device, void* dst, const void* src,
size_t size);
/**
* @brief Memory copy from device to host
*
* @param[C_Device] device Core fill it with a physical id
* @param[void*] dst
* @param[void*] src
* @param[size_t] size
*/
C_Status (*memory_copy_d2h)(const C_Device device, void* dst, const void* src,
size_t size);
/**
* @brief Memory copy from device to device
*
* @param[C_Device] device Core fill it with a physical id
* @param[void*] dst
* @param[void*] src
* @param[size_t] size
*/
C_Status (*memory_copy_d2d)(const C_Device device, void* dst, const void* src,
size_t size);
/**
* @brief Peer memory copy from device to device
*
* @param[C_Device] dst_device Core fill it with a physical id
* @param[C_Device] src_device Core fill it with a physical id
* @param[void*] dst
* @param[void*] src
* @param[size_t] size
*/
C_Status (*memory_copy_p2p)(const C_Device dst_device,
const C_Device src_device, void* dst,
const void* src, size_t size);
/**
* @brief Asynchronous memory copy from host to device
*
* @param[C_Device] device Core fill it with a physical id
* @param[C_Stream] stream
* @param[void*] dst
* @param[void*] src
* @param[size_t] size
*/
C_Status (*async_memory_copy_h2d)(const C_Device device, C_Stream stream,
void* dst, const void* src, size_t size);
/**
* @brief Asynchronous memory copy from device to host
*
* @param[C_Device] device Core fill it with a physical id
* @param[C_Stream] stream
* @param[void*] dst
* @param[void*] src
* @param[size_t] size
*/
C_Status (*async_memory_copy_d2h)(const C_Device device, C_Stream stream,
void* dst, const void* src, size_t size);
/**
* @brief Asynchronous memory copy from device to device
*
* @param[C_Device] device Core fill it with a physical id
* @param[C_Stream] stream
* @param[void*] dst
* @param[void*] src
* @param[size_t] size
*/
C_Status (*async_memory_copy_d2d)(const C_Device device, C_Stream stream,
void* dst, const void* src, size_t size);
/**
* @brief Asynchronous peer memory copy from device to device
*
* @param[C_Device] dst_device Core fill it with a physical id
* @param[C_Device] src_device Core fill it with a physical id
* @param[C_Stream] stream
* @param[void*] dst
* @param[void*] src
* @param[size_t] size
*/
C_Status (*async_memory_copy_p2p)(const C_Device dst_device,
const C_Device src_device, C_Stream stream,
void* dst, const void* src, size_t size);
void* reserved_mem_api[8];
//////////////
// info api //
//////////////
/**
* @brief Get visible device count
*
* @param[size_t*] count Plugin fill it
*/
C_Status (*get_device_count)(size_t* count);
/**
* @brief Get visible device list
*
* @param[size_t*] devices Plugin fill it
*/
C_Status (*get_device_list)(size_t* devices);
/**
* @brief Device memory statistics
*
* @param[C_Device] device Core fill it with a physical id
* @param[size_t*] total_memory
* @param[size_t*] free_memory
*/
C_Status (*device_memory_stats)(const C_Device device, size_t* total_memory,
size_t* free_memory);
/**
* @brief Device minimum chunk size
*
* @param[size_t*] count
*/
C_Status (*device_min_chunk_size)(const C_Device device, size_t* count);
/**
* @brief Device maximum chunk size
*
* @param[size_t*] count
*/
C_Status (*device_max_chunk_size)(const C_Device device, size_t* count);
/**
* @brief Device maximum alloc size
*
* @param[size_t*] count
*/
C_Status (*device_max_alloc_size)(const C_Device device, size_t* count);
/**
* @brief Device extra padding size
*
* @param[size_t*] size
*/
C_Status (*device_extra_padding_size)(const C_Device device, size_t* size);
/**
* @brief Device initial allocated size
*
* @param[size_t*] size
*/
C_Status (*device_init_alloc_size)(const C_Device device, size_t* size);
/**
* @brief Device reallocated size
*
* @param[size_t*] size
*/
C_Status (*device_realloc_size)(const C_Device device, size_t* size);
/**
* @brief Get compute capability
*
* @param[size_t*] compute_capability
*/
C_Status (*get_compute_capability)(size_t* compute_capability);
/**
* @brief Get runtime version
*
* @param[size_t*] version
*/
C_Status (*get_runtime_version)(size_t* version);
/**
* @brief Get driver version
*
* @param[size_t*] version
*/
C_Status (*get_driver_version)(size_t* version);
void* reserved_info_api[8];
///////////////
// other api //
///////////////
void* reserved_other_api[8];
};
struct CustomRuntimeVersion {
size_t major, minor, patch;
};
struct CustomRuntimeParams {
  // Core fills it and the plugin must check it
  size_t size;
  // Plugin fills it
  C_DeviceInterface* interface;
  // Plugin fills it and Core will check it
  CustomRuntimeVersion version;
  // Plugin fills it
  char* device_type;
  // Plugin fills it
  char* sub_device_type;
char reserved[32];
};
// The plugin implements it and fills CustomRuntimeParams
void InitPlugin(CustomRuntimeParams*);
#ifdef __cplusplus
} // extern "C"
#endif
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/device_guard.h"
namespace paddle {
namespace platform {
// Even though this source file does not contain any code, it is better to
// keep it for the cmake dependency.
} // namespace platform
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/platform/device/device_manager.h"
namespace paddle {
namespace platform {
class DeviceGuard {
public:
explicit inline DeviceGuard(const Place& place)
: dev_type_(PlaceHelper::GetDeviceType(place)) {
prev_id = DeviceManager::GetDevice(dev_type_);
cur_id = PlaceHelper::GetDeviceId(place);
if (cur_id != prev_id) {
DeviceManager::SetDevice(dev_type_, cur_id);
}
}
inline ~DeviceGuard() {
if (cur_id != prev_id) {
DeviceManager::SetDevice(dev_type_, prev_id);
}
}
DeviceGuard(const DeviceGuard& o) = delete;
DeviceGuard& operator=(const DeviceGuard& o) = delete;
private:
size_t prev_id, cur_id;
std::string dev_type_;
};
} // namespace platform
} // namespace paddle
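// A minimal usage sketch: DeviceGuard is an RAII scope switch. It reads the
// current device id of `place`'s device type, switches to `place`'s id if
// it differs, and restores the previous id on destruction (`place` is
// assumed to be a registered CustomPlace).
#include "paddle/fluid/platform/device/device_guard.h"
void DeviceGuardSketch(const paddle::platform::Place& place) {
  paddle::platform::DeviceGuard guard(place);
  // ... launch work that must run with `place` as the active device ...
}  // previous device id is restored here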
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/fluid/platform/device/device_manager.h"
#if !defined(_WIN32)
#include <dirent.h>
#else
#endif
#include <functional>
#include <regex>
namespace paddle {
namespace platform {
void Device::CreateStream(stream::Stream* stream,
const stream::Stream::Priority& priority,
const stream::Stream::Flag& flag) {
impl_->CreateStream(dev_id_, stream, priority, flag);
}
void Device::DestroyStream(stream::Stream* stream) {
impl_->DestroyStream(dev_id_, stream);
}
void Device::SynchronizeStream(const stream::Stream* stream) {
impl_->SynchronizeStream(dev_id_, stream);
}
bool Device::QueryStream(const stream::Stream* stream) {
return impl_->QueryStream(dev_id_, stream);
}
void Device::AddCallback(stream::Stream* stream,
stream::Stream::Callback* callback) {
impl_->AddCallback(dev_id_, stream, callback);
}
void Device::CreateEvent(event::Event* event, event::Event::Flag flags) {
impl_->CreateEvent(dev_id_, event, flags);
}
void Device::DestroyEvent(event::Event* event) {
impl_->DestroyEvent(dev_id_, event);
}
void Device::RecordEvent(const event::Event* event,
const stream::Stream* stream) {
impl_->RecordEvent(dev_id_, event, stream);
}
void Device::SynchronizeEvent(const event::Event* event) {
impl_->SynchronizeEvent(dev_id_, event);
}
bool Device::QueryEvent(const event::Event* event) {
return impl_->QueryEvent(dev_id_, event);
}
void Device::StreamWaitEvent(const stream::Stream* stream,
const event::Event* event) {
impl_->StreamWaitEvent(dev_id_, stream, event);
}
void Device::MemoryCopyH2D(void* dst, const void* src, size_t size,
const stream::Stream* stream) {
impl_->MemoryCopyH2D(dev_id_, dst, src, size, stream);
}
void Device::MemoryCopyD2H(void* dst, const void* src, size_t size,
const stream::Stream* stream) {
impl_->MemoryCopyD2H(dev_id_, dst, src, size, stream);
}
void Device::MemoryCopyD2D(void* dst, const void* src, size_t size,
const stream::Stream* stream) {
impl_->MemoryCopyD2D(dev_id_, dst, src, size, stream);
}
void Device::MemoryCopyP2P(const Place& dst_place, void* dst, const void* src,
size_t size, const stream::Stream* stream) {
impl_->MemoryCopyP2P(dst_place, dst, dev_id_, src, size, stream);
}
void* Device::MemoryAllocate(size_t size) {
return impl_->MemoryAllocate(dev_id_, size);
}
void Device::MemoryDeallocate(void* ptr, size_t size) {
impl_->MemoryDeallocate(dev_id_, ptr, size);
}
void* Device::MemoryAllocateHost(size_t size) {
return impl_->MemoryAllocateHost(dev_id_, size);
}
void Device::MemoryDeallocateHost(void* ptr, size_t size) {
impl_->MemoryDeallocateHost(dev_id_, ptr, size);
}
void* Device::MemoryAllocateUnified(size_t size) {
return impl_->MemoryAllocateUnified(dev_id_, size);
}
void Device::MemoryDeallocateUnified(void* ptr, size_t size) {
impl_->MemoryDeallocateUnified(dev_id_, ptr, size);
}
void Device::MemorySet(void* ptr, uint8_t value, size_t size) {
impl_->MemorySet(dev_id_, ptr, value, size);
}
std::string Device::Type() { return impl_->Type(); }
static pten::RWLock _global_device_manager_rw_lock;
bool DeviceManager::Register(std::unique_ptr<DeviceInterface> device_impl) {
pten::AutoWRLock lock(&_global_device_manager_rw_lock);
VLOG(4) << "Register Device - " << device_impl->Type();
auto device_type = device_impl->Type();
auto& dev_impl_map = Instance().device_impl_map_;
auto& dev_map = Instance().device_map_;
if (dev_impl_map.find(device_type) == dev_impl_map.end()) {
dev_impl_map.insert(
std::pair<std::string, std::unique_ptr<DeviceInterface>>(
device_type, std::move(device_impl)));
auto& dev_impl = dev_impl_map[device_type];
auto& dev_vec = dev_map[device_type];
VLOG(4) << "GetDeviceCount is " << dev_impl->GetDeviceCount();
for (size_t i = 0; i < dev_impl->GetDeviceCount(); ++i) {
dev_vec.emplace_back(new Device(i, dev_impl.get()));
}
} else {
auto& plat = dev_impl_map[device_type];
if (plat->IsCustom() && plat->Priority() > device_impl->Priority()) {
dev_impl_map[device_type] = std::move(device_impl);
auto& dev_impl = dev_impl_map[device_type];
auto& dev_vec = dev_map[device_type];
dev_vec.clear();
VLOG(4) << "GetDeviceCount is " << dev_impl->GetDeviceCount();
for (size_t i = 0; i < dev_impl->GetDeviceCount(); ++i) {
dev_vec.emplace_back(new Device(i, dev_impl.get()));
}
} else {
return false;
}
}
return true;
}
DeviceInterface* DeviceManager::GetDeviceInterfaceWithType(
const std::string& device_type) {
pten::AutoRDLock lock(&_global_device_manager_rw_lock);
auto& dev_impl_map = Instance().device_impl_map_;
if (dev_impl_map.find(device_type) != dev_impl_map.end()) {
return dev_impl_map.at(device_type).get();
} else {
LOG(ERROR) << "GetDeviceInterfaceWithType - " << device_type << " Failed\n";
PADDLE_THROW(
platform::errors::Fatal("Unregistered device type %s.", device_type));
return nullptr;
}
}
Device* DeviceManager::GetDeviceWithPlace(const Place& place) {
pten::AutoRDLock lock(&_global_device_manager_rw_lock);
auto& dev_map = Instance().device_map_;
auto dev_type = PlaceHelper::GetDeviceType(place);
auto dev_id = PlaceHelper::GetDeviceId(place);
PADDLE_ENFORCE_NE(dev_map.find(dev_type), dev_map.end(),
platform::errors::NotFound(
"Unable to find Device with type %s.", dev_type));
auto& dev_vec = dev_map[dev_type];
PADDLE_ENFORCE_LT(
dev_id, dev_vec.size(),
platform::errors::OutOfRange(
"The visible devices count of type %s is %d, but dev_id is %d.",
dev_type, dev_vec.size(), dev_id));
return dev_vec[dev_id].get();
}
std::vector<std::string> DeviceManager::GetAllDeviceTypes() {
pten::AutoRDLock lock(&_global_device_manager_rw_lock);
auto& dev_impl_map = Instance().device_impl_map_;
std::vector<std::string> devices;
for (auto iter = dev_impl_map.cbegin(); iter != dev_impl_map.cend(); ++iter) {
devices.push_back(iter->first);
}
return devices;
}
std::vector<std::string> DeviceManager::GetAllCustomDeviceTypes() {
pten::AutoRDLock lock(&_global_device_manager_rw_lock);
auto& dev_impl_map = Instance().device_impl_map_;
std::vector<std::string> devices;
for (auto iter = dev_impl_map.cbegin(); iter != dev_impl_map.cend(); ++iter) {
if (iter->second->IsCustom()) {
devices.push_back(iter->first);
}
}
return devices;
}
std::vector<std::string> DeviceManager::GetAllDeviceList() {
pten::AutoRDLock lock(&_global_device_manager_rw_lock);
auto& dev_impl_map = Instance().device_impl_map_;
std::vector<std::string> devices;
for (auto iter = dev_impl_map.cbegin(); iter != dev_impl_map.cend(); ++iter) {
size_t device_count = iter->second->GetDeviceCount();
std::string dev_type = iter->second->Type();
if (device_count == 1) {
devices.push_back(dev_type);
} else {
for (size_t i = 0; i < device_count; ++i) {
devices.push_back(dev_type + ":" + std::to_string(i));
}
}
}
return devices;
}
std::vector<std::string> DeviceManager::GetAllCustomDeviceList() {
pten::AutoRDLock lock(&_global_device_manager_rw_lock);
auto& dev_impl_map = Instance().device_impl_map_;
std::vector<std::string> devices;
for (auto iter = dev_impl_map.cbegin(); iter != dev_impl_map.cend(); ++iter) {
size_t device_count = iter->second->GetDeviceCount();
std::string dev_type = iter->second->Type();
if (iter->second->IsCustom()) {
if (device_count == 1) {
devices.push_back(dev_type);
} else {
for (size_t i = 0; i < device_count; ++i) {
devices.push_back(dev_type + ":" + std::to_string(i));
}
}
}
}
return devices;
}
bool DeviceManager::HasDeviceType(const std::string& device_type) {
auto dev_impl = GetDeviceInterfaceWithType(device_type);
return dev_impl != nullptr;
}
bool DeviceManager::IsCustom(const std::string& device_type) {
auto dev_impl = GetDeviceInterfaceWithType(device_type);
return dev_impl->IsCustom();
}
void DeviceManager::Initialize(const std::string& device_type) {
auto dev_impl = GetDeviceInterfaceWithType(device_type);
dev_impl->Initialize();
}
void DeviceManager::Finalize(const std::string& device_type) {
auto dev_impl = GetDeviceInterfaceWithType(device_type);
dev_impl->Finalize();
}
void DeviceManager::SynchronizeDevice(const Place& place) {
auto device_type = PlaceHelper::GetDeviceType(place);
auto device_id = PlaceHelper::GetDeviceId(place);
auto dev_impl = GetDeviceInterfaceWithType(device_type);
dev_impl->SynchronizeDevice(device_id);
}
void DeviceManager::InitDevice(const Place& place) {
auto device_type = PlaceHelper::GetDeviceType(place);
auto device_id = PlaceHelper::GetDeviceId(place);
auto dev_impl = GetDeviceInterfaceWithType(device_type);
dev_impl->InitDevice(device_id);
}
void DeviceManager::DeInitDevice(const Place& place) {
auto device_type = PlaceHelper::GetDeviceType(place);
auto device_id = PlaceHelper::GetDeviceId(place);
auto dev_impl = GetDeviceInterfaceWithType(device_type);
dev_impl->DeInitDevice(device_id);
}
void DeviceManager::SetDevice(const std::string& device_type,
size_t device_id) {
auto dev_impl = GetDeviceInterfaceWithType(device_type);
dev_impl->SetDevice(device_id);
}
void DeviceManager::SetDevice(const Place& place) {
auto device_type = PlaceHelper::GetDeviceType(place);
auto device_id = PlaceHelper::GetDeviceId(place);
DeviceManager::SetDevice(device_type, device_id);
}
int DeviceManager::GetDevice(const std::string& device_type) {
auto dev_impl = GetDeviceInterfaceWithType(device_type);
return dev_impl->GetDevice();
}
size_t DeviceManager::GetMinChunkSize(const Place& place) {
auto device_type = PlaceHelper::GetDeviceType(place);
auto device_id = PlaceHelper::GetDeviceId(place);
auto dev_impl = GetDeviceInterfaceWithType(device_type);
return dev_impl->GetMinChunkSize(device_id);
}
size_t DeviceManager::GetMaxChunkSize(const Place& place) {
auto device_type = PlaceHelper::GetDeviceType(place);
auto device_id = PlaceHelper::GetDeviceId(place);
auto dev_impl = GetDeviceInterfaceWithType(device_type);
return dev_impl->GetMaxChunkSize(device_id);
}
size_t DeviceManager::GetMaxAllocSize(const Place& place) {
auto device_type = PlaceHelper::GetDeviceType(place);
auto device_id = PlaceHelper::GetDeviceId(place);
auto dev_impl = GetDeviceInterfaceWithType(device_type);
return dev_impl->GetMaxAllocSize(device_id);
}
size_t DeviceManager::GetInitAllocSize(const Place& place) {
auto device_type = PlaceHelper::GetDeviceType(place);
auto device_id = PlaceHelper::GetDeviceId(place);
auto dev_impl = GetDeviceInterfaceWithType(device_type);
return dev_impl->GetInitAllocSize(device_id);
}
size_t DeviceManager::GetReallocSize(const Place& place) {
auto device_type = PlaceHelper::GetDeviceType(place);
auto device_id = PlaceHelper::GetDeviceId(place);
auto dev_impl = GetDeviceInterfaceWithType(device_type);
return dev_impl->GetReallocSize(device_id);
}
size_t DeviceManager::GetExtraPaddingSize(const Place& place) {
auto device_type = PlaceHelper::GetDeviceType(place);
auto device_id = PlaceHelper::GetDeviceId(place);
auto dev_impl = GetDeviceInterfaceWithType(device_type);
return dev_impl->GetExtraPaddingSize(device_id);
}
void DeviceManager::MemoryStats(const Place& place, size_t* total,
size_t* free) {
auto device_type = PlaceHelper::GetDeviceType(place);
auto device_id = PlaceHelper::GetDeviceId(place);
auto dev_impl = GetDeviceInterfaceWithType(device_type);
dev_impl->MemoryStats(device_id, total, free);
}
size_t DeviceManager::GetDeviceCount(const std::string& device_type) {
auto dev_impl = GetDeviceInterfaceWithType(device_type);
return dev_impl->GetDeviceCount();
}
std::vector<size_t> DeviceManager::GetDeviceList(
const std::string& device_type) {
auto dev_impl = GetDeviceInterfaceWithType(device_type);
return dev_impl->GetDeviceList();
}
DeviceManager& DeviceManager::Instance() {
static DeviceManager platform_manager;
return platform_manager;
}
std::vector<std::string> ListAllLibraries(const std::string& library_dir) {
std::vector<std::string> libraries;
std::regex express(".*\\.so");
std::match_results<std::string::iterator> results;
DIR* dir = nullptr;
dirent* ptr = nullptr;
dir = opendir(library_dir.c_str());
if (dir == nullptr) {
VLOG(4) << "open CustomDevice library_dir: " << library_dir << " failed";
} else {
while ((ptr = readdir(dir)) != nullptr) {
std::string filename(ptr->d_name);
if (std::regex_match(filename.begin(), filename.end(), results,
express)) {
libraries.push_back(library_dir + '/' + filename);
VLOG(4) << "found CustomDevice library: " << libraries.back()
<< std::endl;
}
}
closedir(dir);
}
return libraries;
}
bool LoadCustomDevice(const std::string& library_dir) {
std::vector<std::string> libs = ListAllLibraries(library_dir);
for (const auto& lib_path : libs) {
auto dso_handle = dlopen(lib_path.c_str(), RTLD_NOW);
LoadCustomRuntimeLib(dso_handle);
}
return true;
}
} // namespace platform
} // namespace paddle
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/fluid/platform/device/device_base.h"
#include "paddle/fluid/platform/device/device_ext.h"
#include "paddle/fluid/platform/device/event.h"
#include "paddle/fluid/platform/device/stream.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/pten/backends/dynload/port.h"
#include "paddle/pten/core/utils/rw_lock.h"
namespace paddle {
namespace platform {
class Device final {
public:
Device(size_t dev_id, DeviceInterface* impl) : dev_id_(dev_id), impl_(impl) {}
// Stream
// ! Create an asynchronous stream
void CreateStream(
stream::Stream* stream, const stream::Stream::Priority& priority =
stream::Stream::Priority::kNormal,
const stream::Stream::Flag& flag = stream::Stream::Flag::kDefaultFlag);
// ! Destroys an asynchronous stream.
void DestroyStream(stream::Stream* stream);
// ! Waits for stream tasks to complete.
void SynchronizeStream(const stream::Stream* stream);
// ! Queries an asynchronous stream for completion status.
bool QueryStream(const stream::Stream* stream);
// ! Add a callback to a compute stream.
void AddCallback(stream::Stream* stream, stream::Stream::Callback* callback);
// Event
// ! Create an event.
void CreateEvent(event::Event* event, event::Event::Flag flags);
// ! Destroy an event.
void DestroyEvent(event::Event* event);
// ! Records an event.
void RecordEvent(const event::Event* event, const stream::Stream* stream);
// ! Waits for event to complete.
void SynchronizeEvent(const event::Event* event);
// ! Queries an event for completion status.
bool QueryEvent(const event::Event* event);
// ! Make a compute stream wait on an event
void StreamWaitEvent(const stream::Stream* stream, const event::Event* event);
// Memory
void MemoryCopyH2D(void* dst, const void* src, size_t size,
const stream::Stream* stream = nullptr);
void MemoryCopyD2H(void* dst, const void* src, size_t size,
const stream::Stream* stream = nullptr);
void MemoryCopyD2D(void* dst, const void* src, size_t size,
const stream::Stream* stream = nullptr);
void MemoryCopyP2P(const Place& dst_place, void* dst, const void* src,
size_t size, const stream::Stream* stream = nullptr);
void* MemoryAllocate(size_t size);
void MemoryDeallocate(void* ptr, size_t size);
void* MemoryAllocateHost(size_t size);
void MemoryDeallocateHost(void* ptr, size_t size);
void* MemoryAllocateUnified(size_t size);
void MemoryDeallocateUnified(void* ptr, size_t size);
void MemorySet(void* ptr, uint8_t value, size_t size);
std::string Type();
private:
size_t dev_id_;
DeviceInterface* impl_;
};
class DeviceManager {
public:
static bool Register(std::unique_ptr<DeviceInterface> device);
static bool RegisterPinnedDevice(DeviceInterface* device);
static Device* GetDeviceWithPlace(const Place& place);
static std::vector<std::string> GetAllDeviceTypes();
static std::vector<std::string> GetAllCustomDeviceTypes();
static std::vector<std::string> GetAllDeviceList();
static std::vector<std::string> GetAllCustomDeviceList();
static bool HasDeviceType(const std::string& device_type);
static bool IsCustom(const std::string& device_type);
// platform & device
static void Initialize(const std::string& device_type);
static void Finalize(const std::string& device_type);
static void SynchronizeDevice(const Place& place);
static void InitDevice(const Place& place);
static void DeInitDevice(const Place& place);
static void SetDevice(const std::string& device_type, size_t device_id);
static void SetDevice(const Place& place);
static int GetDevice(const std::string& device_type);
static size_t GetMinChunkSize(const Place& place);
static size_t GetMaxChunkSize(const Place& place);
static size_t GetMaxAllocSize(const Place& place);
static size_t GetInitAllocSize(const Place& place);
static size_t GetReallocSize(const Place& place);
static size_t GetExtraPaddingSize(const Place& place);
static void MemoryStats(const Place& place, size_t* total, size_t* free);
static size_t GetDeviceCount(const std::string& device_type);
static std::vector<size_t> GetDeviceList(const std::string& device_type);
private:
DISABLE_COPY_AND_ASSIGN(DeviceManager);
DeviceManager() {}
static DeviceManager& Instance();
static DeviceInterface* GetDeviceInterfaceWithType(
const std::string& device_type);
std::unordered_map<std::string, std::unique_ptr<DeviceInterface>>
device_impl_map_;
std::unordered_map<std::string, std::vector<std::unique_ptr<Device>>>
device_map_;
};
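// A hedged sketch of how a caller might enumerate plug-in devices through
// DeviceManager (the iteration pattern is an assumption for illustration;
// only the static methods themselves are declared above):
//
//   for (const auto& dev_type : DeviceManager::GetAllCustomDeviceTypes()) {
//     size_t n = DeviceManager::GetDeviceCount(dev_type);
//     for (size_t i = 0; i < n; ++i) {
//       DeviceManager::SetDevice(dev_type, i);
//       // ... dispatch work to device i of dev_type ...
//     }
//   }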
bool LoadCustomRuntimeLib(void* dso_handle);
bool LoadCustomRuntimeLib(const CustomRuntimeParams& runtime_params,
std::unique_ptr<C_DeviceInterface> device_interface,
void* dso_handle);
bool LoadCustomDevice(const std::string& library_path);
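// A sketch of the expected call pattern (the path below is a placeholder,
// and whether `library_path` names a single shared object or a directory to
// scan is not specified by the declaration above):
//
//   if (!LoadCustomDevice("/path/to/custom_runtime_libs")) {
//     // no custom runtime was registered; custom places stay unavailable
//   }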
class Registrar {
public:
template <typename DeviceT>
explicit Registrar(DeviceT* device_ptr) {
DeviceManager::Register(std::unique_ptr<DeviceT>(device_ptr));
}
void Touch() {}
};
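// Registrar lets a plug-in register its DeviceInterface during static
// initialization. A hedged sketch (MyDeviceInterface and its constructor
// arguments are hypothetical):
//
//   static Registrar my_registrar(new MyDeviceInterface(/* ... */));
//   // Touch() gives other translation units a symbol to call so the
//   // registrar object is not stripped by the linker (a common registry
//   // idiom; presumed to be its purpose here).
//   void TouchMyDeviceRegistrar() { my_registrar.Touch(); }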
} // namespace platform
} // namespace paddle
#endif
......@@ -38,3 +38,12 @@ limitations under the License. */
#ifdef PADDLE_WITH_IPU
#include "paddle/fluid/platform/device/ipu/ipu_info.h"
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/fluid/platform/device/callback_manager.h"
#include "paddle/fluid/platform/device/custom/enforce_custom.h"
#include "paddle/fluid/platform/device/device_guard.h"
#include "paddle/fluid/platform/device/device_manager.h"
#include "paddle/fluid/platform/device/event.h"
#include "paddle/fluid/platform/device/stream.h"
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/event.h"
#include "paddle/fluid/platform/device/device_guard.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device/stream.h"
namespace paddle {
namespace platform {
namespace event {
event_t Event::raw_event() const { return event_; }
void Event::set_event(event_t event) { event_ = event; }
Event::Event(const Place& place, event_t event)
: place_(place),
device_(platform::DeviceManager::GetDeviceWithPlace(place)),
event_(event),
own_data_(false) {}
Event::~Event() { Destroy(); }
bool Event::Init(const Place& place, Flag flags) {
place_ = place;
DeviceGuard guard(place_);
device_->CreateEvent(this, flags);
VLOG(3) << "Init Event: " << event_ << ", place: " << place_
<< ", flag:" << static_cast<int>(flags);
own_data_ = true;
return true;
}
void Event::Destroy() {
if (own_data_) {
DeviceGuard guard(place_);
device_->DestroyEvent(this);
own_data_ = false;
}
}
void Event::Record(const stream::Stream* stream) { stream->RecordEvent(this); }
bool Event::Query() const { return device_->QueryEvent(this); }
void Event::Synchronize() const { device_->SynchronizeEvent(this); }
const Place& Event::GetPlace() const { return place_; }
} // namespace event
} // namespace platform
} // namespace paddle
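// A usage sketch for the Event API defined above (illustrative; it assumes
// `place` refers to an initialized custom device, `stream` was created on
// that place, and `flags` is a valid event::Event::Flag value, whose
// enumerators are not shown in this diff):
//
//   event::Event event(place, /*event=*/nullptr);
//   event.Init(place, flags);      // creates the underlying event handle
//   event.Record(stream);          // capture the stream's current progress
//   if (!event.Query()) {
//     event.Synchronize();         // block until the recorded work finishes
//   }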
......@@ -433,8 +433,9 @@ PADDLE_DEFINE_EXPORTED_double(
// NOTE(zhiqiu): better to share the flags, otherwise we will have too many
// flags.
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU) || \
defined(PADDLE_WITH_CUSTOM_DEVICE)
/**
* Memory related FLAG
......
......@@ -284,7 +284,7 @@ if(WITH_PYTHON)
cc_library(paddle_pybind SHARED
SRCS ${PYBIND_SRCS}
DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ${GLOB_DEV_LIB})
if(NOT APPLE AND NOT WIN32)
target_link_libraries(paddle_pybind rt)
......
......@@ -293,6 +293,7 @@ from .framework import CUDAPlace # noqa: F401
from .framework import NPUPlace # noqa: F401
from .framework import CUDAPinnedPlace # noqa: F401
from .framework import MLUPlace # noqa: F401
from .framework import CustomPlace # noqa: F401
from .autograd import grad # noqa: F401
from .autograd import no_grad # noqa: F401
......