diff --git a/CMakeLists.txt b/CMakeLists.txt index e8321010d389ee2493ef35d74d5d75d3ea73bfe9..a4c1b9c8098e9e632a4a05c491e07b1ce051c945 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -243,6 +243,7 @@ option(NEW_RELEASE_JIT "PaddlePaddle next-level release strategy for backup ji option(WITH_ASCEND_INT64 "Compile with int64 kernel for ascend NPU" OFF) option(WITH_POCKETFFT "Compile with pocketfft support" ON) option(WITH_RECORD_BUILDTIME "Compile PaddlePaddle with record all targets build time" OFF) +option(WITH_CUSTOM_DEVICE "Compile with custom device support" OFF) if(WITH_RECORD_BUILDTIME) set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CMAKE_CURRENT_SOURCE_DIR}/tools/get_build_time.sh") @@ -265,6 +266,10 @@ if(SANITIZER_TYPE AND NOT "${SANITIZER_TYPE}" MATCHES "^(Address|Leak|Memory|Thr return() endif() +if (LINUX AND NOT WITH_CUSTOM_DEVICE AND NOT ON_INFER) +set(WITH_CUSTOM_DEVICE ON) +endif() + if(WIN32) if(WITH_DISTRIBUTE) MESSAGE(WARNING diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 9ebde06bd01ab9968b9cc53a3e38a2b2e1684fc4..20a35c91bdde1d606cef2b46ad8aabb5952bd7d8 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -219,3 +219,7 @@ endif(ON_INFER) if(WITH_CRYPTO) add_definitions(-DPADDLE_WITH_CRYPTO) endif(WITH_CRYPTO) + +if(WITH_CUSTOM_DEVICE AND NOT WIN32) + add_definitions(-DPADDLE_WITH_CUSTOM_DEVICE) +endif() diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index 24f1591ff33c965b9b787c05ff5db67ad4362ea4..20d08ef18aeb3e4d8a9f5cfd0b38954daf27020d 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -100,6 +100,11 @@ struct DLDeviceVisitor : public boost::static_visitor<::DLDevice> { platform::errors::Unimplemented("platform::MLUPlace is not supported")); } + inline ::DLDevice operator()(const platform::CustomPlace &place) const { + PADDLE_THROW(platform::errors::Unimplemented( + "platform::CustomPlace is not supported")); + } + inline ::DLDevice operator()(const platform::CUDAPlace &place) const { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) ::DLDevice device; diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 5596aba52131b74785741e16f9dc6ef71e6a91cb..4e6a4d5360860e8971c6dc9c2842defabcffd0dd 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -494,6 +494,20 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx, #else PADDLE_THROW( platform::errors::Unimplemented("No MLU gc found in CPU/MLU paddle")); +#endif + } else if (platform::is_custom_place(place_)) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (IsFastEagerDeletionModeEnabled()) { + VLOG(4) << "Use unsafe fast gc for " << place_ << "."; + gc.reset(new CustomDeviceUnsafeFastGarbageCollector(place_, + max_memory_size)); + } else { + VLOG(4) << "Use default stream gc for " << place_ << "."; + gc.reset( + new CustomDefaultStreamGarbageCollector(place_, max_memory_size)); + } +#else + PADDLE_THROW(platform::errors::Unimplemented("No CustomDevice gc found")); #endif } } diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index 22f77be85055578f0d4e8288e90001fb59e9628d..9f2bdeffecf62764f5cbe5bea9cb50d4830be43b 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -18,6 +18,7 @@ #endif #include "gflags/gflags.h" #include "paddle/fluid/framework/garbage_collector.h" +#include 
"paddle/fluid/platform/device/device_wrapper.h" DECLARE_double(eager_delete_tensor_gb); DECLARE_double(memory_fraction_of_eager_deletion); @@ -202,6 +203,58 @@ void MLUStreamGarbageCollector::ClearCallback( } #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE +CustomDefaultStreamGarbageCollector::CustomDefaultStreamGarbageCollector( + const platform::CustomPlace &place, size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} + +void CustomDefaultStreamGarbageCollector::Wait() const { + static_cast(this->dev_ctx_) + ->WaitStreamCallback(); +} + +void CustomDefaultStreamGarbageCollector::ClearCallback( + const std::function &callback) { + static_cast(this->dev_ctx_) + ->AddStreamCallback(callback); +} + +CustomDeviceUnsafeFastGarbageCollector::CustomDeviceUnsafeFastGarbageCollector( + const platform::CustomPlace &place, size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} + +void CustomDeviceUnsafeFastGarbageCollector::ClearCallback( + const std::function &callback) { + callback(); +} + +CustomStreamGarbageCollector::CustomStreamGarbageCollector( + const platform::CustomPlace &place, size_t max_memory_size) + : GarbageCollector(place, max_memory_size) { + platform::DeviceGuard guard(place); + stream_.reset(new platform::stream::Stream); + stream_->Init(place); + callback_manager_.reset(new platform::CallbackManager(stream_.get())); +} + +CustomStreamGarbageCollector::~CustomStreamGarbageCollector() { + platform::DeviceGuard guard(this->dev_ctx_->GetPlace()); + stream_->Synchronize(); + stream_->Destroy(); +} + +platform::stream::Stream *CustomStreamGarbageCollector::stream() const { + return stream_.get(); +} + +void CustomStreamGarbageCollector::Wait() const { callback_manager_->Wait(); } + +void CustomStreamGarbageCollector::ClearCallback( + const std::function &callback) { + callback_manager_->AddCallback(callback); +} +#endif + int64_t GetEagerDeletionThreshold() { return FLAGS_eager_delete_tensor_gb < 0 ? 
-1 diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index f5d79d864b5659ed2b16cdded7e471eca457e3c5..a67860c6087e0f173e09d2a7c131703260c562fd 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -200,6 +200,47 @@ class MLUStreamGarbageCollector : public GarbageCollector { }; #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE +class CustomDefaultStreamGarbageCollector : public GarbageCollector { + public: + CustomDefaultStreamGarbageCollector(const platform::CustomPlace &place, + size_t max_memory_size); + + void Wait() const override; + + protected: + void ClearCallback(const std::function &callback) override; +}; + +class CustomDeviceUnsafeFastGarbageCollector : public GarbageCollector { + public: + CustomDeviceUnsafeFastGarbageCollector(const platform::CustomPlace &place, + size_t max_memory_size); + + protected: + void ClearCallback(const std::function &callback) override; +}; + +class CustomStreamGarbageCollector : public GarbageCollector { + public: + CustomStreamGarbageCollector(const platform::CustomPlace &place, + size_t max_memory_size); + + ~CustomStreamGarbageCollector(); + + void Wait() const override; + + platform::stream::Stream *stream() const; + + protected: + void ClearCallback(const std::function &callback) override; + + private: + std::unique_ptr stream_; + std::unique_ptr callback_manager_; +}; +#endif + template void GarbageCollector::Add(Container &&objs) { Add(std::forward(objs), []() {}); diff --git a/paddle/fluid/framework/op_kernel_type.cc b/paddle/fluid/framework/op_kernel_type.cc index 7dac6a092d245fab3781c0af0bb6d4162b5be47c..9d1f09869988df96205cad5cc29aba8ea7edd945 100644 --- a/paddle/fluid/framework/op_kernel_type.cc +++ b/paddle/fluid/framework/op_kernel_type.cc @@ -47,10 +47,20 @@ size_t OpKernelType::Hash::operator()(const OpKernelType& key) const { "Too many OpKernel attribute values, expected maximum " "value is 64, received value is %d.", cur_loc)); - +#ifdef PADDLE_WITH_CUSTOM_DEVICE + std::hash hasher; + size_t seed = + hasher(place + data_type + data_layout + library_type + customized_value); + if (platform::is_custom_place(key.place_)) { + seed ^= std::hash{}(key.place_.GetDeviceType()) + 0x9e3779b9 + + (seed << 6) + (seed >> 2) + 4; + } + return seed; +#else std::hash hasher; return hasher(place + data_type + data_layout + library_type + customized_value); +#endif } bool OpKernelType::operator==(const OpKernelType& o) const { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index b6d8ca4aa67cbfc3dd34c0a4ef68d2c1bdb7ed94..7c13fa90f9bbc528393bc2607481bc43ca1b6397 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -29,6 +29,7 @@ limitations under the License. 
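The op_kernel_type.cc hunk above mixes the custom device's type string into the kernel-key hash, because every plug-in device shares the single CUSTOM place kind and two different device types would otherwise produce colliding keys. A standalone sketch of the boost-style hash_combine step it uses (names here are illustrative, not Paddle's API):

```cpp
#include <cstddef>
#include <functional>
#include <string>

// Boost-style hash_combine: fold one hash value into a running seed.
// The extra "+ 4" in the diff further separates custom-place keys from
// the plain hash returned on the #else path.
inline size_t HashCombine(size_t seed, size_t value) {
  return seed ^ (value + 0x9e3779b9 + (seed << 6) + (seed >> 2));
}

// Hypothetical kernel key: two custom devices may map to the same integral
// place value, so the device-type string must participate in the hash.
size_t KernelKeyHash(int place, int data_type, const std::string& dev_type) {
  size_t seed = std::hash<int>()(place + data_type);
  if (!dev_type.empty()) {
    seed = HashCombine(seed, std::hash<std::string>()(dev_type));
  }
  return seed;
}
```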
*/ #include "paddle/fluid/framework/transfer_scope_cache.h" #include "paddle/fluid/framework/unused_var_check.h" #include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/pten/common/scalar.h" @@ -244,6 +245,15 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { #else auto dev_id = place.device; platform::SetMLUDeviceId(dev_id); +#endif + } else if (platform::is_custom_place(place)) { +#ifndef PADDLE_WITH_CUSTOM_DEVICE + PADDLE_THROW(platform::errors::Unavailable( + "Cannot run operator on place %s, please recompile paddle or " + "reinstall Paddle with CustomDevice support.", + place)); +#else + platform::DeviceManager::SetDevice(place); #endif } diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index aed5e2c7405ac0782ef3d9438b4958432584525a..1a826f6bdd5e7344d9983c026fc2d4cc8812d15a 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -532,6 +532,21 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { PADDLE_THROW(platform::errors::PermissionDenied( "Paddle can't use XPU device since it's not compiled with XPU," "Please recompile or reinstall Paddle with XPU support.")); +#endif + } else if (platform::is_custom_place(place)) { +#if defined(PADDLE_WITH_CUSTOM_DEVICE) + if (IsFastEagerDeletionModeEnabled()) { + gc.reset( + new CustomDeviceUnsafeFastGarbageCollector(place, max_memory_size)); + } else { + gc.reset(new CustomStreamGarbageCollector(place, max_memory_size)); + } + VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't use custom device since it's not compiled with " + "CustomDevice," + "Please recompile or reinstall Paddle with CustomDevice support.")); #endif } else if (platform::is_cpu_place(place)) { gc.reset(new CPUGarbageCollector(place, max_memory_size)); diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 844b5d82695009415815eaba819cf6a8bf5a89e3..e510257c6106b8d3540e927f0e6fd76a9e73ea09 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -91,7 +91,29 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } #endif - +#ifdef PADDLE_WITH_CUSTOM_DEVICE + else if (platform::is_custom_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { + auto stream = + reinterpret_cast(ctx).stream(); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream); + } else if (platform::is_cpu_place(src_place) && // NOLINT + platform::is_custom_place(dst_place)) { + auto stream = + reinterpret_cast(ctx).stream(); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream); + } else if (platform::is_custom_place(src_place) && // NOLINT + platform::is_custom_place(dst_place)) { + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data async from " << src_place << " to " + << dst_place; + return; + } + auto stream = + reinterpret_cast(ctx).stream(); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream); + } +#endif #ifdef PADDLE_WITH_XPU else if (platform::is_xpu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { @@ -376,7 +398,8 @@ void TensorCopyImpl(const TENSOR& 
src, const platform::Place& dst_place, platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); const platform::DeviceContext* dev_ctx; if (platform::is_gpu_place(dst_place) || platform::is_npu_place(dst_place) || - platform::is_mlu_place(dst_place)) { + platform::is_mlu_place(dst_place) || + platform::is_custom_place(dst_place)) { dev_ctx = pool.Get(dst_place); } else { dev_ctx = pool.Get(src.place()); @@ -436,6 +459,26 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, "Copy from %s to %s is not supported.", src_place, dst_place)); } #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + else if (platform::is_custom_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { /* custom_device -> cpu*/ + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); + } + else if (platform::is_cpu_place(src_place) && // NOLINT + platform::is_custom_place(dst_place)) { /* cpu -> custom_device*/ + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); + } + else if (platform::is_custom_place(src_place) && // NOLINT + platform::is_custom_place( + dst_place)) { /* custom_device -> custom_device*/ + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data sync from " << src_place << " to " + << dst_place; + return; + } + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); + } +#endif #ifdef PADDLE_WITH_XPU else if (platform::is_xpu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { @@ -664,6 +707,13 @@ class AnyVisitor : public boost::static_visitor { const platform::CUDAPinnedPlace& cpu) const { return *out.data(); } + + bool GetResult(const framework::Tensor& out, + const platform::CustomPlace& custom_dev) const { + PADDLE_THROW(platform::errors::Unimplemented("Not supported on place (%s) ", + custom_dev)); + return false; + } }; template @@ -903,6 +953,11 @@ struct BothFalseVisitor : public boost::static_visitor<> { out_ptr[i] = lhs && rhs; } } + + void VisitorImpl(const platform::CustomPlace& custom_dev) const { + PADDLE_THROW( + platform::errors::Unimplemented("CustomPlace is not supported")); + } }; void TensorIsfinite(const framework::Tensor& tensor, framework::Tensor* out) { @@ -1036,6 +1091,29 @@ void TensorToStream(std::ostream& os, const Tensor& tensor, #else PADDLE_THROW(platform::errors::Unimplemented( "NPUPlace is not supported when not compiled with NPU")); +#endif + } else if (platform::is_custom_place(tensor.place())) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB + std::unique_ptr buf(new char[kBufSize]); + auto& custom_device_context = + static_cast(dev_ctx); + platform::CPUPlace cpu; + uintptr_t data = reinterpret_cast(data_ptr); + while (size != 0) { + size_t size_to_write = std::min(kBufSize, static_cast(size)); + memory::Copy(cpu, buf.get(), tensor.place(), + reinterpret_cast(data), size_to_write, + custom_device_context.stream()); + custom_device_context.Wait(); + os.write(buf.get(), size_to_write); + data += size_to_write; + size -= size_to_write; + } +#else + PADDLE_THROW(platform::errors::Unimplemented( + "CustomPlace is not supported when not compiled with " + "CustomDevice")); #endif } else { os.write(static_cast(data_ptr), @@ -1093,10 +1171,11 @@ void TensorFromStream(std::istream& is, Tensor* tensor, if (platform::is_gpu_place(dev_ctx.GetPlace()) || platform::is_xpu_place(dev_ctx.GetPlace()) || platform::is_mlu_place(dev_ctx.GetPlace()) || - platform::is_npu_place(dev_ctx.GetPlace())) { + 
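The TensorToStream hunk above serializes custom-device memory by staging it through a fixed 64 MB host buffer: copy one chunk to the CPU, wait on the device stream, write the chunk, advance. A minimal device-agnostic sketch of that loop; the memcpy_d2h callback stands in for memory::Copy plus the stream wait, and is an assumption of this sketch rather than a Paddle API:

```cpp
#include <algorithm>
#include <cstdint>
#include <functional>
#include <memory>
#include <ostream>

// Stream out `size` bytes of device memory through a bounded host buffer,
// so serialization never needs a host allocation as large as the tensor.
void WriteDeviceBuffer(std::ostream& os, const void* device_ptr, size_t size,
                       // copies n bytes device->host and blocks until done
                       const std::function<void(void*, const void*, size_t)>&
                           memcpy_d2h) {
  constexpr size_t kBufSize = 64 << 20;  // 64MB staging buffer, as in the diff
  std::unique_ptr<char[]> buf(new char[kBufSize]);
  auto data = reinterpret_cast<uintptr_t>(device_ptr);
  while (size != 0) {
    size_t chunk = std::min(kBufSize, size);
    memcpy_d2h(buf.get(), reinterpret_cast<const void*>(data), chunk);
    os.write(buf.get(), chunk);
    data += chunk;
    size -= chunk;
  }
}
```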
diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index bcaf3c719cb720d76c78a2b15475652eda793cad..1c1a86f1d32d3c3553e2201432453e5e2fdaa1e3 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -180,6 +180,17 @@ void TensorFromArray(const T* src, const size_t& array_size, reinterpret_cast(ctx).stream()); } #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + else if (platform::is_custom_place(dst_place)) { // NOLINT + memory::Copy( + dst_place, dst_ptr, src_place, src_ptr, size, + reinterpret_cast<const platform::CustomDeviceContext&>(ctx).stream()); + } +#endif + else { // NOLINT + PADDLE_THROW(platform::errors::Unimplemented( + "TensorFromArray on %s is not supported.", dst_place)); + } } template @@ -241,6 +252,17 @@ void TensorFromVector(const std::vector<T>& src, reinterpret_cast(ctx).stream()); } #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + else if (platform::is_custom_place(dst_place)) { // NOLINT + memory::Copy( + dst_place, dst_ptr, src_place, src_ptr, size, + reinterpret_cast<const platform::CustomDeviceContext&>(ctx).stream()); + } +#endif + else { // NOLINT + PADDLE_THROW(platform::errors::Unimplemented( + "TensorFromVector on %s is not supported.", dst_place)); + } } // The fully specialized function should be inline to avoid @@ -300,6 +322,17 @@ inline void TensorFromVector(const std::vector<bool>& src, reinterpret_cast(ctx).stream()); } #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + else if (platform::is_custom_place(dst_place)) { // NOLINT + auto stream = + reinterpret_cast<const platform::CustomDeviceContext&>(ctx).stream(); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream); + } +#endif + else { // NOLINT + PADDLE_THROW(platform::errors::Unimplemented( + "TensorFromVector on %s is not supported.", dst_place)); + } delete[] array; } @@ -369,6 +402,15 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx, reinterpret_cast(ctx).stream()); } #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + else if (platform::is_custom_place(src.place())) { // NOLINT + memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr); + } +#endif + else { // NOLINT + PADDLE_THROW(platform::errors::Unimplemented( + "TensorToVector on %s is not supported.", src.place())); + } } template <> @@ -410,6 +452,11 @@ inline void TensorToVector(const Tensor& src, dst_place, dst_ptr, src.place(), src_ptr, size, reinterpret_cast(ctx).stream()); } +#endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + else if (platform::is_custom_place(src.place())) { // NOLINT + memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr); + } #endif for (unsigned int i = 0; i < src.numel(); i++) { (*dst)[i] = static_cast<bool>(array[i]); } diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 4a71dd4deac9c5e69b1b6234a93fbb12bc1f31a5..dc8b3982ba99824c775bdc2bed6a49afcd179232 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -180,6 +180,12 @@ class TensorAddFunctor : public boost::static_visitor<> { "is not supported in imperative mode", place)); } + void operator()(const platform::CustomPlace& place) const { + PADDLE_THROW(platform::errors::PermissionDenied( + "Gradient accumulation on place (%s) " + "is not supported in imperative mode", + place)); + } private: int64_t numel_; @@ -331,7 +337,14 @@ void TensorAdd(const VarType& src, VarType* dst) { return; } #endif - +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (platform::is_custom_place(place)) { + PADDLE_THROW(platform::errors::Unimplemented( + "Gradient accumulation of data type (%s) on place (%s) is not " + "supported in imperative mode", + framework::DataTypeToString(data_type), place)); + } +#endif #ifdef PADDLE_WITH_XPU if (platform::is_xpu_place(place)) { if (data_type == framework::DataTypeTrait<float>::DataType()) { diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index c8ff561f7af3ad85d74eb7723b092a2a9aeaae64..ae7d0807530618864ff951e388a5d4deaa1765a5 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -278,6 +278,16 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins, expected_kernel_key.place_ = platform::CPUPlace(); kernel_iter = kernels.find(expected_kernel_key); } +#endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (kernel_iter == kernels.end() && + paddle::platform::is_custom_place(expected_kernel_key.place_)) { + VLOG(3) << "missing " << place.GetDeviceType() << " kernel: " << op.Type() + << ", expected_kernel_key:" << expected_kernel_key + << ", falling back to CPU one!"; + expected_kernel_key.place_ = 
platform::CPUPlace(); + kernel_iter = kernels.find(expected_kernel_key); + } #endif // TODO(jiabin): Add operator.cc's line 1000 part back when we need that // case diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 81cd39c225b533d742d9eb399c8c87863a6572e5..c2dd761c23c9f4a914f428d8e0bdb16d9b4a6cbf 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -20,6 +20,7 @@ #include "paddle/fluid/imperative/amp_auto_cast.h" #include "paddle/fluid/imperative/op_base.h" #include "paddle/fluid/platform/denormal.h" +#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/string/string_helper.h" @@ -138,6 +139,17 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( PADDLE_THROW(platform::errors::PermissionDenied( "Paddle can't use MLU device since it's not compiled with MLU," "Please recompile or reinstall Paddle with MLU support.")); +#endif + } else if (platform::is_custom_place(place)) { +#if defined(PADDLE_WITH_CUSTOM_DEVICE) + gc.reset(new framework::CustomDefaultStreamGarbageCollector(place, 0)); + VLOG(10) << "Created GarbageCollector at " << place; +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't use CustomDevice since it's not compiled with " + "CustomDevice," + "Please recompile or reinstall Paddle with CustomDevice " + "support.")); #endif } else { PADDLE_THROW(platform::errors::PreconditionNotMet( @@ -222,6 +234,14 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap& ins, #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should compile with MLU if use MLUPlace.")); +#endif + } else if (platform::is_custom_place(place)) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + platform::DeviceManager::SetDevice(place); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with CustomDevice if use " + "CustomPlace.")); #endif } if (!override_default_attr_map) { diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index b899ddbcd5a4e30e065eb1969c41fde6046a8ea7..6cd7d87332323f4bafd49b8b16254f9610405658 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -58,6 +58,11 @@ else () set(AllocatorFacadeDeps) endif() +if (WITH_CUSTOM_DEVICE) + cc_library(custom_allocator SRCS custom_allocator.cc DEPS allocator device_manager) + set(AllocatorFacadeDeps ${AllocatorFacadeDeps} custom_allocator) +endif() + if (WITH_GPU) nv_test(best_fit_allocator_test SRCS best_fit_allocator_test.cc diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 35131446d8647e0581d2d997451017293b7ca8dc..fc34a64d62636cca3d274fb2294a5d9139ae5d77 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -62,6 +62,11 @@ #include "paddle/fluid/platform/device/mlu/mlu_info.h" #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE +#include "paddle/fluid/memory/allocation/custom_allocator.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#endif + PADDLE_DEFINE_EXPORTED_int64( gpu_allocator_retry_time, 10000, "The retry time (milliseconds) when allocator fails " @@ -186,6 +191,17 @@ class AllocatorFacadePrivate { for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) { InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id)); } +#endif 
+#ifdef PADDLE_WITH_CUSTOM_DEVICE + auto device_types = platform::DeviceManager::GetAllCustomDeviceTypes(); + for (const auto& dev_type : device_types) { + for (size_t dev_id = 0; + dev_id < platform::DeviceManager::GetDeviceCount(dev_type); + ++dev_id) { + InitNaiveBestFitCustomDeviceAllocator( + platform::CustomPlace(dev_type, dev_id)); + } + } #endif break; } @@ -222,6 +238,17 @@ class AllocatorFacadePrivate { for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) { InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id)); } +#endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + auto device_types = platform::DeviceManager::GetAllCustomDeviceTypes(); + for (const auto& dev_type : device_types) { + for (size_t dev_id = 0; + dev_id < platform::DeviceManager::GetDeviceCount(dev_type); + ++dev_id) { + InitAutoGrowthCustomDeviceAllocator( + platform::CustomPlace(dev_type, dev_id), allow_free_idle_chunk); + } + } #endif break; } @@ -700,6 +727,21 @@ class AllocatorFacadePrivate { } #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + void InitNaiveBestFitCustomDeviceAllocator(platform::CustomPlace p) { + allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p); + } + + void InitAutoGrowthCustomDeviceAllocator(platform::CustomPlace p, + bool allow_free_idle_chunk) { + auto custom_allocator = + std::make_shared<paddle::memory::allocation::CustomAllocator>(p); + allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>( + custom_allocator, platform::DeviceManager::GetMinChunkSize(p), + allow_free_idle_chunk); + } +#endif + void InitSystemAllocators() { if (!system_allocators_.empty()) return; system_allocators_[platform::CPUPlace()] = std::make_shared<CPUAllocator>(); @@ -770,6 +812,16 @@ class AllocatorFacadePrivate { places.emplace_back(platform::MLUPlace(dev_id)); } #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + auto device_types = platform::DeviceManager::GetAllCustomDeviceTypes(); + for (const auto& dev_type : device_types) { + for (size_t dev_id = 0; + dev_id < platform::DeviceManager::GetDeviceCount(dev_type); + dev_id++) { + places.emplace_back(platform::CustomPlace(dev_type, dev_id)); + } + } +#endif for (auto& p : places) { zero_size_allocators_[p] = std::make_shared<ZeroSizeAllocator>(p); @@ -1005,7 +1057,6 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size, "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); } #endif - platform::CUDAPlace p(place.GetDeviceId()); if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) { return m_->GetAllocator(p, stream, /* create_if_not_found = */ true) diff --git a/paddle/fluid/memory/allocation/custom_allocator.cc b/paddle/fluid/memory/allocation/custom_allocator.cc new file mode 100644 index 0000000000000000000000000000000000000000..eb035ea5e3ad409777114cca44cd945ed4bd9541 --- /dev/null +++ b/paddle/fluid/memory/allocation/custom_allocator.cc @@ -0,0 +1,63 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/memory/allocation/custom_allocator.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace memory { +namespace allocation { + +bool CustomAllocator::IsAllocThreadSafe() const { return true; } +void CustomAllocator::FreeImpl(pten::Allocation* allocation) { + PADDLE_ENFORCE_EQ( + allocation->place(), place_, + platform::errors::PermissionDenied("CustomDevice memory is " + "freed in incorrect device. " + "This may be a bug")); + + delete allocation; +} + +pten::Allocation* CustomAllocator::AllocateImpl(size_t size) { + std::call_once(once_flag_, + [this] { platform::DeviceManager::SetDevice(place_); }); + + void* ptr = + platform::DeviceManager::GetDeviceWithPlace(place_)->MemoryAllocate(size); + if (LIKELY(ptr)) { + return new Allocation(ptr, size, place_); + } + + size_t avail, total; + platform::DeviceManager::MemoryStats(place_, &total, &avail); + + auto dev_type = platform::PlaceHelper::GetDeviceType(place_); + auto dev_id = platform::PlaceHelper::GetDeviceId(place_); + + PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( + "\n\nOut of memory error on %s:%d. " + "Cannot allocate %s memory on %s:%d, " + "available memory is only %s.\n\n" + "Please check whether there is any other process using %s:%d.\n" + "1. If yes, please stop them, or start PaddlePaddle on another %s.\n" + "2. If no, please decrease the batch size of your model.\n\n", + dev_type, dev_id, string::HumanReadableSize(size), dev_type, dev_id, + string::HumanReadableSize(avail), dev_type, dev_id, dev_type)); +} + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/custom_allocator.h b/paddle/fluid/memory/allocation/custom_allocator.h new file mode 100644 index 0000000000000000000000000000000000000000..708c105a850087f49becde702590920a0f9afc9d --- /dev/null +++ b/paddle/fluid/memory/allocation/custom_allocator.h @@ -0,0 +1,42 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include <mutex>  // NOLINT +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace memory { +namespace allocation { + +class CustomAllocator : public Allocator { + public: + explicit CustomAllocator(const platform::CustomPlace& place) + : place_(place) {} + + bool IsAllocThreadSafe() const override; + + protected: + void FreeImpl(pten::Allocation* allocation) override; + pten::Allocation* AllocateImpl(size_t size) override; + + private: + platform::Place place_; + std::once_flag once_flag_; +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 91358b688040aa9789e3268eb0e29dc6790c0e13..b63f872141c802f512332750d36a3116df2c40c9 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -20,6 +20,7 @@ #include "glog/logging.h" #include "paddle/fluid/memory/detail/buddy_allocator.h" #include "paddle/fluid/memory/detail/system_allocator.h" +#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler.h" @@ -30,7 +31,6 @@ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" #endif -#include "paddle/fluid/platform/device/device_wrapper.h" PADDLE_DEFINE_EXPORTED_bool( init_allocated_mem, false, @@ -733,6 +733,136 @@ uint64_t Release(const platform::MLUPlace &place) { #endif } +// For CustomDevice +#ifdef PADDLE_WITH_CUSTOM_DEVICE +class BuddyAllocatorList { + private: + explicit BuddyAllocatorList(const std::string &device_type) + : device_type_(device_type) { + auto devices = platform::DeviceManager::GetDeviceList(device_type); + for (auto dev_id : devices) { + init_flags_[dev_id].reset(new std::once_flag()); + } + } + + static BuddyAllocatorList *CreateNewInstance(const std::string &device_type) { + return new BuddyAllocatorList(device_type); + } + + public: + static BuddyAllocatorList *Instance(const std::string &device_type) { + // DeviceType -> AllocatorList + static std::unordered_map<std::string, BuddyAllocatorList *> pool; + if (pool.find(device_type) == pool.end()) { + pool[device_type] = CreateNewInstance(device_type); + } + return pool[device_type]; + } + + BuddyAllocator *Get(int dev_id) { + PADDLE_ENFORCE_NE(init_flags_.find(dev_id), init_flags_.end(), + platform::errors::OutOfRange( + "Cannot find %s %d, please check visible devices.", + device_type_, dev_id)); + + std::call_once(*init_flags_[dev_id], [this, dev_id] { + platform::DeviceManager::SetDevice(device_type_, dev_id); + platform::CustomPlace place(device_type_, dev_id); + + allocators_[dev_id].reset(new BuddyAllocator( + std::unique_ptr<detail::SystemAllocator>( + new detail::CustomAllocator(device_type_, dev_id)), + platform::DeviceManager::GetMinChunkSize(place), + platform::DeviceManager::GetMaxChunkSize(place), + platform::DeviceManager::GetExtraPaddingSize(place), device_type_)); + }); + + return allocators_[dev_id].get(); + } + + private: + std::string device_type_; + std::unordered_map<int, std::unique_ptr<std::once_flag>> init_flags_; + std::unordered_map<int, std::unique_ptr<BuddyAllocator>> allocators_; +}; + +BuddyAllocator *GetBuddyAllocator(const platform::Place &place) { + VLOG(10) << "GetBuddyAllocator place = " << place; + if (platform::is_custom_place(place)) { + return BuddyAllocatorList::Instance( + platform::PlaceHelper::GetDeviceType(place)) + ->Get(platform::PlaceHelper::GetDeviceId(place)); + } else { + PADDLE_THROW( + platform::errors::InvalidArgument("place must be CustomPlace")); + } +} +#endif + +template <> +void *Alloc<platform::CustomPlace>(const platform::CustomPlace &place, + size_t size) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); + auto *buddy_allocator = GetBuddyAllocator(place); + auto *ptr = buddy_allocator->Alloc(size); + + if (ptr == nullptr) { + platform::DeviceGuard guard(place); + size_t avail, total; + platform::DeviceManager::MemoryStats(place, &total, &avail); + PADDLE_THROW(platform::errors::ResourceExhausted( + "Cannot allocate %s in %s:%d, available %s, total %s, used " + "%s. ", + string::HumanReadableSize(size), place.GetDeviceType(), place.device, + string::HumanReadableSize(avail), string::HumanReadableSize(total), + string::HumanReadableSize(total - avail))); + } else { + if (FLAGS_init_allocated_mem) { + platform::DeviceManager::GetDeviceWithPlace(place)->MemorySet(ptr, 0xEF, + size); + } + } + VLOG(10) << " pointer=" << ptr; + return ptr; +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'CustomPlace' is not supported in CPU only device.")); +#endif +} + +template <> +void Free<platform::CustomPlace>(const platform::CustomPlace &place, void *p, + size_t size) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); + GetBuddyAllocator(place)->Free(p); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'CustomPlace' is not supported in CPU only device.")); +#endif +} + +template <> +uint64_t Release<platform::CustomPlace>(const platform::CustomPlace &place) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + return GetBuddyAllocator(place)->Release(); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'CustomPlace' is not supported in CPU only device.")); +#endif +} + +template <> +size_t Used<platform::CustomPlace>(const platform::CustomPlace &place) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + return GetBuddyAllocator(place)->Used(); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'CustomPlace' is not supported in CPU only device.")); +#endif +} + struct AllocVisitor : public boost::static_visitor<void *> { inline explicit AllocVisitor(size_t size) : size_(size) {}
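BuddyAllocatorList above keeps one lazily built BuddyAllocator per visible device of a device type, guarding each build with a std::once_flag so the first allocation on a device pays the setup cost exactly once, even under concurrent callers. The pattern in isolation, generic over a build function (names are illustrative, not Paddle's API):

```cpp
#include <functional>
#include <memory>
#include <mutex>
#include <unordered_map>

// One lazily constructed T per integer device id; construction runs at
// most once per id even when Get() is called from several threads.
template <typename T>
class PerDeviceRegistry {
 public:
  PerDeviceRegistry(int device_count,
                    std::function<std::unique_ptr<T>(int)> build)
      : build_(std::move(build)) {
    // Pre-create one flag per device, as the diff does, so Get() never
    // mutates the map structure concurrently.
    for (int id = 0; id < device_count; ++id) {
      flags_[id].reset(new std::once_flag());
      objs_[id] = nullptr;
    }
  }

  T* Get(int dev_id) {
    std::call_once(*flags_.at(dev_id),
                   [&] { objs_[dev_id] = build_(dev_id); });
    return objs_.at(dev_id).get();
  }

 private:
  std::function<std::unique_ptr<T>(int)> build_;
  std::unordered_map<int, std::unique_ptr<std::once_flag>> flags_;
  std::unordered_map<int, std::unique_ptr<T>> objs_;
};
```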
*/ DECLARE_uint64(reallocate_gpu_memory_in_mb); #endif -#ifdef PADDLE_WITH_MLU -#include "paddle/fluid/platform/device/mlu/mlu_info.h" -#endif +#include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { namespace memory { @@ -35,12 +33,37 @@ namespace detail { BuddyAllocator::BuddyAllocator( std::unique_ptr<SystemAllocator> system_allocator, size_t min_chunk_size, - size_t max_chunk_size, size_t extra_padding_size) + size_t max_chunk_size, size_t extra_padding_size, + const std::string dev_type) : min_chunk_size_(min_chunk_size), max_chunk_size_(max_chunk_size), extra_padding_size_(extra_padding_size), cache_(system_allocator->UseGpu()), - system_allocator_(std::move(system_allocator)) {} + system_allocator_(std::move(system_allocator)) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (!dev_type.empty()) { + init_allocate_size_func_ = [dev_type]() { + return platform::DeviceManager::GetInitAllocSize( + platform::PlaceHelper::CreatePlace(dev_type)); + }; + re_allocate_size_func_ = [dev_type]() { + return platform::DeviceManager::GetReallocSize( + platform::PlaceHelper::CreatePlace(dev_type)); + }; + } else { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + init_allocate_size_func_ = &platform::GpuInitAllocSize; + re_allocate_size_func_ = &platform::GpuReallocSize; +#elif defined(PADDLE_WITH_ASCEND_CL) + init_allocate_size_func_ = &platform::NPUInitAllocSize; + re_allocate_size_func_ = &platform::NPUReallocSize; +#elif defined(PADDLE_WITH_MLU) + init_allocate_size_func_ = &platform::MLUInitAllocSize; + re_allocate_size_func_ = &platform::MLUReallocSize; +#endif + } +#endif +} BuddyAllocator::~BuddyAllocator() { VLOG(10) << "BuddyAllocator Disconstructor makes sure that all of these " @@ -224,6 +247,10 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool( size_t allocate_bytes = max_chunk_size_; size_t index = 0; +#ifdef PADDLE_WITH_CUSTOM_DEVICE + allocate_bytes = DeviceAllocateSize(init_allocate_size_func_, + re_allocate_size_func_, request_bytes); +#else #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) allocate_bytes = DeviceAllocateSize(&platform::GpuInitAllocSize, &platform::GpuReallocSize, request_bytes); @@ -233,6 +260,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool( #elif defined(PADDLE_WITH_MLU) allocate_bytes = DeviceAllocateSize(&platform::MLUInitAllocSize, &platform::MLUReallocSize, request_bytes); +#endif #endif // Allocate a new block diff --git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h index 0d736f680503a6ce59e88142a9eec2ad4ebfdd26..5296192b8fd9b632be4638d47153e113fd2ae576 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.h +++ b/paddle/fluid/memory/detail/buddy_allocator.h @@ -39,7 +39,8 @@ class BuddyAllocator { public: BuddyAllocator(std::unique_ptr<SystemAllocator> system_allocator, size_t min_chunk_size, size_t max_chunk_size, - size_t extra_padding_size = 0); + size_t extra_padding_size = 0, + const std::string dev_type = ""); ~BuddyAllocator(); @@ -123,6 +124,9 @@ class BuddyAllocator { /*! Allocate CPU/GPU memory from system */ std::unique_ptr<SystemAllocator> system_allocator_; std::mutex mutex_; +#ifdef PADDLE_WITH_CUSTOM_DEVICE + std::function<size_t()> init_allocate_size_func_, re_allocate_size_func_; +#endif }; } // namespace detail diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 773122de6c3198b09c33241a0d6a09e9357f65a3..a61f98c4e1a22adcc3684a9e5af190a82e3b5110 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -38,6 +38,8 @@ limitations under the License. */ #include "paddle/fluid/platform/cuda_device_guard.h" #endif +#include "paddle/fluid/platform/device/device_wrapper.h" + DECLARE_bool(use_pinned_memory); DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_uint64(initial_gpu_memory_in_mb); @@ -430,6 +432,51 @@ void MLUAllocator::Free(void* p, size_t size, size_t index) { bool MLUAllocator::UseGpu() const { return true; } #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE +void* CustomAllocator::Alloc(size_t* index, size_t size) { + if (size <= 0) return nullptr; + + void* p; + auto place = platform::CustomPlace(dev_type_, dev_id_); + auto device = platform::DeviceManager::GetDeviceWithPlace(place); + p = device->MemoryAllocate(size); + if (LIKELY(p)) { + VLOG(4) << "CustomAllocator::Alloc " << p << " size " << size; + *index = 0; + plug_alloc_size += size; + } else { + size_t avail, total; + + platform::DeviceManager::MemoryStats(place, &total, &avail); + PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( + "\n\nOut of memory error on %s %d. " + "total memory is %s, used memory is %s, " + "available memory is only %s.\n\n", + dev_type_, dev_id_, string::HumanReadableSize(total), + string::HumanReadableSize(total - avail), + string::HumanReadableSize(avail))); + } + return p; +} + +void CustomAllocator::Free(void* p, size_t size, size_t index) { + VLOG(4) << "CustomAllocator::Free " << p << " size " << size; + PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( + "The index should be 0, index is %d", index)); + PADDLE_ENFORCE_GE(plug_alloc_size, size, + platform::errors::InvalidArgument( + "The size of memory (%d) to free exceeds the size of " + "allocated device memory (%d)", + size, plug_alloc_size)); + plug_alloc_size -= size; + auto place = platform::CustomPlace(dev_type_, dev_id_); + auto device = platform::DeviceManager::GetDeviceWithPlace(place); + device->MemoryDeallocate(p, size); +} + +bool CustomAllocator::UseGpu() const { return true; } +#endif + } // namespace detail } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/detail/system_allocator.h b/paddle/fluid/memory/detail/system_allocator.h index 975e2891b2472ad4aeb5c4a7d6f676c516350545..f6ff6282a614a3152dee5bd0e45ebe3b733fe14f 100644 --- a/paddle/fluid/memory/detail/system_allocator.h +++ b/paddle/fluid/memory/detail/system_allocator.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include <stddef.h>  // for size_t +#include <string> namespace paddle { namespace memory { @@ -107,6 +108,23 @@ class MLUAllocator : public SystemAllocator { }; #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE +class CustomAllocator : public SystemAllocator { + public: + explicit CustomAllocator(const std::string& device_type, size_t dev_id) + : dev_type_(device_type), dev_id_(dev_id) {} + + virtual void* Alloc(size_t* index, size_t size); + virtual void Free(void* p, size_t size, size_t index); + virtual bool UseGpu() const; + + private: + size_t plug_alloc_size = 0; + std::string dev_type_; + size_t dev_id_; +}; +#endif + } // namespace detail } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index d2ab438fd2946701c70ea0bebf35ac33fbfb521e..d857b1c1671a789fa122a1d4115461fc0b5ba840 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -19,9 +19,88 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler.h" #include "paddle/pten/common/place.h" +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/platform/device/xpu/xpu_header.h" +#endif + +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/platform/device/mlu/mlu_info.h" +#endif + namespace paddle { namespace memory { +#ifdef PADDLE_WITH_CUSTOM_DEVICE +template <> +void Copy<platform::CPUPlace, platform::CustomPlace>( + platform::CPUPlace dst_place, void* dst, platform::CustomPlace src_place, + const void* src, size_t num, void* stream) { + if (UNLIKELY(num == 0)) return; + + auto src_type = platform::PlaceHelper::GetDeviceType(src_place); + auto dst_type = platform::PlaceHelper::GetDeviceType(dst_place); + std::string msg = "Memcpy:" + src_type + "->" + dst_type; + platform::RecordEvent record_event(msg); + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place << ", stream=" << stream; + + platform::DeviceManager::SetDevice(src_place); + platform::stream::Stream stream_wrapper(src_place, stream); + platform::DeviceManager::GetDeviceWithPlace(src_place)->MemoryCopyD2H( + dst, src, num, &stream_wrapper); +} + +template <> +void Copy<platform::CustomPlace, platform::CPUPlace>( + platform::CustomPlace dst_place, void* dst, platform::CPUPlace src_place, + const void* src, size_t num, void* stream) { + if (UNLIKELY(num == 0)) return; + auto src_type = platform::PlaceHelper::GetDeviceType(src_place); + auto dst_type = platform::PlaceHelper::GetDeviceType(dst_place); + std::string msg = "Memcpy:" + src_type + "->" + dst_type; + platform::RecordEvent record_event(msg); + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place << ", stream=" << stream; + + platform::DeviceManager::SetDevice(dst_place); + platform::stream::Stream stream_wrapper(dst_place, stream); + platform::DeviceManager::GetDeviceWithPlace(dst_place)->MemoryCopyH2D( + dst, src, num, &stream_wrapper); +} + +template <> +void Copy<platform::CustomPlace, platform::CustomPlace>( + platform::CustomPlace dst_place, void* dst, platform::CustomPlace src_place, + const void* src, size_t num, void* stream) { + if (UNLIKELY(num == 0)) return; + + auto src_type = platform::PlaceHelper::GetDeviceType(src_place); + auto dst_type = platform::PlaceHelper::GetDeviceType(dst_place); + std::string msg = "Memcpy:" + src_type + "->" + dst_type; + platform::RecordEvent record_event(msg); + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place << ", stream=" << stream; + + if (src_type == dst_type) { + platform::DeviceManager::SetDevice(src_place); + platform::stream::Stream stream_wrapper(src_place, stream); + + auto src_id = 
platform::PlaceHelper::GetDeviceId(src_place); + auto dst_id = platform::PlaceHelper::GetDeviceId(dst_place); + if (src_id == dst_id) { + platform::DeviceManager::GetDeviceWithPlace(src_place)->MemoryCopyD2D( + dst, src, num, &stream_wrapper); + } else { + platform::DeviceManager::GetDeviceWithPlace(src_place)->MemoryCopyP2P( + dst_place, dst, src, num, &stream_wrapper); + } + } else { + PADDLE_THROW(platform::errors::Unavailable( + "Copy between %s and %s is not supported.", src_type, dst_type)); + } +} +#endif // PADDLE_WITH_CUSTOM_DEVICE + template <> void Copy(platform::CPUPlace, void* dst, platform::CPUPlace, @@ -158,7 +237,7 @@ void Copy(platform::NPUPlace dst_place, void* dst, platform::CPUPlace src_place, const void* src, size_t num, - aclrtStream stream) { + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetNPUDeviceId(dst_place.device); @@ -168,7 +247,8 @@ void Copy(platform::NPUPlace dst_place, if (stream) { platform::RecordEvent record_event("NpuMemcpyAsync:CPU->NPU"); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, stream); + platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, + reinterpret_cast(stream)); } else { // On NPU, async operation after sync operation is ok, while sync operation // after async is not ok, since the async operation may not done. @@ -186,7 +266,7 @@ void Copy(platform::CPUPlace dst_place, void* dst, platform::NPUPlace src_place, const void* src, size_t num, - aclrtStream stream) { + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetNPUDeviceId(src_place.device); @@ -196,7 +276,8 @@ void Copy(platform::CPUPlace dst_place, if (stream) { platform::RecordEvent record_event("NpuMemcpyAsync:NPU->CPU"); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, stream); + platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, + reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); static_cast(pool.Get(src_place))->Wait(); @@ -211,7 +292,7 @@ void Copy(platform::NPUPlace dst_place, void* dst, platform::NPUPlace src_place, const void* src, size_t num, - aclrtStream stream) { + void* stream) { if (UNLIKELY(num == 0)) return; VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " @@ -221,7 +302,7 @@ void Copy(platform::NPUPlace dst_place, if (stream) { platform::RecordEvent record_event("NpuMemcpyAsync(same_npu):NPU->NPU"); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, - stream); + reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -239,7 +320,7 @@ void Copy(platform::NPUPlace dst_place, // TODO(zhiqiu): support peer access? 
platform::RecordEvent record_event("NpuMemcpyPeerAsync:NPU->NPU"); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, - stream); + reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -284,7 +365,7 @@ void Copy( template <> void Copy( platform::NPUPinnedPlace dst_place, void* dst, platform::NPUPlace src_place, - const void* src, size_t num, aclrtStream stream) { + const void* src, size_t num, void* stream) { if (UNLIKELY(num == 0)) return; platform::SetNPUDeviceId(src_place.device); @@ -294,7 +375,8 @@ void Copy( if (stream) { platform::RecordEvent record_event("NpuMemcpyAsync:NPU->NPUPinned"); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, stream); + platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, + reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); static_cast(pool.Get(src_place))->Wait(); @@ -307,7 +389,7 @@ void Copy( template <> void Copy( platform::NPUPlace dst_place, void* dst, platform::NPUPinnedPlace src_place, - const void* src, size_t num, aclrtStream stream) { + const void* src, size_t num, void* stream) { if (UNLIKELY(num == 0)) return; platform::SetNPUDeviceId(dst_place.device); @@ -317,7 +399,8 @@ void Copy( if (stream) { platform::RecordEvent record_event("NpuMemcpyAsync:NPUPinned->NPU"); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, stream); + platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, + reinterpret_cast(stream)); } else { // On NPU, async operation after sync operation is ok, while sync operation // after async is not ok, since the async operation may not done. @@ -379,6 +462,23 @@ void Copy(pten::Place dst_place, void* dst, platform::NPUPinnedPlace place_dst; platform::NPUPlace place_src(src_place.GetDeviceId()); return Copy(place_dst, dst, place_src, src, num, stream); +#ifdef PADDLE_WITH_CUSTOM_DEVICE + } else if (src_place.GetType() == pten::AllocationType::CPU && // NOLINT + dst_place.GetType() == pten::AllocationType::CUSTOM) { + platform::CPUPlace place_src; + platform::CustomPlace place_dst(dst_place); + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::CUSTOM && // NOLINT + dst_place.GetType() == pten::AllocationType::CPU) { + platform::CustomPlace place_src(src_place); + platform::CPUPlace place_dst; + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::CUSTOM && // NOLINT + dst_place.GetType() == pten::AllocationType::CUSTOM) { + platform::CustomPlace place_src(src_place); + platform::CustomPlace place_dst(dst_place); + return Copy(place_dst, dst, place_src, src, num, stream); +#endif } } @@ -492,7 +592,7 @@ inline void SyncCUDAStream() { template <> void Copy( platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place, - const void* src, size_t num, gpuStream_t stream) { + const void* src, size_t num, void* stream) { if (UNLIKELY(num == 0)) return; platform::SetDeviceId(src_place.device); @@ -501,9 +601,11 @@ void Copy( if (stream) { platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CPU"); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, stream); + platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, + reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); + platform::GpuMemcpyAsync(dst, 
src, num, cudaMemcpyDeviceToHost, + reinterpret_cast(stream)); #endif } else { platform::RecordEvent record_event("GpuMemcpySync:GPU->CPU"); @@ -522,7 +624,7 @@ void Copy( template <> void Copy( platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place, - const void* src, size_t num, gpuStream_t stream) { + const void* src, size_t num, void* stream) { if (UNLIKELY(num == 0)) return; platform::SetDeviceId(dst_place.device); @@ -531,9 +633,11 @@ void Copy( if (stream) { platform::RecordEvent record_event("GpuMemcpyAsync:CPU->GPU"); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, stream); + platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, + reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, + reinterpret_cast(stream)); #endif } else { platform::RecordEvent record_event("GpuMemcpySync:CPU->GPU"); @@ -552,7 +656,7 @@ void Copy( template <> void Copy( platform::CUDAPlace dst_place, void* dst, platform::CUDAPlace src_place, - const void* src, size_t num, gpuStream_t stream) { + const void* src, size_t num, void* stream) { if (UNLIKELY(num == 0)) return; VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " @@ -562,9 +666,11 @@ void Copy( if (stream) { platform::RecordEvent record_event("GpuMemcpyAsync(same_gpu):GPU->GPU"); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToDevice, stream); + platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToDevice, + reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream); + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, + reinterpret_cast(stream)); #endif } else { platform::RecordEvent record_event("GpuMemcpySync(same_gpu):GPU->GPU"); @@ -578,7 +684,7 @@ void Copy( if (stream) { platform::RecordEvent record_event("GpuMemcpyPeerAsync:GPU->GPU"); platform::GpuMemcpyPeerAsync(dst, dst_place.device, src, src_place.device, - num, stream); + num, reinterpret_cast(stream)); } else { platform::RecordEvent record_event("GpuMemcpyPeerSync:GPU->GPU"); platform::GpuMemcpyPeerSync(dst, dst_place.device, src, src_place.device, @@ -620,8 +726,7 @@ void Copy( template <> void Copy( platform::CUDAPinnedPlace dst_place, void* dst, - platform::CUDAPlace src_place, const void* src, size_t num, - gpuStream_t stream) { + platform::CUDAPlace src_place, const void* src, size_t num, void* stream) { if (UNLIKELY(num == 0)) return; platform::SetDeviceId(src_place.device); VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " @@ -629,9 +734,11 @@ void Copy( if (stream) { platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CUDAPinned"); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, stream); + platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, + reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, + reinterpret_cast(stream)); #endif } else { platform::RecordEvent record_event("GpuMemcpySync:GPU->CUDAPinned"); @@ -647,7 +754,7 @@ template <> void Copy( platform::CUDAPlace dst_place, void* dst, platform::CUDAPinnedPlace src_place, const void* src, size_t num, - gpuStream_t stream) { + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetDeviceId(dst_place.device); @@ -656,9 
+763,11 @@ void Copy( if (stream) { platform::RecordEvent record_event("GpuMemcpyAsync:CUDAPinned->GPU"); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, stream); + platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, + reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, + reinterpret_cast(stream)); #endif } else { platform::RecordEvent record_event("GpuMemcpySync:CUDAPinned->GPU"); @@ -674,7 +783,7 @@ void Copy( template <> void Copy(pten::Place dst_place, void* dst, pten::Place src_place, const void* src, - size_t num, gpuStream_t stream) { + size_t num, void* stream) { if (src_place.GetType() == pten::AllocationType::CPU && dst_place.GetType() == pten::AllocationType::CPU) { platform::CPUPlace place_dst, place_src; @@ -719,6 +828,23 @@ void Copy(pten::Place dst_place, void* dst, platform::CUDAPinnedPlace place_dst; platform::CUDAPlace place_src(src_place.GetDeviceId()); return Copy(place_dst, dst, place_src, src, num, stream); +#ifdef PADDLE_WITH_CUSTOM_DEVICE + } else if (src_place.GetType() == pten::AllocationType::CPU && // NOLINT + dst_place.GetType() == pten::AllocationType::CUSTOM) { + platform::CPUPlace place_src; + platform::CustomPlace place_dst(dst_place); + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::CUSTOM && // NOLINT + dst_place.GetType() == pten::AllocationType::CPU) { + platform::CustomPlace place_src(src_place); + platform::CPUPlace place_dst; + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::CUSTOM && // NOLINT + dst_place.GetType() == pten::AllocationType::CUSTOM) { + platform::CustomPlace place_src(src_place); + platform::CustomPlace place_dst(dst_place); + return Copy(place_dst, dst, place_src, src, num, stream); +#endif } } @@ -726,7 +852,7 @@ void Copy(pten::Place dst_place, void* dst, template <> void Copy(pten::CPUPlace dst_place, void* dst, pten::Place src_place, const void* src, - size_t num, gpuStream_t stream) { + size_t num, void* stream) { Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num, stream); } @@ -735,7 +861,7 @@ template <> void Copy(pten::Place dst_place, void* dst, pten::CPUPlace src_place, const void* src, size_t num, - gpuStream_t stream) { + void* stream) { Copy(dst_place, dst, pten::Place(src_place.GetType()), src, num, stream); } @@ -743,7 +869,7 @@ void Copy(pten::Place dst_place, void* dst, template <> void Copy(pten::GPUPlace dst_place, void* dst, pten::Place src_place, const void* src, - size_t num, gpuStream_t stream) { + size_t num, void* stream) { Copy(pten::Place(dst_place.GetType(), dst_place.GetDeviceId()), dst, src_place, src, num, stream); } @@ -753,7 +879,7 @@ template <> void Copy(pten::Place dst_place, void* dst, pten::GPUPlace src_place, const void* src, size_t num, - gpuStream_t stream) { + void* stream) { Copy(dst_place, dst, pten::Place(src_place.GetType(), src_place.GetDeviceId()), src, num, stream); @@ -764,7 +890,7 @@ template <> void Copy(pten::GPUPinnedPlace dst_place, void* dst, pten::Place src_place, const void* src, size_t num, - gpuStream_t stream) { + void* stream) { Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num, stream); } @@ -773,7 +899,7 @@ template <> void Copy(pten::Place dst_place, void* dst, pten::GPUPinnedPlace src_place, const void* src, size_t num, - gpuStream_t 
stream) { + void* stream) { Copy(dst_place, dst, pten::Place(src_place.GetType()), src, num, stream); } @@ -800,7 +926,7 @@ void Copy(platform::CPUPlace dst_place, void* dst, platform::MLUPlace src_place, const void* src, size_t num, - mluStream stream) { + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetMLUDeviceId(src_place.device); @@ -808,7 +934,8 @@ void Copy(platform::CPUPlace dst_place, VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " by mlu stream(" << stream << ")"; platform::RecordEvent record_event("MLUMemcpyD2HAsync:MLU->CPU"); - platform::MLUMemcpyD2HAsync(dst, src, num, stream); + platform::MLUMemcpyD2HAsync(dst, src, num, + reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); static_cast(pool.Get(src_place))->Wait(); @@ -825,7 +952,7 @@ void Copy(platform::MLUPlace dst_place, void* dst, platform::CPUPlace src_place, const void* src, size_t num, - mluStream stream) { + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetMLUDeviceId(dst_place.device); @@ -833,7 +960,8 @@ void Copy(platform::MLUPlace dst_place, VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " by mlu stream(" << stream << ")"; platform::RecordEvent record_event("MLUMemcpyH2DAsync:CPU->MLU"); - platform::MLUMemcpyH2DAsync(dst, src, num, stream); + platform::MLUMemcpyH2DAsync(dst, src, num, + reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); static_cast(pool.Get(src_place))->Wait(); @@ -850,7 +978,7 @@ void Copy(platform::MLUPlace dst_place, void* dst, platform::MLUPlace src_place, const void* src, size_t num, - mluStream stream) { + void* stream) { if (UNLIKELY(num == 0)) return; if (dst_place == src_place) { @@ -860,7 +988,8 @@ void Copy(platform::MLUPlace dst_place, << " to " << dst_place << " by mlu stream(" << stream << ")"; platform::RecordEvent record_event( "MLUMemcpyD2DAsync(same_mlu):MLU->MLU"); - platform::MLUMemcpyD2DAsync(dst, src, num, stream); + platform::MLUMemcpyD2DAsync(dst, src, num, + reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -877,7 +1006,7 @@ void Copy(platform::MLUPlace dst_place, << " to " << dst_place << " by mlu stream(" << stream << ")"; platform::RecordEvent record_event("MLUMemcpyPeerAsync:MLU->MLU"); platform::MLUMemcpyPeerAsync(dst, dst_place.device, src, src_place.device, - num, stream); + num, reinterpret_cast(stream)); } else { VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; @@ -892,7 +1021,7 @@ void Copy(platform::MLUPlace dst_place, template <> void Copy(pten::Place dst_place, void* dst, pten::Place src_place, const void* src, - size_t num, mluStream stream) { + size_t num, void* stream) { if (src_place.GetType() == pten::AllocationType::CPU && dst_place.GetType() == pten::AllocationType::CPU) { platform::CPUPlace place_dst, place_src; @@ -912,6 +1041,23 @@ void Copy(pten::Place dst_place, void* dst, platform::MLUPlace place_src(src_place.GetDeviceId()); platform::MLUPlace place_dst(dst_place.GetDeviceId()); return Copy(place_dst, dst, place_src, src, num, stream); +#ifdef PADDLE_WITH_CUSTOM_DEVICE + } else if (src_place.GetType() == pten::AllocationType::CPU && // NOLINT + dst_place.GetType() == pten::AllocationType::CUSTOM) { + platform::CPUPlace place_src; + platform::CustomPlace place_dst(dst_place); + return 
Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::CUSTOM && // NOLINT + dst_place.GetType() == pten::AllocationType::CPU) { + platform::CustomPlace place_src(src_place); + platform::CPUPlace place_dst; + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::CUSTOM && // NOLINT + dst_place.GetType() == pten::AllocationType::CUSTOM) { + platform::CustomPlace place_src(src_place); + platform::CustomPlace place_dst(dst_place); + return Copy(place_dst, dst, place_src, src, num, stream); +#endif } } @@ -919,7 +1065,7 @@ void Copy(pten::Place dst_place, void* dst, template <> void Copy(pten::MLUPlace dst_place, void* dst, pten::Place src_place, const void* src, - size_t num, mluStream stream) { + size_t num, void* stream) { Copy(pten::Place(dst_place.GetType(), dst_place.GetDeviceId()), dst, src_place, src, num, stream); } @@ -929,7 +1075,7 @@ template <> void Copy(pten::Place dst_place, void* dst, pten::MLUPlace src_place, const void* src, size_t num, - mluStream stream) { + void* stream) { Copy(dst_place, dst, pten::Place(src_place.GetType(), src_place.GetDeviceId()), src, num, stream); @@ -939,7 +1085,7 @@ void Copy(pten::Place dst_place, void* dst, template <> void Copy(pten::CPUPlace dst_place, void* dst, pten::Place src_place, const void* src, - size_t num, mluStream stream) { + size_t num, void* stream) { Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num, stream); } @@ -948,7 +1094,7 @@ template <> void Copy(pten::Place dst_place, void* dst, pten::CPUPlace src_place, const void* src, size_t num, - mluStream stream) { + void* stream) { Copy(dst_place, dst, pten::Place(src_place.GetType()), src, num, stream); } @@ -1013,7 +1159,7 @@ void Copy(pten::Place dst_place, void* dst, } #endif #ifdef PADDLE_WITH_IPU - else if (src_place.GetType() == pten::AllocationType::CPU && + else if (src_place.GetType() == pten::AllocationType::CPU && // NOLINT dst_place.GetType() == pten::AllocationType::IPU) { platform::IPUPlace place_dst(dst_place.GetDeviceId()); platform::CPUPlace place_src; @@ -1048,5 +1194,48 @@ void Copy(pten::CPUPlace dst_place, void* dst, Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num); } +#if defined(PADDLE_WITH_CUSTOM_DEVICE) && !defined(PADDLE_WITH_CUDA) && \ + !defined(PADDLE_WITH_ASCEND_CL) && !defined(PADDLE_WITH_HIP) && \ + !defined(PADDLE_WITH_MLU) + +template <> +void Copy(pten::Place dst_place, void* dst, + pten::Place src_place, const void* src, + size_t num, void* stream) { + if (src_place.GetType() == pten::AllocationType::CPU && // NOLINT + dst_place.GetType() == pten::AllocationType::CUSTOM) { + platform::CPUPlace place_src; + platform::CustomPlace place_dst(dst_place); + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::CUSTOM && // NOLINT + dst_place.GetType() == pten::AllocationType::CPU) { + platform::CustomPlace place_src(src_place); + platform::CPUPlace place_dst; + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::CUSTOM && // NOLINT + dst_place.GetType() == pten::AllocationType::CUSTOM) { + platform::CustomPlace place_src(src_place); + platform::CustomPlace place_dst(dst_place); + return Copy(place_dst, dst, place_src, src, num, stream); + } +} + +template <> +void Copy(pten::CPUPlace dst_place, void* dst, + pten::Place src_place, const void* src, + size_t num, void* stream) { + 
Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num, stream); +} + +// NOTE: only for (CPUPlace) -> (CPUPlace, CUDAPlace and CUDAPinnedPlace). +template <> +void Copy(pten::Place dst_place, void* dst, + pten::CPUPlace src_place, + const void* src, size_t num, + void* stream) { + Copy(dst_place, dst, pten::Place(src_place.GetType()), src, num, stream); +} +#endif + } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/memcpy.h b/paddle/fluid/memory/memcpy.h index 31d1a50e778f8c86400163a774af6dc04dce10ed..dd861a15b5c7b03e932eff8747668268b14618ef 100644 --- a/paddle/fluid/memory/memcpy.h +++ b/paddle/fluid/memory/memcpy.h @@ -36,66 +36,25 @@ namespace memory { template void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - -/** - * \brief Copy memory from one place to another place. - * - * \param[in] DstPlace Destination allocation place (CPU or GPU). - * \param[in] dst Destination memory address. - * \param[in] SrcPlace Source allocation place (CPU or GPU). - * \param[in] src Source memory address. - * \param[in] num memory size in bytes to copy. - * \param[in] stream CUDA stream. - * - * \note For GPU memory copy, CUDA stream need to be specified - * for asynchronously memory copy. - * - */ -template -void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num, - gpuStream_t stream); -#endif - -#ifdef PADDLE_WITH_ASCEND_CL -/** - * \brief Copy memory from one place to another place. - * - * \param[in] DstPlace Destination allocation place (CPU or NPU). - * \param[in] dst Destination memory address. - * \param[in] SrcPlace Source allocation place (CPU or NPU). - * \param[in] src Source memory address. - * \param[in] num memory size in bytes to copy. - * \param[in] stream NPU stream. - * - * \note For NPU memory copy, NPU stream need to be specified - * for asynchronously memory copy. - * - */ -template -void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num, - aclrtStream stream); -#endif - -#ifdef PADDLE_WITH_MLU /** * \brief Copy memory from one place to another place. * - * \param[in] DstPlace Destination allocation place (CPU or MLU). + * \param[in] DstPlace Destination allocation place (CPU or GPU or XPU or + * CustomDevice). * \param[in] dst Destination memory address. - * \param[in] SrcPlace Source allocation place (CPU or MLU). + * \param[in] SrcPlace Source allocation place (CPU or GPU or XPU or + * CustomDevice). * \param[in] src Source memory address. * \param[in] num memory size in bytes to copy. - * \param[in] stream MLU stream. + * \param[in] stream stream for asynchronously memory copy. * - * \note For MLU memory copy, MLU stream need to be specified - * for asynchronously memory copy. + * \note For GPU/XPU/CustomDevice memory copy, stream need to be specified + * for asynchronously memory copy, and type is restored in the + * implementation. * */ template void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num, - mluStream stream); -#endif - + void* stream); } // namespace memory } // namespace paddle diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc new file mode 100644 index 0000000000000000000000000000000000000000..506b57186965de8fff758a958cc0e87b374e64bc --- /dev/null +++ b/paddle/fluid/operators/math/math_function.cc @@ -0,0 +1,313 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/math_function.h" + +#ifdef PADDLE_WITH_MKLML +#include "paddle/fluid/platform/dynload/mklml.h" +#endif + +#ifdef PADDLE_USE_OPENBLAS +#include +#endif + +#include +#include +#include +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/operators/math/math_function_impl.h" +#include "paddle/fluid/platform/bfloat16.h" +#include "paddle/fluid/platform/float16.h" +#include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/kernels/funcs/eigen/common.h" +#include "unsupported/Eigen/CXX11/Tensor" + +namespace paddle { +namespace operators { +namespace math { + +using float16 = paddle::platform::float16; + +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant>; +template struct SetConstant>; + +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant>; +template struct SetConstant>; + +#ifdef PADDLE_WITH_XPU +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant>; +template struct SetConstant>; +#endif + +#define DEFINE_CPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose, RANK>; \ + template struct Transpose, RANK>; + +DEFINE_CPU_TRANS(1); +DEFINE_CPU_TRANS(2); +DEFINE_CPU_TRANS(3); +DEFINE_CPU_TRANS(4); +DEFINE_CPU_TRANS(5); +DEFINE_CPU_TRANS(6); + +template +struct TransposeNormal { + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& in, framework::Tensor* out, + const std::vector& axis) { + const int rank = axis.size(); + auto in_stride = framework::stride(in.dims()); + auto out_stride = framework::stride(out->dims()); + const T* in_ptr = in.data(); + T* out_ptr = out->data(); + + auto transpose_helper = [&](int64_t beg, int64_t end) { + for (int64_t out_idx = beg; out_idx < end; ++out_idx) { + int64_t in_idx = 0; + int64_t tmp_idx = out_idx; + // calculate the input index + for (int i = 0; i < rank; ++i) { + const int64_t coordinate = tmp_idx / out_stride[i]; + tmp_idx -= coordinate * out_stride[i]; + in_idx += coordinate * in_stride[axis[i]]; + 
+        }
+        out_ptr[out_idx] = in_ptr[in_idx];
+      }
+    };
+    transpose_helper(0, out->numel());
+  }
+};
+
+// define transpose normal
+#define DEFINE_CPU_TRANS_NORMAL(TYPE) \
+  template struct TransposeNormal<platform::CPUDeviceContext, TYPE>
+
+DEFINE_CPU_TRANS_NORMAL(platform::float16);
+DEFINE_CPU_TRANS_NORMAL(platform::bfloat16);
+DEFINE_CPU_TRANS_NORMAL(float);
+DEFINE_CPU_TRANS_NORMAL(double);
+DEFINE_CPU_TRANS_NORMAL(int);
+DEFINE_CPU_TRANS_NORMAL(int64_t);
+DEFINE_CPU_TRANS_NORMAL(bool);
+DEFINE_CPU_TRANS_NORMAL(int16_t);
+DEFINE_CPU_TRANS_NORMAL(uint8_t);
+DEFINE_CPU_TRANS_NORMAL(int8_t);
+DEFINE_CPU_TRANS_NORMAL(platform::complex<float>);
+DEFINE_CPU_TRANS_NORMAL(platform::complex<double>);
+
+struct TensorSetConstantCPU {
+  TensorSetConstantCPU(framework::Tensor* tensor, float value)
+      : tensor_(tensor), value_(value) {}
+  template <typename T>
+  void apply() const {
+    auto cpu = platform::CPUPlace();
+    auto* begin = tensor_->mutable_data<T>(cpu);
+    std::fill(begin, begin + tensor_->numel(), static_cast<T>(value_));
+  }
+  framework::Tensor* tensor_;
+  float value_;
+};
+
+template <>
+void set_constant_with_place<platform::XPUPlace>(
+    const platform::DeviceContext& context, framework::Tensor* tensor,
+    float value) {
+  PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported"));
+}
+
+template <>
+void set_constant_with_place<platform::NPUPlace>(
+    const platform::DeviceContext& context, framework::Tensor* tensor,
+    float value) {
+  PADDLE_THROW(platform::errors::Unimplemented("NPUPlace is not supported"));
+}
+
+template <>
+void set_constant_with_place<platform::NPUPinnedPlace>(
+    const platform::DeviceContext& context, framework::Tensor* tensor,
+    float value) {
+  PADDLE_THROW(
+      platform::errors::Unimplemented("NPUPinnedPlace is not supported"));
+}
+
+template <>
+void set_constant_with_place<platform::IPUPlace>(
+    const platform::DeviceContext& context, framework::Tensor* tensor,
+    float value) {
+  PADDLE_THROW(platform::errors::Unimplemented("IPUPlace is not supported"));
+}
+
+template <>
+void set_constant_with_place<platform::CPUPlace>(
+    const platform::DeviceContext& context, framework::Tensor* tensor,
+    float value) {
+  framework::VisitDataType(tensor->type(),
+                           TensorSetConstantCPU(tensor, value));
+}
+
+template <>
+void set_constant_with_place<platform::MLUPlace>(
+    const platform::DeviceContext& context, framework::Tensor* tensor,
+    float value) {
+  PADDLE_THROW(platform::errors::Unimplemented("MLUPlace is not supported"));
+}
+
+template <>
+void set_constant_with_place<platform::CustomPlace>(
+    const platform::DeviceContext& context, framework::Tensor* tensor,
+    float value) {
+  PADDLE_THROW(
+      platform::errors::Unimplemented("CustomPlace is not supported"));
+}
+
+template <>
+void set_constant_with_place<platform::CUDAPinnedPlace>(
+    const platform::DeviceContext& context, framework::Tensor* tensor,
+    float value) {
+  framework::VisitDataType(tensor->type(),
+                           TensorSetConstantCPU(tensor, value));
+}
+
+struct TensorSetConstantWithPlace : public boost::static_visitor<void> {
+  TensorSetConstantWithPlace(const platform::DeviceContext& context,
+                             framework::Tensor* tensor, float value)
+      : context_(context), tensor_(tensor), value_(value) {}
+
+  template <typename Place>
+  void operator()(Place place) const {
+    set_constant_with_place<Place>(context_, tensor_, value_);
+  }
+
+  const platform::DeviceContext& context_;
+  framework::Tensor* tensor_;
+  float value_;
+};
+
+void set_constant(const platform::DeviceContext& context,
+                  framework::Tensor* tensor, float value) {
+  TensorSetConstantWithPlace func(context, tensor, value);
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  // tensor->place().apply_visitor(func);
+  paddle::platform::VisitPlace(tensor->place(), func);
+#else
+  func(platform::CPUPlace());
+#endif
+}
+
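Note: the TransposeNormal functor above maps every output offset back to an input offset through row-major strides. A minimal standalone sketch of that index arithmetic follows (illustrative code only, not a Paddle API):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Row-major strides for a given shape.
static std::vector<int64_t> Strides(const std::vector<int64_t>& dims) {
  std::vector<int64_t> s(dims.size(), 1);
  for (int i = static_cast<int>(dims.size()) - 2; i >= 0; --i) {
    s[i] = s[i + 1] * dims[i + 1];
  }
  return s;
}

int main() {
  // Transpose a 2x3 row-major tensor with axis = {1, 0}.
  const std::vector<int64_t> in_dims = {2, 3};
  const std::vector<int64_t> axis = {1, 0};
  const std::vector<int64_t> out_dims = {in_dims[axis[0]], in_dims[axis[1]]};
  const auto in_stride = Strides(in_dims);
  const auto out_stride = Strides(out_dims);
  const std::vector<int> in = {1, 2, 3, 4, 5, 6};
  std::vector<int> out(in.size());
  for (int64_t out_idx = 0; out_idx < static_cast<int64_t>(out.size());
       ++out_idx) {
    int64_t in_idx = 0;
    int64_t tmp = out_idx;
    for (std::size_t i = 0; i < axis.size(); ++i) {
      const int64_t coord = tmp / out_stride[i];  // i-th output coordinate
      tmp -= coord * out_stride[i];
      in_idx += coord * in_stride[axis[i]];       // same coordinate in input
    }
    out[out_idx] = in[in_idx];
  }
  assert(out[1] == 4);  // out(0, 1) == in(1, 0)
  return 0;
}

For the 2x3 input transposed with axis = {1, 0}, output offset 1 decomposes into coordinate (0, 1), which reads input coordinate (1, 0), i.e. in[3] == 4.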
+template +struct RowwiseAdd { + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& vector, framework::Tensor* output) { + auto in_dims = input.dims(); + auto out_dims = output->dims(); + auto size = input.numel() / in_dims[0]; + PADDLE_ENFORCE_EQ( + vector.numel(), size, + platform::errors::InvalidArgument( + "The input vector size" + " should be equal to the size of each row of input tensor." + " Expected vector size=%d, but received %d", + size, vector.numel())); + const char* in_dims_cstr = in_dims.to_str().c_str(); + const char* out_dims_cstr = out_dims.to_str().c_str(); + PADDLE_ENFORCE_EQ(out_dims, in_dims, + platform::errors::InvalidArgument( + "The output tensor shape should be same as the input" + " tensor shape. Expected output tensor shape: %s," + " but received %s", + in_dims_cstr, out_dims_cstr)); + + auto in = framework::EigenMatrix::From(input); + auto vec = framework::EigenVector::Flatten(vector); + auto out = framework::EigenMatrix::From(*output); + + for (int64_t i = 0; i < in_dims[0]; ++i) { + out.chip(i, 0) = in.chip(i, 0) + vec; + } + } +}; + +template struct RowwiseAdd; +template struct RowwiseAdd; + +template struct ColwiseSum; +template struct ColwiseSum; +template struct ColwiseSum; +template struct ColwiseSum; + +template struct RowwiseSum; +template struct RowwiseSum; + +template struct RowwiseMean; +template struct RowwiseMean; + +template +struct ElementwiseAddTo { + void operator()(platform::CPUDeviceContext* ctx, const framework::Tensor& src, + framework::Tensor* dst) { + auto in = framework::EigenVector::Flatten(src); + auto out = framework::EigenVector::Flatten(*dst); + auto& place = *(ctx->eigen_device()); + out.device(place) = out + in; + } +}; + +template struct ElementwiseAddTo; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/platform/device/CMakeLists.txt b/paddle/fluid/platform/device/CMakeLists.txt index 2cd068badf51e8a3176de4ec80700ce7057862d1..ecad5340d71c1ae32339ab1c79bf37d947402747 100644 --- a/paddle/fluid/platform/device/CMakeLists.txt +++ b/paddle/fluid/platform/device/CMakeLists.txt @@ -1,3 +1,18 @@ +IF(WITH_CUSTOM_DEVICE) +cc_library(callback_manager SRCS callback_manager.cc DEPS enforce place) + +cc_library(device_guard SRCS device_guard.cc DEPS enforce place) + +cc_library(stream SRCS stream.cc DEPS callback_manager) + +cc_library(event SRCS event.cc DEPS enforce place) + +cc_library(device_base SRCS device_base.cc DEPS stream event callback_manager device_guard device_context flags) + +ENDIF() + +set(DEV_LIBS custom_device) + # GPU IF(WITH_GPU OR WITH_ROCM) add_subdirectory(gpu) @@ -22,3 +37,11 @@ ENDIF() IF(WITH_MLU) add_subdirectory(mlu) ENDIF() + +# CUSTOM +IF(WITH_CUSTOM_DEVICE) + add_subdirectory(custom) + + cc_library(device_manager SRCS device_manager.cc DEPS custom_device) + set(GLOB_DEV_LIB device_manager custom_device CACHE INTERNAL "Global DEV library") +ENDIF() diff --git a/paddle/fluid/platform/device/callback_manager.cc b/paddle/fluid/platform/device/callback_manager.cc new file mode 100644 index 0000000000000000000000000000000000000000..c677bc0262f0cfba0a5995afbde9e04f4bb0337e --- /dev/null +++ b/paddle/fluid/platform/device/callback_manager.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/platform/device/callback_manager.h"
+#include "paddle/fluid/platform/device/device_wrapper.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace platform {
+
+CallbackManager::CallbackManager(stream::Stream *stream)
+    : stream_(stream), thread_pool_(1) {}
+
+void CallbackManager::AddCallback(std::function<void()> callback) const {
+  auto *callback_func = new std::function<void()>(std::move(callback));
+  auto *func = new std::function<void()>([this, callback_func] {
+    std::lock_guard<std::mutex> lock(mtx_);
+    last_future_ = thread_pool_.enqueue([callback_func] {
+      std::unique_ptr<std::function<void()>> releaser(callback_func);
+      (*callback_func)();
+    });
+  });
+
+  platform::DeviceManager::GetDeviceWithPlace(stream_->GetPlace())
+      ->AddCallback(stream_, func);
+}
+
+void CallbackManager::Wait() const {
+  platform::DeviceManager::GetDeviceWithPlace(stream_->GetPlace())
+      ->SynchronizeStream(stream_);
+
+  {
+    std::lock_guard<std::mutex> lock(mtx_);
+    if (last_future_.valid()) {
+      last_future_.wait();
+    }
+  }
+}
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/device/callback_manager.h b/paddle/fluid/platform/device/callback_manager.h
new file mode 100644
index 0000000000000000000000000000000000000000..0edc694c94bb7846ac6081bccc0dc7fecd61adcb
--- /dev/null
+++ b/paddle/fluid/platform/device/callback_manager.h
@@ -0,0 +1,62 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
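Note: CallbackManager serializes host callbacks by construction: thread_pool_(1) is a one-worker pool, so callbacks handed over from the device-stream trampoline run strictly in FIFO order, and Wait() only needs to join the last future after synchronizing the stream. A minimal standalone sketch of the same one-worker ordering idiom (illustrative, not the Paddle classes):

#include <condition_variable>
#include <functional>
#include <mutex>
#include <queue>
#include <thread>

// One worker thread => tasks run in FIFO order, like thread_pool_(1).
class SerialQueue {
 public:
  SerialQueue() : worker_([this] { Run(); }) {}
  ~SerialQueue() {
    {
      std::lock_guard<std::mutex> g(mtx_);
      done_ = true;
    }
    cv_.notify_one();
    worker_.join();  // drains any queued tasks before returning
  }
  void Enqueue(std::function<void()> fn) {
    {
      std::lock_guard<std::mutex> g(mtx_);
      tasks_.push(std::move(fn));
    }
    cv_.notify_one();
  }

 private:
  void Run() {
    for (;;) {
      std::function<void()> fn;
      {
        std::unique_lock<std::mutex> lk(mtx_);
        cv_.wait(lk, [this] { return done_ || !tasks_.empty(); });
        if (tasks_.empty()) return;  // done_ set and queue drained
        fn = std::move(tasks_.front());
        tasks_.pop();
      }
      fn();
    }
  }
  std::mutex mtx_;
  std::condition_variable cv_;
  std::queue<std::function<void()>> tasks_;
  bool done_ = false;
  std::thread worker_;
};

int main() {
  SerialQueue q;
  int x = 0;
  q.Enqueue([&] { x = 1; });
  q.Enqueue([&] { x = 2; });  // runs strictly after the first task
  return 0;
}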
+
+#pragma once
+
+#include <ThreadPool.h>
+
+#ifdef PADDLE_WITH_CUDA
+#include <cuda.h>
+#include <cuda_runtime.h>
+#endif
+
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_runtime.h>
+#endif
+
+#include <functional>
+#include <future>  // NOLINT
+#include <memory>
+#include <mutex>  // NOLINT
+
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace platform {
+
+namespace stream {
+class Stream;
+}  // namespace stream
+
+// NOTE(zjl): clean CallbackManager to make compilation faster
+// Make CallbackManager thread-safe
+class CallbackManager {
+ public:
+  explicit CallbackManager(stream::Stream* stream);
+
+  ~CallbackManager() = default;
+
+  void AddCallback(std::function<void()> callback) const;
+
+  void Wait() const;
+
+ private:
+  stream::Stream* stream_;
+  mutable ::ThreadPool thread_pool_;
+  mutable std::mutex mtx_;
+  mutable std::future<void> last_future_;
+};
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/device/custom/CMakeLists.txt b/paddle/fluid/platform/device/custom/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f39c60c0c68edcdaca4bd4a0b25a9ec07453280e
--- /dev/null
+++ b/paddle/fluid/platform/device/custom/CMakeLists.txt
@@ -0,0 +1,4 @@
+IF(WITH_CUSTOM_DEVICE)
+cc_library(custom_device SRCS custom_device.cc DEPS device_base device_context)
+cc_test(custom_device_test SRCS custom_device_test.cc DEPS device_manager device_context )
+ENDIF()
diff --git a/paddle/fluid/platform/device/custom/custom_device.cc b/paddle/fluid/platform/device/custom/custom_device.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c5b98d3e2289588144e864bcbaed98f345bfad3c
--- /dev/null
+++ b/paddle/fluid/platform/device/custom/custom_device.cc
@@ -0,0 +1,672 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
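Note: the plugin boundary that custom_device.cc (below) programs against is a plain C struct of function pointers; required entries are validated at load time, and optional ones may stay null, in which case the core falls back to an emulation. A minimal sketch of that vtable-and-validation pattern (hypothetical types, not the real device_ext.h):

#include <cstddef>
#include <cstdio>

extern "C" {
typedef int (*GetCountFn)(size_t* count);

// Mirrors the CHECK_PTR(ptr, required) idea used at load time.
struct MiniDeviceInterface {
  GetCountFn get_device_count;    // required: must be non-null
  GetCountFn get_driver_version;  // optional: may stay null
};
}

static int FakeGetCount(size_t* count) {
  *count = 1;
  return 0;  // 0 plays the role of C_SUCCESS here
}

static bool Valid(const MiniDeviceInterface* itf) {
  return itf->get_device_count != nullptr;  // optional slots may be null
}

int main() {
  MiniDeviceInterface itf = {};  // zeroed, like the memset in this PR
  itf.get_device_count = FakeGetCount;
  if (!Valid(&itf)) return 1;
  size_t n = 0;
  itf.get_device_count(&n);
  std::printf("devices: %zu\n", n);
  return 0;
}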
+ +#include "paddle/fluid/platform/device/device_base.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/device/event.h" +#include "paddle/fluid/platform/device/stream.h" +#include "paddle/fluid/platform/device_context.h" + +static bool operator==(const C_Device_st& d1, const C_Device_st& d2) { + return d1.id == d2.id; +} + +namespace paddle { +namespace platform { + +class CustomDevice : public DeviceInterface { + public: + CustomDevice(const std::string& type, int priority, bool is_custom, + std::unique_ptr pimpl, void* dso_handle) + : DeviceInterface(type, priority, is_custom), + pimpl_(std::move(pimpl)), + dso_handle_(dso_handle) { + Initialize(); + } + + ~CustomDevice() override { Finalize(); } + + size_t GetDeviceCount() override { + size_t count; + if (pimpl_->get_device_count(&count) != C_SUCCESS) { + count = 0; + } + return count; + } + + std::vector GetDeviceList() override { + size_t count = GetDeviceCount(); + std::vector devices(count); + pimpl_->get_device_list(devices.data()); + return devices; + } + + C_DeviceInterface* Impl() { return pimpl_.get(); } + + void SynchronizeDevice(size_t dev_id) override { + const auto device = &devices_pool[dev_id]; + + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->synchronize_device(device)); + } + + void Initialize() override { + if (pimpl_->initialize && pimpl_->initialize() != C_SUCCESS) { + LOG(ERROR) << "Initialize " << Type() << " Failed\n"; + exit(-1); + } + auto devices = GetDeviceList(); + for (auto dev_id : devices) { + C_Device_st device; + device.id = dev_id; + devices_pool[dev_id] = device; + InitDevice(dev_id); + } + } + + void Finalize() override { + auto devices = GetDeviceList(); + for (auto dev_id : devices) { + // SetDevice(dev_id); + // SynchronizeDevice(dev_id); + DeInitDevice(dev_id); + } + + bool ok = true; + if (pimpl_->finalize && pimpl_->finalize() != C_SUCCESS) { + LOG(ERROR) << "Finalize " << Type() << " Failed\n"; + ok = false; + } + if (dso_handle_) { + dlclose(dso_handle_); + dso_handle_ = nullptr; + } + if (!ok) { + exit(1); + } + } + + void InitDevice(size_t dev_id) override { + if (pimpl_->init_device) { + // Core set logical id, and Plugin replace it with physical id + const auto device = &devices_pool[dev_id]; + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->init_device(device)); + } + } + + void DeInitDevice(size_t dev_id) override { + if (pimpl_->deinit_device) { + const auto device = &devices_pool[dev_id]; + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->deinit_device(device)); + } + } + + void SetDevice(size_t dev_id) override { + const auto device = &devices_pool[dev_id]; + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->set_device(device)); + } + + int GetDevice() override { + C_Device_st device; + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->get_device(&device)); + return device.id; + } + + void CreateStream(size_t dev_id, stream::Stream* stream, + const stream::Stream::Priority& priority = + stream::Stream::Priority::kNormal, + const stream::Stream::Flag& flag = + stream::Stream::Flag::kDefaultFlag) override { + if (priority != stream::Stream::Priority::kNormal || + flag != stream::Stream::Flag::kDefaultFlag) { + PADDLE_THROW(platform::errors::Unavailable( + "priority != stream::Stream::Priority::kNormal || flag != " + "stream::Stream::Flag::kDefaultFlag is not allowed on " + "CustomDevice.")); + } + const auto device = &devices_pool[dev_id]; + C_Stream c_stream; + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->create_stream(device, &c_stream)); + 
stream->set_stream(c_stream); + } + + void DestroyStream(size_t dev_id, stream::Stream* stream) override { + const auto device = &devices_pool[dev_id]; + + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->destroy_stream( + device, reinterpret_cast(stream->raw_stream()))); + } + + void SynchronizeStream(size_t dev_id, const stream::Stream* stream) override { + const auto device = &devices_pool[dev_id]; + + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->synchronize_stream( + device, reinterpret_cast(stream->raw_stream()))); + } + + bool QueryStream(size_t dev_id, const stream::Stream* stream) override { + const auto device = &devices_pool[dev_id]; + + if (!pimpl_->query_stream) { + SynchronizeStream(dev_id, stream); + return true; + } + if (pimpl_->query_stream(device, reinterpret_cast( + stream->raw_stream())) == C_SUCCESS) { + return true; + } + return false; + } + + void AddCallback(size_t dev_id, stream::Stream* stream, + stream::Stream::Callback* callback) override { + if (!pimpl_->stream_add_callback) { + PADDLE_THROW(platform::errors::Unavailable( + "AddCallback is not supported on %s.", Type())); + } else { + const auto device = &devices_pool[dev_id]; + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->stream_add_callback( + device, reinterpret_cast(stream->raw_stream()), + [](C_Device device, C_Stream stream, void* user_data, + C_Status* status) { + std::unique_ptr> func( + reinterpret_cast*>(user_data)); + (*func)(); + }, + callback)); + } + } + + void CreateEvent(size_t dev_id, event::Event* event, + event::Event::Flag flags) override { + const auto device = &devices_pool[dev_id]; + C_Event c_event; + + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->create_event(device, &c_event)); + event->set_event(c_event); + } + + void DestroyEvent(size_t dev_id, event::Event* event) override { + const auto device = &devices_pool[dev_id]; + + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->destroy_event( + device, reinterpret_cast(event->raw_event()))); + } + + void RecordEvent(size_t dev_id, const event::Event* event, + const stream::Stream* stream) override { + const auto device = &devices_pool[dev_id]; + + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->record_event( + device, reinterpret_cast(stream->raw_stream()), + reinterpret_cast(event->raw_event()))); + } + + void SynchronizeEvent(size_t dev_id, const event::Event* event) override { + const auto device = &devices_pool[dev_id]; + + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->synchronize_event( + device, reinterpret_cast(event->raw_event()))); + } + + bool QueryEvent(size_t dev_id, const event::Event* event) override { + const auto device = &devices_pool[dev_id]; + + if (!pimpl_->query_event) { + SynchronizeEvent(dev_id, event); + return true; + } + if (pimpl_->query_event(device, reinterpret_cast( + event->raw_event())) == C_SUCCESS) { + return true; + } + return false; + } + + void StreamWaitEvent(size_t dev_id, const stream::Stream* stream, + const event::Event* event) override { + const auto device = &devices_pool[dev_id]; + + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->stream_wait_event( + device, reinterpret_cast(stream->raw_stream()), + reinterpret_cast(event->raw_event()))); + } + + void MemoryCopyH2D(size_t dev_id, void* dst, const void* src, size_t size, + const stream::Stream* stream = nullptr) override { + const auto device = &devices_pool[dev_id]; + auto place = platform::CustomPlace(Type(), dev_id); + + if (stream && stream->raw_stream() && pimpl_->async_memory_copy_h2d) { + C_Stream c_stream = reinterpret_cast(stream->raw_stream()); + 
PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->async_memory_copy_h2d(device, c_stream, dst, src, size)); + } else { + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + pool.Get(place)->Wait(); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->memory_copy_h2d(device, dst, src, size)); + } + } + + void MemoryCopyD2H(size_t dev_id, void* dst, const void* src, size_t size, + const stream::Stream* stream = nullptr) override { + const auto device = &devices_pool[dev_id]; + auto place = platform::CustomPlace(Type(), dev_id); + + if (stream && stream->raw_stream() && pimpl_->async_memory_copy_d2h) { + C_Stream c_stream = reinterpret_cast(stream->raw_stream()); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->async_memory_copy_d2h(device, c_stream, dst, src, size)); + } else { + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + pool.Get(place)->Wait(); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->memory_copy_d2h(device, dst, src, size)); + } + } + + void MemoryCopyD2D(size_t dev_id, void* dst, const void* src, size_t size, + const stream::Stream* stream = nullptr) override { + const auto device = &devices_pool[dev_id]; + auto place = platform::CustomPlace(Type(), dev_id); + + if (stream && stream->raw_stream() && pimpl_->async_memory_copy_d2d) { + C_Stream c_stream = reinterpret_cast(stream->raw_stream()); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->async_memory_copy_d2d(device, c_stream, dst, src, size)); + } else { + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + pool.Get(place)->Wait(); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->memory_copy_d2d(device, dst, src, size)); + } + } + + void MemoryCopyP2P(const Place& dst_place, void* dst, size_t src_dev_id, + const void* src, size_t size, + const stream::Stream* stream = nullptr) override { + int dst_dev_id = PlaceToId(dst_place); + auto dst_device = &devices_pool[dst_dev_id]; + auto src_device = &devices_pool[src_dev_id]; + + if (stream && stream->raw_stream()) { + if (!pimpl_->async_memory_copy_p2p) { + MemoryCopyP2P(dst_place, dst, src_dev_id, src, size); + } else { + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->async_memory_copy_p2p( + dst_device, src_device, + reinterpret_cast(stream->raw_stream()), dst, src, size)); + } + } else { + if (!pimpl_->memory_copy_p2p) { + std::unique_ptr tmp(new uint8_t[size]); + MemoryCopyD2H(src_dev_id, tmp.get(), src, size); + MemoryCopyH2D(dst_dev_id, dst, tmp.get(), size); + } else { + auto src_place = platform::CustomPlace(Type(), src_dev_id); + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + pool.Get(src_place)->Wait(); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->memory_copy_p2p(dst_device, src_device, dst, src, size)); + } + } + } + + void* MemoryAllocate(size_t dev_id, size_t size) override { + void* ptr = nullptr; + const auto device = &devices_pool[dev_id]; + + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->device_memory_allocate(device, &ptr, size)); + return ptr; + } + + void MemoryDeallocate(size_t dev_id, void* ptr, size_t size) override { + const auto device = &devices_pool[dev_id]; + + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->device_memory_deallocate(device, ptr, size)); + } + + void* MemoryAllocateHost(size_t dev_id, size_t size) override { + void* ptr = nullptr; + const auto device = &devices_pool[dev_id]; + + if (!pimpl_->unified_memory_allocate) { + PADDLE_THROW(platform::errors::Unavailable( + "MemoryAllocKind::Host is not supported 
on %s.", Type())); + } else { + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->host_memory_allocate(device, &ptr, size)); + } + return ptr; + } + + void MemoryDeallocateHost(size_t dev_id, void* ptr, size_t size) override { + const auto device = &devices_pool[dev_id]; + + if (!pimpl_->host_memory_deallocate) { + PADDLE_THROW(platform::errors::Unavailable( + "MemoryAllocKind::Host is not supported on %s.", Type())); + } else { + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->host_memory_deallocate(device, ptr, size)); + } + } + + void* MemoryAllocateUnified(size_t dev_id, size_t size) override { + void* ptr = nullptr; + const auto device = &devices_pool[dev_id]; + + if (!pimpl_->unified_memory_allocate) { + PADDLE_THROW(platform::errors::Unavailable( + "MemoryAllocKind::Unified is not supported on %s.", Type())); + } else { + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->unified_memory_allocate(device, &ptr, size)); + } + return ptr; + } + + void MemoryDeallocateUnified(size_t dev_id, void* ptr, size_t size) override { + const auto device = &devices_pool[dev_id]; + + if (!pimpl_->unified_memory_deallocate) { + PADDLE_THROW(platform::errors::Unavailable( + "MemoryAllocKind::Host is not supported on %s.", Type())); + } else { + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->unified_memory_deallocate(device, ptr, size)); + } + } + + void MemorySet(size_t dev_id, void* ptr, uint8_t value, + size_t size) override { + const auto device = &devices_pool[dev_id]; + + if (pimpl_->device_memory_set) { + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->device_memory_set(device, ptr, value, size)); + } else { + std::unique_ptr tmp(new uint8_t[size]); + memset(tmp.get(), value, size); + MemoryCopyH2D(dev_id, ptr, tmp.get(), size); + } + } + + void MemoryStats(size_t dev_id, size_t* total, size_t* free) override { + const auto device = &devices_pool[dev_id]; + + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->device_memory_stats(device, total, free)); + + size_t used = *total - *free; + VLOG(10) << Type() << " memory usage " << (used >> 20) << "M/" + << (*total >> 20) << "M, " << (*free >> 20) + << "M available to allocate"; + } + + size_t GetMinChunkSize(size_t dev_id) override { + const auto device = &devices_pool[dev_id]; + + size_t size = 0; + pimpl_->device_min_chunk_size(device, &size); + VLOG(10) << Type() << " min chunk size " << size << "B"; + return size; + } + + size_t GetMaxChunkSize(size_t dev_id) override { + const auto device = &devices_pool[dev_id]; + + size_t size = 0; + if (pimpl_->device_max_chunk_size) { + pimpl_->device_max_chunk_size(device, &size); + VLOG(10) << Type() << " max chunk size " << size << "B"; + } else { + return DeviceInterface::GetMaxChunkSize(dev_id); + } + return size; + } + + size_t GetMaxAllocSize(size_t dev_id) override { + const auto device = &devices_pool[dev_id]; + + size_t size = 0; + if (pimpl_->device_max_alloc_size) { + pimpl_->device_max_alloc_size(device, &size); + VLOG(10) << Type() << " max alloc size " << (size >> 20) << "M"; + } else { + return DeviceInterface::GetMaxAllocSize(dev_id); + } + return size; + } + + size_t GetInitAllocSize(size_t dev_id) override { + const auto device = &devices_pool[dev_id]; + size_t size = 0; + if (pimpl_->device_init_alloc_size) { + pimpl_->device_init_alloc_size(device, &size); + VLOG(10) << Type() << " init alloc size " << (size >> 20) << "M"; + } else { + return DeviceInterface::GetInitAllocSize(dev_id); + } + return size; + } + + size_t GetReallocSize(size_t dev_id) override { + const auto device = 
&devices_pool[dev_id]; + size_t size = 0; + if (pimpl_->device_realloc_size) { + pimpl_->device_realloc_size(device, &size); + VLOG(10) << Type() << " realloc size " << (size >> 20) << "M"; + } else { + return DeviceInterface::GetReallocSize(dev_id); + } + return size; + } + + size_t GetExtraPaddingSize(size_t dev_id) override { + const auto device = &devices_pool[dev_id]; + + size_t padding_size = 0; + if (pimpl_->device_extra_padding_size) { + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->device_extra_padding_size(device, &padding_size)); + VLOG(10) << Type() << " extra padding size " << (padding_size >> 20) + << "M"; + } else { + return DeviceInterface::GetExtraPaddingSize(dev_id); + } + return 0; + } + + size_t GetComputeCapability() override { + size_t compute_capability = 0; + if (pimpl_->get_compute_capability) { + pimpl_->get_compute_capability(&compute_capability); + } + VLOG(10) << Type() << " get compute capability " << compute_capability; + return compute_capability; + } + + size_t GetRuntimeVersion() override { + size_t version = 0; + if (pimpl_->get_runtime_version) { + pimpl_->get_runtime_version(&version); + } + VLOG(10) << Type() << " get runtime version " << version; + return version; + } + + size_t GetDriverVersion() override { + size_t version = 0; + if (pimpl_->get_driver_version) { + pimpl_->get_driver_version(&version); + } + VLOG(10) << Type() << " get driver version " << version; + return version; + } + + private: + inline int PlaceToIdNoCheck(const Place& place) { + int dev_id = place.GetDeviceId(); + return dev_id; + } + + inline int PlaceToId(const Place& place) { + int dev_id = PlaceToIdNoCheck(place); + PADDLE_ENFORCE_NE(devices_pool.find(dev_id), devices_pool.end(), + platform::errors::NotFound( + "Cannot found %s %d, please check visible devices", + Type(), dev_id)); + return dev_id; + } + + std::unique_ptr pimpl_; + void* dso_handle_; + std::unordered_map devices_pool; +}; + +bool ValidCustomCustomRuntimeParams(const CustomRuntimeParams* params) { +#define CHECK_PTR(ptr, required) \ + if (params->interface->ptr == nullptr && required) { \ + LOG(WARNING) << "CustomRuntime [type: " << params->device_type \ + << "] pointer: " << #ptr << " is not set."; \ + return false; \ + } + + int version = params->version.major * 10000 + params->version.minor * 100 + + params->version.patch; + const int runtime_version = PADDLE_CUSTOM_RUNTIME_MAJOR_VERSION * 10000 + + PADDLE_CUSTOM_RUNTIME_MINOR_VERSION * 100 + + PADDLE_CUSTOM_RUNTIME_PATCH_VERSION; + + if (version < runtime_version) { + LOG(WARNING) << "CustomRuntime [type: " << params->device_type + << "] version: " << version + << " < PADDLE_CUSTOM_RUNTIME_VERSION " << runtime_version; + return false; + } + + CHECK_PTR(initialize, false); + CHECK_PTR(finalize, false) + + CHECK_PTR(init_device, false); + CHECK_PTR(set_device, true); + CHECK_PTR(get_device, true); + CHECK_PTR(deinit_device, false); + + CHECK_PTR(create_stream, true); + CHECK_PTR(destroy_stream, true); + CHECK_PTR(query_stream, false); + CHECK_PTR(stream_add_callback, false); + + CHECK_PTR(create_event, true); + CHECK_PTR(record_event, true); + CHECK_PTR(destroy_event, true); + CHECK_PTR(query_event, false); + + CHECK_PTR(synchronize_device, false); + CHECK_PTR(synchronize_stream, true); + CHECK_PTR(synchronize_event, true); + CHECK_PTR(stream_wait_event, true); + + CHECK_PTR(device_memory_allocate, true); + CHECK_PTR(device_memory_deallocate, true); + CHECK_PTR(host_memory_allocate, false); + CHECK_PTR(host_memory_deallocate, false); + 
CHECK_PTR(unified_memory_allocate, false); + CHECK_PTR(unified_memory_deallocate, false); + CHECK_PTR(memory_copy_h2d, true); + CHECK_PTR(memory_copy_d2h, true); + CHECK_PTR(memory_copy_d2d, true); + CHECK_PTR(memory_copy_p2p, false); + CHECK_PTR(async_memory_copy_h2d, false); + CHECK_PTR(async_memory_copy_d2h, false); + CHECK_PTR(async_memory_copy_d2d, false); + CHECK_PTR(async_memory_copy_p2p, false); + + CHECK_PTR(get_device_count, true); + CHECK_PTR(get_device_list, true); + CHECK_PTR(device_memory_stats, true); + + CHECK_PTR(device_min_chunk_size, true); + CHECK_PTR(device_max_chunk_size, false); + CHECK_PTR(device_max_alloc_size, false); + CHECK_PTR(device_extra_padding_size, false); + CHECK_PTR(get_compute_capability, false); + CHECK_PTR(get_runtime_version, false); + CHECK_PTR(get_driver_version, false); + + return true; +#undef CHECK_PTR +} + +typedef bool (*RegisterDevicePluginFn)(CustomRuntimeParams* runtime_params); + +bool LoadCustomRuntimeLib(const CustomRuntimeParams& runtime_params, + std::unique_ptr device_interface, + void* dso_handle) { + if (ValidCustomCustomRuntimeParams(&runtime_params)) { + auto device = + std::make_unique(runtime_params.device_type, 255, true, + std::move(device_interface), dso_handle); + if (false == DeviceManager::Register(std::move(device))) { + LOG(WARNING) << "Skip this library. Register failed!!! there may be a " + "Custom Runtime with the same name."; + return false; + } + } else { + LOG(WARNING) + << "Skip this library. Wrong parameters!!! please check the version " + "compatibility between PaddlePaddle and Custom Runtime."; + return false; + } + return true; +} + +bool LoadCustomRuntimeLib(void* dso_handle) { + CustomRuntimeParams runtime_params; + std::memset(&runtime_params, 0, sizeof(CustomRuntimeParams)); + runtime_params.size = sizeof(CustomRuntimeParams); + auto device_interface = std::make_unique(); + runtime_params.interface = device_interface.get(); + std::memset(runtime_params.interface, 0, sizeof(C_DeviceInterface)); + runtime_params.interface->size = sizeof(C_DeviceInterface); + + RegisterDevicePluginFn init_plugin_fn = + reinterpret_cast(dlsym(dso_handle, "InitPlugin")); + if (!init_plugin_fn) { + LOG(WARNING) << "Skip this library. InitPlugin symbol not found."; + return false; + } + init_plugin_fn(&runtime_params); + if (runtime_params.device_type == nullptr) { + LOG(WARNING) + << "Skip this library. InitPlugin failed!!! please check the version " + "compatibility between PaddlePaddle and Custom Runtime."; + return false; + } + return LoadCustomRuntimeLib(runtime_params, std::move(device_interface), + dso_handle); +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/custom/custom_device_test.cc b/paddle/fluid/platform/device/custom/custom_device_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..6a874ea221228ef016ad3bff60620f949582cf9e --- /dev/null +++ b/paddle/fluid/platform/device/custom/custom_device_test.cc @@ -0,0 +1,193 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gtest/gtest.h>
+#include <string>
+
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/platform/device/custom/fake_cpu_device.h"
+#include "paddle/fluid/platform/device/device_manager.h"
+#include "paddle/fluid/platform/device_context.h"
+
+void RegisterDevice() {
+  CustomRuntimeParams runtime_params;
+  runtime_params.size = sizeof(CustomRuntimeParams);
+  auto device_interface = std::make_unique<C_DeviceInterface>();
+  runtime_params.interface = device_interface.get();
+  std::memset(runtime_params.interface, 0, sizeof(C_DeviceInterface));
+  runtime_params.interface->size = sizeof(C_DeviceInterface);
+
+  InitFakeCPUDevice(&runtime_params);
+  EXPECT_TRUE(paddle::platform::LoadCustomRuntimeLib(
+      runtime_params, std::move(device_interface), nullptr));
+}
+
+void InitDevice() {
+  RegisterDevice();
+  EXPECT_GT(static_cast<int>(
+                paddle::platform::DeviceManager::GetAllDeviceTypes().size()),
+            0);
+  auto place = paddle::platform::CustomPlace(DEVICE_TYPE, 0);
+  auto device = paddle::platform::DeviceManager::GetDeviceWithPlace(place);
+  EXPECT_NE(device, nullptr);
+
+  std::vector<paddle::platform::Place> places;
+  auto device_types = paddle::platform::DeviceManager::GetAllDeviceTypes();
+  for (auto dev_type : device_types) {
+    auto devices = paddle::platform::DeviceManager::GetDeviceList(dev_type);
+    for (auto dev_id : devices) {
+      places.push_back(
+          paddle::platform::PlaceHelper::CreatePlace(dev_type, dev_id));
+    }
+  }
+  EXPECT_GT(static_cast<int>(places.size()), 0);
+
+  paddle::platform::DeviceContextPool::Init(places);
+}
+
+void TestDeviceInterface(const paddle::platform::Place& place) {
+  std::cout << "TestDeviceInterface on " << place << std::endl;
+  if (paddle::platform::is_custom_place(place)) {
+    auto device = paddle::platform::DeviceManager::GetDeviceWithPlace(place);
+    auto dev_type = paddle::platform::PlaceHelper::GetDeviceType(place);
+    auto p1 = device->MemoryAllocate(
+        paddle::platform::DeviceManager::GetMinChunkSize(place));
+    EXPECT_NE(p1, nullptr);
+
+    paddle::platform::DeviceManager::SetDevice(place);
+    auto dev_id = paddle::platform::DeviceManager::GetDevice(dev_type);
+    EXPECT_EQ(dev_id, place.GetDeviceId());
+  }
+}
+
+void TestTensorMutableData(const paddle::platform::Place& place) {
+  std::cout << "TestTensorMutableData on " << place << std::endl;
+  paddle::framework::Tensor src_tensor;
+  float* p1 = nullptr;
+  float* p2 = nullptr;
+  // initialization
+  p1 = src_tensor.mutable_data<float>(paddle::framework::make_ddim({1, 2, 3}),
+                                      place);
+  auto p1_holder = src_tensor.Holder();
+  EXPECT_NE(p1, nullptr);
+  // set src_tensor a new dim with large size
+  // memory is supposed to be re-allocated
+  p2 = src_tensor.mutable_data<float>(paddle::framework::make_ddim({3, 1024}),
+                                      place);
+  auto p2_holder = src_tensor.Holder();
+  EXPECT_NE(p2, nullptr);
+  EXPECT_NE(p1_holder.get(), p2_holder.get());
+  // set src_tensor a new dim with same size
+  // memory block is supposed to be unchanged
+  p1 = src_tensor.mutable_data<float>(paddle::framework::make_ddim({2, 2, 3}),
+                                      place);
+  EXPECT_EQ(p1, p2);
+  // set src_tensor a new dim with smaller size
+  // memory block is supposed to be unchanged
+  p2 = src_tensor.mutable_data<float>(paddle::framework::make_ddim({2, 2}),
+                                      place);
+  EXPECT_EQ(p1, p2);
+}
+
+void TestTensorShareDataWith(const paddle::platform::Place& place) {
+  std::cout << "TestTensorShareDataWith on " << place << std::endl;
+  paddle::framework::Tensor src_tensor;
+  paddle::framework::Tensor
dst_tensor; + src_tensor.mutable_data(paddle::framework::make_ddim({2, 3, 4}), place); + dst_tensor.ShareDataWith(src_tensor); + ASSERT_EQ(src_tensor.data(), dst_tensor.data()); +} + +void TestTensorUtils(const paddle::platform::Place& place) { + if (paddle::platform::is_custom_place(place) == false) { + return; + } + paddle::framework::Tensor src_tensor; + paddle::framework::Tensor gpu_tensor; + paddle::framework::Tensor dst_tensor; + + int* src_ptr = src_tensor.mutable_data( + paddle::framework::make_ddim({3, 3}), paddle::platform::CPUPlace()); + + int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + memcpy(src_ptr, arr, 9 * sizeof(int)); + + // CPU Tensor to GPU Tensor + paddle::platform::CustomDeviceContext gpu_ctx(place); + paddle::framework::TensorCopy(src_tensor, place, gpu_ctx, &gpu_tensor); +#if 0 + // GPU Tensor to CPU Tensor + auto cpu_place = new paddle::platform::CPUPlace(); + paddle::framework::TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); + + // Sync before Compare Tensors + gpu_ctx.Wait(); + const int* dst_ptr = dst_tensor.data(); + EXPECT_NE(src_ptr, dst_ptr); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + + // Copy the same tensor + paddle::framework::TensorCopy(gpu_tensor, place, gpu_ctx, &gpu_tensor); + gpu_ctx.Wait(); + const int* dst_ptr_tmp = dst_tensor.data(); + EXPECT_NE(src_ptr, dst_ptr_tmp); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], dst_ptr_tmp[i]); + } + + paddle::framework::Tensor slice_tensor = src_tensor.Slice(1, 2); + + // CPU Slice Tensor to GPU Tensor + paddle::framework::TensorCopy(slice_tensor, place, gpu_ctx, &gpu_tensor); + + // GPU Tensor to CPU Tensor + paddle::framework::TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); + + // Sync before Compare Slice Tensors + gpu_ctx.Wait(); + const int* slice_ptr = slice_tensor.data(); + dst_ptr = dst_tensor.data(); + EXPECT_NE(dst_ptr, slice_ptr); + for (size_t i = 0; i < 3; ++i) { + EXPECT_EQ(dst_ptr[i], slice_ptr[i]); + } + + EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout()); +#endif +} + +TEST(CustomDevice, Tensor) { + InitDevice(); + auto dev_types = paddle::platform::DeviceManager::GetAllDeviceTypes(); + for (const auto& dev_type : dev_types) { + std::cout << "Test on " << dev_type << std::endl; + EXPECT_GT(static_cast( + paddle::platform::DeviceManager::GetDeviceCount(dev_type)), + 0); + auto place = paddle::platform::PlaceHelper::CreatePlace(dev_type); + + TestDeviceInterface(place); + TestTensorMutableData(place); + TestTensorShareDataWith(place); + TestTensorUtils(place); + } +} + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/paddle/fluid/platform/device/custom/enforce_custom.h b/paddle/fluid/platform/device/custom/enforce_custom.h new file mode 100644 index 0000000000000000000000000000000000000000..fbdb4627aba2662a2a12cc933a3a4c6e61aa55d5 --- /dev/null +++ b/paddle/fluid/platform/device/custom/enforce_custom.h @@ -0,0 +1,56 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#ifdef PADDLE_WITH_CUSTOM_DEVICE
+#include "paddle/fluid/platform/device/device_ext.h"
+
+namespace paddle {
+namespace platform {
+namespace details {
+template <typename T>
+struct CustomDeviceStatusType {};
+
+#define DEFINE_CUSTOM_DEVICE_STATUS_TYPE(type, success_value) \
+  template <>                                                 \
+  struct CustomDeviceStatusType<type> {                       \
+    using Type = type;                                        \
+    static constexpr Type kSuccess = success_value;           \
+  }
+
+DEFINE_CUSTOM_DEVICE_STATUS_TYPE(C_Status, C_SUCCESS);
+}  // namespace details
+
+inline std::string build_custom_device_error_msg(C_Status stat) {
+  std::ostringstream sout;
+  sout << " CustomDevice error, the error code is : " << stat << ". ";
+  return sout.str();
+}
+
+#define PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(COND)                       \
+  do {                                                                   \
+    auto __cond__ = (COND);                                              \
+    using __CUSTOM_DEVICE_STATUS_TYPE__ = decltype(__cond__);            \
+    constexpr auto __success_type__ =                                    \
+        ::paddle::platform::details::CustomDeviceStatusType<             \
+            __CUSTOM_DEVICE_STATUS_TYPE__>::kSuccess;                    \
+    if (UNLIKELY(__cond__ != __success_type__)) {                        \
+      auto __summary__ = ::paddle::platform::errors::External(           \
+          ::paddle::platform::build_custom_device_error_msg(__cond__));  \
+      __THROW_ERROR_INTERNAL__(__summary__);                             \
+    }                                                                    \
+  } while (0)
+}  // namespace platform
+}  // namespace paddle
+#endif  // PADDLE_WITH_CUSTOM_DEVICE
diff --git a/paddle/fluid/platform/device/custom/fake_cpu_device.h b/paddle/fluid/platform/device/custom/fake_cpu_device.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6d8ade4b08597b2c17e5df9dc333c3c4f70d69e
--- /dev/null
+++ b/paddle/fluid/platform/device/custom/fake_cpu_device.h
@@ -0,0 +1,185 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
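+
+// This header is a test-only reference plugin: it implements the
+// CustomRuntime C API on the host CPU so the plugin-loading and
+// DeviceManager paths can be exercised without real hardware.
+//
+// Usage sketch for the PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS macro defined
+// in enforce_custom.h above (a hypothetical call site, not part of this
+// patch):
+//
+//   C_Device_st device;
+//   device.id = 0;
+//   PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(SetDevice(&device));
+//
+// Any status other than C_SUCCESS is turned into an External error that
+// carries the raw status code.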
+
+#pragma once
+#include <cstdlib>
+#include <cstring>
+
+#include "paddle/fluid/platform/device/device_ext.h"
+
+constexpr size_t global_total_memory = 1024 * 1024UL;
+static size_t global_free_memory = global_total_memory;
+
+C_Status Init() { return C_SUCCESS; }
+
+C_Status InitDevice(const C_Device device) { return C_SUCCESS; }
+
+C_Status SetDevice(const C_Device device) { return C_SUCCESS; }
+
+C_Status GetDevice(const C_Device device) {
+  device->id = 0;
+  return C_SUCCESS;
+}
+
+C_Status DestroyDevice(const C_Device device) { return C_SUCCESS; }
+
+C_Status Finalize() { return C_SUCCESS; }
+
+C_Status GetDevicesCount(size_t *count) {
+  *count = 1;
+  return C_SUCCESS;
+}
+
+C_Status GetDevicesList(size_t *device) {
+  *device = 0;
+  return C_SUCCESS;
+}
+
+C_Status MemCpy(const C_Device device, void *dst, const void *src,
+                size_t size) {
+  memcpy(dst, src, size);
+  return C_SUCCESS;
+}
+
+C_Status AsyncMemCpy(const C_Device device, C_Stream stream, void *dst,
+                     const void *src, size_t size) {
+  memcpy(dst, src, size);
+  return C_SUCCESS;
+}
+
+C_Status Allocate(const C_Device device, void **ptr, size_t size) {
+  if (global_free_memory >= size) {
+    *ptr = malloc(size);
+    global_free_memory -= size;
+    return C_SUCCESS;
+  } else {
+    *ptr = nullptr;
+    return C_FAILED;
+  }
+}
+
+C_Status Deallocate(const C_Device device, void *ptr, size_t size) {
+  free(ptr);
+  global_free_memory += size;
+  return C_SUCCESS;
+}
+
+C_Status CreateStream(const C_Device device, C_Stream *stream) {
+  return C_SUCCESS;
+}
+
+C_Status DestroyStream(const C_Device device, C_Stream stream) {
+  return C_SUCCESS;
+}
+
+C_Status CreateEvent(const C_Device device, C_Event *event) {
+  return C_SUCCESS;
+}
+
+C_Status RecordEvent(const C_Device device, C_Stream stream, C_Event event) {
+  return C_SUCCESS;
+}
+
+C_Status DestroyEvent(const C_Device device, C_Event event) {
+  return C_SUCCESS;
+}
+
+C_Status SyncDevice(const C_Device device) { return C_SUCCESS; }
+
+C_Status SyncStream(const C_Device device, C_Stream stream) {
+  return C_SUCCESS;
+}
+
+C_Status SyncEvent(const C_Device device, C_Event event) { return C_SUCCESS; }
+
+C_Status StreamWaitEvent(const C_Device device, C_Stream stream,
+                         C_Event event) {
+  return C_SUCCESS;
+}
+
+C_Status VisibleDevices(size_t *devices) { return C_SUCCESS; }
+
+C_Status DeviceMemStats(const C_Device device, size_t *total_memory,
+                        size_t *free_memory) {
+  *total_memory = global_total_memory;
+  *free_memory = global_free_memory;
+  return C_SUCCESS;
+}
+
+C_Status DeviceMinChunkSize(const C_Device device, size_t *size) {
+  *size = 4 * 1024;
+  return C_SUCCESS;
+}
+
+C_Status DeviceMaxChunkSize(const C_Device device, size_t *size) {
+  *size = 64 * 1024;
+  return C_SUCCESS;
+}
+
+C_Status DeviceMaxAllocSize(const C_Device device, size_t *size) {
+  *size = global_total_memory * 0.95;
+  return C_SUCCESS;
+}
+
+#define DEVICE_TYPE "FakeCPU"
+#define SUB_DEVICE_TYPE "V100"
+
+void InitFakeCPUDevice(CustomRuntimeParams *params) {
+  params->device_type = const_cast<char *>(DEVICE_TYPE);
+  params->sub_device_type = const_cast<char *>(SUB_DEVICE_TYPE);
+  params->version.major = PADDLE_CUSTOM_RUNTIME_MAJOR_VERSION;
+  params->version.minor = PADDLE_CUSTOM_RUNTIME_MINOR_VERSION;
+  params->version.patch = PADDLE_CUSTOM_RUNTIME_PATCH_VERSION;
+
+  memset(reinterpret_cast<void *>(params->interface), 0,
+         sizeof(C_DeviceInterface));
+
+  params->interface->initialize = Init;
+  params->interface->finalize = Finalize;
+
+  params->interface->init_device = InitDevice;
+  params->interface->set_device = SetDevice;
+  params->interface->get_device = GetDevice;
+  params->interface->deinit_device = DestroyDevice;
+
+  params->interface->create_stream = CreateStream;
+  params->interface->destroy_stream = DestroyStream;
+
+  params->interface->create_event = CreateEvent;
+  params->interface->destroy_event = DestroyEvent;
+  params->interface->record_event = RecordEvent;
+
+  params->interface->synchronize_device = SyncDevice;
+  params->interface->synchronize_stream = SyncStream;
+  params->interface->synchronize_event = SyncEvent;
+  params->interface->stream_wait_event = StreamWaitEvent;
+
+  params->interface->memory_copy_h2d = MemCpy;
+  params->interface->memory_copy_d2d = MemCpy;
+  params->interface->memory_copy_d2h = MemCpy;
+  params->interface->async_memory_copy_h2d = AsyncMemCpy;
+  params->interface->async_memory_copy_d2d = AsyncMemCpy;
+  params->interface->async_memory_copy_d2h = AsyncMemCpy;
+  params->interface->device_memory_allocate = Allocate;
+  params->interface->host_memory_allocate = Allocate;
+  params->interface->unified_memory_allocate = Allocate;
+  params->interface->device_memory_deallocate = Deallocate;
+  params->interface->host_memory_deallocate = Deallocate;
+  params->interface->unified_memory_deallocate = Deallocate;
+
+  params->interface->get_device_count = GetDevicesCount;
+  params->interface->get_device_list = GetDevicesList;
+  params->interface->device_memory_stats = DeviceMemStats;
+
+  params->interface->device_max_chunk_size = DeviceMaxChunkSize;
+  params->interface->device_min_chunk_size = DeviceMinChunkSize;
+  params->interface->device_max_alloc_size = DeviceMaxAllocSize;
+}
diff --git a/paddle/fluid/platform/device/device_base.cc b/paddle/fluid/platform/device/device_base.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6234c9612687e507acd2642ef1d39cc0f8da4539
--- /dev/null
+++ b/paddle/fluid/platform/device/device_base.cc
@@ -0,0 +1,249 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
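+
+// DeviceInterface (declared in device_base.h) gives every hook a throwing
+// default below, so a concrete backend only has to override what it actually
+// supports. A minimal sketch of such a backend (hypothetical, not part of
+// this patch):
+//
+//   class MyDevice : public paddle::platform::DeviceInterface {
+//    public:
+//     MyDevice() : DeviceInterface("MyDevice", /*priority=*/90,
+//                                  /*is_custom=*/true) {}
+//     size_t GetDeviceCount() override { return 1; }
+//     std::vector<size_t> GetDeviceList() override { return {0}; }
+//     // everything else falls through to the Unimplemented defaults below
+//   };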
+ +#include "paddle/fluid/platform/device/device_base.h" +#include "gflags/gflags.h" + +DECLARE_double(fraction_of_gpu_memory_to_use); +DECLARE_uint64(initial_gpu_memory_in_mb); +DECLARE_uint64(reallocate_gpu_memory_in_mb); + +constexpr static float fraction_reserve_gpu_memory = 0.05f; + +namespace paddle { +namespace platform { + +#define INTERFACE_UNIMPLEMENT \ + PADDLE_THROW(platform::errors::Unimplemented( \ + "%s is not implemented on %s device.", __func__, Type())); + +// info +size_t DeviceInterface::GetComputeCapability() { + VLOG(10) << Type() + " get compute capability " << 0; + return 0; +} + +size_t DeviceInterface::GetRuntimeVersion() { + VLOG(10) << Type() + " get runtime version " << 0; + return 0; +} + +size_t DeviceInterface::GetDriverVersion() { + VLOG(10) << Type() + " get driver version " << 0; + return 0; +} + +// device manage +void DeviceInterface::Initialize() { INTERFACE_UNIMPLEMENT; } + +void DeviceInterface::Finalize() { INTERFACE_UNIMPLEMENT; } + +void DeviceInterface::SynchronizeDevice(size_t dev_id) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::InitDevice(size_t dev_id) { INTERFACE_UNIMPLEMENT; } + +void DeviceInterface::DeInitDevice(size_t dev_id) { INTERFACE_UNIMPLEMENT; } + +void DeviceInterface::SetDevice(size_t dev_id) { INTERFACE_UNIMPLEMENT; } + +int DeviceInterface::GetDevice() { INTERFACE_UNIMPLEMENT; } + +// stream manage +void DeviceInterface::CreateStream(size_t dev_id, stream::Stream* stream, + const stream::Stream::Priority& priority, + const stream::Stream::Flag& flag) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::DestroyStream(size_t dev_id, stream::Stream* stream) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::SynchronizeStream(size_t dev_id, + const stream::Stream* stream) { + INTERFACE_UNIMPLEMENT; +} + +bool DeviceInterface::QueryStream(size_t dev_id, const stream::Stream* stream) { + INTERFACE_UNIMPLEMENT; + return true; +} + +void DeviceInterface::AddCallback(size_t dev_id, stream::Stream* stream, + stream::Stream::Callback* callback) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::StreamWaitEvent(size_t dev_id, + const stream::Stream* stream, + const event::Event* event) { + INTERFACE_UNIMPLEMENT; +} + +// event manage +void DeviceInterface::CreateEvent(size_t dev_id, event::Event* event, + event::Event::Flag flags) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::DestroyEvent(size_t dev_id, event::Event* event) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::RecordEvent(size_t dev_id, const event::Event* event, + const stream::Stream* stream) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::SynchronizeEvent(size_t dev_id, + const event::Event* event) { + INTERFACE_UNIMPLEMENT; +} + +bool DeviceInterface::QueryEvent(size_t dev_id, const event::Event* event) { + INTERFACE_UNIMPLEMENT; + return true; +} + +// memery manage +void DeviceInterface::MemoryCopyH2D(size_t dev_id, void* dst, const void* src, + size_t size, const stream::Stream* stream) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::MemoryCopyD2H(size_t dev_id, void* dst, const void* src, + size_t size, const stream::Stream* stream) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::MemoryCopyD2D(size_t dev_id, void* dst, const void* src, + size_t size, const stream::Stream* stream) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::MemoryCopyP2P(const Place& dst_place, void* dst, + size_t src_id, const void* src, size_t size, + const stream::Stream* stream) { + INTERFACE_UNIMPLEMENT; +} + +void* 
+void* DeviceInterface::MemoryAllocate(size_t dev_id, size_t size) {
+  INTERFACE_UNIMPLEMENT;
+  return nullptr;
+}
+
+void DeviceInterface::MemoryDeallocate(size_t dev_id, void* ptr, size_t size) {
+  INTERFACE_UNIMPLEMENT;
+}
+
+void* DeviceInterface::MemoryAllocateHost(size_t dev_id, size_t size) {
+  INTERFACE_UNIMPLEMENT;
+  return nullptr;
+}
+
+void DeviceInterface::MemoryDeallocateHost(size_t dev_id, void* ptr,
+                                           size_t size) {
+  INTERFACE_UNIMPLEMENT;
+}
+
+void* DeviceInterface::MemoryAllocateUnified(size_t dev_id, size_t size) {
+  INTERFACE_UNIMPLEMENT;
+  return nullptr;
+}
+
+void DeviceInterface::MemoryDeallocateUnified(size_t dev_id, void* ptr,
+                                              size_t size) {
+  INTERFACE_UNIMPLEMENT;
+}
+
+void DeviceInterface::MemorySet(size_t dev_id, void* ptr, uint8_t value,
+                                size_t size) {
+  INTERFACE_UNIMPLEMENT;
+}
+
+void DeviceInterface::MemoryStats(size_t dev_id, size_t* total, size_t* free) {
+  INTERFACE_UNIMPLEMENT;
+}
+
+size_t DeviceInterface::GetMinChunkSize(size_t dev_id) {
+  INTERFACE_UNIMPLEMENT;
+}
+
+size_t DeviceInterface::AllocSize(size_t dev_id, bool realloc) {
+  size_t available_to_alloc = AvailableAllocSize(dev_id);
+  PADDLE_ENFORCE_GT(available_to_alloc, 0,
+                    platform::errors::ResourceExhausted(
+                        "Not enough available %s memory.", Type()));
+  // If FLAGS_initial_gpu_memory_in_mb is 0, then initial memory will be
+  // allocated by fraction
+  size_t flag_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb
+                           : FLAGS_initial_gpu_memory_in_mb;
+  size_t alloc_bytes =
+      (flag_mb > 0ul ? flag_mb << 20
+                     : available_to_alloc * FLAGS_fraction_of_gpu_memory_to_use);
+  PADDLE_ENFORCE_GE(available_to_alloc, alloc_bytes,
+                    platform::errors::ResourceExhausted(
+                        "Not enough available %s memory.", Type()));
+  return alloc_bytes;
+}
+
+size_t DeviceInterface::AvailableAllocSize(size_t dev_id) {
+  size_t total = 0;
+  size_t available = 0;
+  MemoryStats(dev_id, &total, &available);
+  size_t reserving =
+      static_cast<size_t>(fraction_reserve_gpu_memory * available);
+  // If available size is less than minimum chunk size, no usable memory exists
+  size_t available_to_alloc = available - reserving;
+  size_t min_chunk_size = GetMinChunkSize(dev_id);
+  if (available_to_alloc < min_chunk_size) {
+    available_to_alloc = 0;
+  }
+  return available_to_alloc;
+}
+
+size_t DeviceInterface::GetInitAllocSize(size_t dev_id) {
+  size_t init_alloc_size = AllocSize(dev_id, false);
+  VLOG(10) << Type() + " init alloc size " << (init_alloc_size >> 20) << "M";
+  return init_alloc_size;
+}
+
+size_t DeviceInterface::GetReallocSize(size_t dev_id) {
+  size_t realloc_size = AllocSize(dev_id, true);
+  VLOG(10) << Type() + " realloc size " << (realloc_size >> 20) << "M";
+  return realloc_size;
+}
+
+size_t DeviceInterface::GetMaxAllocSize(size_t dev_id) {
+  size_t max_alloc_size =
+      std::max(GetInitAllocSize(dev_id), GetReallocSize(dev_id));
+  VLOG(10) << Type() + " max alloc size " << (max_alloc_size >> 20) << "M";
+  return max_alloc_size;
+}
+
+size_t DeviceInterface::GetMaxChunkSize(size_t dev_id) {
+  size_t max_chunk_size = GetMaxAllocSize(dev_id);
+  VLOG(10) << Type() + " max chunk size " << (max_chunk_size >> 20) << "M";
+  return max_chunk_size;
+}
+
+size_t DeviceInterface::GetExtraPaddingSize(size_t dev_id) {
+  VLOG(10) << Type() + " extra padding size " << 0;
+  return 0;
+}
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/device/device_base.h b/paddle/fluid/platform/device/device_base.h
new file mode 100644
index 0000000000000000000000000000000000000000..d70b02be80eacd9d492b8a8d40c0a074dfe9c6e3
--- /dev/null
+++ b/paddle/fluid/platform/device/device_base.h
@@ -0,0 +1,166 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#ifdef PADDLE_WITH_CUSTOM_DEVICE
+#include "paddle/fluid/platform/device/event.h"
+#include "paddle/fluid/platform/device/stream.h"
+
+namespace paddle {
+namespace platform {
+
+class DeviceInterface {  // Driver / Runtime
+ public:
+  DeviceInterface(const std::string& type, uint8_t priority, bool is_custom)
+      : type_(type), priority_(priority), is_custom_(is_custom) {}
+  uint8_t Priority() { return priority_; }
+  std::string Type() { return type_; }
+  bool IsCustom() { return is_custom_; }
+
+  virtual ~DeviceInterface() {}
+
+  // Info
+  virtual size_t GetComputeCapability();
+
+  virtual size_t GetRuntimeVersion();
+
+  virtual size_t GetDriverVersion();
+
+  // Platform
+  //! Initialize
+  virtual void Initialize();
+
+  //! Finalize
+  virtual void Finalize();
+
+  // Device
+  virtual size_t GetDeviceCount() = 0;
+  virtual std::vector<size_t> GetDeviceList() = 0;
+
+  //! Wait for compute device to finish.
+  virtual void SynchronizeDevice(size_t dev_id);
+
+  //! Initialize device.
+  virtual void InitDevice(size_t dev_id);
+
+  //! Deinitialize device.
+  virtual void DeInitDevice(size_t dev_id);
+
+  // ! Set device to be used.
+  virtual void SetDevice(size_t dev_id);
+
+  // ! Returns which device is currently being used.
+  virtual int GetDevice();
+
+  // Stream
+  // ! Create an asynchronous stream
+  virtual void CreateStream(
+      size_t dev_id, stream::Stream* stream,
+      const stream::Stream::Priority& priority =
+          stream::Stream::Priority::kNormal,
+      const stream::Stream::Flag& flag = stream::Stream::Flag::kDefaultFlag);
+
+  // ! Destroys an asynchronous stream.
+  virtual void DestroyStream(size_t dev_id, stream::Stream* stream);
+
+  // ! Waits for stream tasks to complete.
+  virtual void SynchronizeStream(size_t dev_id, const stream::Stream* stream);
+
+  // ! Queries an asynchronous stream for completion status.
+  virtual bool QueryStream(size_t dev_id, const stream::Stream* stream);
+
+  // ! Add a callback to a compute stream.
+  virtual void AddCallback(size_t dev_id, stream::Stream* stream,
+                           stream::Stream::Callback* callback);
+
+  // Event
+  // ! Create an event.
+  virtual void CreateEvent(size_t dev_id, event::Event* event,
+                           event::Event::Flag flags);
+
+  // ! Destroy an event.
+  virtual void DestroyEvent(size_t dev_id, event::Event* event);
+
+  // ! Records an event.
+  virtual void RecordEvent(size_t dev_id, const event::Event* event,
+                           const stream::Stream* stream);
+
+  // ! Waits for event to complete.
+  virtual void SynchronizeEvent(size_t dev_id, const event::Event* event);
+  // ! Queries an event for completion status.
+  virtual bool QueryEvent(size_t dev_id, const event::Event* event);
+
+  // ! Make a compute stream wait on an event
+  virtual void StreamWaitEvent(size_t dev_id, const stream::Stream* stream,
+                               const event::Event* event);
+
+  // Memory
+  virtual void MemoryCopyH2D(size_t dev_id, void* dst, const void* src,
+                             size_t size,
+                             const stream::Stream* stream = nullptr);
+
+  virtual void MemoryCopyD2H(size_t dev_id, void* dst, const void* src,
+                             size_t size,
+                             const stream::Stream* stream = nullptr);
+
+  virtual void MemoryCopyD2D(size_t dev_id, void* dst, const void* src,
+                             size_t size,
+                             const stream::Stream* stream = nullptr);
+
+  virtual void MemoryCopyP2P(const Place& dst_place, void* dst, size_t src_id,
+                             const void* src, size_t size,
+                             const stream::Stream* stream = nullptr);
+
+  virtual void* MemoryAllocate(size_t dev_id, size_t size);
+
+  virtual void MemoryDeallocate(size_t dev_id, void* ptr, size_t size);
+
+  virtual void* MemoryAllocateHost(size_t dev_id, size_t size);
+
+  virtual void MemoryDeallocateHost(size_t dev_id, void* ptr, size_t size);
+
+  virtual void* MemoryAllocateUnified(size_t dev_id, size_t size);
+
+  virtual void MemoryDeallocateUnified(size_t dev_id, void* ptr, size_t size);
+
+  virtual void MemorySet(size_t dev_id, void* ptr, uint8_t value, size_t size);
+
+  virtual void MemoryStats(size_t dev_id, size_t* total, size_t* free);
+
+  virtual size_t GetMinChunkSize(size_t dev_id);
+
+  virtual size_t GetInitAllocSize(size_t dev_id);
+
+  virtual size_t GetReallocSize(size_t dev_id);
+
+  virtual size_t GetMaxAllocSize(size_t dev_id);
+
+  virtual size_t GetMaxChunkSize(size_t dev_id);
+
+  virtual size_t GetExtraPaddingSize(size_t dev_id);
+
+ private:
+  const std::string type_;
+  const uint8_t priority_;
+  const bool is_custom_;
+
+  size_t AllocSize(size_t dev_id, bool realloc);
+
+  size_t AvailableAllocSize(size_t dev_id);
+};
+
+}  // namespace platform
+}  // namespace paddle
+
+#endif
diff --git a/paddle/fluid/platform/device/device_ext.h b/paddle/fluid/platform/device/device_ext.h
new file mode 100644
index 0000000000000000000000000000000000000000..d1e1340f74b7741f867b85d7ab0b1e42c9621a47
--- /dev/null
+++ b/paddle/fluid/platform/device/device_ext.h
@@ -0,0 +1,497 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
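+
+// This header defines the C ABI between Paddle core and a custom-runtime
+// plugin. A plugin is a shared library that exports InitPlugin (declared at
+// the end of this header) and fills in CustomRuntimeParams. A minimal sketch
+// of a plugin translation unit (hypothetical; Init, SetDevice, ... stand for
+// the plugin's own callbacks, as in fake_cpu_device.h above):
+//
+//   void InitPlugin(CustomRuntimeParams* params) {
+//     params->device_type = const_cast<char*>("MyDevice");
+//     params->sub_device_type = const_cast<char*>("v1");
+//     params->version.major = PADDLE_CUSTOM_RUNTIME_MAJOR_VERSION;
+//     params->version.minor = PADDLE_CUSTOM_RUNTIME_MINOR_VERSION;
+//     params->version.patch = PADDLE_CUSTOM_RUNTIME_PATCH_VERSION;
+//     params->interface->initialize = Init;
+//     params->interface->set_device = SetDevice;
+//     // ... fill in the remaining entry points the device supports
+//   }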
+
+#pragma once
+#if !defined(_WIN32) && !defined(__APPLE__)
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define PADDLE_CUSTOM_RUNTIME_MAJOR_VERSION 0
+#define PADDLE_CUSTOM_RUNTIME_MINOR_VERSION 1
+#define PADDLE_CUSTOM_RUNTIME_PATCH_VERSION 1
+
+typedef enum {
+  C_SUCCESS = 0,    // success
+  C_WARNING,        // results may not meet expectation (such as an asynchronous
+                    // interface is actually synchronous)
+  C_FAILED,         // resource exhausted/query failed
+  C_ERROR,          // invalid argument/wrong usage/uninitialized
+  C_INTERNAL_ERROR  // plugin error
+} C_Status;
+
+typedef struct C_Device_st { int id; } * C_Device;
+
+typedef struct C_Stream_st* C_Stream;
+
+typedef struct C_Event_st* C_Event;
+
+typedef void (*C_Callback)(C_Device device, C_Stream stream, void* user_data,
+                           C_Status* status);
+
+struct C_DeviceInterface {
+  // Core fills it and the plugin must check it
+  size_t size;
+
+  ///////////////////////
+  // device manage api //
+  ///////////////////////
+
+  /**
+   * @brief Initialize hardware
+   *
+   */
+  C_Status (*initialize)();
+
+  /**
+   * @brief Deinitialize hardware
+   *
+   */
+  C_Status (*finalize)();
+
+  /**
+   * @brief Initialize device
+   *
+   * @param[C_Device] device Core fills it with a logical id, and then the
+   * plugin must replace it with a physical id
+   */
+  C_Status (*init_device)(const C_Device device);
+
+  /**
+   * @brief Set current device
+   *
+   * @param[C_Device] device Core fills it with a physical id
+   */
+  C_Status (*set_device)(const C_Device device);
+
+  /**
+   * @brief Get current device
+   *
+   * @param[C_Device] device Plugin fills it with a physical id
+   */
+  C_Status (*get_device)(const C_Device device);
+
+  /**
+   * @brief Deinitialize device
+   *
+   * @param[C_Device] device Core fills it with a physical id
+   */
+  C_Status (*deinit_device)(const C_Device device);
+
+  /**
+   * @brief Create a stream
+   *
+   * @param[C_Device] device Core fills it with a physical id
+   * @param[C_Stream*] stream Plugin creates a stream and fills it
+   */
+  C_Status (*create_stream)(const C_Device device, C_Stream* stream);
+
+  /**
+   * @brief Destroy a stream
+   *
+   * @param[C_Device] device Core fills it with a physical id
+   * @param[C_Stream] stream
+   */
+  C_Status (*destroy_stream)(const C_Device device, C_Stream stream);
+
+  /**
+   * @brief Query a stream
+   *
+   * @param[C_Device] device Core fills it with a physical id
+   * @param[C_Stream] stream
+   */
+  C_Status (*query_stream)(const C_Device device, C_Stream stream);
+
+  /**
+   * @brief Add a callback to stream
+   *
+   * @param[C_Device] device Core fills it with a physical id
+   * @param[C_Stream] stream
+   * @param[C_Callback] callback
+   * @param[void*] user_data
+   */
+  C_Status (*stream_add_callback)(const C_Device device, C_Stream stream,
+                                  C_Callback callback, void* user_data);
+
+  /**
+   * @brief Create an event
+   *
+   * @param[C_Device] device Core fills it with a physical id
+   * @param[C_Event*] event Plugin creates an event and fills it
+   */
+  C_Status (*create_event)(const C_Device device, C_Event* event);
+
+  /**
+   * @brief Record an event
+   *
+   * @param[C_Device] device Core fills it with a physical id
+   * @param[C_Stream] stream
+   * @param[C_Event] event
+   */
+  C_Status (*record_event)(const C_Device device, C_Stream stream,
+                           C_Event event);
+
+  /**
+   * @brief Destroy an event
+   *
+   * @param[C_Device] device Core fills it with a physical id
+   * @param[C_Event] event
+   */
+  C_Status (*destroy_event)(const C_Device device, C_Event event);
+
+  /**
+   * @brief Query an event
+   *
+   * @param[C_Device] device Core fills it with a physical id
+   * @param[C_Event] event
+   */
+  C_Status (*query_event)(const C_Device device, C_Event event);
+
+  /**
+   * @brief Synchronize a device
+   *
+   * @param[C_Device] device Core fills it with a physical id
+   */
+  C_Status (*synchronize_device)(const C_Device device);
+
+  /**
+   * @brief Synchronize a stream
+   *
+   * @param[C_Device] device Core fills it with a physical id
+   * @param[C_Stream] stream
+   */
+  C_Status (*synchronize_stream)(const C_Device device, C_Stream stream);
+
+  /**
+   * @brief Synchronize an event
+   *
+   * @param[C_Device] device Core fills it with a physical id
+   * @param[C_Event] event
+   */
+  C_Status (*synchronize_event)(const C_Device device, C_Event event);
+
+  /**
+   * @brief Make a stream wait on an event
+   *
+   * @param[C_Device] device Core fills it with a physical id
+   * @param[C_Stream] stream
+   * @param[C_Event] event
+   */
+  C_Status (*stream_wait_event)(const C_Device device, C_Stream stream,
+                                C_Event event);
+
+  void* reserved_dev_api[8];
+
+  ///////////////////////
+  // memory manage api //
+  ///////////////////////
+
+  /**
+   * @brief Device memory allocate
+   *
+   * @param[C_Device] device Core fills it with a physical id
+   * @param[void**] ptr Plugin allocates an address and fills it
+   * @param[size_t] size
+   */
+  C_Status (*device_memory_allocate)(const C_Device device, void** ptr,
+                                     size_t size);
+
+  /**
+   * @brief Device memory deallocate
+   *
+   * @param[C_Device] device Core fills it with a physical id
+   * @param[void*] ptr
+   * @param[size_t] size
+   */
+  C_Status (*device_memory_deallocate)(const C_Device device, void* ptr,
+                                       size_t size);
+
+  /**
+   * @brief Device memory set
+   *
+   * @param[C_Device] device Core fills it with a physical id
+   * @param[void*] ptr
+   * @param[unsigned char] value
+   * @param[size_t] size
+   */
+  C_Status (*device_memory_set)(const C_Device device, void* ptr,
+                                unsigned char value, size_t size);
+
+  /**
+   * @brief Host memory allocate
+   *
+   * @param[C_Device] device Core fills it with a physical id
+   * @param[void**] ptr Plugin allocates an address and fills it
+   * @param[size_t] size
+   */
+  C_Status (*host_memory_allocate)(const C_Device device, void** ptr,
+                                   size_t size);
+
+  /**
+   * @brief Host memory deallocate
+   *
+   * @param[C_Device] device Core fills it with a physical id
+   * @param[void*] ptr
+   * @param[size_t] size
+   */
+  C_Status (*host_memory_deallocate)(const C_Device device, void* ptr,
+                                     size_t size);
+
+  /**
+   * @brief Unified memory allocate
+   *
+   * @param[C_Device] device Core fills it with a physical id
+   * @param[void**] ptr Plugin allocates an address and fills it
+   * @param[size_t] size
+   */
+  C_Status (*unified_memory_allocate)(const C_Device device, void** ptr,
+                                      size_t size);
+
+  /**
+   * @brief Unified memory deallocate
+   *
+   * @param[C_Device] device Core fills it with a physical id
+   * @param[void*] ptr
+   * @param[size_t] size
+   */
+  C_Status (*unified_memory_deallocate)(const C_Device device, void* ptr,
+                                        size_t size);
+
+  /**
+   * @brief Memory copy from host to device
+   *
+   * @param[C_Device] device Core fills it with a physical id
+   * @param[void*] dst
+   * @param[void*] src
+   * @param[size_t] size
+   */
+  C_Status (*memory_copy_h2d)(const C_Device device, void* dst, const void* src,
+                              size_t size);
+
+  /**
+   * @brief Memory copy from device to host
+   *
+   * @param[C_Device] device Core fills it with a physical id
+   * @param[void*] dst
+   * @param[void*] src
+   * @param[size_t] size
+   */
+  C_Status (*memory_copy_d2h)(const C_Device device, void* dst, const void* src,
+                              size_t size);
+
+  /**
+   * @brief Memory copy from device to device
+   *
+   * @param[C_Device] device Core fills it with a physical id
+   * @param[void*] dst
+   * @param[void*] src
+   * @param[size_t] size
+   */
+  C_Status (*memory_copy_d2d)(const C_Device device, void* dst, const void* src,
+                              size_t size);
+
+  /**
+   * @brief Peer memory copy from device to device
+   *
+   * @param[C_Device] dst_device Core fills it with a physical id
+   * @param[C_Device] src_device Core fills it with a physical id
+   * @param[void*] dst
+   * @param[void*] src
+   * @param[size_t] size
+   */
+  C_Status (*memory_copy_p2p)(const C_Device dst_device,
+                              const C_Device src_device, void* dst,
+                              const void* src, size_t size);
+
+  /**
+   * @brief Asynchronous memory copy from host to device
+   *
+   * @param[C_Device] device Core fills it with a physical id
+   * @param[C_Stream] stream
+   * @param[void*] dst
+   * @param[void*] src
+   * @param[size_t] size
+   */
+  C_Status (*async_memory_copy_h2d)(const C_Device device, C_Stream stream,
+                                    void* dst, const void* src, size_t size);
+
+  /**
+   * @brief Asynchronous memory copy from device to host
+   *
+   * @param[C_Device] device Core fills it with a physical id
+   * @param[C_Stream] stream
+   * @param[void*] dst
+   * @param[void*] src
+   * @param[size_t] size
+   */
+  C_Status (*async_memory_copy_d2h)(const C_Device device, C_Stream stream,
+                                    void* dst, const void* src, size_t size);
+
+  /**
+   * @brief Asynchronous memory copy from device to device
+   *
+   * @param[C_Device] device Core fills it with a physical id
+   * @param[C_Stream] stream
+   * @param[void*] dst
+   * @param[void*] src
+   * @param[size_t] size
+   */
+  C_Status (*async_memory_copy_d2d)(const C_Device device, C_Stream stream,
+                                    void* dst, const void* src, size_t size);
+
+  /**
+   * @brief Asynchronous peer memory copy from device to device
+   *
+   * @param[C_Device] dst_device Core fills it with a physical id
+   * @param[C_Device] src_device Core fills it with a physical id
+   * @param[C_Stream] stream
+   * @param[void*] dst
+   * @param[void*] src
+   * @param[size_t] size
+   */
+  C_Status (*async_memory_copy_p2p)(const C_Device dst_device,
+                                    const C_Device src_device, C_Stream stream,
+                                    void* dst, const void* src, size_t size);
+
+  void* reserved_mem_api[8];
+
+  //////////////
+  // info api //
+  //////////////
+
+  /**
+   * @brief Get visible device count
+   *
+   * @param[size_t*] count Plugin fills it
+   */
+  C_Status (*get_device_count)(size_t* count);
+
+  /**
+   * @brief Get visible device list
+   *
+   * @param[size_t*] devices Plugin fills it
+   */
+  C_Status (*get_device_list)(size_t* devices);
+
+  /**
+   * @brief Device memory statistics
+   *
+   * @param[C_Device] device Core fills it with a physical id
+   * @param[size_t*] total_memory
+   * @param[size_t*] free_memory
+   */
+  C_Status (*device_memory_stats)(const C_Device device, size_t* total_memory,
+                                  size_t* free_memory);
+
+  /**
+   * @brief Device minimum chunk size
+   *
+   * @param[size_t*] count
+   */
+  C_Status (*device_min_chunk_size)(const C_Device device, size_t* count);
+
+  /**
+   * @brief Device maximum chunk size
+   *
+   * @param[size_t*] count
+   */
+  C_Status (*device_max_chunk_size)(const C_Device device, size_t* count);
+
+  /**
+   * @brief Device maximum alloc size
+   *
+   * @param[size_t*] count
+   */
+  C_Status (*device_max_alloc_size)(const C_Device device, size_t* count);
+
+  /**
+   * @brief Device extra padding size
+   *
+   * @param[size_t*] size
+   */
+  C_Status (*device_extra_padding_size)(const C_Device device, size_t* size);
+
+  /**
+   * @brief Device initial allocated size
+   *
+   * @param[size_t*] size
+   */
+  C_Status (*device_init_alloc_size)(const C_Device device, size_t* size);
+
+  /**
+   * @brief Device reallocated size
+   *
+   * @param[size_t*] size
+   */
+  C_Status (*device_realloc_size)(const C_Device device, size_t* size);
+
+  /**
+   * @brief Get compute capability
+   *
+   * @param[size_t*] compute_capability
+   */
+  C_Status (*get_compute_capability)(size_t* compute_capability);
+
+  /**
+   * @brief Get runtime version
+   *
+   * @param[size_t*] version
+   */
+  C_Status (*get_runtime_version)(size_t* version);
+
+  /**
+   * @brief Get driver version
+   *
+   * @param[size_t*] version
+   */
+  C_Status (*get_driver_version)(size_t* version);
+
+  void* reserved_info_api[8];
+
+  ///////////////
+  // other api //
+  ///////////////
+
+  void* reserved_other_api[8];
+};
+
+struct CustomRuntimeVersion {
+  size_t major, minor, patch;
+};
+
+struct CustomRuntimeParams {
+  // Core fills it and the plugin must check it
+  size_t size;
+  // Plugin fills it
+  C_DeviceInterface* interface;
+  // Plugin fills it and Core will check it
+  CustomRuntimeVersion version;
+  // Plugin fills it
+  char* device_type;
+  // Plugin fills it
+  char* sub_device_type;
+
+  char reserved[32];
+};
+
+// Plugin implements it and fills CustomRuntimeParams
+void InitPlugin(CustomRuntimeParams*);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+#endif
diff --git a/paddle/fluid/platform/device/device_guard.cc b/paddle/fluid/platform/device/device_guard.cc
new file mode 100644
index 0000000000000000000000000000000000000000..55d8b9dc6a9a58dda5ae8192709e6858da878da7
--- /dev/null
+++ b/paddle/fluid/platform/device/device_guard.cc
@@ -0,0 +1,22 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/platform/device/device_guard.h"
+
+namespace paddle {
+namespace platform {
+// Even though this source file does not contain any code, it is better to
+// keep it for the cmake dependency.
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/device/device_guard.h b/paddle/fluid/platform/device/device_guard.h
new file mode 100644
index 0000000000000000000000000000000000000000..638e9c984b4d25e474fd5949e9fdc5df98a344ef
--- /dev/null
+++ b/paddle/fluid/platform/device/device_guard.h
@@ -0,0 +1,48 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
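+
+// Usage sketch (assuming a registered custom device type, e.g. the FakeCPU
+// device used in the tests):
+//
+//   paddle::platform::CustomPlace place("FakeCPU", 0);
+//   {
+//     paddle::platform::DeviceGuard guard(place);
+//     // work issued here runs with device 0 of "FakeCPU" selected
+//   }  // the previously selected device is restored on scope exit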
+
+#pragma once
+#include "paddle/fluid/platform/device/device_manager.h"
+
+namespace paddle {
+namespace platform {
+
+class DeviceGuard {
+ public:
+  explicit inline DeviceGuard(const Place& place)
+      : dev_type_(PlaceHelper::GetDeviceType(place)) {
+    prev_id = DeviceManager::GetDevice(dev_type_);
+    cur_id = PlaceHelper::GetDeviceId(place);
+
+    if (cur_id != prev_id) {
+      DeviceManager::SetDevice(dev_type_, cur_id);
+    }
+  }
+
+  inline ~DeviceGuard() {
+    if (cur_id != prev_id) {
+      DeviceManager::SetDevice(dev_type_, prev_id);
+    }
+  }
+
+  DeviceGuard(const DeviceGuard& o) = delete;
+  DeviceGuard& operator=(const DeviceGuard& o) = delete;
+
+ private:
+  size_t prev_id, cur_id;
+  std::string dev_type_;
+};
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/device/device_manager.cc b/paddle/fluid/platform/device/device_manager.cc
new file mode 100644
index 0000000000000000000000000000000000000000..38dcb721b1faeac8bc14b49cf7f0957406d4c590
--- /dev/null
+++ b/paddle/fluid/platform/device/device_manager.cc
@@ -0,0 +1,420 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifdef PADDLE_WITH_CUSTOM_DEVICE
+#include "paddle/fluid/platform/device/device_manager.h"
+
+#if !defined(_WIN32)
+#include <dirent.h>
+#else
+
+#endif
+
+#include <functional>
+#include <regex>
+
+namespace paddle {
+namespace platform {
+
+void Device::CreateStream(stream::Stream* stream,
+                          const stream::Stream::Priority& priority,
+                          const stream::Stream::Flag& flag) {
+  impl_->CreateStream(dev_id_, stream, priority, flag);
+}
+
+void Device::DestroyStream(stream::Stream* stream) {
+  impl_->DestroyStream(dev_id_, stream);
+}
+
+void Device::SynchronizeStream(const stream::Stream* stream) {
+  impl_->SynchronizeStream(dev_id_, stream);
+}
+
+bool Device::QueryStream(const stream::Stream* stream) {
+  return impl_->QueryStream(dev_id_, stream);
+}
+
+void Device::AddCallback(stream::Stream* stream,
+                         stream::Stream::Callback* callback) {
+  impl_->AddCallback(dev_id_, stream, callback);
+}
+
+void Device::CreateEvent(event::Event* event, event::Event::Flag flags) {
+  impl_->CreateEvent(dev_id_, event, flags);
+}
+
+void Device::DestroyEvent(event::Event* event) {
+  impl_->DestroyEvent(dev_id_, event);
+}
+
+void Device::RecordEvent(const event::Event* event,
+                         const stream::Stream* stream) {
+  impl_->RecordEvent(dev_id_, event, stream);
+}
+
+void Device::SynchronizeEvent(const event::Event* event) {
+  impl_->SynchronizeEvent(dev_id_, event);
+}
+
+bool Device::QueryEvent(const event::Event* event) {
+  return impl_->QueryEvent(dev_id_, event);
+}
+
+void Device::StreamWaitEvent(const stream::Stream* stream,
+                             const event::Event* event) {
+  impl_->StreamWaitEvent(dev_id_, stream, event);
+}
+
+void Device::MemoryCopyH2D(void* dst, const void* src, size_t size,
+                           const stream::Stream* stream) {
+  impl_->MemoryCopyH2D(dev_id_, dst, src, size, stream);
+}
+
+void Device::MemoryCopyD2H(void* dst, const void* src, size_t size,
+                           const stream::Stream* stream) {
+  impl_->MemoryCopyD2H(dev_id_, dst, src, size, stream);
+}
+
+void Device::MemoryCopyD2D(void* dst, const void* src, size_t size,
+                           const stream::Stream* stream) {
+  impl_->MemoryCopyD2D(dev_id_, dst, src, size, stream);
+}
+
+void Device::MemoryCopyP2P(const Place& dst_place, void* dst, const void* src,
+                           size_t size, const stream::Stream* stream) {
+  impl_->MemoryCopyP2P(dst_place, dst, dev_id_, src, size, stream);
+}
+
+void* Device::MemoryAllocate(size_t size) {
+  return impl_->MemoryAllocate(dev_id_, size);
+}
+
+void Device::MemoryDeallocate(void* ptr, size_t size) {
+  impl_->MemoryDeallocate(dev_id_, ptr, size);
+}
+
+void* Device::MemoryAllocateHost(size_t size) {
+  return impl_->MemoryAllocateHost(dev_id_, size);
+}
+
+void Device::MemoryDeallocateHost(void* ptr, size_t size) {
+  impl_->MemoryDeallocateHost(dev_id_, ptr, size);
+}
+
+void* Device::MemoryAllocateUnified(size_t size) {
+  return impl_->MemoryAllocateUnified(dev_id_, size);
+}
+
+void Device::MemoryDeallocateUnified(void* ptr, size_t size) {
+  impl_->MemoryDeallocateUnified(dev_id_, ptr, size);
+}
+
+void Device::MemorySet(void* ptr, uint8_t value, size_t size) {
+  impl_->MemorySet(dev_id_, ptr, value, size);
+}
+
+std::string Device::Type() { return impl_->Type(); }
+
+static pten::RWLock _global_device_manager_rw_lock;
+
+bool DeviceManager::Register(std::unique_ptr<DeviceInterface> device_impl) {
+  pten::AutoWRLock lock(&_global_device_manager_rw_lock);
+  VLOG(4) << "Register Device - " << device_impl->Type();
+  auto device_type = device_impl->Type();
+  auto& dev_impl_map = Instance().device_impl_map_;
+  auto& dev_map = Instance().device_map_;
+
+  if (dev_impl_map.find(device_type) == dev_impl_map.end()) {
+    dev_impl_map.insert(
+        std::pair<std::string, std::unique_ptr<DeviceInterface>>(
+            device_type, std::move(device_impl)));
+    auto& dev_impl = dev_impl_map[device_type];
+    auto& dev_vec = dev_map[device_type];
+    VLOG(4) << "GetDeviceCount is " << dev_impl->GetDeviceCount();
+    for (size_t i = 0; i < dev_impl->GetDeviceCount(); ++i) {
+      dev_vec.emplace_back(new Device(i, dev_impl.get()));
+    }
+  } else {
+    auto& plat = dev_impl_map[device_type];
+    if (plat->IsCustom() && plat->Priority() > device_impl->Priority()) {
+      dev_impl_map[device_type] = std::move(device_impl);
+      auto& dev_impl = dev_impl_map[device_type];
+      auto& dev_vec = dev_map[device_type];
+      dev_vec.clear();
+      VLOG(4) << "GetDeviceCount is " << dev_impl->GetDeviceCount();
+      for (size_t i = 0; i < dev_impl->GetDeviceCount(); ++i) {
+        dev_vec.emplace_back(new Device(i, dev_impl.get()));
+      }
+    } else {
+      return false;
+    }
+  }
+  return true;
+}
+
+DeviceInterface* DeviceManager::GetDeviceInterfaceWithType(
+    const std::string& device_type) {
+  pten::AutoRDLock lock(&_global_device_manager_rw_lock);
+
+  auto& dev_impl_map = Instance().device_impl_map_;
+  if (dev_impl_map.find(device_type) != dev_impl_map.end()) {
+    return dev_impl_map.at(device_type).get();
+  } else {
+    LOG(ERROR) << "GetDeviceInterfaceWithType - " << device_type << " Failed\n";
+    PADDLE_THROW(
+        platform::errors::Fatal("Unregistered device type %s.", device_type));
+    return nullptr;
+  }
+}
+
+Device* DeviceManager::GetDeviceWithPlace(const Place& place) {
+  pten::AutoRDLock lock(&_global_device_manager_rw_lock);
+
+  auto& dev_map = Instance().device_map_;
+  auto dev_type = PlaceHelper::GetDeviceType(place);
+  auto dev_id = PlaceHelper::GetDeviceId(place);
+  PADDLE_ENFORCE_NE(dev_map.find(dev_type), dev_map.end(),
+                    platform::errors::NotFound(
+                        "Unable to find Device with type %s.", dev_type));
+  auto& dev_vec = dev_map[dev_type];
+  PADDLE_ENFORCE_LT(
+      dev_id, dev_vec.size(),
+      platform::errors::OutOfRange(
+          "The visible devices count of type %s is %d, but dev_id is %d.",
+          dev_type, dev_vec.size(), dev_id));
+  return dev_vec[dev_id].get();
+}
+
+std::vector<std::string> DeviceManager::GetAllDeviceTypes() {
+  pten::AutoRDLock lock(&_global_device_manager_rw_lock);
+  auto& dev_impl_map = Instance().device_impl_map_;
+  std::vector<std::string> devices;
+  for (auto iter = dev_impl_map.cbegin(); iter != dev_impl_map.cend(); ++iter) {
+    devices.push_back(iter->first);
+  }
+  return devices;
+}
+
+std::vector<std::string> DeviceManager::GetAllCustomDeviceTypes() {
+  pten::AutoRDLock lock(&_global_device_manager_rw_lock);
+  auto& dev_impl_map = Instance().device_impl_map_;
+  std::vector<std::string> devices;
+  for (auto iter = dev_impl_map.cbegin(); iter != dev_impl_map.cend(); ++iter) {
+    if (iter->second->IsCustom()) {
+      devices.push_back(iter->first);
+    }
+  }
+  return devices;
+}
+
+std::vector<std::string> DeviceManager::GetAllDeviceList() {
+  pten::AutoRDLock lock(&_global_device_manager_rw_lock);
+  auto& dev_impl_map = Instance().device_impl_map_;
+  std::vector<std::string> devices;
+  for (auto iter = dev_impl_map.cbegin(); iter != dev_impl_map.cend(); ++iter) {
+    size_t device_count = iter->second->GetDeviceCount();
+    std::string dev_type = iter->second->Type();
+    if (device_count == 1) {
+      devices.push_back(dev_type);
+    } else {
+      for (size_t i = 0; i < device_count; ++i) {
+        devices.push_back(dev_type + ":" + std::to_string(i));
+      }
+    }
+  }
+  return devices;
+}
+
+std::vector<std::string> DeviceManager::GetAllCustomDeviceList() {
+  pten::AutoRDLock lock(&_global_device_manager_rw_lock);
+  auto& dev_impl_map = Instance().device_impl_map_;
+  std::vector<std::string> devices;
+  for (auto iter = dev_impl_map.cbegin(); iter != dev_impl_map.cend(); ++iter) {
+    size_t device_count = iter->second->GetDeviceCount();
+    std::string dev_type = iter->second->Type();
+    if (iter->second->IsCustom()) {
+      if (device_count == 1) {
+        devices.push_back(dev_type);
+      } else {
+        for (size_t i = 0; i < device_count; ++i) {
+          devices.push_back(dev_type + ":" + std::to_string(i));
+        }
+      }
+    }
+  }
+  return devices;
+}
+
+bool DeviceManager::HasDeviceType(const std::string& device_type) {
+  auto dev_impl = GetDeviceInterfaceWithType(device_type);
+  return dev_impl != nullptr;
+}
+
+bool DeviceManager::IsCustom(const std::string& device_type) {
+  auto dev_impl = GetDeviceInterfaceWithType(device_type);
+  return dev_impl->IsCustom();
+}
+
+void DeviceManager::Initialize(const std::string& device_type) {
+  auto dev_impl = GetDeviceInterfaceWithType(device_type);
+  dev_impl->Initialize();
+}
+
+void DeviceManager::Finalize(const std::string& device_type) {
+  auto dev_impl = GetDeviceInterfaceWithType(device_type);
+  dev_impl->Finalize();
+}
+
+void DeviceManager::SynchronizeDevice(const Place& place) {
+  auto device_type = PlaceHelper::GetDeviceType(place);
+  auto device_id = PlaceHelper::GetDeviceId(place);
+  auto dev_impl = GetDeviceInterfaceWithType(device_type);
+  dev_impl->SynchronizeDevice(device_id);
+}
+
+void DeviceManager::InitDevice(const Place& place) {
+  auto device_type = PlaceHelper::GetDeviceType(place);
+  auto device_id = PlaceHelper::GetDeviceId(place);
+  auto dev_impl = GetDeviceInterfaceWithType(device_type);
+  dev_impl->InitDevice(device_id);
+}
+
+void DeviceManager::DeInitDevice(const Place& place) {
+  auto device_type = PlaceHelper::GetDeviceType(place);
+  auto device_id = PlaceHelper::GetDeviceId(place);
+  auto dev_impl = GetDeviceInterfaceWithType(device_type);
+  dev_impl->DeInitDevice(device_id);
+}
+
+void DeviceManager::SetDevice(const std::string& device_type,
+                              size_t device_id) {
+  auto dev_impl = GetDeviceInterfaceWithType(device_type);
+  dev_impl->SetDevice(device_id);
+}
+
+void DeviceManager::SetDevice(const Place& place) {
+  auto device_type = PlaceHelper::GetDeviceType(place);
+  auto device_id = PlaceHelper::GetDeviceId(place);
+  DeviceManager::SetDevice(device_type, device_id);
+}
+
+int DeviceManager::GetDevice(const std::string& device_type) {
+  auto dev_impl = GetDeviceInterfaceWithType(device_type);
+  return dev_impl->GetDevice();
+}
+
+size_t DeviceManager::GetMinChunkSize(const Place& place) {
+  auto device_type = PlaceHelper::GetDeviceType(place);
+  auto device_id = PlaceHelper::GetDeviceId(place);
+  auto dev_impl = GetDeviceInterfaceWithType(device_type);
+  return dev_impl->GetMinChunkSize(device_id);
+}
+
+size_t DeviceManager::GetMaxChunkSize(const Place& place) {
+  auto device_type = PlaceHelper::GetDeviceType(place);
+  auto device_id = PlaceHelper::GetDeviceId(place);
+  auto dev_impl = GetDeviceInterfaceWithType(device_type);
+  return dev_impl->GetMaxChunkSize(device_id);
+}
+
+size_t DeviceManager::GetMaxAllocSize(const Place& place) {
+  auto device_type = PlaceHelper::GetDeviceType(place);
+  auto device_id = PlaceHelper::GetDeviceId(place);
+  auto dev_impl = GetDeviceInterfaceWithType(device_type);
+  return dev_impl->GetMaxAllocSize(device_id);
+}
+
+size_t DeviceManager::GetInitAllocSize(const Place& place) {
+  auto device_type = PlaceHelper::GetDeviceType(place);
+  auto device_id = PlaceHelper::GetDeviceId(place);
+  auto dev_impl = GetDeviceInterfaceWithType(device_type);
+  return dev_impl->GetInitAllocSize(device_id);
+}
+
+size_t DeviceManager::GetReallocSize(const Place& place) {
+  auto device_type = PlaceHelper::GetDeviceType(place);
+  auto device_id = PlaceHelper::GetDeviceId(place);
+  auto dev_impl = GetDeviceInterfaceWithType(device_type);
+  return dev_impl->GetReallocSize(device_id);
+}
+
+size_t DeviceManager::GetExtraPaddingSize(const Place& place) {
+  auto device_type = PlaceHelper::GetDeviceType(place);
+  auto device_id = PlaceHelper::GetDeviceId(place);
+  auto dev_impl = GetDeviceInterfaceWithType(device_type);
+  return dev_impl->GetExtraPaddingSize(device_id);
+}
+
+void DeviceManager::MemoryStats(const Place& place, size_t* total,
+                                size_t* free) {
+  auto device_type = PlaceHelper::GetDeviceType(place);
+  auto device_id = PlaceHelper::GetDeviceId(place);
+  auto dev_impl = GetDeviceInterfaceWithType(device_type);
+  dev_impl->MemoryStats(device_id, total, free);
+}
+
+size_t DeviceManager::GetDeviceCount(const std::string& device_type) {
+  auto dev_impl = GetDeviceInterfaceWithType(device_type);
+  return dev_impl->GetDeviceCount();
+}
+
+std::vector<size_t> DeviceManager::GetDeviceList(
+    const std::string& device_type) {
+  auto dev_impl = GetDeviceInterfaceWithType(device_type);
+  return dev_impl->GetDeviceList();
+}
+
+DeviceManager& DeviceManager::Instance() {
+  static DeviceManager platform_manager;
+  return platform_manager;
+}
+
+std::vector<std::string> ListAllLibraries(const std::string& library_dir) {
+  std::vector<std::string> libraries;
+  std::regex express(".*\\.so");
+  std::match_results<std::string::iterator> results;
+  DIR* dir = nullptr;
+  dirent* ptr = nullptr;
+
+  dir = opendir(library_dir.c_str());
+  if (dir == nullptr) {
+    VLOG(4) << "open CustomDevice library_dir: " << library_dir << " failed";
+  } else {
+    while ((ptr = readdir(dir)) != nullptr) {
+      std::string filename(ptr->d_name);
+      if (std::regex_match(filename.begin(), filename.end(), results,
+                           express)) {
+        libraries.push_back(library_dir + '/' + filename);
+        VLOG(4) << "found CustomDevice library: " << libraries.back()
+                << std::endl;
+      }
+    }
+    closedir(dir);
+  }
+
+  return libraries;
+}
+
+bool LoadCustomDevice(const std::string& library_dir) {
+  std::vector<std::string> libs = ListAllLibraries(library_dir);
+  for (const auto& lib_path : libs) {
+    auto dso_handle = dlopen(lib_path.c_str(), RTLD_NOW);
+    LoadCustomRuntimeLib(dso_handle);
+  }
+  return true;
+}
+
+}  // namespace platform
+}  // namespace paddle
+#endif
diff --git a/paddle/fluid/platform/device/device_manager.h b/paddle/fluid/platform/device/device_manager.h
new file mode 100644
index 0000000000000000000000000000000000000000..ad910605d987aed726c41ff242434979aa2bb058
--- /dev/null
+++ b/paddle/fluid/platform/device/device_manager.h
@@ -0,0 +1,186 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#ifdef PADDLE_WITH_CUSTOM_DEVICE
+
+#include "paddle/fluid/platform/device/device_base.h"
+#include "paddle/fluid/platform/device/device_ext.h"
+#include "paddle/fluid/platform/device/event.h"
+#include "paddle/fluid/platform/device/stream.h"
+#include "paddle/fluid/platform/place.h"
+
+#include "paddle/pten/backends/dynload/port.h"
+#include "paddle/pten/core/utils/rw_lock.h"
+
+namespace paddle {
+namespace platform {
+class Device final {
+ public:
+  Device(size_t dev_id, DeviceInterface* impl) : dev_id_(dev_id), impl_(impl) {}
+
+  // Stream
+  // ! Create an asynchronous stream
+  void CreateStream(
+      stream::Stream* stream, const stream::Stream::Priority& priority =
+                                  stream::Stream::Priority::kNormal,
+      const stream::Stream::Flag& flag = stream::Stream::Flag::kDefaultFlag);
+
+  // ! Destroys an asynchronous stream.
+  void DestroyStream(stream::Stream* stream);
+
+  // ! Waits for stream tasks to complete.
+  void SynchronizeStream(const stream::Stream* stream);
+
+  // ! Queries an asynchronous stream for completion status.
+  bool QueryStream(const stream::Stream* stream);
+
+  // ! Add a callback to a compute stream.
+  void AddCallback(stream::Stream* stream, stream::Stream::Callback* callback);
+
+  // Event
+  // ! Create an event.
+  void CreateEvent(event::Event* event, event::Event::Flag flags);
+
+  // ! Destroy an event.
+  void DestroyEvent(event::Event* event);
+
+  // ! Records an event.
+  void RecordEvent(const event::Event* event, const stream::Stream* stream);
+
+  // ! Waits for event to complete.
+  void SynchronizeEvent(const event::Event* event);
+
+  // ! Queries an event for completion status.
+  bool QueryEvent(const event::Event* event);
+
+  // ! Make a compute stream wait on an event
+  void StreamWaitEvent(const stream::Stream* stream, const event::Event* event);
+
+  // Memory
+  void MemoryCopyH2D(void* dst, const void* src, size_t size,
+                     const stream::Stream* stream = nullptr);
+
+  void MemoryCopyD2H(void* dst, const void* src, size_t size,
+                     const stream::Stream* stream = nullptr);
+
+  void MemoryCopyD2D(void* dst, const void* src, size_t size,
+                     const stream::Stream* stream = nullptr);
+
+  void MemoryCopyP2P(const Place& dst_place, void* dst, const void* src,
+                     size_t size, const stream::Stream* stream = nullptr);
+
+  void* MemoryAllocate(size_t size);
+
+  void MemoryDeallocate(void* ptr, size_t size);
+
+  void* MemoryAllocateHost(size_t size);
+
+  void MemoryDeallocateHost(void* ptr, size_t size);
+
+  void* MemoryAllocateUnified(size_t size);
+
+  void MemoryDeallocateUnified(void* ptr, size_t size);
+
+  void MemorySet(void* ptr, uint8_t value, size_t size);
+
+  std::string Type();
+
+ private:
+  size_t dev_id_;
+  DeviceInterface* impl_;
+};
+
+class DeviceManager {
+ public:
+  static bool Register(std::unique_ptr<DeviceInterface> device);
+  static bool RegisterPinnedDevice(DeviceInterface* device);
+  static Device* GetDeviceWithPlace(const Place& place);
+  static std::vector<std::string> GetAllDeviceTypes();
+  static std::vector<std::string> GetAllCustomDeviceTypes();
+  static std::vector<std::string> GetAllDeviceList();
+  static std::vector<std::string> GetAllCustomDeviceList();
+  static bool HasDeviceType(const std::string& device_type);
+  static bool IsCustom(const std::string& device_type);
+
+  // platform & device
+  static void Initialize(const std::string& device_type);
+
+  static void Finalize(const std::string& device_type);
+
+  static void SynchronizeDevice(const Place& place);
+
+  static void InitDevice(const Place& place);
+
+  static void DeInitDevice(const Place& place);
+
+  static void SetDevice(const std::string& device_type, size_t device_id);
+
+  static void SetDevice(const Place& place);
+
+  static int GetDevice(const std::string& device_type);
+
+  static size_t GetMinChunkSize(const Place& place);
+
+  static size_t GetMaxChunkSize(const Place& place);
+
+  static size_t GetMaxAllocSize(const Place& place);
+
+  static size_t GetInitAllocSize(const Place& place);
+
+  static size_t GetReallocSize(const Place& place);
+
+  static size_t GetExtraPaddingSize(const Place& place);
+
+  static void MemoryStats(const Place& place, size_t* total, size_t* free);
+
+  static size_t GetDeviceCount(const std::string& device_type);
+
+  static std::vector<size_t> GetDeviceList(const std::string& device_type);
+
+ private:
+  DISABLE_COPY_AND_ASSIGN(DeviceManager);
+  DeviceManager() {}
+  static DeviceManager& Instance();
+  static DeviceInterface* GetDeviceInterfaceWithType(
+      const std::string& device_type);
+
+  std::unordered_map<std::string, std::unique_ptr<DeviceInterface>>
+      device_impl_map_;
+  std::unordered_map<std::string, std::vector<std::unique_ptr<Device>>>
+      device_map_;
+};
+
+bool LoadCustomRuntimeLib(void* dso_handle);
+
+bool LoadCustomRuntimeLib(const CustomRuntimeParams& runtime_params,
+                          std::unique_ptr<C_DeviceInterface> device_interface,
+                          void* dso_handle);
+
+bool LoadCustomDevice(const std::string& library_path);
+
+class Registrar {
+ public:
+  template <typename DeviceT>
+  explicit Registrar(DeviceT* device_ptr) {
+    DeviceManager::Register(std::unique_ptr<DeviceT>(device_ptr));
+  }
+
+  void Touch() {}
+};
+
+}  // namespace platform
+}  // namespace paddle
+
+#endif
diff --git a/paddle/fluid/platform/device/device_wrapper.h b/paddle/fluid/platform/device/device_wrapper.h
index 4f8bbb2d2689eb6ffee1119c6eb14ef27de7a2c8..ba3461d8c14871561b2d069f9350698306e22366 100644
--- a/paddle/fluid/platform/device/device_wrapper.h
+++ b/paddle/fluid/platform/device/device_wrapper.h
@@ -38,3 +38,12 @@ limitations under the License. */
 #ifdef PADDLE_WITH_IPU
 #include "paddle/fluid/platform/device/ipu/ipu_info.h"
 #endif
+
+#ifdef PADDLE_WITH_CUSTOM_DEVICE
+#include "paddle/fluid/platform/device/callback_manager.h"
+#include "paddle/fluid/platform/device/custom/enforce_custom.h"
+#include "paddle/fluid/platform/device/device_guard.h"
+#include "paddle/fluid/platform/device/device_manager.h"
+#include "paddle/fluid/platform/device/event.h"
+#include "paddle/fluid/platform/device/stream.h"
+#endif
diff --git a/paddle/fluid/platform/device/event.cc b/paddle/fluid/platform/device/event.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6e6316ea16de020801a7afce6ad47f4b06eca022
--- /dev/null
+++ b/paddle/fluid/platform/device/event.cc
@@ -0,0 +1,64 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/platform/device/event.h"
+#include "paddle/fluid/platform/device/device_guard.h"
+#include "paddle/fluid/platform/device/device_wrapper.h"
+#include "paddle/fluid/platform/device/stream.h"
+
+namespace paddle {
+namespace platform {
+namespace event {
+
+event_t Event::raw_event() const { return event_; }
+
+void Event::set_event(event_t event) { event_ = event; }
+
+Event::Event(const Place& place, event_t event)
+    : place_(place),
+      device_(platform::DeviceManager::GetDeviceWithPlace(place)),
+      event_(event),
+      own_data_(false) {}
+
+Event::~Event() { Destroy(); }
+
+bool Event::Init(const Place& place, Flag flags) {
+  place_ = place;
+  DeviceGuard guard(place_);
+  device_->CreateEvent(this, flags);
+  VLOG(3) << "Init Event: " << event_ << ", place: " << place_
+          << ", flag:" << static_cast<int>(flags);
+  own_data_ = true;
+  return true;
+}
+
+void Event::Destroy() {
+  if (own_data_) {
+    DeviceGuard guard(place_);
+    device_->DestroyEvent(this);
+    own_data_ = false;
+  }
+}
+
+void Event::Record(const stream::Stream* stream) { stream->RecordEvent(this); }
+
+bool Event::Query() const { return device_->QueryEvent(this); }
+
+void Event::Synchonrize() const { device_->SynchronizeEvent(this); }
+
+const Place& Event::GetPlace() const { return place_; }
+
+}  // namespace event
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/device/event.h b/paddle/fluid/platform/device/event.h
new file mode 100644
index 0000000000000000000000000000000000000000..376d73eb66660fdcdc0b2412d5d5e1371145e634
--- /dev/null
+++ b/paddle/fluid/platform/device/event.h
@@ -0,0 +1,61 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
diff --git a/paddle/fluid/platform/device/event.h b/paddle/fluid/platform/device/event.h
new file mode 100644
index 0000000000000000000000000000000000000000..376d73eb66660fdcdc0b2412d5d5e1371145e634
--- /dev/null
+++ b/paddle/fluid/platform/device/event.h
@@ -0,0 +1,61 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace platform {
+
+class Device;
+
+namespace stream {
+class Stream;
+}  // namespace stream
+
+namespace event {
+using event_t = void*;
+
+class Event {
+ public:
+  enum Flag {
+    Default = 0x0,
+    BlockingSync = 0x1,
+    DisableTiming = 0x2,
+    Interprocess = 0x4,
+  };
+
+  // For compatibility with externally created events
+  Event(const Place& place, event_t event);
+  ~Event();
+  event_t raw_event() const;
+  void set_event(event_t event);
+  bool Init(const Place& place, Flag flags = Flag::Default);
+  void Destroy();
+  void Record(const stream::Stream* stream);
+  bool Query() const;
+  void Synchronize() const;
+  const Place& GetPlace() const;
+
+ private:
+  DISABLE_COPY_AND_ASSIGN(Event);
+  Place place_;
+  Device* device_;
+  event_t event_;
+  bool own_data_ = true;
+};
+}  // namespace event
+
+}  // namespace platform
+}  // namespace paddle
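To make the ownership rules above concrete, here is a minimal sketch of the intended lifecycle, assuming a hypothetical custom runtime named "FakeCPU" has already been registered. Constructing with an external handle leaves `own_data_` false; `Init` creates a runtime-owned event and flips it to true, so the destructor's `Destroy` only releases what the wrapper created.

// Sketch only; "FakeCPU" is an assumed registered custom device type.
#include "paddle/fluid/platform/device/event.h"
#include "paddle/fluid/platform/device/stream.h"

void SketchEventLifecycle() {
  namespace plat = paddle::platform;
  plat::CustomPlace place("FakeCPU", 0);

  plat::stream::Stream stream;
  stream.Init(place);  // creates the underlying runtime stream

  plat::event::Event event(place, /*event=*/nullptr);  // wraps nothing yet
  event.Init(place);       // now owns a runtime event handle
  event.Record(&stream);   // forwards to stream.RecordEvent(&event)
  if (!event.Query()) {    // recorded point not reached yet?
    event.Synchronize();   // block the host until it completes
  }
}  // ~Event()/~Stream() call Destroy() only because own_data_ is true
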
+ +#include "paddle/fluid/platform/device/stream.h" +#include "paddle/fluid/platform/device/device_guard.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/device/event.h" + +namespace paddle { +namespace platform { +namespace stream { + +Stream::~Stream() { Destroy(); } + +const stream_t& Stream::raw_stream() const { return stream_; } + +void Stream::set_stream(stream_t stream) { stream_ = stream; } + +// For compatiable +Stream::Stream(const Place& place, stream_t stream) + : place_(place), + device_(platform::DeviceManager::GetDeviceWithPlace(place)), + stream_(stream), + callback_manager_(new CallbackManager(this)), + own_data_(false) {} + +bool Stream::Init(const Place& place, const Priority& priority, + const Flag& flag) { + place_ = place; + device_ = platform::DeviceManager::GetDeviceWithPlace(place); + DeviceGuard guard(place_); + device_->CreateStream(this, priority, flag); + + callback_manager_.reset(new CallbackManager(this)); + VLOG(3) << "Init Stream: " << stream_ << ", place: " << place_ + << ", priority: " << static_cast(priority) + << ", flag:" << static_cast(flag); + own_data_ = true; + return true; +} + +void Stream::RecordEvent(event::Event* event, Callback callback) const { + callback(); + device_->RecordEvent(event, this); +} + +void Stream::RecordEvent(event::Event* event) const { + device_->RecordEvent(event, this); +} + +void Stream::WaitEvent(event::Event* event) const { + device_->StreamWaitEvent(this, event); +} + +void Stream::Wait() const { +#if !defined(_WIN32) + device_->SynchronizeStream(this); +#else + while (1) { + if (device_->QueryStream(this)) { + break; + } + } +#endif +} + +void Stream::WaitCallback() const { callback_manager_->Wait(); } + +void Stream::Destroy() { + if (own_data_) { + DeviceGuard guard(place_); + device_->DestroyStream(this); + own_data_ = false; + } +} + +bool Stream::Query() const { return device_->QueryStream(this); } + +void Stream::Synchronize() const { device_->SynchronizeStream(this); } + +const Place& Stream::GetPlace() const { return place_; } + +} // namespace stream +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/stream.h b/paddle/fluid/platform/device/stream.h new file mode 100644 index 0000000000000000000000000000000000000000..25cf705ee0951847bfda84b336d3579403e8ab37 --- /dev/null +++ b/paddle/fluid/platform/device/stream.h @@ -0,0 +1,79 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
diff --git a/paddle/fluid/platform/device/stream.h b/paddle/fluid/platform/device/stream.h
new file mode 100644
index 0000000000000000000000000000000000000000..25cf705ee0951847bfda84b336d3579403e8ab37
--- /dev/null
+++ b/paddle/fluid/platform/device/stream.h
@@ -0,0 +1,79 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/platform/device/callback_manager.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace platform {
+
+class Device;
+
+namespace event {
+class Event;
+}  // namespace event
+
+namespace stream {
+using stream_t = void*;
+class Stream {
+ public:
+  enum class Priority : uint8_t {
+    kNull = 0x0,
+    kHigh = 0x1,
+    kNormal = 0x2,
+  };
+
+  enum class Flag : uint8_t {
+    kDefaultFlag = 0x0,
+    kStreamNonBlocking = 0x1,
+  };
+
+  using Callback = std::function<void()>;
+
+  Stream() = default;
+  // For compatibility with externally created streams
+  Stream(const Place& place, stream_t stream);
+  ~Stream();
+  const stream_t& raw_stream() const;
+  void set_stream(stream_t stream);
+  bool Init(const Place& place, const Priority& priority = Priority::kNormal,
+            const Flag& flag = Flag::kDefaultFlag);
+  template <typename Callback>
+  void AddCallback(Callback&& callback) const {
+    callback_manager_->AddCallback(callback);
+  }
+  void RecordEvent(event::Event* event, Callback callback) const;
+  void RecordEvent(event::Event* event) const;
+  void WaitEvent(event::Event* event) const;
+  void Wait() const;
+  void WaitCallback() const;
+  void Destroy();
+  bool Query() const;
+  void Synchronize() const;
+  const Place& GetPlace() const;
+
+ private:
+  DISABLE_COPY_AND_ASSIGN(Stream);
+  Place place_;
+  Device* device_;
+  stream_t stream_;
+  std::unique_ptr<CallbackManager> callback_manager_;
+  bool own_data_ = true;
+};
+
+}  // namespace stream
+}  // namespace platform
+}  // namespace paddle
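One more sketch, showing how `WaitEvent` and the callback path are meant to compose for cross-stream ordering (again with a hypothetical "FakeCPU" runtime): the consumer stream waits on a recorded event without blocking the host, and `AddCallback`/`WaitCallback` give a host-side rendezvous.

// Sketch only; "FakeCPU" is an assumed registered custom device type.
#include "paddle/fluid/platform/device/event.h"
#include "paddle/fluid/platform/device/stream.h"

void SketchCrossStreamOrdering(bool* host_flag) {
  namespace plat = paddle::platform;
  plat::CustomPlace place("FakeCPU", 0);

  plat::stream::Stream producer, consumer;
  producer.Init(place);
  consumer.Init(place);

  // ... enqueue work on `producer` ...

  plat::event::Event barrier(place, nullptr);
  barrier.Init(place);
  producer.RecordEvent(&barrier);  // mark the producer's current position
  consumer.WaitEvent(&barrier);    // device-side wait; the host is not blocked

  // A host callback runs once previously enqueued consumer work completes.
  consumer.AddCallback([host_flag] { *host_flag = true; });
  consumer.WaitCallback();         // drain the callback queue
}
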
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index a0a853a2f059745b281d3651d39baf061edf1053..d448df0702aadd56157902b55b11c41496bcf484 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -30,6 +30,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/expect.h"
 #include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
+#include "paddle/fluid/platform/device/device_wrapper.h"
 #include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
@@ -256,6 +257,15 @@ DeviceContextPool::DeviceContextPool(
           "NPUPinnedPlace is not supported. Please re-compile with "
           "WITH_ASCEND_CL "
           "option."));
+#endif
+    } else if (platform::is_custom_place(p)) {
+#ifdef PADDLE_WITH_CUSTOM_DEVICE
+      EmplaceDeviceContext<CustomDeviceContext, CustomPlace>(&device_contexts_,
+                                                             p);
+#else
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "CustomPlace is not supported. Please re-compile with "
+          "WITH_CUSTOM_DEVICE "
+          "option."));
 #endif
     }
   }
@@ -885,6 +895,24 @@ MKLDNNDeviceContext::BlobPtr_t<void> MKLDNNDeviceContext::GetBlob(
   return key_it->second;
 }
+#endif
+
+#ifdef PADDLE_WITH_CUSTOM_DEVICE
+CustomDeviceContext::CustomDeviceContext(CustomPlace place) : place_(place) {
+  DeviceGuard guard(place_);
+  stream_.reset(new stream::Stream());
+  stream_->Init(place_);
+}
+
+CustomDeviceContext::~CustomDeviceContext() {}
+
+const Place& CustomDeviceContext::GetPlace() const { return place_; }
+
+void CustomDeviceContext::Wait() const {
+  // platform::RecordEvent record_event("CustomDeviceContext/wait");
+  VLOG(4) << "CustomDevice context(" << this << ") Wait";
+  stream_->Wait();
+}
 #endif
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index 80dcf6d2ec23cea4f375f54d5d9f1b6e24f382cb..1d51383f6833b584f77bce9e865ad5d229590421 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -70,6 +70,9 @@ limitations under the License. */
 #include "paddle/fluid/platform/device/npu/enforce_npu.h"
 #include "paddle/fluid/platform/device/npu/npu_stream.h"
 #endif
+
+#include "paddle/fluid/platform/device/device_ext.h"
+#include "paddle/fluid/platform/device/stream.h"
 #include "unsupported/Eigen/CXX11/Tensor"
 
 namespace Eigen {
@@ -815,6 +818,47 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
 };
 #endif
 
+#ifdef PADDLE_WITH_CUSTOM_DEVICE
+class CustomDeviceContext : public DeviceContext {
+ public:
+  explicit CustomDeviceContext(CustomPlace place);
+  virtual ~CustomDeviceContext();
+
+  const Place& GetPlace() const override;
+  void Wait() const override;
+  Eigen::DefaultDevice* eigen_device() const { return nullptr; }
+  C_Stream stream() const {
+    return reinterpret_cast<C_Stream>(stream_->raw_stream());
+  }
+
+  template <typename Callback>
+  void AddStreamCallback(Callback&& callback) const {
+    return stream_->AddCallback(callback);
+  }
+
+  void WaitStreamCallback() const { return stream_->WaitCallback(); }
+
+ private:
+  std::string device_type_;
+
+  CustomPlace place_;
+
+  std::shared_ptr<platform::stream::Stream> stream_;
+
+  CustomDeviceContext();
+  DISABLE_COPY_AND_ASSIGN(CustomDeviceContext);
+};
+template <>
+struct DefaultDeviceContextType<platform::CustomPlace> {
+  using TYPE = CustomDeviceContext;
+};
+#else
+template <>
+struct DefaultDeviceContextType<platform::CustomPlace> {
+  using TYPE = DeviceContext;
+};
+#endif
+
 /*! \brief device context pool singleton */
 class DeviceContextPool {
  public:
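With the pool wired up above, operator-side code retrieves the per-place context and synchronizes through it; this mirrors what `tensor_py.h` does later in this patch. A minimal sketch, again assuming a hypothetical registered "FakeCPU" runtime:

// Sketch only; "FakeCPU" is an assumed registered custom device type.
#include "paddle/fluid/platform/device_context.h"

void SketchContextLookup(bool* done) {
  namespace plat = paddle::platform;
  plat::CustomPlace place("FakeCPU", 0);

  auto* ctx = static_cast<plat::CustomDeviceContext*>(
      plat::DeviceContextPool::Instance().Get(place));
  ctx->AddStreamCallback([done] { *done = true; });  // host work after kernels
  ctx->WaitStreamCallback();  // block until the callback has run
  ctx->Wait();                // synchronize the context's whole stream
}
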
diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc
index b969ba971b6b1ec2ca1ad6e8c0c28fdf07bb6431..39f95a9295661b2b3432d7ca062b2bdb1fe5c40a 100644
--- a/paddle/fluid/platform/flags.cc
+++ b/paddle/fluid/platform/flags.cc
@@ -433,8 +433,9 @@ PADDLE_DEFINE_EXPORTED_double(
 
 // NOTE(zhiqiu): better to share the flags, otherwise we will have too many
 // flags.
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
-    defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) ||       \
+    defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU) ||  \
+    defined(PADDLE_WITH_CUSTOM_DEVICE)
 
 /**
  * Memory related FLAG
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index f7a86e5aac765c68e3f11e8adcfdf1c9a75aba7c..5d0fccf9e9d4188e66ac54213271ac7cb10d019e 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -25,6 +25,7 @@ limitations under the License. */
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/dynload/cupti.h"
 #endif
+#include "paddle/fluid/platform/device/device_wrapper.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/init.h"
 #include "paddle/fluid/platform/place.h"
@@ -234,6 +235,19 @@ void InitDevices(const std::vector<int> devices) {
   if (!custom_kernel_root.empty()) {
     LOG(INFO) << "ENV [CUSTOM_DEVICE_ROOT]=" << custom_kernel_root;
     framework::LoadCustomKernel(custom_kernel_root);
+#ifdef PADDLE_WITH_CUSTOM_DEVICE
+    if (platform::LoadCustomDevice(custom_kernel_root)) {
+      auto device_types = platform::DeviceManager::GetAllCustomDeviceTypes();
+      for (auto &dev_type : device_types) {
+        VLOG(1) << "Device type: " << dev_type << ", visible devices count: "
+                << platform::DeviceManager::GetDeviceCount(dev_type);
+        for (size_t i = 0;
+             i < platform::DeviceManager::GetDeviceCount(dev_type); i++) {
+          places.push_back(platform::CustomPlace(dev_type, i));
+        }
+      }
+    }
+#endif
   } else {
     VLOG(3) << "ENV [CUSTOM_DEVICE_ROOT] is empty.";
   }
diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc
index e73e3736f64b462f03e6cda1e6212fcfe55c9939..b73e2e398f270646b19cca06274e549a4a4b62ba 100644
--- a/paddle/fluid/platform/place.cc
+++ b/paddle/fluid/platform/place.cc
@@ -56,7 +56,16 @@ bool is_npu_pinned_place(const Place &p) {
   return p.GetType() == pten::AllocationType::NPUPINNED;
 }
 
+bool is_custom_place(const Place &p) {
+  return p.GetType() == pten::AllocationType::CUSTOM;
+}
+
 bool places_are_same_class(const Place &p1, const Place &p2) {
+#ifdef PADDLE_WITH_CUSTOM_DEVICE
+  if (is_custom_place(p1) && is_custom_place(p2)) {
+    return p1.GetDeviceType() == p2.GetDeviceType();
+  }
+#endif
   return p1.GetType() == p2.GetType();
 }
 
@@ -73,6 +82,8 @@ bool is_same_place(const Place &p1, const Place &p2) {
       return p1 == p2;
     } else if (is_ipu_place(p1)) {
       return p1 == p2;
+    } else if (is_custom_place(p1)) {
+      return p1 == p2;
     } else {
       return p1 == p2;
     }
@@ -81,5 +92,43 @@ bool is_same_place(const Place &p1, const Place &p2) {
   }
 }
 
+#ifdef PADDLE_WITH_CUSTOM_DEVICE
+std::string PlaceHelper::GetDeviceType(const Place &place) {
+  if (is_cpu_place(place)) {
+    return "cpu";
+  } else if (is_gpu_place(place)) {
+    return "gpu";
+  } else if (is_npu_place(place)) {
+    return "npu";
+  } else if (is_xpu_place(place)) {
+    return "xpu";
+  } else if (is_custom_place(place)) {
+    return place.GetDeviceType();
+  } else {
+    PADDLE_THROW(platform::errors::Fatal(
+        "Unknown device type. Please check available devices by "
+        "paddle.device.get_available_device()"));
+  }
+}
+
+size_t PlaceHelper::GetDeviceId(const Place &place) {
+  return place.GetDeviceId();
+}
+
+Place PlaceHelper::CreatePlace(const std::string &dev_type, size_t dev_id) {
+  if (dev_type == "cpu") {
+    return platform::CPUPlace();
+  } else if (dev_type == "gpu") {
+    return platform::CUDAPlace(dev_id);
+  } else if (dev_type == "npu") {
+    return platform::NPUPlace(dev_id);
+  } else if (dev_type == "xpu") {
+    return platform::XPUPlace(dev_id);
+  } else {
+    return platform::CustomPlace(dev_type, dev_id);
+  }
+}
+#endif
+
 }  // namespace platform
 }  // namespace paddle
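`PlaceHelper` gives string-keyed construction and introspection; any device type string that is not a built-in ("cpu"/"gpu"/"npu"/"xpu") falls through to `CustomPlace`. A short sketch (the "FakeCPU" type is hypothetical):

// Sketch: string <-> Place round trip through PlaceHelper.
#include <string>
#include "paddle/fluid/platform/place.h"

void SketchPlaceHelper() {
  namespace plat = paddle::platform;
  // Not a built-in type string, so this resolves to a CustomPlace.
  plat::Place p = plat::PlaceHelper::CreatePlace("FakeCPU", 1);

  std::string type = plat::PlaceHelper::GetDeviceType(p);  // "FakeCPU"
  size_t id = plat::PlaceHelper::GetDeviceId(p);           // 1
  (void)type;
  (void)id;
}
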
diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h
index 80bbeac251810b6d32167433292fc55c3105234e..278bfad003cd444143fc98f3f8382687073cc483 100644
--- a/paddle/fluid/platform/place.h
+++ b/paddle/fluid/platform/place.h
@@ -36,9 +36,19 @@ using NPUPinnedPlace = pten::NPUPinnedPlace;
 using XPUPlace = pten::XPUPlace;
 using IPUPlace = pten::IPUPlace;
 using MLUPlace = pten::MLUPlace;
+using CustomPlace = pten::CustomPlace;
 
 using PlaceList = std::vector<Place>;
 
+#ifdef PADDLE_WITH_CUSTOM_DEVICE
+class PlaceHelper {
+ public:
+  static std::string GetDeviceType(const Place &place);
+  static size_t GetDeviceId(const Place &place);
+  static Place CreatePlace(const std::string &dev_type, size_t dev_id = 0);
+};
+#endif
+
 bool is_gpu_place(const Place &);
 bool is_xpu_place(const Place &);
 bool is_npu_place(const Place &);
@@ -47,6 +57,7 @@ bool is_ipu_place(const Place &);
 bool is_cpu_place(const Place &);
 bool is_cuda_pinned_place(const Place &);
 bool is_npu_pinned_place(const Place &);
+bool is_custom_place(const Place &p);
 bool places_are_same_class(const Place &, const Place &);
 bool is_same_place(const Place &, const Place &);
 
@@ -121,6 +132,15 @@ typename Visitor::result_type VisitPlace(const Place &place,
 #else
       PADDLE_THROW(platform::errors::Unavailable(
           "Paddle is not compiled with MLU. Cannot visit mlu device"));
+#endif
+    }
+    case pten::AllocationType::CUSTOM: {
+#ifdef PADDLE_WITH_CUSTOM_DEVICE
+      platform::CustomPlace p(place.GetDeviceType(), place.GetDeviceId());
+      return visitor(p);
+#else
+      PADDLE_THROW(platform::errors::Unavailable(
+          "Paddle is not compiled with CUSTOM. Cannot visit custom device"));
 #endif
     }
     default: {
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 265f0fba8f376e5c4e748415469f1b4caab1d4c4..b1fe9f99b5d428d735a6e6734ccd5d7d6faa74e8 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -284,7 +284,7 @@ if(WITH_PYTHON)
 
     cc_library(paddle_pybind SHARED
       SRCS ${PYBIND_SRCS}
-      DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
+      DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ${GLOB_DEV_LIB})
 
     if(NOT APPLE AND NOT WIN32)
       target_link_libraries(paddle_pybind rt)
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
index c84a71d8aaa002b8d40ff2713252d2cd6afff2bb..f4ed1ee3424f229d77c293d19edca911aea31f69 100644
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -136,10 +136,13 @@ static const platform::Place PyObjectToPlace(const py::object &place_obj) {
     return place_obj.cast<platform::MLUPlace>();
   } else if (py::isinstance<platform::Place>(place_obj)) {
     return place_obj.cast<platform::Place>();
+  } else if (py::isinstance<platform::CustomPlace>(place_obj)) {
+    return place_obj.cast<platform::CustomPlace>();
   } else {
     PADDLE_THROW(platform::errors::InvalidArgument(
         "Place should be one of "
-        "Place/CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace/NPUPlace/MLUPlace"));
+        "Place/CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace/NPUPlace/MLUPlace/"
+        "CustomPlace"));
   }
 }
 
@@ -183,6 +186,9 @@ static void InitVarBaseAndTensor(
     SetTensorFromPyArray<platform::NPUPlace>(tensor, array, place, zero_copy);
   } else if (platform::is_mlu_place(place)) {
     SetTensorFromPyArray<platform::MLUPlace>(tensor, array, place, zero_copy);
+  } else if (platform::is_custom_place(place)) {
+    SetTensorFromPyArray<platform::CustomPlace>(tensor, array, place,
+                                                zero_copy);
   } else {
     PADDLE_THROW(platform::errors::InvalidArgument(
         "Place should be one of "
@@ -941,6 +947,10 @@ void BindImperative(py::module *m_ptr) {
           py::arg("value"), py::arg("place"), py::arg("persistable") = false,
           py::arg("zero_copy") = false, py::arg("name") = "",
           py::arg("stop_gradient") = -1)
+      .def("__init__", &InitVarBaseFromNumpyWithArg<platform::CustomPlace>,
+           py::arg("value"), py::arg("place"), py::arg("persistable") = false,
+           py::arg("zero_copy") = false, py::arg("name") = "",
+           py::arg("stop_gradient") = -1)
       .def("__init__", &InitVarBaseFromNumpyWithArgDefault, py::arg("value"))
       .def("__init__", &InitVarBaseFromTensorWithArgDefault, py::arg("tensor"),
            py::arg("name") = "")
@@ -956,6 +966,8 @@ void BindImperative(py::module *m_ptr) {
            py::arg("tensor"), py::arg("place"), py::arg("name") = "")
       .def("__init__", &InitVarBaseFromTensorWithArg<platform::MLUPlace>,
            py::arg("tensor"), py::arg("place"), py::arg("name") = "")
+      .def("__init__", &InitVarBaseFromTensorWithArg<platform::CustomPlace>,
+           py::arg("tensor"), py::arg("place"), py::arg("name") = "")
       .def("__init__", &InitVarBaseFromNumpyWithKwargs)
       .def(
           "__setitem_varbase__",
@@ -2258,6 +2270,11 @@ void BindImperative(py::module *m_ptr) {
               self.SetExpectedPlace(*p);
               VLOG(4) << "Tracer(" << &self << ")"
                       << " set expected place " << *p;
+            } else if (py::isinstance<platform::CustomPlace>(obj)) {
+              auto p = obj.cast<platform::CustomPlace *>();
+              self.SetExpectedPlace(*p);
+              VLOG(4) << "Tracer(" << &self << ")"
+                      << " set expected place " << *p;
             } else if (py::isinstance<platform::Place>(obj)) {
               auto p = obj.cast<platform::Place *>();
               self.SetExpectedPlace(*p);
@@ -2301,6 +2318,21 @@ void BindImperative(py::module *m_ptr) {
                  *(imperative::AmpOperators::Instance().GetMutableAllowOps()),
                  *(imperative::AmpOperators::Instance().GetMutableBlockOps()));
            })
+      .def("trace",
+           [](imperative::Tracer &self, const std::string &type,
+              const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs,
+              framework::AttributeMap attrs,
+              const platform::CustomPlace &place, bool trace_backward,
+              const std::map<std::string, std::string> &inplace_map = {}) {
+             auto ins_map = ConvertToNameVarBaseMap(ins);
+             auto outs_map = ConvertToNameVarBaseMap(outs);
+             {
+               py::gil_scoped_release release;
+               self.TraceOp(type, std::move(ins_map), std::move(outs_map),
+                            std::move(attrs), place, trace_backward,
+                            inplace_map);
+             }
+           })
       .def("trace",
            [](imperative::Tracer &self, const std::string &type,
               const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs,
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 959e34afe3da66987f040c81b21b410d66c7a555..5289b862dc948baacf7c373ebcee483dc589d9a6 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -69,6 +69,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/py_func_op.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/fluid/platform/device/device_wrapper.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -1667,6 +1668,139 @@ All parameter, weight, gradient are variables in Paddle.
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   py::class_<platform::Communicator>(m, "Communicator").def(py::init<>());
 #endif
+  m.def("get_all_device_type", []() {
+    std::vector<std::string> device_types;
+#ifdef PADDLE_WITH_CUSTOM_DEVICE
+    device_types = platform::DeviceManager::GetAllDeviceTypes();
+#else
+    LOG(WARNING) << string::Sprintf(
+        "Cannot use get_all_device_type because you have installed "
+        "CPU/GPU version PaddlePaddle.\n"
+        "If you want to use get_all_device_type, please try to install "
+        "CustomDevice version "
+        "PaddlePaddle by: pip install paddlepaddle-core\n");
+#endif
+    return device_types;
+  });
+  m.def("get_all_custom_device_type", []() {
+    std::vector<std::string> device_types;
+#ifdef PADDLE_WITH_CUSTOM_DEVICE
+    device_types = platform::DeviceManager::GetAllCustomDeviceTypes();
+#else
+    LOG(WARNING) << string::Sprintf(
+        "Cannot use get_all_custom_device_type because you have installed "
+        "CPU/GPU version PaddlePaddle.\n"
+        "If you want to use get_all_custom_device_type, please try to "
+        "install CustomDevice version "
+        "PaddlePaddle by: pip install paddlepaddle-core\n");
+#endif
+    return device_types;
+  });
+  m.def("get_available_device", [] {
+    std::vector<std::string> devices;
+#ifdef PADDLE_WITH_CUSTOM_DEVICE
+    devices = platform::DeviceManager::GetAllDeviceList();
+#else
+    LOG(WARNING) << string::Sprintf(
+        "Cannot use get_available_device because you have installed "
+        "CPU/GPU version PaddlePaddle.\n"
+        "If you want to use get_available_device, please try to install "
+        "CustomDevice version "
+        "PaddlePaddle by: pip install paddlepaddle-core\n");
+#endif
+    return devices;
+  });
+  m.def("get_available_custom_device", [] {
+    std::vector<std::string> devices;
+#ifdef PADDLE_WITH_CUSTOM_DEVICE
+    devices = platform::DeviceManager::GetAllCustomDeviceList();
+#else
+    LOG(WARNING) << string::Sprintf(
+        "Cannot use get_available_custom_device because you have installed "
+        "CPU/GPU version PaddlePaddle.\n"
+        "If you want to use get_available_custom_device, please try to "
+        "install CustomDevice version "
+        "PaddlePaddle by: pip install paddlepaddle-core\n");
+#endif
+    return devices;
+  });
+  py::class_<platform::CustomPlace>(m, "CustomPlace",
+                                    R"DOC(
+    CustomPlace is a descriptor of a device.
+    It represents a custom device on which a tensor will be allocated and a model will run.
+
+    Examples:
+        .. code-block:: python
+
+          import paddle
+          fake_cpu_place = paddle.CustomPlace("FakeCPU", 0)
+                                             )DOC")
+      .def("__init__",
+           [](platform::CustomPlace &self, const std::string &device_type,
+              int dev_id) {
+#ifdef PADDLE_WITH_CUSTOM_DEVICE
+             if (UNLIKELY(dev_id < 0)) {
+               LOG(ERROR) << string::Sprintf(
+                   "Invalid CustomPlace(%s, %d), device id must be 0 "
+                   "or a positive integer",
+                   device_type, dev_id);
+               std::exit(-1);
+             }
+
+             if (LIKELY(platform::DeviceManager::HasDeviceType(device_type) &&
+                        platform::DeviceManager::IsCustom(device_type))) {
+               int dev_count = static_cast<int>(
+                   platform::DeviceManager::GetDeviceCount(device_type));
+               if (UNLIKELY(dev_id >= dev_count)) {
+                 if (dev_count == 0) {
+                   LOG(ERROR) << "Cannot use " << device_type
+                              << " because there is no " << device_type
+                              << " detected on your "
+                                 "machine.";
+                   std::exit(-1);
+                 } else {
+                   LOG(ERROR) << string::Sprintf(
+                       "Invalid CustomPlace(%s, %d), dev_id must be "
+                       "inside [0, %d), because %s "
+                       "number on your machine is %d",
+                       device_type, dev_id, dev_count, device_type, dev_count);
+                   std::exit(-1);
+                 }
+               }
+               new (&self) platform::CustomPlace(device_type, dev_id);
+             } else {
+               LOG(ERROR) << string::Sprintf(
+                   "Invalid CustomPlace(%s, %d), the device type is "
+                   "not registered "
+                   "as a custom device.",
+                   device_type, dev_id);
+               std::exit(-1);
+             }
+#else
+             LOG(ERROR) << string::Sprintf(
+                 "Cannot use CustomDevice because you have installed CPU/GPU "
+                 "version PaddlePaddle.\n"
+                 "If you want to use CustomDevice, please try to install "
+                 "CustomDevice version "
+                 "PaddlePaddle by: pip install paddlepaddle-core\n"
+                 "If you only have CPU, please change "
+                 "CustomPlace(%s, %d) to be CPUPlace().\n",
+                 device_type, dev_id);
+             std::exit(-1);
+#endif
+           })
+      .def("get_device_id",
+           [](const platform::CustomPlace &self) { return self.GetDeviceId(); })
+      .def("get_device_type",
+           [](const platform::CustomPlace &self) {
+             return self.GetDeviceType();
+           })
+      .def("__repr__", string::to_string<const platform::CustomPlace &>)
+      .def("__str__", string::to_string<const platform::CustomPlace &>);
   py::class_<platform::CUDAPlace> cudaplace(m, "CUDAPlace", R"DOC(
 
     CUDAPlace is a descriptor of a device.
@@ -2118,11 +2252,16 @@ All parameter, weight, gradient are variables in Paddle.
            })
       .def("is_mlu_place",
            [](platform::Place &self) { return platform::is_mlu_place(self); })
+      .def(
+          "is_custom_place",
+          [](platform::Place &self) { return platform::is_custom_place(self); })
       .def("gpu_device_id", [](platform::Place &self) { return self.device; })
       .def("xpu_device_id", [](platform::Place &self) { return self.device; })
       .def("npu_device_id", [](platform::Place &self) { return self.device; })
       .def("ipu_device_id", [](platform::Place &self) { return self.device; })
       .def("mlu_device_id", [](platform::Place &self) { return self.device; })
+      .def("custom_device_id",
+           [](platform::Place &self) { return self.device; })
       .def("set_place", [](platform::Place &self,
                            const platform::Place &other) { self = other; })
       .def("set_place",
@@ -2154,6 +2293,10 @@ All parameter, weight, gradient are variables in Paddle.
           [](platform::Place &self, const platform::MLUPlace &mlu_place) {
             self = mlu_place;
           })
+      .def("set_place",
+           [](platform::Place &self, const platform::CustomPlace &plug_place) {
+             self = plug_place;
+           })
       .def("__repr__", string::to_string<const platform::Place &>)
       .def("__str__", string::to_string<const platform::Place &>);
diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h
index 9a11c5946f318b7e861b853d301e103e641d2722..f1983175bdf94fa6e9fcee49e6f85e7bdf6f4765 100644
--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
@@ -28,6 +28,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/concat_and_split.h"
 #include "paddle/fluid/operators/strided_memcpy.h"
 #include "paddle/fluid/platform/bfloat16.h"
+#include "paddle/fluid/platform/device/device_wrapper.h"
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
@@ -247,6 +248,13 @@ T TensorGetElement(const framework::Tensor &self, size_t offset) {
     auto p = self.place();
     paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T),
                          nullptr);
+#endif
+  } else if (platform::is_custom_place(self.place())) {
+#if defined(PADDLE_WITH_CUSTOM_DEVICE)
+    const T *a = self.data<T>();
+    auto p = self.place();
+    paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T),
+                         nullptr);
 #endif
   }
   VLOG(10) << "TensorGetElement, place: " << self.place()
@@ -289,6 +297,13 @@ void TensorSetElement(framework::Tensor *self, size_t offset, T elem) {
     T *a = self->mutable_data<T>(p);
     paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T),
                          nullptr);
+#endif
+  } else if (platform::is_custom_place(self->place())) {
+#if defined(PADDLE_WITH_CUSTOM_DEVICE)
+    auto p = self->place();
+    T *a = self->mutable_data<T>(p);
+    paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T),
+                         nullptr);
 #endif
   }
 }
@@ -368,6 +383,24 @@ void SetTensorFromPyArrayT(
     PADDLE_THROW(platform::errors::PermissionDenied(
        "Cannot use MLUPlace in CPU/GPU version, "
        "Please recompile or reinstall Paddle with MLU support."));
+#endif
+  } else if (paddle::platform::is_custom_place(place)) {
+#ifdef PADDLE_WITH_CUSTOM_DEVICE
+    platform::Place tmp_place = place;
+    platform::DeviceGuard guard(tmp_place);
+    auto dst = self->mutable_data<T>(place);
+
+    platform::DeviceManager::GetDeviceWithPlace(tmp_place)->MemoryCopyH2D(
+        reinterpret_cast<void *>(dst),
+        const_cast<void *>(reinterpret_cast<const void *>(array.data())),
+        array.nbytes());
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &ctx = *pool.Get(place);
+    ctx.Wait();
+#else
+    PADDLE_THROW(platform::errors::PermissionDenied(
" + "Please recompile or reinstall Paddle with CustomDevice support.")); #endif } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -757,6 +790,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, bool is_xpu_tensor = platform::is_xpu_place(tensor.place()); bool is_npu_tensor = platform::is_npu_place(tensor.place()); bool is_mlu_tensor = platform::is_mlu_place(tensor.place()); + bool is_custom_device_tensor = platform::is_custom_place(tensor.place()); const auto &tensor_dims = tensor.dims(); auto tensor_dtype = framework::TransToProtoVarType(tensor.dtype()); size_t sizeof_dtype = framework::SizeOfType(tensor_dtype); @@ -776,7 +810,8 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, std::string py_dtype_str = details::TensorDTypeToPyDTypeStr( framework::TransToProtoVarType(tensor.dtype())); - if (!is_gpu_tensor && !is_xpu_tensor && !is_npu_tensor && !is_mlu_tensor) { + if (!is_gpu_tensor && !is_xpu_tensor && !is_npu_tensor && !is_mlu_tensor && + !is_custom_device_tensor) { if (!need_deep_copy) { auto base = py::cast(std::move(tensor)); return py::array(py::dtype(py_dtype_str.c_str()), py_dims, py_strides, @@ -900,6 +935,34 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use MLUPlace in CPU/GPU/XPU/NPU version, " "Please recompile or reinstall Paddle with MLU support.")); +#endif + } else if (is_custom_device_tensor) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides); + PADDLE_ENFORCE_EQ(py_arr.writeable(), true, + platform::errors::InvalidArgument( + "PyArray is not writable, in which case memory leak " + "or double free would occur")); + PADDLE_ENFORCE_EQ( + py_arr.owndata(), true, + platform::errors::InvalidArgument( + "PyArray does not own data, in which case memory leak " + "or double free would occur")); + + size_t copy_bytes = sizeof_dtype * numel; + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &ctx = *pool.Get(tensor.place()); + paddle::memory::Copy( + platform::CPUPlace(), py_arr.mutable_data(), tensor.place(), + tensor_buf_ptr, copy_bytes, + reinterpret_cast(ctx).stream()); + ctx.Wait(); + return py_arr; +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Cannot use CustomPlace in CPU/GPU/XPU/NPU version, " + "Please recompile or reinstall Paddle with CustomPlace " + "support.")); #endif } PADDLE_THROW(platform::errors::Unimplemented("Place is not supported")); diff --git a/paddle/pten/common/place.cc b/paddle/pten/common/place.cc index e2cb934f0a1c5d5fb599bddcf44345f70ac688c2..0a3bfccb16a4b2aa83425ddc41ae141251842bac 100644 --- a/paddle/pten/common/place.cc +++ b/paddle/pten/common/place.cc @@ -16,6 +16,7 @@ limitations under the License. 
diff --git a/paddle/pten/common/place.h b/paddle/pten/common/place.h
index 75f1f4de9984c72200df68f1d55cf45ce7a58c98..6b7d1ea55d5c4159bd2d005518dd3631db7c05a7 100644
--- a/paddle/pten/common/place.h
+++ b/paddle/pten/common/place.h
@@ -28,29 +28,49 @@ enum class AllocationType : int8_t {
   NPUPINNED = 6,
   IPU = 7,
   MLU = 8,
+  CUSTOM = 9,
 };
 
 const char* AllocationTypeStr(AllocationType type);
 
+size_t GetOrRegisterGlobalDeviceTypeId(const std::string& device_type);
+std::string GetGlobalDeviceType(size_t device_type_id_);
+
 /// \brief The place is used to specify where the data is stored.
 class Place {
  public:
   Place() : device(0), alloc_type_(AllocationType::UNDEFINED) {}
 
-  explicit Place(AllocationType type, int8_t id)
-      : device(id), alloc_type_(type) {}
-
-  explicit Place(AllocationType type) : device(0), alloc_type_(type) {}
-
-  void Reset(AllocationType type, int8_t device_id = 0) noexcept {
+  explicit Place(AllocationType type,
+                 int8_t id,
+                 const std::string& dev_type = "")
+      : device(id),
+        alloc_type_(type),
+        device_type_id_(GetOrRegisterGlobalDeviceTypeId(dev_type)) {}
+
+  explicit Place(AllocationType type, const std::string& dev_type = "")
+      : device(0),
+        alloc_type_(type),
+        device_type_id_(GetOrRegisterGlobalDeviceTypeId(dev_type)) {}
+
+  void Reset(AllocationType type,
+             int8_t device_id = 0,
+             const std::string& dev_type = "") noexcept {
     alloc_type_ = type;
     device = device_id;
+    if (!dev_type.empty()) {
+      device_type_id_ = GetOrRegisterGlobalDeviceTypeId(dev_type);
+    }
   }
 
   AllocationType GetType() const { return alloc_type_; }
 
   int8_t GetDeviceId() const { return device; }
 
+  std::string GetDeviceType() const {
+    return GetGlobalDeviceType(device_type_id_);
+  }
+
   std::string DebugString() const;
 
   inline bool operator==(const Place& rhs) const {
@@ -62,6 +82,10 @@ class Place {
         alloc_type_ == AllocationType::NPUPINNED) {
       return true;
     }
+    if (alloc_type_ == AllocationType::CUSTOM) {
+      return device_type_id_ == rhs.device_type_id_ &&
+             device == rhs.GetDeviceId();
+    }
     return device == rhs.GetDeviceId();
   }
   inline bool operator!=(const Place& rhs) const { return !(*this == rhs); }
@@ -69,6 +93,10 @@
     if (alloc_type_ != rhs.GetType()) {
       return static_cast<int>(alloc_type_) < static_cast<int>(rhs.GetType());
     }
+    if (alloc_type_ == AllocationType::CUSTOM &&
+        device_type_id_ != rhs.device_type_id_) {
+      return device_type_id_ < rhs.device_type_id_;
+    }
     return device < rhs.GetDeviceId();
   }
 
@@ -79,6 +107,7 @@ class Place {
 
  private:
   AllocationType alloc_type_{AllocationType::UNDEFINED};
+  size_t device_type_id_;
 };
 
 class CPUPlace : public Place {
@@ -157,6 +186,22 @@ class MLUPlace : public Place {
       : Place(AllocationType::MLU, place.GetDeviceId()) {}
 };
 
+class CustomPlace : public Place {
+ public:
+  explicit CustomPlace(const std::string dev_type)
+      : Place(AllocationType::CUSTOM, 0, dev_type) {}
+  CustomPlace(const std::string dev_type, int device_id)
+      : Place(AllocationType::CUSTOM, device_id, dev_type) {}
+
+  CustomPlace(const CustomPlace&) = default;
+  CustomPlace(const Place& place) {  // NOLINT
+    if (place.GetType() == AllocationType::CUSTOM) {
+      this->Reset(
+          AllocationType::CUSTOM, place.GetDeviceId(), place.GetDeviceType());
+    }
+  }
+};
+
 std::ostream& operator<<(std::ostream&, const Place&);
 
 }  // namespace pten
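Because `operator==` and `operator<` now also compare the registered device-type id for CUSTOM places, two custom places are interchangeable only when both the type string and the device id agree. A small self-contained illustration (the device type names are hypothetical):

#include <cassert>
#include <iostream>
#include "paddle/pten/common/place.h"

int main() {
  // Type strings are registered on first use and mapped to stable ids.
  pten::CustomPlace a("FakeCPU", 0);
  pten::CustomPlace b("FakeCPU", 0);
  pten::CustomPlace c("FakeGPU", 0);

  assert(a == b);  // same device-type id and same device id
  assert(a != c);  // same device id, different registered type
  assert(a.GetDeviceType() == "FakeCPU");
  std::cout << a.DebugString() << std::endl;  // expected: Place(FakeCPU:0)
  return 0;
}
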
diff --git a/paddle/pten/kernels/funcs/math_function.cc b/paddle/pten/kernels/funcs/math_function.cc
index 780068e0381aa87221dadc4b79bb8edb2fdf3842..09717ee65e0452a8563b063dfb790821297800f3 100644
--- a/paddle/pten/kernels/funcs/math_function.cc
+++ b/paddle/pten/kernels/funcs/math_function.cc
@@ -215,6 +215,15 @@ void set_constant_with_place(
       paddle::platform::errors::Unimplemented("IPUPlace is not supported"));
 }
 
+template <>
+void set_constant_with_place<paddle::platform::CustomPlace>(
+    const paddle::platform::DeviceContext& context,
+    paddle::framework::Tensor* tensor,
+    float value) {
+  PADDLE_THROW(
+      paddle::platform::errors::Unimplemented("CustomPlace is not supported"));
+}
+
 template <>
 void set_constant_with_place<paddle::platform::CPUPlace>(
     const paddle::platform::DeviceContext& context,
diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
index 8ce9716b169b9c64b82d66b949609ff502775942..12d31aee41e394968d58753f2b54fcce8648a35e 100755
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -293,6 +293,7 @@ from .framework import CUDAPlace  # noqa: F401
 from .framework import NPUPlace  # noqa: F401
 from .framework import CUDAPinnedPlace  # noqa: F401
 from .framework import MLUPlace  # noqa: F401
+from .framework import CustomPlace  # noqa: F401
 
 from .autograd import grad  # noqa: F401
 from .autograd import no_grad  # noqa: F401
diff --git a/python/paddle/device/__init__.py b/python/paddle/device/__init__.py
index d102473fef791124e0605008dd1844507c3b4a61..89e0ae49fc48f73840129826952a01aec07dd3ab 100644
--- a/python/paddle/device/__init__.py
+++ b/python/paddle/device/__init__.py
@@ -36,7 +36,11 @@ __all__ = [  # noqa
     'is_compiled_with_cuda',
     'is_compiled_with_rocm',
     'is_compiled_with_npu',
-    'is_compiled_with_mlu'
+    'is_compiled_with_mlu',
+    'get_all_device_type',
+    'get_all_custom_device_type',
+    'get_available_device',
+    'get_available_custom_device',
 ]
 
 _cudnn_version = None
@@ -225,15 +229,26 @@ def _convert_to_place(device):
         selected_mlus = os.getenv("FLAGS_selected_mlus", "0").split(",")
         device_id = int(selected_mlus[0])
         place = core.MLUPlace(device_id)
+    elif device in core.get_all_custom_device_type():
+        place = core.CustomPlace(device, 0)
     else:
         avaliable_gpu_device = re.match(r'gpu:\d+', lower_device)
         avaliable_xpu_device = re.match(r'xpu:\d+', lower_device)
         avaliable_npu_device = re.match(r'npu:\d+', lower_device)
         avaliable_mlu_device = re.match(r'mlu:\d+', lower_device)
         if not avaliable_gpu_device and not avaliable_xpu_device and not avaliable_npu_device and not avaliable_mlu_device:
-            raise ValueError(
-                "The device must be a string which is like 'cpu', 'gpu', 'gpu:x', 'xpu', 'xpu:x', 'mlu', 'mlu:x', 'npu', 'npu:x' or ipu"
-            )
+            device_info_list = device.split(':', 1)
+            device_type = device_info_list[0]
+            if device_type in core.get_all_custom_device_type():
+                device_id = device_info_list[1]
+                device_id = int(device_id)
+                place = core.CustomPlace(device_type, device_id)
+            else:
+                raise ValueError(
+                    "The device must be a string which is like 'cpu', {}".
+                    format(', '.join("'{}', '{}:x'".format(x, x)
+                                     for x in ['gpu', 'xpu', 'npu', 'mlu'] +
+                                     core.get_all_custom_device_type())))
         if avaliable_gpu_device:
             if not core.is_compiled_with_cuda():
                 raise ValueError(
@@ -338,3 +353,103 @@ def get_device():
         raise ValueError("The device specification {} is invalid".format(place))
 
     return device
+
+
+def get_all_device_type():
+    """
+    Get all available device types.
+
+    Returns:
+        A list of all available device types.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            paddle.device.get_all_device_type()
+
+            # Case 1: paddlepaddle-cpu package installed, and no custom device registered.
+            # Output: ['cpu']
+
+            # Case 2: paddlepaddle-gpu package installed, and no custom device registered.
+            # Output: ['cpu', 'gpu']
+
+            # Case 3: paddlepaddle-cpu package installed, and custom device 'CustomCPU' is registered.
+            # Output: ['cpu', 'CustomCPU']
+
+            # Case 4: paddlepaddle-gpu package installed, and custom devices 'CustomCPU' and 'CustomGPU' are registered.
+            # Output: ['cpu', 'gpu', 'CustomCPU', 'CustomGPU']
+    """
+    return core.get_all_device_type()
+
+
+def get_all_custom_device_type():
+    """
+    Get all available custom device types.
+
+    Returns:
+        A list of all available custom device types.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            paddle.device.get_all_custom_device_type()
+
+            # Case 1: paddlepaddle-gpu package installed, and no custom device registered.
+            # Output: []
+
+            # Case 2: paddlepaddle-gpu package installed, and custom devices 'CustomCPU' and 'CustomGPU' are registered.
+            # Output: ['CustomCPU', 'CustomGPU']
+    """
+    return core.get_all_custom_device_type()
+
+
+def get_available_device():
+    """
+    Get all available devices.
+
+    Returns:
+        A list of all available devices.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            paddle.device.get_available_device()
+
+            # Case 1: paddlepaddle-cpu package installed, and no custom device registered.
+            # Output: ['cpu']
+
+            # Case 2: paddlepaddle-gpu package installed, and no custom device registered.
+            # Output: ['cpu', 'gpu:0', 'gpu:1']
+
+            # Case 3: paddlepaddle-cpu package installed, and custom device 'CustomCPU' is registered.
+            # Output: ['cpu', 'CustomCPU']
+
+            # Case 4: paddlepaddle-gpu package installed, and custom devices 'CustomCPU' and 'CustomGPU' are registered.
+            # Output: ['cpu', 'gpu:0', 'gpu:1', 'CustomCPU', 'CustomGPU:0', 'CustomGPU:1']
+    """
+    return core.get_available_device()
+
+
+def get_available_custom_device():
+    """
+    Get all available custom devices.
+
+    Returns:
+        A list of all available custom devices.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            paddle.device.get_available_custom_device()
+
+            # Case 1: paddlepaddle-gpu package installed, and no custom device registered.
+            # Output: []
+
+            # Case 2: paddlepaddle-gpu package installed, and custom devices 'CustomCPU' and 'CustomGPU' are registered.
+            # Output: ['CustomCPU', 'CustomGPU:0', 'CustomGPU:1']
+    """
+    return core.get_available_custom_device()
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index db6faa1a1b16578b95db4d81ab5bd66e5a003f75..997075590e5cf97241188b847c0c5b5036ecee59 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -71,7 +71,7 @@ from .param_attr import ParamAttr, WeightNormParamAttr
 from .data_feeder import DataFeeder
 from .core import LoDTensor, LoDTensorArray, Scope, _Scope
-from .core import CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, NPUPlace, IPUPlace, MLUPlace
+from .core import CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, NPUPlace, IPUPlace, MLUPlace, CustomPlace
 from .incubate import fleet
 from .transpiler import DistributeTranspiler, \
     memory_optimize, release_memory, DistributeTranspilerConfig
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index bb77f6031f7f99f85925cc805ee9b8ae57fc17df..b8854dfd2ad551d2fcb30fe8c7a490a7377f00dd 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -6918,7 +6918,7 @@ def _get_paddle_place(place):
         return place
     if isinstance(place, (core.Place, core.XPUPlace, core.CPUPlace,
                           core.CUDAPinnedPlace, core.CUDAPlace, core.NPUPlace,
-                          core.IPUPlace, core.MLUPlace)):
+                          core.IPUPlace, core.MLUPlace, core.CustomPlace)):
         return place
 
     if not isinstance(place, str):
diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py
index a0503322806e5825ca720740e93c07ecf6cb51fb..72e8e73ce7c2e51b9f7d1e38dba1098149ffcf89 100644
--- a/python/paddle/framework/__init__.py
+++ b/python/paddle/framework/__init__.py
@@ -29,6 +29,7 @@ from ..fluid.core import CUDAPlace  # noqa: F401
 from ..fluid.core import CUDAPinnedPlace  # noqa: F401
 from ..fluid.core import NPUPlace  # noqa: F401
 from ..fluid.core import MLUPlace  # noqa: F401
+from ..fluid.core import CustomPlace  # noqa: F401
 
 from ..fluid.core import VarBase  # noqa: F401
 from paddle.fluid import core  # noqa: F401
diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py
index cd1faf64f3ea5cdddadcaa85cd68520b255d1db4..c121d7b6b83ec6fbde5b50852293901db9d61686 100644
--- a/python/paddle/tensor/creation.py
+++ b/python/paddle/tensor/creation.py
@@ -106,9 +106,10 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True):
     if place is None:
         place = _current_expected_place()
     elif not isinstance(place, (core.Place, core.CPUPlace, core.CUDAPinnedPlace,
-                                core.CUDAPlace, core.NPUPlace, core.XPUPlace)):
+                                core.CUDAPlace, core.NPUPlace, core.XPUPlace,
+                                core.CustomPlace)):
         raise ValueError(
-            "'place' must be any of paddle.Place, paddle.CPUPlace, paddle.CUDAPinnedPlace, paddle.CUDAPlace, paddle.NPUPlace, paddle.XPUPlace"
+            "'place' must be any of paddle.Place, paddle.CPUPlace, paddle.CUDAPinnedPlace, paddle.CUDAPlace, paddle.NPUPlace, paddle.XPUPlace, paddle.CustomPlace"
         )
 
     #Todo(zhouwei): Support allocate tensor on any other specified card
diff --git a/python/setup.py.in b/python/setup.py.in
index 8f42beaf1c09b5e9d23946fb6436151590868072..9977ddeb26b17f6e69dbd49b782ff50490ab55a5 100755
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -579,7 +579,8 @@ headers = (
     list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/pten/core', recursive=True)) + # pten core headers
     # utila api headers
     ['@PADDLE_SOURCE_DIR@/paddle/utils/any.h'] +
-    ['@PADDLE_SOURCE_DIR@/paddle/utils/small_vector.h'])
+    ['@PADDLE_SOURCE_DIR@/paddle/utils/small_vector.h'] +
+    ['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/device/device_ext.h'])
 
 if '${WITH_MKLDNN}' == 'ON':
     headers += list(find_files('*', '${MKLDNN_INSTALL_DIR}/include')) # mkldnn
@@ -624,6 +625,8 @@ class InstallHeaders(Command):
         elif 'third_party' not in header:
             # paddle headers
             install_dir = re.sub('@PADDLE_SOURCE_DIR@/', '', header)
+            if 'device_ext.h' in header:
+                install_dir = "paddle/"
         else:
             # third_party
             install_dir = re.sub('${THIRD_PARTY_PATH}', 'third_party', header)