From 3e7825f375ce0a3e91d11979b883acfbfa7556f1 Mon Sep 17 00:00:00 2001 From: ronnywang <524019753@qq.com> Date: Tue, 15 Feb 2022 16:35:08 +0800 Subject: [PATCH] [PluggableDevice] Add custom runtime support (#38740) * [CustomRuntime] Add DeviceManager * [CustomRuntime] Add DeviceInterface * [CustomRuntime] Add Stream, Event, DeviceGuard, CallbackManager * [CustomRuntime] Add plug-in device * [CustomRuntime] Memory module support PluggableDevice * [CustomRuntime] Add WITH_PLUGGABLE_DEVICE cmake option * update * [API] update API doc based on comments, test=develop Co-authored-by: qili93 --- CMakeLists.txt | 5 + cmake/configure.cmake | 4 + paddle/fluid/framework/dlpack_tensor.cc | 5 + paddle/fluid/framework/executor.cc | 14 + paddle/fluid/framework/garbage_collector.cc | 53 ++ paddle/fluid/framework/garbage_collector.h | 41 ++ paddle/fluid/framework/op_kernel_type.cc | 12 +- paddle/fluid/framework/operator.cc | 10 + paddle/fluid/framework/parallel_executor.cc | 15 + paddle/fluid/framework/tensor_util.cc | 103 ++- paddle/fluid/framework/tensor_util.h | 47 ++ .../fluid/imperative/gradient_accumulator.cc | 15 +- paddle/fluid/imperative/prepared_operator.cc | 10 + paddle/fluid/imperative/tracer.cc | 20 + paddle/fluid/memory/allocation/CMakeLists.txt | 5 + .../memory/allocation/allocator_facade.cc | 53 +- .../memory/allocation/custom_allocator.cc | 63 ++ .../memory/allocation/custom_allocator.h | 42 ++ .../allocation/naive_best_fit_allocator.cc | 132 +++- paddle/fluid/memory/detail/buddy_allocator.cc | 38 +- paddle/fluid/memory/detail/buddy_allocator.h | 6 +- .../fluid/memory/detail/system_allocator.cc | 47 ++ paddle/fluid/memory/detail/system_allocator.h | 18 + paddle/fluid/memory/memcpy.cc | 285 ++++++-- paddle/fluid/memory/memcpy.h | 59 +- paddle/fluid/operators/math/math_function.cc | 313 ++++++++ paddle/fluid/platform/device/CMakeLists.txt | 23 + .../fluid/platform/device/callback_manager.cc | 52 ++ .../fluid/platform/device/callback_manager.h | 62 ++ .../platform/device/custom/CMakeLists.txt | 4 + .../platform/device/custom/custom_device.cc | 672 ++++++++++++++++++ .../device/custom/custom_device_test.cc | 193 +++++ .../platform/device/custom/enforce_custom.h | 56 ++ .../platform/device/custom/fake_cpu_device.h | 185 +++++ paddle/fluid/platform/device/device_base.cc | 249 +++++++ paddle/fluid/platform/device/device_base.h | 166 +++++ paddle/fluid/platform/device/device_ext.h | 497 +++++++++++++ paddle/fluid/platform/device/device_guard.cc | 22 + paddle/fluid/platform/device/device_guard.h | 48 ++ .../fluid/platform/device/device_manager.cc | 420 +++++++++++ paddle/fluid/platform/device/device_manager.h | 186 +++++ paddle/fluid/platform/device/device_wrapper.h | 9 + paddle/fluid/platform/device/event.cc | 64 ++ paddle/fluid/platform/device/event.h | 61 ++ paddle/fluid/platform/device/stream.cc | 96 +++ paddle/fluid/platform/device/stream.h | 79 ++ paddle/fluid/platform/device_context.cc | 28 + paddle/fluid/platform/device_context.h | 44 ++ paddle/fluid/platform/flags.cc | 5 +- paddle/fluid/platform/init.cc | 14 + paddle/fluid/platform/place.cc | 49 ++ paddle/fluid/platform/place.h | 20 + paddle/fluid/pybind/CMakeLists.txt | 2 +- paddle/fluid/pybind/imperative.cc | 34 +- paddle/fluid/pybind/pybind.cc | 143 ++++ paddle/fluid/pybind/tensor_py.h | 65 +- paddle/pten/common/place.cc | 26 +- paddle/pten/common/place.h | 57 +- paddle/pten/kernels/funcs/math_function.cc | 9 + python/paddle/__init__.py | 1 + python/paddle/device/__init__.py | 123 +++- python/paddle/fluid/__init__.py | 2 +- 
python/paddle/fluid/framework.py | 2 +- python/paddle/framework/__init__.py | 1 + python/paddle/tensor/creation.py | 5 +- python/setup.py.in | 5 +- 66 files changed, 5056 insertions(+), 138 deletions(-) create mode 100644 paddle/fluid/memory/allocation/custom_allocator.cc create mode 100644 paddle/fluid/memory/allocation/custom_allocator.h create mode 100644 paddle/fluid/operators/math/math_function.cc create mode 100644 paddle/fluid/platform/device/callback_manager.cc create mode 100644 paddle/fluid/platform/device/callback_manager.h create mode 100644 paddle/fluid/platform/device/custom/CMakeLists.txt create mode 100644 paddle/fluid/platform/device/custom/custom_device.cc create mode 100644 paddle/fluid/platform/device/custom/custom_device_test.cc create mode 100644 paddle/fluid/platform/device/custom/enforce_custom.h create mode 100644 paddle/fluid/platform/device/custom/fake_cpu_device.h create mode 100644 paddle/fluid/platform/device/device_base.cc create mode 100644 paddle/fluid/platform/device/device_base.h create mode 100644 paddle/fluid/platform/device/device_ext.h create mode 100644 paddle/fluid/platform/device/device_guard.cc create mode 100644 paddle/fluid/platform/device/device_guard.h create mode 100644 paddle/fluid/platform/device/device_manager.cc create mode 100644 paddle/fluid/platform/device/device_manager.h create mode 100644 paddle/fluid/platform/device/event.cc create mode 100644 paddle/fluid/platform/device/event.h create mode 100644 paddle/fluid/platform/device/stream.cc create mode 100644 paddle/fluid/platform/device/stream.h diff --git a/CMakeLists.txt b/CMakeLists.txt index e8321010d38..a4c1b9c8098 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -243,6 +243,7 @@ option(NEW_RELEASE_JIT "PaddlePaddle next-level release strategy for backup ji option(WITH_ASCEND_INT64 "Compile with int64 kernel for ascend NPU" OFF) option(WITH_POCKETFFT "Compile with pocketfft support" ON) option(WITH_RECORD_BUILDTIME "Compile PaddlePaddle with record all targets build time" OFF) +option(WITH_CUSTOM_DEVICE "Compile with custom device support" OFF) if(WITH_RECORD_BUILDTIME) set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CMAKE_CURRENT_SOURCE_DIR}/tools/get_build_time.sh") @@ -265,6 +266,10 @@ if(SANITIZER_TYPE AND NOT "${SANITIZER_TYPE}" MATCHES "^(Address|Leak|Memory|Thr return() endif() +if (LINUX AND NOT WITH_CUSTOM_DEVICE AND NOT ON_INFER) +set(WITH_CUSTOM_DEVICE ON) +endif() + if(WIN32) if(WITH_DISTRIBUTE) MESSAGE(WARNING diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 9ebde06bd01..20a35c91bdd 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -219,3 +219,7 @@ endif(ON_INFER) if(WITH_CRYPTO) add_definitions(-DPADDLE_WITH_CRYPTO) endif(WITH_CRYPTO) + +if(WITH_CUSTOM_DEVICE AND NOT WIN32) + add_definitions(-DPADDLE_WITH_CUSTOM_DEVICE) +endif() diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index 24f1591ff33..20d08ef18ae 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -100,6 +100,11 @@ struct DLDeviceVisitor : public boost::static_visitor<::DLDevice> { platform::errors::Unimplemented("platform::MLUPlace is not supported")); } + inline ::DLDevice operator()(const platform::CustomPlace &place) const { + PADDLE_THROW(platform::errors::Unimplemented( + "platform::CustomPlace is not supported")); + } + inline ::DLDevice operator()(const platform::CUDAPlace &place) const { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) ::DLDevice device; 
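The hunks that follow repeat a single pattern at every place-dispatch site: a platform::CustomPlace branch, compiled only under PADDLE_WITH_CUSTOM_DEVICE, that either forwards to platform::DeviceManager or throws Unimplemented. A minimal sketch of that shape (function name hypothetical, not a hunk from this patch):

void ExampleSetDevice(const platform::Place &place) {
  if (platform::is_custom_place(place)) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
    // Forward to whichever runtime plugin registered this device type.
    platform::DeviceManager::SetDevice(place);
#else
    PADDLE_THROW(platform::errors::Unimplemented(
        "CustomPlace is not supported; recompile with WITH_CUSTOM_DEVICE=ON."));
#endif
  }
}

OperatorBase::Run and Tracer::TraceOp below take exactly this shape; visitors such as DLDeviceVisitor above keep only the throwing half.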
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 5596aba5213..4e6a4d53608 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -494,6 +494,20 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx, #else PADDLE_THROW( platform::errors::Unimplemented("No MLU gc found in CPU/MLU paddle")); +#endif + } else if (platform::is_custom_place(place_)) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (IsFastEagerDeletionModeEnabled()) { + VLOG(4) << "Use unsafe fast gc for " << place_ << "."; + gc.reset(new CustomDeviceUnsafeFastGarbageCollector(place_, + max_memory_size)); + } else { + VLOG(4) << "Use default stream gc for " << place_ << "."; + gc.reset( + new CustomDefaultStreamGarbageCollector(place_, max_memory_size)); + } +#else + PADDLE_THROW(platform::errors::Unimplemented("No CustomDevice gc found")); #endif } } diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index 22f77be8505..9f2bdeffecf 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -18,6 +18,7 @@ #endif #include "gflags/gflags.h" #include "paddle/fluid/framework/garbage_collector.h" +#include "paddle/fluid/platform/device/device_wrapper.h" DECLARE_double(eager_delete_tensor_gb); DECLARE_double(memory_fraction_of_eager_deletion); @@ -202,6 +203,58 @@ void MLUStreamGarbageCollector::ClearCallback( } #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE +CustomDefaultStreamGarbageCollector::CustomDefaultStreamGarbageCollector( + const platform::CustomPlace &place, size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} + +void CustomDefaultStreamGarbageCollector::Wait() const { + static_cast(this->dev_ctx_) + ->WaitStreamCallback(); +} + +void CustomDefaultStreamGarbageCollector::ClearCallback( + const std::function &callback) { + static_cast(this->dev_ctx_) + ->AddStreamCallback(callback); +} + +CustomDeviceUnsafeFastGarbageCollector::CustomDeviceUnsafeFastGarbageCollector( + const platform::CustomPlace &place, size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} + +void CustomDeviceUnsafeFastGarbageCollector::ClearCallback( + const std::function &callback) { + callback(); +} + +CustomStreamGarbageCollector::CustomStreamGarbageCollector( + const platform::CustomPlace &place, size_t max_memory_size) + : GarbageCollector(place, max_memory_size) { + platform::DeviceGuard guard(place); + stream_.reset(new platform::stream::Stream); + stream_->Init(place); + callback_manager_.reset(new platform::CallbackManager(stream_.get())); +} + +CustomStreamGarbageCollector::~CustomStreamGarbageCollector() { + platform::DeviceGuard guard(this->dev_ctx_->GetPlace()); + stream_->Synchronize(); + stream_->Destroy(); +} + +platform::stream::Stream *CustomStreamGarbageCollector::stream() const { + return stream_.get(); +} + +void CustomStreamGarbageCollector::Wait() const { callback_manager_->Wait(); } + +void CustomStreamGarbageCollector::ClearCallback( + const std::function &callback) { + callback_manager_->AddCallback(callback); +} +#endif + int64_t GetEagerDeletionThreshold() { return FLAGS_eager_delete_tensor_gb < 0 ? 
-1 diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index f5d79d864b5..a67860c6087 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -200,6 +200,47 @@ class MLUStreamGarbageCollector : public GarbageCollector { }; #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE +class CustomDefaultStreamGarbageCollector : public GarbageCollector { + public: + CustomDefaultStreamGarbageCollector(const platform::CustomPlace &place, + size_t max_memory_size); + + void Wait() const override; + + protected: + void ClearCallback(const std::function &callback) override; +}; + +class CustomDeviceUnsafeFastGarbageCollector : public GarbageCollector { + public: + CustomDeviceUnsafeFastGarbageCollector(const platform::CustomPlace &place, + size_t max_memory_size); + + protected: + void ClearCallback(const std::function &callback) override; +}; + +class CustomStreamGarbageCollector : public GarbageCollector { + public: + CustomStreamGarbageCollector(const platform::CustomPlace &place, + size_t max_memory_size); + + ~CustomStreamGarbageCollector(); + + void Wait() const override; + + platform::stream::Stream *stream() const; + + protected: + void ClearCallback(const std::function &callback) override; + + private: + std::unique_ptr stream_; + std::unique_ptr callback_manager_; +}; +#endif + template void GarbageCollector::Add(Container &&objs) { Add(std::forward(objs), []() {}); diff --git a/paddle/fluid/framework/op_kernel_type.cc b/paddle/fluid/framework/op_kernel_type.cc index 7dac6a092d2..9d1f0986998 100644 --- a/paddle/fluid/framework/op_kernel_type.cc +++ b/paddle/fluid/framework/op_kernel_type.cc @@ -47,10 +47,20 @@ size_t OpKernelType::Hash::operator()(const OpKernelType& key) const { "Too many OpKernel attribute values, expected maximum " "value is 64, received value is %d.", cur_loc)); - +#ifdef PADDLE_WITH_CUSTOM_DEVICE + std::hash hasher; + size_t seed = + hasher(place + data_type + data_layout + library_type + customized_value); + if (platform::is_custom_place(key.place_)) { + seed ^= std::hash{}(key.place_.GetDeviceType()) + 0x9e3779b9 + + (seed << 6) + (seed >> 2) + 4; + } + return seed; +#else std::hash hasher; return hasher(place + data_type + data_layout + library_type + customized_value); +#endif } bool OpKernelType::operator==(const OpKernelType& o) const { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index b6d8ca4aa67..7c13fa90f9b 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -29,6 +29,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/transfer_scope_cache.h" #include "paddle/fluid/framework/unused_var_check.h" #include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/pten/common/scalar.h" @@ -244,6 +245,15 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { #else auto dev_id = place.device; platform::SetMLUDeviceId(dev_id); +#endif + } else if (platform::is_custom_place(place)) { +#ifndef PADDLE_WITH_CUSTOM_DEVICE + PADDLE_THROW(platform::errors::Unavailable( + "Cannot run operator on place %s, please recompile paddle or " + "reinstall Paddle with CustomDevice support.", + place)); +#else + platform::DeviceManager::SetDevice(place); #endif } diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index aed5e2c7405..1a826f6bdd5 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -532,6 +532,21 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { PADDLE_THROW(platform::errors::PermissionDenied( "Paddle can't use XPU device since it's not compiled with XPU," "Please recompile or reinstall Paddle with XPU support.")); +#endif + } else if (platform::is_custom_place(place)) { +#if defined(PADDLE_WITH_CUSTOM_DEVICE) + if (IsFastEagerDeletionModeEnabled()) { + gc.reset( + new CustomDeviceUnsafeFastGarbageCollector(place, max_memory_size)); + } else { + gc.reset(new CustomStreamGarbageCollector(place, max_memory_size)); + } + VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't use custom device since it's not compiled with " + "CustomDevice," + "Please recompile or reinstall Paddle with CustomDevice support.")); #endif } else if (platform::is_cpu_place(place)) { gc.reset(new CPUGarbageCollector(place, max_memory_size)); diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 844b5d82695..e510257c610 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -91,7 +91,29 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } #endif - +#ifdef PADDLE_WITH_CUSTOM_DEVICE + else if (platform::is_custom_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { + auto stream = + reinterpret_cast(ctx).stream(); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream); + } else if (platform::is_cpu_place(src_place) && // NOLINT + platform::is_custom_place(dst_place)) { + auto stream = + reinterpret_cast(ctx).stream(); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream); + } else if (platform::is_custom_place(src_place) && // NOLINT + platform::is_custom_place(dst_place)) { + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data async from " << src_place << " to " + << dst_place; + return; + } + auto stream = + reinterpret_cast(ctx).stream(); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream); + } +#endif #ifdef PADDLE_WITH_XPU else if (platform::is_xpu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { @@ -376,7 +398,8 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); 
const platform::DeviceContext* dev_ctx; if (platform::is_gpu_place(dst_place) || platform::is_npu_place(dst_place) || - platform::is_mlu_place(dst_place)) { + platform::is_mlu_place(dst_place) || + platform::is_custom_place(dst_place)) { dev_ctx = pool.Get(dst_place); } else { dev_ctx = pool.Get(src.place()); @@ -436,6 +459,26 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, "Copy from %s to %s is not supported.", src_place, dst_place)); } #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + else if (platform::is_custom_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { /* custom_device -> cpu*/ + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); + } + else if (platform::is_cpu_place(src_place) && // NOLINT + platform::is_custom_place(dst_place)) { /* cpu -> custom_device*/ + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); + } + else if (platform::is_custom_place(src_place) && // NOLINT + platform::is_custom_place( + dst_place)) { /* custom_device -> custom_device*/ + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data sync from " << src_place << " to " + << dst_place; + return; + } + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); + } +#endif #ifdef PADDLE_WITH_XPU else if (platform::is_xpu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { @@ -664,6 +707,13 @@ class AnyVisitor : public boost::static_visitor { const platform::CUDAPinnedPlace& cpu) const { return *out.data(); } + + bool GetResult(const framework::Tensor& out, + const platform::CustomPlace& custom_dev) const { + PADDLE_THROW(platform::errors::Unimplemented("Not supported on place (%s) ", + custom_dev)); + return false; + } }; template @@ -903,6 +953,11 @@ struct BothFalseVisitor : public boost::static_visitor<> { out_ptr[i] = lhs && rhs; } } + + void VisitorImpl(const platform::CustomPlace& custom_dev) const { + PADDLE_THROW( + platform::errors::Unimplemented("CustomPlace is not supported")); + } }; void TensorIsfinite(const framework::Tensor& tensor, framework::Tensor* out) { @@ -1036,6 +1091,29 @@ void TensorToStream(std::ostream& os, const Tensor& tensor, #else PADDLE_THROW(platform::errors::Unimplemented( "NPUPlace is not supported when not compiled with NPU")); +#endif + } else if (platform::is_custom_place(tensor.place())) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB + std::unique_ptr buf(new char[kBufSize]); + auto& custom_device_context = + static_cast(dev_ctx); + platform::CPUPlace cpu; + uintptr_t data = reinterpret_cast(data_ptr); + while (size != 0) { + size_t size_to_write = std::min(kBufSize, static_cast(size)); + memory::Copy(cpu, buf.get(), tensor.place(), + reinterpret_cast(data), size_to_write, + custom_device_context.stream()); + custom_device_context.Wait(); + os.write(buf.get(), size_to_write); + data += size_to_write; + size -= size_to_write; + } +#else + PADDLE_THROW(platform::errors::Unimplemented( + "CustomPlace is not supported when not compiled with " + "CustomDevice")); #endif } else { os.write(static_cast(data_ptr), @@ -1093,10 +1171,11 @@ void TensorFromStream(std::istream& is, Tensor* tensor, if (platform::is_gpu_place(dev_ctx.GetPlace()) || platform::is_xpu_place(dev_ctx.GetPlace()) || platform::is_mlu_place(dev_ctx.GetPlace()) || - platform::is_npu_place(dev_ctx.GetPlace())) { + platform::is_npu_place(dev_ctx.GetPlace()) || + platform::is_custom_place(dev_ctx.GetPlace())) { #if defined(PADDLE_WITH_CUDA) || 
defined(PADDLE_WITH_HIP) || \ defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \ - defined(PADDLE_WITH_ASCEND_CL) + defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CUSTOM_DEVICE) Tensor cpu_tensor; cpu_tensor.Resize(framework::make_ddim(shape)); framework::VisitDataType( @@ -1105,7 +1184,8 @@ void TensorFromStream(std::istream& is, Tensor* tensor, is.read(static_cast(buf), size); auto dst_place = dev_ctx.GetPlace(); framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor); - if (platform::is_npu_place(dev_ctx.GetPlace())) { + if (platform::is_npu_place(dev_ctx.GetPlace()) || + platform::is_custom_place(dev_ctx.GetPlace())) { dev_ctx.Wait(); } #else @@ -1163,10 +1243,11 @@ void TensorFromStream(std::istream& is, Tensor* tensor, if (platform::is_gpu_place(dev_ctx.GetPlace()) || platform::is_xpu_place(dev_ctx.GetPlace()) || platform::is_mlu_place(dev_ctx.GetPlace()) || - platform::is_npu_place(dev_ctx.GetPlace())) { + platform::is_npu_place(dev_ctx.GetPlace()) || + platform::is_custom_place(dev_ctx.GetPlace())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \ - defined(PADDLE_WITH_ASCEND_CL) + defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CUSTOM_DEVICE) Tensor cpu_tensor; cpu_tensor.Resize(framework::make_ddim(dims)); framework::VisitDataType( @@ -1175,7 +1256,8 @@ void TensorFromStream(std::istream& is, Tensor* tensor, is.read(static_cast(buf), size); auto dst_place = dev_ctx.GetPlace(); framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor); - if (platform::is_npu_place(dev_ctx.GetPlace())) { + if (platform::is_npu_place(dev_ctx.GetPlace()) || + platform::is_custom_place(dev_ctx.GetPlace())) { dev_ctx.Wait(); } #else @@ -1188,9 +1270,12 @@ void TensorFromStream(std::istream& is, Tensor* tensor, } else if (platform::is_mlu_place(dev_ctx.GetPlace())) { PADDLE_THROW(platform::errors::Unimplemented( "MLUPlace is not supported when not compiled with MLU")); - } else { + } else if (platform::is_npu_place(dev_ctx.GetPlace())) { PADDLE_THROW(platform::errors::Unimplemented( "NPUPlace is not supported when not compiled with NPU")); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "CustomPlace is not supported when not compiled with CustomDevice")); } #endif } else { diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index bcaf3c719cb..1c1a86f1d32 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -180,6 +180,17 @@ void TensorFromArray(const T* src, const size_t& array_size, reinterpret_cast(ctx).stream()); } #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + else if (platform::is_custom_place(dst_place)) { // NOLINT + memory::Copy( + dst_place, dst_ptr, src_place, src_ptr, size, + reinterpret_cast(ctx).stream()); + } +#endif + else { // NOLINT + PADDLE_THROW(platform::errors::Unimplemented( + "TensorFromArray on %s is not supported.", dst_place)); + } } template @@ -241,6 +252,17 @@ void TensorFromVector(const std::vector& src, reinterpret_cast(ctx).stream()); } #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + else if (platform::is_custom_place(dst_place)) { // NOLINT + memory::Copy( + dst_place, dst_ptr, src_place, src_ptr, size, + reinterpret_cast(ctx).stream()); + } +#endif + else { // NOLINT + PADDLE_THROW(platform::errors::Unimplemented( + "TensorFromVector on %s is not supported.", dst_place)); + } } // The fully specialized function should be inline to avoid @@ -300,6 +322,17 @@ inline void
TensorFromVector(const std::vector& src, reinterpret_cast(ctx).stream()); } #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + else if (platform::is_custom_place(dst_place)) { // NOLINT + auto stream = + reinterpret_cast(ctx).stream(); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream); + } +#endif + else { // NOLINT + PADDLE_THROW(platform::errors::Unimplemented( + "TensorFromVector on %s is not supported.", dst_place)); + } delete[] array; } @@ -369,6 +402,15 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx, reinterpret_cast(ctx).stream()); } #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + else if (platform::is_custom_place(src.place())) { // NOLINT + memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr); + } +#endif + else { // NOLINT + PADDLE_THROW(platform::errors::Unimplemented( + "TensorToVector on %s is not supported.", src.place())); + } } template <> @@ -410,6 +452,11 @@ inline void TensorToVector(const Tensor& src, dst_place, dst_ptr, src.place(), src_ptr, size, reinterpret_cast(ctx).stream()); } +#endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + else if (platform::is_custom_place(src.place())) { // NOLINT + memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr); + } +#endif for (unsigned int i = 0; i < src.numel(); i++) { (*dst)[i] = static_cast(array[i]); diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 4a71dd4deac..dc8b3982ba9 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -180,6 +180,12 @@ class TensorAddFunctor : public boost::static_visitor<> { "is not supported in imperative mode", place)); } + void operator()(const platform::CustomPlace& place) const { + PADDLE_THROW(platform::errors::PermissionDenied( + "Gradient accumulation on place (%s) " + "is not supported in imperative mode", + place)); + } private: int64_t numel_; @@ -331,7 +337,14 @@ void TensorAdd(const VarType& src, VarType* dst) { return; } #endif - +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (platform::is_custom_place(place)) { + PADDLE_THROW(platform::errors::Unimplemented( + "Gradient accumulation of data type (%s) on place (%s) is not " + "supported in imperative mode", + framework::DataTypeToString(data_type), place)); + } +#endif #ifdef PADDLE_WITH_XPU if (platform::is_xpu_place(place)) { if (data_type == framework::DataTypeTrait::DataType()) { diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index c8ff561f7af..ae7d0807530 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -278,6 +278,16 @@ PreparedOp PrepareImpl(const NameVarMap& ins, expected_kernel_key.place_ = platform::CPUPlace(); kernel_iter = kernels.find(expected_kernel_key); } +#endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (kernel_iter == kernels.end() && + paddle::platform::is_custom_place(expected_kernel_key.place_)) { + VLOG(3) << "missing " << place.GetDeviceType() << " kernel: " << op.Type() + << ", expected_kernel_key:" << expected_kernel_key + << ", falling back to CPU one!"; + expected_kernel_key.place_ = platform::CPUPlace(); + kernel_iter = kernels.find(expected_kernel_key); + } #endif // TODO(jiabin): Add operator.cc's line 1000 part back when we need that // case diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 81cd39c225b..c2dd761c23c 100644 --- a/paddle/fluid/imperative/tracer.cc +++
b/paddle/fluid/imperative/tracer.cc @@ -20,6 +20,7 @@ #include "paddle/fluid/imperative/amp_auto_cast.h" #include "paddle/fluid/imperative/op_base.h" #include "paddle/fluid/platform/denormal.h" +#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/string/string_helper.h" @@ -138,6 +139,17 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( PADDLE_THROW(platform::errors::PermissionDenied( "Paddle can't use MLU device since it's not compiled with MLU," "Please recompile or reinstall Paddle with MLU support.")); +#endif + } else if (platform::is_custom_place(place)) { +#if defined(PADDLE_WITH_CUSTOM_DEVICE) + gc.reset(new framework::CustomDefaultStreamGarbageCollector(place, 0)); + VLOG(10) << "Created GarbageCollector at " << place; +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't use CustomDevice since it's not compiled with " + "CustomDevice," + "Please recompile or reinstall Paddle with CustomDevice " + "support.")); #endif } else { PADDLE_THROW(platform::errors::PreconditionNotMet( @@ -222,6 +234,14 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap& ins, #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should compile with MLU if use MLUPlace.")); +#endif + } else if (platform::is_custom_place(place)) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + platform::DeviceManager::SetDevice(place); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with CustomDevice if use " + "CustomPlace.")); #endif } if (!override_default_attr_map) { diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index b899ddbcd5a..6cd7d873323 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -58,6 +58,11 @@ else () set(AllocatorFacadeDeps) endif() +if (WITH_CUSTOM_DEVICE) + cc_library(custom_allocator SRCS custom_allocator.cc DEPS allocator device_manager) + set(AllocatorFacadeDeps ${AllocatorFacadeDeps} custom_allocator) +endif() + if (WITH_GPU) nv_test(best_fit_allocator_test SRCS best_fit_allocator_test.cc diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 35131446d86..fc34a64d626 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -62,6 +62,11 @@ #include "paddle/fluid/platform/device/mlu/mlu_info.h" #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE +#include "paddle/fluid/memory/allocation/custom_allocator.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#endif + PADDLE_DEFINE_EXPORTED_int64( gpu_allocator_retry_time, 10000, "The retry time (milliseconds) when allocator fails " @@ -186,6 +191,17 @@ class AllocatorFacadePrivate { for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) { InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id)); } +#endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + auto device_types = platform::DeviceManager::GetAllCustomDeviceTypes(); + for (const auto& dev_type : device_types) { + for (size_t dev_id = 0; + dev_id < platform::DeviceManager::GetDeviceCount(dev_type); + ++dev_id) { + InitNaiveBestFitCustomDeviceAllocator( + platform::CustomPlace(dev_type, dev_id)); + } + } #endif break; } @@ -222,6 +238,17 @@ class AllocatorFacadePrivate { for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) { 
InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id)); } +#endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + auto device_types = platform::DeviceManager::GetAllCustomDeviceTypes(); + for (const auto& dev_type : device_types) { + for (size_t dev_id = 0; + dev_id < platform::DeviceManager::GetDeviceCount(dev_type); + ++dev_id) { + InitAutoGrowthCustomDeviceAllocator( + platform::CustomPlace(dev_type, dev_id), allow_free_idle_chunk); + } + } #endif break; } @@ -700,6 +727,21 @@ class AllocatorFacadePrivate { } #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + void InitNaiveBestFitCustomDeviceAllocator(platform::CustomPlace p) { + allocators_[p] = std::make_shared(p); + } + + void InitAutoGrowthCustomDeviceAllocator(platform::CustomPlace p, + bool allow_free_idle_chunk) { + auto custom_allocator = + std::make_shared(p); + allocators_[p] = std::make_shared( + custom_allocator, platform::DeviceManager::GetMinChunkSize(p), + allow_free_idle_chunk); + } +#endif + void InitSystemAllocators() { if (!system_allocators_.empty()) return; system_allocators_[platform::CPUPlace()] = std::make_shared(); @@ -770,6 +812,16 @@ class AllocatorFacadePrivate { places.emplace_back(platform::MLUPlace(dev_id)); } #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + auto device_types = platform::DeviceManager::GetAllCustomDeviceTypes(); + for (const auto& dev_type : device_types) { + for (size_t dev_id = 0; + dev_id < platform::DeviceManager::GetDeviceCount(dev_type); + dev_id++) { + places.emplace_back(platform::CustomPlace(dev_type, dev_id)); + } + } +#endif for (auto& p : places) { zero_size_allocators_[p] = std::make_shared(p); @@ -1005,7 +1057,6 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size, "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); } #endif - platform::CUDAPlace p(place.GetDeviceId()); if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) { return m_->GetAllocator(p, stream, /* create_if_not_found = */ true) diff --git a/paddle/fluid/memory/allocation/custom_allocator.cc b/paddle/fluid/memory/allocation/custom_allocator.cc new file mode 100644 index 00000000000..eb035ea5e3a --- /dev/null +++ b/paddle/fluid/memory/allocation/custom_allocator.cc @@ -0,0 +1,63 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/allocation/custom_allocator.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace memory { +namespace allocation { + +bool CustomAllocator::IsAllocThreadSafe() const { return true; } +void CustomAllocator::FreeImpl(pten::Allocation* allocation) { + PADDLE_ENFORCE_EQ( + allocation->place(), place_, + platform::errors::PermissionDenied("CustomDevice memory is " + "freed in incorrect device. 
" + "This may be a bug")); + + delete allocation; +} + +pten::Allocation* CustomAllocator::AllocateImpl(size_t size) { + std::call_once(once_flag_, + [this] { platform::DeviceManager::SetDevice(place_); }); + + void* ptr = + platform::DeviceManager::GetDeviceWithPlace(place_)->MemoryAllocate(size); + if (LIKELY(ptr)) { + return new Allocation(ptr, size, place_); + } + + size_t avail, total; + platform::DeviceManager::MemoryStats(place_, &total, &avail); + + auto dev_type = platform::PlaceHelper::GetDeviceType(place_); + auto dev_id = platform::PlaceHelper::GetDeviceId(place_); + + PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( + "\n\nOut of memory error on %s:%d. " + "Cannot allocate %s memory on %s:%d, " + "available memory is only %s.\n\n" + "Please check whether there is any other process using %s:%d.\n" + "1. If yes, please stop them, or start PaddlePaddle on another %s.\n" + "2. If no, please decrease the batch size of your model.\n\n", + dev_type, dev_id, string::HumanReadableSize(size), dev_type, dev_id, + string::HumanReadableSize(avail), dev_type, dev_id, dev_type)); +} + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/custom_allocator.h b/paddle/fluid/memory/allocation/custom_allocator.h new file mode 100644 index 00000000000..708c105a850 --- /dev/null +++ b/paddle/fluid/memory/allocation/custom_allocator.h @@ -0,0 +1,42 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include // NOLINT +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace memory { +namespace allocation { + +class CustomAllocator : public Allocator { + public: + explicit CustomAllocator(const platform::CustomPlace& place) + : place_(place) {} + + bool IsAllocThreadSafe() const override; + + protected: + void FreeImpl(pten::Allocation* allocation) override; + pten::Allocation* AllocateImpl(size_t size) override; + + private: + platform::Place place_; + std::once_flag once_flag_; +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 91358b68804..b63f872141c 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -20,6 +20,7 @@ #include "glog/logging.h" #include "paddle/fluid/memory/detail/buddy_allocator.h" #include "paddle/fluid/memory/detail/system_allocator.h" +#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler.h" @@ -30,7 +31,6 @@ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" #endif -#include "paddle/fluid/platform/device/device_wrapper.h" PADDLE_DEFINE_EXPORTED_bool( init_allocated_mem, false, @@ -733,6 +733,136 @@ uint64_t Release(const platform::MLUPlace &place) { #endif } +// For CustomDevice +#ifdef PADDLE_WITH_CUSTOM_DEVICE +class BuddyAllocatorList { + private: + explicit BuddyAllocatorList(const std::string &device_type) + : device_type_(device_type) { + auto devices = platform::DeviceManager::GetDeviceList(device_type); + for (auto dev_id : devices) { + init_flags_[dev_id].reset(new std::once_flag()); + } + } + + static BuddyAllocatorList *CreateNewInstance(const std::string &device_type) { + return new BuddyAllocatorList(device_type); + } + + public: + static BuddyAllocatorList *Instance(const std::string &device_type) { + // DeviceType -> AllocatorList + static std::unordered_map pool; + if (pool.find(device_type) == pool.end()) { + pool[device_type] = CreateNewInstance(device_type); + } + return pool[device_type]; + } + + BuddyAllocator *Get(int dev_id) { + PADDLE_ENFORCE_NE(init_flags_.find(dev_id), init_flags_.end(), + platform::errors::OutOfRange( + "Cannot find %s %d, please check visible devices.", + device_type_, dev_id)); + + std::call_once(*init_flags_[dev_id], [this, dev_id] { + platform::DeviceManager::SetDevice(device_type_, dev_id); + platform::CustomPlace place(device_type_, dev_id); + + allocators_[dev_id].reset(new BuddyAllocator( + std::unique_ptr( + new detail::CustomAllocator(device_type_, dev_id)), + platform::DeviceManager::GetMinChunkSize(place), + platform::DeviceManager::GetMaxChunkSize(place), + platform::DeviceManager::GetExtraPaddingSize(place), device_type_)); + }); + + return allocators_[dev_id].get(); + } + + private: + std::string device_type_; + std::unordered_map> init_flags_; + std::unordered_map> allocators_; +}; + +BuddyAllocator *GetBuddyAllocator(const platform::Place &place) { + VLOG(10) << "GetBuddyAllocator place = " << place; + if (platform::is_custom_place(place)) { + return BuddyAllocatorList::Instance( + platform::PlaceHelper::GetDeviceType(place)) + 
->Get(platform::PlaceHelper::GetDeviceId(place)); } else { PADDLE_THROW( platform::errors::InvalidArgument("place must be CustomPlace")); } } +#endif + +template <> +void *Alloc(const platform::CustomPlace &place, + size_t size) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); + auto *buddy_allocator = GetBuddyAllocator(place); + auto *ptr = buddy_allocator->Alloc(size); + + if (ptr == nullptr) { + platform::DeviceGuard guard(place); + size_t avail, total; + platform::DeviceManager::MemoryStats(place, &total, &avail); + PADDLE_THROW(platform::errors::ResourceExhausted( + "Cannot allocate %s in %s:%d, available %s, total %s, used " + "%s. ", + string::HumanReadableSize(size), place.GetDeviceType(), place.device, + string::HumanReadableSize(avail), string::HumanReadableSize(total), + string::HumanReadableSize(total - avail))); + } else { + if (FLAGS_init_allocated_mem) { + platform::DeviceManager::GetDeviceWithPlace(place)->MemorySet(ptr, 0xEF, + size); + } + } + VLOG(10) << " pointer=" << ptr; + return ptr; +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'CustomPlace' is not supported in CPU only device.")); +#endif +} + +template <> +void Free(const platform::CustomPlace &place, void *p, + size_t size) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); + GetBuddyAllocator(place)->Free(p); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'CustomPlace' is not supported in CPU only device.")); +#endif +} + +template <> +uint64_t Release(const platform::CustomPlace &place) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + return GetBuddyAllocator(place)->Release(); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'CustomPlace' is not supported in CPU only device.")); +#endif +} + +template <> +size_t Used(const platform::CustomPlace &place) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + return GetBuddyAllocator(place)->Used(); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'CustomPlace' is not supported in CPU only device.")); +#endif +} + struct AllocVisitor : public boost::static_visitor { inline explicit AllocVisitor(size_t size) : size_(size) {} diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc index b02fb6642be..d7bbfba932c 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.cc +++ b/paddle/fluid/memory/detail/buddy_allocator.cc @@ -25,9 +25,7 @@ limitations under the License.
*/ DECLARE_uint64(reallocate_gpu_memory_in_mb); #endif -#ifdef PADDLE_WITH_MLU -#include "paddle/fluid/platform/device/mlu/mlu_info.h" -#endif +#include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { namespace memory { @@ -35,12 +33,37 @@ namespace detail { BuddyAllocator::BuddyAllocator( std::unique_ptr system_allocator, size_t min_chunk_size, - size_t max_chunk_size, size_t extra_padding_size) + size_t max_chunk_size, size_t extra_padding_size, + const std::string dev_type) : min_chunk_size_(min_chunk_size), max_chunk_size_(max_chunk_size), extra_padding_size_(extra_padding_size), cache_(system_allocator->UseGpu()), - system_allocator_(std::move(system_allocator)) {} + system_allocator_(std::move(system_allocator)) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (!dev_type.empty()) { + init_allocate_size_func_ = [dev_type]() { + return platform::DeviceManager::GetInitAllocSize( + platform::PlaceHelper::CreatePlace(dev_type)); + }; + re_allocate_size_func_ = [dev_type]() { + return platform::DeviceManager::GetReallocSize( + platform::PlaceHelper::CreatePlace(dev_type)); + }; + } else { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + init_allocate_size_func_ = &platform::GpuInitAllocSize; + re_allocate_size_func_ = &platform::GpuReallocSize; +#elif defined(PADDLE_WITH_ASCEND_CL) + init_allocate_size_func_ = &platform::NPUInitAllocSize; + re_allocate_size_func_ = &platform::NPUReallocSize; +#elif defined(PADDLE_WITH_MLU) + init_allocate_size_func_ = &platform::MLUInitAllocSize; + re_allocate_size_func_ = &platform::MLUReallocSize; +#endif + } +#endif +} BuddyAllocator::~BuddyAllocator() { VLOG(10) << "BuddyAllocator Disconstructor makes sure that all of these " @@ -224,6 +247,10 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool( size_t allocate_bytes = max_chunk_size_; size_t index = 0; +#ifdef PADDLE_WITH_CUSTOM_DEVICE + allocate_bytes = DeviceAllocateSize(init_allocate_size_func_, + re_allocate_size_func_, request_bytes); +#else #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) allocate_bytes = DeviceAllocateSize(&platform::GpuInitAllocSize, &platform::GpuReallocSize, request_bytes); @@ -233,6 +260,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool( #elif defined(PADDLE_WITH_MLU) allocate_bytes = DeviceAllocateSize(&platform::MLUInitAllocSize, &platform::MLUReallocSize, request_bytes); +#endif #endif // Allocate a new block diff --git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h index 0d736f68050..5296192b8fd 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.h +++ b/paddle/fluid/memory/detail/buddy_allocator.h @@ -39,7 +39,8 @@ class BuddyAllocator { public: BuddyAllocator(std::unique_ptr system_allocator, size_t min_chunk_size, size_t max_chunk_size, - size_t extra_padding_size = 0); + size_t extra_padding_size = 0, + const std::string dev_type = ""); ~BuddyAllocator(); @@ -123,6 +124,9 @@ class BuddyAllocator { /*! Allocate CPU/GPU memory from system */ std::unique_ptr system_allocator_; std::mutex mutex_; +#ifdef PADDLE_WITH_CUSTOM_DEVICE + std::function init_allocate_size_func_, re_allocate_size_func_; +#endif }; } // namespace detail diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 773122de6c3..a61f98c4e1a 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -38,6 +38,8 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/cuda_device_guard.h" #endif +#include "paddle/fluid/platform/device/device_wrapper.h" + DECLARE_bool(use_pinned_memory); DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_uint64(initial_gpu_memory_in_mb); @@ -430,6 +432,51 @@ void MLUAllocator::Free(void* p, size_t size, size_t index) { bool MLUAllocator::UseGpu() const { return true; } #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE +void* CustomAllocator::Alloc(size_t* index, size_t size) { + if (size <= 0) return nullptr; + + void* p; + auto place = platform::CustomPlace(dev_type_, dev_id_); + auto device = platform::DeviceManager::GetDeviceWithPlace(place); + p = device->MemoryAllocate(size); + if (LIKELY(p)) { + VLOG(4) << "CustomAllocator::Alloc " << p << " size " << size; + *index = 0; + plug_alloc_size += size; + } else { + size_t avail, total; + + platform::DeviceManager::MemoryStats(place, &total, &avail); + PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( + "\n\nOut of memory error on %s %d. " + "total memory is %s, used memory is %s, " + "available memory is only %s.\n\n", + dev_type_, dev_id_, string::HumanReadableSize(total), + string::HumanReadableSize(total - avail), + string::HumanReadableSize(avail))); + } + return p; +} + +void CustomAllocator::Free(void* p, size_t size, size_t index) { + VLOG(4) << "CustomAllocator::Free " << p << " size " << size; + PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( + "The index should be 0, index is %d", index)); + PADDLE_ENFORCE_GE(plug_alloc_size, size, + platform::errors::InvalidArgument( + "The size of memory (%d) to free exceeds the size of " + "allocated gpu memory (%d)", + size, plug_alloc_size)); + plug_alloc_size -= size; + auto place = platform::CustomPlace(dev_type_, dev_id_); + auto device = platform::DeviceManager::GetDeviceWithPlace(place); + device->MemoryDeallocate(p, size); +} + +bool CustomAllocator::UseGpu() const { return true; } +#endif + } // namespace detail } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/detail/system_allocator.h b/paddle/fluid/memory/detail/system_allocator.h index 975e2891b24..f6ff6282a61 100644 --- a/paddle/fluid/memory/detail/system_allocator.h +++ b/paddle/fluid/memory/detail/system_allocator.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include // for size_t +#include namespace paddle { namespace memory { @@ -107,6 +108,23 @@ class MLUAllocator : public SystemAllocator { }; #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE +class CustomAllocator : public SystemAllocator { + public: + explicit CustomAllocator(const std::string& device_type, size_t dev_id) + : dev_type_(device_type), dev_id_(dev_id) {} + + virtual void* Alloc(size_t* index, size_t size); + virtual void Free(void* p, size_t size, size_t index); + virtual bool UseGpu() const; + + private: + size_t plug_alloc_size = 0; + std::string dev_type_; + size_t dev_id_; +}; +#endif + } // namespace detail } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index d2ab438fd29..d857b1c1671 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -19,9 +19,88 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/profiler.h" #include "paddle/pten/common/place.h" +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/platform/device/xpu/xpu_header.h" +#endif + +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/platform/device/mlu/mlu_info.h" +#endif + namespace paddle { namespace memory { +#ifdef PADDLE_WITH_CUSTOM_DEVICE +template <> +void Copy( + platform::CPUPlace dst_place, void* dst, platform::CustomPlace src_place, + const void* src, size_t num, void* stream) { + if (UNLIKELY(num == 0)) return; + + auto src_type = platform::PlaceHelper::GetDeviceType(src_place); + auto dst_type = platform::PlaceHelper::GetDeviceType(dst_place); + std::string msg = "Memcpy:" + src_type + "->" + dst_type; + platform::RecordEvent record_event(msg); + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place << ", stream=" << stream; + + platform::DeviceManager::SetDevice(src_place); + platform::stream::Stream stream_wrapper(src_place, stream); + platform::DeviceManager::GetDeviceWithPlace(src_place)->MemoryCopyD2H( + dst, src, num, &stream_wrapper); +} + +template <> +void Copy( + platform::CustomPlace dst_place, void* dst, platform::CPUPlace src_place, + const void* src, size_t num, void* stream) { + if (UNLIKELY(num == 0)) return; + auto src_type = platform::PlaceHelper::GetDeviceType(src_place); + auto dst_type = platform::PlaceHelper::GetDeviceType(dst_place); + std::string msg = "Memcpy:" + src_type + "->" + dst_type; + platform::RecordEvent record_event(msg); + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place << ", stream=" << stream; + + platform::DeviceManager::SetDevice(dst_place); + platform::stream::Stream stream_wrapper(dst_place, stream); + platform::DeviceManager::GetDeviceWithPlace(dst_place)->MemoryCopyH2D( + dst, src, num, &stream_wrapper); +} + +template <> +void Copy( + platform::CustomPlace dst_place, void* dst, platform::CustomPlace src_place, + const void* src, size_t num, void* stream) { + if (UNLIKELY(num == 0)) return; + + auto src_type = platform::PlaceHelper::GetDeviceType(src_place); + auto dst_type = platform::PlaceHelper::GetDeviceType(dst_place); + std::string msg = "Memcpy:" + src_type + "->" + dst_type; + platform::RecordEvent record_event(msg); + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place << ", stream=" << stream; + + if (src_type == dst_type) { + platform::DeviceManager::SetDevice(src_place); + platform::stream::Stream stream_wrapper(src_place, stream); + + auto src_id = platform::PlaceHelper::GetDeviceId(src_place); + auto dst_id = platform::PlaceHelper::GetDeviceId(dst_place); + if (src_id == dst_id) { + platform::DeviceManager::GetDeviceWithPlace(src_place)->MemoryCopyD2D( + dst, src, num, &stream_wrapper); + } else { + platform::DeviceManager::GetDeviceWithPlace(src_place)->MemoryCopyP2P( + dst_place, dst, src, num, &stream_wrapper); + } + } else { + PADDLE_THROW(platform::errors::Unavailable( + "Copy between %s and %s is not supported.", src_type, dst_type)); + } +} +#endif // PADDLE_WITH_CUSTOM_DEVICE + template <> void Copy(platform::CPUPlace, void* dst, platform::CPUPlace, @@ -158,7 +237,7 @@ void Copy(platform::NPUPlace dst_place, void* dst, platform::CPUPlace src_place, const void* src, size_t num, - aclrtStream stream) { + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetNPUDeviceId(dst_place.device); @@ -168,7 +247,8 @@ void Copy(platform::NPUPlace dst_place, if (stream) { platform::RecordEvent 
record_event("NpuMemcpyAsync:CPU->NPU"); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, stream); + platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, + reinterpret_cast(stream)); } else { // On NPU, async operation after sync operation is ok, while sync operation // after async is not ok, since the async operation may not done. @@ -186,7 +266,7 @@ void Copy(platform::CPUPlace dst_place, void* dst, platform::NPUPlace src_place, const void* src, size_t num, - aclrtStream stream) { + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetNPUDeviceId(src_place.device); @@ -196,7 +276,8 @@ void Copy(platform::CPUPlace dst_place, if (stream) { platform::RecordEvent record_event("NpuMemcpyAsync:NPU->CPU"); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, stream); + platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, + reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); static_cast(pool.Get(src_place))->Wait(); @@ -211,7 +292,7 @@ void Copy(platform::NPUPlace dst_place, void* dst, platform::NPUPlace src_place, const void* src, size_t num, - aclrtStream stream) { + void* stream) { if (UNLIKELY(num == 0)) return; VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " @@ -221,7 +302,7 @@ void Copy(platform::NPUPlace dst_place, if (stream) { platform::RecordEvent record_event("NpuMemcpyAsync(same_npu):NPU->NPU"); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, - stream); + reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -239,7 +320,7 @@ void Copy(platform::NPUPlace dst_place, // TODO(zhiqiu): support peer access? platform::RecordEvent record_event("NpuMemcpyPeerAsync:NPU->NPU"); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, - stream); + reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -284,7 +365,7 @@ void Copy( template <> void Copy( platform::NPUPinnedPlace dst_place, void* dst, platform::NPUPlace src_place, - const void* src, size_t num, aclrtStream stream) { + const void* src, size_t num, void* stream) { if (UNLIKELY(num == 0)) return; platform::SetNPUDeviceId(src_place.device); @@ -294,7 +375,8 @@ void Copy( if (stream) { platform::RecordEvent record_event("NpuMemcpyAsync:NPU->NPUPinned"); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, stream); + platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, + reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); static_cast(pool.Get(src_place))->Wait(); @@ -307,7 +389,7 @@ void Copy( template <> void Copy( platform::NPUPlace dst_place, void* dst, platform::NPUPinnedPlace src_place, - const void* src, size_t num, aclrtStream stream) { + const void* src, size_t num, void* stream) { if (UNLIKELY(num == 0)) return; platform::SetNPUDeviceId(dst_place.device); @@ -317,7 +399,8 @@ void Copy( if (stream) { platform::RecordEvent record_event("NpuMemcpyAsync:NPUPinned->NPU"); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, stream); + platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, + reinterpret_cast(stream)); } else { // On NPU, async operation after sync operation is ok, while sync operation // after async is not ok, since the async operation may not done. 
@@ -379,6 +462,23 @@ void Copy(pten::Place dst_place, void* dst, platform::NPUPinnedPlace place_dst; platform::NPUPlace place_src(src_place.GetDeviceId()); return Copy(place_dst, dst, place_src, src, num, stream); +#ifdef PADDLE_WITH_CUSTOM_DEVICE + } else if (src_place.GetType() == pten::AllocationType::CPU && // NOLINT + dst_place.GetType() == pten::AllocationType::CUSTOM) { + platform::CPUPlace place_src; + platform::CustomPlace place_dst(dst_place); + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::CUSTOM && // NOLINT + dst_place.GetType() == pten::AllocationType::CPU) { + platform::CustomPlace place_src(src_place); + platform::CPUPlace place_dst; + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::CUSTOM && // NOLINT + dst_place.GetType() == pten::AllocationType::CUSTOM) { + platform::CustomPlace place_src(src_place); + platform::CustomPlace place_dst(dst_place); + return Copy(place_dst, dst, place_src, src, num, stream); +#endif } } @@ -492,7 +592,7 @@ inline void SyncCUDAStream() { template <> void Copy( platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place, - const void* src, size_t num, gpuStream_t stream) { + const void* src, size_t num, void* stream) { if (UNLIKELY(num == 0)) return; platform::SetDeviceId(src_place.device); @@ -501,9 +601,11 @@ void Copy( if (stream) { platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CPU"); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, stream); + platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, + reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, + reinterpret_cast(stream)); #endif } else { platform::RecordEvent record_event("GpuMemcpySync:GPU->CPU"); @@ -522,7 +624,7 @@ void Copy( template <> void Copy( platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place, - const void* src, size_t num, gpuStream_t stream) { + const void* src, size_t num, void* stream) { if (UNLIKELY(num == 0)) return; platform::SetDeviceId(dst_place.device); @@ -531,9 +633,11 @@ void Copy( if (stream) { platform::RecordEvent record_event("GpuMemcpyAsync:CPU->GPU"); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, stream); + platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, + reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, + reinterpret_cast(stream)); #endif } else { platform::RecordEvent record_event("GpuMemcpySync:CPU->GPU"); @@ -552,7 +656,7 @@ void Copy( template <> void Copy( platform::CUDAPlace dst_place, void* dst, platform::CUDAPlace src_place, - const void* src, size_t num, gpuStream_t stream) { + const void* src, size_t num, void* stream) { if (UNLIKELY(num == 0)) return; VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " @@ -562,9 +666,11 @@ void Copy( if (stream) { platform::RecordEvent record_event("GpuMemcpyAsync(same_gpu):GPU->GPU"); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToDevice, stream); + platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToDevice, + reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream); + 
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, + reinterpret_cast(stream)); #endif } else { platform::RecordEvent record_event("GpuMemcpySync(same_gpu):GPU->GPU"); @@ -578,7 +684,7 @@ void Copy( if (stream) { platform::RecordEvent record_event("GpuMemcpyPeerAsync:GPU->GPU"); platform::GpuMemcpyPeerAsync(dst, dst_place.device, src, src_place.device, - num, stream); + num, reinterpret_cast(stream)); } else { platform::RecordEvent record_event("GpuMemcpyPeerSync:GPU->GPU"); platform::GpuMemcpyPeerSync(dst, dst_place.device, src, src_place.device, @@ -620,8 +726,7 @@ void Copy( template <> void Copy( platform::CUDAPinnedPlace dst_place, void* dst, - platform::CUDAPlace src_place, const void* src, size_t num, - gpuStream_t stream) { + platform::CUDAPlace src_place, const void* src, size_t num, void* stream) { if (UNLIKELY(num == 0)) return; platform::SetDeviceId(src_place.device); VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " @@ -629,9 +734,11 @@ void Copy( if (stream) { platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CUDAPinned"); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, stream); + platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, + reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, + reinterpret_cast(stream)); #endif } else { platform::RecordEvent record_event("GpuMemcpySync:GPU->CUDAPinned"); @@ -647,7 +754,7 @@ template <> void Copy( platform::CUDAPlace dst_place, void* dst, platform::CUDAPinnedPlace src_place, const void* src, size_t num, - gpuStream_t stream) { + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetDeviceId(dst_place.device); @@ -656,9 +763,11 @@ void Copy( if (stream) { platform::RecordEvent record_event("GpuMemcpyAsync:CUDAPinned->GPU"); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, stream); + platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, + reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, + reinterpret_cast(stream)); #endif } else { platform::RecordEvent record_event("GpuMemcpySync:CUDAPinned->GPU"); @@ -674,7 +783,7 @@ void Copy( template <> void Copy(pten::Place dst_place, void* dst, pten::Place src_place, const void* src, - size_t num, gpuStream_t stream) { + size_t num, void* stream) { if (src_place.GetType() == pten::AllocationType::CPU && dst_place.GetType() == pten::AllocationType::CPU) { platform::CPUPlace place_dst, place_src; @@ -719,6 +828,23 @@ void Copy(pten::Place dst_place, void* dst, platform::CUDAPinnedPlace place_dst; platform::CUDAPlace place_src(src_place.GetDeviceId()); return Copy(place_dst, dst, place_src, src, num, stream); +#ifdef PADDLE_WITH_CUSTOM_DEVICE + } else if (src_place.GetType() == pten::AllocationType::CPU && // NOLINT + dst_place.GetType() == pten::AllocationType::CUSTOM) { + platform::CPUPlace place_src; + platform::CustomPlace place_dst(dst_place); + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::CUSTOM && // NOLINT + dst_place.GetType() == pten::AllocationType::CPU) { + platform::CustomPlace place_src(src_place); + platform::CPUPlace place_dst; + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == 
pten::AllocationType::CUSTOM && // NOLINT + dst_place.GetType() == pten::AllocationType::CUSTOM) { + platform::CustomPlace place_src(src_place); + platform::CustomPlace place_dst(dst_place); + return Copy(place_dst, dst, place_src, src, num, stream); +#endif } } @@ -726,7 +852,7 @@ void Copy(pten::Place dst_place, void* dst, template <> void Copy(pten::CPUPlace dst_place, void* dst, pten::Place src_place, const void* src, - size_t num, gpuStream_t stream) { + size_t num, void* stream) { Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num, stream); } @@ -735,7 +861,7 @@ template <> void Copy(pten::Place dst_place, void* dst, pten::CPUPlace src_place, const void* src, size_t num, - gpuStream_t stream) { + void* stream) { Copy(dst_place, dst, pten::Place(src_place.GetType()), src, num, stream); } @@ -743,7 +869,7 @@ void Copy(pten::Place dst_place, void* dst, template <> void Copy(pten::GPUPlace dst_place, void* dst, pten::Place src_place, const void* src, - size_t num, gpuStream_t stream) { + size_t num, void* stream) { Copy(pten::Place(dst_place.GetType(), dst_place.GetDeviceId()), dst, src_place, src, num, stream); } @@ -753,7 +879,7 @@ template <> void Copy(pten::Place dst_place, void* dst, pten::GPUPlace src_place, const void* src, size_t num, - gpuStream_t stream) { + void* stream) { Copy(dst_place, dst, pten::Place(src_place.GetType(), src_place.GetDeviceId()), src, num, stream); @@ -764,7 +890,7 @@ template <> void Copy(pten::GPUPinnedPlace dst_place, void* dst, pten::Place src_place, const void* src, size_t num, - gpuStream_t stream) { + void* stream) { Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num, stream); } @@ -773,7 +899,7 @@ template <> void Copy(pten::Place dst_place, void* dst, pten::GPUPinnedPlace src_place, const void* src, size_t num, - gpuStream_t stream) { + void* stream) { Copy(dst_place, dst, pten::Place(src_place.GetType()), src, num, stream); } @@ -800,7 +926,7 @@ void Copy(platform::CPUPlace dst_place, void* dst, platform::MLUPlace src_place, const void* src, size_t num, - mluStream stream) { + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetMLUDeviceId(src_place.device); @@ -808,7 +934,8 @@ void Copy(platform::CPUPlace dst_place, VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " by mlu stream(" << stream << ")"; platform::RecordEvent record_event("MLUMemcpyD2HAsync:MLU->CPU"); - platform::MLUMemcpyD2HAsync(dst, src, num, stream); + platform::MLUMemcpyD2HAsync(dst, src, num, + reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); static_cast(pool.Get(src_place))->Wait(); @@ -825,7 +952,7 @@ void Copy(platform::MLUPlace dst_place, void* dst, platform::CPUPlace src_place, const void* src, size_t num, - mluStream stream) { + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetMLUDeviceId(dst_place.device); @@ -833,7 +960,8 @@ void Copy(platform::MLUPlace dst_place, VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " by mlu stream(" << stream << ")"; platform::RecordEvent record_event("MLUMemcpyH2DAsync:CPU->MLU"); - platform::MLUMemcpyH2DAsync(dst, src, num, stream); + platform::MLUMemcpyH2DAsync(dst, src, num, + reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); static_cast(pool.Get(src_place))->Wait(); @@ -850,7 +978,7 @@ void Copy(platform::MLUPlace dst_place, void* dst, platform::MLUPlace src_place, 
const void* src, size_t num, - mluStream stream) { + void* stream) { if (UNLIKELY(num == 0)) return; if (dst_place == src_place) { @@ -860,7 +988,8 @@ void Copy(platform::MLUPlace dst_place, << " to " << dst_place << " by mlu stream(" << stream << ")"; platform::RecordEvent record_event( "MLUMemcpyD2DAsync(same_mlu):MLU->MLU"); - platform::MLUMemcpyD2DAsync(dst, src, num, stream); + platform::MLUMemcpyD2DAsync(dst, src, num, + reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -877,7 +1006,7 @@ void Copy(platform::MLUPlace dst_place, << " to " << dst_place << " by mlu stream(" << stream << ")"; platform::RecordEvent record_event("MLUMemcpyPeerAsync:MLU->MLU"); platform::MLUMemcpyPeerAsync(dst, dst_place.device, src, src_place.device, - num, stream); + num, reinterpret_cast(stream)); } else { VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; @@ -892,7 +1021,7 @@ void Copy(platform::MLUPlace dst_place, template <> void Copy(pten::Place dst_place, void* dst, pten::Place src_place, const void* src, - size_t num, mluStream stream) { + size_t num, void* stream) { if (src_place.GetType() == pten::AllocationType::CPU && dst_place.GetType() == pten::AllocationType::CPU) { platform::CPUPlace place_dst, place_src; @@ -912,6 +1041,23 @@ void Copy(pten::Place dst_place, void* dst, platform::MLUPlace place_src(src_place.GetDeviceId()); platform::MLUPlace place_dst(dst_place.GetDeviceId()); return Copy(place_dst, dst, place_src, src, num, stream); +#ifdef PADDLE_WITH_CUSTOM_DEVICE + } else if (src_place.GetType() == pten::AllocationType::CPU && // NOLINT + dst_place.GetType() == pten::AllocationType::CUSTOM) { + platform::CPUPlace place_src; + platform::CustomPlace place_dst(dst_place); + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::CUSTOM && // NOLINT + dst_place.GetType() == pten::AllocationType::CPU) { + platform::CustomPlace place_src(src_place); + platform::CPUPlace place_dst; + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::CUSTOM && // NOLINT + dst_place.GetType() == pten::AllocationType::CUSTOM) { + platform::CustomPlace place_src(src_place); + platform::CustomPlace place_dst(dst_place); + return Copy(place_dst, dst, place_src, src, num, stream); +#endif } } @@ -919,7 +1065,7 @@ void Copy(pten::Place dst_place, void* dst, template <> void Copy(pten::MLUPlace dst_place, void* dst, pten::Place src_place, const void* src, - size_t num, mluStream stream) { + size_t num, void* stream) { Copy(pten::Place(dst_place.GetType(), dst_place.GetDeviceId()), dst, src_place, src, num, stream); } @@ -929,7 +1075,7 @@ template <> void Copy(pten::Place dst_place, void* dst, pten::MLUPlace src_place, const void* src, size_t num, - mluStream stream) { + void* stream) { Copy(dst_place, dst, pten::Place(src_place.GetType(), src_place.GetDeviceId()), src, num, stream); @@ -939,7 +1085,7 @@ void Copy(pten::Place dst_place, void* dst, template <> void Copy(pten::CPUPlace dst_place, void* dst, pten::Place src_place, const void* src, - size_t num, mluStream stream) { + size_t num, void* stream) { Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num, stream); } @@ -948,7 +1094,7 @@ template <> void Copy(pten::Place dst_place, void* dst, pten::CPUPlace src_place, const void* src, size_t num, - mluStream stream) { + void* stream) { Copy(dst_place, dst, 
pten::Place(src_place.GetType()), src, num, stream); } @@ -1013,7 +1159,7 @@ void Copy(pten::Place dst_place, void* dst, } #endif #ifdef PADDLE_WITH_IPU - else if (src_place.GetType() == pten::AllocationType::CPU && + else if (src_place.GetType() == pten::AllocationType::CPU && // NOLINT dst_place.GetType() == pten::AllocationType::IPU) { platform::IPUPlace place_dst(dst_place.GetDeviceId()); platform::CPUPlace place_src; @@ -1048,5 +1194,48 @@ void Copy(pten::CPUPlace dst_place, void* dst, Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num); } +#if defined(PADDLE_WITH_CUSTOM_DEVICE) && !defined(PADDLE_WITH_CUDA) && \ + !defined(PADDLE_WITH_ASCEND_CL) && !defined(PADDLE_WITH_HIP) && \ + !defined(PADDLE_WITH_MLU) + +template <> +void Copy(pten::Place dst_place, void* dst, + pten::Place src_place, const void* src, + size_t num, void* stream) { + if (src_place.GetType() == pten::AllocationType::CPU && // NOLINT + dst_place.GetType() == pten::AllocationType::CUSTOM) { + platform::CPUPlace place_src; + platform::CustomPlace place_dst(dst_place); + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::CUSTOM && // NOLINT + dst_place.GetType() == pten::AllocationType::CPU) { + platform::CustomPlace place_src(src_place); + platform::CPUPlace place_dst; + return Copy(place_dst, dst, place_src, src, num, stream); + } else if (src_place.GetType() == pten::AllocationType::CUSTOM && // NOLINT + dst_place.GetType() == pten::AllocationType::CUSTOM) { + platform::CustomPlace place_src(src_place); + platform::CustomPlace place_dst(dst_place); + return Copy(place_dst, dst, place_src, src, num, stream); + } +} + +template <> +void Copy(pten::CPUPlace dst_place, void* dst, + pten::Place src_place, const void* src, + size_t num, void* stream) { + Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num, stream); +} + +// NOTE: only for (CPUPlace) -> (CPUPlace, CUDAPlace and CUDAPinnedPlace). +template <> +void Copy(pten::Place dst_place, void* dst, + pten::CPUPlace src_place, + const void* src, size_t num, + void* stream) { + Copy(dst_place, dst, pten::Place(src_place.GetType()), src, num, stream); +} +#endif + } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/memcpy.h b/paddle/fluid/memory/memcpy.h index 31d1a50e778..dd861a15b5c 100644 --- a/paddle/fluid/memory/memcpy.h +++ b/paddle/fluid/memory/memcpy.h @@ -36,66 +36,25 @@ namespace memory { template void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - -/** - * \brief Copy memory from one place to another place. - * - * \param[in] DstPlace Destination allocation place (CPU or GPU). - * \param[in] dst Destination memory address. - * \param[in] SrcPlace Source allocation place (CPU or GPU). - * \param[in] src Source memory address. - * \param[in] num memory size in bytes to copy. - * \param[in] stream CUDA stream. - * - * \note For GPU memory copy, CUDA stream need to be specified - * for asynchronously memory copy. - * - */ -template -void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num, - gpuStream_t stream); -#endif - -#ifdef PADDLE_WITH_ASCEND_CL -/** - * \brief Copy memory from one place to another place. - * - * \param[in] DstPlace Destination allocation place (CPU or NPU). - * \param[in] dst Destination memory address. - * \param[in] SrcPlace Source allocation place (CPU or NPU). - * \param[in] src Source memory address. 
- * \param[in] num      memory size in bytes to copy.
- * \param[in] stream   NPU stream.
- *
- * \note    For NPU memory copy, NPU stream need to be specified
- *          for asynchronously memory copy.
- *
- */
-template <typename DstPlace, typename SrcPlace>
-void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num,
-          aclrtStream stream);
-#endif
-
-#ifdef PADDLE_WITH_MLU
 /**
  * \brief   Copy memory from one place to another place.
  *
- * \param[in] DstPlace Destination allocation place (CPU or MLU).
+ * \param[in] DstPlace Destination allocation place (CPU or GPU or XPU or
+ * CustomDevice).
  * \param[in] dst      Destination memory address.
- * \param[in] SrcPlace Source allocation place (CPU or MLU).
+ * \param[in] SrcPlace Source allocation place (CPU or GPU or XPU or
+ * CustomDevice).
  * \param[in] src      Source memory address.
  * \param[in] num      memory size in bytes to copy.
- * \param[in] stream   MLU stream.
+ * \param[in] stream   stream for asynchronous memory copy.
  *
- * \note    For MLU memory copy, MLU stream need to be specified
- *          for asynchronously memory copy.
+ * \note    For GPU/XPU/CustomDevice memory copy, the stream needs to be
+ *          specified for asynchronous memory copy, and its concrete type is
+ *          restored inside the implementation.
  *
  */
 template <typename DstPlace, typename SrcPlace>
 void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num,
-          mluStream stream);
-#endif
-
+          void* stream);
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc
new file mode 100644
index 00000000000..506b5718696
--- /dev/null
+++ b/paddle/fluid/operators/math/math_function.cc
@@ -0,0 +1,313 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include "paddle/fluid/operators/math/math_function.h" + +#ifdef PADDLE_WITH_MKLML +#include "paddle/fluid/platform/dynload/mklml.h" +#endif + +#ifdef PADDLE_USE_OPENBLAS +#include +#endif + +#include +#include +#include +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/operators/math/math_function_impl.h" +#include "paddle/fluid/platform/bfloat16.h" +#include "paddle/fluid/platform/float16.h" +#include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/kernels/funcs/eigen/common.h" +#include "unsupported/Eigen/CXX11/Tensor" + +namespace paddle { +namespace operators { +namespace math { + +using float16 = paddle::platform::float16; + +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant>; +template struct SetConstant>; + +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant>; +template struct SetConstant>; + +#ifdef PADDLE_WITH_XPU +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant>; +template struct SetConstant>; +#endif + +#define DEFINE_CPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose, RANK>; \ + template struct Transpose, RANK>; + +DEFINE_CPU_TRANS(1); +DEFINE_CPU_TRANS(2); +DEFINE_CPU_TRANS(3); +DEFINE_CPU_TRANS(4); +DEFINE_CPU_TRANS(5); +DEFINE_CPU_TRANS(6); + +template +struct TransposeNormal { + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& in, framework::Tensor* out, + const std::vector& axis) { + const int rank = axis.size(); + auto in_stride = framework::stride(in.dims()); + auto out_stride = framework::stride(out->dims()); + const T* in_ptr = in.data(); + T* out_ptr = out->data(); + + auto transpose_helper = [&](int64_t beg, int64_t end) { + for (int64_t out_idx = beg; out_idx < end; ++out_idx) { + int64_t in_idx = 0; + int64_t tmp_idx = out_idx; + // calculate the input index + for (int i = 0; i < rank; ++i) { + const int64_t coordinate = tmp_idx / out_stride[i]; + tmp_idx -= coordinate * out_stride[i]; + in_idx += coordinate * in_stride[axis[i]]; + } + out_ptr[out_idx] = in_ptr[in_idx]; + } + }; + transpose_helper(0, out->numel()); + } +}; + +// define transpose normal +#define DEFINE_CPU_TRANS_NORMAL(TYPE) \ + template struct TransposeNormal + +DEFINE_CPU_TRANS_NORMAL(platform::float16); +DEFINE_CPU_TRANS_NORMAL(platform::bfloat16); +DEFINE_CPU_TRANS_NORMAL(float); +DEFINE_CPU_TRANS_NORMAL(double); +DEFINE_CPU_TRANS_NORMAL(int); +DEFINE_CPU_TRANS_NORMAL(int64_t); +DEFINE_CPU_TRANS_NORMAL(bool); +DEFINE_CPU_TRANS_NORMAL(int16_t); +DEFINE_CPU_TRANS_NORMAL(uint8_t); 
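The `transpose_helper` lambda in `TransposeNormal` just above converts each flat output index into multi-dimensional coordinates using the output strides, then re-weights those coordinates by the input strides permuted through `axis`. A small worked example of the same index arithmetic (hypothetical stand-alone code, not part of the patch):

```cpp
// Self-contained illustration of the stride-based index remapping used by
// transpose_helper above: transpose a 2x3 row-major array with axis = {1, 0}.
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  std::vector<int> in = {1, 2, 3, 4, 5, 6};   // [[1,2,3],[4,5,6]]
  std::vector<int64_t> in_stride = {3, 1};    // strides of the 2x3 input
  std::vector<int64_t> out_stride = {2, 1};   // strides of the 3x2 output
  std::vector<int> axis = {1, 0};
  std::vector<int> out(6);

  for (int64_t out_idx = 0; out_idx < 6; ++out_idx) {
    int64_t in_idx = 0, tmp_idx = out_idx;
    for (size_t i = 0; i < axis.size(); ++i) {
      const int64_t coordinate = tmp_idx / out_stride[i];  // i-th output coordinate
      tmp_idx -= coordinate * out_stride[i];
      in_idx += coordinate * in_stride[axis[i]];           // permuted input offset
    }
    out[out_idx] = in[in_idx];
  }
  for (int v : out) std::cout << v << ' ';  // prints: 1 4 2 5 3 6
}
```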
+DEFINE_CPU_TRANS_NORMAL(int8_t); +DEFINE_CPU_TRANS_NORMAL(platform::complex); +DEFINE_CPU_TRANS_NORMAL(platform::complex); + +struct TensorSetConstantCPU { + TensorSetConstantCPU(framework::Tensor* tensor, float value) + : tensor_(tensor), value_(value) {} + template + void apply() const { + auto cpu = platform::CPUPlace(); + auto* begin = tensor_->mutable_data(cpu); + std::fill(begin, begin + tensor_->numel(), static_cast(value_)); + } + framework::Tensor* tensor_; + float value_; +}; + +template <> +void set_constant_with_place( + const platform::DeviceContext& context, framework::Tensor* tensor, + float value) { + PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported")); +} + +template <> +void set_constant_with_place( + const platform::DeviceContext& context, framework::Tensor* tensor, + float value) { + PADDLE_THROW(platform::errors::Unimplemented("NPUPlace is not supported")); +} + +template <> +void set_constant_with_place( + const platform::DeviceContext& context, framework::Tensor* tensor, + float value) { + PADDLE_THROW( + platform::errors::Unimplemented("NPUPinnedPlace is not supported")); +} + +template <> +void set_constant_with_place( + const platform::DeviceContext& context, framework::Tensor* tensor, + float value) { + PADDLE_THROW(platform::errors::Unimplemented("IPUPlace is not supported")); +} + +template <> +void set_constant_with_place( + const platform::DeviceContext& context, framework::Tensor* tensor, + float value) { + framework::VisitDataType(tensor->type(), TensorSetConstantCPU(tensor, value)); +} + +template <> +void set_constant_with_place( + const platform::DeviceContext& context, framework::Tensor* tensor, + float value) { + PADDLE_THROW(platform::errors::Unimplemented("MLUPlace is not supported")); +} + +template <> +void set_constant_with_place( + const platform::DeviceContext& context, framework::Tensor* tensor, + float value) { + PADDLE_THROW(platform::errors::Unimplemented("CustomPlace is not supported")); +} + +template <> +void set_constant_with_place( + const platform::DeviceContext& context, framework::Tensor* tensor, + float value) { + framework::VisitDataType(tensor->type(), TensorSetConstantCPU(tensor, value)); +} + +struct TensorSetConstantWithPlace : public boost::static_visitor { + TensorSetConstantWithPlace(const platform::DeviceContext& context, + framework::Tensor* tensor, float value) + : context_(context), tensor_(tensor), value_(value) {} + + template + void operator()(Place place) const { + set_constant_with_place(context_, tensor_, value_); + } + + const platform::DeviceContext& context_; + framework::Tensor* tensor_; + float value_; +}; + +void set_constant(const platform::DeviceContext& context, + framework::Tensor* tensor, float value) { + TensorSetConstantWithPlace func(context, tensor, value); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + // tensor->place().apply_visitor(func); + paddle::platform::VisitPlace(tensor->place(), func); +#else + func(platform::CPUPlace()); +#endif +} + +template +struct RowwiseAdd { + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& vector, framework::Tensor* output) { + auto in_dims = input.dims(); + auto out_dims = output->dims(); + auto size = input.numel() / in_dims[0]; + PADDLE_ENFORCE_EQ( + vector.numel(), size, + platform::errors::InvalidArgument( + "The input vector size" + " should be equal to the size of each row of input tensor." 
+ " Expected vector size=%d, but received %d", + size, vector.numel())); + const char* in_dims_cstr = in_dims.to_str().c_str(); + const char* out_dims_cstr = out_dims.to_str().c_str(); + PADDLE_ENFORCE_EQ(out_dims, in_dims, + platform::errors::InvalidArgument( + "The output tensor shape should be same as the input" + " tensor shape. Expected output tensor shape: %s," + " but received %s", + in_dims_cstr, out_dims_cstr)); + + auto in = framework::EigenMatrix::From(input); + auto vec = framework::EigenVector::Flatten(vector); + auto out = framework::EigenMatrix::From(*output); + + for (int64_t i = 0; i < in_dims[0]; ++i) { + out.chip(i, 0) = in.chip(i, 0) + vec; + } + } +}; + +template struct RowwiseAdd; +template struct RowwiseAdd; + +template struct ColwiseSum; +template struct ColwiseSum; +template struct ColwiseSum; +template struct ColwiseSum; + +template struct RowwiseSum; +template struct RowwiseSum; + +template struct RowwiseMean; +template struct RowwiseMean; + +template +struct ElementwiseAddTo { + void operator()(platform::CPUDeviceContext* ctx, const framework::Tensor& src, + framework::Tensor* dst) { + auto in = framework::EigenVector::Flatten(src); + auto out = framework::EigenVector::Flatten(*dst); + auto& place = *(ctx->eigen_device()); + out.device(place) = out + in; + } +}; + +template struct ElementwiseAddTo; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/platform/device/CMakeLists.txt b/paddle/fluid/platform/device/CMakeLists.txt index 2cd068badf5..ecad5340d71 100644 --- a/paddle/fluid/platform/device/CMakeLists.txt +++ b/paddle/fluid/platform/device/CMakeLists.txt @@ -1,3 +1,18 @@ +IF(WITH_CUSTOM_DEVICE) +cc_library(callback_manager SRCS callback_manager.cc DEPS enforce place) + +cc_library(device_guard SRCS device_guard.cc DEPS enforce place) + +cc_library(stream SRCS stream.cc DEPS callback_manager) + +cc_library(event SRCS event.cc DEPS enforce place) + +cc_library(device_base SRCS device_base.cc DEPS stream event callback_manager device_guard device_context flags) + +ENDIF() + +set(DEV_LIBS custom_device) + # GPU IF(WITH_GPU OR WITH_ROCM) add_subdirectory(gpu) @@ -22,3 +37,11 @@ ENDIF() IF(WITH_MLU) add_subdirectory(mlu) ENDIF() + +# CUSTOM +IF(WITH_CUSTOM_DEVICE) + add_subdirectory(custom) + + cc_library(device_manager SRCS device_manager.cc DEPS custom_device) + set(GLOB_DEV_LIB device_manager custom_device CACHE INTERNAL "Global DEV library") +ENDIF() diff --git a/paddle/fluid/platform/device/callback_manager.cc b/paddle/fluid/platform/device/callback_manager.cc new file mode 100644 index 00000000000..c677bc0262f --- /dev/null +++ b/paddle/fluid/platform/device/callback_manager.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/platform/device/callback_manager.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { + +CallbackManager::CallbackManager(stream::Stream *stream) + : stream_(stream), thread_pool_(1) {} + +void CallbackManager::AddCallback(std::function callback) const { + auto *callback_func = new std::function(std::move(callback)); + auto *func = new std::function([this, callback_func] { + std::lock_guard lock(mtx_); + last_future_ = thread_pool_.enqueue([callback_func] { + std::unique_ptr> releaser(callback_func); + (*callback_func)(); + }); + }); + + platform::DeviceManager::GetDeviceWithPlace(stream_->GetPlace()) + ->AddCallback(stream_, func); +} + +void CallbackManager::Wait() const { + platform::DeviceManager::GetDeviceWithPlace(stream_->GetPlace()) + ->SynchronizeStream(stream_); + + { + std::lock_guard lock(mtx_); + if (last_future_.valid()) { + last_future_.wait(); + } + } +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/callback_manager.h b/paddle/fluid/platform/device/callback_manager.h new file mode 100644 index 00000000000..0edc694c94b --- /dev/null +++ b/paddle/fluid/platform/device/callback_manager.h @@ -0,0 +1,62 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#ifdef PADDLE_WITH_CUDA +#include +#include +#endif + +#ifdef PADDLE_WITH_HIP +#include +#endif + +#include +#include // NOLINT +#include +#include // NOLINT + +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { + +namespace stream { +class Stream; +} // namespace stream + +// NOTE(zjl): clean CallbackManager to make compilation faster +// Make CallbackManager thread-safe +class CallbackManager { + public: + explicit CallbackManager(stream::Stream* stream); + + ~CallbackManager() = default; + + void AddCallback(std::function callback) const; + + void Wait() const; + + private: + stream::Stream* stream_; + mutable ::ThreadPool thread_pool_; + mutable std::mutex mtx_; + mutable std::future last_future_; +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/custom/CMakeLists.txt b/paddle/fluid/platform/device/custom/CMakeLists.txt new file mode 100644 index 00000000000..f39c60c0c68 --- /dev/null +++ b/paddle/fluid/platform/device/custom/CMakeLists.txt @@ -0,0 +1,4 @@ +IF(WITH_CUSTOM_DEVICE) +cc_library(custom_device SRCS custom_device.cc DEPS device_base device_context) +cc_test(custom_device_test SRCS custom_device_test.cc DEPS device_manager device_context ) +ENDIF() diff --git a/paddle/fluid/platform/device/custom/custom_device.cc b/paddle/fluid/platform/device/custom/custom_device.cc new file mode 100644 index 00000000000..c5b98d3e228 --- /dev/null +++ b/paddle/fluid/platform/device/custom/custom_device.cc @@ -0,0 +1,672 @@ +// Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/device/device_base.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/device/event.h" +#include "paddle/fluid/platform/device/stream.h" +#include "paddle/fluid/platform/device_context.h" + +static bool operator==(const C_Device_st& d1, const C_Device_st& d2) { + return d1.id == d2.id; +} + +namespace paddle { +namespace platform { + +class CustomDevice : public DeviceInterface { + public: + CustomDevice(const std::string& type, int priority, bool is_custom, + std::unique_ptr pimpl, void* dso_handle) + : DeviceInterface(type, priority, is_custom), + pimpl_(std::move(pimpl)), + dso_handle_(dso_handle) { + Initialize(); + } + + ~CustomDevice() override { Finalize(); } + + size_t GetDeviceCount() override { + size_t count; + if (pimpl_->get_device_count(&count) != C_SUCCESS) { + count = 0; + } + return count; + } + + std::vector GetDeviceList() override { + size_t count = GetDeviceCount(); + std::vector devices(count); + pimpl_->get_device_list(devices.data()); + return devices; + } + + C_DeviceInterface* Impl() { return pimpl_.get(); } + + void SynchronizeDevice(size_t dev_id) override { + const auto device = &devices_pool[dev_id]; + + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->synchronize_device(device)); + } + + void Initialize() override { + if (pimpl_->initialize && pimpl_->initialize() != C_SUCCESS) { + LOG(ERROR) << "Initialize " << Type() << " Failed\n"; + exit(-1); + } + auto devices = GetDeviceList(); + for (auto dev_id : devices) { + C_Device_st device; + device.id = dev_id; + devices_pool[dev_id] = device; + InitDevice(dev_id); + } + } + + void Finalize() override { + auto devices = GetDeviceList(); + for (auto dev_id : devices) { + // SetDevice(dev_id); + // SynchronizeDevice(dev_id); + DeInitDevice(dev_id); + } + + bool ok = true; + if (pimpl_->finalize && pimpl_->finalize() != C_SUCCESS) { + LOG(ERROR) << "Finalize " << Type() << " Failed\n"; + ok = false; + } + if (dso_handle_) { + dlclose(dso_handle_); + dso_handle_ = nullptr; + } + if (!ok) { + exit(1); + } + } + + void InitDevice(size_t dev_id) override { + if (pimpl_->init_device) { + // Core set logical id, and Plugin replace it with physical id + const auto device = &devices_pool[dev_id]; + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->init_device(device)); + } + } + + void DeInitDevice(size_t dev_id) override { + if (pimpl_->deinit_device) { + const auto device = &devices_pool[dev_id]; + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->deinit_device(device)); + } + } + + void SetDevice(size_t dev_id) override { + const auto device = &devices_pool[dev_id]; + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->set_device(device)); + } + + int GetDevice() override { + C_Device_st device; + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->get_device(&device)); + return device.id; + } + + void CreateStream(size_t dev_id, stream::Stream* stream, + const stream::Stream::Priority& priority 
= + stream::Stream::Priority::kNormal, + const stream::Stream::Flag& flag = + stream::Stream::Flag::kDefaultFlag) override { + if (priority != stream::Stream::Priority::kNormal || + flag != stream::Stream::Flag::kDefaultFlag) { + PADDLE_THROW(platform::errors::Unavailable( + "priority != stream::Stream::Priority::kNormal || flag != " + "stream::Stream::Flag::kDefaultFlag is not allowed on " + "CustomDevice.")); + } + const auto device = &devices_pool[dev_id]; + C_Stream c_stream; + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->create_stream(device, &c_stream)); + stream->set_stream(c_stream); + } + + void DestroyStream(size_t dev_id, stream::Stream* stream) override { + const auto device = &devices_pool[dev_id]; + + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->destroy_stream( + device, reinterpret_cast(stream->raw_stream()))); + } + + void SynchronizeStream(size_t dev_id, const stream::Stream* stream) override { + const auto device = &devices_pool[dev_id]; + + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->synchronize_stream( + device, reinterpret_cast(stream->raw_stream()))); + } + + bool QueryStream(size_t dev_id, const stream::Stream* stream) override { + const auto device = &devices_pool[dev_id]; + + if (!pimpl_->query_stream) { + SynchronizeStream(dev_id, stream); + return true; + } + if (pimpl_->query_stream(device, reinterpret_cast( + stream->raw_stream())) == C_SUCCESS) { + return true; + } + return false; + } + + void AddCallback(size_t dev_id, stream::Stream* stream, + stream::Stream::Callback* callback) override { + if (!pimpl_->stream_add_callback) { + PADDLE_THROW(platform::errors::Unavailable( + "AddCallback is not supported on %s.", Type())); + } else { + const auto device = &devices_pool[dev_id]; + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->stream_add_callback( + device, reinterpret_cast(stream->raw_stream()), + [](C_Device device, C_Stream stream, void* user_data, + C_Status* status) { + std::unique_ptr> func( + reinterpret_cast*>(user_data)); + (*func)(); + }, + callback)); + } + } + + void CreateEvent(size_t dev_id, event::Event* event, + event::Event::Flag flags) override { + const auto device = &devices_pool[dev_id]; + C_Event c_event; + + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->create_event(device, &c_event)); + event->set_event(c_event); + } + + void DestroyEvent(size_t dev_id, event::Event* event) override { + const auto device = &devices_pool[dev_id]; + + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->destroy_event( + device, reinterpret_cast(event->raw_event()))); + } + + void RecordEvent(size_t dev_id, const event::Event* event, + const stream::Stream* stream) override { + const auto device = &devices_pool[dev_id]; + + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->record_event( + device, reinterpret_cast(stream->raw_stream()), + reinterpret_cast(event->raw_event()))); + } + + void SynchronizeEvent(size_t dev_id, const event::Event* event) override { + const auto device = &devices_pool[dev_id]; + + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->synchronize_event( + device, reinterpret_cast(event->raw_event()))); + } + + bool QueryEvent(size_t dev_id, const event::Event* event) override { + const auto device = &devices_pool[dev_id]; + + if (!pimpl_->query_event) { + SynchronizeEvent(dev_id, event); + return true; + } + if (pimpl_->query_event(device, reinterpret_cast( + event->raw_event())) == C_SUCCESS) { + return true; + } + return false; + } + + void StreamWaitEvent(size_t dev_id, const stream::Stream* stream, + const event::Event* event) override { + const 
auto device = &devices_pool[dev_id]; + + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->stream_wait_event( + device, reinterpret_cast(stream->raw_stream()), + reinterpret_cast(event->raw_event()))); + } + + void MemoryCopyH2D(size_t dev_id, void* dst, const void* src, size_t size, + const stream::Stream* stream = nullptr) override { + const auto device = &devices_pool[dev_id]; + auto place = platform::CustomPlace(Type(), dev_id); + + if (stream && stream->raw_stream() && pimpl_->async_memory_copy_h2d) { + C_Stream c_stream = reinterpret_cast(stream->raw_stream()); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->async_memory_copy_h2d(device, c_stream, dst, src, size)); + } else { + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + pool.Get(place)->Wait(); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->memory_copy_h2d(device, dst, src, size)); + } + } + + void MemoryCopyD2H(size_t dev_id, void* dst, const void* src, size_t size, + const stream::Stream* stream = nullptr) override { + const auto device = &devices_pool[dev_id]; + auto place = platform::CustomPlace(Type(), dev_id); + + if (stream && stream->raw_stream() && pimpl_->async_memory_copy_d2h) { + C_Stream c_stream = reinterpret_cast(stream->raw_stream()); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->async_memory_copy_d2h(device, c_stream, dst, src, size)); + } else { + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + pool.Get(place)->Wait(); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->memory_copy_d2h(device, dst, src, size)); + } + } + + void MemoryCopyD2D(size_t dev_id, void* dst, const void* src, size_t size, + const stream::Stream* stream = nullptr) override { + const auto device = &devices_pool[dev_id]; + auto place = platform::CustomPlace(Type(), dev_id); + + if (stream && stream->raw_stream() && pimpl_->async_memory_copy_d2d) { + C_Stream c_stream = reinterpret_cast(stream->raw_stream()); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->async_memory_copy_d2d(device, c_stream, dst, src, size)); + } else { + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + pool.Get(place)->Wait(); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->memory_copy_d2d(device, dst, src, size)); + } + } + + void MemoryCopyP2P(const Place& dst_place, void* dst, size_t src_dev_id, + const void* src, size_t size, + const stream::Stream* stream = nullptr) override { + int dst_dev_id = PlaceToId(dst_place); + auto dst_device = &devices_pool[dst_dev_id]; + auto src_device = &devices_pool[src_dev_id]; + + if (stream && stream->raw_stream()) { + if (!pimpl_->async_memory_copy_p2p) { + MemoryCopyP2P(dst_place, dst, src_dev_id, src, size); + } else { + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->async_memory_copy_p2p( + dst_device, src_device, + reinterpret_cast(stream->raw_stream()), dst, src, size)); + } + } else { + if (!pimpl_->memory_copy_p2p) { + std::unique_ptr tmp(new uint8_t[size]); + MemoryCopyD2H(src_dev_id, tmp.get(), src, size); + MemoryCopyH2D(dst_dev_id, dst, tmp.get(), size); + } else { + auto src_place = platform::CustomPlace(Type(), src_dev_id); + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + pool.Get(src_place)->Wait(); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->memory_copy_p2p(dst_device, src_device, dst, src, size)); + } + } + } + + void* MemoryAllocate(size_t dev_id, size_t size) override { + void* ptr = nullptr; + const auto device = &devices_pool[dev_id]; + + 
PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
+        pimpl_->device_memory_allocate(device, &ptr, size));
+    return ptr;
+  }
+
+  void MemoryDeallocate(size_t dev_id, void* ptr, size_t size) override {
+    const auto device = &devices_pool[dev_id];
+
+    PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
+        pimpl_->device_memory_deallocate(device, ptr, size));
+  }
+
+  void* MemoryAllocateHost(size_t dev_id, size_t size) override {
+    void* ptr = nullptr;
+    const auto device = &devices_pool[dev_id];
+
+    if (!pimpl_->host_memory_allocate) {
+      PADDLE_THROW(platform::errors::Unavailable(
+          "MemoryAllocKind::Host is not supported on %s.", Type()));
+    } else {
+      PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
+          pimpl_->host_memory_allocate(device, &ptr, size));
+    }
+    return ptr;
+  }
+
+  void MemoryDeallocateHost(size_t dev_id, void* ptr, size_t size) override {
+    const auto device = &devices_pool[dev_id];
+
+    if (!pimpl_->host_memory_deallocate) {
+      PADDLE_THROW(platform::errors::Unavailable(
+          "MemoryAllocKind::Host is not supported on %s.", Type()));
+    } else {
+      PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
+          pimpl_->host_memory_deallocate(device, ptr, size));
+    }
+  }
+
+  void* MemoryAllocateUnified(size_t dev_id, size_t size) override {
+    void* ptr = nullptr;
+    const auto device = &devices_pool[dev_id];
+
+    if (!pimpl_->unified_memory_allocate) {
+      PADDLE_THROW(platform::errors::Unavailable(
+          "MemoryAllocKind::Unified is not supported on %s.", Type()));
+    } else {
+      PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
+          pimpl_->unified_memory_allocate(device, &ptr, size));
+    }
+    return ptr;
+  }
+
+  void MemoryDeallocateUnified(size_t dev_id, void* ptr, size_t size) override {
+    const auto device = &devices_pool[dev_id];
+
+    if (!pimpl_->unified_memory_deallocate) {
+      PADDLE_THROW(platform::errors::Unavailable(
+          "MemoryAllocKind::Unified is not supported on %s.", Type()));
+    } else {
+      PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
+          pimpl_->unified_memory_deallocate(device, ptr, size));
+    }
+  }
+
+  void MemorySet(size_t dev_id, void* ptr, uint8_t value,
+                 size_t size) override {
+    const auto device = &devices_pool[dev_id];
+
+    if (pimpl_->device_memory_set) {
+      PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
+          pimpl_->device_memory_set(device, ptr, value, size));
+    } else {
+      std::unique_ptr<uint8_t[]> tmp(new uint8_t[size]);
+      memset(tmp.get(), value, size);
+      MemoryCopyH2D(dev_id, ptr, tmp.get(), size);
+    }
+  }
+
+  void MemoryStats(size_t dev_id, size_t* total, size_t* free) override {
+    const auto device = &devices_pool[dev_id];
+
+    PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
+        pimpl_->device_memory_stats(device, total, free));
+
+    size_t used = *total - *free;
+    VLOG(10) << Type() << " memory usage " << (used >> 20) << "M/"
+             << (*total >> 20) << "M, " << (*free >> 20)
+             << "M available to allocate";
+  }
+
+  size_t GetMinChunkSize(size_t dev_id) override {
+    const auto device = &devices_pool[dev_id];
+
+    size_t size = 0;
+    pimpl_->device_min_chunk_size(device, &size);
+    VLOG(10) << Type() << " min chunk size " << size << "B";
+    return size;
+  }
+
+  size_t GetMaxChunkSize(size_t dev_id) override {
+    const auto device = &devices_pool[dev_id];
+
+    size_t size = 0;
+    if (pimpl_->device_max_chunk_size) {
+      pimpl_->device_max_chunk_size(device, &size);
+      VLOG(10) << Type() << " max chunk size " << size << "B";
+    } else {
+      return DeviceInterface::GetMaxChunkSize(dev_id);
+    }
+    return size;
+  }
+
+  size_t GetMaxAllocSize(size_t dev_id) override {
+    const auto device = &devices_pool[dev_id];
+
+    size_t size = 0;
+    if (pimpl_->device_max_alloc_size) {
+      pimpl_->device_max_alloc_size(device,
&size); + VLOG(10) << Type() << " max alloc size " << (size >> 20) << "M"; + } else { + return DeviceInterface::GetMaxAllocSize(dev_id); + } + return size; + } + + size_t GetInitAllocSize(size_t dev_id) override { + const auto device = &devices_pool[dev_id]; + size_t size = 0; + if (pimpl_->device_init_alloc_size) { + pimpl_->device_init_alloc_size(device, &size); + VLOG(10) << Type() << " init alloc size " << (size >> 20) << "M"; + } else { + return DeviceInterface::GetInitAllocSize(dev_id); + } + return size; + } + + size_t GetReallocSize(size_t dev_id) override { + const auto device = &devices_pool[dev_id]; + size_t size = 0; + if (pimpl_->device_realloc_size) { + pimpl_->device_realloc_size(device, &size); + VLOG(10) << Type() << " realloc size " << (size >> 20) << "M"; + } else { + return DeviceInterface::GetReallocSize(dev_id); + } + return size; + } + + size_t GetExtraPaddingSize(size_t dev_id) override { + const auto device = &devices_pool[dev_id]; + + size_t padding_size = 0; + if (pimpl_->device_extra_padding_size) { + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->device_extra_padding_size(device, &padding_size)); + VLOG(10) << Type() << " extra padding size " << (padding_size >> 20) + << "M"; + } else { + return DeviceInterface::GetExtraPaddingSize(dev_id); + } + return 0; + } + + size_t GetComputeCapability() override { + size_t compute_capability = 0; + if (pimpl_->get_compute_capability) { + pimpl_->get_compute_capability(&compute_capability); + } + VLOG(10) << Type() << " get compute capability " << compute_capability; + return compute_capability; + } + + size_t GetRuntimeVersion() override { + size_t version = 0; + if (pimpl_->get_runtime_version) { + pimpl_->get_runtime_version(&version); + } + VLOG(10) << Type() << " get runtime version " << version; + return version; + } + + size_t GetDriverVersion() override { + size_t version = 0; + if (pimpl_->get_driver_version) { + pimpl_->get_driver_version(&version); + } + VLOG(10) << Type() << " get driver version " << version; + return version; + } + + private: + inline int PlaceToIdNoCheck(const Place& place) { + int dev_id = place.GetDeviceId(); + return dev_id; + } + + inline int PlaceToId(const Place& place) { + int dev_id = PlaceToIdNoCheck(place); + PADDLE_ENFORCE_NE(devices_pool.find(dev_id), devices_pool.end(), + platform::errors::NotFound( + "Cannot found %s %d, please check visible devices", + Type(), dev_id)); + return dev_id; + } + + std::unique_ptr pimpl_; + void* dso_handle_; + std::unordered_map devices_pool; +}; + +bool ValidCustomCustomRuntimeParams(const CustomRuntimeParams* params) { +#define CHECK_PTR(ptr, required) \ + if (params->interface->ptr == nullptr && required) { \ + LOG(WARNING) << "CustomRuntime [type: " << params->device_type \ + << "] pointer: " << #ptr << " is not set."; \ + return false; \ + } + + int version = params->version.major * 10000 + params->version.minor * 100 + + params->version.patch; + const int runtime_version = PADDLE_CUSTOM_RUNTIME_MAJOR_VERSION * 10000 + + PADDLE_CUSTOM_RUNTIME_MINOR_VERSION * 100 + + PADDLE_CUSTOM_RUNTIME_PATCH_VERSION; + + if (version < runtime_version) { + LOG(WARNING) << "CustomRuntime [type: " << params->device_type + << "] version: " << version + << " < PADDLE_CUSTOM_RUNTIME_VERSION " << runtime_version; + return false; + } + + CHECK_PTR(initialize, false); + CHECK_PTR(finalize, false) + + CHECK_PTR(init_device, false); + CHECK_PTR(set_device, true); + CHECK_PTR(get_device, true); + CHECK_PTR(deinit_device, false); + + CHECK_PTR(create_stream, true); + 
CHECK_PTR(destroy_stream, true);
+  CHECK_PTR(query_stream, false);
+  CHECK_PTR(stream_add_callback, false);
+
+  CHECK_PTR(create_event, true);
+  CHECK_PTR(record_event, true);
+  CHECK_PTR(destroy_event, true);
+  CHECK_PTR(query_event, false);
+
+  CHECK_PTR(synchronize_device, false);
+  CHECK_PTR(synchronize_stream, true);
+  CHECK_PTR(synchronize_event, true);
+  CHECK_PTR(stream_wait_event, true);
+
+  CHECK_PTR(device_memory_allocate, true);
+  CHECK_PTR(device_memory_deallocate, true);
+  CHECK_PTR(host_memory_allocate, false);
+  CHECK_PTR(host_memory_deallocate, false);
+  CHECK_PTR(unified_memory_allocate, false);
+  CHECK_PTR(unified_memory_deallocate, false);
+  CHECK_PTR(memory_copy_h2d, true);
+  CHECK_PTR(memory_copy_d2h, true);
+  CHECK_PTR(memory_copy_d2d, true);
+  CHECK_PTR(memory_copy_p2p, false);
+  CHECK_PTR(async_memory_copy_h2d, false);
+  CHECK_PTR(async_memory_copy_d2h, false);
+  CHECK_PTR(async_memory_copy_d2d, false);
+  CHECK_PTR(async_memory_copy_p2p, false);
+
+  CHECK_PTR(get_device_count, true);
+  CHECK_PTR(get_device_list, true);
+  CHECK_PTR(device_memory_stats, true);
+
+  CHECK_PTR(device_min_chunk_size, true);
+  CHECK_PTR(device_max_chunk_size, false);
+  CHECK_PTR(device_max_alloc_size, false);
+  CHECK_PTR(device_extra_padding_size, false);
+  CHECK_PTR(get_compute_capability, false);
+  CHECK_PTR(get_runtime_version, false);
+  CHECK_PTR(get_driver_version, false);
+
+  return true;
+#undef CHECK_PTR
+}
+
+typedef bool (*RegisterDevicePluginFn)(CustomRuntimeParams* runtime_params);
+
+bool LoadCustomRuntimeLib(const CustomRuntimeParams& runtime_params,
+                          std::unique_ptr<C_DeviceInterface> device_interface,
+                          void* dso_handle) {
+  if (ValidCustomCustomRuntimeParams(&runtime_params)) {
+    auto device =
+        std::make_unique<CustomDevice>(runtime_params.device_type, 255, true,
+                                       std::move(device_interface), dso_handle);
+    if (false == DeviceManager::Register(std::move(device))) {
+      LOG(WARNING) << "Skip this library. Register failed!!! there may be a "
+                      "Custom Runtime with the same name.";
+      return false;
+    }
+  } else {
+    LOG(WARNING)
+        << "Skip this library. Wrong parameters!!! please check the version "
+           "compatibility between PaddlePaddle and Custom Runtime.";
+    return false;
+  }
+  return true;
+}
+
+bool LoadCustomRuntimeLib(void* dso_handle) {
+  CustomRuntimeParams runtime_params;
+  std::memset(&runtime_params, 0, sizeof(CustomRuntimeParams));
+  runtime_params.size = sizeof(CustomRuntimeParams);
+  auto device_interface = std::make_unique<C_DeviceInterface>();
+  runtime_params.interface = device_interface.get();
+  std::memset(runtime_params.interface, 0, sizeof(C_DeviceInterface));
+  runtime_params.interface->size = sizeof(C_DeviceInterface);
+
+  RegisterDevicePluginFn init_plugin_fn =
+      reinterpret_cast<RegisterDevicePluginFn>(dlsym(dso_handle, "InitPlugin"));
+  if (!init_plugin_fn) {
+    LOG(WARNING) << "Skip this library. InitPlugin symbol not found.";
+    return false;
+  }
+  init_plugin_fn(&runtime_params);
+  if (runtime_params.device_type == nullptr) {
+    LOG(WARNING)
+        << "Skip this library. InitPlugin failed!!!
please check the version " + "compatibility between PaddlePaddle and Custom Runtime."; + return false; + } + return LoadCustomRuntimeLib(runtime_params, std::move(device_interface), + dso_handle); +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/custom/custom_device_test.cc b/paddle/fluid/platform/device/custom/custom_device_test.cc new file mode 100644 index 00000000000..6a874ea2212 --- /dev/null +++ b/paddle/fluid/platform/device/custom/custom_device_test.cc @@ -0,0 +1,193 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/device/custom/fake_cpu_device.h" +#include "paddle/fluid/platform/device/device_manager.h" +#include "paddle/fluid/platform/device_context.h" + +void RegisterDevice() { + CustomRuntimeParams runtime_params; + runtime_params.size = sizeof(CustomRuntimeParams); + auto device_interface = std::make_unique(); + runtime_params.interface = device_interface.get(); + std::memset(runtime_params.interface, 0, sizeof(C_DeviceInterface)); + runtime_params.interface->size = sizeof(C_DeviceInterface); + + InitFakeCPUDevice(&runtime_params); + EXPECT_TRUE(paddle::platform::LoadCustomRuntimeLib( + runtime_params, std::move(device_interface), nullptr)); +} + +void InitDevice() { + RegisterDevice(); + EXPECT_GT(static_cast( + paddle::platform::DeviceManager::GetAllDeviceTypes().size()), + 0); + auto place = paddle::platform::CustomPlace(DEVICE_TYPE, 0); + auto device = paddle::platform::DeviceManager::GetDeviceWithPlace(place); + EXPECT_NE(device, nullptr); + + std::vector places; + auto device_types = paddle::platform::DeviceManager::GetAllDeviceTypes(); + for (auto dev_type : device_types) { + auto devices = paddle::platform::DeviceManager::GetDeviceList(dev_type); + for (auto dev_id : devices) { + places.push_back( + paddle::platform::PlaceHelper::CreatePlace(dev_type, dev_id)); + } + } + EXPECT_GT(static_cast(places.size()), 0); + + paddle::platform::DeviceContextPool::Init(places); +} + +void TestDeviceInterface(const paddle::platform::Place& place) { + std::cout << "TestDeviceInterface on " << place << std::endl; + if (paddle::platform::is_custom_place(place)) { + auto device = paddle::platform::DeviceManager::GetDeviceWithPlace(place); + auto dev_type = paddle::platform::PlaceHelper::GetDeviceType(place); + auto p1 = device->MemoryAllocate( + paddle::platform::DeviceManager::GetMinChunkSize(place)); + EXPECT_NE(p1, nullptr); + + paddle::platform::DeviceManager::SetDevice(place); + auto dev_id = paddle::platform::DeviceManager::GetDevice(dev_type); + EXPECT_EQ(dev_id, place.GetDeviceId()); + } +} + +void TestTensorMutableData(const paddle::platform::Place& place) { + std::cout << "TestTensorInitialization on " << place << std::endl; + paddle::framework::Tensor src_tensor; + float* p1 = nullptr; + float* p2 = nullptr; 
+  // initialization
+  p1 = src_tensor.mutable_data<float>(paddle::framework::make_ddim({1, 2, 3}),
+                                      place);
+  auto p1_holder = src_tensor.Holder();
+  EXPECT_NE(p1, nullptr);
+  // set src_tensor a new dim with large size
+  // memory is supposed to be re-allocated
+  p2 = src_tensor.mutable_data<float>(paddle::framework::make_ddim({3, 1024}),
+                                      place);
+  auto p2_holder = src_tensor.Holder();
+  EXPECT_NE(p2, nullptr);
+  EXPECT_NE(p1_holder.get(), p2_holder.get());
+  // set src_tensor a new dim with same size
+  // memory block is supposed to be unchanged
+  p1 = src_tensor.mutable_data<float>(paddle::framework::make_ddim({2, 2, 3}),
+                                      place);
+  EXPECT_EQ(p1, p2);
+  // set src_tensor a new dim with smaller size
+  // memory block is supposed to be unchanged
+  p2 = src_tensor.mutable_data<float>(paddle::framework::make_ddim({2, 2}),
+                                      place);
+  EXPECT_EQ(p1, p2);
+}
+
+void TestTensorShareDataWith(const paddle::platform::Place& place) {
+  std::cout << "TestTensorShareDataWith on " << place << std::endl;
+  paddle::framework::Tensor src_tensor;
+  paddle::framework::Tensor dst_tensor;
+  src_tensor.mutable_data<int>(paddle::framework::make_ddim({2, 3, 4}), place);
+  dst_tensor.ShareDataWith(src_tensor);
+  ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
+}
+
+void TestTensorUtils(const paddle::platform::Place& place) {
+  if (paddle::platform::is_custom_place(place) == false) {
+    return;
+  }
+  paddle::framework::Tensor src_tensor;
+  paddle::framework::Tensor gpu_tensor;
+  paddle::framework::Tensor dst_tensor;
+
+  int* src_ptr = src_tensor.mutable_data<int>(
+      paddle::framework::make_ddim({3, 3}), paddle::platform::CPUPlace());
+
+  int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+  memcpy(src_ptr, arr, 9 * sizeof(int));
+
+  // CPU Tensor to GPU Tensor
+  paddle::platform::CustomDeviceContext gpu_ctx(place);
+  paddle::framework::TensorCopy(src_tensor, place, gpu_ctx, &gpu_tensor);
+#if 0
+  // GPU Tensor to CPU Tensor
+  auto cpu_place = new paddle::platform::CPUPlace();
+  paddle::framework::TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
+
+  // Sync before Compare Tensors
+  gpu_ctx.Wait();
+  const int* dst_ptr = dst_tensor.data<int>();
+  EXPECT_NE(src_ptr, dst_ptr);
+  for (size_t i = 0; i < 9; ++i) {
+    EXPECT_EQ(src_ptr[i], dst_ptr[i]);
+  }
+
+  // Copy the same tensor
+  paddle::framework::TensorCopy(gpu_tensor, place, gpu_ctx, &gpu_tensor);
+  gpu_ctx.Wait();
+  const int* dst_ptr_tmp = dst_tensor.data<int>();
+  EXPECT_NE(src_ptr, dst_ptr_tmp);
+  for (size_t i = 0; i < 9; ++i) {
+    EXPECT_EQ(src_ptr[i], dst_ptr_tmp[i]);
+  }
+
+  paddle::framework::Tensor slice_tensor = src_tensor.Slice(1, 2);
+
+  // CPU Slice Tensor to GPU Tensor
+  paddle::framework::TensorCopy(slice_tensor, place, gpu_ctx, &gpu_tensor);
+
+  // GPU Tensor to CPU Tensor
+  paddle::framework::TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
+
+  // Sync before Compare Slice Tensors
+  gpu_ctx.Wait();
+  const int* slice_ptr = slice_tensor.data<int>();
+  dst_ptr = dst_tensor.data<int>();
+  EXPECT_NE(dst_ptr, slice_ptr);
+  for (size_t i = 0; i < 3; ++i) {
+    EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
+  }
+
+  EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout());
+#endif
+}
+
+TEST(CustomDevice, Tensor) {
+  InitDevice();
+  auto dev_types = paddle::platform::DeviceManager::GetAllDeviceTypes();
+  for (const auto& dev_type : dev_types) {
+    std::cout << "Test on " << dev_type << std::endl;
+    EXPECT_GT(static_cast<int>(
+                  paddle::platform::DeviceManager::GetDeviceCount(dev_type)),
+              0);
+    auto place = paddle::platform::PlaceHelper::CreatePlace(dev_type);
+
+    TestDeviceInterface(place);
TestTensorMutableData(place); + TestTensorShareDataWith(place); + TestTensorUtils(place); + } +} + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/paddle/fluid/platform/device/custom/enforce_custom.h b/paddle/fluid/platform/device/custom/enforce_custom.h new file mode 100644 index 00000000000..fbdb4627aba --- /dev/null +++ b/paddle/fluid/platform/device/custom/enforce_custom.h @@ -0,0 +1,56 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#ifdef PADDLE_WITH_CUSTOM_DEVICE +#include "paddle/fluid/platform/device/device_ext.h" + +namespace paddle { +namespace platform { +namespace details { +template <typename T> +struct CustomDeviceStatusType {}; + +#define DEFINE_CUSTOM_DEVICE_STATUS_TYPE(type, success_value) \ + template <> \ + struct CustomDeviceStatusType<type> { \ + using Type = type; \ + static constexpr Type kSuccess = success_value; \ + } + +DEFINE_CUSTOM_DEVICE_STATUS_TYPE(C_Status, C_SUCCESS); +} // namespace details + +inline std::string build_custom_device_error_msg(C_Status stat) { + std::ostringstream sout; + sout << " CustomDevice error, the error code is : " << stat << ". "; + return sout.str(); +} + +#define PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(COND) \ + do { \ + auto __cond__ = (COND); \ + using __CUSTOM_DEVICE_STATUS_TYPE__ = decltype(__cond__); \ + constexpr auto __success_type__ = \ + ::paddle::platform::details::CustomDeviceStatusType< \ + __CUSTOM_DEVICE_STATUS_TYPE__>::kSuccess; \ + if (UNLIKELY(__cond__ != __success_type__)) { \ + auto __summary__ = ::paddle::platform::errors::External( \ + ::paddle::platform::build_custom_device_error_msg(__cond__)); \ + __THROW_ERROR_INTERNAL__(__summary__); \ + } \ + } while (0) +} // namespace platform +} // namespace paddle +#endif // PADDLE_WITH_CUSTOM_DEVICE diff --git a/paddle/fluid/platform/device/custom/fake_cpu_device.h b/paddle/fluid/platform/device/custom/fake_cpu_device.h new file mode 100644 index 00000000000..c6d8ade4b08 --- /dev/null +++ b/paddle/fluid/platform/device/custom/fake_cpu_device.h @@ -0,0 +1,185 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
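(Illustration, not part of the patch: a minimal sketch of how core-side code is expected to wrap a C_Status-returning call with the PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS macro defined in enforce_custom.h above. fake_rt_set_device is a hypothetical driver entry point, not a real API.)

#include "paddle/fluid/platform/device/custom/enforce_custom.h"
#include "paddle/fluid/platform/enforce.h"  // for UNLIKELY / __THROW_ERROR_INTERNAL__

// Hypothetical driver call following the C_Status convention of this patch.
extern "C" C_Status fake_rt_set_device(int dev_id);

void SetDeviceChecked(int dev_id) {
  // If the call returns anything but C_SUCCESS, the macro throws an
  // errors::External carrying the numeric status code.
  PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(fake_rt_set_device(dev_id));
}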
+ +#pragma once +#include "paddle/fluid/platform/device/device_ext.h" + +constexpr size_t global_total_memory = 1024 * 1024UL; +static size_t global_free_memory = global_total_memory; + +C_Status Init() { return C_SUCCESS; } + +C_Status InitDevice(const C_Device device) { return C_SUCCESS; } + +C_Status SetDevice(const C_Device device) { return C_SUCCESS; } + +C_Status GetDevice(const C_Device device) { + device->id = 0; + return C_SUCCESS; +} + +C_Status DestroyDevice(const C_Device device) { return C_SUCCESS; } + +C_Status Finalize() { return C_SUCCESS; } + +C_Status GetDevicesCount(size_t *count) { + *count = 1; + return C_SUCCESS; +} + +C_Status GetDevicesList(size_t *device) { + *device = 0; + return C_SUCCESS; +} + +C_Status MemCpy(const C_Device device, void *dst, const void *src, + size_t size) { + memcpy(dst, src, size); + return C_SUCCESS; +} + +C_Status AsyncMemCpy(const C_Device device, C_Stream stream, void *dst, + const void *src, size_t size) { + memcpy(dst, src, size); + return C_SUCCESS; +} + +C_Status Allocate(const C_Device device, void **ptr, size_t size) { + if (global_free_memory >= size) { + *ptr = malloc(size); + global_free_memory -= size; + return C_SUCCESS; + } else { + *ptr = nullptr; + return C_FAILED; + } +} + +C_Status Deallocate(const C_Device device, void *ptr, size_t size) { + free(ptr); + global_free_memory += size; + return C_SUCCESS; +} + +C_Status CreateStream(const C_Device device, C_Stream *stream) { + return C_SUCCESS; +} + +C_Status DestroyStream(const C_Device device, C_Stream stream) { + return C_SUCCESS; +} + +C_Status CreateEvent(const C_Device device, C_Event *event) { + return C_SUCCESS; +} + +C_Status RecordEvent(const C_Device device, C_Stream stream, C_Event event) { + return C_SUCCESS; +} + +C_Status DestroyEvent(const C_Device device, C_Event event) { + return C_SUCCESS; +} + +C_Status SyncDevice(const C_Device device) { return C_SUCCESS; } + +C_Status SyncStream(const C_Device device, C_Stream stream) { + return C_SUCCESS; +} + +C_Status SyncEvent(const C_Device device, C_Event event) { return C_SUCCESS; } + +C_Status StreamWaitEvent(const C_Device device, C_Stream stream, + C_Event event) { + return C_SUCCESS; +} + +C_Status VisibleDevices(size_t *devices) { return C_SUCCESS; } + +C_Status DeviceMemStats(const C_Device device, size_t *total_memory, + size_t *free_memory) { + *total_memory = global_total_memory; + *free_memory = global_free_memory; + return C_SUCCESS; +} + +C_Status DeviceMinChunkSize(const C_Device device, size_t *size) { + *size = 4 * 1024; + return C_SUCCESS; +} + +C_Status DeviceMaxChunkSize(const C_Device device, size_t *size) { + *size = 64 * 1024; + return C_SUCCESS; +} + +C_Status DeviceMaxAllocSize(const C_Device device, size_t *size) { + *size = global_total_memory * 0.95; + return C_SUCCESS; +} + +#define DEVICE_TYPE "FakeCPU" +#define SUB_DEVICE_TYPE "V100" + +void InitFakeCPUDevice(CustomRuntimeParams *params) { + params->device_type = const_cast<char *>(DEVICE_TYPE); + params->sub_device_type = const_cast<char *>(SUB_DEVICE_TYPE); + params->version.major = PADDLE_CUSTOM_RUNTIME_MAJOR_VERSION; + params->version.minor = PADDLE_CUSTOM_RUNTIME_MINOR_VERSION; + params->version.patch = PADDLE_CUSTOM_RUNTIME_PATCH_VERSION; + + memset(reinterpret_cast<void *>(params->interface), 0, + sizeof(C_DeviceInterface)); + + params->interface->initialize = Init; + params->interface->finalize = Finalize; + + params->interface->init_device = InitDevice; + params->interface->set_device = SetDevice; + params->interface->get_device = GetDevice; + 
params->interface->deinit_device = DestroyDevice; + + params->interface->create_stream = CreateStream; + params->interface->destroy_stream = DestroyStream; + + params->interface->create_event = CreateEvent; + params->interface->destroy_event = DestroyEvent; + params->interface->record_event = RecordEvent; + + params->interface->synchronize_device = SyncDevice; + params->interface->synchronize_stream = SyncStream; + params->interface->synchronize_event = SyncEvent; + params->interface->stream_wait_event = StreamWaitEvent; + + params->interface->memory_copy_h2d = MemCpy; + params->interface->memory_copy_d2d = MemCpy; + params->interface->memory_copy_d2h = MemCpy; + params->interface->async_memory_copy_h2d = AsyncMemCpy; + params->interface->async_memory_copy_d2d = AsyncMemCpy; + params->interface->async_memory_copy_d2h = AsyncMemCpy; + params->interface->device_memory_allocate = Allocate; + params->interface->host_memory_allocate = Allocate; + params->interface->unified_memory_allocate = Allocate; + params->interface->device_memory_deallocate = Deallocate; + params->interface->host_memory_deallocate = Deallocate; + params->interface->unified_memory_deallocate = Deallocate; + + params->interface->get_device_count = GetDevicesCount; + params->interface->get_device_list = GetDevicesList; + params->interface->device_memory_stats = DeviceMemStats; + + params->interface->device_max_chunk_size = DeviceMaxChunkSize; + params->interface->device_min_chunk_size = DeviceMinChunkSize; + params->interface->device_max_alloc_size = DeviceMaxAllocSize; +} diff --git a/paddle/fluid/platform/device/device_base.cc b/paddle/fluid/platform/device/device_base.cc new file mode 100644 index 00000000000..6234c961268 --- /dev/null +++ b/paddle/fluid/platform/device/device_base.cc @@ -0,0 +1,249 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
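(Illustration, not part of the patch: device_base.cc below gives every DeviceInterface entry point a throwing default, so a backend only needs to override what it actually supports. A minimal sketch under that assumption; the "demo" type name and priority value are made up.)

#include <vector>

#include "paddle/fluid/platform/device/device_base.h"

namespace demo {

class DemoDevice : public paddle::platform::DeviceInterface {
 public:
  // "demo", priority 90 and is_custom=true are illustrative values only.
  DemoDevice() : DeviceInterface("demo", /*priority=*/90, /*is_custom=*/true) {}

  // Only the pure-virtual device queries must be implemented; every other
  // method inherits the Unimplemented-throwing default from device_base.cc.
  size_t GetDeviceCount() override { return 1; }
  std::vector<size_t> GetDeviceList() override { return {0}; }
};

}  // namespace demo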
+ +#include "paddle/fluid/platform/device/device_base.h" +#include "gflags/gflags.h" + +DECLARE_double(fraction_of_gpu_memory_to_use); +DECLARE_uint64(initial_gpu_memory_in_mb); +DECLARE_uint64(reallocate_gpu_memory_in_mb); + +constexpr static float fraction_reserve_gpu_memory = 0.05f; + +namespace paddle { +namespace platform { + +#define INTERFACE_UNIMPLEMENT \ + PADDLE_THROW(platform::errors::Unimplemented( \ + "%s is not implemented on %s device.", __func__, Type())); + +// info +size_t DeviceInterface::GetComputeCapability() { + VLOG(10) << Type() + " get compute capability " << 0; + return 0; +} + +size_t DeviceInterface::GetRuntimeVersion() { + VLOG(10) << Type() + " get runtime version " << 0; + return 0; +} + +size_t DeviceInterface::GetDriverVersion() { + VLOG(10) << Type() + " get driver version " << 0; + return 0; +} + +// device manage +void DeviceInterface::Initialize() { INTERFACE_UNIMPLEMENT; } + +void DeviceInterface::Finalize() { INTERFACE_UNIMPLEMENT; } + +void DeviceInterface::SynchronizeDevice(size_t dev_id) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::InitDevice(size_t dev_id) { INTERFACE_UNIMPLEMENT; } + +void DeviceInterface::DeInitDevice(size_t dev_id) { INTERFACE_UNIMPLEMENT; } + +void DeviceInterface::SetDevice(size_t dev_id) { INTERFACE_UNIMPLEMENT; } + +int DeviceInterface::GetDevice() { INTERFACE_UNIMPLEMENT; } + +// stream manage +void DeviceInterface::CreateStream(size_t dev_id, stream::Stream* stream, + const stream::Stream::Priority& priority, + const stream::Stream::Flag& flag) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::DestroyStream(size_t dev_id, stream::Stream* stream) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::SynchronizeStream(size_t dev_id, + const stream::Stream* stream) { + INTERFACE_UNIMPLEMENT; +} + +bool DeviceInterface::QueryStream(size_t dev_id, const stream::Stream* stream) { + INTERFACE_UNIMPLEMENT; + return true; +} + +void DeviceInterface::AddCallback(size_t dev_id, stream::Stream* stream, + stream::Stream::Callback* callback) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::StreamWaitEvent(size_t dev_id, + const stream::Stream* stream, + const event::Event* event) { + INTERFACE_UNIMPLEMENT; +} + +// event manage +void DeviceInterface::CreateEvent(size_t dev_id, event::Event* event, + event::Event::Flag flags) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::DestroyEvent(size_t dev_id, event::Event* event) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::RecordEvent(size_t dev_id, const event::Event* event, + const stream::Stream* stream) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::SynchronizeEvent(size_t dev_id, + const event::Event* event) { + INTERFACE_UNIMPLEMENT; +} + +bool DeviceInterface::QueryEvent(size_t dev_id, const event::Event* event) { + INTERFACE_UNIMPLEMENT; + return true; +} + +// memery manage +void DeviceInterface::MemoryCopyH2D(size_t dev_id, void* dst, const void* src, + size_t size, const stream::Stream* stream) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::MemoryCopyD2H(size_t dev_id, void* dst, const void* src, + size_t size, const stream::Stream* stream) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::MemoryCopyD2D(size_t dev_id, void* dst, const void* src, + size_t size, const stream::Stream* stream) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::MemoryCopyP2P(const Place& dst_place, void* dst, + size_t src_id, const void* src, size_t size, + const stream::Stream* stream) { + INTERFACE_UNIMPLEMENT; +} + +void* 
DeviceInterface::MemoryAllocate(size_t dev_id, size_t size) { + INTERFACE_UNIMPLEMENT; + return nullptr; +} + +void DeviceInterface::MemoryDeallocate(size_t dev_id, void* ptr, size_t size) { + INTERFACE_UNIMPLEMENT; +} + +void* DeviceInterface::MemoryAllocateHost(size_t dev_id, size_t size) { + INTERFACE_UNIMPLEMENT; + return nullptr; +} + +void DeviceInterface::MemoryDeallocateHost(size_t dev_id, void* ptr, + size_t size) { + INTERFACE_UNIMPLEMENT; +} + +void* DeviceInterface::MemoryAllocateUnified(size_t dev_id, size_t size) { + INTERFACE_UNIMPLEMENT; + return nullptr; +} + +void DeviceInterface::MemoryDeallocateUnified(size_t dev_id, void* ptr, + size_t size) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::MemorySet(size_t dev_id, void* ptr, uint8_t value, + size_t size) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::MemoryStats(size_t dev_id, size_t* total, size_t* free) { + INTERFACE_UNIMPLEMENT; +} + +size_t DeviceInterface::GetMinChunkSize(size_t dev_id) { + INTERFACE_UNIMPLEMENT; +} + +size_t DeviceInterface::AllocSize(size_t dev_id, bool realloc) { + size_t available_to_alloc = AvailableAllocSize(dev_id); + PADDLE_ENFORCE_GT(available_to_alloc, 0, + platform::errors::ResourceExhausted( + "Not enough available %s memory.", Type())); + // If FLAGS_initial_gpu_memory_in_mb is 0, then initial memory will be + // allocated by fraction + size_t flag_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb + : FLAGS_initial_gpu_memory_in_mb; + size_t alloc_bytes = + (flag_mb > 0ul ? flag_mb << 20 : available_to_alloc * + FLAGS_fraction_of_gpu_memory_to_use); + PADDLE_ENFORCE_GE(available_to_alloc, alloc_bytes, + platform::errors::ResourceExhausted( + "Not enough available %s memory.", Type())); + return alloc_bytes; +} + +size_t DeviceInterface::AvailableAllocSize(size_t dev_id) { + size_t total = 0; + size_t available = 0; + MemoryStats(dev_id, &total, &available); + size_t reserving = + static_cast<size_t>(fraction_reserve_gpu_memory * available); + // If available size is less than minimum chunk size, no usable memory exists + size_t available_to_alloc = available - reserving; + size_t min_chunk_size = GetMinChunkSize(dev_id); + if (available_to_alloc < min_chunk_size) { + available_to_alloc = 0; + } + return available_to_alloc; +} + +size_t DeviceInterface::GetInitAllocSize(size_t dev_id) { + size_t init_alloc_size = AllocSize(dev_id, false); + VLOG(10) << Type() + " init alloc size " << (init_alloc_size >> 20) << "M"; + return init_alloc_size; +} + +size_t DeviceInterface::GetReallocSize(size_t dev_id) { + size_t realloc_size = AllocSize(dev_id, true); + VLOG(10) << Type() + " realloc size " << (realloc_size >> 20) << "M"; + return realloc_size; +} + +size_t DeviceInterface::GetMaxAllocSize(size_t dev_id) { + size_t max_alloc_size = + std::max(GetInitAllocSize(dev_id), GetReallocSize(dev_id)); + VLOG(10) << Type() + " max alloc size " << (max_alloc_size >> 20) << "M"; + return max_alloc_size; +} + +size_t DeviceInterface::GetMaxChunkSize(size_t dev_id) { + size_t max_chunk_size = GetMaxAllocSize(dev_id); + VLOG(10) << Type() + " max chunk size " << (max_chunk_size >> 20) << "M"; + return max_chunk_size; +} + +size_t DeviceInterface::GetExtraPaddingSize(size_t dev_id) { + VLOG(10) << Type() + " extra padding size " << 0; + return 0; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/device_base.h b/paddle/fluid/platform/device/device_base.h new file mode 100644 index 00000000000..d70b02be80e --- /dev/null +++ 
b/paddle/fluid/platform/device/device_base.h @@ -0,0 +1,166 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#ifdef PADDLE_WITH_CUSTOM_DEVICE +#include "paddle/fluid/platform/device/event.h" +#include "paddle/fluid/platform/device/stream.h" + +namespace paddle { +namespace platform { + +class DeviceInterface { // Driver / Runtime + public: + DeviceInterface(const std::string& type, uint8_t priority, bool is_custom) + : type_(type), priority_(priority), is_custom_(is_custom) {} + uint8_t Priority() { return priority_; } + std::string Type() { return type_; } + bool IsCustom() { return is_custom_; } + + virtual ~DeviceInterface() {} + + // Info + virtual size_t GetComputeCapability(); + + virtual size_t GetRuntimeVersion(); + + virtual size_t GetDriverVersion(); + + // Platform + //! Initialize + virtual void Initialize(); + + //! Finalize + virtual void Finalize(); + + // Device + virtual size_t GetDeviceCount() = 0; + virtual std::vector<size_t> GetDeviceList() = 0; + + //! Wait for compute device to finish. + virtual void SynchronizeDevice(size_t dev_id); + + //! Initialize device. + virtual void InitDevice(size_t dev_id); + + //! Deinitialize device. + virtual void DeInitDevice(size_t dev_id); + + // ! Set device to be used. + virtual void SetDevice(size_t dev_id); + + // ! Returns which device is currently being used. + virtual int GetDevice(); + + // Stream + // ! Create an asynchronous stream + virtual void CreateStream( + size_t dev_id, stream::Stream* stream, + const stream::Stream::Priority& priority = + stream::Stream::Priority::kNormal, + const stream::Stream::Flag& flag = stream::Stream::Flag::kDefaultFlag); + + // ! Destroys an asynchronous stream. + virtual void DestroyStream(size_t dev_id, stream::Stream* stream); + + // ! Waits for stream tasks to complete. + virtual void SynchronizeStream(size_t dev_id, const stream::Stream* stream); + + // ! Queries an asynchronous stream for completion status. + virtual bool QueryStream(size_t dev_id, const stream::Stream* stream); + + // ! Add a callback to a compute stream. + virtual void AddCallback(size_t dev_id, stream::Stream* stream, + stream::Stream::Callback* callback); + + // Event + // ! Create an event. + virtual void CreateEvent(size_t dev_id, event::Event* event, + event::Event::Flag flags); + + // ! Destroy an event. + virtual void DestroyEvent(size_t dev_id, event::Event* event); + + // ! Records an event. + virtual void RecordEvent(size_t dev_id, const event::Event* event, + const stream::Stream* stream); + + // ! Waits for event to complete. + virtual void SynchronizeEvent(size_t dev_id, const event::Event* event); + // ! Queries an event for completion status. + virtual bool QueryEvent(size_t dev_id, const event::Event* event); + + // ! 
Make a compute stream wait on an event + virtual void StreamWaitEvent(size_t dev_id, const stream::Stream* stream, + const event::Event* event); + + // Memory + virtual void MemoryCopyH2D(size_t dev_id, void* dst, const void* src, + size_t size, + const stream::Stream* stream = nullptr); + + virtual void MemoryCopyD2H(size_t dev_id, void* dst, const void* src, + size_t size, + const stream::Stream* stream = nullptr); + + virtual void MemoryCopyD2D(size_t dev_id, void* dst, const void* src, + size_t size, + const stream::Stream* stream = nullptr); + + virtual void MemoryCopyP2P(const Place& dst_place, void* dst, size_t src_id, + const void* src, size_t size, + const stream::Stream* stream = nullptr); + + virtual void* MemoryAllocate(size_t dev_id, size_t size); + + virtual void MemoryDeallocate(size_t dev_id, void* ptr, size_t size); + + virtual void* MemoryAllocateHost(size_t dev_id, size_t size); + + virtual void MemoryDeallocateHost(size_t dev_id, void* ptr, size_t size); + + virtual void* MemoryAllocateUnified(size_t dev_id, size_t size); + + virtual void MemoryDeallocateUnified(size_t dev_id, void* ptr, size_t size); + + virtual void MemorySet(size_t dev_id, void* ptr, uint8_t value, size_t size); + + virtual void MemoryStats(size_t dev_id, size_t* total, size_t* free); + + virtual size_t GetMinChunkSize(size_t dev_id); + + virtual size_t GetInitAllocSize(size_t dev_id); + + virtual size_t GetReallocSize(size_t dev_id); + + virtual size_t GetMaxAllocSize(size_t dev_id); + + virtual size_t GetMaxChunkSize(size_t dev_id); + + virtual size_t GetExtraPaddingSize(size_t dev_id); + + private: + const std::string type_; + const uint8_t priority_; + const bool is_custom_; + + size_t AllocSize(size_t dev_id, bool realloc); + + size_t AvailableAllocSize(size_t dev_id); +}; + +} // namespace platform +} // namespace paddle + +#endif diff --git a/paddle/fluid/platform/device/device_ext.h b/paddle/fluid/platform/device/device_ext.h new file mode 100644 index 00000000000..d1e1340f74b --- /dev/null +++ b/paddle/fluid/platform/device/device_ext.h @@ -0,0 +1,497 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
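(A worked example of the sizing policy in AllocSize/AvailableAllocSize above, with illustrative numbers: if MemoryStats reports 8192 MB free, fraction_reserve_gpu_memory holds back 5%, i.e. about 409 MB, leaving roughly 7783 MB available to allocate. With FLAGS_initial_gpu_memory_in_mb = 0 and FLAGS_fraction_of_gpu_memory_to_use = 0.92, the initial chunk is 0.92 * 7783 MB, about 7160 MB; setting FLAGS_initial_gpu_memory_in_mb = 1024 instead pins it to exactly 1024 MB regardless of the fraction flag.)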
+ +#pragma once +#if !defined(_WIN32) && !defined(__APPLE__) +#include <stddef.h> +#include <stdint.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define PADDLE_CUSTOM_RUNTIME_MAJOR_VERSION 0 +#define PADDLE_CUSTOM_RUNTIME_MINOR_VERSION 1 +#define PADDLE_CUSTOM_RUNTIME_PATCH_VERSION 1 + +typedef enum { + C_SUCCESS = 0, // success + C_WARNING, // results may not meet expectation (such as an asynchronous + // interface is actually synchronous) + C_FAILED, // resource exhausted/query failed + C_ERROR, // invalid argument/wrong usage/uninitialized + C_INTERNAL_ERROR // plugin error +} C_Status; + +typedef struct C_Device_st { int id; } * C_Device; + +typedef struct C_Stream_st* C_Stream; + +typedef struct C_Event_st* C_Event; + +typedef void (*C_Callback)(C_Device device, C_Stream stream, void* user_data, + C_Status* status); + +struct C_DeviceInterface { + // Core fill it and plugin must check it + size_t size; + + /////////////////////// + // device manage api // + /////////////////////// + + /** + * @brief Initialize hardware + * + */ + C_Status (*initialize)(); + + /** + * @brief Deinitialize hardware + * + */ + C_Status (*finalize)(); + + /** + * @brief Initialize device + * + * @param[C_Device] device Core fill it with a logical id, and then plugin + * must replace it with a physical id + */ + C_Status (*init_device)(const C_Device device); + + /** + * @brief Set current device + * + * @param[C_Device] device Core fill it with a physical id + */ + C_Status (*set_device)(const C_Device device); + + /** + * @brief Get current device + * + * @param[C_Device] device Plugin fill it with a physical id + */ + C_Status (*get_device)(const C_Device device); + + /** + * @brief Deinitialize device + * + * @param[C_Device] device Core fill it with a physical id + */ + C_Status (*deinit_device)(const C_Device device); + + /** + * @brief Create a stream + * + * @param[C_Device] device Core fill it with a physical id + * @param[C_Stream*] stream Plugin create a stream and fill it + */ + C_Status (*create_stream)(const C_Device device, C_Stream* stream); + + /** + * @brief Destroy a stream + * + * @param[C_Device] device Core fill it with a physical id + * @param[C_Stream] stream + */ + C_Status (*destroy_stream)(const C_Device device, C_Stream stream); + + /** + * @brief Query a stream + * + * @param[C_Device] device Core fill it with a physical id + * @param[C_Stream] stream + */ + C_Status (*query_stream)(const C_Device device, C_Stream stream); + + /** + * @brief Add a callback to stream + * + * @param[C_Device] device Core fill it with a physical id + * @param[C_Stream] stream + * @param[C_Callback] callback + * @param[void*] user_data + */ + C_Status (*stream_add_callback)(const C_Device device, C_Stream stream, + C_Callback callback, void* user_data); + + /** + * @brief Create an event + * + * @param[C_Device] device Core fill it with a physical id + * @param[C_Event*] event Plugin create an event and fill it + */ + C_Status (*create_event)(const C_Device device, C_Event* event); + + /** + * @brief Record an event + * + * @param[C_Device] device Core fill it with a physical id + * @param[C_Stream] stream + * @param[C_Event] event + */ + C_Status (*record_event)(const C_Device device, C_Stream stream, + C_Event event); + + /** + * @brief Destroy an event + * + * @param[C_Device] device Core fill it with a physical id + * @param[C_Event] event + */ + C_Status (*destroy_event)(const C_Device device, C_Event event); + + /** + * @brief Query an event + * + * @param[C_Device] device Core fill it with a physical id + 
* @param[C_Event] event + */ + C_Status (*query_event)(const C_Device device, C_Event event); + + /** + * @brief Synchronize a device + * + * @param[C_Device] device Core fill it with a physical id + */ + C_Status (*synchronize_device)(const C_Device device); + + /** + * @brief Synchronize a stream + * + * @param[C_Device] device Core fill it with a physical id + * @param[C_Stream] stream + */ + C_Status (*synchronize_stream)(const C_Device device, C_Stream stream); + + /** + * @brief Synchronize an event + * + * @param[C_Device] device Core fill it with a physical id + * @param[C_Event] event + */ + C_Status (*synchronize_event)(const C_Device device, C_Event event); + + /** + * @brief Make a stream wait on an event + * + * @param[C_Device] device Core fill it with a physical id + * @param[C_Stream] stream + * @param[C_Event] event + */ + C_Status (*stream_wait_event)(const C_Device device, C_Stream stream, + C_Event event); + + void* reserved_dev_api[8]; + + /////////////////////// + // memory manage api // + /////////////////////// + + /** + * @brief Device memory allocate + * + * @param[C_Device] device Core fill it with a physical id + * @param[void**] ptr Plugin allocate an address and fill it + * @param[size_t] size + */ + C_Status (*device_memory_allocate)(const C_Device device, void** ptr, + size_t size); + + /** + * @brief Device memory deallocate + * + * @param[C_Device] device Core fill it with a physical id + * @param[void*] ptr + * @param[size_t] size + */ + C_Status (*device_memory_deallocate)(const C_Device device, void* ptr, + size_t size); + + /** + * @brief Device memory set + * + * @param[C_Device] device Core fill it with a physical id + * @param[void*] ptr + * @param[unsigned char] value + * @param[size_t] size + */ + C_Status (*device_memory_set)(const C_Device device, void* ptr, + unsigned char value, size_t size); + + /** + * @brief Host memory allocate + * + * @param[C_Device] device Core fill it with a physical id + * @param[void**] ptr Plugin allocate an address and fill it + * @param[size_t] size + */ + C_Status (*host_memory_allocate)(const C_Device device, void** ptr, + size_t size); + + /** + * @brief Host memory deallocate + * + * @param[C_Device] device Core fill it with a physical id + * @param[void*] ptr + * @param[size_t] size + */ + C_Status (*host_memory_deallocate)(const C_Device device, void* ptr, + size_t size); + + /** + * @brief Unified memory allocate + * + * @param[C_Device] device Core fill it with a physical id + * @param[void**] ptr Plugin allocate an address and fill it + * @param[size_t] size + */ + C_Status (*unified_memory_allocate)(const C_Device device, void** ptr, + size_t size); + + /** + * @brief Unified memory deallocate + * + * @param[C_Device] device Core fill it with a physical id + * @param[void*] ptr + * @param[size_t] size + */ + C_Status (*unified_memory_deallocate)(const C_Device device, void* ptr, + size_t size); + + /** + * @brief Memory copy from host to device + * + * @param[C_Device] device Core fill it with a physical id + * @param[void*] dst + * @param[void*] src + * @param[size_t] size + */ + C_Status (*memory_copy_h2d)(const C_Device device, void* dst, const void* src, + size_t size); + + /** + * @brief Memory copy from device to host + * + * @param[C_Device] device Core fill it with a physical id + * @param[void*] dst + * @param[void*] src + * @param[size_t] size + */ + C_Status (*memory_copy_d2h)(const C_Device device, void* dst, const void* src, + size_t size); + + /** + * @brief Memory copy from device to 
device + * + * @param[C_Device] device Core fill it with a physical id + * @param[void*] dst + * @param[void*] src + * @param[size_t] size + */ + C_Status (*memory_copy_d2d)(const C_Device device, void* dst, const void* src, + size_t size); + + /** + * @brief Peer memory copy from device to device + * + * @param[C_Device] dst_device Core fill it with a physical id + * @param[C_Device] src_device Core fill it with a physical id + * @param[void*] dst + * @param[void*] src + * @param[size_t] size + */ + C_Status (*memory_copy_p2p)(const C_Device dst_device, + const C_Device src_device, void* dst, + const void* src, size_t size); + + /** + * @brief Asynchronous memory copy from host to device + * + * @param[C_Device] device Core fill it with a physical id + * @param[C_Stream] stream + * @param[void*] dst + * @param[void*] src + * @param[size_t] size + */ + C_Status (*async_memory_copy_h2d)(const C_Device device, C_Stream stream, + void* dst, const void* src, size_t size); + + /** + * @brief Asynchronous memory copy from device to host + * + * @param[C_Device] device Core fill it with a physical id + * @param[C_Stream] stream + * @param[void*] dst + * @param[void*] src + * @param[size_t] size + */ + C_Status (*async_memory_copy_d2h)(const C_Device device, C_Stream stream, + void* dst, const void* src, size_t size); + + /** + * @brief Asynchronous memory copy from device to device + * + * @param[C_Device] device Core fill it with a physical id + * @param[C_Stream] stream + * @param[void*] dst + * @param[void*] src + * @param[size_t] size + */ + C_Status (*async_memory_copy_d2d)(const C_Device device, C_Stream stream, + void* dst, const void* src, size_t size); + + /** + * @brief Asynchronous peer memory copy from device to device + * + * @param[C_Device] device Core fill it with a physical id + * @param[C_Stream] stream + * @param[void*] dst + * @param[void*] src + * @param[size_t] size + */ + C_Status (*async_memory_copy_p2p)(const C_Device dst_device, + const C_Device src_device, C_Stream stream, + void* dst, const void* src, size_t size); + + void* reserved_mem_api[8]; + + ////////////// + // info api // + ////////////// + + /** + * @brief Get visible device count + * + * @param[size_t*] count Plugin fill it + */ + C_Status (*get_device_count)(size_t* count); + + /** + * @brief Get visible device list + * + * @param[size_t*] devices Plugin fill it + */ + C_Status (*get_device_list)(size_t* devices); + + /** + * @brief Device memory statistic + * + * @param[C_Device] device Core fill it with a physical id + * @param[size_t*] total_memory + * @param[size_t*] free_memory + */ + C_Status (*device_memory_stats)(const C_Device device, size_t* total_memory, + size_t* free_memory); + + /** + * @brief Device minimum chunk size + * + * @param[size_t*] count + */ + C_Status (*device_min_chunk_size)(const C_Device device, size_t* count); + + /** + * @brief Device maximum chunk size + * + * @param[size_t*] count + */ + C_Status (*device_max_chunk_size)(const C_Device device, size_t* count); + + /** + * @brief Device maximum alloc size + * + * @param[size_t*] count + */ + C_Status (*device_max_alloc_size)(const C_Device device, size_t* count); + + /** + * @brief Device extra padding size + * + * @param[size_t*] size + */ + C_Status (*device_extra_padding_size)(const C_Device device, size_t* size); + + /** + * @brief Device initial allocated size + * + * @param[size_t*] size + */ + C_Status (*device_init_alloc_size)(const C_Device device, size_t* size); + + /** + * @brief 
Device reallocated size + * + * @param[size_t*] size + */ + C_Status (*device_realloc_size)(const C_Device device, size_t* size); + + /** + * @brief Get compute capability + * + * @param[size_t*] compute_capability + */ + C_Status (*get_compute_capability)(size_t* compute_capability); + + /** + * @brief Get runtime version + * + * @param[size_t*] version + */ + C_Status (*get_runtime_version)(size_t* version); + + /** + * @brief Get driver version + * + * @param[size_t*] version + */ + C_Status (*get_driver_version)(size_t* version); + + void* reserved_info_api[8]; + + /////////////// + // other api // + /////////////// + + void* reserved_other_api[8]; +}; + +struct CustomRuntimeVersion { + size_t major, minor, patch; +}; + +struct CustomRuntimeParams { + // Core fill it and plugin must check it + size_t size; + // Plugin fill it + C_DeviceInterface* interface; + // Plugin fill it and Core will check it + CustomRuntimeVersion version; + // Plugin fill it + char* device_type; + // Plugin fill it + char* sub_device_type; + + char reserved[32]; +}; + +// Plugin implements it and fills CustomRuntimeParams +void InitPlugin(CustomRuntimeParams*); + +#ifdef __cplusplus +} // extern "C" +#endif +#endif diff --git a/paddle/fluid/platform/device/device_guard.cc b/paddle/fluid/platform/device/device_guard.cc new file mode 100644 index 00000000000..55d8b9dc6a9 --- /dev/null +++ b/paddle/fluid/platform/device/device_guard.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/device/device_guard.h" + +namespace paddle { +namespace platform { +// Even though this source file does not contain any code, it is better to +// keep this source file for the cmake dependency. +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/device_guard.h b/paddle/fluid/platform/device/device_guard.h new file mode 100644 index 00000000000..638e9c984b4 --- /dev/null +++ b/paddle/fluid/platform/device/device_guard.h @@ -0,0 +1,48 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
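(Illustration, not part of the patch: a minimal sketch of the plugin entry point declared in device_ext.h above. The "demo" names are hypothetical; callbacks a backend cannot honor may simply stay NULL, and the strict size check is for brevity only.)

#include "paddle/fluid/platform/device/device_ext.h"

static C_Status DemoInit() { return C_SUCCESS; }

static C_Status DemoGetDeviceCount(size_t *count) {
  *count = 1;
  return C_SUCCESS;
}

// Core dlopen()s the plugin .so and calls InitPlugin once; the plugin checks
// the sizes Core filled in, reports its version, and installs its callbacks.
void InitPlugin(CustomRuntimeParams *params) {
  if (params->size != sizeof(CustomRuntimeParams) ||
      params->interface->size != sizeof(C_DeviceInterface)) {
    return; /* size mismatch: built against an incompatible core */
  }
  params->version.major = PADDLE_CUSTOM_RUNTIME_MAJOR_VERSION;
  params->version.minor = PADDLE_CUSTOM_RUNTIME_MINOR_VERSION;
  params->version.patch = PADDLE_CUSTOM_RUNTIME_PATCH_VERSION;
  params->device_type = (char *)"demo";   /* hypothetical backend name */
  params->sub_device_type = (char *)"v1"; /* hypothetical */
  params->interface->initialize = DemoInit;
  params->interface->get_device_count = DemoGetDeviceCount;
  /* the remaining entry points would be filled the same way; see
     fake_cpu_device.h above for a complete table. */
}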
+ +#pragma once +#include "paddle/fluid/platform/device/device_manager.h" + +namespace paddle { +namespace platform { + +class DeviceGuard { + public: + explicit inline DeviceGuard(const Place& place) + : dev_type_(PlaceHelper::GetDeviceType(place)) { + prev_id = DeviceManager::GetDevice(dev_type_); + cur_id = PlaceHelper::GetDeviceId(place); + + if (cur_id != prev_id) { + DeviceManager::SetDevice(dev_type_, cur_id); + } + } + + inline ~DeviceGuard() { + if (cur_id != prev_id) { + DeviceManager::SetDevice(dev_type_, prev_id); + } + } + + DeviceGuard(const DeviceGuard& o) = delete; + DeviceGuard& operator=(const DeviceGuard& o) = delete; + + private: + size_t prev_id, cur_id; + std::string dev_type_; +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/device_manager.cc b/paddle/fluid/platform/device/device_manager.cc new file mode 100644 index 00000000000..38dcb721b1f --- /dev/null +++ b/paddle/fluid/platform/device/device_manager.cc @@ -0,0 +1,420 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifdef PADDLE_WITH_CUSTOM_DEVICE +#include "paddle/fluid/platform/device/device_manager.h" + +#if !defined(_WIN32) +#include <dlfcn.h> +#else + +#endif + +#include <functional> +#include <regex> + +namespace paddle { +namespace platform { + +void Device::CreateStream(stream::Stream* stream, + const stream::Stream::Priority& priority, + const stream::Stream::Flag& flag) { + impl_->CreateStream(dev_id_, stream, priority, flag); +} + +void Device::DestroyStream(stream::Stream* stream) { + impl_->DestroyStream(dev_id_, stream); +} + +void Device::SynchronizeStream(const stream::Stream* stream) { + impl_->SynchronizeStream(dev_id_, stream); +} + +bool Device::QueryStream(const stream::Stream* stream) { + return impl_->QueryStream(dev_id_, stream); +} + +void Device::AddCallback(stream::Stream* stream, + stream::Stream::Callback* callback) { + impl_->AddCallback(dev_id_, stream, callback); +} + +void Device::CreateEvent(event::Event* event, event::Event::Flag flags) { + impl_->CreateEvent(dev_id_, event, flags); +} + +void Device::DestroyEvent(event::Event* event) { + impl_->DestroyEvent(dev_id_, event); +} + +void Device::RecordEvent(const event::Event* event, + const stream::Stream* stream) { + impl_->RecordEvent(dev_id_, event, stream); +} + +void Device::SynchronizeEvent(const event::Event* event) { + impl_->SynchronizeEvent(dev_id_, event); +} + +bool Device::QueryEvent(const event::Event* event) { + return impl_->QueryEvent(dev_id_, event); +} + +void Device::StreamWaitEvent(const stream::Stream* stream, + const event::Event* event) { + impl_->StreamWaitEvent(dev_id_, stream, event); +} + +void Device::MemoryCopyH2D(void* dst, const void* src, size_t size, + const stream::Stream* stream) { + impl_->MemoryCopyH2D(dev_id_, dst, src, size, stream); +} + +void Device::MemoryCopyD2H(void* dst, const void* src, size_t size, + const stream::Stream* stream) { + impl_->MemoryCopyD2H(dev_id_, dst, src, size, stream); 
+} + +void Device::MemoryCopyD2D(void* dst, const void* src, size_t size, + const stream::Stream* stream) { + impl_->MemoryCopyD2D(dev_id_, dst, src, size, stream); +} + +void Device::MemoryCopyP2P(const Place& dst_place, void* dst, const void* src, + size_t size, const stream::Stream* stream) { + impl_->MemoryCopyP2P(dst_place, dst, dev_id_, src, size, stream); +} + +void* Device::MemoryAllocate(size_t size) { + return impl_->MemoryAllocate(dev_id_, size); +} + +void Device::MemoryDeallocate(void* ptr, size_t size) { + impl_->MemoryDeallocate(dev_id_, ptr, size); +} + +void* Device::MemoryAllocateHost(size_t size) { + return impl_->MemoryAllocateHost(dev_id_, size); +} + +void Device::MemoryDeallocateHost(void* ptr, size_t size) { + impl_->MemoryDeallocateHost(dev_id_, ptr, size); +} + +void* Device::MemoryAllocateUnified(size_t size) { + return impl_->MemoryAllocateUnified(dev_id_, size); +} + +void Device::MemoryDeallocateUnified(void* ptr, size_t size) { + impl_->MemoryDeallocateUnified(dev_id_, ptr, size); +} + +void Device::MemorySet(void* ptr, uint8_t value, size_t size) { + impl_->MemorySet(dev_id_, ptr, value, size); +} + +std::string Device::Type() { return impl_->Type(); } + +static pten::RWLock _global_device_manager_rw_lock; + +bool DeviceManager::Register(std::unique_ptr<DeviceInterface> device_impl) { + pten::AutoWRLock lock(&_global_device_manager_rw_lock); + VLOG(4) << "Register Device - " << device_impl->Type(); + auto device_type = device_impl->Type(); + auto& dev_impl_map = Instance().device_impl_map_; + auto& dev_map = Instance().device_map_; + + if (dev_impl_map.find(device_type) == dev_impl_map.end()) { + dev_impl_map.insert( + std::pair<std::string, std::unique_ptr<DeviceInterface>>( + device_type, std::move(device_impl))); + auto& dev_impl = dev_impl_map[device_type]; + auto& dev_vec = dev_map[device_type]; + VLOG(4) << "GetDeviceCount is " << dev_impl->GetDeviceCount(); + for (size_t i = 0; i < dev_impl->GetDeviceCount(); ++i) { + dev_vec.emplace_back(new Device(i, dev_impl.get())); + } + } else { + auto& plat = dev_impl_map[device_type]; + if (plat->IsCustom() && plat->Priority() > device_impl->Priority()) { + dev_impl_map[device_type] = std::move(device_impl); + auto& dev_impl = dev_impl_map[device_type]; + auto& dev_vec = dev_map[device_type]; + dev_vec.clear(); + VLOG(4) << "GetDeviceCount is " << dev_impl->GetDeviceCount(); + for (size_t i = 0; i < dev_impl->GetDeviceCount(); ++i) { + dev_vec.emplace_back(new Device(i, dev_impl.get())); + } + } else { + return false; + } + } + return true; +} + +DeviceInterface* DeviceManager::GetDeviceInterfaceWithType( + const std::string& device_type) { + pten::AutoRDLock lock(&_global_device_manager_rw_lock); + + auto& dev_impl_map = Instance().device_impl_map_; + if (dev_impl_map.find(device_type) != dev_impl_map.end()) { + return dev_impl_map.at(device_type).get(); + } else { + LOG(ERROR) << "GetDeviceInterfaceWithType - " << device_type << " Failed\n"; + PADDLE_THROW( + platform::errors::Fatal("Unregistered device type %s.", device_type)); + return nullptr; + } +} + +Device* DeviceManager::GetDeviceWithPlace(const Place& place) { + pten::AutoRDLock lock(&_global_device_manager_rw_lock); + + auto& dev_map = Instance().device_map_; + auto dev_type = PlaceHelper::GetDeviceType(place); + auto dev_id = PlaceHelper::GetDeviceId(place); + PADDLE_ENFORCE_NE(dev_map.find(dev_type), dev_map.end(), + platform::errors::NotFound( + "Unable to find Device with type %s.", dev_type)); + auto& dev_vec = dev_map[dev_type]; + PADDLE_ENFORCE_LT( + dev_id, dev_vec.size(), + 
platform::errors::OutOfRange( + "The visible devices count of type %s is %d, but dev_id is %d.", + dev_type, dev_vec.size(), dev_id)); + return dev_vec[dev_id].get(); +} + +std::vector<std::string> DeviceManager::GetAllDeviceTypes() { + pten::AutoRDLock lock(&_global_device_manager_rw_lock); + auto& dev_impl_map = Instance().device_impl_map_; + std::vector<std::string> devices; + for (auto iter = dev_impl_map.cbegin(); iter != dev_impl_map.cend(); ++iter) { + devices.push_back(iter->first); + } + return devices; +} + +std::vector<std::string> DeviceManager::GetAllCustomDeviceTypes() { + pten::AutoRDLock lock(&_global_device_manager_rw_lock); + auto& dev_impl_map = Instance().device_impl_map_; + std::vector<std::string> devices; + for (auto iter = dev_impl_map.cbegin(); iter != dev_impl_map.cend(); ++iter) { + if (iter->second->IsCustom()) { + devices.push_back(iter->first); + } + } + return devices; +} + +std::vector<std::string> DeviceManager::GetAllDeviceList() { + pten::AutoRDLock lock(&_global_device_manager_rw_lock); + auto& dev_impl_map = Instance().device_impl_map_; + std::vector<std::string> devices; + for (auto iter = dev_impl_map.cbegin(); iter != dev_impl_map.cend(); ++iter) { + size_t device_count = iter->second->GetDeviceCount(); + std::string dev_type = iter->second->Type(); + if (device_count == 1) { + devices.push_back(dev_type); + } else { + for (size_t i = 0; i < device_count; ++i) { + devices.push_back(dev_type + ":" + std::to_string(i)); + } + } + } + return devices; +} + +std::vector<std::string> DeviceManager::GetAllCustomDeviceList() { + pten::AutoRDLock lock(&_global_device_manager_rw_lock); + auto& dev_impl_map = Instance().device_impl_map_; + std::vector<std::string> devices; + for (auto iter = dev_impl_map.cbegin(); iter != dev_impl_map.cend(); ++iter) { + size_t device_count = iter->second->GetDeviceCount(); + std::string dev_type = iter->second->Type(); + if (iter->second->IsCustom()) { + if (device_count == 1) { + devices.push_back(dev_type); + } else { + for (size_t i = 0; i < device_count; ++i) { + devices.push_back(dev_type + ":" + std::to_string(i)); + } + } + } + } + return devices; +} + +bool DeviceManager::HasDeviceType(const std::string& device_type) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + return dev_impl != nullptr; +} + +bool DeviceManager::IsCustom(const std::string& device_type) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + return dev_impl->IsCustom(); +} + +void DeviceManager::Initialize(const std::string& device_type) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->Initialize(); +} + +void DeviceManager::Finalize(const std::string& device_type) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->Finalize(); +} + +void DeviceManager::SynchronizeDevice(const Place& place) { + auto device_type = PlaceHelper::GetDeviceType(place); + auto device_id = PlaceHelper::GetDeviceId(place); + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->SynchronizeDevice(device_id); +} + +void DeviceManager::InitDevice(const Place& place) { + auto device_type = PlaceHelper::GetDeviceType(place); + auto device_id = PlaceHelper::GetDeviceId(place); + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->InitDevice(device_id); +} + +void DeviceManager::DeInitDevice(const Place& place) { + auto device_type = PlaceHelper::GetDeviceType(place); + auto device_id = PlaceHelper::GetDeviceId(place); + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->DeInitDevice(device_id); +} + +void DeviceManager::SetDevice(const std::string& device_type, + 
size_t device_id) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->SetDevice(device_id); +} + +void DeviceManager::SetDevice(const Place& place) { + auto device_type = PlaceHelper::GetDeviceType(place); + auto device_id = PlaceHelper::GetDeviceId(place); + DeviceManager::SetDevice(device_type, device_id); +} + +int DeviceManager::GetDevice(const std::string& device_type) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + return dev_impl->GetDevice(); +} + +size_t DeviceManager::GetMinChunkSize(const Place& place) { + auto device_type = PlaceHelper::GetDeviceType(place); + auto device_id = PlaceHelper::GetDeviceId(place); + auto dev_impl = GetDeviceInterfaceWithType(device_type); + return dev_impl->GetMinChunkSize(device_id); +} + +size_t DeviceManager::GetMaxChunkSize(const Place& place) { + auto device_type = PlaceHelper::GetDeviceType(place); + auto device_id = PlaceHelper::GetDeviceId(place); + auto dev_impl = GetDeviceInterfaceWithType(device_type); + return dev_impl->GetMaxChunkSize(device_id); +} + +size_t DeviceManager::GetMaxAllocSize(const Place& place) { + auto device_type = PlaceHelper::GetDeviceType(place); + auto device_id = PlaceHelper::GetDeviceId(place); + auto dev_impl = GetDeviceInterfaceWithType(device_type); + return dev_impl->GetMaxAllocSize(device_id); +} + +size_t DeviceManager::GetInitAllocSize(const Place& place) { + auto device_type = PlaceHelper::GetDeviceType(place); + auto device_id = PlaceHelper::GetDeviceId(place); + auto dev_impl = GetDeviceInterfaceWithType(device_type); + return dev_impl->GetInitAllocSize(device_id); +} + +size_t DeviceManager::GetReallocSize(const Place& place) { + auto device_type = PlaceHelper::GetDeviceType(place); + auto device_id = PlaceHelper::GetDeviceId(place); + auto dev_impl = GetDeviceInterfaceWithType(device_type); + return dev_impl->GetReallocSize(device_id); +} + +size_t DeviceManager::GetExtraPaddingSize(const Place& place) { + auto device_type = PlaceHelper::GetDeviceType(place); + auto device_id = PlaceHelper::GetDeviceId(place); + auto dev_impl = GetDeviceInterfaceWithType(device_type); + return dev_impl->GetExtraPaddingSize(device_id); +} + +void DeviceManager::MemoryStats(const Place& place, size_t* total, + size_t* free) { + auto device_type = PlaceHelper::GetDeviceType(place); + auto device_id = PlaceHelper::GetDeviceId(place); + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->MemoryStats(device_id, total, free); +} + +size_t DeviceManager::GetDeviceCount(const std::string& device_type) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + return dev_impl->GetDeviceCount(); +} + +std::vector<size_t> DeviceManager::GetDeviceList( + const std::string& device_type) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + return dev_impl->GetDeviceList(); +} + +DeviceManager& DeviceManager::Instance() { + static DeviceManager platform_manager; + return platform_manager; +} + +std::vector<std::string> ListAllLibraries(const std::string& library_dir) { + std::vector<std::string> libraries; + std::regex express(".*\\.so"); + std::match_results<std::string::iterator> results; + DIR* dir = nullptr; + dirent* ptr = nullptr; + + dir = opendir(library_dir.c_str()); + if (dir == nullptr) { + VLOG(4) << "open CustomDevice library_dir: " << library_dir << " failed"; + } else { + while ((ptr = readdir(dir)) != nullptr) { + std::string filename(ptr->d_name); + if (std::regex_match(filename.begin(), filename.end(), results, + express)) { + libraries.push_back(library_dir + '/' + filename); + VLOG(4) << "found 
CustomDevice library: " << libraries.back() + << std::endl; + } + } + closedir(dir); + } + + return libraries; +} + +bool LoadCustomDevice(const std::string& library_dir) { + std::vector libs = ListAllLibraries(library_dir); + for (const auto& lib_path : libs) { + auto dso_handle = dlopen(lib_path.c_str(), RTLD_NOW); + LoadCustomRuntimeLib(dso_handle); + } + return true; +} + +} // namespace platform +} // namespace paddle +#endif diff --git a/paddle/fluid/platform/device/device_manager.h b/paddle/fluid/platform/device/device_manager.h new file mode 100644 index 00000000000..ad910605d98 --- /dev/null +++ b/paddle/fluid/platform/device/device_manager.h @@ -0,0 +1,186 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#ifdef PADDLE_WITH_CUSTOM_DEVICE + +#include "paddle/fluid/platform/device/device_base.h" +#include "paddle/fluid/platform/device/device_ext.h" +#include "paddle/fluid/platform/device/event.h" +#include "paddle/fluid/platform/device/stream.h" +#include "paddle/fluid/platform/place.h" + +#include "paddle/pten/backends/dynload/port.h" +#include "paddle/pten/core/utils/rw_lock.h" + +namespace paddle { +namespace platform { +class Device final { + public: + Device(size_t dev_id, DeviceInterface* impl) : dev_id_(dev_id), impl_(impl) {} + + // Stream + // ! Create an asynchronous stream + void CreateStream( + stream::Stream* stream, const stream::Stream::Priority& priority = + stream::Stream::Priority::kNormal, + const stream::Stream::Flag& flag = stream::Stream::Flag::kDefaultFlag); + + // ! Destroys an asynchronous stream. + void DestroyStream(stream::Stream* stream); + + // ! Waits for stream tasks to complete. + void SynchronizeStream(const stream::Stream* stream); + + // ! Queries an asynchronous stream for completion status. + bool QueryStream(const stream::Stream* stream); + + // ! Add a callback to a compute stream. + void AddCallback(stream::Stream* stream, stream::Stream::Callback* callback); + + // Event + // ! Create an event. + void CreateEvent(event::Event* event, event::Event::Flag flags); + + // ! Destroy an event. + void DestroyEvent(event::Event* event); + + // ! Records an event. + void RecordEvent(const event::Event* event, const stream::Stream* stream); + + // ! Waits for event to complete. + void SynchronizeEvent(const event::Event* event); + + // ! Queries an event for completion status. + bool QueryEvent(const event::Event* event); + + // ! 
Make a compute stream wait on an event + void StreamWaitEvent(const stream::Stream* stream, const event::Event* event); + + // Memory + void MemoryCopyH2D(void* dst, const void* src, size_t size, + const stream::Stream* stream = nullptr); + + void MemoryCopyD2H(void* dst, const void* src, size_t size, + const stream::Stream* stream = nullptr); + + void MemoryCopyD2D(void* dst, const void* src, size_t size, + const stream::Stream* stream = nullptr); + + void MemoryCopyP2P(const Place& dst_place, void* dst, const void* src, + size_t size, const stream::Stream* stream = nullptr); + + void* MemoryAllocate(size_t size); + + void MemoryDeallocate(void* ptr, size_t size); + + void* MemoryAllocateHost(size_t size); + + void MemoryDeallocateHost(void* ptr, size_t size); + + void* MemoryAllocateUnified(size_t size); + + void MemoryDeallocateUnified(void* ptr, size_t size); + + void MemorySet(void* ptr, uint8_t value, size_t size); + + std::string Type(); + + private: + size_t dev_id_; + DeviceInterface* impl_; +}; + +class DeviceManager { + public: + static bool Register(std::unique_ptr<DeviceInterface> device); + static bool RegisterPinnedDevice(DeviceInterface* device); + static Device* GetDeviceWithPlace(const Place& place); + static std::vector<std::string> GetAllDeviceTypes(); + static std::vector<std::string> GetAllCustomDeviceTypes(); + static std::vector<std::string> GetAllDeviceList(); + static std::vector<std::string> GetAllCustomDeviceList(); + static bool HasDeviceType(const std::string& device_type); + static bool IsCustom(const std::string& device_type); + + // platform & device + static void Initialize(const std::string& device_type); + + static void Finalize(const std::string& device_type); + + static void SynchronizeDevice(const Place& place); + + static void InitDevice(const Place& place); + + static void DeInitDevice(const Place& place); + + static void SetDevice(const std::string& device_type, size_t device_id); + + static void SetDevice(const Place& place); + + static int GetDevice(const std::string& device_type); + + static size_t GetMinChunkSize(const Place& place); + + static size_t GetMaxChunkSize(const Place& place); + + static size_t GetMaxAllocSize(const Place& place); + + static size_t GetInitAllocSize(const Place& place); + + static size_t GetReallocSize(const Place& place); + + static size_t GetExtraPaddingSize(const Place& place); + + static void MemoryStats(const Place& place, size_t* total, size_t* free); + + static size_t GetDeviceCount(const std::string& device_type); + + static std::vector<size_t> GetDeviceList(const std::string& device_type); + + private: + DISABLE_COPY_AND_ASSIGN(DeviceManager); + DeviceManager() {} + static DeviceManager& Instance(); + static DeviceInterface* GetDeviceInterfaceWithType( + const std::string& device_type); + + std::unordered_map<std::string, std::unique_ptr<DeviceInterface>> + device_impl_map_; + std::unordered_map<std::string, std::vector<std::unique_ptr<Device>>> + device_map_; +}; + +bool LoadCustomRuntimeLib(void* dso_handle); + +bool LoadCustomRuntimeLib(const CustomRuntimeParams& runtime_params, + std::unique_ptr<C_DeviceInterface> device_interface, + void* dso_handle); + +bool LoadCustomDevice(const std::string& library_path); + +class Registrar { + public: + template <typename DeviceT> + explicit Registrar(DeviceT* device_ptr) { + DeviceManager::Register(std::unique_ptr<DeviceT>(device_ptr)); + } + + void Touch() {} +}; + +} // namespace platform +} // namespace paddle + +#endif diff --git a/paddle/fluid/platform/device/device_wrapper.h b/paddle/fluid/platform/device/device_wrapper.h index 4f8bbb2d268..ba3461d8c14 100644 --- a/paddle/fluid/platform/device/device_wrapper.h +++ b/paddle/fluid/platform/device/device_wrapper.h @@ -38,3 
diff --git a/paddle/fluid/platform/device/device_wrapper.h b/paddle/fluid/platform/device/device_wrapper.h index 4f8bbb2d268..ba3461d8c14 100644 --- a/paddle/fluid/platform/device/device_wrapper.h +++ b/paddle/fluid/platform/device/device_wrapper.h @@ -38,3 +38,12 @@ limitations under the License. */ #ifdef PADDLE_WITH_IPU #include "paddle/fluid/platform/device/ipu/ipu_info.h" #endif + +#ifdef PADDLE_WITH_CUSTOM_DEVICE +#include "paddle/fluid/platform/device/callback_manager.h" +#include "paddle/fluid/platform/device/custom/enforce_custom.h" +#include "paddle/fluid/platform/device/device_guard.h" +#include "paddle/fluid/platform/device/device_manager.h" +#include "paddle/fluid/platform/device/event.h" +#include "paddle/fluid/platform/device/stream.h" +#endif diff --git a/paddle/fluid/platform/device/event.cc b/paddle/fluid/platform/device/event.cc new file mode 100644 index 00000000000..6e6316ea16d --- /dev/null +++ b/paddle/fluid/platform/device/event.cc @@ -0,0 +1,64 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/device/event.h" +#include "paddle/fluid/platform/device/device_guard.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/device/stream.h" + +namespace paddle { +namespace platform { +namespace event { + +event_t Event::raw_event() const { return event_; } + +void Event::set_event(event_t event) { event_ = event; } + +Event::Event(const Place& place, event_t event) + : place_(place), + device_(platform::DeviceManager::GetDeviceWithPlace(place)), + event_(event), + own_data_(false) {} + +Event::~Event() { Destroy(); } + +bool Event::Init(const Place& place, Flag flags) { + place_ = place; + DeviceGuard guard(place_); + device_->CreateEvent(this, flags); + VLOG(3) << "Init Event: " << event_ << ", place: " << place_ + << ", flag:" << static_cast<int>(flags); + own_data_ = true; + return true; +} + +void Event::Destroy() { + if (own_data_) { + DeviceGuard guard(place_); + device_->DestroyEvent(this); + own_data_ = false; + } +} + +void Event::Record(const stream::Stream* stream) { stream->RecordEvent(this); } + +bool Event::Query() const { return device_->QueryEvent(this); } + +void Event::Synchronize() const { device_->SynchronizeEvent(this); } + +const Place& Event::GetPlace() const { return place_; } + +} // namespace event +} // namespace platform +} // namespace paddle
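A sketch of the event lifecycle implemented above, under the assumption that `place` names a custom device whose runtime is already loaded (the sketch itself is not part of the patch):

#include "paddle/fluid/platform/device/event.h"
#include "paddle/fluid/platform/device/stream.h"

// Create an event, record it on a stream, poll, then block until done.
void EventLifecycle(const paddle::platform::CustomPlace& place) {
  namespace plat = paddle::platform;

  plat::stream::Stream stream;
  stream.Init(place);

  plat::event::Event event(place, /*event=*/nullptr);
  event.Init(place);       // allocates the handle; own_data_ becomes true

  event.Record(&stream);   // capture the stream's current position
  if (!event.Query()) {    // non-blocking completion check
    event.Synchronize();   // host-side blocking wait
  }
  // ~Event() calls Destroy(), which releases the handle it owns.
}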
diff --git a/paddle/fluid/platform/device/event.h b/paddle/fluid/platform/device/event.h new file mode 100644 index 00000000000..376d73eb666 --- /dev/null +++ b/paddle/fluid/platform/device/event.h @@ -0,0 +1,61 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace platform { + +class Device; + +namespace stream { +class Stream; +} // namespace stream + +namespace event { +using event_t = void*; + +class Event { + public: + enum Flag { + Default = 0x0, + BlockingSync = 0x1, + DisableTiming = 0x2, + Interprocess = 0x4, + }; + + // For compatibility + Event(const Place& place, event_t event); + ~Event(); + event_t raw_event() const; + void set_event(event_t event); + bool Init(const Place& place, Flag flags = Flag::Default); + void Destroy(); + void Record(const stream::Stream* stream); + bool Query() const; + void Synchronize() const; + const Place& GetPlace() const; + + private: + DISABLE_COPY_AND_ASSIGN(Event); + Place place_; + Device* device_; + event_t event_; + bool own_data_ = true; +}; +} // namespace event + +} // namespace platform +} // namespace paddle
+ +#include "paddle/fluid/platform/device/stream.h" +#include "paddle/fluid/platform/device/device_guard.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/device/event.h" + +namespace paddle { +namespace platform { +namespace stream { + +Stream::~Stream() { Destroy(); } + +const stream_t& Stream::raw_stream() const { return stream_; } + +void Stream::set_stream(stream_t stream) { stream_ = stream; } + +// For compatiable +Stream::Stream(const Place& place, stream_t stream) + : place_(place), + device_(platform::DeviceManager::GetDeviceWithPlace(place)), + stream_(stream), + callback_manager_(new CallbackManager(this)), + own_data_(false) {} + +bool Stream::Init(const Place& place, const Priority& priority, + const Flag& flag) { + place_ = place; + device_ = platform::DeviceManager::GetDeviceWithPlace(place); + DeviceGuard guard(place_); + device_->CreateStream(this, priority, flag); + + callback_manager_.reset(new CallbackManager(this)); + VLOG(3) << "Init Stream: " << stream_ << ", place: " << place_ + << ", priority: " << static_cast(priority) + << ", flag:" << static_cast(flag); + own_data_ = true; + return true; +} + +void Stream::RecordEvent(event::Event* event, Callback callback) const { + callback(); + device_->RecordEvent(event, this); +} + +void Stream::RecordEvent(event::Event* event) const { + device_->RecordEvent(event, this); +} + +void Stream::WaitEvent(event::Event* event) const { + device_->StreamWaitEvent(this, event); +} + +void Stream::Wait() const { +#if !defined(_WIN32) + device_->SynchronizeStream(this); +#else + while (1) { + if (device_->QueryStream(this)) { + break; + } + } +#endif +} + +void Stream::WaitCallback() const { callback_manager_->Wait(); } + +void Stream::Destroy() { + if (own_data_) { + DeviceGuard guard(place_); + device_->DestroyStream(this); + own_data_ = false; + } +} + +bool Stream::Query() const { return device_->QueryStream(this); } + +void Stream::Synchronize() const { device_->SynchronizeStream(this); } + +const Place& Stream::GetPlace() const { return place_; } + +} // namespace stream +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/stream.h b/paddle/fluid/platform/device/stream.h new file mode 100644 index 00000000000..25cf705ee09 --- /dev/null +++ b/paddle/fluid/platform/device/stream.h @@ -0,0 +1,79 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
diff --git a/paddle/fluid/platform/device/stream.h b/paddle/fluid/platform/device/stream.h new file mode 100644 index 00000000000..25cf705ee09 --- /dev/null +++ b/paddle/fluid/platform/device/stream.h @@ -0,0 +1,79 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/platform/device/callback_manager.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace platform { + +class Device; + +namespace event { +class Event; +} // namespace event + +namespace stream { +using stream_t = void*; +class Stream { + public: + enum class Priority : uint8_t { + kNull = 0x0, + kHigh = 0x1, + kNormal = 0x2, + }; + + enum class Flag : uint8_t { + kDefaultFlag = 0x0, + kStreamNonBlocking = 0x1, + }; + + using Callback = std::function<void()>; + + Stream() = default; + // For compatibility + Stream(const Place& place, stream_t stream); + ~Stream(); + const stream_t& raw_stream() const; + void set_stream(stream_t stream); + bool Init(const Place& place, const Priority& priority = Priority::kNormal, + const Flag& flag = Flag::kDefaultFlag); + template <typename Callback> + void AddCallback(Callback&& callback) const { + callback_manager_->AddCallback(callback); + } + void RecordEvent(event::Event* event, Callback callback) const; + void RecordEvent(event::Event* event) const; + void WaitEvent(event::Event* event) const; + void Wait() const; + void WaitCallback() const; + void Destroy(); + bool Query() const; + void Synchronize() const; + const Place& GetPlace() const; + + private: + DISABLE_COPY_AND_ASSIGN(Stream); + Place place_; + Device* device_; + stream_t stream_; + std::unique_ptr<CallbackManager> callback_manager_; + bool own_data_ = true; +}; + +} // namespace stream +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index a0a853a2f05..d448df0702a 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -30,6 +30,7 @@ limitations under the License. */ #include "paddle/fluid/framework/expect.h" #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" +#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { @@ -256,6 +257,15 @@ DeviceContextPool::DeviceContextPool( "NPUPinnedPlace is not supported. Please re-compile with " "WITH_ASCEND_CL " "option.")); +#endif + } else if (platform::is_custom_place(p)) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + EmplaceDeviceContext<CustomDeviceContext>(&device_contexts_, p); +#else + PADDLE_THROW(platform::errors::Unimplemented( + "CustomPlace is not supported. Please re-compile with " + "WITH_CUSTOM_DEVICE " + "option.")); #endif } } @@ -885,6 +895,24 @@ MKLDNNDeviceContext::BlobPtr_t<void> MKLDNNDeviceContext::GetBlob( return key_it->second; } +#endif + +#ifdef PADDLE_WITH_CUSTOM_DEVICE +CustomDeviceContext::CustomDeviceContext(CustomPlace place) : place_(place) { + DeviceGuard guard(place_); + stream_.reset(new stream::Stream()); + stream_->Init(place_); +} + +CustomDeviceContext::~CustomDeviceContext() {} + +const Place& CustomDeviceContext::GetPlace() const { return place_; } + +void CustomDeviceContext::Wait() const { + // platform::RecordEvent record_event("NPUDeviceContext/wait"); + VLOG(4) << "CustomDevice context(" << this << ") Wait"; + stream_->Wait(); +} #endif } // namespace platform } // namespace paddle
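A short sketch (hypothetical, not part of the patch) of how the pooled context added above would be consumed; the pool owns the returned context, so it must not be deleted by the caller:

#include "paddle/fluid/platform/device_context.h"

void DrainCustomContext(const paddle::platform::CustomPlace& place) {
  namespace plat = paddle::platform;
  auto* ctx = static_cast<plat::CustomDeviceContext*>(
      plat::DeviceContextPool::Instance().Get(place));

  ctx->AddStreamCallback([] { /* runs after queued work completes */ });
  ctx->WaitStreamCallback();  // block until the callback has fired
  ctx->Wait();                // full stream synchronization
}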
*/ #include "paddle/fluid/platform/device/npu/enforce_npu.h" #include "paddle/fluid/platform/device/npu/npu_stream.h" #endif + +#include "paddle/fluid/platform/device/device_ext.h" +#include "paddle/fluid/platform/device/stream.h" #include "unsupported/Eigen/CXX11/Tensor" namespace Eigen { @@ -815,6 +818,47 @@ class MKLDNNDeviceContext : public CPUDeviceContext { }; #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE +class CustomDeviceContext : public DeviceContext { + public: + explicit CustomDeviceContext(CustomPlace place); + virtual ~CustomDeviceContext(); + + const Place& GetPlace() const override; + void Wait() const override; + Eigen::DefaultDevice* eigen_device() const { return nullptr; } + C_Stream stream() const { + return reinterpret_cast(stream_->raw_stream()); + } + + template + void AddStreamCallback(Callback&& callback) const { + return stream_->AddCallback(callback); + } + + void WaitStreamCallback() const { return stream_->WaitCallback(); } + + private: + std::string device_type_; + + CustomPlace place_; + + std::shared_ptr stream_; + + CustomDeviceContext(); + DISABLE_COPY_AND_ASSIGN(CustomDeviceContext); +}; +template <> +struct DefaultDeviceContextType { + using TYPE = CustomDeviceContext; +}; +#else +template <> +struct DefaultDeviceContextType { + using TYPE = DeviceContext; +}; +#endif + /*! \brief device context pool singleton */ class DeviceContextPool { public: diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index b969ba971b6..39f95a92956 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -433,8 +433,9 @@ PADDLE_DEFINE_EXPORTED_double( // NOTE(zhiqiu): better to share the flags, otherwise we will have too many // flags. -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU) || \ + defined(PADDLE_WITH_CUSTOM_DEVICE) /** * Memory related FLAG diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index f7a86e5aac7..5d0fccf9e9d 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -25,6 +25,7 @@ limitations under the License. 
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index f7a86e5aac7..5d0fccf9e9d 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -25,6 +25,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/dynload/cupti.h" #endif +#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/place.h" @@ -234,6 +235,19 @@ void InitDevices(const std::vector<int> devices) { if (!custom_kernel_root.empty()) { LOG(INFO) << "ENV [CUSTOM_DEVICE_ROOT]=" << custom_kernel_root; framework::LoadCustomKernel(custom_kernel_root); +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (platform::LoadCustomDevice(custom_kernel_root)) { + auto device_types = platform::DeviceManager::GetAllCustomDeviceTypes(); + for (auto &dev_type : device_types) { + VLOG(1) << "Device type: " << dev_type << ", visible devices count: " + << platform::DeviceManager::GetDeviceCount(dev_type); + for (size_t i = 0; + i < platform::DeviceManager::GetDeviceCount(dev_type); i++) { + places.push_back(platform::CustomPlace(dev_type, i)); + } + } + } +#endif } else { VLOG(3) << "ENV [CUSTOM_DEVICE_ROOT] is empty."; }
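A sketch of the deployment side of this hook; both the no-argument `InitDevices()` overload and the plug-in directory path are assumptions for illustration:

#include <cstdlib>
#include "paddle/fluid/platform/init.h"

// Point CUSTOM_DEVICE_ROOT at a directory of plug-in shared objects
// before device init, so each visible custom device is appended to the
// default place list as a CustomPlace.
int main() {
  setenv("CUSTOM_DEVICE_ROOT", "/opt/vendor_device/lib", /*overwrite=*/1);
  paddle::framework::InitDevices();
  return 0;
}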
diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc index e73e3736f64..b73e2e398f2 100644 --- a/paddle/fluid/platform/place.cc +++ b/paddle/fluid/platform/place.cc @@ -56,7 +56,16 @@ bool is_npu_pinned_place(const Place &p) { return p.GetType() == pten::AllocationType::NPUPINNED; } +bool is_custom_place(const Place &p) { + return p.GetType() == pten::AllocationType::CUSTOM; +} + bool places_are_same_class(const Place &p1, const Place &p2) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (is_custom_place(p1) && is_custom_place(p2)) { + return p1.GetDeviceType() == p2.GetDeviceType(); + } +#endif return p1.GetType() == p2.GetType(); } @@ -73,6 +82,8 @@ bool is_same_place(const Place &p1, const Place &p2) { return p1 == p2; } else if (is_ipu_place(p1)) { return p1 == p2; + } else if (is_custom_place(p1)) { + return p1 == p2; } else { return p1 == p2; } @@ -81,5 +92,43 @@ } +#ifdef PADDLE_WITH_CUSTOM_DEVICE +std::string PlaceHelper::GetDeviceType(const Place &place) { + if (is_cpu_place(place)) { + return "cpu"; + } else if (is_gpu_place(place)) { + return "gpu"; + } else if (is_npu_place(place)) { + return "npu"; + } else if (is_xpu_place(place)) { + return "xpu"; + } else if (is_custom_place(place)) { + return place.GetDeviceType(); + } else { + PADDLE_THROW(platform::errors::Fatal( + "Unknown device type. Please check available devices by " + "paddle.device.get_available_device()")); + } +} + +size_t PlaceHelper::GetDeviceId(const Place &place) { + return place.GetDeviceId(); +} + +Place PlaceHelper::CreatePlace(const std::string &dev_type, size_t dev_id) { + if (dev_type == "cpu") { + return platform::CPUPlace(); + } else if (dev_type == "gpu") { + return platform::CUDAPlace(dev_id); + } else if (dev_type == "npu") { + return platform::NPUPlace(dev_id); + } else if (dev_type == "xpu") { + return platform::XPUPlace(dev_id); + } else { + return platform::CustomPlace(dev_type, dev_id); + } +} +#endif + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h index 80bbeac2518..278bfad003c 100644 --- a/paddle/fluid/platform/place.h +++ b/paddle/fluid/platform/place.h @@ -36,9 +36,19 @@ using NPUPinnedPlace = pten::NPUPinnedPlace; using XPUPlace = pten::XPUPlace; using IPUPlace = pten::IPUPlace; using MLUPlace = pten::MLUPlace; +using CustomPlace = pten::CustomPlace; using PlaceList = std::vector<Place>; +#ifdef PADDLE_WITH_CUSTOM_DEVICE +class PlaceHelper { + public: + static std::string GetDeviceType(const Place &place); + static size_t GetDeviceId(const Place &place); + static Place CreatePlace(const std::string &dev_type, size_t dev_id = 0); +}; +#endif + bool is_gpu_place(const Place &); bool is_xpu_place(const Place &); bool is_npu_place(const Place &); @@ -47,6 +57,7 @@ bool is_ipu_place(const Place &); bool is_cpu_place(const Place &); bool is_cuda_pinned_place(const Place &); bool is_npu_pinned_place(const Place &); +bool is_custom_place(const Place &p); bool places_are_same_class(const Place &, const Place &); bool is_same_place(const Place &, const Place &); @@ -121,6 +132,15 @@ typename Visitor::result_type VisitPlace(const Place &place, #else PADDLE_THROW(platform::errors::Unavailable( "Paddle is not compiled with MLU. Cannot visit mlu device")); +#endif + } + case pten::AllocationType::CUSTOM: { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + platform::CustomPlace p(place.GetDeviceType(), place.GetDeviceId()); + return visitor(p); +#else + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with CUSTOM. Cannot visit custom device")); #endif } default: {
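A hypothetical round trip through the PlaceHelper introduced above (not part of the patch): any device type outside cpu/gpu/npu/xpu falls through to CustomPlace, so both the type string and the device id survive the conversion.

#include <string>
#include "paddle/fluid/platform/place.h"

void PlaceRoundTrip() {
  namespace plat = paddle::platform;
  plat::Place p = plat::PlaceHelper::CreatePlace("FakeCPU", 0);

  std::string type = plat::PlaceHelper::GetDeviceType(p);  // "FakeCPU"
  size_t id = plat::PlaceHelper::GetDeviceId(p);           // 0

  // Custom places are "the same class" when their device types match,
  // regardless of device id.
  bool same = plat::places_are_same_class(p, plat::CustomPlace("FakeCPU", 1));
  (void)type; (void)id; (void)same;  // same == true
}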
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 265f0fba8f3..b1fe9f99b5d 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -284,7 +284,7 @@ if(WITH_PYTHON) cc_library(paddle_pybind SHARED SRCS ${PYBIND_SRCS} - DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) + DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ${GLOB_DEV_LIB}) if(NOT APPLE AND NOT WIN32) target_link_libraries(paddle_pybind rt) diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index c84a71d8aaa..f4ed1ee3424 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -136,10 +136,13 @@ static const platform::Place PyObjectToPlace(const py::object &place_obj) { return place_obj.cast(); } else if (py::isinstance(place_obj)) { return place_obj.cast(); + } else if (py::isinstance<platform::CustomPlace>(place_obj)) { + return place_obj.cast<platform::CustomPlace>(); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Place should be one of " - "Place/CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace/NPUPlace/MLUPlace")); + "Place/CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace/NPUPlace/MLUPlace/" + "CustomPlace")); } } @@ -183,6 +186,9 @@ static void InitVarBaseAndTensor( SetTensorFromPyArray(tensor, array, place, zero_copy); } else if (platform::is_mlu_place(place)) { SetTensorFromPyArray<platform::MLUPlace>(tensor, array, place, zero_copy); + } else if (platform::is_custom_place(place)) { + SetTensorFromPyArray<platform::CustomPlace>(tensor, array, place, + zero_copy); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Place should be one of " @@ -941,6 +947,10 @@ void BindImperative(py::module *m_ptr) { py::arg("value"), py::arg("place"), py::arg("persistable") = false, py::arg("zero_copy") = false, py::arg("name") = "", py::arg("stop_gradient") = -1) + .def("__init__", &InitVarBaseFromNumpyWithArg<platform::CustomPlace>, + py::arg("value"), py::arg("place"), py::arg("persistable") = false, + py::arg("zero_copy") = false, py::arg("name") = "", + py::arg("stop_gradient") = -1) .def("__init__", &InitVarBaseFromNumpyWithArgDefault, py::arg("value")) .def("__init__", &InitVarBaseFromTensorWithArgDefault, py::arg("tensor"), py::arg("name") = "") @@ -956,6 +966,8 @@ void BindImperative(py::module *m_ptr) { py::arg("tensor"), py::arg("place"), py::arg("name") = "") .def("__init__", &InitVarBaseFromTensorWithArg, py::arg("tensor"), py::arg("place"), py::arg("name") = "") + .def("__init__", &InitVarBaseFromTensorWithArg<platform::CustomPlace>, + py::arg("tensor"), py::arg("place"), py::arg("name") = "") .def("__init__", &InitVarBaseFromNumpyWithKwargs) .def( "__setitem_varbase__", @@ -2258,6 +2270,11 @@ void BindImperative(py::module *m_ptr) { self.SetExpectedPlace(*p); VLOG(4) << "Tracer(" << &self << ")" << " set expected place " << *p; + } else if (py::isinstance<platform::CustomPlace>(obj)) { + auto p = obj.cast<platform::CustomPlace *>(); + self.SetExpectedPlace(*p); + VLOG(4) << "Tracer(" << &self << ")" + << " set expected place " << *p; } else if (py::isinstance(obj)) { auto p = obj.cast(); self.SetExpectedPlace(*p); @@ -2301,6 +2318,21 @@ void BindImperative(py::module *m_ptr) { *(imperative::AmpOperators::Instance().GetMutableAllowOps()), *(imperative::AmpOperators::Instance().GetMutableBlockOps())); }) + .def("trace", + [](imperative::Tracer &self, const std::string &type, + const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, + framework::AttributeMap attrs, const platform::CustomPlace &place, + bool trace_backward, + const std::map<std::string, std::string> &inplace_map = {}) { + auto ins_map = ConvertToNameVarBaseMap(ins); + auto outs_map = ConvertToNameVarBaseMap(outs); + { + py::gil_scoped_release release; + self.TraceOp( + type, std::move(ins_map), std::move(outs_map), + std::move(attrs), place, trace_backward, inplace_map); + } + }) .def("trace", [](imperative::Tracer &self, const std::string &type, const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs,
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 959e34afe3d..5289b862dc9 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -69,6 +69,7 @@ limitations under the License. */ #include "paddle/fluid/operators/py_func_op.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/enforce.h" @@ -1667,6 +1668,139 @@ All parameter, weight, gradient are variables in Paddle. #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) py::class_<platform::Communicator>(m, "Communicator").def(py::init<>()); #endif + m.def("get_all_device_type", []() { + std::vector<std::string> device_types; +#ifdef PADDLE_WITH_CUSTOM_DEVICE + device_types = platform::DeviceManager::GetAllDeviceTypes(); +#else + LOG(WARNING) << string::Sprintf( + "Cannot use get_all_device_type because you have installed " + "CPU/GPU version PaddlePaddle.\n" + "If you want to use get_all_device_type, please try to install " + "CustomDevice version " + "PaddlePaddle by: pip install paddlepaddle-core\n"); +#endif + return device_types; + }); + m.def("get_all_custom_device_type", []() { + std::vector<std::string> device_types; +#ifdef PADDLE_WITH_CUSTOM_DEVICE + device_types = platform::DeviceManager::GetAllCustomDeviceTypes(); +#else + LOG(WARNING) << string::Sprintf( + "Cannot use get_all_custom_device_type because you have installed " + "CPU/GPU version PaddlePaddle.\n" + "If you want to use get_all_custom_device_type, please try to " + "install CustomDevice version " + "PaddlePaddle by: pip install paddlepaddle-core\n"); +#endif + return device_types; + }); + m.def("get_available_device", [] { + std::vector<std::string> devices; +#ifdef PADDLE_WITH_CUSTOM_DEVICE + devices = platform::DeviceManager::GetAllDeviceList(); +#else + LOG(WARNING) << string::Sprintf( + "Cannot use get_available_device because you have installed " + "CPU/GPU version PaddlePaddle.\n" + "If you want to use get_available_device, please try to install " + "CustomDevice version " + "PaddlePaddle by: pip install paddlepaddle-core\n"); +#endif + return devices; + }); + m.def("get_available_custom_device", [] { + std::vector<std::string> devices; +#ifdef PADDLE_WITH_CUSTOM_DEVICE + devices = platform::DeviceManager::GetAllCustomDeviceList(); +#else + LOG(WARNING) << string::Sprintf( + "Cannot use get_available_custom_device because you have " + "installed " + "CPU/GPU version PaddlePaddle.\n" + "If you want to use get_available_custom_device, please try to " + "install " + "CustomDevice version " + "PaddlePaddle by: pip install paddlepaddle-core\n"); +#endif + return devices; + }); + py::class_<platform::CustomPlace>(m, "CustomPlace", R"DOC( CustomPlace is a descriptor of a device. It represents a custom device on which a tensor will be allocated and a model will run. Examples: .. code-block:: python import paddle fake_cpu_place = paddle.CustomPlace("FakeCPU", 0) )DOC")
.def("__init__", + [](platform::CustomPlace &self, const std::string &device_type, + int dev_id) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (UNLIKELY(dev_id < 0)) { + LOG(ERROR) << string::Sprintf( + "Invalid CustomPlace(%s, %d), device id must be 0 " + "or " + "positive integer", + device_type, dev_id); + std::exit(-1); + } + + if (LIKELY(platform::DeviceManager::HasDeviceType(device_type) && + platform::DeviceManager::IsCustom(device_type))) { + int dev_count = static_cast<int>( + platform::DeviceManager::GetDeviceCount(device_type)); + if (UNLIKELY(dev_id >= dev_count)) { + if (dev_count == 0) { + LOG(ERROR) << "Cannot use " << device_type + << " because there is no " << device_type + << " detected on your " + "machine."; + std::exit(-1); + } else { + LOG(ERROR) << string::Sprintf( + "Invalid CustomPlace(%s, %d), dev_id must be " + "inside " + "[0, %d), because %s " + "number on your machine is %d", + device_type, dev_id, dev_count, device_type, dev_count); + std::exit(-1); + } + } + new (&self) platform::CustomPlace(device_type, dev_id); + } else { + LOG(ERROR) << string::Sprintf( + "Invalid CustomPlace(%s, %d), the device type is " + "not registered " + "as a custom device.", + device_type, dev_id); + std::exit(-1); + } +#else + LOG(ERROR) << string::Sprintf( + "Cannot use CustomDevice because you have installed CPU/GPU " + "version PaddlePaddle.\n" + "If you want to use CustomDevice, please try to install " + "CustomDevice version " + "PaddlePaddle by: pip install paddlepaddle-core\n" + "If you only have CPU, please change " + "CustomPlace(%s, %d) to be CPUPlace().\n", + device_type, dev_id); + std::exit(-1); +#endif + }) + .def("get_device_id", + [](const platform::CustomPlace &self) { return self.GetDeviceId(); }) + .def("get_device_type", + [](const platform::CustomPlace &self) { + return self.GetDeviceType(); + }) + .def("__repr__", string::to_string) + .def("__str__", string::to_string); py::class_<platform::CUDAPlace> cudaplace(m, "CUDAPlace", R"DOC( CUDAPlace is a descriptor of a device. @@ -2118,11 +2252,16 @@ All parameter, weight, gradient are variables in Paddle. }) .def("is_mlu_place", [](platform::Place &self) { return platform::is_mlu_place(self); }) + .def( + "is_custom_place", + [](platform::Place &self) { return platform::is_custom_place(self); }) .def("gpu_device_id", [](platform::Place &self) { return self.device; }) .def("xpu_device_id", [](platform::Place &self) { return self.device; }) .def("npu_device_id", [](platform::Place &self) { return self.device; }) .def("ipu_device_id", [](platform::Place &self) { return self.device; }) .def("mlu_device_id", [](platform::Place &self) { return self.device; }) + .def("custom_device_id", + [](platform::Place &self) { return self.device; }) .def("set_place", [](platform::Place &self, const platform::Place &other) { self = other; }) .def("set_place", @@ -2154,6 +2293,10 @@ All parameter, weight, gradient are variables in Paddle. [](platform::Place &self, const platform::MLUPlace &mlu_place) { self = mlu_place; }) + .def("set_place", + [](platform::Place &self, const platform::CustomPlace &plug_place) { + self = plug_place; + }) .def("__repr__", string::to_string) .def("__str__", string::to_string);
*/ #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/fluid/platform/bfloat16.h" +#include "paddle/fluid/platform/device/device_wrapper.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" #endif @@ -247,6 +248,13 @@ T TensorGetElement(const framework::Tensor &self, size_t offset) { auto p = self.place(); paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T), nullptr); +#endif + } else if (platform::is_custom_place(self.place())) { +#if defined(PADDLE_WITH_CUSTOM_DEVICE) + const T *a = self.data(); + auto p = self.place(); + paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T), + nullptr); #endif } VLOG(10) << "TensorGetElement, place: " << self.place() @@ -289,6 +297,13 @@ void TensorSetElement(framework::Tensor *self, size_t offset, T elem) { T *a = self->mutable_data(p); paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T), nullptr); +#endif + } else if (platform::is_custom_place(self->place())) { +#if defined(PADDLE_WITH_CUSTOM_DEVICE) + auto p = self->place(); + T *a = self->mutable_data(p); + paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T), + nullptr); #endif } } @@ -368,6 +383,24 @@ void SetTensorFromPyArrayT( PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use MLUPlace in CPU/GPU version, " "Please recompile or reinstall Paddle with MLU support.")); +#endif + } else if (paddle::platform::is_custom_place(place)) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + platform::Place tmp_place = place; + platform::DeviceGuard guard(tmp_place); + auto dst = self->mutable_data(place); + + platform::DeviceManager::GetDeviceWithPlace(tmp_place)->MemoryCopyH2D( + reinterpret_cast(dst), + const_cast(reinterpret_cast(array.data())), + array.nbytes()); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &ctx = *pool.Get(place); + ctx.Wait(); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Cannot use CustomDevice in CPU/GPU/XPU version. 
" + "Please recompile or reinstall Paddle with CustomDevice support.")); #endif } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -757,6 +790,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, bool is_xpu_tensor = platform::is_xpu_place(tensor.place()); bool is_npu_tensor = platform::is_npu_place(tensor.place()); bool is_mlu_tensor = platform::is_mlu_place(tensor.place()); + bool is_custom_device_tensor = platform::is_custom_place(tensor.place()); const auto &tensor_dims = tensor.dims(); auto tensor_dtype = framework::TransToProtoVarType(tensor.dtype()); size_t sizeof_dtype = framework::SizeOfType(tensor_dtype); @@ -776,7 +810,8 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, std::string py_dtype_str = details::TensorDTypeToPyDTypeStr( framework::TransToProtoVarType(tensor.dtype())); - if (!is_gpu_tensor && !is_xpu_tensor && !is_npu_tensor && !is_mlu_tensor) { + if (!is_gpu_tensor && !is_xpu_tensor && !is_npu_tensor && !is_mlu_tensor && + !is_custom_device_tensor) { if (!need_deep_copy) { auto base = py::cast(std::move(tensor)); return py::array(py::dtype(py_dtype_str.c_str()), py_dims, py_strides, @@ -900,6 +935,34 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use MLUPlace in CPU/GPU/XPU/NPU version, " "Please recompile or reinstall Paddle with MLU support.")); +#endif + } else if (is_custom_device_tensor) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides); + PADDLE_ENFORCE_EQ(py_arr.writeable(), true, + platform::errors::InvalidArgument( + "PyArray is not writable, in which case memory leak " + "or double free would occur")); + PADDLE_ENFORCE_EQ( + py_arr.owndata(), true, + platform::errors::InvalidArgument( + "PyArray does not own data, in which case memory leak " + "or double free would occur")); + + size_t copy_bytes = sizeof_dtype * numel; + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &ctx = *pool.Get(tensor.place()); + paddle::memory::Copy( + platform::CPUPlace(), py_arr.mutable_data(), tensor.place(), + tensor_buf_ptr, copy_bytes, + reinterpret_cast(ctx).stream()); + ctx.Wait(); + return py_arr; +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Cannot use CustomPlace in CPU/GPU/XPU/NPU version, " + "Please recompile or reinstall Paddle with CustomPlace " + "support.")); #endif } PADDLE_THROW(platform::errors::Unimplemented("Place is not supported")); diff --git a/paddle/pten/common/place.cc b/paddle/pten/common/place.cc index e2cb934f0a1..0a3bfccb16a 100644 --- a/paddle/pten/common/place.cc +++ b/paddle/pten/common/place.cc @@ -16,6 +16,7 @@ limitations under the License. 
diff --git a/paddle/pten/common/place.cc b/paddle/pten/common/place.cc index e2cb934f0a1..0a3bfccb16a 100644 --- a/paddle/pten/common/place.cc +++ b/paddle/pten/common/place.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include <sstream> #include <string> +#include <unordered_map> #include "paddle/pten/api/ext/exception.h" @@ -50,7 +51,11 @@ const char *AllocationTypeStr(AllocationType type) { std::string Place::DebugString() const { std::ostringstream os; os << "Place("; - os << AllocationTypeStr(alloc_type_); + if (alloc_type_ == AllocationType::CUSTOM) { + os << GetGlobalDeviceType(device_type_id_); + } else { + os << AllocationTypeStr(alloc_type_); + } if (alloc_type_ == AllocationType::GPUPINNED || alloc_type_ == AllocationType::NPUPINNED || alloc_type_ == AllocationType::CPU) { @@ -66,4 +71,23 @@ std::ostream &operator<<(std::ostream &os, const Place &p) { return os; } +static std::unordered_map<std::string, size_t> global_registered_device_type_id; +static std::unordered_map<size_t, std::string> global_registered_device_type; + +size_t GetOrRegisterGlobalDeviceTypeId(const std::string &device_type) { + if (device_type.empty()) return 0; + if (global_registered_device_type_id.find(device_type) == + global_registered_device_type_id.end()) { + size_t device_type_id = global_registered_device_type_id.size() + 1; + global_registered_device_type_id[device_type] = device_type_id; + global_registered_device_type[device_type_id] = device_type; + } + return global_registered_device_type_id[device_type]; +} + +std::string GetGlobalDeviceType(size_t device_type_id) { + if (device_type_id == 0) return ""; + return global_registered_device_type[device_type_id]; +} + } // namespace pten
diff --git a/paddle/pten/common/place.h b/paddle/pten/common/place.h index 75f1f4de998..6b7d1ea55d5 100644 --- a/paddle/pten/common/place.h +++ b/paddle/pten/common/place.h @@ -28,29 +28,49 @@ enum class AllocationType : int8_t { NPUPINNED = 6, IPU = 7, MLU = 8, + CUSTOM = 9, }; const char* AllocationTypeStr(AllocationType type); +size_t GetOrRegisterGlobalDeviceTypeId(const std::string& device_type); +std::string GetGlobalDeviceType(size_t device_type_id_); + /// \brief The place is used to specify where the data is stored. class Place { public: Place() : device(0), alloc_type_(AllocationType::UNDEFINED) {} - explicit Place(AllocationType type, int8_t id) - : device(id), alloc_type_(type) {} - - explicit Place(AllocationType type) : device(0), alloc_type_(type) {} - - void Reset(AllocationType type, int8_t device_id = 0) noexcept { + explicit Place(AllocationType type, + int8_t id, + const std::string& dev_type = "") + : device(id), + alloc_type_(type), + device_type_id_(GetOrRegisterGlobalDeviceTypeId(dev_type)) {} + + explicit Place(AllocationType type, const std::string& dev_type = "") + : device(0), + alloc_type_(type), + device_type_id_(GetOrRegisterGlobalDeviceTypeId(dev_type)) {} + + void Reset(AllocationType type, + int8_t device_id = 0, + const std::string& dev_type = "") noexcept { alloc_type_ = type; device = device_id; + if (!dev_type.empty()) { + device_type_id_ = GetOrRegisterGlobalDeviceTypeId(dev_type); + } } AllocationType GetType() const { return alloc_type_; } int8_t GetDeviceId() const { return device; } + std::string GetDeviceType() const { + return GetGlobalDeviceType(device_type_id_); + } + std::string DebugString() const; inline bool operator==(const Place& rhs) const { @@ -62,6 +82,10 @@ class Place { alloc_type_ == AllocationType::NPUPINNED) { return true; } + if (alloc_type_ == AllocationType::CUSTOM) { + return device_type_id_ == rhs.device_type_id_ && + device == rhs.GetDeviceId(); + } return device == rhs.GetDeviceId(); } inline bool operator!=(const Place& rhs) const { return !(*this == rhs); } @@ -69,6 +93,10 @@ if (alloc_type_ != rhs.GetType()) { return static_cast<int>(alloc_type_) < static_cast<int>(rhs.GetType()); } + if (alloc_type_ == AllocationType::CUSTOM && + device_type_id_ != rhs.device_type_id_) { + return device_type_id_ < rhs.device_type_id_; + } return device < rhs.GetDeviceId(); } @@ -79,6 +107,7 @@ class Place { private: AllocationType alloc_type_{AllocationType::UNDEFINED}; + size_t device_type_id_; }; class CPUPlace : public Place { @@ -157,6 +186,22 @@ class MLUPlace : public Place { : Place(AllocationType::MLU, place.GetDeviceId()) {} }; +class CustomPlace : public Place { + public: + explicit CustomPlace(const std::string dev_type) + : Place(AllocationType::CUSTOM, 0, dev_type) {} + CustomPlace(const std::string dev_type, int device_id) + : Place(AllocationType::CUSTOM, device_id, dev_type) {} + + CustomPlace(const CustomPlace&) = default; + CustomPlace(const Place& place) { // NOLINT + if (place.GetType() == AllocationType::CUSTOM) { + this->Reset( + AllocationType::CUSTOM, place.GetDeviceId(), place.GetDeviceType()); + } + } +}; + std::ostream& operator<<(std::ostream&, const Place&); } // namespace pten
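The registry and comparison semantics above can be summarized with a small self-check (hypothetical, not part of the patch); "FakeCPU" and "OtherDev" are made-up type names:

#include <cassert>
#include "paddle/pten/common/place.h"

void RegistryAndEquality() {
  // Custom types get stable ids starting at 1 (0 is the empty type),
  // and ids map back to the registered name.
  size_t id = pten::GetOrRegisterGlobalDeviceTypeId("FakeCPU");
  assert(id >= 1);
  assert(pten::GetGlobalDeviceType(id) == "FakeCPU");

  // CustomPlace equality requires both the type id and the device id.
  pten::CustomPlace a("FakeCPU", 0);
  assert(a == pten::CustomPlace("FakeCPU", 0));
  assert(a != pten::CustomPlace("FakeCPU", 1));   // same type, other id
  assert(a != pten::CustomPlace("OtherDev", 0));  // other type, same id
}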
diff --git a/paddle/pten/kernels/funcs/math_function.cc b/paddle/pten/kernels/funcs/math_function.cc index 780068e0381..09717ee65e0 100644 --- a/paddle/pten/kernels/funcs/math_function.cc +++ b/paddle/pten/kernels/funcs/math_function.cc @@ -215,6 +215,15 @@ void set_constant_with_place( const paddle::platform::DeviceContext& context, paddle::framework::Tensor* tensor, float value) { PADDLE_THROW( paddle::platform::errors::Unimplemented("IPUPlace is not supported")); } +template <> +void set_constant_with_place<paddle::platform::CustomPlace>( + const paddle::platform::DeviceContext& context, + paddle::framework::Tensor* tensor, + float value) { + PADDLE_THROW( + paddle::platform::errors::Unimplemented("CustomPlace is not supported")); +} + template <> void set_constant_with_place( const paddle::platform::DeviceContext& context, diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 8ce9716b169..12d31aee41e 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -293,6 +293,7 @@ from .framework import CUDAPlace # noqa: F401 from .framework import NPUPlace # noqa: F401 from .framework import CUDAPinnedPlace # noqa: F401 from .framework import MLUPlace # noqa: F401 +from .framework import CustomPlace # noqa: F401 from .autograd import grad # noqa: F401 from .autograd import no_grad # noqa: F401 diff --git a/python/paddle/device/__init__.py b/python/paddle/device/__init__.py index d102473fef7..89e0ae49fc4 100644 --- a/python/paddle/device/__init__.py +++ b/python/paddle/device/__init__.py @@ -36,7 +36,11 @@ __all__ = [ # noqa 'is_compiled_with_cuda', 'is_compiled_with_rocm', 'is_compiled_with_npu', - 'is_compiled_with_mlu' + 'is_compiled_with_mlu', + 'get_all_device_type', + 'get_all_custom_device_type', + 'get_available_device', + 'get_available_custom_device', ] _cudnn_version = None @@ -225,15 +229,26 @@ def _convert_to_place(device): selected_mlus = os.getenv("FLAGS_selected_mlus", "0").split(",") device_id = int(selected_mlus[0]) place = core.MLUPlace(device_id) + elif device in core.get_all_custom_device_type(): + place = core.CustomPlace(device, 0) else: avaliable_gpu_device = re.match(r'gpu:\d+', lower_device) avaliable_xpu_device = re.match(r'xpu:\d+', lower_device) avaliable_npu_device = re.match(r'npu:\d+', lower_device) avaliable_mlu_device = re.match(r'mlu:\d+', lower_device) if not avaliable_gpu_device and not avaliable_xpu_device and not avaliable_npu_device and not avaliable_mlu_device: - raise ValueError( - "The device must be a string which is like 'cpu', 'gpu', 'gpu:x', 'xpu', 'xpu:x', 'mlu', 'mlu:x', 'npu', 'npu:x' or ipu" - ) + device_info_list = device.split(':', 1) + device_type = device_info_list[0] + if device_type in core.get_all_custom_device_type(): + device_id = device_info_list[1] + device_id = int(device_id) + place = core.CustomPlace(device_type, device_id) + else: + raise ValueError( + "The device must be a string which is like 'cpu', {}". + format(', '.join("'{}', '{}:x'".format(x, x) + for x in ['gpu', 'xpu', 'npu', 'mlu'] + + core.get_all_custom_device_type()))) if avaliable_gpu_device: if not core.is_compiled_with_cuda(): raise ValueError( @@ -338,3 +353,103 @@ def get_device(): raise ValueError("The device specification {} is invalid".format(place)) return device + + +def get_all_device_type(): + """ + Get all available device types. + + Returns: + A list of all available device types. + + Examples: + .. code-block:: python + + import paddle + paddle.device.get_all_device_type() + + # Case 1: paddlepaddle-cpu package installed, and no custom device registered. + # Output: ['cpu'] + + # Case 2: paddlepaddle-gpu package installed, and no custom device registered. + # Output: ['cpu', 'gpu'] + + # Case 3: paddlepaddle-cpu package installed, and custom device 'CustomCPU' is registered. + # Output: ['cpu', 'CustomCPU'] + + # Case 4: paddlepaddle-gpu package installed, and custom devices 'CustomCPU' and 'CustomGPU' are registered. + # Output: ['cpu', 'gpu', 'CustomCPU', 'CustomGPU'] + """ + return core.get_all_device_type() + + +def get_all_custom_device_type(): + """ + Get all available custom device types. + + Returns: + A list of all available custom device types. + + Examples: + .. code-block:: python + + import paddle + paddle.device.get_all_custom_device_type() + + # Case 1: paddlepaddle-gpu package installed, and no custom device registered. + # Output: None + + # Case 2: paddlepaddle-gpu package installed, and custom devices 'CustomCPU' and 'CustomGPU' are registered.
+ # Output: ['CustomCPU', 'CustomGPU'] + """ + return core.get_all_custom_device_type() + + +def get_available_device(): + """ + Get all available devices. + + Returns: + A list of all available devices. + + Examples: + .. code-block:: python + + import paddle + paddle.device.get_available_device() + + # Case 1: paddlepaddle-cpu package installed, and no custom device registered. + # Output: ['cpu'] + + # Case 2: paddlepaddle-gpu package installed, and no custom device registered. + # Output: ['cpu', 'gpu:0', 'gpu:1'] + + # Case 3: paddlepaddle-cpu package installed, and custom device 'CustomCPU' is registered. + # Output: ['cpu', 'CustomCPU'] + + # Case 4: paddlepaddle-gpu package installed, and custom devices 'CustomCPU' and 'CustomGPU' are registered. + # Output: ['cpu', 'gpu:0', 'gpu:1', 'CustomCPU', 'CustomGPU:0', 'CustomGPU:1'] + """ + return core.get_available_device() + + +def get_available_custom_device(): + """ + Get all available custom devices. + + Returns: + A list of all available custom devices. + + Examples: + .. code-block:: python + + import paddle + paddle.device.get_available_custom_device() + + # Case 1: paddlepaddle-gpu package installed, and no custom device registered. + # Output: None + + # Case 2: paddlepaddle-gpu package installed, and custom devices 'CustomCPU' and 'CustomGPU' are registered. + # Output: ['CustomCPU', 'CustomGPU:0', 'CustomGPU:1'] + """ + return core.get_available_custom_device() diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index db6faa1a1b1..997075590e5 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -71,7 +71,7 @@ from .param_attr import ParamAttr, WeightNormParamAttr from .data_feeder import DataFeeder from .core import LoDTensor, LoDTensorArray, Scope, _Scope -from .core import CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, NPUPlace, IPUPlace, MLUPlace +from .core import CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, NPUPlace, IPUPlace, MLUPlace, CustomPlace from .incubate import fleet from .transpiler import DistributeTranspiler, \ memory_optimize, release_memory, DistributeTranspilerConfig diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index bb77f6031f7..b8854dfd2ad 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -6918,7 +6918,7 @@ def _get_paddle_place(place): return place if isinstance(place, (core.Place, core.XPUPlace, core.CPUPlace, core.CUDAPinnedPlace, core.CUDAPlace, core.NPUPlace, - core.IPUPlace, core.MLUPlace)): + core.IPUPlace, core.MLUPlace, core.CustomPlace)): return place if not isinstance(place, str): diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py index a0503322806..72e8e73ce7c 100644 --- a/python/paddle/framework/__init__.py +++ b/python/paddle/framework/__init__.py @@ -29,6 +29,7 @@ from ..fluid.core import CUDAPlace # noqa: F401 from ..fluid.core import CUDAPinnedPlace # noqa: F401 from ..fluid.core import NPUPlace # noqa: F401 from ..fluid.core import MLUPlace # noqa: F401 +from ..fluid.core import CustomPlace # noqa: F401 from ..fluid.core import VarBase # noqa: F401 from paddle.fluid import core # noqa: F401 diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index cd1faf64f3e..c121d7b6b83 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -106,9 +106,10 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): if place is None: place = _current_expected_place() elif
not isinstance(place, (core.Place, core.CPUPlace, core.CUDAPinnedPlace, - core.CUDAPlace, core.NPUPlace, core.XPUPlace)): + core.CUDAPlace, core.NPUPlace, core.XPUPlace, + core.CustomPlace)): raise ValueError( - "'place' must be any of paddle.Place, paddle.CPUPlace, paddle.CUDAPinnedPlace, paddle.CUDAPlace, paddle.NPUPlace, paddle.XPUPlace" + "'place' must be any of paddle.Place, paddle.CPUPlace, paddle.CUDAPinnedPlace, paddle.CUDAPlace, paddle.NPUPlace, paddle.XPUPlace, paddle.CustomPlace" ) #Todo(zhouwei): Support allocate tensor on any other specified card diff --git a/python/setup.py.in b/python/setup.py.in index 8f42beaf1c0..9977ddeb26b 100755 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -579,7 +579,8 @@ headers = ( list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/pten/core', recursive=True)) + # pten core headers # utila api headers ['@PADDLE_SOURCE_DIR@/paddle/utils/any.h'] + - ['@PADDLE_SOURCE_DIR@/paddle/utils/small_vector.h']) + ['@PADDLE_SOURCE_DIR@/paddle/utils/small_vector.h'] + + ['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/device/device_ext.h']) if '${WITH_MKLDNN}' == 'ON': headers += list(find_files('*', '${MKLDNN_INSTALL_DIR}/include')) # mkldnn @@ -624,6 +625,8 @@ class InstallHeaders(Command): elif 'third_party' not in header: # paddle headers install_dir = re.sub('@PADDLE_SOURCE_DIR@/', '', header) + if 'device_ext.h' in header: + install_dir = "paddle/" else: # third_party install_dir = re.sub('${THIRD_PARTY_PATH}', 'third_party', header) -- GitLab