未验证 提交 3e7825f3 编写于 作者: R ronnywang 提交者: GitHub

[PluggableDevice] Add custom runtime support (#38740)

* [CustomRuntime] Add DeviceManager

* [CustomRuntime] Add DeviceInterface

* [CustomRuntime] Add Stream, Event, DeviceGuard, CallbackManager

* [CustomRuntime] Add plug-in device

* [CustomRuntime] Memory module support PluggableDevice

* [CustomRuntime] Add WITH_PLUGGABLE_DEVICE cmake option

* update

* [API] update API doc based on comments, test=develop
Co-authored-by: qili93 <qili93@qq.com>
上级 0d46a108
...@@ -243,6 +243,7 @@ option(NEW_RELEASE_JIT "PaddlePaddle next-level release strategy for backup ji ...@@ -243,6 +243,7 @@ option(NEW_RELEASE_JIT "PaddlePaddle next-level release strategy for backup ji
option(WITH_ASCEND_INT64 "Compile with int64 kernel for ascend NPU" OFF) option(WITH_ASCEND_INT64 "Compile with int64 kernel for ascend NPU" OFF)
option(WITH_POCKETFFT "Compile with pocketfft support" ON) option(WITH_POCKETFFT "Compile with pocketfft support" ON)
option(WITH_RECORD_BUILDTIME "Compile PaddlePaddle with record all targets build time" OFF) option(WITH_RECORD_BUILDTIME "Compile PaddlePaddle with record all targets build time" OFF)
# Build switch for custom device support. Declared OFF by default; note that a
# later check in this file switches it ON for Linux builds where ON_INFER is
# not set.
option(WITH_CUSTOM_DEVICE "Compile with custom device support" OFF)
if(WITH_RECORD_BUILDTIME) if(WITH_RECORD_BUILDTIME)
set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CMAKE_CURRENT_SOURCE_DIR}/tools/get_build_time.sh") set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CMAKE_CURRENT_SOURCE_DIR}/tools/get_build_time.sh")
...@@ -265,6 +266,10 @@ if(SANITIZER_TYPE AND NOT "${SANITIZER_TYPE}" MATCHES "^(Address|Leak|Memory|Thr ...@@ -265,6 +266,10 @@ if(SANITIZER_TYPE AND NOT "${SANITIZER_TYPE}" MATCHES "^(Address|Leak|Memory|Thr
return() return()
endif() endif()
# On Linux, when ON_INFER is not set, turn WITH_CUSTOM_DEVICE on whenever it is
# not already enabled. In effect the option cannot be left OFF for such builds.
if (LINUX AND NOT WITH_CUSTOM_DEVICE AND NOT ON_INFER)
set(WITH_CUSTOM_DEVICE ON)
endif()
if(WIN32) if(WIN32)
if(WITH_DISTRIBUTE) if(WITH_DISTRIBUTE)
MESSAGE(WARNING MESSAGE(WARNING
......
...@@ -219,3 +219,7 @@ endif(ON_INFER) ...@@ -219,3 +219,7 @@ endif(ON_INFER)
if(WITH_CRYPTO) if(WITH_CRYPTO)
add_definitions(-DPADDLE_WITH_CRYPTO) add_definitions(-DPADDLE_WITH_CRYPTO)
endif(WITH_CRYPTO) endif(WITH_CRYPTO)
# Expose custom device support to the C++ sources as the preprocessor macro
# PADDLE_WITH_CUSTOM_DEVICE. The macro is never defined on Windows, even when
# WITH_CUSTOM_DEVICE is ON.
if(WITH_CUSTOM_DEVICE AND NOT WIN32)
add_definitions(-DPADDLE_WITH_CUSTOM_DEVICE)
endif()
...@@ -100,6 +100,11 @@ struct DLDeviceVisitor : public boost::static_visitor<::DLDevice> { ...@@ -100,6 +100,11 @@ struct DLDeviceVisitor : public boost::static_visitor<::DLDevice> {
platform::errors::Unimplemented("platform::MLUPlace is not supported")); platform::errors::Unimplemented("platform::MLUPlace is not supported"));
} }
// Visitor overload for platform::CustomPlace. No ::DLDevice is produced: the
// body unconditionally invokes PADDLE_THROW with an Unimplemented error.
inline ::DLDevice operator()(const platform::CustomPlace &place) const {
PADDLE_THROW(platform::errors::Unimplemented(
"platform::CustomPlace is not supported"));
}
inline ::DLDevice operator()(const platform::CUDAPlace &place) const { inline ::DLDevice operator()(const platform::CUDAPlace &place) const {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
::DLDevice device; ::DLDevice device;
......
...@@ -494,6 +494,20 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx, ...@@ -494,6 +494,20 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
#else #else
PADDLE_THROW( PADDLE_THROW(
platform::errors::Unimplemented("No MLU gc found in CPU/MLU paddle")); platform::errors::Unimplemented("No MLU gc found in CPU/MLU paddle"));
#endif
} else if (platform::is_custom_place(place_)) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
if (IsFastEagerDeletionModeEnabled()) {
VLOG(4) << "Use unsafe fast gc for " << place_ << ".";
gc.reset(new CustomDeviceUnsafeFastGarbageCollector(place_,
max_memory_size));
} else {
VLOG(4) << "Use default stream gc for " << place_ << ".";
gc.reset(
new CustomDefaultStreamGarbageCollector(place_, max_memory_size));
}
#else
PADDLE_THROW(platform::errors::Unimplemented("No CustomDevice gc found"));
#endif #endif
} }
} }
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#endif #endif
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "paddle/fluid/framework/garbage_collector.h" #include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
DECLARE_double(eager_delete_tensor_gb); DECLARE_double(eager_delete_tensor_gb);
DECLARE_double(memory_fraction_of_eager_deletion); DECLARE_double(memory_fraction_of_eager_deletion);
...@@ -202,6 +203,58 @@ void MLUStreamGarbageCollector::ClearCallback( ...@@ -202,6 +203,58 @@ void MLUStreamGarbageCollector::ClearCallback(
} }
#endif #endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
// Constructs the collector by forwarding `place` and `max_memory_size` to the
// GarbageCollector base; no additional state is created here.
CustomDefaultStreamGarbageCollector::CustomDefaultStreamGarbageCollector(
    const platform::CustomPlace &place, size_t max_memory_size)
    : GarbageCollector(place, max_memory_size) {}

// Delegates to WaitStreamCallback() of the base class' device context, which
// is treated as a platform::CustomDeviceContext.
void CustomDefaultStreamGarbageCollector::Wait() const {
  auto *custom_ctx =
      static_cast<platform::CustomDeviceContext *>(this->dev_ctx_);
  custom_ctx->WaitStreamCallback();
}

// Hands `callback` to AddStreamCallback() of the base class' device context,
// which is treated as a platform::CustomDeviceContext.
void CustomDefaultStreamGarbageCollector::ClearCallback(
    const std::function<void()> &callback) {
  auto *custom_ctx =
      static_cast<platform::CustomDeviceContext *>(this->dev_ctx_);
  custom_ctx->AddStreamCallback(callback);
}
// Constructs the collector by forwarding `place` and `max_memory_size` to the
// GarbageCollector base.
CustomDeviceUnsafeFastGarbageCollector::CustomDeviceUnsafeFastGarbageCollector(
const platform::CustomPlace &place, size_t max_memory_size)
: GarbageCollector(place, max_memory_size) {}
// Runs `callback` immediately, as a direct call, instead of handing it to a
// device context or callback manager.
void CustomDeviceUnsafeFastGarbageCollector::ClearCallback(
const std::function<void()> &callback) {
callback();
}
// Creates a dedicated platform::stream::Stream for `place` and a
// CallbackManager bound to it.
CustomStreamGarbageCollector::CustomStreamGarbageCollector(
const platform::CustomPlace &place, size_t max_memory_size)
: GarbageCollector(place, max_memory_size) {
// The guard is constructed before the stream is created and initialized;
// presumably it makes `place` the current device for this scope — TODO confirm.
platform::DeviceGuard guard(place);
stream_.reset(new platform::stream::Stream);
stream_->Init(place);
// The callback manager receives a non-owning raw pointer to the stream owned
// by stream_.
callback_manager_.reset(new platform::CallbackManager(stream_.get()));
}
// Synchronizes the owned stream and then destroys it, under a DeviceGuard for
// the place reported by the base class' device context.
CustomStreamGarbageCollector::~CustomStreamGarbageCollector() {
platform::DeviceGuard guard(this->dev_ctx_->GetPlace());
stream_->Synchronize();
stream_->Destroy();
}
// Returns a non-owning pointer to the stream owned by this collector.
platform::stream::Stream *CustomStreamGarbageCollector::stream() const {
return stream_.get();
}
// Delegates to CallbackManager::Wait() on the manager bound to the owned stream.
void CustomStreamGarbageCollector::Wait() const { callback_manager_->Wait(); }
// Registers `callback` with the callback manager via AddCallback().
void CustomStreamGarbageCollector::ClearCallback(
const std::function<void()> &callback) {
callback_manager_->AddCallback(callback);
}
#endif
int64_t GetEagerDeletionThreshold() { int64_t GetEagerDeletionThreshold() {
return FLAGS_eager_delete_tensor_gb < 0 return FLAGS_eager_delete_tensor_gb < 0
? -1 ? -1
......
...@@ -200,6 +200,47 @@ class MLUStreamGarbageCollector : public GarbageCollector { ...@@ -200,6 +200,47 @@ class MLUStreamGarbageCollector : public GarbageCollector {
}; };
#endif #endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
// Garbage collector for a platform::CustomPlace whose Wait() and
// ClearCallback() are implemented through the stream-callback methods
// (WaitStreamCallback / AddStreamCallback) of the device context held by the
// GarbageCollector base.
class CustomDefaultStreamGarbageCollector : public GarbageCollector {
public:
// Forwards both arguments to the GarbageCollector base constructor.
CustomDefaultStreamGarbageCollector(const platform::CustomPlace &place,
size_t max_memory_size);
// Waits on the device context's stream callbacks.
void Wait() const override;
protected:
// Adds `callback` as a stream callback on the device context.
void ClearCallback(const std::function<void()> &callback) override;
};
// Garbage collector for a platform::CustomPlace that runs each clear callback
// immediately (see ClearCallback) rather than queuing it. It does not override
// Wait().
class CustomDeviceUnsafeFastGarbageCollector : public GarbageCollector {
public:
// Forwards both arguments to the GarbageCollector base constructor.
CustomDeviceUnsafeFastGarbageCollector(const platform::CustomPlace &place,
size_t max_memory_size);
protected:
// Invokes `callback` directly.
void ClearCallback(const std::function<void()> &callback) override;
};
// Garbage collector for a platform::CustomPlace that owns its own stream and a
// callback manager bound to that stream. The constructor creates both; the
// destructor synchronizes and destroys the stream.
class CustomStreamGarbageCollector : public GarbageCollector {
public:
CustomStreamGarbageCollector(const platform::CustomPlace &place,
size_t max_memory_size);
~CustomStreamGarbageCollector();
// Waits through the callback manager.
void Wait() const override;
// Returns a non-owning pointer to the owned stream.
platform::stream::Stream *stream() const;
protected:
// Registers `callback` with the callback manager.
void ClearCallback(const std::function<void()> &callback) override;
private:
// Stream owned by this collector.
std::unique_ptr<platform::stream::Stream> stream_;
// Callback manager constructed from a raw pointer to stream_.
std::unique_ptr<platform::CallbackManager> callback_manager_;
};
#endif
template <typename Container> template <typename Container>
void GarbageCollector::Add(Container &&objs) { void GarbageCollector::Add(Container &&objs) {
Add(std::forward<Container>(objs), []() {}); Add(std::forward<Container>(objs), []() {});
......
...@@ -47,10 +47,20 @@ size_t OpKernelType::Hash::operator()(const OpKernelType& key) const { ...@@ -47,10 +47,20 @@ size_t OpKernelType::Hash::operator()(const OpKernelType& key) const {
"Too many OpKernel attribute values, expected maximum " "Too many OpKernel attribute values, expected maximum "
"value is 64, received value is %d.", "value is 64, received value is %d.",
cur_loc)); cur_loc));
#ifdef PADDLE_WITH_CUSTOM_DEVICE
std::hash<int> hasher;
size_t seed =
hasher(place + data_type + data_layout + library_type + customized_value);
if (platform::is_custom_place(key.place_)) {
seed ^= std::hash<std::string>{}(key.place_.GetDeviceType()) + 0x9e3779b9 +
(seed << 6) + (seed >> 2) + 4;
}
return seed;
#else
std::hash<int> hasher; std::hash<int> hasher;
return hasher(place + data_type + data_layout + library_type + return hasher(place + data_type + data_layout + library_type +
customized_value); customized_value);
#endif
} }
bool OpKernelType::operator==(const OpKernelType& o) const { bool OpKernelType::operator==(const OpKernelType& o) const {
......
...@@ -29,6 +29,7 @@ limitations under the License. */ ...@@ -29,6 +29,7 @@ limitations under the License. */
#include "paddle/fluid/framework/transfer_scope_cache.h" #include "paddle/fluid/framework/transfer_scope_cache.h"
#include "paddle/fluid/framework/unused_var_check.h" #include "paddle/fluid/framework/unused_var_check.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
#include "paddle/pten/common/scalar.h" #include "paddle/pten/common/scalar.h"
...@@ -244,6 +245,15 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { ...@@ -244,6 +245,15 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
#else #else
auto dev_id = place.device; auto dev_id = place.device;
platform::SetMLUDeviceId(dev_id); platform::SetMLUDeviceId(dev_id);
#endif
} else if (platform::is_custom_place(place)) {
#ifndef PADDLE_WITH_CUSTOM_DEVICE
PADDLE_THROW(platform::errors::Unavailable(
"Cannot run operator on place %s, please recompile paddle or "
"reinstall Paddle with CustomDevice support.",
place));
#else
platform::DeviceManager::SetDevice(place);
#endif #endif
} }
......
...@@ -532,6 +532,21 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { ...@@ -532,6 +532,21 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
PADDLE_THROW(platform::errors::PermissionDenied( PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't use XPU device since it's not compiled with XPU," "Paddle can't use XPU device since it's not compiled with XPU,"
"Please recompile or reinstall Paddle with XPU support.")); "Please recompile or reinstall Paddle with XPU support."));
#endif
} else if (platform::is_custom_place(place)) {
#if defined(PADDLE_WITH_CUSTOM_DEVICE)
if (IsFastEagerDeletionModeEnabled()) {
gc.reset(
new CustomDeviceUnsafeFastGarbageCollector(place, max_memory_size));
} else {
gc.reset(new CustomStreamGarbageCollector(place, max_memory_size));
}
VLOG(10) << "Created " << i << "-th GarbageCollector at " << place;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't use custom device since it's not compiled with "
"CustomDevice,"
"Please recompile or reinstall Paddle with CustomDevice support."));
#endif #endif
} else if (platform::is_cpu_place(place)) { } else if (platform::is_cpu_place(place)) {
gc.reset(new CPUGarbageCollector(place, max_memory_size)); gc.reset(new CPUGarbageCollector(place, max_memory_size));
......
...@@ -91,7 +91,29 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, ...@@ -91,7 +91,29 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place,
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
} }
#endif #endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (platform::is_custom_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
auto stream =
reinterpret_cast<const platform::CustomDeviceContext&>(ctx).stream();
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
} else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_custom_place(dst_place)) {
auto stream =
reinterpret_cast<const platform::CustomDeviceContext&>(ctx).stream();
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
} else if (platform::is_custom_place(src_place) && // NOLINT
platform::is_custom_place(dst_place)) {
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
<< dst_place;
return;
}
auto stream =
reinterpret_cast<const platform::CustomDeviceContext&>(ctx).stream();
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
}
#endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
else if (platform::is_xpu_place(src_place) && // NOLINT else if (platform::is_xpu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) { platform::is_cpu_place(dst_place)) {
...@@ -376,7 +398,8 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, ...@@ -376,7 +398,8 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place,
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
const platform::DeviceContext* dev_ctx; const platform::DeviceContext* dev_ctx;
if (platform::is_gpu_place(dst_place) || platform::is_npu_place(dst_place) || if (platform::is_gpu_place(dst_place) || platform::is_npu_place(dst_place) ||
platform::is_mlu_place(dst_place)) { platform::is_mlu_place(dst_place) ||
platform::is_custom_place(dst_place)) {
dev_ctx = pool.Get(dst_place); dev_ctx = pool.Get(dst_place);
} else { } else {
dev_ctx = pool.Get(src.place()); dev_ctx = pool.Get(src.place());
...@@ -436,6 +459,26 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, ...@@ -436,6 +459,26 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
"Copy from %s to %s is not supported.", src_place, dst_place)); "Copy from %s to %s is not supported.", src_place, dst_place));
} }
#endif #endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (platform::is_custom_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) { /* custom_device -> cpu*/
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
}
else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_custom_place(dst_place)) { /* cpu -> custom_device*/
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
}
else if (platform::is_custom_place(src_place) && // NOLINT
platform::is_custom_place(
dst_place)) { /* custom_device -> custom_device*/
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data sync from " << src_place << " to "
<< dst_place;
return;
}
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
}
#endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
else if (platform::is_xpu_place(src_place) && // NOLINT else if (platform::is_xpu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) { platform::is_cpu_place(dst_place)) {
...@@ -664,6 +707,13 @@ class AnyVisitor : public boost::static_visitor<bool> { ...@@ -664,6 +707,13 @@ class AnyVisitor : public boost::static_visitor<bool> {
const platform::CUDAPinnedPlace& cpu) const { const platform::CUDAPinnedPlace& cpu) const {
return *out.data<bool>(); return *out.data<bool>();
} }
// GetResult overload for platform::CustomPlace: no result is read from `out`.
// The body invokes PADDLE_THROW with an Unimplemented error naming the place;
// the trailing `return false;` only follows that call.
bool GetResult(const framework::Tensor& out,
const platform::CustomPlace& custom_dev) const {
PADDLE_THROW(platform::errors::Unimplemented("Not supported on place (%s) ",
custom_dev));
return false;
}
}; };
template <typename Predicate> template <typename Predicate>
...@@ -903,6 +953,11 @@ struct BothFalseVisitor : public boost::static_visitor<> { ...@@ -903,6 +953,11 @@ struct BothFalseVisitor : public boost::static_visitor<> {
out_ptr[i] = lhs && rhs; out_ptr[i] = lhs && rhs;
} }
} }
// VisitorImpl overload for platform::CustomPlace: performs no computation and
// unconditionally invokes PADDLE_THROW with an Unimplemented error.
void VisitorImpl(const platform::CustomPlace& custom_dev) const {
PADDLE_THROW(
platform::errors::Unimplemented("CustomPlace is not supported"));
}
}; };
void TensorIsfinite(const framework::Tensor& tensor, framework::Tensor* out) { void TensorIsfinite(const framework::Tensor& tensor, framework::Tensor* out) {
...@@ -1036,6 +1091,29 @@ void TensorToStream(std::ostream& os, const Tensor& tensor, ...@@ -1036,6 +1091,29 @@ void TensorToStream(std::ostream& os, const Tensor& tensor,
#else #else
PADDLE_THROW(platform::errors::Unimplemented( PADDLE_THROW(platform::errors::Unimplemented(
"NPUPlace is not supported when not compiled with NPU")); "NPUPlace is not supported when not compiled with NPU"));
#endif
} else if (platform::is_custom_place(tensor.place())) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB
std::unique_ptr<char[]> buf(new char[kBufSize]);
auto& custom_device_context =
static_cast<const platform::CustomDeviceContext&>(dev_ctx);
platform::CPUPlace cpu;
uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
while (size != 0) {
size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
memory::Copy(cpu, buf.get(), tensor.place(),
reinterpret_cast<const void*>(data), size_to_write,
custom_device_context.stream());
custom_device_context.Wait();
os.write(buf.get(), size_to_write);
data += size_to_write;
size -= size_to_write;
}
#else
PADDLE_THROW(platform::errors::Unimplemented(
"CustomPlace is not supported when not compiled with "
"CustomDevice"));
#endif #endif
} else { } else {
os.write(static_cast<const char*>(data_ptr), os.write(static_cast<const char*>(data_ptr),
...@@ -1093,10 +1171,11 @@ void TensorFromStream(std::istream& is, Tensor* tensor, ...@@ -1093,10 +1171,11 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
if (platform::is_gpu_place(dev_ctx.GetPlace()) || if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
platform::is_xpu_place(dev_ctx.GetPlace()) || platform::is_xpu_place(dev_ctx.GetPlace()) ||
platform::is_mlu_place(dev_ctx.GetPlace()) || platform::is_mlu_place(dev_ctx.GetPlace()) ||
platform::is_npu_place(dev_ctx.GetPlace())) { platform::is_npu_place(dev_ctx.GetPlace()) ||
platform::is_custom_place(dev_ctx.GetPlace())) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \ defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \
defined(PADDLE_WITH_ASCEND_CL) defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CUSTOM_DEVICE)
Tensor cpu_tensor; Tensor cpu_tensor;
cpu_tensor.Resize(framework::make_ddim(shape)); cpu_tensor.Resize(framework::make_ddim(shape));
framework::VisitDataType( framework::VisitDataType(
...@@ -1105,7 +1184,8 @@ void TensorFromStream(std::istream& is, Tensor* tensor, ...@@ -1105,7 +1184,8 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
is.read(static_cast<char*>(buf), size); is.read(static_cast<char*>(buf), size);
auto dst_place = dev_ctx.GetPlace(); auto dst_place = dev_ctx.GetPlace();
framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor); framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor);
if (platform::is_npu_place(dev_ctx.GetPlace())) { if (platform::is_npu_place(dev_ctx.GetPlace()) ||
platform::is_custom_place(dev_ctx.GetPlace())) {
dev_ctx.Wait(); dev_ctx.Wait();
} }
#else #else
...@@ -1163,10 +1243,11 @@ void TensorFromStream(std::istream& is, Tensor* tensor, ...@@ -1163,10 +1243,11 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
if (platform::is_gpu_place(dev_ctx.GetPlace()) || if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
platform::is_xpu_place(dev_ctx.GetPlace()) || platform::is_xpu_place(dev_ctx.GetPlace()) ||
platform::is_mlu_place(dev_ctx.GetPlace()) || platform::is_mlu_place(dev_ctx.GetPlace()) ||
platform::is_npu_place(dev_ctx.GetPlace())) { platform::is_npu_place(dev_ctx.GetPlace()) ||
platform::is_custom_place(dev_ctx.GetPlace())) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \ defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \
defined(PADDLE_WITH_ASCEND_CL) defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CUSTOM_DEVICE)
Tensor cpu_tensor; Tensor cpu_tensor;
cpu_tensor.Resize(framework::make_ddim(dims)); cpu_tensor.Resize(framework::make_ddim(dims));
framework::VisitDataType( framework::VisitDataType(
...@@ -1175,7 +1256,8 @@ void TensorFromStream(std::istream& is, Tensor* tensor, ...@@ -1175,7 +1256,8 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
is.read(static_cast<char*>(buf), size); is.read(static_cast<char*>(buf), size);
auto dst_place = dev_ctx.GetPlace(); auto dst_place = dev_ctx.GetPlace();
framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor); framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor);
if (platform::is_npu_place(dev_ctx.GetPlace())) { if (platform::is_npu_place(dev_ctx.GetPlace()) ||
platform::is_custom_place(dev_ctx.GetPlace())) {
dev_ctx.Wait(); dev_ctx.Wait();
} }
#else #else
...@@ -1188,9 +1270,12 @@ void TensorFromStream(std::istream& is, Tensor* tensor, ...@@ -1188,9 +1270,12 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
} else if (platform::is_mlu_place(dev_ctx.GetPlace())) { } else if (platform::is_mlu_place(dev_ctx.GetPlace())) {
PADDLE_THROW(platform::errors::Unimplemented( PADDLE_THROW(platform::errors::Unimplemented(
"MLUPlace is not supported when not compiled with MLU")); "MLUPlace is not supported when not compiled with MLU"));
} else { } else if (platform::is_npu_place(dev_ctx.GetPlace())) {
PADDLE_THROW(platform::errors::Unimplemented( PADDLE_THROW(platform::errors::Unimplemented(
"NPUPlace is not supported when not compiled with NPU")); "NPUPlace is not supported when not compiled with NPU"));
} else {
// Final fallback of this place chain (reached after the MLU and NPU checks
// above): reported as CustomPlace used without CustomDevice support.
// The message previously misspelled the type name as "CutomPlace".
PADDLE_THROW(platform::errors::Unimplemented(
    "CustomPlace is not supported when not compiled with CustomDevice"));
} }
#endif #endif
} else { } else {
......
...@@ -180,6 +180,17 @@ void TensorFromArray(const T* src, const size_t& array_size, ...@@ -180,6 +180,17 @@ void TensorFromArray(const T* src, const size_t& array_size,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream()); reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
} }
#endif #endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (platform::is_custom_place(dst_place)) { // NOLINT
memory::Copy(
dst_place, dst_ptr, src_place, src_ptr, size,
reinterpret_cast<const platform::CustomDeviceContext&>(ctx).stream());
}
#endif
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"TensorFromArray on %s is not supported.", dst_place));
}
} }
template <typename T> template <typename T>
...@@ -241,6 +252,17 @@ void TensorFromVector(const std::vector<T>& src, ...@@ -241,6 +252,17 @@ void TensorFromVector(const std::vector<T>& src,
reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream()); reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream());
} }
#endif #endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (platform::is_custom_place(dst_place)) { // NOLINT
memory::Copy(
dst_place, dst_ptr, src_place, src_ptr, size,
reinterpret_cast<const platform::CustomDeviceContext&>(ctx).stream());
}
#endif
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"TensorFromVector on %s is not supported.", dst_place));
}
} }
// The fully specialized function should be inline to avoid // The fully specialized function should be inline to avoid
...@@ -300,6 +322,17 @@ inline void TensorFromVector(const std::vector<bool>& src, ...@@ -300,6 +322,17 @@ inline void TensorFromVector(const std::vector<bool>& src,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream()); reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
} }
#endif #endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
  // Copy to a custom device, passing the stream of the custom device context.
  // The guard must match the macro defined by the build
  // (PADDLE_WITH_CUSTOM_DEVICE). The previous spelling
  // "PADDLE_WITH_CUSTOM_DEICE" was never defined, so this branch was compiled
  // out and custom places fell through to the Unimplemented error below.
  else if (platform::is_custom_place(dst_place)) {  // NOLINT
    auto stream =
        reinterpret_cast<const platform::CustomDeviceContext&>(ctx).stream();
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
  }
#endif
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"TensorFromVector on %s is not supported.", dst_place));
}
delete[] array; delete[] array;
} }
...@@ -369,6 +402,15 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx, ...@@ -369,6 +402,15 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx,
reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream()); reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream());
} }
#endif #endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (platform::is_custom_place(src.place())) { // NOLINT
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
}
#endif
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"TensorToVector on %s is not supported.", src.place()));
}
} }
template <> template <>
...@@ -410,6 +452,11 @@ inline void TensorToVector(const Tensor& src, ...@@ -410,6 +452,11 @@ inline void TensorToVector(const Tensor& src,
dst_place, dst_ptr, src.place(), src_ptr, size, dst_place, dst_ptr, src.place(), src_ptr, size,
reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream()); reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream());
} }
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (platform::is_custom_place(src.place())) { // NOLINT
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
}
#endif #endif
for (unsigned int i = 0; i < src.numel(); i++) { for (unsigned int i = 0; i < src.numel(); i++) {
(*dst)[i] = static_cast<bool>(array[i]); (*dst)[i] = static_cast<bool>(array[i]);
......
...@@ -180,6 +180,12 @@ class TensorAddFunctor : public boost::static_visitor<> { ...@@ -180,6 +180,12 @@ class TensorAddFunctor : public boost::static_visitor<> {
"is not supported in imperative mode", "is not supported in imperative mode",
place)); place));
} }
// Visitor overload for platform::CustomPlace: no accumulation is performed.
// The body unconditionally invokes PADDLE_THROW with a PermissionDenied error
// naming the place.
void operator()(const platform::CustomPlace& place) const {
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
private: private:
int64_t numel_; int64_t numel_;
...@@ -331,7 +337,14 @@ void TensorAdd(const VarType& src, VarType* dst) { ...@@ -331,7 +337,14 @@ void TensorAdd(const VarType& src, VarType* dst) {
return; return;
} }
#endif #endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
if (platform::is_custom_place(place)) {
PADDLE_THROW(platform::errors::Unimplemented(
"Gradient accumulation of data type (%s) on place (%s) is not "
"supported in imperative mode",
framework::DataTypeToString(data_type), place));
}
#endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
if (platform::is_xpu_place(place)) { if (platform::is_xpu_place(place)) {
if (data_type == framework::DataTypeTrait<float>::DataType()) { if (data_type == framework::DataTypeTrait<float>::DataType()) {
......
...@@ -278,6 +278,16 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins, ...@@ -278,6 +278,16 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
expected_kernel_key.place_ = platform::CPUPlace(); expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key); kernel_iter = kernels.find(expected_kernel_key);
} }
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
if (kernel_iter == kernels.end() &&
paddle::platform::is_custom_place(expected_kernel_key.place_)) {
VLOG(3) << "missing " << place.GetDeviceType() << " kernel: " << op.Type()
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
#endif #endif
// TODO(jiabin): Add operator.cc's line 1000 part back when we need that // TODO(jiabin): Add operator.cc's line 1000 part back when we need that
// case // case
......
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
#include "paddle/fluid/imperative/amp_auto_cast.h" #include "paddle/fluid/imperative/amp_auto_cast.h"
#include "paddle/fluid/imperative/op_base.h" #include "paddle/fluid/imperative/op_base.h"
#include "paddle/fluid/platform/denormal.h" #include "paddle/fluid/platform/denormal.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/string_helper.h" #include "paddle/fluid/string/string_helper.h"
...@@ -138,6 +139,17 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( ...@@ -138,6 +139,17 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists(
PADDLE_THROW(platform::errors::PermissionDenied( PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't use MLU device since it's not compiled with MLU," "Paddle can't use MLU device since it's not compiled with MLU,"
"Please recompile or reinstall Paddle with MLU support.")); "Please recompile or reinstall Paddle with MLU support."));
#endif
} else if (platform::is_custom_place(place)) {
#if defined(PADDLE_WITH_CUSTOM_DEVICE)
gc.reset(new framework::CustomDefaultStreamGarbageCollector(place, 0));
VLOG(10) << "Created GarbageCollector at " << place;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't use CustomDevice since it's not compiled with "
"CustomDevice,"
"Please recompile or reinstall Paddle with CustomDevice "
"support."));
#endif #endif
} else { } else {
PADDLE_THROW(platform::errors::PreconditionNotMet( PADDLE_THROW(platform::errors::PreconditionNotMet(
...@@ -222,6 +234,14 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap<VarType>& ins, ...@@ -222,6 +234,14 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap<VarType>& ins,
#else #else
PADDLE_THROW(platform::errors::PreconditionNotMet( PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with MLU if use MLUPlace.")); "PaddlePaddle should compile with MLU if use MLUPlace."));
#endif
} else if (platform::is_custom_place(place)) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
platform::DeviceManager::SetDevice(place);
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with CustomDevice if use "
"CustomPlace."));
#endif #endif
} }
if (!override_default_attr_map) { if (!override_default_attr_map) {
......
...@@ -58,6 +58,11 @@ else () ...@@ -58,6 +58,11 @@ else ()
set(AllocatorFacadeDeps) set(AllocatorFacadeDeps)
endif() endif()
# When custom device support is enabled, build the custom_allocator library
# (from custom_allocator.cc, depending on allocator and device_manager) and
# append it to the AllocatorFacadeDeps list.
if (WITH_CUSTOM_DEVICE)
cc_library(custom_allocator SRCS custom_allocator.cc DEPS allocator device_manager)
set(AllocatorFacadeDeps ${AllocatorFacadeDeps} custom_allocator)
endif()
if (WITH_GPU) if (WITH_GPU)
nv_test(best_fit_allocator_test nv_test(best_fit_allocator_test
SRCS best_fit_allocator_test.cc SRCS best_fit_allocator_test.cc
......
...@@ -62,6 +62,11 @@ ...@@ -62,6 +62,11 @@
#include "paddle/fluid/platform/device/mlu/mlu_info.h" #include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif #endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/fluid/memory/allocation/custom_allocator.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#endif
PADDLE_DEFINE_EXPORTED_int64( PADDLE_DEFINE_EXPORTED_int64(
gpu_allocator_retry_time, 10000, gpu_allocator_retry_time, 10000,
"The retry time (milliseconds) when allocator fails " "The retry time (milliseconds) when allocator fails "
...@@ -186,6 +191,17 @@ class AllocatorFacadePrivate { ...@@ -186,6 +191,17 @@ class AllocatorFacadePrivate {
for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) { for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) {
InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id)); InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id));
} }
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto device_types = platform::DeviceManager::GetAllCustomDeviceTypes();
for (const auto& dev_type : device_types) {
for (size_t dev_id = 0;
dev_id < platform::DeviceManager::GetDeviceCount(dev_type);
++dev_id) {
InitNaiveBestFitCustomDeviceAllocator(
platform::CustomPlace(dev_type, dev_id));
}
}
#endif #endif
break; break;
} }
...@@ -222,6 +238,17 @@ class AllocatorFacadePrivate { ...@@ -222,6 +238,17 @@ class AllocatorFacadePrivate {
for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) { for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) {
InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id)); InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id));
} }
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto device_types = platform::DeviceManager::GetAllCustomDeviceTypes();
for (const auto& dev_type : device_types) {
for (size_t dev_id = 0;
dev_id < platform::DeviceManager::GetDeviceCount(dev_type);
++dev_id) {
InitAutoGrowthCustomDeviceAllocator(
platform::CustomPlace(dev_type, dev_id), allow_free_idle_chunk);
}
}
#endif #endif
break; break;
} }
...@@ -700,6 +727,21 @@ class AllocatorFacadePrivate { ...@@ -700,6 +727,21 @@ class AllocatorFacadePrivate {
} }
#endif #endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
// Registers a NaiveBestFitAllocator for the custom-device place `p` in
// `allocators_`, replacing any allocator previously stored under that place.
void InitNaiveBestFitCustomDeviceAllocator(platform::CustomPlace p) {
  auto naive_allocator = std::make_shared<NaiveBestFitAllocator>(p);
  allocators_[p] = naive_allocator;
}
// Registers an AutoGrowthBestFitAllocator for the custom-device place `p`.
// The auto-growth allocator is layered on top of a CustomAllocator bound to
// the same place and is given the device's minimum chunk size.
void InitAutoGrowthCustomDeviceAllocator(platform::CustomPlace p,
                                         bool allow_free_idle_chunk) {
  auto underlying =
      std::make_shared<paddle::memory::allocation::CustomAllocator>(p);
  const auto min_chunk_size = platform::DeviceManager::GetMinChunkSize(p);
  // NOTE(review): confirm that the third constructor parameter of
  // AutoGrowthBestFitAllocator really is the allow_free_idle_chunk flag and
  // not a size parameter; the constructor is declared outside this file.
  allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
      underlying, min_chunk_size, allow_free_idle_chunk);
}
#endif
void InitSystemAllocators() { void InitSystemAllocators() {
if (!system_allocators_.empty()) return; if (!system_allocators_.empty()) return;
system_allocators_[platform::CPUPlace()] = std::make_shared<CPUAllocator>(); system_allocators_[platform::CPUPlace()] = std::make_shared<CPUAllocator>();
...@@ -770,6 +812,16 @@ class AllocatorFacadePrivate { ...@@ -770,6 +812,16 @@ class AllocatorFacadePrivate {
places.emplace_back(platform::MLUPlace(dev_id)); places.emplace_back(platform::MLUPlace(dev_id));
} }
#endif #endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto device_types = platform::DeviceManager::GetAllCustomDeviceTypes();
for (const auto& dev_type : device_types) {
for (size_t dev_id = 0;
dev_id < platform::DeviceManager::GetDeviceCount(dev_type);
dev_id++) {
places.emplace_back(platform::CustomPlace(dev_type, dev_id));
}
}
#endif
for (auto& p : places) { for (auto& p : places) {
zero_size_allocators_[p] = std::make_shared<ZeroSizeAllocator>(p); zero_size_allocators_[p] = std::make_shared<ZeroSizeAllocator>(p);
...@@ -1005,7 +1057,6 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size, ...@@ -1005,7 +1057,6 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
"Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
} }
#endif #endif
platform::CUDAPlace p(place.GetDeviceId()); platform::CUDAPlace p(place.GetDeviceId());
if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) { if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) {
return m_->GetAllocator(p, stream, /* create_if_not_found = */ true) return m_->GetAllocator(p, stream, /* create_if_not_found = */ true)
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/custom_allocator.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace memory {
namespace allocation {
// Always reports this allocator as safe to call from multiple threads.
bool CustomAllocator::IsAllocThreadSafe() const {
  return true;
}
// Releases an allocation previously produced by AllocateImpl().
//
// The allocation must belong to the place this allocator was created for;
// otherwise a PermissionDenied error is raised. The device buffer is returned
// to the device runtime and the bookkeeping object is then destroyed.
void CustomAllocator::FreeImpl(pten::Allocation* allocation) {
  PADDLE_ENFORCE_EQ(
      allocation->place(), place_,
      platform::errors::PermissionDenied("CustomDevice memory is "
                                         "freed in incorrect device. "
                                         "This may be a bug"));
  // AllocateImpl() obtained the buffer through the device's MemoryAllocate();
  // deleting only the Allocation record would leak that device memory, so
  // hand the buffer back to the device first.
  platform::DeviceManager::GetDeviceWithPlace(place_)->MemoryDeallocate(
      allocation->ptr(), allocation->size());
  delete allocation;
}
// Allocates `size` bytes on the custom device this allocator is bound to.
//
// Returns a heap-allocated Allocation describing the buffer. If the device
// cannot provide the memory, a ResourceExhausted bad-alloc error is thrown
// that reports the requested size and the currently available memory.
pten::Allocation* CustomAllocator::AllocateImpl(size_t size) {
  // SetDevice is issued at most once for this allocator instance.
  std::call_once(once_flag_,
                 [this] { platform::DeviceManager::SetDevice(place_); });

  auto device = platform::DeviceManager::GetDeviceWithPlace(place_);
  void* buffer = device->MemoryAllocate(size);
  if (LIKELY(buffer)) {
    return new Allocation(buffer, size, place_);
  }

  // Allocation failed: collect memory statistics for the error message.
  size_t free_bytes, total_bytes;
  platform::DeviceManager::MemoryStats(place_, &total_bytes, &free_bytes);
  auto type_name = platform::PlaceHelper::GetDeviceType(place_);
  auto device_index = platform::PlaceHelper::GetDeviceId(place_);

  PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
      "\n\nOut of memory error on %s:%d. "
      "Cannot allocate %s memory on %s:%d, "
      "available memory is only %s.\n\n"
      "Please check whether there is any other process using %s:%d.\n"
      "1. If yes, please stop them, or start PaddlePaddle on another %s.\n"
      "2. If no, please decrease the batch size of your model.\n\n",
      type_name, device_index, string::HumanReadableSize(size), type_name,
      device_index, string::HumanReadableSize(free_bytes), type_name,
      device_index, type_name));
}
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <mutex> // NOLINT
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace memory {
namespace allocation {
// Allocator bound to one CustomPlace device. The allocate/free hooks and the
// thread-safety query are declared here and defined out of line.
class CustomAllocator : public Allocator {
public:
// Binds the allocator to `place`; the place is stored as a generic
// platform::Place.
explicit CustomAllocator(const platform::CustomPlace& place)
: place_(place) {}
bool IsAllocThreadSafe() const override;
protected:
// Releases `allocation`; expects it to belong to `place_`.
void FreeImpl(pten::Allocation* allocation) override;
// Allocates `size` bytes on the bound device.
pten::Allocation* AllocateImpl(size_t size) override;
private:
// Place this allocator serves.
platform::Place place_;
// Used by AllocateImpl() so that SetDevice is issued only once per instance.
std::once_flag once_flag_;
};
} // namespace allocation
} // namespace memory
} // namespace paddle
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/memory/detail/buddy_allocator.h" #include "paddle/fluid/memory/detail/buddy_allocator.h"
#include "paddle/fluid/memory/detail/system_allocator.h" #include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
...@@ -30,7 +31,6 @@ ...@@ -30,7 +31,6 @@
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/cuda_device_guard.h"
#endif #endif
#include "paddle/fluid/platform/device/device_wrapper.h"
PADDLE_DEFINE_EXPORTED_bool( PADDLE_DEFINE_EXPORTED_bool(
init_allocated_mem, false, init_allocated_mem, false,
...@@ -733,6 +733,136 @@ uint64_t Release<platform::MLUPlace>(const platform::MLUPlace &place) { ...@@ -733,6 +733,136 @@ uint64_t Release<platform::MLUPlace>(const platform::MLUPlace &place) {
#endif #endif
} }
// For CustomDevice
#ifdef PADDLE_WITH_CUSTOM_DEVICE
class BuddyAllocatorList {
private:
explicit BuddyAllocatorList(const std::string &device_type)
: device_type_(device_type) {
auto devices = platform::DeviceManager::GetDeviceList(device_type);
for (auto dev_id : devices) {
init_flags_[dev_id].reset(new std::once_flag());
}
}
static BuddyAllocatorList *CreateNewInstance(const std::string &device_type) {
return new BuddyAllocatorList(device_type);
}
public:
static BuddyAllocatorList *Instance(const std::string &device_type) {
// DeviceType -> AllocatorList
static std::unordered_map<std::string, BuddyAllocatorList *> pool;
if (pool.find(device_type) == pool.end()) {
pool[device_type] = CreateNewInstance(device_type);
}
return pool[device_type];
}
BuddyAllocator *Get(int dev_id) {
PADDLE_ENFORCE_NE(init_flags_.find(dev_id), init_flags_.end(),
platform::errors::OutOfRange(
"Cannot find %s %d, please check visible devices.",
device_type_, dev_id));
std::call_once(*init_flags_[dev_id], [this, dev_id] {
platform::DeviceManager::SetDevice(device_type_, dev_id);
platform::CustomPlace place(device_type_, dev_id);
allocators_[dev_id].reset(new BuddyAllocator(
std::unique_ptr<detail::SystemAllocator>(
new detail::CustomAllocator(device_type_, dev_id)),
platform::DeviceManager::GetMinChunkSize(place),
platform::DeviceManager::GetMaxChunkSize(place),
platform::DeviceManager::GetExtraPaddingSize(place), device_type_));
});
return allocators_[dev_id].get();
}
private:
std::string device_type_;
std::unordered_map<size_t, std::unique_ptr<std::once_flag>> init_flags_;
std::unordered_map<size_t, std::unique_ptr<BuddyAllocator>> allocators_;
};
// Looks up the BuddyAllocator serving `place`.
// `place` must be a CustomPlace; any other place raises InvalidArgument.
BuddyAllocator *GetBuddyAllocator(const platform::Place &place) {
  VLOG(10) << "GetBuddyAllocator place = " << place;
  if (!platform::is_custom_place(place)) {
    PADDLE_THROW(
        platform::errors::InvalidArgument("place must be CustomPlace"));
  }
  auto *allocator_list = BuddyAllocatorList::Instance(
      platform::PlaceHelper::GetDeviceType(place));
  return allocator_list->Get(platform::PlaceHelper::GetDeviceId(place));
}
#endif
// Allocates `size` bytes on the custom device `place` through its
// BuddyAllocator. Throws ResourceExhausted when the buddy allocator returns
// null. When FLAGS_init_allocated_mem is set, the new buffer is filled with
// the byte 0xEF. Without custom-device support compiled in, always throws.
template <>
void *Alloc<platform::CustomPlace>(const platform::CustomPlace &place,
                                   size_t size) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
  VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
  auto *ptr = GetBuddyAllocator(place)->Alloc(size);

  if (ptr == nullptr) {
    // Query the device's memory statistics for the error report.
    platform::DeviceGuard guard(place);
    size_t free_bytes, total_bytes;
    platform::DeviceManager::MemoryStats(place, &total_bytes, &free_bytes);
    PADDLE_THROW(platform::errors::ResourceExhausted(
        "Cannot allocate %s in %s:%d, avaliable %s, total %s, used "
        "%s. ",
        string::HumanReadableSize(size), place.GetDeviceType(), place.device,
        string::HumanReadableSize(free_bytes),
        string::HumanReadableSize(total_bytes),
        string::HumanReadableSize(total_bytes - free_bytes)));
  }

  if (FLAGS_init_allocated_mem) {
    auto device = platform::DeviceManager::GetDeviceWithPlace(place);
    device->MemorySet(ptr, 0xEF, size);
  }
  VLOG(10) << " pointer=" << ptr;
  return ptr;
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'CustomPlace' is not supported in CPU only device."));
#endif
}
// Returns the buffer `p` to the BuddyAllocator that serves `place`.
// `size` is not consulted. Without custom-device support compiled in, always
// throws PermissionDenied.
template <>
void Free<platform::CustomPlace>(const platform::CustomPlace &place, void *p,
                                 size_t size) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
  VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
  auto *buddy_allocator = GetBuddyAllocator(place);
  buddy_allocator->Free(p);
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'CustomPlace' is not supported in CPU only device."));
#endif
}
// Forwards to Release() on the BuddyAllocator serving `place` and returns its
// result. Without custom-device support compiled in, always throws
// PermissionDenied.
template <>
uint64_t Release<platform::CustomPlace>(const platform::CustomPlace &place) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
  auto *buddy_allocator = GetBuddyAllocator(place);
  return buddy_allocator->Release();
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'CustomPlace' is not supported in CPU only device."));
#endif
}
// Forwards to Used() on the BuddyAllocator serving `place` and returns its
// result. Without custom-device support compiled in, always throws
// PermissionDenied.
template <>
size_t Used<platform::CustomPlace>(const platform::CustomPlace &place) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
  auto *buddy_allocator = GetBuddyAllocator(place);
  return buddy_allocator->Used();
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'CustomPlace' is not supported in CPU only device."));
#endif
}
struct AllocVisitor : public boost::static_visitor<void *> { struct AllocVisitor : public boost::static_visitor<void *> {
inline explicit AllocVisitor(size_t size) : size_(size) {} inline explicit AllocVisitor(size_t size) : size_(size) {}
......
...@@ -25,9 +25,7 @@ limitations under the License. */ ...@@ -25,9 +25,7 @@ limitations under the License. */
DECLARE_uint64(reallocate_gpu_memory_in_mb); DECLARE_uint64(reallocate_gpu_memory_in_mb);
#endif #endif
#ifdef PADDLE_WITH_MLU #include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
namespace paddle { namespace paddle {
namespace memory { namespace memory {
...@@ -35,12 +33,37 @@ namespace detail { ...@@ -35,12 +33,37 @@ namespace detail {
BuddyAllocator::BuddyAllocator( BuddyAllocator::BuddyAllocator(
std::unique_ptr<SystemAllocator> system_allocator, size_t min_chunk_size, std::unique_ptr<SystemAllocator> system_allocator, size_t min_chunk_size,
size_t max_chunk_size, size_t extra_padding_size) size_t max_chunk_size, size_t extra_padding_size,
const std::string dev_type)
: min_chunk_size_(min_chunk_size), : min_chunk_size_(min_chunk_size),
max_chunk_size_(max_chunk_size), max_chunk_size_(max_chunk_size),
extra_padding_size_(extra_padding_size), extra_padding_size_(extra_padding_size),
cache_(system_allocator->UseGpu()), cache_(system_allocator->UseGpu()),
system_allocator_(std::move(system_allocator)) {} system_allocator_(std::move(system_allocator)) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
if (!dev_type.empty()) {
init_allocate_size_func_ = [dev_type]() {
return platform::DeviceManager::GetInitAllocSize(
platform::PlaceHelper::CreatePlace(dev_type));
};
re_allocate_size_func_ = [dev_type]() {
return platform::DeviceManager::GetReallocSize(
platform::PlaceHelper::CreatePlace(dev_type));
};
} else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
init_allocate_size_func_ = &platform::GpuInitAllocSize;
re_allocate_size_func_ = &platform::GpuReallocSize;
#elif defined(PADDLE_WITH_ASCEND_CL)
init_allocate_size_func_ = &platform::NPUInitAllocSize;
re_allocate_size_func_ = &platform::NPUReallocSize;
#elif defined(PADDLE_WITH_MLU)
init_allocate_size_func_ = &platform::MLUInitAllocSize;
re_allocate_size_func_ = &platform::MLUReallocSize;
#endif
}
#endif
}
BuddyAllocator::~BuddyAllocator() { BuddyAllocator::~BuddyAllocator() {
VLOG(10) << "BuddyAllocator Disconstructor makes sure that all of these " VLOG(10) << "BuddyAllocator Disconstructor makes sure that all of these "
...@@ -224,6 +247,10 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool( ...@@ -224,6 +247,10 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
size_t allocate_bytes = max_chunk_size_; size_t allocate_bytes = max_chunk_size_;
size_t index = 0; size_t index = 0;
#ifdef PADDLE_WITH_CUSTOM_DEVICE
allocate_bytes = DeviceAllocateSize(init_allocate_size_func_,
re_allocate_size_func_, request_bytes);
#else
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
allocate_bytes = DeviceAllocateSize(&platform::GpuInitAllocSize, allocate_bytes = DeviceAllocateSize(&platform::GpuInitAllocSize,
&platform::GpuReallocSize, request_bytes); &platform::GpuReallocSize, request_bytes);
...@@ -233,6 +260,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool( ...@@ -233,6 +260,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
#elif defined(PADDLE_WITH_MLU) #elif defined(PADDLE_WITH_MLU)
allocate_bytes = DeviceAllocateSize(&platform::MLUInitAllocSize, allocate_bytes = DeviceAllocateSize(&platform::MLUInitAllocSize,
&platform::MLUReallocSize, request_bytes); &platform::MLUReallocSize, request_bytes);
#endif
#endif #endif
// Allocate a new block // Allocate a new block
......
...@@ -39,7 +39,8 @@ class BuddyAllocator { ...@@ -39,7 +39,8 @@ class BuddyAllocator {
public: public:
BuddyAllocator(std::unique_ptr<SystemAllocator> system_allocator, BuddyAllocator(std::unique_ptr<SystemAllocator> system_allocator,
size_t min_chunk_size, size_t max_chunk_size, size_t min_chunk_size, size_t max_chunk_size,
size_t extra_padding_size = 0); size_t extra_padding_size = 0,
const std::string dev_type = "");
~BuddyAllocator(); ~BuddyAllocator();
...@@ -123,6 +124,9 @@ class BuddyAllocator { ...@@ -123,6 +124,9 @@ class BuddyAllocator {
/*! Allocate CPU/GPU memory from system */ /*! Allocate CPU/GPU memory from system */
std::unique_ptr<SystemAllocator> system_allocator_; std::unique_ptr<SystemAllocator> system_allocator_;
std::mutex mutex_; std::mutex mutex_;
#ifdef PADDLE_WITH_CUSTOM_DEVICE
std::function<size_t()> init_allocate_size_func_, re_allocate_size_func_;
#endif
}; };
} // namespace detail } // namespace detail
......
...@@ -38,6 +38,8 @@ limitations under the License. */ ...@@ -38,6 +38,8 @@ limitations under the License. */
#include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/cuda_device_guard.h"
#endif #endif
#include "paddle/fluid/platform/device/device_wrapper.h"
DECLARE_bool(use_pinned_memory); DECLARE_bool(use_pinned_memory);
DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb); DECLARE_uint64(initial_gpu_memory_in_mb);
...@@ -430,6 +432,51 @@ void MLUAllocator::Free(void* p, size_t size, size_t index) { ...@@ -430,6 +432,51 @@ void MLUAllocator::Free(void* p, size_t size, size_t index) {
bool MLUAllocator::UseGpu() const { return true; } bool MLUAllocator::UseGpu() const { return true; }
#endif #endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
void* CustomAllocator::Alloc(size_t* index, size_t size) {
if (size <= 0) return nullptr;
void* p;
auto place = platform::CustomPlace(dev_type_, dev_id_);
auto device = platform::DeviceManager::GetDeviceWithPlace(place);
p = device->MemoryAllocate(size);
if (LIKELY(p)) {
VLOG(4) << "CustomAllocator::Alloc " << p << " size " << size;
*index = 0;
plug_alloc_size += size;
} else {
size_t avail, total;
platform::DeviceManager::MemoryStats(place, &total, &avail);
PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
"\n\nOut of memory error on %s %d. "
"total memory is %s, used memory is %s, "
"available memory is only %s.\n\n",
dev_type_, dev_id_, string::HumanReadableSize(total),
string::HumanReadableSize(total - avail),
string::HumanReadableSize(avail)));
}
return p;
}
void CustomAllocator::Free(void* p, size_t size, size_t index) {
VLOG(4) << "CustomAllocator::Free " << p << " size " << size;
PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument(
"The index should be 0, index is %d", index));
PADDLE_ENFORCE_GE(plug_alloc_size, size,
platform::errors::InvalidArgument(
"The size of memory (%d) to free exceeds the size of "
"allocated gpu memory (%d)",
size, plug_alloc_size));
plug_alloc_size -= size;
auto place = platform::CustomPlace(dev_type_, dev_id_);
auto device = platform::DeviceManager::GetDeviceWithPlace(place);
device->MemoryDeallocate(p, size);
}
// Always answers true for the custom-device system allocator.
bool CustomAllocator::UseGpu() const {
  return true;
}
#endif
} // namespace detail } // namespace detail
} // namespace memory } // namespace memory
} // namespace paddle } // namespace paddle
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once #pragma once
#include <stddef.h> // for size_t #include <stddef.h> // for size_t
#include <string>
namespace paddle { namespace paddle {
namespace memory { namespace memory {
...@@ -107,6 +108,23 @@ class MLUAllocator : public SystemAllocator { ...@@ -107,6 +108,23 @@ class MLUAllocator : public SystemAllocator {
}; };
#endif #endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
// SystemAllocator for one custom device, identified by a device type name
// and a device id.
class CustomAllocator : public SystemAllocator {
public:
// Binds the allocator to device `dev_id` of type `device_type`.
explicit CustomAllocator(const std::string& device_type, size_t dev_id)
: dev_type_(device_type), dev_id_(dev_id) {}
// Allocates `size` bytes on the device; writes 0 to `*index` on success.
virtual void* Alloc(size_t* index, size_t size);
// Frees `size` bytes at `p`; `index` is expected to be 0.
virtual void Free(void* p, size_t size, size_t index);
virtual bool UseGpu() const;
private:
// Bytes handed out by Alloc() and not yet returned through Free().
size_t plug_alloc_size = 0;
std::string dev_type_;
size_t dev_id_;
};
#endif
} // namespace detail } // namespace detail
} // namespace memory } // namespace memory
} // namespace paddle } // namespace paddle
...@@ -19,9 +19,88 @@ limitations under the License. */ ...@@ -19,9 +19,88 @@ limitations under the License. */
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
#include "paddle/pten/common/place.h" #include "paddle/pten/common/place.h"
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/device/xpu/xpu_header.h"
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
namespace paddle { namespace paddle {
namespace memory { namespace memory {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
// Copies `num` bytes from custom-device memory `src` to host memory `dst`.
// Does nothing when `num` is 0. The source device is made current and the
// copy is issued through that device's MemoryCopyD2H using a Stream wrapper
// built from `stream`.
template <>
void Copy<platform::CPUPlace, platform::CustomPlace>(
    platform::CPUPlace dst_place, void* dst, platform::CustomPlace src_place,
    const void* src, size_t num, void* stream) {
  if (UNLIKELY(num == 0)) return;

  auto from_type = platform::PlaceHelper::GetDeviceType(src_place);
  auto to_type = platform::PlaceHelper::GetDeviceType(dst_place);
  std::string event_name = "Memcpy:" + from_type + "->" + to_type;
  platform::RecordEvent record_event(event_name);
  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
          << dst_place << ", stream=" << stream;

  platform::DeviceManager::SetDevice(src_place);
  platform::stream::Stream stream_wrapper(src_place, stream);
  auto device = platform::DeviceManager::GetDeviceWithPlace(src_place);
  device->MemoryCopyD2H(dst, src, num, &stream_wrapper);
}
// Copies `num` bytes from host memory `src` to custom-device memory `dst`.
// Does nothing when `num` is 0. The destination device is made current and
// the copy is issued through that device's MemoryCopyH2D using a Stream
// wrapper built from `stream`.
template <>
void Copy<platform::CustomPlace, platform::CPUPlace>(
    platform::CustomPlace dst_place, void* dst, platform::CPUPlace src_place,
    const void* src, size_t num, void* stream) {
  if (UNLIKELY(num == 0)) return;

  auto from_type = platform::PlaceHelper::GetDeviceType(src_place);
  auto to_type = platform::PlaceHelper::GetDeviceType(dst_place);
  std::string event_name = "Memcpy:" + from_type + "->" + to_type;
  platform::RecordEvent record_event(event_name);
  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
          << dst_place << ", stream=" << stream;

  platform::DeviceManager::SetDevice(dst_place);
  platform::stream::Stream stream_wrapper(dst_place, stream);
  auto device = platform::DeviceManager::GetDeviceWithPlace(dst_place);
  device->MemoryCopyH2D(dst, src, num, &stream_wrapper);
}
// Copies `num` bytes between two custom-device buffers.
// Does nothing when `num` is 0. Both places must share a device type,
// otherwise Unavailable is raised. With equal device ids the source device's
// MemoryCopyD2D is used; with different ids its MemoryCopyP2P is used.
template <>
void Copy<platform::CustomPlace, platform::CustomPlace>(
    platform::CustomPlace dst_place, void* dst, platform::CustomPlace src_place,
    const void* src, size_t num, void* stream) {
  if (UNLIKELY(num == 0)) return;

  auto src_type = platform::PlaceHelper::GetDeviceType(src_place);
  auto dst_type = platform::PlaceHelper::GetDeviceType(dst_place);
  std::string event_name = "Memcpy:" + src_type + "->" + dst_type;
  platform::RecordEvent record_event(event_name);
  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
          << dst_place << ", stream=" << stream;

  // Transfers across different device types are rejected up front.
  if (src_type != dst_type) {
    PADDLE_THROW(platform::errors::Unavailable(
        "Copy between %s and %s is not supported.", src_type, dst_type));
  }

  platform::DeviceManager::SetDevice(src_place);
  platform::stream::Stream stream_wrapper(src_place, stream);

  auto src_id = platform::PlaceHelper::GetDeviceId(src_place);
  auto dst_id = platform::PlaceHelper::GetDeviceId(dst_place);
  auto device = platform::DeviceManager::GetDeviceWithPlace(src_place);
  if (src_id == dst_id) {
    device->MemoryCopyD2D(dst, src, num, &stream_wrapper);
  } else {
    device->MemoryCopyP2P(dst_place, dst, src, num, &stream_wrapper);
  }
}
#endif // PADDLE_WITH_CUSTOM_DEVICE
template <> template <>
void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst, void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
platform::CPUPlace, platform::CPUPlace,
...@@ -158,7 +237,7 @@ void Copy<platform::NPUPlace, platform::CPUPlace>(platform::NPUPlace dst_place, ...@@ -158,7 +237,7 @@ void Copy<platform::NPUPlace, platform::CPUPlace>(platform::NPUPlace dst_place,
void* dst, void* dst,
platform::CPUPlace src_place, platform::CPUPlace src_place,
const void* src, size_t num, const void* src, size_t num,
aclrtStream stream) { void* stream) {
if (UNLIKELY(num == 0)) return; if (UNLIKELY(num == 0)) return;
platform::SetNPUDeviceId(dst_place.device); platform::SetNPUDeviceId(dst_place.device);
...@@ -168,7 +247,8 @@ void Copy<platform::NPUPlace, platform::CPUPlace>(platform::NPUPlace dst_place, ...@@ -168,7 +247,8 @@ void Copy<platform::NPUPlace, platform::CPUPlace>(platform::NPUPlace dst_place,
if (stream) { if (stream) {
platform::RecordEvent record_event("NpuMemcpyAsync:CPU->NPU"); platform::RecordEvent record_event("NpuMemcpyAsync:CPU->NPU");
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, stream); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE,
reinterpret_cast<aclrtStream>(stream));
} else { } else {
// On NPU, async operation after sync operation is ok, while sync operation // On NPU, async operation after sync operation is ok, while sync operation
// after async is not ok, since the async operation may not done. // after async is not ok, since the async operation may not done.
...@@ -186,7 +266,7 @@ void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place, ...@@ -186,7 +266,7 @@ void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place,
void* dst, void* dst,
platform::NPUPlace src_place, platform::NPUPlace src_place,
const void* src, size_t num, const void* src, size_t num,
aclrtStream stream) { void* stream) {
if (UNLIKELY(num == 0)) return; if (UNLIKELY(num == 0)) return;
platform::SetNPUDeviceId(src_place.device); platform::SetNPUDeviceId(src_place.device);
...@@ -196,7 +276,8 @@ void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place, ...@@ -196,7 +276,8 @@ void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place,
if (stream) { if (stream) {
platform::RecordEvent record_event("NpuMemcpyAsync:NPU->CPU"); platform::RecordEvent record_event("NpuMemcpyAsync:NPU->CPU");
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, stream); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST,
reinterpret_cast<aclrtStream>(stream));
} else { } else {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
static_cast<platform::NPUDeviceContext*>(pool.Get(src_place))->Wait(); static_cast<platform::NPUDeviceContext*>(pool.Get(src_place))->Wait();
...@@ -211,7 +292,7 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place, ...@@ -211,7 +292,7 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
void* dst, void* dst,
platform::NPUPlace src_place, platform::NPUPlace src_place,
const void* src, size_t num, const void* src, size_t num,
aclrtStream stream) { void* stream) {
if (UNLIKELY(num == 0)) return; if (UNLIKELY(num == 0)) return;
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
...@@ -221,7 +302,7 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place, ...@@ -221,7 +302,7 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
if (stream) { if (stream) {
platform::RecordEvent record_event("NpuMemcpyAsync(same_npu):NPU->NPU"); platform::RecordEvent record_event("NpuMemcpyAsync(same_npu):NPU->NPU");
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE,
stream); reinterpret_cast<aclrtStream>(stream));
} else { } else {
platform::DeviceContextPool& pool = platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance(); platform::DeviceContextPool::Instance();
...@@ -239,7 +320,7 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place, ...@@ -239,7 +320,7 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
// TODO(zhiqiu): support peer access? // TODO(zhiqiu): support peer access?
platform::RecordEvent record_event("NpuMemcpyPeerAsync:NPU->NPU"); platform::RecordEvent record_event("NpuMemcpyPeerAsync:NPU->NPU");
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE,
stream); reinterpret_cast<aclrtStream>(stream));
} else { } else {
platform::DeviceContextPool& pool = platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance(); platform::DeviceContextPool::Instance();
...@@ -284,7 +365,7 @@ void Copy<platform::NPUPinnedPlace, platform::NPUPinnedPlace>( ...@@ -284,7 +365,7 @@ void Copy<platform::NPUPinnedPlace, platform::NPUPinnedPlace>(
template <> template <>
void Copy<platform::NPUPinnedPlace, platform::NPUPlace>( void Copy<platform::NPUPinnedPlace, platform::NPUPlace>(
platform::NPUPinnedPlace dst_place, void* dst, platform::NPUPlace src_place, platform::NPUPinnedPlace dst_place, void* dst, platform::NPUPlace src_place,
const void* src, size_t num, aclrtStream stream) { const void* src, size_t num, void* stream) {
if (UNLIKELY(num == 0)) return; if (UNLIKELY(num == 0)) return;
platform::SetNPUDeviceId(src_place.device); platform::SetNPUDeviceId(src_place.device);
...@@ -294,7 +375,8 @@ void Copy<platform::NPUPinnedPlace, platform::NPUPlace>( ...@@ -294,7 +375,8 @@ void Copy<platform::NPUPinnedPlace, platform::NPUPlace>(
if (stream) { if (stream) {
platform::RecordEvent record_event("NpuMemcpyAsync:NPU->NPUPinned"); platform::RecordEvent record_event("NpuMemcpyAsync:NPU->NPUPinned");
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, stream); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST,
reinterpret_cast<aclrtStream>(stream));
} else { } else {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
static_cast<platform::NPUDeviceContext*>(pool.Get(src_place))->Wait(); static_cast<platform::NPUDeviceContext*>(pool.Get(src_place))->Wait();
...@@ -307,7 +389,7 @@ void Copy<platform::NPUPinnedPlace, platform::NPUPlace>( ...@@ -307,7 +389,7 @@ void Copy<platform::NPUPinnedPlace, platform::NPUPlace>(
template <> template <>
void Copy<platform::NPUPlace, platform::NPUPinnedPlace>( void Copy<platform::NPUPlace, platform::NPUPinnedPlace>(
platform::NPUPlace dst_place, void* dst, platform::NPUPinnedPlace src_place, platform::NPUPlace dst_place, void* dst, platform::NPUPinnedPlace src_place,
const void* src, size_t num, aclrtStream stream) { const void* src, size_t num, void* stream) {
if (UNLIKELY(num == 0)) return; if (UNLIKELY(num == 0)) return;
platform::SetNPUDeviceId(dst_place.device); platform::SetNPUDeviceId(dst_place.device);
...@@ -317,7 +399,8 @@ void Copy<platform::NPUPlace, platform::NPUPinnedPlace>( ...@@ -317,7 +399,8 @@ void Copy<platform::NPUPlace, platform::NPUPinnedPlace>(
if (stream) { if (stream) {
platform::RecordEvent record_event("NpuMemcpyAsync:NPUPinned->NPU"); platform::RecordEvent record_event("NpuMemcpyAsync:NPUPinned->NPU");
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, stream); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE,
reinterpret_cast<aclrtStream>(stream));
} else { } else {
// On NPU, async operation after sync operation is ok, while sync operation // On NPU, async operation after sync operation is ok, while sync operation
// after async is not ok, since the async operation may not done. // after async is not ok, since the async operation may not done.
...@@ -379,6 +462,23 @@ void Copy<pten::Place, pten::Place>(pten::Place dst_place, void* dst, ...@@ -379,6 +462,23 @@ void Copy<pten::Place, pten::Place>(pten::Place dst_place, void* dst,
platform::NPUPinnedPlace place_dst; platform::NPUPinnedPlace place_dst;
platform::NPUPlace place_src(src_place.GetDeviceId()); platform::NPUPlace place_src(src_place.GetDeviceId());
return Copy(place_dst, dst, place_src, src, num, stream); return Copy(place_dst, dst, place_src, src, num, stream);
#ifdef PADDLE_WITH_CUSTOM_DEVICE
} else if (src_place.GetType() == pten::AllocationType::CPU && // NOLINT
dst_place.GetType() == pten::AllocationType::CUSTOM) {
platform::CPUPlace place_src;
platform::CustomPlace place_dst(dst_place);
return Copy(place_dst, dst, place_src, src, num, stream);
} else if (src_place.GetType() == pten::AllocationType::CUSTOM && // NOLINT
dst_place.GetType() == pten::AllocationType::CPU) {
platform::CustomPlace place_src(src_place);
platform::CPUPlace place_dst;
return Copy(place_dst, dst, place_src, src, num, stream);
} else if (src_place.GetType() == pten::AllocationType::CUSTOM && // NOLINT
dst_place.GetType() == pten::AllocationType::CUSTOM) {
platform::CustomPlace place_src(src_place);
platform::CustomPlace place_dst(dst_place);
return Copy(place_dst, dst, place_src, src, num, stream);
#endif
} }
} }
...@@ -492,7 +592,7 @@ inline void SyncCUDAStream() { ...@@ -492,7 +592,7 @@ inline void SyncCUDAStream() {
template <> template <>
void Copy<platform::CPUPlace, platform::CUDAPlace>( void Copy<platform::CPUPlace, platform::CUDAPlace>(
platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place, platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place,
const void* src, size_t num, gpuStream_t stream) { const void* src, size_t num, void* stream) {
if (UNLIKELY(num == 0)) return; if (UNLIKELY(num == 0)) return;
platform::SetDeviceId(src_place.device); platform::SetDeviceId(src_place.device);
...@@ -501,9 +601,11 @@ void Copy<platform::CPUPlace, platform::CUDAPlace>( ...@@ -501,9 +601,11 @@ void Copy<platform::CPUPlace, platform::CUDAPlace>(
if (stream) { if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CPU"); platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CPU");
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, stream); platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost,
reinterpret_cast<gpuStream_t>(stream));
#else #else
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost,
reinterpret_cast<gpuStream_t>(stream));
#endif #endif
} else { } else {
platform::RecordEvent record_event("GpuMemcpySync:GPU->CPU"); platform::RecordEvent record_event("GpuMemcpySync:GPU->CPU");
...@@ -522,7 +624,7 @@ void Copy<platform::CPUPlace, platform::CUDAPlace>( ...@@ -522,7 +624,7 @@ void Copy<platform::CPUPlace, platform::CUDAPlace>(
template <> template <>
void Copy<platform::CUDAPlace, platform::CPUPlace>( void Copy<platform::CUDAPlace, platform::CPUPlace>(
platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place, platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place,
const void* src, size_t num, gpuStream_t stream) { const void* src, size_t num, void* stream) {
if (UNLIKELY(num == 0)) return; if (UNLIKELY(num == 0)) return;
platform::SetDeviceId(dst_place.device); platform::SetDeviceId(dst_place.device);
...@@ -531,9 +633,11 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>( ...@@ -531,9 +633,11 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>(
if (stream) { if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync:CPU->GPU"); platform::RecordEvent record_event("GpuMemcpyAsync:CPU->GPU");
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, stream); platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice,
reinterpret_cast<gpuStream_t>(stream));
#else #else
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice,
reinterpret_cast<gpuStream_t>(stream));
#endif #endif
} else { } else {
platform::RecordEvent record_event("GpuMemcpySync:CPU->GPU"); platform::RecordEvent record_event("GpuMemcpySync:CPU->GPU");
...@@ -552,7 +656,7 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>( ...@@ -552,7 +656,7 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>(
template <> template <>
void Copy<platform::CUDAPlace, platform::CUDAPlace>( void Copy<platform::CUDAPlace, platform::CUDAPlace>(
platform::CUDAPlace dst_place, void* dst, platform::CUDAPlace src_place, platform::CUDAPlace dst_place, void* dst, platform::CUDAPlace src_place,
const void* src, size_t num, gpuStream_t stream) { const void* src, size_t num, void* stream) {
if (UNLIKELY(num == 0)) return; if (UNLIKELY(num == 0)) return;
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
...@@ -562,9 +666,11 @@ void Copy<platform::CUDAPlace, platform::CUDAPlace>( ...@@ -562,9 +666,11 @@ void Copy<platform::CUDAPlace, platform::CUDAPlace>(
if (stream) { if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync(same_gpu):GPU->GPU"); platform::RecordEvent record_event("GpuMemcpyAsync(same_gpu):GPU->GPU");
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToDevice, stream); platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToDevice,
reinterpret_cast<gpuStream_t>(stream));
#else #else
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice,
reinterpret_cast<gpuStream_t>(stream));
#endif #endif
} else { } else {
platform::RecordEvent record_event("GpuMemcpySync(same_gpu):GPU->GPU"); platform::RecordEvent record_event("GpuMemcpySync(same_gpu):GPU->GPU");
...@@ -578,7 +684,7 @@ void Copy<platform::CUDAPlace, platform::CUDAPlace>( ...@@ -578,7 +684,7 @@ void Copy<platform::CUDAPlace, platform::CUDAPlace>(
if (stream) { if (stream) {
platform::RecordEvent record_event("GpuMemcpyPeerAsync:GPU->GPU"); platform::RecordEvent record_event("GpuMemcpyPeerAsync:GPU->GPU");
platform::GpuMemcpyPeerAsync(dst, dst_place.device, src, src_place.device, platform::GpuMemcpyPeerAsync(dst, dst_place.device, src, src_place.device,
num, stream); num, reinterpret_cast<gpuStream_t>(stream));
} else { } else {
platform::RecordEvent record_event("GpuMemcpyPeerSync:GPU->GPU"); platform::RecordEvent record_event("GpuMemcpyPeerSync:GPU->GPU");
platform::GpuMemcpyPeerSync(dst, dst_place.device, src, src_place.device, platform::GpuMemcpyPeerSync(dst, dst_place.device, src, src_place.device,
...@@ -620,8 +726,7 @@ void Copy<platform::CUDAPinnedPlace, platform::CUDAPinnedPlace>( ...@@ -620,8 +726,7 @@ void Copy<platform::CUDAPinnedPlace, platform::CUDAPinnedPlace>(
template <> template <>
void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>( void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>(
platform::CUDAPinnedPlace dst_place, void* dst, platform::CUDAPinnedPlace dst_place, void* dst,
platform::CUDAPlace src_place, const void* src, size_t num, platform::CUDAPlace src_place, const void* src, size_t num, void* stream) {
gpuStream_t stream) {
if (UNLIKELY(num == 0)) return; if (UNLIKELY(num == 0)) return;
platform::SetDeviceId(src_place.device); platform::SetDeviceId(src_place.device);
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
...@@ -629,9 +734,11 @@ void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>( ...@@ -629,9 +734,11 @@ void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>(
if (stream) { if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CUDAPinned"); platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CUDAPinned");
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, stream); platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost,
reinterpret_cast<gpuStream_t>(stream));
#else #else
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost,
reinterpret_cast<gpuStream_t>(stream));
#endif #endif
} else { } else {
platform::RecordEvent record_event("GpuMemcpySync:GPU->CUDAPinned"); platform::RecordEvent record_event("GpuMemcpySync:GPU->CUDAPinned");
...@@ -647,7 +754,7 @@ template <> ...@@ -647,7 +754,7 @@ template <>
void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>( void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(
platform::CUDAPlace dst_place, void* dst, platform::CUDAPlace dst_place, void* dst,
platform::CUDAPinnedPlace src_place, const void* src, size_t num, platform::CUDAPinnedPlace src_place, const void* src, size_t num,
gpuStream_t stream) { void* stream) {
if (UNLIKELY(num == 0)) return; if (UNLIKELY(num == 0)) return;
platform::SetDeviceId(dst_place.device); platform::SetDeviceId(dst_place.device);
...@@ -656,9 +763,11 @@ void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>( ...@@ -656,9 +763,11 @@ void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(
if (stream) { if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync:CUDAPinned->GPU"); platform::RecordEvent record_event("GpuMemcpyAsync:CUDAPinned->GPU");
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, stream); platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice,
reinterpret_cast<gpuStream_t>(stream));
#else #else
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice,
reinterpret_cast<gpuStream_t>(stream));
#endif #endif
} else { } else {
platform::RecordEvent record_event("GpuMemcpySync:CUDAPinned->GPU"); platform::RecordEvent record_event("GpuMemcpySync:CUDAPinned->GPU");
...@@ -674,7 +783,7 @@ void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>( ...@@ -674,7 +783,7 @@ void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(
template <> template <>
void Copy<pten::Place, pten::Place>(pten::Place dst_place, void* dst, void Copy<pten::Place, pten::Place>(pten::Place dst_place, void* dst,
pten::Place src_place, const void* src, pten::Place src_place, const void* src,
size_t num, gpuStream_t stream) { size_t num, void* stream) {
if (src_place.GetType() == pten::AllocationType::CPU && if (src_place.GetType() == pten::AllocationType::CPU &&
dst_place.GetType() == pten::AllocationType::CPU) { dst_place.GetType() == pten::AllocationType::CPU) {
platform::CPUPlace place_dst, place_src; platform::CPUPlace place_dst, place_src;
...@@ -719,6 +828,23 @@ void Copy<pten::Place, pten::Place>(pten::Place dst_place, void* dst, ...@@ -719,6 +828,23 @@ void Copy<pten::Place, pten::Place>(pten::Place dst_place, void* dst,
platform::CUDAPinnedPlace place_dst; platform::CUDAPinnedPlace place_dst;
platform::CUDAPlace place_src(src_place.GetDeviceId()); platform::CUDAPlace place_src(src_place.GetDeviceId());
return Copy(place_dst, dst, place_src, src, num, stream); return Copy(place_dst, dst, place_src, src, num, stream);
#ifdef PADDLE_WITH_CUSTOM_DEVICE
} else if (src_place.GetType() == pten::AllocationType::CPU && // NOLINT
dst_place.GetType() == pten::AllocationType::CUSTOM) {
platform::CPUPlace place_src;
platform::CustomPlace place_dst(dst_place);
return Copy(place_dst, dst, place_src, src, num, stream);
} else if (src_place.GetType() == pten::AllocationType::CUSTOM && // NOLINT
dst_place.GetType() == pten::AllocationType::CPU) {
platform::CustomPlace place_src(src_place);
platform::CPUPlace place_dst;
return Copy(place_dst, dst, place_src, src, num, stream);
} else if (src_place.GetType() == pten::AllocationType::CUSTOM && // NOLINT
dst_place.GetType() == pten::AllocationType::CUSTOM) {
platform::CustomPlace place_src(src_place);
platform::CustomPlace place_dst(dst_place);
return Copy(place_dst, dst, place_src, src, num, stream);
#endif
} }
} }
...@@ -726,7 +852,7 @@ void Copy<pten::Place, pten::Place>(pten::Place dst_place, void* dst, ...@@ -726,7 +852,7 @@ void Copy<pten::Place, pten::Place>(pten::Place dst_place, void* dst,
template <> template <>
void Copy<pten::CPUPlace, pten::Place>(pten::CPUPlace dst_place, void* dst, void Copy<pten::CPUPlace, pten::Place>(pten::CPUPlace dst_place, void* dst,
pten::Place src_place, const void* src, pten::Place src_place, const void* src,
size_t num, gpuStream_t stream) { size_t num, void* stream) {
Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num, stream); Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num, stream);
} }
...@@ -735,7 +861,7 @@ template <> ...@@ -735,7 +861,7 @@ template <>
void Copy<pten::Place, pten::CPUPlace>(pten::Place dst_place, void* dst, void Copy<pten::Place, pten::CPUPlace>(pten::Place dst_place, void* dst,
pten::CPUPlace src_place, pten::CPUPlace src_place,
const void* src, size_t num, const void* src, size_t num,
gpuStream_t stream) { void* stream) {
Copy(dst_place, dst, pten::Place(src_place.GetType()), src, num, stream); Copy(dst_place, dst, pten::Place(src_place.GetType()), src, num, stream);
} }
...@@ -743,7 +869,7 @@ void Copy<pten::Place, pten::CPUPlace>(pten::Place dst_place, void* dst, ...@@ -743,7 +869,7 @@ void Copy<pten::Place, pten::CPUPlace>(pten::Place dst_place, void* dst,
template <> template <>
void Copy<pten::GPUPlace, pten::Place>(pten::GPUPlace dst_place, void* dst, void Copy<pten::GPUPlace, pten::Place>(pten::GPUPlace dst_place, void* dst,
pten::Place src_place, const void* src, pten::Place src_place, const void* src,
size_t num, gpuStream_t stream) { size_t num, void* stream) {
Copy(pten::Place(dst_place.GetType(), dst_place.GetDeviceId()), dst, Copy(pten::Place(dst_place.GetType(), dst_place.GetDeviceId()), dst,
src_place, src, num, stream); src_place, src, num, stream);
} }
...@@ -753,7 +879,7 @@ template <> ...@@ -753,7 +879,7 @@ template <>
void Copy<pten::Place, pten::GPUPlace>(pten::Place dst_place, void* dst, void Copy<pten::Place, pten::GPUPlace>(pten::Place dst_place, void* dst,
pten::GPUPlace src_place, pten::GPUPlace src_place,
const void* src, size_t num, const void* src, size_t num,
gpuStream_t stream) { void* stream) {
Copy(dst_place, dst, Copy(dst_place, dst,
pten::Place(src_place.GetType(), src_place.GetDeviceId()), src, num, pten::Place(src_place.GetType(), src_place.GetDeviceId()), src, num,
stream); stream);
...@@ -764,7 +890,7 @@ template <> ...@@ -764,7 +890,7 @@ template <>
void Copy<pten::GPUPinnedPlace, pten::Place>(pten::GPUPinnedPlace dst_place, void Copy<pten::GPUPinnedPlace, pten::Place>(pten::GPUPinnedPlace dst_place,
void* dst, pten::Place src_place, void* dst, pten::Place src_place,
const void* src, size_t num, const void* src, size_t num,
gpuStream_t stream) { void* stream) {
Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num, stream); Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num, stream);
} }
...@@ -773,7 +899,7 @@ template <> ...@@ -773,7 +899,7 @@ template <>
void Copy<pten::Place, pten::GPUPinnedPlace>(pten::Place dst_place, void* dst, void Copy<pten::Place, pten::GPUPinnedPlace>(pten::Place dst_place, void* dst,
pten::GPUPinnedPlace src_place, pten::GPUPinnedPlace src_place,
const void* src, size_t num, const void* src, size_t num,
gpuStream_t stream) { void* stream) {
Copy(dst_place, dst, pten::Place(src_place.GetType()), src, num, stream); Copy(dst_place, dst, pten::Place(src_place.GetType()), src, num, stream);
} }
...@@ -800,7 +926,7 @@ void Copy<platform::CPUPlace, platform::MLUPlace>(platform::CPUPlace dst_place, ...@@ -800,7 +926,7 @@ void Copy<platform::CPUPlace, platform::MLUPlace>(platform::CPUPlace dst_place,
void* dst, void* dst,
platform::MLUPlace src_place, platform::MLUPlace src_place,
const void* src, size_t num, const void* src, size_t num,
mluStream stream) { void* stream) {
if (UNLIKELY(num == 0)) return; if (UNLIKELY(num == 0)) return;
platform::SetMLUDeviceId(src_place.device); platform::SetMLUDeviceId(src_place.device);
...@@ -808,7 +934,8 @@ void Copy<platform::CPUPlace, platform::MLUPlace>(platform::CPUPlace dst_place, ...@@ -808,7 +934,8 @@ void Copy<platform::CPUPlace, platform::MLUPlace>(platform::CPUPlace dst_place,
VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place << " by mlu stream(" << stream << ")"; << " to " << dst_place << " by mlu stream(" << stream << ")";
platform::RecordEvent record_event("MLUMemcpyD2HAsync:MLU->CPU"); platform::RecordEvent record_event("MLUMemcpyD2HAsync:MLU->CPU");
platform::MLUMemcpyD2HAsync(dst, src, num, stream); platform::MLUMemcpyD2HAsync(dst, src, num,
reinterpret_cast<mluStream>(stream));
} else { } else {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
static_cast<platform::MLUDeviceContext*>(pool.Get(src_place))->Wait(); static_cast<platform::MLUDeviceContext*>(pool.Get(src_place))->Wait();
...@@ -825,7 +952,7 @@ void Copy<platform::MLUPlace, platform::CPUPlace>(platform::MLUPlace dst_place, ...@@ -825,7 +952,7 @@ void Copy<platform::MLUPlace, platform::CPUPlace>(platform::MLUPlace dst_place,
void* dst, void* dst,
platform::CPUPlace src_place, platform::CPUPlace src_place,
const void* src, size_t num, const void* src, size_t num,
mluStream stream) { void* stream) {
if (UNLIKELY(num == 0)) return; if (UNLIKELY(num == 0)) return;
platform::SetMLUDeviceId(dst_place.device); platform::SetMLUDeviceId(dst_place.device);
...@@ -833,7 +960,8 @@ void Copy<platform::MLUPlace, platform::CPUPlace>(platform::MLUPlace dst_place, ...@@ -833,7 +960,8 @@ void Copy<platform::MLUPlace, platform::CPUPlace>(platform::MLUPlace dst_place,
VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place << " by mlu stream(" << stream << ")"; << " to " << dst_place << " by mlu stream(" << stream << ")";
platform::RecordEvent record_event("MLUMemcpyH2DAsync:CPU->MLU"); platform::RecordEvent record_event("MLUMemcpyH2DAsync:CPU->MLU");
platform::MLUMemcpyH2DAsync(dst, src, num, stream); platform::MLUMemcpyH2DAsync(dst, src, num,
reinterpret_cast<mluStream>(stream));
} else { } else {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
static_cast<platform::MLUDeviceContext*>(pool.Get(src_place))->Wait(); static_cast<platform::MLUDeviceContext*>(pool.Get(src_place))->Wait();
...@@ -850,7 +978,7 @@ void Copy<platform::MLUPlace, platform::MLUPlace>(platform::MLUPlace dst_place, ...@@ -850,7 +978,7 @@ void Copy<platform::MLUPlace, platform::MLUPlace>(platform::MLUPlace dst_place,
void* dst, void* dst,
platform::MLUPlace src_place, platform::MLUPlace src_place,
const void* src, size_t num, const void* src, size_t num,
mluStream stream) { void* stream) {
if (UNLIKELY(num == 0)) return; if (UNLIKELY(num == 0)) return;
if (dst_place == src_place) { if (dst_place == src_place) {
...@@ -860,7 +988,8 @@ void Copy<platform::MLUPlace, platform::MLUPlace>(platform::MLUPlace dst_place, ...@@ -860,7 +988,8 @@ void Copy<platform::MLUPlace, platform::MLUPlace>(platform::MLUPlace dst_place,
<< " to " << dst_place << " by mlu stream(" << stream << ")"; << " to " << dst_place << " by mlu stream(" << stream << ")";
platform::RecordEvent record_event( platform::RecordEvent record_event(
"MLUMemcpyD2DAsync(same_mlu):MLU->MLU"); "MLUMemcpyD2DAsync(same_mlu):MLU->MLU");
platform::MLUMemcpyD2DAsync(dst, src, num, stream); platform::MLUMemcpyD2DAsync(dst, src, num,
reinterpret_cast<mluStream>(stream));
} else { } else {
platform::DeviceContextPool& pool = platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance(); platform::DeviceContextPool::Instance();
...@@ -877,7 +1006,7 @@ void Copy<platform::MLUPlace, platform::MLUPlace>(platform::MLUPlace dst_place, ...@@ -877,7 +1006,7 @@ void Copy<platform::MLUPlace, platform::MLUPlace>(platform::MLUPlace dst_place,
<< " to " << dst_place << " by mlu stream(" << stream << ")"; << " to " << dst_place << " by mlu stream(" << stream << ")";
platform::RecordEvent record_event("MLUMemcpyPeerAsync:MLU->MLU"); platform::RecordEvent record_event("MLUMemcpyPeerAsync:MLU->MLU");
platform::MLUMemcpyPeerAsync(dst, dst_place.device, src, src_place.device, platform::MLUMemcpyPeerAsync(dst, dst_place.device, src, src_place.device,
num, stream); num, reinterpret_cast<mluStream>(stream));
} else { } else {
VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place; << " to " << dst_place;
...@@ -892,7 +1021,7 @@ void Copy<platform::MLUPlace, platform::MLUPlace>(platform::MLUPlace dst_place, ...@@ -892,7 +1021,7 @@ void Copy<platform::MLUPlace, platform::MLUPlace>(platform::MLUPlace dst_place,
template <> template <>
void Copy<pten::Place, pten::Place>(pten::Place dst_place, void* dst, void Copy<pten::Place, pten::Place>(pten::Place dst_place, void* dst,
pten::Place src_place, const void* src, pten::Place src_place, const void* src,
size_t num, mluStream stream) { size_t num, void* stream) {
if (src_place.GetType() == pten::AllocationType::CPU && if (src_place.GetType() == pten::AllocationType::CPU &&
dst_place.GetType() == pten::AllocationType::CPU) { dst_place.GetType() == pten::AllocationType::CPU) {
platform::CPUPlace place_dst, place_src; platform::CPUPlace place_dst, place_src;
...@@ -912,6 +1041,23 @@ void Copy<pten::Place, pten::Place>(pten::Place dst_place, void* dst, ...@@ -912,6 +1041,23 @@ void Copy<pten::Place, pten::Place>(pten::Place dst_place, void* dst,
platform::MLUPlace place_src(src_place.GetDeviceId()); platform::MLUPlace place_src(src_place.GetDeviceId());
platform::MLUPlace place_dst(dst_place.GetDeviceId()); platform::MLUPlace place_dst(dst_place.GetDeviceId());
return Copy(place_dst, dst, place_src, src, num, stream); return Copy(place_dst, dst, place_src, src, num, stream);
#ifdef PADDLE_WITH_CUSTOM_DEVICE
} else if (src_place.GetType() == pten::AllocationType::CPU && // NOLINT
dst_place.GetType() == pten::AllocationType::CUSTOM) {
platform::CPUPlace place_src;
platform::CustomPlace place_dst(dst_place);
return Copy(place_dst, dst, place_src, src, num, stream);
} else if (src_place.GetType() == pten::AllocationType::CUSTOM && // NOLINT
dst_place.GetType() == pten::AllocationType::CPU) {
platform::CustomPlace place_src(src_place);
platform::CPUPlace place_dst;
return Copy(place_dst, dst, place_src, src, num, stream);
} else if (src_place.GetType() == pten::AllocationType::CUSTOM && // NOLINT
dst_place.GetType() == pten::AllocationType::CUSTOM) {
platform::CustomPlace place_src(src_place);
platform::CustomPlace place_dst(dst_place);
return Copy(place_dst, dst, place_src, src, num, stream);
#endif
} }
} }
...@@ -919,7 +1065,7 @@ void Copy<pten::Place, pten::Place>(pten::Place dst_place, void* dst, ...@@ -919,7 +1065,7 @@ void Copy<pten::Place, pten::Place>(pten::Place dst_place, void* dst,
template <> template <>
void Copy<pten::MLUPlace, pten::Place>(pten::MLUPlace dst_place, void* dst, void Copy<pten::MLUPlace, pten::Place>(pten::MLUPlace dst_place, void* dst,
pten::Place src_place, const void* src, pten::Place src_place, const void* src,
size_t num, mluStream stream) { size_t num, void* stream) {
Copy(pten::Place(dst_place.GetType(), dst_place.GetDeviceId()), dst, Copy(pten::Place(dst_place.GetType(), dst_place.GetDeviceId()), dst,
src_place, src, num, stream); src_place, src, num, stream);
} }
...@@ -929,7 +1075,7 @@ template <> ...@@ -929,7 +1075,7 @@ template <>
void Copy<pten::Place, pten::MLUPlace>(pten::Place dst_place, void* dst, void Copy<pten::Place, pten::MLUPlace>(pten::Place dst_place, void* dst,
pten::MLUPlace src_place, pten::MLUPlace src_place,
const void* src, size_t num, const void* src, size_t num,
mluStream stream) { void* stream) {
Copy(dst_place, dst, Copy(dst_place, dst,
pten::Place(src_place.GetType(), src_place.GetDeviceId()), src, num, pten::Place(src_place.GetType(), src_place.GetDeviceId()), src, num,
stream); stream);
...@@ -939,7 +1085,7 @@ void Copy<pten::Place, pten::MLUPlace>(pten::Place dst_place, void* dst, ...@@ -939,7 +1085,7 @@ void Copy<pten::Place, pten::MLUPlace>(pten::Place dst_place, void* dst,
template <> template <>
void Copy<pten::CPUPlace, pten::Place>(pten::CPUPlace dst_place, void* dst, void Copy<pten::CPUPlace, pten::Place>(pten::CPUPlace dst_place, void* dst,
pten::Place src_place, const void* src, pten::Place src_place, const void* src,
size_t num, mluStream stream) { size_t num, void* stream) {
Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num, stream); Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num, stream);
} }
...@@ -948,7 +1094,7 @@ template <> ...@@ -948,7 +1094,7 @@ template <>
void Copy<pten::Place, pten::CPUPlace>(pten::Place dst_place, void* dst, void Copy<pten::Place, pten::CPUPlace>(pten::Place dst_place, void* dst,
pten::CPUPlace src_place, pten::CPUPlace src_place,
const void* src, size_t num, const void* src, size_t num,
mluStream stream) { void* stream) {
Copy(dst_place, dst, pten::Place(src_place.GetType()), src, num, stream); Copy(dst_place, dst, pten::Place(src_place.GetType()), src, num, stream);
} }
...@@ -1013,7 +1159,7 @@ void Copy<pten::Place, pten::Place>(pten::Place dst_place, void* dst, ...@@ -1013,7 +1159,7 @@ void Copy<pten::Place, pten::Place>(pten::Place dst_place, void* dst,
} }
#endif #endif
#ifdef PADDLE_WITH_IPU #ifdef PADDLE_WITH_IPU
else if (src_place.GetType() == pten::AllocationType::CPU && else if (src_place.GetType() == pten::AllocationType::CPU && // NOLINT
dst_place.GetType() == pten::AllocationType::IPU) { dst_place.GetType() == pten::AllocationType::IPU) {
platform::IPUPlace place_dst(dst_place.GetDeviceId()); platform::IPUPlace place_dst(dst_place.GetDeviceId());
platform::CPUPlace place_src; platform::CPUPlace place_src;
...@@ -1048,5 +1194,48 @@ void Copy<pten::CPUPlace, pten::Place>(pten::CPUPlace dst_place, void* dst, ...@@ -1048,5 +1194,48 @@ void Copy<pten::CPUPlace, pten::Place>(pten::CPUPlace dst_place, void* dst,
Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num); Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num);
} }
#if defined(PADDLE_WITH_CUSTOM_DEVICE) && !defined(PADDLE_WITH_CUDA) && \
!defined(PADDLE_WITH_ASCEND_CL) && !defined(PADDLE_WITH_HIP) && \
!defined(PADDLE_WITH_MLU)
/**
 * \brief Runtime-dispatched copy between two type-erased places, used when
 *        the custom-device backend is the only device backend compiled in.
 *
 * The allocation types of \p src_place and \p dst_place are inspected at run
 * time and the call is forwarded to the statically-typed Copy overload for
 * that pair.
 *
 * \param[in] dst_place Destination place (CPU or CUSTOM).
 * \param[in] dst       Destination memory address.
 * \param[in] src_place Source place (CPU or CUSTOM).
 * \param[in] src       Source memory address.
 * \param[in] num       Memory size in bytes to copy.
 * \param[in] stream    Opaque stream handle forwarded to the typed overload.
 *
 * \note A CPU -> CPU request is served by the stream-less host copy, matching
 *       the CUDA and MLU variants of this overload; previously it was
 *       silently ignored here. Any other pair of allocation types still falls
 *       through without copying.
 */
template <>
void Copy<pten::Place, pten::Place>(pten::Place dst_place, void* dst,
                                    pten::Place src_place, const void* src,
                                    size_t num, void* stream) {
  if (src_place.GetType() == pten::AllocationType::CPU &&
      dst_place.GetType() == pten::AllocationType::CPU) {
    // Host-to-host: no device stream is involved, so use the plain overload.
    platform::CPUPlace place_dst, place_src;
    return Copy(place_dst, dst, place_src, src, num);
  } else if (src_place.GetType() == pten::AllocationType::CPU &&  // NOLINT
             dst_place.GetType() == pten::AllocationType::CUSTOM) {
    platform::CPUPlace place_src;
    platform::CustomPlace place_dst(dst_place);
    return Copy(place_dst, dst, place_src, src, num, stream);
  } else if (src_place.GetType() == pten::AllocationType::CUSTOM &&  // NOLINT
             dst_place.GetType() == pten::AllocationType::CPU) {
    platform::CustomPlace place_src(src_place);
    platform::CPUPlace place_dst;
    return Copy(place_dst, dst, place_src, src, num, stream);
  } else if (src_place.GetType() == pten::AllocationType::CUSTOM &&  // NOLINT
             dst_place.GetType() == pten::AllocationType::CUSTOM) {
    platform::CustomPlace place_src(src_place);
    platform::CustomPlace place_dst(dst_place);
    return Copy(place_dst, dst, place_src, src, num, stream);
  }
  // NOTE(review): unsupported (src, dst) combinations reach this point and
  // return without copying anything.
}
// Adapter for a statically-typed CPU destination: wraps the destination into
// a type-erased pten::Place and defers to the Place/Place overload, which
// picks the concrete copy from the allocation types at run time.
template <>
void Copy<pten::CPUPlace, pten::Place>(pten::CPUPlace dst_place, void* dst,
                                       pten::Place src_place, const void* src,
                                       size_t num, void* stream) {
  const pten::Place erased_dst(dst_place.GetType());
  Copy(erased_dst, dst, src_place, src, num, stream);
}
// Adapter for a statically-typed CPU source: wraps the source into a
// type-erased pten::Place and defers to the Place/Place overload, which
// picks the concrete copy from the allocation types at run time.
template <>
void Copy<pten::Place, pten::CPUPlace>(pten::Place dst_place, void* dst,
                                       pten::CPUPlace src_place,
                                       const void* src, size_t num,
                                       void* stream) {
  const pten::Place erased_src(src_place.GetType());
  Copy(dst_place, dst, erased_src, src, num, stream);
}
#endif
} // namespace memory } // namespace memory
} // namespace paddle } // namespace paddle
...@@ -36,66 +36,25 @@ namespace memory { ...@@ -36,66 +36,25 @@ namespace memory {
template <typename DstPlace, typename SrcPlace> template <typename DstPlace, typename SrcPlace>
void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num); void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
/**
 * \brief Copy memory from one place to another place.
 *
 * \param[in] DstPlace Destination allocation place (CPU or GPU).
 * \param[in] dst Destination memory address.
 * \param[in] SrcPlace Source allocation place (CPU or GPU).
 * \param[in] src Source memory address.
 * \param[in] num Memory size in bytes to copy.
 * \param[in] stream CUDA stream.
 *
 * \note For a GPU memory copy, a CUDA stream needs to be specified
 *       to perform the copy asynchronously.
 *
 */
template <typename DstPlace, typename SrcPlace>
void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num,
gpuStream_t stream);
#endif
#ifdef PADDLE_WITH_ASCEND_CL
/**
 * \brief Copy memory from one place to another place.
 *
 * \param[in] DstPlace Destination allocation place (CPU or NPU).
 * \param[in] dst Destination memory address.
 * \param[in] SrcPlace Source allocation place (CPU or NPU).
 * \param[in] src Source memory address.
 * \param[in] num Memory size in bytes to copy.
 * \param[in] stream NPU stream.
 *
 * \note For an NPU memory copy, an NPU stream needs to be specified
 *       to perform the copy asynchronously.
 *
 */
template <typename DstPlace, typename SrcPlace>
void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num,
aclrtStream stream);
#endif
#ifdef PADDLE_WITH_MLU
/** /**
* \brief Copy memory from one place to another place. * \brief Copy memory from one place to another place.
* *
* \param[in] DstPlace Destination allocation place (CPU or MLU). * \param[in] DstPlace Destination allocation place (CPU or GPU or XPU or
* CustomDevice).
* \param[in] dst Destination memory address. * \param[in] dst Destination memory address.
* \param[in] SrcPlace Source allocation place (CPU or MLU). * \param[in] SrcPlace Source allocation place (CPU or GPU or XPU or
* CustomDevice).
* \param[in] src Source memory address. * \param[in] src Source memory address.
* \param[in] num memory size in bytes to copy. * \param[in] num memory size in bytes to copy.
* \param[in] stream MLU stream. * \param[in] stream stream for asynchronously memory copy.
* *
* \note For MLU memory copy, MLU stream need to be specified * \note For GPU/XPU/CustomDevice memory copy, stream need to be specified
* for asynchronously memory copy. * for asynchronously memory copy, and type is restored in the
* implementation.
* *
*/ */
template <typename DstPlace, typename SrcPlace> template <typename DstPlace, typename SrcPlace>
void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num, void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num,
mluStream stream); void* stream);
#endif
} // namespace memory } // namespace memory
} // namespace paddle } // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/math_function.h"
#ifdef PADDLE_WITH_MKLML
#include "paddle/fluid/platform/dynload/mklml.h"
#endif
#ifdef PADDLE_USE_OPENBLAS
#include <cblas.h>
#endif
#include <memory>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/operators/math/math_function_impl.h"
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/pten/backends/cpu/cpu_context.h"
#include "paddle/pten/kernels/funcs/eigen/common.h"
#include "unsupported/Eigen/CXX11/Tensor"
namespace paddle {
namespace operators {
namespace math {
using float16 = paddle::platform::float16;
// Explicit instantiations of SetConstant for platform::CPUDeviceContext, one
// per element type listed below.
template struct SetConstant<platform::CPUDeviceContext, platform::float16>;
template struct SetConstant<platform::CPUDeviceContext, platform::bfloat16>;
template struct SetConstant<platform::CPUDeviceContext, float>;
template struct SetConstant<platform::CPUDeviceContext, double>;
template struct SetConstant<platform::CPUDeviceContext, int16_t>;
template struct SetConstant<platform::CPUDeviceContext, int>;
template struct SetConstant<platform::CPUDeviceContext, int64_t>;
template struct SetConstant<platform::CPUDeviceContext, bool>;
template struct SetConstant<platform::CPUDeviceContext, uint8_t>;
template struct SetConstant<platform::CPUDeviceContext,
platform::complex<float>>;
template struct SetConstant<platform::CPUDeviceContext,
platform::complex<double>>;
// The same element types, instantiated for pten::CPUContext.
template struct SetConstant<pten::CPUContext, platform::float16>;
template struct SetConstant<pten::CPUContext, platform::bfloat16>;
template struct SetConstant<pten::CPUContext, float>;
template struct SetConstant<pten::CPUContext, double>;
template struct SetConstant<pten::CPUContext, int16_t>;
template struct SetConstant<pten::CPUContext, int>;
template struct SetConstant<pten::CPUContext, int64_t>;
template struct SetConstant<pten::CPUContext, bool>;
template struct SetConstant<pten::CPUContext, uint8_t>;
template struct SetConstant<pten::CPUContext, platform::complex<float>>;
template struct SetConstant<pten::CPUContext, platform::complex<double>>;
#ifdef PADDLE_WITH_XPU
// XPU instantiations; only compiled when PADDLE_WITH_XPU is defined.
template struct SetConstant<platform::XPUDeviceContext, platform::float16>;
template struct SetConstant<platform::XPUDeviceContext, platform::bfloat16>;
template struct SetConstant<platform::XPUDeviceContext, float>;
template struct SetConstant<platform::XPUDeviceContext, double>;
template struct SetConstant<platform::XPUDeviceContext, uint8_t>;
template struct SetConstant<platform::XPUDeviceContext, int16_t>;
template struct SetConstant<platform::XPUDeviceContext, int>;
template struct SetConstant<platform::XPUDeviceContext, int64_t>;
template struct SetConstant<platform::XPUDeviceContext, bool>;
template struct SetConstant<platform::XPUDeviceContext,
platform::complex<float>>;
template struct SetConstant<platform::XPUDeviceContext,
platform::complex<double>>;
#endif
// DEFINE_CPU_TRANS(RANK) explicitly instantiates Transpose for
// platform::CPUDeviceContext at the given RANK, once per element type listed
// in the macro body. It is expanded below for ranks 1 through 6.
#define DEFINE_CPU_TRANS(RANK) \
template struct Transpose<platform::CPUDeviceContext, platform::float16, \
RANK>; \
template struct Transpose<platform::CPUDeviceContext, platform::bfloat16, \
RANK>; \
template struct Transpose<platform::CPUDeviceContext, float, RANK>; \
template struct Transpose<platform::CPUDeviceContext, double, RANK>; \
template struct Transpose<platform::CPUDeviceContext, int, RANK>; \
template struct Transpose<platform::CPUDeviceContext, int64_t, RANK>; \
template struct Transpose<platform::CPUDeviceContext, bool, RANK>; \
template struct Transpose<platform::CPUDeviceContext, int16_t, RANK>; \
template struct Transpose<platform::CPUDeviceContext, uint8_t, RANK>; \
template struct Transpose<platform::CPUDeviceContext, int8_t, RANK>; \
template struct Transpose<platform::CPUDeviceContext, \
platform::complex<float>, RANK>; \
template struct Transpose<platform::CPUDeviceContext, \
platform::complex<double>, RANK>;
DEFINE_CPU_TRANS(1);
DEFINE_CPU_TRANS(2);
DEFINE_CPU_TRANS(3);
DEFINE_CPU_TRANS(4);
DEFINE_CPU_TRANS(5);
DEFINE_CPU_TRANS(6);
// CPU transpose for an arbitrary permutation.
//
// For every linear offset of `out`, the offset is decomposed into per-axis
// coordinates using the strides of `out`; each coordinate is then weighted by
// the stride of the input axis it was permuted from (axis[d]) to find the
// source element.
//
// \param context  CPU device context (not used by this implementation).
// \param in       Source tensor.
// \param out      Destination tensor; its dims determine how many elements
//                 are written.
// \param axis     Permutation: output axis d reads input axis axis[d].
template <typename T>
struct TransposeNormal<platform::CPUDeviceContext, T> {
  void operator()(const platform::CPUDeviceContext& context,
                  const framework::Tensor& in, framework::Tensor* out,
                  const std::vector<int>& axis) {
    const int rank = axis.size();
    auto src_strides = framework::stride(in.dims());
    auto dst_strides = framework::stride(out->dims());
    const T* src = in.data<T>();
    T* dst = out->data<T>();

    const int64_t total = out->numel();
    for (int64_t dst_offset = 0; dst_offset < total; ++dst_offset) {
      int64_t remaining = dst_offset;
      int64_t src_offset = 0;
      // Peel off one output coordinate per axis and map it to the input.
      for (int d = 0; d < rank; ++d) {
        const int64_t coord = remaining / dst_strides[d];
        remaining -= coord * dst_strides[d];
        src_offset += coord * src_strides[axis[d]];
      }
      dst[dst_offset] = src[src_offset];
    }
  }
};
// define transpose normal
// DEFINE_CPU_TRANS_NORMAL(TYPE) explicitly instantiates the CPU
// TransposeNormal specialization for one element type; the list below
// enumerates every type that is instantiated in this translation unit.
#define DEFINE_CPU_TRANS_NORMAL(TYPE) \
template struct TransposeNormal<platform::CPUDeviceContext, TYPE>
DEFINE_CPU_TRANS_NORMAL(platform::float16);
DEFINE_CPU_TRANS_NORMAL(platform::bfloat16);
DEFINE_CPU_TRANS_NORMAL(float);
DEFINE_CPU_TRANS_NORMAL(double);
DEFINE_CPU_TRANS_NORMAL(int);
DEFINE_CPU_TRANS_NORMAL(int64_t);
DEFINE_CPU_TRANS_NORMAL(bool);
DEFINE_CPU_TRANS_NORMAL(int16_t);
DEFINE_CPU_TRANS_NORMAL(uint8_t);
DEFINE_CPU_TRANS_NORMAL(int8_t);
DEFINE_CPU_TRANS_NORMAL(platform::complex<float>);
DEFINE_CPU_TRANS_NORMAL(platform::complex<double>);
// Data-type visitor that fills a tensor with a constant on the CPU.
//
// apply<T>() obtains a mutable CPU buffer of element type T from the tensor
// and writes `value_` (converted to T) into every one of its numel() slots.
struct TensorSetConstantCPU {
  TensorSetConstantCPU(framework::Tensor* tensor, float value)
      : tensor_(tensor), value_(value) {}

  template <typename T>
  void apply() const {
    auto* first = tensor_->mutable_data<T>(platform::CPUPlace());
    auto* last = first + tensor_->numel();
    std::fill(first, last, static_cast<T>(value_));
  }

  framework::Tensor* tensor_;  // tensor to fill
  float value_;                // fill value before conversion to T
};
// Specializations of set_constant_with_place, one per place type.
// The XPU, NPU, NPUPinned, IPU, MLU and Custom specializations throw an
// Unimplemented error; the CPU and CUDAPinned specializations fill the tensor
// through TensorSetConstantCPU, dispatched on the tensor's data type.

// XPUPlace: not supported, always throws Unimplemented.
template <>
void set_constant_with_place<platform::XPUPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
float value) {
PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported"));
}
// NPUPlace: not supported, always throws Unimplemented.
template <>
void set_constant_with_place<platform::NPUPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
float value) {
PADDLE_THROW(platform::errors::Unimplemented("NPUPlace is not supported"));
}
// NPUPinnedPlace: not supported, always throws Unimplemented.
template <>
void set_constant_with_place<platform::NPUPinnedPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
float value) {
PADDLE_THROW(
platform::errors::Unimplemented("NPUPinnedPlace is not supported"));
}
// IPUPlace: not supported, always throws Unimplemented.
template <>
void set_constant_with_place<platform::IPUPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
float value) {
PADDLE_THROW(platform::errors::Unimplemented("IPUPlace is not supported"));
}
// CPUPlace: fill the tensor on the host; `context` is not used.
template <>
void set_constant_with_place<platform::CPUPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
float value) {
framework::VisitDataType(tensor->type(), TensorSetConstantCPU(tensor, value));
}
// MLUPlace: not supported, always throws Unimplemented.
template <>
void set_constant_with_place<platform::MLUPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
float value) {
PADDLE_THROW(platform::errors::Unimplemented("MLUPlace is not supported"));
}
// CustomPlace: not supported, always throws Unimplemented.
template <>
void set_constant_with_place<platform::CustomPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
float value) {
PADDLE_THROW(platform::errors::Unimplemented("CustomPlace is not supported"));
}
// CUDAPinnedPlace: handled with the same CPU fill visitor as CPUPlace.
template <>
void set_constant_with_place<platform::CUDAPinnedPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
float value) {
framework::VisitDataType(tensor->type(), TensorSetConstantCPU(tensor, value));
}
// Place visitor: when invoked with a concrete place object it forwards to the
// set_constant_with_place specialization for that place type, passing along
// the stored device context, tensor and fill value. The `place` argument is
// used only to select the specialization through its type.
struct TensorSetConstantWithPlace : public boost::static_visitor<void> {
TensorSetConstantWithPlace(const platform::DeviceContext& context,
framework::Tensor* tensor, float value)
: context_(context), tensor_(tensor), value_(value) {}
template <typename Place>
void operator()(Place place) const {
set_constant_with_place<Place>(context_, tensor_, value_);
}
// Device context handed to the place-specific implementation.
const platform::DeviceContext& context_;
// Tensor to be filled.
framework::Tensor* tensor_;
// Constant written to every element.
float value_;
};
// Fills `tensor` with `value`.
//
// In CUDA/HIP builds the fill is dispatched on the tensor's actual place via
// VisitPlace; in every other build the CPU implementation is invoked
// directly, regardless of the tensor's place.
void set_constant(const platform::DeviceContext& context,
                  framework::Tensor* tensor, float value) {
  TensorSetConstantWithPlace visitor(context, tensor, value);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  // tensor->place().apply_visitor(func);
  paddle::platform::VisitPlace(tensor->place(), visitor);
#else
  visitor(platform::CPUPlace());
#endif
}
// CPU implementation of RowwiseAdd: adds `vector` to every row of `input`
// and stores the result in `output`.
//
// \param context  CPU device context (not used by this implementation).
// \param input    Source tensor, viewed as a matrix through
//                 EigenMatrix::From; its first dimension is the row count.
// \param vector   Tensor whose numel() must equal input.numel() / dims[0].
// \param output   Destination tensor; must have the same dims as `input`.
//
// Raises InvalidArgument when the vector length or the output shape does not
// match.
template <typename T>
struct RowwiseAdd<platform::CPUDeviceContext, T> {
  void operator()(const platform::CPUDeviceContext& context,
                  const framework::Tensor& input,
                  const framework::Tensor& vector, framework::Tensor* output) {
    auto in_dims = input.dims();
    auto out_dims = output->dims();
    auto size = input.numel() / in_dims[0];
    PADDLE_ENFORCE_EQ(
        vector.numel(), size,
        platform::errors::InvalidArgument(
            "The input vector size"
            " should be equal to the size of each row of input tensor."
            " Expected vector size=%d, but received %d",
            size, vector.numel()));
    // Keep the formatted shapes alive in named locals. Taking c_str() directly
    // on the temporary returned by to_str() leaves a pointer that dangles as
    // soon as the full expression ends, i.e. before the message is formatted.
    const auto in_dims_str = in_dims.to_str();
    const auto out_dims_str = out_dims.to_str();
    PADDLE_ENFORCE_EQ(out_dims, in_dims,
                      platform::errors::InvalidArgument(
                          "The output tensor shape should be same as the input"
                          " tensor shape. Expected output tensor shape: %s,"
                          " but received %s",
                          in_dims_str.c_str(), out_dims_str.c_str()));

    auto in = framework::EigenMatrix<T>::From(input);
    auto vec = framework::EigenVector<T>::Flatten(vector);
    auto out = framework::EigenMatrix<T>::From(*output);

    // Row i of the output is row i of the input plus the vector.
    for (int64_t i = 0; i < in_dims[0]; ++i) {
      out.chip(i, 0) = in.chip(i, 0) + vec;
    }
  }
};
// Explicit CPU instantiations of the row-wise / column-wise helpers for the
// element types listed below.
template struct RowwiseAdd<platform::CPUDeviceContext, float>;
template struct RowwiseAdd<platform::CPUDeviceContext, double>;
template struct ColwiseSum<platform::CPUDeviceContext, float>;
template struct ColwiseSum<platform::CPUDeviceContext, double>;
template struct ColwiseSum<platform::CPUDeviceContext, int>;
template struct ColwiseSum<platform::CPUDeviceContext, int64_t>;
template struct RowwiseSum<platform::CPUDeviceContext, float>;
template struct RowwiseSum<platform::CPUDeviceContext, double>;
template struct RowwiseMean<platform::CPUDeviceContext, float>;
template struct RowwiseMean<platform::CPUDeviceContext, double>;
// CPU implementation of ElementwiseAddTo: accumulates `src` into `dst`
// element by element (dst = dst + src), evaluated on the Eigen device owned
// by `ctx`. Both tensors are viewed as flat vectors.
template <typename T>
struct ElementwiseAddTo<platform::CPUDeviceContext, T> {
  void operator()(platform::CPUDeviceContext* ctx, const framework::Tensor& src,
                  framework::Tensor* dst) {
    auto addend = framework::EigenVector<T>::Flatten(src);
    auto accum = framework::EigenVector<T>::Flatten(*dst);
    auto& eigen_dev = *(ctx->eigen_device());
    accum.device(eigen_dev) = accum + addend;
  }
};

// Only the float16 variant is explicitly instantiated here.
template struct ElementwiseAddTo<platform::CPUDeviceContext, platform::float16>;
} // namespace math
} // namespace operators
} // namespace paddle
# Device runtime helper libraries (callback manager, device guard, stream,
# event and the device base). They are only declared when WITH_CUSTOM_DEVICE
# is enabled.
IF(WITH_CUSTOM_DEVICE)
cc_library(callback_manager SRCS callback_manager.cc DEPS enforce place)
cc_library(device_guard SRCS device_guard.cc DEPS enforce place)
cc_library(stream SRCS stream.cc DEPS callback_manager)
cc_library(event SRCS event.cc DEPS enforce place)
cc_library(device_base SRCS device_base.cc DEPS stream event callback_manager device_guard device_context flags)
ENDIF()
# NOTE(review): DEV_LIBS is set to custom_device unconditionally, while the
# custom_device target is only declared under WITH_CUSTOM_DEVICE -- confirm
# that consumers of DEV_LIBS tolerate the target being absent.
set(DEV_LIBS custom_device)
# GPU # GPU
IF(WITH_GPU OR WITH_ROCM) IF(WITH_GPU OR WITH_ROCM)
add_subdirectory(gpu) add_subdirectory(gpu)
...@@ -22,3 +37,11 @@ ENDIF() ...@@ -22,3 +37,11 @@ ENDIF()
IF(WITH_MLU) IF(WITH_MLU)
add_subdirectory(mlu) add_subdirectory(mlu)
ENDIF() ENDIF()
# CUSTOM
# When custom device support is enabled, build the `custom` subdirectory,
# declare the device_manager library on top of custom_device, and publish both
# names through the cached GLOB_DEV_LIB variable.
IF(WITH_CUSTOM_DEVICE)
add_subdirectory(custom)
cc_library(device_manager SRCS device_manager.cc DEPS custom_device)
set(GLOB_DEV_LIB device_manager custom_device CACHE INTERNAL "Global DEV library")
ENDIF()
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/callback_manager.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
// Binds the manager to `stream` (stored as a non-owning pointer) and
// constructs the internal thread pool with an argument of 1 (presumably a
// single worker thread -- confirm against ThreadPool).
CallbackManager::CallbackManager(stream::Stream *stream)
: stream_(stream), thread_pool_(1) {}
// Registers `callback` with the device that owns stream_.
//
// The device is not given the user callback directly. It receives a small
// trampoline which, when invoked, takes mtx_, enqueues the user callback on
// thread_pool_ and remembers the resulting future in last_future_ so that
// Wait() can block on it.
void CallbackManager::AddCallback(std::function<void()> callback) const {
// Heap-allocate the user callback; the pool task below takes ownership of it
// through a unique_ptr and frees it after running it.
auto *callback_func = new std::function<void()>(std::move(callback));
// Trampoline handed to the device.
auto *func = new std::function<void()>([this, callback_func] {
std::lock_guard<std::mutex> lock(mtx_);
last_future_ = thread_pool_.enqueue([callback_func] {
std::unique_ptr<std::function<void()>> releaser(callback_func);
(*callback_func)();
});
});
// NOTE(review): `func` is not deleted here; the device-side AddCallback is
// presumably responsible for freeing it once it has been invoked. If that
// call throws, both heap objects appear to leak -- confirm.
platform::DeviceManager::GetDeviceWithPlace(stream_->GetPlace())
->AddCallback(stream_, func);
}
// Blocks until the stream has been synchronized and the most recently
// enqueued callback task (if any) has finished.
void CallbackManager::Wait() const {
  // First synchronize the stream through its device.
  platform::DeviceManager::GetDeviceWithPlace(stream_->GetPlace())
      ->SynchronizeStream(stream_);

  // Then wait for the last callback task, holding mtx_ while doing so.
  std::lock_guard<std::mutex> guard(mtx_);
  if (!last_future_.valid()) {
    return;
  }
  last_future_.wait();
}
} // namespace platform
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <ThreadPool.h>
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#include <cuda_runtime.h>
#endif
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif
#include <functional>
#include <future> // NOLINT
#include <memory>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
namespace stream {
class Stream;
} // namespace stream
// NOTE(zjl): clean CallbackManager to make compilation faster
// Make CallbackManager thread-safe
//
// Holds a stream pointer, a thread pool, a mutex and the future of the most
// recently enqueued callback task. All state except the stream pointer is
// `mutable` because both public operations are declared const.
class CallbackManager {
public:
// Binds the manager to `stream`; the pointer is stored, not owned.
explicit CallbackManager(stream::Stream* stream);
~CallbackManager() = default;
// Registers `callback` for the bound stream.
void AddCallback(std::function<void()> callback) const;
// Waits for the bound stream and for the last enqueued callback task.
void Wait() const;
private:
// Stream this manager is attached to.
stream::Stream* stream_;
// Pool on which user callbacks are executed.
mutable ::ThreadPool thread_pool_;
// Guards last_future_.
mutable std::mutex mtx_;
// Future of the most recently enqueued callback task.
mutable std::future<void> last_future_;
};
} // namespace platform
} // namespace paddle
# Custom device plug-in runtime and its unit test; both are only declared when
# WITH_CUSTOM_DEVICE is enabled.
IF(WITH_CUSTOM_DEVICE)
cc_library(custom_device SRCS custom_device.cc DEPS device_base device_context)
cc_test(custom_device_test SRCS custom_device_test.cc DEPS device_manager device_context )
ENDIF()
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/device_base.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device/event.h"
#include "paddle/fluid/platform/device/stream.h"
#include "paddle/fluid/platform/device_context.h"
// Two C_Device_st values compare equal when their `id` fields are equal; no
// other field takes part in the comparison.
static bool operator==(const C_Device_st& lhs, const C_Device_st& rhs) {
  const bool same_id = (lhs.id == rhs.id);
  return same_id;
}
namespace paddle {
namespace platform {
// DeviceInterface implementation backed by a plug-in supplied table of C
// function pointers (C_DeviceInterface).
//
// Every operation looks up the C_Device_st registered for `dev_id` in
// devices_pool and forwards to the corresponding plug-in entry point.
// Entry points that the plug-in may leave unset are null-checked before use
// and either fall back to a generic implementation or raise Unavailable.
//
// The constructor calls Initialize() and the destructor calls Finalize(),
// which also dlclose()s the plug-in's shared-object handle when one was
// supplied.
class CustomDevice : public DeviceInterface {
 public:
  // \param type        Device type name, forwarded to DeviceInterface.
  // \param priority    Forwarded to DeviceInterface.
  // \param is_custom   Forwarded to DeviceInterface.
  // \param pimpl       Plug-in function table; ownership is taken.
  // \param dso_handle  Handle passed to dlclose() in Finalize(); may be null.
  CustomDevice(const std::string& type, int priority, bool is_custom,
               std::unique_ptr<C_DeviceInterface> pimpl, void* dso_handle)
      : DeviceInterface(type, priority, is_custom),
        pimpl_(std::move(pimpl)),
        dso_handle_(dso_handle) {
    Initialize();
  }

  ~CustomDevice() override { Finalize(); }

  // Returns the device count reported by the plug-in, or 0 when the query
  // fails.
  size_t GetDeviceCount() override {
    size_t count = 0;  // defined value even if the plug-in never writes it
    if (pimpl_->get_device_count(&count) != C_SUCCESS) {
      count = 0;
    }
    return count;
  }

  // Returns the device ids reported by the plug-in.
  // NOTE(review): the status returned by get_device_list is ignored; on
  // failure the vector keeps its value-initialized (zero) entries.
  std::vector<size_t> GetDeviceList() override {
    size_t count = GetDeviceCount();
    std::vector<size_t> devices(count);
    pimpl_->get_device_list(devices.data());
    return devices;
  }

  // Raw access to the plug-in function table.
  C_DeviceInterface* Impl() { return pimpl_.get(); }

  void SynchronizeDevice(size_t dev_id) override {
    const auto device = &devices_pool[dev_id];

    PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->synchronize_device(device));
  }

  // Runs the optional plug-in `initialize` hook (terminating the process on
  // failure), then registers and initializes every device the plug-in lists.
  void Initialize() override {
    if (pimpl_->initialize && pimpl_->initialize() != C_SUCCESS) {
      LOG(ERROR) << "Initialize " << Type() << " Failed\n";
      exit(-1);
    }
    auto devices = GetDeviceList();
    for (auto dev_id : devices) {
      C_Device_st device;
      device.id = dev_id;
      devices_pool[dev_id] = device;
      InitDevice(dev_id);
    }
  }

  // De-initializes every listed device, runs the optional plug-in `finalize`
  // hook, closes the shared-object handle, and terminates the process if the
  // hook reported a failure.
  void Finalize() override {
    auto devices = GetDeviceList();
    for (auto dev_id : devices) {
      // SetDevice(dev_id);
      // SynchronizeDevice(dev_id);
      DeInitDevice(dev_id);
    }

    bool ok = true;
    if (pimpl_->finalize && pimpl_->finalize() != C_SUCCESS) {
      LOG(ERROR) << "Finalize " << Type() << " Failed\n";
      ok = false;
    }
    if (dso_handle_) {
      dlclose(dso_handle_);
      dso_handle_ = nullptr;
    }
    if (!ok) {
      exit(1);
    }
  }

  void InitDevice(size_t dev_id) override {
    if (pimpl_->init_device) {
      // Core set logical id, and Plugin replace it with physical id
      const auto device = &devices_pool[dev_id];
      PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->init_device(device));
    }
  }

  void DeInitDevice(size_t dev_id) override {
    if (pimpl_->deinit_device) {
      const auto device = &devices_pool[dev_id];
      PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->deinit_device(device));
    }
  }

  void SetDevice(size_t dev_id) override {
    const auto device = &devices_pool[dev_id];
    PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->set_device(device));
  }

  // Returns the id of the device the plug-in reports as current.
  int GetDevice() override {
    C_Device_st device;
    PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->get_device(&device));
    return device.id;
  }

  // Creates a plug-in stream on `dev_id` and stores it in `stream`.
  // Only the default priority and flag are accepted; anything else raises
  // Unavailable.
  void CreateStream(size_t dev_id, stream::Stream* stream,
                    const stream::Stream::Priority& priority =
                        stream::Stream::Priority::kNormal,
                    const stream::Stream::Flag& flag =
                        stream::Stream::Flag::kDefaultFlag) override {
    if (priority != stream::Stream::Priority::kNormal ||
        flag != stream::Stream::Flag::kDefaultFlag) {
      PADDLE_THROW(platform::errors::Unavailable(
          "priority != stream::Stream::Priority::kNormal || flag != "
          "stream::Stream::Flag::kDefaultFlag is not allowed on "
          "CustomDevice."));
    }
    const auto device = &devices_pool[dev_id];
    C_Stream c_stream;
    PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
        pimpl_->create_stream(device, &c_stream));
    stream->set_stream(c_stream);
  }

  void DestroyStream(size_t dev_id, stream::Stream* stream) override {
    const auto device = &devices_pool[dev_id];

    PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->destroy_stream(
        device, reinterpret_cast<C_Stream>(stream->raw_stream())));
  }

  void SynchronizeStream(size_t dev_id, const stream::Stream* stream) override {
    const auto device = &devices_pool[dev_id];

    PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->synchronize_stream(
        device, reinterpret_cast<C_Stream>(stream->raw_stream())));
  }

  // Returns true when the plug-in reports the stream query as successful.
  // Without a `query_stream` hook the stream is synchronized instead and true
  // is returned.
  bool QueryStream(size_t dev_id, const stream::Stream* stream) override {
    const auto device = &devices_pool[dev_id];

    if (!pimpl_->query_stream) {
      SynchronizeStream(dev_id, stream);
      return true;
    }
    if (pimpl_->query_stream(device, reinterpret_cast<C_Stream>(
                                         stream->raw_stream())) == C_SUCCESS) {
      return true;
    }
    return false;
  }

  // Registers `callback` on `stream` through the plug-in. The trampoline
  // passed to the plug-in takes ownership of `callback` and deletes it after
  // invoking it. Raises Unavailable when the plug-in has no
  // `stream_add_callback` hook.
  // NOTE(review): on the Unavailable path `callback` is not freed here --
  // confirm who owns it in that case.
  void AddCallback(size_t dev_id, stream::Stream* stream,
                   stream::Stream::Callback* callback) override {
    if (!pimpl_->stream_add_callback) {
      PADDLE_THROW(platform::errors::Unavailable(
          "AddCallback is not supported on %s.", Type()));
    } else {
      const auto device = &devices_pool[dev_id];
      PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->stream_add_callback(
          device, reinterpret_cast<C_Stream>(stream->raw_stream()),
          [](C_Device device, C_Stream stream, void* user_data,
             C_Status* status) {
            std::unique_ptr<std::function<void()>> func(
                reinterpret_cast<std::function<void()>*>(user_data));
            (*func)();
          },
          callback));
    }
  }

  // Creates a plug-in event on `dev_id` and stores it in `event`. `flags` is
  // not forwarded to the plug-in.
  void CreateEvent(size_t dev_id, event::Event* event,
                   event::Event::Flag flags) override {
    const auto device = &devices_pool[dev_id];
    C_Event c_event;

    PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
        pimpl_->create_event(device, &c_event));
    event->set_event(c_event);
  }

  void DestroyEvent(size_t dev_id, event::Event* event) override {
    const auto device = &devices_pool[dev_id];

    PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->destroy_event(
        device, reinterpret_cast<C_Event>(event->raw_event())));
  }

  void RecordEvent(size_t dev_id, const event::Event* event,
                   const stream::Stream* stream) override {
    const auto device = &devices_pool[dev_id];

    PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->record_event(
        device, reinterpret_cast<C_Stream>(stream->raw_stream()),
        reinterpret_cast<C_Event>(event->raw_event())));
  }

  void SynchronizeEvent(size_t dev_id, const event::Event* event) override {
    const auto device = &devices_pool[dev_id];

    PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->synchronize_event(
        device, reinterpret_cast<C_Event>(event->raw_event())));
  }

  // Returns true when the plug-in reports the event query as successful.
  // Without a `query_event` hook the event is synchronized instead and true
  // is returned.
  bool QueryEvent(size_t dev_id, const event::Event* event) override {
    const auto device = &devices_pool[dev_id];

    if (!pimpl_->query_event) {
      SynchronizeEvent(dev_id, event);
      return true;
    }
    if (pimpl_->query_event(device, reinterpret_cast<C_Event>(
                                        event->raw_event())) == C_SUCCESS) {
      return true;
    }
    return false;
  }

  void StreamWaitEvent(size_t dev_id, const stream::Stream* stream,
                       const event::Event* event) override {
    const auto device = &devices_pool[dev_id];

    PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->stream_wait_event(
        device, reinterpret_cast<C_Stream>(stream->raw_stream()),
        reinterpret_cast<C_Event>(event->raw_event())));
  }

  // Host-to-device copy. Uses the plug-in's async entry point when a stream
  // with a raw handle is given and the hook exists; otherwise waits on the
  // device context for this place and performs the synchronous copy.
  void MemoryCopyH2D(size_t dev_id, void* dst, const void* src, size_t size,
                     const stream::Stream* stream = nullptr) override {
    const auto device = &devices_pool[dev_id];
    auto place = platform::CustomPlace(Type(), dev_id);

    if (stream && stream->raw_stream() && pimpl_->async_memory_copy_h2d) {
      C_Stream c_stream = reinterpret_cast<C_Stream>(stream->raw_stream());
      PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
          pimpl_->async_memory_copy_h2d(device, c_stream, dst, src, size));
    } else {
      platform::DeviceContextPool& pool =
          platform::DeviceContextPool::Instance();
      pool.Get(place)->Wait();
      PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
          pimpl_->memory_copy_h2d(device, dst, src, size));
    }
  }

  // Device-to-host copy; same async/sync selection as MemoryCopyH2D.
  void MemoryCopyD2H(size_t dev_id, void* dst, const void* src, size_t size,
                     const stream::Stream* stream = nullptr) override {
    const auto device = &devices_pool[dev_id];
    auto place = platform::CustomPlace(Type(), dev_id);

    if (stream && stream->raw_stream() && pimpl_->async_memory_copy_d2h) {
      C_Stream c_stream = reinterpret_cast<C_Stream>(stream->raw_stream());
      PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
          pimpl_->async_memory_copy_d2h(device, c_stream, dst, src, size));
    } else {
      platform::DeviceContextPool& pool =
          platform::DeviceContextPool::Instance();
      pool.Get(place)->Wait();
      PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
          pimpl_->memory_copy_d2h(device, dst, src, size));
    }
  }

  // Copy within one device; same async/sync selection as MemoryCopyH2D.
  void MemoryCopyD2D(size_t dev_id, void* dst, const void* src, size_t size,
                     const stream::Stream* stream = nullptr) override {
    const auto device = &devices_pool[dev_id];
    auto place = platform::CustomPlace(Type(), dev_id);

    if (stream && stream->raw_stream() && pimpl_->async_memory_copy_d2d) {
      C_Stream c_stream = reinterpret_cast<C_Stream>(stream->raw_stream());
      PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
          pimpl_->async_memory_copy_d2d(device, c_stream, dst, src, size));
    } else {
      platform::DeviceContextPool& pool =
          platform::DeviceContextPool::Instance();
      pool.Get(place)->Wait();
      PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
          pimpl_->memory_copy_d2d(device, dst, src, size));
    }
  }

  // Copy between two devices of this type.
  //
  // With a stream: uses the async peer-copy hook, or re-enters this function
  // without a stream when the hook is missing.
  // Without a stream: uses the synchronous peer-copy hook, or stages the data
  // through a temporary host buffer (D2H followed by H2D) when it is missing.
  void MemoryCopyP2P(const Place& dst_place, void* dst, size_t src_dev_id,
                     const void* src, size_t size,
                     const stream::Stream* stream = nullptr) override {
    int dst_dev_id = PlaceToId(dst_place);
    auto dst_device = &devices_pool[dst_dev_id];
    auto src_device = &devices_pool[src_dev_id];

    if (stream && stream->raw_stream()) {
      if (!pimpl_->async_memory_copy_p2p) {
        MemoryCopyP2P(dst_place, dst, src_dev_id, src, size);
      } else {
        PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->async_memory_copy_p2p(
            dst_device, src_device,
            reinterpret_cast<C_Stream>(stream->raw_stream()), dst, src, size));
      }
    } else {
      if (!pimpl_->memory_copy_p2p) {
        // Array form of unique_ptr so the buffer is released with delete[].
        std::unique_ptr<uint8_t[]> tmp(new uint8_t[size]);
        MemoryCopyD2H(src_dev_id, tmp.get(), src, size);
        MemoryCopyH2D(dst_dev_id, dst, tmp.get(), size);
      } else {
        auto src_place = platform::CustomPlace(Type(), src_dev_id);
        platform::DeviceContextPool& pool =
            platform::DeviceContextPool::Instance();
        pool.Get(src_place)->Wait();
        PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
            pimpl_->memory_copy_p2p(dst_device, src_device, dst, src, size));
      }
    }
  }

  // Allocates `size` bytes of device memory on `dev_id`.
  void* MemoryAllocate(size_t dev_id, size_t size) override {
    void* ptr = nullptr;
    const auto device = &devices_pool[dev_id];

    PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
        pimpl_->device_memory_allocate(device, &ptr, size));
    return ptr;
  }

  void MemoryDeallocate(size_t dev_id, void* ptr, size_t size) override {
    const auto device = &devices_pool[dev_id];

    PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
        pimpl_->device_memory_deallocate(device, ptr, size));
  }

  // Allocates host memory through the plug-in; raises Unavailable when the
  // plug-in does not provide `host_memory_allocate`.
  void* MemoryAllocateHost(size_t dev_id, size_t size) override {
    void* ptr = nullptr;
    const auto device = &devices_pool[dev_id];

    // Guard on the hook that is actually called below.
    if (!pimpl_->host_memory_allocate) {
      PADDLE_THROW(platform::errors::Unavailable(
          "MemoryAllocKind::Host is not supported on %s.", Type()));
    } else {
      PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
          pimpl_->host_memory_allocate(device, &ptr, size));
    }
    return ptr;
  }

  void MemoryDeallocateHost(size_t dev_id, void* ptr, size_t size) override {
    const auto device = &devices_pool[dev_id];

    if (!pimpl_->host_memory_deallocate) {
      PADDLE_THROW(platform::errors::Unavailable(
          "MemoryAllocKind::Host is not supported on %s.", Type()));
    } else {
      PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
          pimpl_->host_memory_deallocate(device, ptr, size));
    }
  }

  // Allocates unified memory through the plug-in; raises Unavailable when the
  // plug-in does not provide `unified_memory_allocate`.
  void* MemoryAllocateUnified(size_t dev_id, size_t size) override {
    void* ptr = nullptr;
    const auto device = &devices_pool[dev_id];

    if (!pimpl_->unified_memory_allocate) {
      PADDLE_THROW(platform::errors::Unavailable(
          "MemoryAllocKind::Unified is not supported on %s.", Type()));
    } else {
      PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
          pimpl_->unified_memory_allocate(device, &ptr, size));
    }
    return ptr;
  }

  void MemoryDeallocateUnified(size_t dev_id, void* ptr, size_t size) override {
    const auto device = &devices_pool[dev_id];

    if (!pimpl_->unified_memory_deallocate) {
      // The message names the allocation kind this function handles.
      PADDLE_THROW(platform::errors::Unavailable(
          "MemoryAllocKind::Unified is not supported on %s.", Type()));
    } else {
      PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
          pimpl_->unified_memory_deallocate(device, ptr, size));
    }
  }

  // Sets `size` bytes at device address `ptr` to `value`. Without a
  // `device_memory_set` hook, a host buffer is filled and copied to the
  // device instead.
  void MemorySet(size_t dev_id, void* ptr, uint8_t value,
                 size_t size) override {
    const auto device = &devices_pool[dev_id];

    if (pimpl_->device_memory_set) {
      PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
          pimpl_->device_memory_set(device, ptr, value, size));
    } else {
      // Array form of unique_ptr so the buffer is released with delete[].
      std::unique_ptr<uint8_t[]> tmp(new uint8_t[size]);
      memset(tmp.get(), value, size);
      MemoryCopyH2D(dev_id, ptr, tmp.get(), size);
    }
  }

  // Queries total and free device memory from the plug-in into `*total` and
  // `*free`, and logs the usage.
  void MemoryStats(size_t dev_id, size_t* total, size_t* free) override {
    const auto device = &devices_pool[dev_id];

    PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
        pimpl_->device_memory_stats(device, total, free));

    size_t used = *total - *free;
    VLOG(10) << Type() << " memory usage " << (used >> 20) << "M/"
             << (*total >> 20) << "M, " << (*free >> 20)
             << "M available to allocate";
  }

  // Returns the minimum chunk size reported by the plug-in.
  size_t GetMinChunkSize(size_t dev_id) override {
    const auto device = &devices_pool[dev_id];

    size_t size = 0;
    pimpl_->device_min_chunk_size(device, &size);
    VLOG(10) << Type() << " min chunk size " << size << "B";
    return size;
  }

  // Returns the plug-in's maximum chunk size, or the DeviceInterface default
  // when the hook is not provided.
  size_t GetMaxChunkSize(size_t dev_id) override {
    const auto device = &devices_pool[dev_id];

    size_t size = 0;
    if (pimpl_->device_max_chunk_size) {
      pimpl_->device_max_chunk_size(device, &size);
      VLOG(10) << Type() << " max chunk size " << size << "B";
    } else {
      return DeviceInterface::GetMaxChunkSize(dev_id);
    }
    return size;
  }

  // Returns the plug-in's maximum allocation size, or the DeviceInterface
  // default when the hook is not provided.
  size_t GetMaxAllocSize(size_t dev_id) override {
    const auto device = &devices_pool[dev_id];

    size_t size = 0;
    if (pimpl_->device_max_alloc_size) {
      pimpl_->device_max_alloc_size(device, &size);
      VLOG(10) << Type() << " max alloc size " << (size >> 20) << "M";
    } else {
      return DeviceInterface::GetMaxAllocSize(dev_id);
    }
    return size;
  }

  // Returns the plug-in's initial allocation size, or the DeviceInterface
  // default when the hook is not provided.
  size_t GetInitAllocSize(size_t dev_id) override {
    const auto device = &devices_pool[dev_id];
    size_t size = 0;
    if (pimpl_->device_init_alloc_size) {
      pimpl_->device_init_alloc_size(device, &size);
      VLOG(10) << Type() << " init alloc size " << (size >> 20) << "M";
    } else {
      return DeviceInterface::GetInitAllocSize(dev_id);
    }
    return size;
  }

  // Returns the plug-in's re-allocation size, or the DeviceInterface default
  // when the hook is not provided.
  size_t GetReallocSize(size_t dev_id) override {
    const auto device = &devices_pool[dev_id];
    size_t size = 0;
    if (pimpl_->device_realloc_size) {
      pimpl_->device_realloc_size(device, &size);
      VLOG(10) << Type() << " realloc size " << (size >> 20) << "M";
    } else {
      return DeviceInterface::GetReallocSize(dev_id);
    }
    return size;
  }

  // Returns the plug-in's extra padding size, or the DeviceInterface default
  // when the hook is not provided.
  size_t GetExtraPaddingSize(size_t dev_id) override {
    const auto device = &devices_pool[dev_id];

    size_t padding_size = 0;
    if (pimpl_->device_extra_padding_size) {
      PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
          pimpl_->device_extra_padding_size(device, &padding_size));
      VLOG(10) << Type() << " extra padding size " << (padding_size >> 20)
               << "M";
    } else {
      return DeviceInterface::GetExtraPaddingSize(dev_id);
    }
    // Hand back what the plug-in reported rather than a constant 0.
    return padding_size;
  }

  // Returns the plug-in's compute capability, or 0 when the hook is absent.
  size_t GetComputeCapability() override {
    size_t compute_capability = 0;
    if (pimpl_->get_compute_capability) {
      pimpl_->get_compute_capability(&compute_capability);
    }
    VLOG(10) << Type() << " get compute capability " << compute_capability;
    return compute_capability;
  }

  // Returns the plug-in's runtime version, or 0 when the hook is absent.
  size_t GetRuntimeVersion() override {
    size_t version = 0;
    if (pimpl_->get_runtime_version) {
      pimpl_->get_runtime_version(&version);
    }
    VLOG(10) << Type() << " get runtime version " << version;
    return version;
  }

  // Returns the plug-in's driver version, or 0 when the hook is absent.
  size_t GetDriverVersion() override {
    size_t version = 0;
    if (pimpl_->get_driver_version) {
      pimpl_->get_driver_version(&version);
    }
    VLOG(10) << Type() << " get driver version " << version;
    return version;
  }

 private:
  // Device id of `place`, without checking that it is registered.
  inline int PlaceToIdNoCheck(const Place& place) {
    int dev_id = place.GetDeviceId();
    return dev_id;
  }

  // Device id of `place`; raises NotFound when the id is not in devices_pool.
  inline int PlaceToId(const Place& place) {
    int dev_id = PlaceToIdNoCheck(place);
    PADDLE_ENFORCE_NE(devices_pool.find(dev_id), devices_pool.end(),
                      platform::errors::NotFound(
                          "Cannot found %s %d, please check visible devices",
                          Type(), dev_id));
    return dev_id;
  }

  // Plug-in function table.
  std::unique_ptr<C_DeviceInterface> pimpl_;
  // Shared-object handle closed in Finalize(); may be null.
  void* dso_handle_;
  // Device descriptors keyed by device id, filled in Initialize().
  std::unordered_map<size_t, C_Device_st> devices_pool;
};
// Validates the parameters a plug-in filled in during registration.
//
// Returns false (after logging a warning) when the plug-in's declared version
// is older than the PADDLE_CUSTOM_RUNTIME_* version this build was compiled
// with, or when any entry point marked as required below is null. Entry
// points marked `false` are optional and never cause a rejection.
bool ValidCustomCustomRuntimeParams(const CustomRuntimeParams* params) {
// Rejects the plug-in when `ptr` is unset and flagged as required.
#define CHECK_PTR(ptr, required)                                     \
  do {                                                               \
    if (params->interface->ptr == nullptr && required) {             \
      LOG(WARNING) << "CustomRuntime [type: " << params->device_type \
                   << "] pointer: " << #ptr << " is not set.";       \
      return false;                                                  \
    }                                                                \
  } while (0)

  // Both versions are encoded as major * 10000 + minor * 100 + patch.
  const int runtime_version = PADDLE_CUSTOM_RUNTIME_MAJOR_VERSION * 10000 +
                              PADDLE_CUSTOM_RUNTIME_MINOR_VERSION * 100 +
                              PADDLE_CUSTOM_RUNTIME_PATCH_VERSION;
  int version = params->version.major * 10000 + params->version.minor * 100 +
                params->version.patch;

  if (version < runtime_version) {
    LOG(WARNING) << "CustomRuntime [type: " << params->device_type
                 << "] version: " << version
                 << " < PADDLE_CUSTOM_RUNTIME_VERSION " << runtime_version;
    return false;
  }

  // Lifecycle.
  CHECK_PTR(initialize, false);
  CHECK_PTR(finalize, false);

  // Device management.
  CHECK_PTR(init_device, false);
  CHECK_PTR(set_device, true);
  CHECK_PTR(get_device, true);
  CHECK_PTR(deinit_device, false);

  // Streams.
  CHECK_PTR(create_stream, true);
  CHECK_PTR(destroy_stream, true);
  CHECK_PTR(query_stream, false);
  CHECK_PTR(stream_add_callback, false);

  // Events.
  CHECK_PTR(create_event, true);
  CHECK_PTR(record_event, true);
  CHECK_PTR(destroy_event, true);
  CHECK_PTR(query_event, false);

  // Synchronization.
  CHECK_PTR(synchronize_device, false);
  CHECK_PTR(synchronize_stream, true);
  CHECK_PTR(synchronize_event, true);
  CHECK_PTR(stream_wait_event, true);

  // Memory allocation.
  CHECK_PTR(device_memory_allocate, true);
  CHECK_PTR(device_memory_deallocate, true);
  CHECK_PTR(host_memory_allocate, false);
  CHECK_PTR(host_memory_deallocate, false);
  CHECK_PTR(unified_memory_allocate, false);
  CHECK_PTR(unified_memory_deallocate, false);

  // Memory copies.
  CHECK_PTR(memory_copy_h2d, true);
  CHECK_PTR(memory_copy_d2h, true);
  CHECK_PTR(memory_copy_d2d, true);
  CHECK_PTR(memory_copy_p2p, false);
  CHECK_PTR(async_memory_copy_h2d, false);
  CHECK_PTR(async_memory_copy_d2h, false);
  CHECK_PTR(async_memory_copy_d2d, false);
  CHECK_PTR(async_memory_copy_p2p, false);

  // Device enumeration.
  CHECK_PTR(get_device_count, true);
  CHECK_PTR(get_device_list, true);

  // Memory statistics and allocator tuning.
  CHECK_PTR(device_memory_stats, true);
  CHECK_PTR(device_min_chunk_size, true);
  CHECK_PTR(device_max_chunk_size, false);
  CHECK_PTR(device_max_alloc_size, false);
  CHECK_PTR(device_extra_padding_size, false);

  // Informational queries.
  CHECK_PTR(get_compute_capability, false);
  CHECK_PTR(get_runtime_version, false);
  CHECK_PTR(get_driver_version, false);

#undef CHECK_PTR
  return true;
}
// Function-pointer type used for the "InitPlugin" symbol that
// LoadCustomRuntimeLib(void*) resolves with dlsym(); the function receives the
// CustomRuntimeParams it is expected to fill in.
typedef bool (*RegisterDevicePluginFn)(CustomRuntimeParams* runtime_params);
bool LoadCustomRuntimeLib(const CustomRuntimeParams& runtime_params,
std::unique_ptr<C_DeviceInterface> device_interface,
void* dso_handle) {
if (ValidCustomCustomRuntimeParams(&runtime_params)) {
auto device =
std::make_unique<CustomDevice>(runtime_params.device_type, 255, true,
std::move(device_interface), dso_handle);
if (false == DeviceManager::Register(std::move(device))) {
LOG(WARNING) << "Skip this library. Register failed!!! there may be a "
"Custom Runtime with the same name.";
return false;
}
} else {
LOG(WARNING)
<< "Skip this library. Wrong parameters!!! please check the version "
"compatibility between PaddlePaddle and Custom Runtime.";
return false;
}
return true;
}
bool LoadCustomRuntimeLib(void* dso_handle) {
CustomRuntimeParams runtime_params;
std::memset(&runtime_params, 0, sizeof(CustomRuntimeParams));
runtime_params.size = sizeof(CustomRuntimeParams);
auto device_interface = std::make_unique<C_DeviceInterface>();
runtime_params.interface = device_interface.get();
std::memset(runtime_params.interface, 0, sizeof(C_DeviceInterface));
runtime_params.interface->size = sizeof(C_DeviceInterface);
RegisterDevicePluginFn init_plugin_fn =
reinterpret_cast<RegisterDevicePluginFn>(dlsym(dso_handle, "InitPlugin"));
if (!init_plugin_fn) {
LOG(WARNING) << "Skip this library. InitPlugin symbol not found.";
return false;
}
init_plugin_fn(&runtime_params);
if (runtime_params.device_type == nullptr) {
LOG(WARNING)
<< "Skip this library. InitPlugin failed!!! please check the version "
"compatibility between PaddlePaddle and Custom Runtime.";
return false;
}
return LoadCustomRuntimeLib(runtime_params, std::move(device_interface),
dso_handle);
}
} // namespace platform
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <string>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/device/custom/fake_cpu_device.h"
#include "paddle/fluid/platform/device/device_manager.h"
#include "paddle/fluid/platform/device_context.h"
// Registers the fake CPU device as a custom runtime.
//
// Builds the parameter and interface structures the same way the production
// loader does, lets InitFakeCPUDevice populate them, and expects
// LoadCustomRuntimeLib to accept the result.
void RegisterDevice() {
  CustomRuntimeParams runtime_params;
  // Zero the whole structure first so that no field is left indeterminate
  // before it is handed to the loader.
  std::memset(&runtime_params, 0, sizeof(CustomRuntimeParams));
  runtime_params.size = sizeof(CustomRuntimeParams);
  auto device_interface = std::make_unique<C_DeviceInterface>();
  runtime_params.interface = device_interface.get();
  std::memset(runtime_params.interface, 0, sizeof(C_DeviceInterface));
  runtime_params.interface->size = sizeof(C_DeviceInterface);

  InitFakeCPUDevice(&runtime_params);
  // No shared library backs the fake device, hence the null dso handle.
  EXPECT_TRUE(paddle::platform::LoadCustomRuntimeLib(
      runtime_params, std::move(device_interface), nullptr));
}
void InitDevice() {
RegisterDevice();
EXPECT_GT(static_cast<int>(
paddle::platform::DeviceManager::GetAllDeviceTypes().size()),
0);
auto place = paddle::platform::CustomPlace(DEVICE_TYPE, 0);
auto device = paddle::platform::DeviceManager::GetDeviceWithPlace(place);
EXPECT_NE(device, nullptr);
std::vector<paddle::platform::Place> places;
auto device_types = paddle::platform::DeviceManager::GetAllDeviceTypes();
for (auto dev_type : device_types) {
auto devices = paddle::platform::DeviceManager::GetDeviceList(dev_type);
for (auto dev_id : devices) {
places.push_back(
paddle::platform::PlaceHelper::CreatePlace(dev_type, dev_id));
}
}
EXPECT_GT(static_cast<int>(places.size()), 0);
paddle::platform::DeviceContextPool::Init(places);
}
// Exercises allocation and device selection on a custom place; any other
// place is only logged.
void TestDeviceInterface(const paddle::platform::Place& place) {
  std::cout << "TestDeviceInterface on " << place << std::endl;
  if (!paddle::platform::is_custom_place(place)) {
    return;
  }

  namespace plat = paddle::platform;
  auto device = plat::DeviceManager::GetDeviceWithPlace(place);
  auto dev_type = plat::PlaceHelper::GetDeviceType(place);

  // NOTE(review): the chunk allocated here is not released in this function.
  auto chunk =
      device->MemoryAllocate(plat::DeviceManager::GetMinChunkSize(place));
  EXPECT_NE(chunk, nullptr);

  // Selecting the place must make it the current device for its type.
  plat::DeviceManager::SetDevice(place);
  auto current_id = plat::DeviceManager::GetDevice(dev_type);
  EXPECT_EQ(current_id, place.GetDeviceId());
}
// Checks how Tensor::mutable_data reuses or replaces its buffer on `place`
// when the requested shape grows, stays the same size, or shrinks.
void TestTensorMutableData(const paddle::platform::Place& place) {
  std::cout << "TestTensorInitialization on " << place << std::endl;
  paddle::framework::Tensor tensor;

  // First allocation.
  float* initial = tensor.mutable_data<float>(
      paddle::framework::make_ddim({1, 2, 3}), place);
  auto initial_holder = tensor.Holder();
  EXPECT_NE(initial, nullptr);

  // A larger shape: memory is supposed to be re-allocated.
  float* grown = tensor.mutable_data<float>(
      paddle::framework::make_ddim({3, 1024}), place);
  auto grown_holder = tensor.Holder();
  EXPECT_NE(grown, nullptr);
  EXPECT_NE(initial_holder.get(), grown_holder.get());

  // A different, smaller shape than the current buffer: the memory block is
  // supposed to be unchanged.
  float* reshaped = tensor.mutable_data<float>(
      paddle::framework::make_ddim({2, 2, 3}), place);
  EXPECT_EQ(reshaped, grown);

  // An even smaller shape: the memory block is supposed to be unchanged.
  float* shrunk = tensor.mutable_data<float>(
      paddle::framework::make_ddim({2, 2}), place);
  EXPECT_EQ(reshaped, shrunk);
}
// Verifies that ShareDataWith makes the destination tensor expose the same
// data pointer as the source tensor allocated on `place`.
void TestTensorShareDataWith(const paddle::platform::Place& place) {
  std::cout << "TestTensorShareDataWith on " << place << std::endl;

  paddle::framework::Tensor source;
  paddle::framework::Tensor alias;

  const auto shape = paddle::framework::make_ddim({2, 3, 4});
  source.mutable_data<int>(shape, place);
  alias.ShareDataWith(source);

  ASSERT_EQ(source.data<int>(), alias.data<int>());
}
// Copies a 3x3 int tensor from the CPU to a tensor on the custom `place` using
// TensorCopy. Places that are not custom places are skipped.
//
// Only the host-to-device copy is active. The round-trip, self-copy and slice
// checks below are compiled out by the `#if 0` guard.
void TestTensorUtils(const paddle::platform::Place& place) {
if (paddle::platform::is_custom_place(place) == false) {
return;
}
paddle::framework::Tensor src_tensor;
paddle::framework::Tensor gpu_tensor;
paddle::framework::Tensor dst_tensor;
// Fill the CPU source tensor with 1..9.
int* src_ptr = src_tensor.mutable_data<int>(
paddle::framework::make_ddim({3, 3}), paddle::platform::CPUPlace());
int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
memcpy(src_ptr, arr, 9 * sizeof(int));
// CPU Tensor to device Tensor (the "gpu_" names refer to the custom device)
paddle::platform::CustomDeviceContext gpu_ctx(place);
paddle::framework::TensorCopy(src_tensor, place, gpu_ctx, &gpu_tensor);
// Everything up to the matching #endif is disabled and not compiled.
#if 0
// GPU Tensor to CPU Tensor
auto cpu_place = new paddle::platform::CPUPlace();
paddle::framework::TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
// Sync before Compare Tensors
gpu_ctx.Wait();
const int* dst_ptr = dst_tensor.data<int>();
EXPECT_NE(src_ptr, dst_ptr);
for (size_t i = 0; i < 9; ++i) {
EXPECT_EQ(src_ptr[i], dst_ptr[i]);
}
// Copy the same tensor
paddle::framework::TensorCopy(gpu_tensor, place, gpu_ctx, &gpu_tensor);
gpu_ctx.Wait();
const int* dst_ptr_tmp = dst_tensor.data<int>();
EXPECT_NE(src_ptr, dst_ptr_tmp);
for (size_t i = 0; i < 9; ++i) {
EXPECT_EQ(src_ptr[i], dst_ptr_tmp[i]);
}
paddle::framework::Tensor slice_tensor = src_tensor.Slice(1, 2);
// CPU Slice Tensor to GPU Tensor
paddle::framework::TensorCopy(slice_tensor, place, gpu_ctx, &gpu_tensor);
// GPU Tensor to CPU Tensor
paddle::framework::TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
// Sync before Compare Slice Tensors
gpu_ctx.Wait();
const int* slice_ptr = slice_tensor.data<int>();
dst_ptr = dst_tensor.data<int>();
EXPECT_NE(dst_ptr, slice_ptr);
for (size_t i = 0; i < 3; ++i) {
EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
}
EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout());
#endif
}
// Runs the device-interface and tensor checks once for every registered
// device type, on the place PlaceHelper::CreatePlace builds for that type.
TEST(CustomDevice, Tensor) {
  namespace plat = paddle::platform;

  InitDevice();
  auto registered_types = plat::DeviceManager::GetAllDeviceTypes();
  for (const auto& type_name : registered_types) {
    std::cout << "Test on " << type_name << std::endl;
    EXPECT_GT(
        static_cast<int>(plat::DeviceManager::GetDeviceCount(type_name)), 0);

    auto target = plat::PlaceHelper::CreatePlace(type_name);
    TestDeviceInterface(target);
    TestTensorMutableData(target);
    TestTensorShareDataWith(target);
    TestTensorUtils(target);
  }
}
// Test entry point: initializes GoogleTest with the command-line arguments and
// returns the result of running every registered test.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  const int exit_code = RUN_ALL_TESTS();
  return exit_code;
}
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/fluid/platform/device/device_ext.h"
namespace paddle {
namespace platform {
namespace details {
// Maps a status type returned by a custom-device call to the value of that
// type that means "success". The primary template is empty; only types
// registered through DEFINE_CUSTOM_DEVICE_STATUS_TYPE provide `Type` and
// `kSuccess`.
template <typename T>
struct CustomDeviceStatusType {};
// Specializes CustomDeviceStatusType for `type`, recording `success_value` as
// its kSuccess constant.
#define DEFINE_CUSTOM_DEVICE_STATUS_TYPE(type, success_value) \
template <> \
struct CustomDeviceStatusType<type> { \
using Type = type; \
static constexpr Type kSuccess = success_value; \
}
// A C_Status is successful when it equals C_SUCCESS.
DEFINE_CUSTOM_DEVICE_STATUS_TYPE(C_Status, C_SUCCESS);
} // namespace details
// Formats the message used when a custom-device call reports `stat`; the
// status is streamed into the text as-is.
inline std::string build_custom_device_error_msg(C_Status stat) {
  std::ostringstream message;
  message << " CustomDevice error, the error code is : ";
  message << stat;
  message << ". ";
  return message.str();
}
// Evaluates COND once and compares the result with the kSuccess value that
// CustomDeviceStatusType registers for COND's type. On mismatch it builds an
// External error from build_custom_device_error_msg() and raises it through
// __THROW_ERROR_INTERNAL__. COND must have a type registered with
// DEFINE_CUSTOM_DEVICE_STATUS_TYPE, otherwise kSuccess does not exist and the
// expansion fails to compile.
#define PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(COND) \
do { \
auto __cond__ = (COND); \
using __CUSTOM_DEVICE_STATUS_TYPE__ = decltype(__cond__); \
constexpr auto __success_type__ = \
::paddle::platform::details::CustomDeviceStatusType< \
__CUSTOM_DEVICE_STATUS_TYPE__>::kSuccess; \
if (UNLIKELY(__cond__ != __success_type__)) { \
auto __summary__ = ::paddle::platform::errors::External( \
::paddle::platform::build_custom_device_error_msg(__cond__)); \
__THROW_ERROR_INTERNAL__(__summary__); \
} \
} while (0)
} // namespace platform
} // namespace paddle
#endif // PADDLE_WITH_CUSTOM_DEVICE
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/platform/device/device_ext.h"
// Simulated memory budget of the fake device, in bytes (1 MiB).
constexpr size_t global_total_memory = 1024 * 1024UL;
// Bytes of the budget not currently handed out; Allocate() subtracts from it
// and Deallocate() adds back to it.
static size_t global_free_memory = global_total_memory;
// Lifecycle and enumeration callbacks of the fake device. All of them report
// C_SUCCESS; the fake device exposes exactly one device whose id is 0.

// Platform initialization: nothing to do.
C_Status Init() { return C_SUCCESS; }
// Per-device initialization: nothing to do.
C_Status InitDevice(const C_Device device) { return C_SUCCESS; }
// Selecting a device: nothing to do.
C_Status SetDevice(const C_Device device) { return C_SUCCESS; }
// Reports device 0 as the current device.
C_Status GetDevice(const C_Device device) {
device->id = 0;
return C_SUCCESS;
}
// Per-device teardown: nothing to do.
C_Status DestroyDevice(const C_Device device) { return C_SUCCESS; }
// Platform teardown: nothing to do.
C_Status Finalize() { return C_SUCCESS; }
// Reports a device count of 1.
C_Status GetDevicesCount(size_t *count) {
*count = 1;
return C_SUCCESS;
}
// Writes id 0 into the first (and only) slot of the device list.
C_Status GetDevicesList(size_t *device) {
*device = 0;
return C_SUCCESS;
}
// Copies `size` bytes from `src` to `dst` with memcpy; `device` is unused.
C_Status MemCpy(const C_Device device, void *dst, const void *src,
size_t size) {
memcpy(dst, src, size);
return C_SUCCESS;
}
// "Asynchronous" variant: performs the same memcpy before returning and
// ignores `stream`.
C_Status AsyncMemCpy(const C_Device device, C_Stream stream, void *dst,
const void *src, size_t size) {
memcpy(dst, src, size);
return C_SUCCESS;
}
// Allocates `size` bytes from the simulated memory budget.
//
// On success stores the block in `*ptr`, debits `global_free_memory` and
// returns C_SUCCESS. Returns C_FAILED with `*ptr` set to nullptr when the
// request exceeds the remaining budget or when malloc itself fails, so the
// budget is never debited for memory that was not obtained.
C_Status Allocate(const C_Device device, void **ptr, size_t size) {
  *ptr = nullptr;
  if (global_free_memory < size) {
    return C_FAILED;
  }
  void *block = malloc(size);
  // malloc(0) may legitimately return a null pointer, so only a null result
  // for a non-empty request counts as a failure.
  if (block == nullptr && size != 0) {
    return C_FAILED;
  }
  global_free_memory -= size;
  *ptr = block;
  return C_SUCCESS;
}
// Frees `ptr` and credits `size` bytes back to the simulated budget. `size` is
// added back unconditionally, with no check against what was allocated.
C_Status Deallocate(const C_Device device, void *ptr, size_t size) {
free(ptr);
global_free_memory += size;
return C_SUCCESS;
}
// Stream, event and synchronization callbacks of the fake device. Every one of
// them is a no-op that reports C_SUCCESS; the create functions do not write to
// their output handle.

// Does not write to `*stream`.
C_Status CreateStream(const C_Device device, C_Stream *stream) {
return C_SUCCESS;
}
C_Status DestroyStream(const C_Device device, C_Stream stream) {
return C_SUCCESS;
}
// Does not write to `*event`.
C_Status CreateEvent(const C_Device device, C_Event *event) {
return C_SUCCESS;
}
C_Status RecordEvent(const C_Device device, C_Stream stream, C_Event event) {
return C_SUCCESS;
}
C_Status DestroyEvent(const C_Device device, C_Event event) {
return C_SUCCESS;
}
C_Status SyncDevice(const C_Device device) { return C_SUCCESS; }
C_Status SyncStream(const C_Device device, C_Stream stream) {
return C_SUCCESS;
}
C_Status SyncEvent(const C_Device device, C_Event event) { return C_SUCCESS; }
C_Status StreamWaitEvent(const C_Device device, C_Stream stream,
C_Event event) {
return C_SUCCESS;
}
// Leaves `devices` untouched.
C_Status VisibleDevices(size_t *devices) { return C_SUCCESS; }
// Reports the simulated total budget and the bytes currently left in it.
C_Status DeviceMemStats(const C_Device device, size_t *total_memory,
size_t *free_memory) {
*total_memory = global_total_memory;
*free_memory = global_free_memory;
return C_SUCCESS;
}
// Reports a minimum chunk size of 4 KiB.
C_Status DeviceMinChunkSize(const C_Device device, size_t *size) {
*size = 4 * 1024;
return C_SUCCESS;
}
// Reports a maximum chunk size of 64 KiB.
C_Status DeviceMaxChunkSize(const C_Device device, size_t *size) {
*size = 64 * 1024;
return C_SUCCESS;
}
// Reports 95% of the simulated total budget as the maximum allocation size
// (the product is computed in double precision and truncated to size_t).
C_Status DeviceMaxAllocSize(const C_Device device, size_t *size) {
  const double limit = global_total_memory * 0.95;
  *size = static_cast<size_t>(limit);
  return C_SUCCESS;
}
// Device type and sub-type names the fake device reports to the core.
#define DEVICE_TYPE "FakeCPU"
#define SUB_DEVICE_TYPE "V100"

// Fills `params` for the fake CPU device: type names, the runtime version this
// header was built against, and the callback table. The interface structure is
// zeroed first, so every callback not assigned below stays null.
void InitFakeCPUDevice(CustomRuntimeParams *params) {
  params->device_type = const_cast<char *>(DEVICE_TYPE);
  params->sub_device_type = const_cast<char *>(SUB_DEVICE_TYPE);
  params->version.major = PADDLE_CUSTOM_RUNTIME_MAJOR_VERSION;
  params->version.minor = PADDLE_CUSTOM_RUNTIME_MINOR_VERSION;
  params->version.patch = PADDLE_CUSTOM_RUNTIME_PATCH_VERSION;

  auto *api = params->interface;
  memset(reinterpret_cast<void *>(api), 0, sizeof(C_DeviceInterface));

  // Platform and device lifecycle.
  api->initialize = Init;
  api->finalize = Finalize;
  api->init_device = InitDevice;
  api->set_device = SetDevice;
  api->get_device = GetDevice;
  api->deinit_device = DestroyDevice;

  // Streams and events.
  api->create_stream = CreateStream;
  api->destroy_stream = DestroyStream;
  api->create_event = CreateEvent;
  api->destroy_event = DestroyEvent;
  api->record_event = RecordEvent;

  // Synchronization.
  api->synchronize_device = SyncDevice;
  api->synchronize_stream = SyncStream;
  api->synchronize_event = SyncEvent;
  api->stream_wait_event = StreamWaitEvent;

  // Memory copies: every direction uses the same host memcpy.
  api->memory_copy_h2d = MemCpy;
  api->memory_copy_d2d = MemCpy;
  api->memory_copy_d2h = MemCpy;
  api->async_memory_copy_h2d = AsyncMemCpy;
  api->async_memory_copy_d2d = AsyncMemCpy;
  api->async_memory_copy_d2h = AsyncMemCpy;

  // Allocation: device, host and unified memory share one allocator.
  api->device_memory_allocate = Allocate;
  api->host_memory_allocate = Allocate;
  api->unified_memory_allocate = Allocate;
  api->device_memory_deallocate = Deallocate;
  api->host_memory_deallocate = Deallocate;
  api->unified_memory_deallocate = Deallocate;

  // Enumeration and memory information.
  api->get_device_count = GetDevicesCount;
  api->get_device_list = GetDevicesList;
  api->device_memory_stats = DeviceMemStats;
  api->device_max_chunk_size = DeviceMaxChunkSize;
  api->device_min_chunk_size = DeviceMinChunkSize;
  api->device_max_alloc_size = DeviceMaxAllocSize;
}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/device_base.h"
#include "gflags/gflags.h"
// gflags declared here (their definitions are not in this file).
// DeviceInterface::AllocSize() reads them to size the initial and
// re-allocation memory requests.
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb);
DECLARE_uint64(reallocate_gpu_memory_in_mb);
// Fraction of the reported free memory that
// DeviceInterface::AvailableAllocSize() holds back from allocation.
constexpr static float fraction_reserve_gpu_memory = 0.05f;
namespace paddle {
namespace platform {
// Raises an Unimplemented error naming the enclosing function (__func__) and
// the device type. The expansion already ends with ';', so call sites written
// as `INTERFACE_UNIMPLEMENT;` produce an extra empty statement.
#define INTERFACE_UNIMPLEMENT \
PADDLE_THROW(platform::errors::Unimplemented( \
"%s is not implemented on %s device.", __func__, Type()));
// info
// Default compute capability: logs and reports 0.
size_t DeviceInterface::GetComputeCapability() {
  VLOG(10) << Type() << " get compute capability " << 0;
  return 0;
}

// Default runtime version: logs and reports 0.
size_t DeviceInterface::GetRuntimeVersion() {
  VLOG(10) << Type() << " get runtime version " << 0;
  return 0;
}

// Default driver version: logs and reports 0.
size_t DeviceInterface::GetDriverVersion() {
  VLOG(10) << Type() << " get driver version " << 0;
  return 0;
}
// device manage
// Default device-management hooks. Each one raises Unimplemented through
// INTERFACE_UNIMPLEMENT.
void DeviceInterface::Initialize() { INTERFACE_UNIMPLEMENT; }
void DeviceInterface::Finalize() { INTERFACE_UNIMPLEMENT; }
void DeviceInterface::SynchronizeDevice(size_t dev_id) {
INTERFACE_UNIMPLEMENT;
}
void DeviceInterface::InitDevice(size_t dev_id) { INTERFACE_UNIMPLEMENT; }
void DeviceInterface::DeInitDevice(size_t dev_id) { INTERFACE_UNIMPLEMENT; }
void DeviceInterface::SetDevice(size_t dev_id) { INTERFACE_UNIMPLEMENT; }
// Default implementation: raises Unimplemented through INTERFACE_UNIMPLEMENT.
int DeviceInterface::GetDevice() {
  INTERFACE_UNIMPLEMENT;
  // Not expected to be reached. Present so this non-void function always has a
  // return path, matching the other non-void stubs in this file.
  return -1;
}
// stream manage
// Default stream hooks. Each one raises Unimplemented through
// INTERFACE_UNIMPLEMENT.
void DeviceInterface::CreateStream(size_t dev_id, stream::Stream* stream,
const stream::Stream::Priority& priority,
const stream::Stream::Flag& flag) {
INTERFACE_UNIMPLEMENT;
}
void DeviceInterface::DestroyStream(size_t dev_id, stream::Stream* stream) {
INTERFACE_UNIMPLEMENT;
}
void DeviceInterface::SynchronizeStream(size_t dev_id,
const stream::Stream* stream) {
INTERFACE_UNIMPLEMENT;
}
// The trailing `return true` only gives this non-void function a return path
// after the macro.
bool DeviceInterface::QueryStream(size_t dev_id, const stream::Stream* stream) {
INTERFACE_UNIMPLEMENT;
return true;
}
void DeviceInterface::AddCallback(size_t dev_id, stream::Stream* stream,
stream::Stream::Callback* callback) {
INTERFACE_UNIMPLEMENT;
}
void DeviceInterface::StreamWaitEvent(size_t dev_id,
const stream::Stream* stream,
const event::Event* event) {
INTERFACE_UNIMPLEMENT;
}
// event manage
// Default event hooks. Each one raises Unimplemented through
// INTERFACE_UNIMPLEMENT.
void DeviceInterface::CreateEvent(size_t dev_id, event::Event* event,
event::Event::Flag flags) {
INTERFACE_UNIMPLEMENT;
}
void DeviceInterface::DestroyEvent(size_t dev_id, event::Event* event) {
INTERFACE_UNIMPLEMENT;
}
void DeviceInterface::RecordEvent(size_t dev_id, const event::Event* event,
const stream::Stream* stream) {
INTERFACE_UNIMPLEMENT;
}
void DeviceInterface::SynchronizeEvent(size_t dev_id,
const event::Event* event) {
INTERFACE_UNIMPLEMENT;
}
// The trailing `return true` only gives this non-void function a return path
// after the macro.
bool DeviceInterface::QueryEvent(size_t dev_id, const event::Event* event) {
INTERFACE_UNIMPLEMENT;
return true;
}
// memory manage
// Default memory hooks. Each one raises Unimplemented through
// INTERFACE_UNIMPLEMENT; the `return nullptr` lines only give the non-void
// functions a return path after the macro.
void DeviceInterface::MemoryCopyH2D(size_t dev_id, void* dst, const void* src,
size_t size, const stream::Stream* stream) {
INTERFACE_UNIMPLEMENT;
}
void DeviceInterface::MemoryCopyD2H(size_t dev_id, void* dst, const void* src,
size_t size, const stream::Stream* stream) {
INTERFACE_UNIMPLEMENT;
}
void DeviceInterface::MemoryCopyD2D(size_t dev_id, void* dst, const void* src,
size_t size, const stream::Stream* stream) {
INTERFACE_UNIMPLEMENT;
}
void DeviceInterface::MemoryCopyP2P(const Place& dst_place, void* dst,
size_t src_id, const void* src, size_t size,
const stream::Stream* stream) {
INTERFACE_UNIMPLEMENT;
}
void* DeviceInterface::MemoryAllocate(size_t dev_id, size_t size) {
INTERFACE_UNIMPLEMENT;
return nullptr;
}
void DeviceInterface::MemoryDeallocate(size_t dev_id, void* ptr, size_t size) {
INTERFACE_UNIMPLEMENT;
}
void* DeviceInterface::MemoryAllocateHost(size_t dev_id, size_t size) {
INTERFACE_UNIMPLEMENT;
return nullptr;
}
void DeviceInterface::MemoryDeallocateHost(size_t dev_id, void* ptr,
size_t size) {
INTERFACE_UNIMPLEMENT;
}
void* DeviceInterface::MemoryAllocateUnified(size_t dev_id, size_t size) {
INTERFACE_UNIMPLEMENT;
return nullptr;
}
void DeviceInterface::MemoryDeallocateUnified(size_t dev_id, void* ptr,
size_t size) {
INTERFACE_UNIMPLEMENT;
}
void DeviceInterface::MemorySet(size_t dev_id, void* ptr, uint8_t value,
size_t size) {
INTERFACE_UNIMPLEMENT;
}
void DeviceInterface::MemoryStats(size_t dev_id, size_t* total, size_t* free) {
INTERFACE_UNIMPLEMENT;
}
// Default implementation: raises Unimplemented through INTERFACE_UNIMPLEMENT.
size_t DeviceInterface::GetMinChunkSize(size_t dev_id) {
  INTERFACE_UNIMPLEMENT;
  // Not expected to be reached. Present so this non-void function always has a
  // return path, matching the other non-void stubs in this file.
  return 0;
}
// Computes how many bytes to request for device `dev_id`.
//
// `realloc` selects FLAGS_reallocate_gpu_memory_in_mb instead of
// FLAGS_initial_gpu_memory_in_mb. A non-zero flag is interpreted as MiB;
// a zero flag falls back to FLAGS_fraction_of_gpu_memory_to_use of the
// currently usable memory. Raises ResourceExhausted when no memory is usable
// or when the computed request exceeds what is usable.
size_t DeviceInterface::AllocSize(size_t dev_id, bool realloc) {
  size_t usable_bytes = AvailableAllocSize(dev_id);
  PADDLE_ENFORCE_GT(usable_bytes, 0,
                    platform::errors::ResourceExhausted(
                        "Not enough available %s memory.", Type()));

  // If the selected flag is 0, the size is derived from the fraction flag.
  size_t requested_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb
                                : FLAGS_initial_gpu_memory_in_mb;
  size_t request_bytes =
      (requested_mb > 0ul ? requested_mb << 20
                          : usable_bytes * FLAGS_fraction_of_gpu_memory_to_use);

  PADDLE_ENFORCE_GE(usable_bytes, request_bytes,
                    platform::errors::ResourceExhausted(
                        "Not enough available %s memory.", Type()));
  return request_bytes;
}
// Returns the number of bytes on device `dev_id` that may be handed out:
// the free memory reported by MemoryStats() minus the reserved fraction
// (fraction_reserve_gpu_memory). The result is 0 when what remains is smaller
// than the minimum chunk size, since no usable memory exists in that case.
size_t DeviceInterface::AvailableAllocSize(size_t dev_id) {
  size_t total_bytes = 0;
  size_t free_bytes = 0;
  MemoryStats(dev_id, &total_bytes, &free_bytes);

  const size_t held_back =
      static_cast<size_t>(fraction_reserve_gpu_memory * free_bytes);
  const size_t usable_bytes = free_bytes - held_back;

  const size_t smallest_chunk = GetMinChunkSize(dev_id);
  return usable_bytes < smallest_chunk ? 0 : usable_bytes;
}
// Size of the initial allocation, as computed by AllocSize(dev_id, false).
size_t DeviceInterface::GetInitAllocSize(size_t dev_id) {
  const size_t bytes = AllocSize(dev_id, false);
  VLOG(10) << Type() << " init alloc size " << (bytes >> 20) << "M";
  return bytes;
}

// Size of a re-allocation, as computed by AllocSize(dev_id, true).
size_t DeviceInterface::GetReallocSize(size_t dev_id) {
  const size_t bytes = AllocSize(dev_id, true);
  VLOG(10) << Type() << " realloc size " << (bytes >> 20) << "M";
  return bytes;
}

// Largest single allocation: the larger of the initial and re-allocation
// sizes.
size_t DeviceInterface::GetMaxAllocSize(size_t dev_id) {
  const size_t bytes =
      std::max(GetInitAllocSize(dev_id), GetReallocSize(dev_id));
  VLOG(10) << Type() << " max alloc size " << (bytes >> 20) << "M";
  return bytes;
}

// Largest chunk size: defaults to the maximum allocation size.
size_t DeviceInterface::GetMaxChunkSize(size_t dev_id) {
  const size_t bytes = GetMaxAllocSize(dev_id);
  VLOG(10) << Type() << " max chunk size " << (bytes >> 20) << "M";
  return bytes;
}

// Extra padding size: defaults to 0.
size_t DeviceInterface::GetExtraPaddingSize(size_t dev_id) {
  VLOG(10) << Type() << " extra padding size " << 0;
  return 0;
}
} // namespace platform
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/fluid/platform/device/event.h"
#include "paddle/fluid/platform/device/stream.h"
namespace paddle {
namespace platform {
class DeviceInterface { // Driver / Runtime
public:
DeviceInterface(const std::string& type, uint8_t priority, bool is_custom)
: type_(type), priority_(priority), is_custom_(is_custom) {}
uint8_t Priority() { return priority_; }
std::string Type() { return type_; }
bool IsCustom() { return is_custom_; }
virtual ~DeviceInterface() {}
// Info
virtual size_t GetComputeCapability();
virtual size_t GetRuntimeVersion();
virtual size_t GetDriverVersion();
// Platform
//! Initialize
virtual void Initialize();
//! Finalize
virtual void Finalize();
// Device
virtual size_t GetDeviceCount() = 0;
virtual std::vector<size_t> GetDeviceList() = 0;
//! Wait for compute device to finish.
virtual void SynchronizeDevice(size_t dev_id);
//! Initialize device.
virtual void InitDevice(size_t dev_id);
//! Deinitialize device.
virtual void DeInitDevice(size_t dev_id);
// ! Set device to be used.
virtual void SetDevice(size_t dev_id);
// ! Returns which device is currently being used.
virtual int GetDevice();
// Stream
// ! Create an asynchronous stream
virtual void CreateStream(
size_t dev_id, stream::Stream* stream,
const stream::Stream::Priority& priority =
stream::Stream::Priority::kNormal,
const stream::Stream::Flag& flag = stream::Stream::Flag::kDefaultFlag);
// ! Destroys an asynchronous stream.
virtual void DestroyStream(size_t dev_id, stream::Stream* stream);
// ! Waits for stream tasks to complete.
virtual void SynchronizeStream(size_t dev_id, const stream::Stream* stream);
// ! Queries an asynchronous stream for completion status.
virtual bool QueryStream(size_t dev_id, const stream::Stream* stream);
// ! Add a callback to a compute stream.
virtual void AddCallback(size_t dev_id, stream::Stream* stream,
stream::Stream::Callback* callback);
// Event
// ! Create an event.
virtual void CreateEvent(size_t dev_id, event::Event* event,
event::Event::Flag flags);
// ! Destroy an event.
virtual void DestroyEvent(size_t dev_id, event::Event* event);
// ! Records an event.
virtual void RecordEvent(size_t dev_id, const event::Event* event,
const stream::Stream* stream);
// ! Waits for event to complete.
virtual void SynchronizeEvent(size_t dev_id, const event::Event* event);
// ! Queries an event for completion status.
virtual bool QueryEvent(size_t dev_id, const event::Event* event);
// ! Make a compute stream wait on an event
virtual void StreamWaitEvent(size_t dev_id, const stream::Stream* stream,
const event::Event* event);
// Memory
virtual void MemoryCopyH2D(size_t dev_id, void* dst, const void* src,
size_t size,
const stream::Stream* stream = nullptr);
virtual void MemoryCopyD2H(size_t dev_id, void* dst, const void* src,
size_t size,
const stream::Stream* stream = nullptr);
virtual void MemoryCopyD2D(size_t dev_id, void* dst, const void* src,
size_t size,
const stream::Stream* stream = nullptr);
virtual void MemoryCopyP2P(const Place& dst_place, void* dst, size_t src_id,
const void* src, size_t size,
const stream::Stream* stream = nullptr);
virtual void* MemoryAllocate(size_t dev_id, size_t size);
virtual void MemoryDeallocate(size_t dev_id, void* ptr, size_t size);
virtual void* MemoryAllocateHost(size_t dev_id, size_t size);
virtual void MemoryDeallocateHost(size_t dev_id, void* ptr, size_t size);
virtual void* MemoryAllocateUnified(size_t dev_id, size_t size);
virtual void MemoryDeallocateUnified(size_t dev_id, void* ptr, size_t size);
virtual void MemorySet(size_t dev_id, void* ptr, uint8_t value, size_t size);
virtual void MemoryStats(size_t dev_id, size_t* total, size_t* free);
virtual size_t GetMinChunkSize(size_t dev_id);
virtual size_t GetInitAllocSize(size_t dev_id);
virtual size_t GetReallocSize(size_t dev_id);
virtual size_t GetMaxAllocSize(size_t dev_id);
virtual size_t GetMaxChunkSize(size_t dev_id);
virtual size_t GetExtraPaddingSize(size_t dev_id);
private:
const std::string type_;
const uint8_t priority_;
const bool is_custom_;
size_t AllocSize(size_t dev_id, bool realloc);
size_t AvailableAllocSize(size_t dev_id);
};
} // namespace platform
} // namespace paddle
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#if !defined(_WIN32) && !defined(__APPLE__)
#include <cstddef>
#include <cstring>
#ifdef __cplusplus
extern "C" {
#endif
// Version (major.minor.patch) of the custom runtime interface declared in this
// header.
#define PADDLE_CUSTOM_RUNTIME_MAJOR_VERSION 0
#define PADDLE_CUSTOM_RUNTIME_MINOR_VERSION 1
#define PADDLE_CUSTOM_RUNTIME_PATCH_VERSION 1
// Status code returned by the callbacks of the custom runtime interface.
// C_SUCCESS is 0; the remaining values follow in declaration order.
typedef enum {
C_SUCCESS = 0,  // success
C_WARNING,  // results may not meet expectation (such as an asynchronous
// interface is actually synchronous)
C_FAILED,         // resource exhausted/query failed
C_ERROR,          // invalid argument/wrong usage/uninitialized
C_INTERNAL_ERROR  // plugin error
} C_Status;
// Device handle: pointer to a structure that carries the device id.
typedef struct C_Device_st { int id; } * C_Device;
// Opaque stream handle; C_Stream_st is only forward-declared here.
typedef struct C_Stream_st* C_Stream;
// Opaque event handle; C_Event_st is only forward-declared here.
typedef struct C_Event_st* C_Event;
// Callback signature used with stream_add_callback; it receives the device,
// the stream, the caller-supplied `user_data` and a pointer to a status.
typedef void (*C_Callback)(C_Device device, C_Stream stream, void* user_data,
C_Status* status);
struct C_DeviceInterface {
// The core fills it in and the plugin must check it
size_t size;
///////////////////////
// device manage api //
///////////////////////
/**
* @brief Initialize hardware
*
*/
C_Status (*initialize)();
/**
* @brief Deinitialize hardware
*
*/
C_Status (*finalize)();
/**
* @brief Initialize device
*
* @param[C_Device] device Core fill it with a logical id, and then plugin
* must replace it with a physical id
*/
C_Status (*init_device)(const C_Device device);
/**
* @brief Set current device
*
* @param[C_Device] device Core fill it with a physical id
*/
C_Status (*set_device)(const C_Device device);
/**
* @brief Get current device
*
* @param[C_Device] device Plugin fill it with a physical id
*/
C_Status (*get_device)(const C_Device device);
/**
* @brief Deinitialize device
*
* @param[C_Device] device Core fill it with a physical id
*/
C_Status (*deinit_device)(const C_Device device);
/**
* @brief Create a stream
*
* @param[C_Device] device Core fill it with a physical id
* @param[C_Stream*] stream Plugin create a stream and fill it
*/
C_Status (*create_stream)(const C_Device device, C_Stream* stream);
/**
* @brief Destroy a stream
*
* @param[C_Device] device Core fill it with a physical id
* @param[C_Stream] stream
*/
C_Status (*destroy_stream)(const C_Device device, C_Stream stream);
/**
* @brief Query a stream
*
* @param[C_Device] device Core fill it with a physical id
* @param[C_Stream] stream
*/
C_Status (*query_stream)(const C_Device device, C_Stream stream);
/**
* @brief Add a callback to stream
*
* @param[C_Device] device Core fill it with a physical id
* @param[C_Stream] stream
* @param[C_Callback] callback
* @param[void*] user_data
*/
C_Status (*stream_add_callback)(const C_Device device, C_Stream stream,
C_Callback callback, void* user_data);
/**
* @brief Create an event
*
* @param[C_Device] device Core fill it with a physical id
* @param[C_Event*] event Plugin create an event and fill it
*/
C_Status (*create_event)(const C_Device device, C_Event* event);
/**
* @brief Record an event
*
* @param[C_Device] device Core fill it with a physical id
* @param[C_Stream] stream
* @param[C_Event] event
*/
C_Status (*record_event)(const C_Device device, C_Stream stream,
C_Event event);
/**
* @brief Destroy an event
*
* @param[C_Device] device Core fill it with a physical id
* @param[C_Event] event
*/
C_Status (*destroy_event)(const C_Device device, C_Event event);
/**
* @brief Query an event
*
* @param[C_Device] device Core fill it with a physical id
* @param[C_Event] event
*/
C_Status (*query_event)(const C_Device device, C_Event event);
/**
* @brief Synchronize a device
*
* @param[C_Device] device Core fill it with a physical id
*/
C_Status (*synchronize_device)(const C_Device device);
/**
* @brief Synchronize a stream
*
* @param[C_Device] device Core fill it with a physical id
* @param[C_Stream] stream
*/
C_Status (*synchronize_stream)(const C_Device device, C_Stream stream);
/**
* @brief Synchronize an event
*
* @param[C_Device] device Core fill it with a physical id
* @param[C_Event] event
*/
C_Status (*synchronize_event)(const C_Device device, C_Event event);
/**
* @brief Make a stream wait on an event
*
* @param[C_Device] device Core fill it with a physical id
* @param[C_Stream] stream
* @param[C_Event] event
*/
C_Status (*stream_wait_event)(const C_Device device, C_Stream stream,
C_Event event);
void* reserved_dev_api[8];
///////////////////////
// memory manage api //
///////////////////////
/**
* @brief Device memory allocate
*
* @param[C_Device] device Core fill it with a physical id
* @param[void**] ptr Plugin allocate an address and fill it
* @param[size_t] size
*/
C_Status (*device_memory_allocate)(const C_Device device, void** ptr,
size_t size);
/**
* @brief Device memory deallocate
*
* @param[C_Device] device Core fill it with a physical id
* @param[void*] ptr
* @param[size_t] size
*/
C_Status (*device_memory_deallocate)(const C_Device device, void* ptr,
size_t size);
/**
* @brief Device memory set
*
* @param[C_Device] device Core fill it with a physical id
* @param[void*] ptr
* @param[unsigned char] value
* @param[size_t] size
*/
C_Status (*device_memory_set)(const C_Device device, void* ptr,
unsigned char value, size_t size);
/**
* @brief Host memory allocate
*
* @param[C_Device] device Core fill it with a physical id
* @param[void**] ptr Plugin allocate an address and fill it
* @param[size_t] size
*/
C_Status (*host_memory_allocate)(const C_Device device, void** ptr,
size_t size);
/**
* @brief Host memory deallocate
*
* @param[C_Device] device Core fill it with a physical id
* @param[void*] ptr
* @param[size_t] size
*/
C_Status (*host_memory_deallocate)(const C_Device device, void* ptr,
size_t size);
/**
* @brief Unified memory allocate
*
* @param[C_Device] device Core fill it with a physical id
* @param[void**] ptr Plugin allocate an address and fill it
* @param[size_t] size
*/
C_Status (*unified_memory_allocate)(const C_Device device, void** ptr,
size_t size);
/**
* @brief Unified memory deallocate
*
* @param[C_Device] device Core fill it with a physical id
* @param[void*] ptr
* @param[size_t] size
*/
C_Status (*unified_memory_deallocate)(const C_Device device, void* ptr,
size_t size);
/**
* @brief Memory copy from host to device
*
* @param[C_Device] device Core fill it with a physical id
* @param[void*] dst
* @param[void*] src
* @param[size_t] size
*/
C_Status (*memory_copy_h2d)(const C_Device device, void* dst, const void* src,
size_t size);
/**
* @brief Memory copy from device to host
*
* @param[C_Device] device Core fill it with a physical id
* @param[void*] dst
* @param[void*] src
* @param[size_t] size
*/
C_Status (*memory_copy_d2h)(const C_Device device, void* dst, const void* src,
size_t size);
/**
* @brief Memory copy from device to device
*
* @param[C_Device] device Core fill it with a physical id
* @param[void*] dst
* @param[void*] src
* @param[size_t] size
*/
C_Status (*memory_copy_d2d)(const C_Device device, void* dst, const void* src,
size_t size);
/**
* @brief Peer memory copy from device to device
*
* @param[C_Device] dst_device Core fill it with a physical id
* @param[C_Device] src_device Core fill it with a physical id
* @param[void*] dst
* @param[void*] src
* @param[size_t] size
*/
C_Status (*memory_copy_p2p)(const C_Device dst_device,
const C_Device src_device, void* dst,
const void* src, size_t size);
/**
* @brief Asynchonrize memory copy from host to device
*
* @param[C_Device] device Core fill it with a physical id
* @param[C_Stream] stream
* @param[void*] dst
* @param[void*] src
* @param[size_t] size
*/
C_Status (*async_memory_copy_h2d)(const C_Device device, C_Stream stream,
void* dst, const void* src, size_t size);
/**
* @brief Asynchonrize memory copy from device to host
*
* @param[C_Device] device Core fill it with a physical id
* @param[C_Stream] stream
* @param[void*] dst
* @param[void*] src
* @param[size_t] size
*/
C_Status (*async_memory_copy_d2h)(const C_Device device, C_Stream stream,
void* dst, const void* src, size_t size);
/**
* @brief Asynchonrize memory copy from device to device
*
* @param[C_Device] device Core fill it with a physical id
* @param[C_Stream] stream
* @param[void*] dst
* @param[void*] src
* @param[size_t] size
*/
C_Status (*async_memory_copy_d2d)(const C_Device device, C_Stream stream,
void* dst, const void* src, size_t size);
/**
* @brief Peer asynchonrize memory copy from host to device
*
* @param[C_Device] device Core fill it with a physical id
* @param[C_Stream] stream
* @param[void*] dst
* @param[void*] src
* @param[size_t] size
*/
C_Status (*async_memory_copy_p2p)(const C_Device dst_device,
const C_Device src_device, C_Stream stream,
void* dst, const void* src, size_t size);
void* reserved_mem_api[8];
//////////////
// info api //
//////////////
/**
* @brief Get visible device count
*
* @param[size_t*] count Plugin fill it
*/
C_Status (*get_device_count)(size_t* count);
/**
* @brief Get visible device list
*
* @param[size_t*] devices Plugin fill it
*/
C_Status (*get_device_list)(size_t* devices);
/**
* @brief Device memory statistic
*
* @param[C_Device] device Core fill it with a physical id
* @param[size_t*] total_memory
* @param[size_t*] free_memory
* @param[size_t*] used_memory
*/
C_Status (*device_memory_stats)(const C_Device device, size_t* total_memory,
size_t* free_memory);
/**
* @brief Device minimum chunk size
*
* @param[size_t*] count
*/
C_Status (*device_min_chunk_size)(const C_Device device, size_t* count);
/**
* @brief Device maximum chunk size
*
* @param[size_t*] count
*/
C_Status (*device_max_chunk_size)(const C_Device device, size_t* count);
/**
* @brief Device maximum alloc size
*
* @param[size_t*] count
*/
C_Status (*device_max_alloc_size)(const C_Device device, size_t* count);
/**
* @brief Device extra padding size
*
* @param[size_t*] size
*/
C_Status (*device_extra_padding_size)(const C_Device device, size_t* size);
/**
* @brief Device initial allocated size
*
* @param[size_t*] size
*/
C_Status (*device_init_alloc_size)(const C_Device device, size_t* size);
/**
* @brief Device reallocated size
*
* @param[size_t*] size
*/
C_Status (*device_realloc_size)(const C_Device device, size_t* size);
/**
* @brief Get compute capability
*
* @param[size_t*] compute_capability
*/
C_Status (*get_compute_capability)(size_t* compute_capability);
/**
* @brief Get runtime version
*
* @param[size_t*] version
*/
C_Status (*get_runtime_version)(size_t* version);
/**
* @brief Get driver version
*
* @param[size_t*] version
*/
C_Status (*get_driver_version)(size_t* version);
void* reserved_info_api[8];
///////////////
// other api //
///////////////
void* reserved_other_api[8];
};
// Version triple of the custom runtime, stored as three size_t fields in the
// order major, minor, patch.
struct CustomRuntimeVersion {
  size_t major;
  size_t minor;
  size_t patch;
};
// Parameter block handed to the plugin's InitPlugin(). Field order is part
// of the binary layout, so it must not be changed.
struct CustomRuntimeParams {
// Core fills it and the plugin must check it
size_t size;
// Plugin fills it
C_DeviceInterface* interface;
// Plugin fills it and Core will check it
CustomRuntimeVersion version;
// Plugin fills it
char* device_type;
// Plugin fills it
char* sub_device_type;
// Reserved bytes at the end of the struct.
char reserved[32];
};
// Entry point of a plugin: the plugin implements it and fills the
// CustomRuntimeParams it is given. Only declared here, with C linkage when
// compiled as C++.
void InitPlugin(CustomRuntimeParams*);
#ifdef __cplusplus
} // extern "C"
#endif
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/device_guard.h"
namespace paddle {
namespace platform {
// Even though this source file does not contain any code, it is better to
// keep it for the cmake dependency.
} // namespace platform
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/platform/device/device_manager.h"
namespace paddle {
namespace platform {
// Scoped device switch: on construction makes the device of `place` the
// current device of its type (only if it differs from the current one), and
// restores the previously current device on destruction.
class DeviceGuard {
 public:
  // Resolves the device type first, then the current device id of that type,
  // then the target id taken from `place`.
  explicit DeviceGuard(const Place& place)
      : dev_type_(PlaceHelper::GetDeviceType(place)),
        prev_id_(DeviceManager::GetDevice(dev_type_)),
        cur_id_(PlaceHelper::GetDeviceId(place)) {
    if (NeedSwitch()) {
      DeviceManager::SetDevice(dev_type_, cur_id_);
    }
  }

  // Switches back to the device that was current at construction time.
  ~DeviceGuard() {
    if (NeedSwitch()) {
      DeviceManager::SetDevice(dev_type_, prev_id_);
    }
  }

  // Not copyable.
  DeviceGuard(const DeviceGuard& o) = delete;
  DeviceGuard& operator=(const DeviceGuard& o) = delete;

 private:
  // True when the target device differs from the one that was current.
  bool NeedSwitch() const { return cur_id_ != prev_id_; }

  // Declared first because prev_id_ is initialized from it.
  std::string dev_type_;
  size_t prev_id_;
  size_t cur_id_;
};
} // namespace platform
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/fluid/platform/device/device_manager.h"
#if !defined(_WIN32)
#include <dirent.h>
#else
#endif
#include <functional>
#include <regex>
namespace paddle {
namespace platform {
// Creates a stream by forwarding to the backend implementation together with
// this device's id and the requested priority and flag.
void Device::CreateStream(stream::Stream* stream,
const stream::Stream::Priority& priority,
const stream::Stream::Flag& flag) {
impl_->CreateStream(dev_id_, stream, priority, flag);
}
// Destroys `stream` by forwarding to the backend with this device's id.
void Device::DestroyStream(stream::Stream* stream) {
impl_->DestroyStream(dev_id_, stream);
}
// Forwards a stream synchronization request to the backend with this
// device's id.
void Device::SynchronizeStream(const stream::Stream* stream) {
impl_->SynchronizeStream(dev_id_, stream);
}
// Returns the backend's answer to a stream query for this device's id.
bool Device::QueryStream(const stream::Stream* stream) {
return impl_->QueryStream(dev_id_, stream);
}
// Forwards `callback` for `stream` to the backend with this device's id.
void Device::AddCallback(stream::Stream* stream,
stream::Stream::Callback* callback) {
impl_->AddCallback(dev_id_, stream, callback);
}
// Creates an event by forwarding to the backend with this device's id and
// the given flags.
void Device::CreateEvent(event::Event* event, event::Event::Flag flags) {
impl_->CreateEvent(dev_id_, event, flags);
}
// Destroys `event` by forwarding to the backend with this device's id.
void Device::DestroyEvent(event::Event* event) {
impl_->DestroyEvent(dev_id_, event);
}
// Forwards an event-record request for `event` and `stream` to the backend
// with this device's id.
void Device::RecordEvent(const event::Event* event,
const stream::Stream* stream) {
impl_->RecordEvent(dev_id_, event, stream);
}
// Forwards an event synchronization request to the backend with this
// device's id.
void Device::SynchronizeEvent(const event::Event* event) {
impl_->SynchronizeEvent(dev_id_, event);
}
// Returns the backend's answer to an event query for this device's id.
bool Device::QueryEvent(const event::Event* event) {
return impl_->QueryEvent(dev_id_, event);
}
// Forwards a request to make `stream` wait on `event` to the backend with
// this device's id.
void Device::StreamWaitEvent(const stream::Stream* stream,
const event::Event* event) {
impl_->StreamWaitEvent(dev_id_, stream, event);
}
// Host-to-device copy of `size` bytes from `src` to `dst`, forwarded to the
// backend with this device's id; `stream` is passed through unchanged.
void Device::MemoryCopyH2D(void* dst, const void* src, size_t size,
const stream::Stream* stream) {
impl_->MemoryCopyH2D(dev_id_, dst, src, size, stream);
}
// Device-to-host copy of `size` bytes from `src` to `dst`, forwarded to the
// backend with this device's id; `stream` is passed through unchanged.
void Device::MemoryCopyD2H(void* dst, const void* src, size_t size,
const stream::Stream* stream) {
impl_->MemoryCopyD2H(dev_id_, dst, src, size, stream);
}
// Device-to-device copy of `size` bytes from `src` to `dst`, forwarded to the
// backend with this device's id; `stream` is passed through unchanged.
void Device::MemoryCopyD2D(void* dst, const void* src, size_t size,
const stream::Stream* stream) {
impl_->MemoryCopyD2D(dev_id_, dst, src, size, stream);
}
// Peer copy of `size` bytes from `src` to `dst` located at `dst_place`,
// forwarded to the backend. This device's id is passed right before `src`,
// i.e. presumably as the source device — confirm against
// DeviceInterface::MemoryCopyP2P.
void Device::MemoryCopyP2P(const Place& dst_place, void* dst, const void* src,
size_t size, const stream::Stream* stream) {
impl_->MemoryCopyP2P(dst_place, dst, dev_id_, src, size, stream);
}
// Returns the pointer produced by the backend's MemoryAllocate for this
// device's id and `size`.
void* Device::MemoryAllocate(size_t size) {
return impl_->MemoryAllocate(dev_id_, size);
}
// Forwards `ptr` and `size` to the backend's MemoryDeallocate with this
// device's id.
void Device::MemoryDeallocate(void* ptr, size_t size) {
impl_->MemoryDeallocate(dev_id_, ptr, size);
}
// Returns the pointer produced by the backend's MemoryAllocateHost for this
// device's id and `size`.
void* Device::MemoryAllocateHost(size_t size) {
return impl_->MemoryAllocateHost(dev_id_, size);
}
// Forwards `ptr` and `size` to the backend's MemoryDeallocateHost with this
// device's id.
void Device::MemoryDeallocateHost(void* ptr, size_t size) {
impl_->MemoryDeallocateHost(dev_id_, ptr, size);
}
// Returns the pointer produced by the backend's MemoryAllocateUnified for
// this device's id and `size`.
void* Device::MemoryAllocateUnified(size_t size) {
return impl_->MemoryAllocateUnified(dev_id_, size);
}
// Forwards `ptr` and `size` to the backend's MemoryDeallocateUnified with
// this device's id.
void Device::MemoryDeallocateUnified(void* ptr, size_t size) {
impl_->MemoryDeallocateUnified(dev_id_, ptr, size);
}
// Forwards a memory-set request (`ptr`, byte `value`, `size`) to the backend
// with this device's id.
void Device::MemorySet(void* ptr, uint8_t value, size_t size) {
impl_->MemorySet(dev_id_, ptr, value, size);
}
// Returns the type string reported by the backend implementation.
std::string Device::Type() { return impl_->Type(); }
// File-local reader/writer lock for DeviceManager's maps: Register() takes
// it for writing, the lookup and listing functions take it for reading.
static pten::RWLock _global_device_manager_rw_lock;
bool DeviceManager::Register(std::unique_ptr<DeviceInterface> device_impl) {
pten::AutoWRLock lock(&_global_device_manager_rw_lock);
VLOG(4) << "Register Device - " << device_impl->Type();
auto device_type = device_impl->Type();
auto& dev_impl_map = Instance().device_impl_map_;
auto& dev_map = Instance().device_map_;
if (dev_impl_map.find(device_type) == dev_impl_map.end()) {
dev_impl_map.insert(
std::pair<std::string, std::unique_ptr<DeviceInterface>>(
device_type, std::move(device_impl)));
auto& dev_impl = dev_impl_map[device_type];
auto& dev_vec = dev_map[device_type];
VLOG(4) << "GetDeviceCount is " << dev_impl->GetDeviceCount();
for (size_t i = 0; i < dev_impl->GetDeviceCount(); ++i) {
dev_vec.emplace_back(new Device(i, dev_impl.get()));
}
} else {
auto& plat = dev_impl_map[device_type];
if (plat->IsCustom() && plat->Priority() > device_impl->Priority()) {
dev_impl_map[device_type] = std::move(device_impl);
auto& dev_impl = dev_impl_map[device_type];
auto& dev_vec = dev_map[device_type];
dev_vec.clear();
VLOG(4) << "GetDeviceCount is " << dev_impl->GetDeviceCount();
for (size_t i = 0; i < dev_impl->GetDeviceCount(); ++i) {
dev_vec.emplace_back(new Device(i, dev_impl.get()));
}
} else {
return false;
}
}
return true;
}
// Looks up the backend registered for `device_type` under the read lock and
// returns a non-owning pointer to it. Logs an error and raises a Fatal
// error through PADDLE_THROW when the type is not registered.
DeviceInterface* DeviceManager::GetDeviceInterfaceWithType(
    const std::string& device_type) {
  pten::AutoRDLock lock(&_global_device_manager_rw_lock);
  auto& dev_impl_map = Instance().device_impl_map_;
  const auto found = dev_impl_map.find(device_type);
  if (found != dev_impl_map.end()) {
    return found->second.get();
  }
  LOG(ERROR) << "GetDeviceInterfaceWithType - " << device_type << " Failed\n";
  PADDLE_THROW(
      platform::errors::Fatal("Unregistered device type %s.", device_type));
  return nullptr;
}
// Returns a non-owning pointer to the Device object that matches the type
// and id of `place`. Enforces that the type is known (NotFound) and that
// the id is below the number of Device objects of that type (OutOfRange).
Device* DeviceManager::GetDeviceWithPlace(const Place& place) {
  pten::AutoRDLock lock(&_global_device_manager_rw_lock);
  auto& dev_map = Instance().device_map_;
  auto dev_type = PlaceHelper::GetDeviceType(place);
  auto dev_id = PlaceHelper::GetDeviceId(place);

  auto found = dev_map.find(dev_type);
  PADDLE_ENFORCE_NE(found, dev_map.end(),
                    platform::errors::NotFound(
                        "Unable to find Device with type %s.", dev_type));

  // The lookup above succeeded, so reuse its result.
  auto& dev_vec = found->second;
  PADDLE_ENFORCE_LT(
      dev_id, dev_vec.size(),
      platform::errors::OutOfRange(
          "The visible devices count of type %s is %d, but dev_id is %d.",
          dev_type, dev_vec.size(), dev_id));
  return dev_vec[dev_id].get();
}
// Returns the type names of all registered backends, in the iteration order
// of the backend map.
std::vector<std::string> DeviceManager::GetAllDeviceTypes() {
  pten::AutoRDLock lock(&_global_device_manager_rw_lock);
  std::vector<std::string> type_names;
  for (const auto& entry : Instance().device_impl_map_) {
    type_names.push_back(entry.first);
  }
  return type_names;
}
// Returns the type names of the registered backends whose IsCustom() is
// true, in the iteration order of the backend map.
std::vector<std::string> DeviceManager::GetAllCustomDeviceTypes() {
  pten::AutoRDLock lock(&_global_device_manager_rw_lock);
  std::vector<std::string> type_names;
  for (const auto& entry : Instance().device_impl_map_) {
    if (!entry.second->IsCustom()) {
      continue;
    }
    type_names.push_back(entry.first);
  }
  return type_names;
}
// Returns one name per device of every registered backend. A backend that
// reports exactly one device contributes its bare type name; otherwise it
// contributes "<type>:<index>" for each index below its device count.
std::vector<std::string> DeviceManager::GetAllDeviceList() {
  pten::AutoRDLock lock(&_global_device_manager_rw_lock);
  std::vector<std::string> names;
  for (const auto& entry : Instance().device_impl_map_) {
    const auto& impl = entry.second;
    const size_t count = impl->GetDeviceCount();
    const std::string type = impl->Type();
    if (count == 1) {
      names.push_back(type);
      continue;
    }
    for (size_t idx = 0; idx < count; ++idx) {
      names.push_back(type + ":" + std::to_string(idx));
    }
  }
  return names;
}
// Same naming scheme as GetAllDeviceList(), restricted to backends whose
// IsCustom() is true: the bare type name for a single device, otherwise
// "<type>:<index>" per device.
std::vector<std::string> DeviceManager::GetAllCustomDeviceList() {
  pten::AutoRDLock lock(&_global_device_manager_rw_lock);
  std::vector<std::string> names;
  for (const auto& entry : Instance().device_impl_map_) {
    const auto& impl = entry.second;
    // Count and type are queried before the custom check, for every backend.
    const size_t count = impl->GetDeviceCount();
    const std::string type = impl->Type();
    if (!impl->IsCustom()) {
      continue;
    }
    if (count == 1) {
      names.push_back(type);
      continue;
    }
    for (size_t idx = 0; idx < count; ++idx) {
      names.push_back(type + ":" + std::to_string(idx));
    }
  }
  return names;
}
bool DeviceManager::HasDeviceType(const std::string& device_type) {
auto dev_impl = GetDeviceInterfaceWithType(device_type);
return dev_impl != nullptr;
}
// Returns IsCustom() of the backend registered for `device_type`; the
// lookup raises an error for an unregistered type.
bool DeviceManager::IsCustom(const std::string& device_type) {
  return GetDeviceInterfaceWithType(device_type)->IsCustom();
}
// Calls Initialize() on the backend registered for `device_type`.
void DeviceManager::Initialize(const std::string& device_type) {
  GetDeviceInterfaceWithType(device_type)->Initialize();
}
// Calls Finalize() on the backend registered for `device_type`.
void DeviceManager::Finalize(const std::string& device_type) {
  GetDeviceInterfaceWithType(device_type)->Finalize();
}
// Resolves the type and id of `place` (in that order) and asks the matching
// backend to synchronize that device.
void DeviceManager::SynchronizeDevice(const Place& place) {
  const auto type = PlaceHelper::GetDeviceType(place);
  const auto id = PlaceHelper::GetDeviceId(place);
  GetDeviceInterfaceWithType(type)->SynchronizeDevice(id);
}
// Resolves the type and id of `place` (in that order) and asks the matching
// backend to initialize that device.
void DeviceManager::InitDevice(const Place& place) {
  const auto type = PlaceHelper::GetDeviceType(place);
  const auto id = PlaceHelper::GetDeviceId(place);
  GetDeviceInterfaceWithType(type)->InitDevice(id);
}
// Resolves the type and id of `place` (in that order) and asks the matching
// backend to deinitialize that device.
void DeviceManager::DeInitDevice(const Place& place) {
  const auto type = PlaceHelper::GetDeviceType(place);
  const auto id = PlaceHelper::GetDeviceId(place);
  GetDeviceInterfaceWithType(type)->DeInitDevice(id);
}
// Makes `device_id` the current device of the backend registered for
// `device_type`.
void DeviceManager::SetDevice(const std::string& device_type,
                              size_t device_id) {
  GetDeviceInterfaceWithType(device_type)->SetDevice(device_id);
}
// Place-based overload: resolves the type and id of `place` (in that order)
// and delegates to SetDevice(device_type, device_id).
void DeviceManager::SetDevice(const Place& place) {
  const auto type = PlaceHelper::GetDeviceType(place);
  const auto id = PlaceHelper::GetDeviceId(place);
  DeviceManager::SetDevice(type, id);
}
// Returns GetDevice() of the backend registered for `device_type`.
int DeviceManager::GetDevice(const std::string& device_type) {
  return GetDeviceInterfaceWithType(device_type)->GetDevice();
}
// Returns the backend's GetMinChunkSize() for the device named by `place`.
size_t DeviceManager::GetMinChunkSize(const Place& place) {
  const auto type = PlaceHelper::GetDeviceType(place);
  const auto id = PlaceHelper::GetDeviceId(place);
  return GetDeviceInterfaceWithType(type)->GetMinChunkSize(id);
}
// Returns the backend's GetMaxChunkSize() for the device named by `place`.
size_t DeviceManager::GetMaxChunkSize(const Place& place) {
  const auto type = PlaceHelper::GetDeviceType(place);
  const auto id = PlaceHelper::GetDeviceId(place);
  return GetDeviceInterfaceWithType(type)->GetMaxChunkSize(id);
}
// Returns the backend's GetMaxAllocSize() for the device named by `place`.
size_t DeviceManager::GetMaxAllocSize(const Place& place) {
  const auto type = PlaceHelper::GetDeviceType(place);
  const auto id = PlaceHelper::GetDeviceId(place);
  return GetDeviceInterfaceWithType(type)->GetMaxAllocSize(id);
}
// Returns the backend's GetInitAllocSize() for the device named by `place`.
size_t DeviceManager::GetInitAllocSize(const Place& place) {
  const auto type = PlaceHelper::GetDeviceType(place);
  const auto id = PlaceHelper::GetDeviceId(place);
  return GetDeviceInterfaceWithType(type)->GetInitAllocSize(id);
}
// Returns the backend's GetReallocSize() for the device named by `place`.
size_t DeviceManager::GetReallocSize(const Place& place) {
  const auto type = PlaceHelper::GetDeviceType(place);
  const auto id = PlaceHelper::GetDeviceId(place);
  return GetDeviceInterfaceWithType(type)->GetReallocSize(id);
}
// Returns the backend's GetExtraPaddingSize() for the device named by
// `place`.
size_t DeviceManager::GetExtraPaddingSize(const Place& place) {
  const auto type = PlaceHelper::GetDeviceType(place);
  const auto id = PlaceHelper::GetDeviceId(place);
  return GetDeviceInterfaceWithType(type)->GetExtraPaddingSize(id);
}
// Forwards the `total` and `free` output pointers to the backend's
// MemoryStats() for the device named by `place`.
void DeviceManager::MemoryStats(const Place& place, size_t* total,
                                size_t* free) {
  const auto type = PlaceHelper::GetDeviceType(place);
  const auto id = PlaceHelper::GetDeviceId(place);
  GetDeviceInterfaceWithType(type)->MemoryStats(id, total, free);
}
// Returns GetDeviceCount() of the backend registered for `device_type`.
size_t DeviceManager::GetDeviceCount(const std::string& device_type) {
  return GetDeviceInterfaceWithType(device_type)->GetDeviceCount();
}
// Returns GetDeviceList() of the backend registered for `device_type`.
std::vector<size_t> DeviceManager::GetDeviceList(
    const std::string& device_type) {
  return GetDeviceInterfaceWithType(device_type)->GetDeviceList();
}
// Returns the single DeviceManager, a function-local static that is created
// on first use.
DeviceManager& DeviceManager::Instance() {
  static DeviceManager instance;
  return instance;
}
// Lists the shared libraries found directly inside `library_dir`.
//
// A directory entry is selected when its name ends with ".so"; the returned
// paths have the form "<library_dir>/<name>". Sub-directories are not
// traversed.
//
// @param library_dir  Directory to scan.
// @return Paths of the matching entries; empty when the directory cannot be
//         opened (this is only logged at VLOG level 4).
std::vector<std::string> ListAllLibraries(const std::string& library_dir) {
  std::vector<std::string> libraries;

  // Own the DIR handle so it is closed on every exit path, including an
  // exception thrown while growing `libraries`.
  std::unique_ptr<DIR, int (*)(DIR*)> dir(opendir(library_dir.c_str()),
                                          &closedir);
  if (dir == nullptr) {
    VLOG(4) << "open CustomDevice library_dir: " << library_dir << " failed";
    return libraries;
  }

  // A plain suffix comparison selects the same names as matching the whole
  // name against ".*\.so", without building a regex on every call.
  static const char kSuffix[] = ".so";
  const size_t suffix_len = sizeof(kSuffix) - 1;

  while (const dirent* entry = readdir(dir.get())) {
    const std::string filename(entry->d_name);
    const bool is_library =
        filename.size() >= suffix_len &&
        filename.compare(filename.size() - suffix_len, suffix_len, kSuffix) ==
            0;
    if (is_library) {
      libraries.push_back(library_dir + '/' + filename);
      VLOG(4) << "found CustomDevice library: " << libraries.back();
    }
  }
  return libraries;
}
bool LoadCustomDevice(const std::string& library_dir) {
std::vector<std::string> libs = ListAllLibraries(library_dir);
for (const auto& lib_path : libs) {
auto dso_handle = dlopen(lib_path.c_str(), RTLD_NOW);
LoadCustomRuntimeLib(dso_handle);
}
return true;
}
} // namespace platform
} // namespace paddle
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/fluid/platform/device/device_base.h"
#include "paddle/fluid/platform/device/device_ext.h"
#include "paddle/fluid/platform/device/event.h"
#include "paddle/fluid/platform/device/stream.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/pten/backends/dynload/port.h"
#include "paddle/pten/core/utils/rw_lock.h"
namespace paddle {
namespace platform {
// A single device of some backend: pairs a device id with a non-owning
// pointer to the backend implementation and forwards every operation to that
// implementation together with the id.
class Device final {
public:
// Stores the id and the implementation pointer; ownership of `impl` is not
// taken.
Device(size_t dev_id, DeviceInterface* impl) : dev_id_(dev_id), impl_(impl) {}
// Stream
// ! Create an asynchronous stream
void CreateStream(
stream::Stream* stream, const stream::Stream::Priority& priority =
stream::Stream::Priority::kNormal,
const stream::Stream::Flag& flag = stream::Stream::Flag::kDefaultFlag);
// ! Destroys an asynchronous stream.
void DestroyStream(stream::Stream* stream);
// ! Waits for stream tasks to complete.
void SynchronizeStream(const stream::Stream* stream);
// ! Queries an asynchronous stream for completion status.
bool QueryStream(const stream::Stream* stream);
// ! Add a callback to a compute stream.
void AddCallback(stream::Stream* stream, stream::Stream::Callback* callback);
// Event
// ! Create an event.
void CreateEvent(event::Event* event, event::Event::Flag flags);
// ! Destroy an event.
void DestroyEvent(event::Event* event);
// ! Records an event.
void RecordEvent(const event::Event* event, const stream::Stream* stream);
// ! Waits for event to complete.
void SynchronizeEvent(const event::Event* event);
// ! Queries an event for completion status.
bool QueryEvent(const event::Event* event);
// ! Make a compute stream wait on an event
void StreamWaitEvent(const stream::Stream* stream, const event::Event* event);
// Memory
// Copy helpers; `stream` defaults to nullptr and is passed through to the
// backend as is.
void MemoryCopyH2D(void* dst, const void* src, size_t size,
const stream::Stream* stream = nullptr);
void MemoryCopyD2H(void* dst, const void* src, size_t size,
const stream::Stream* stream = nullptr);
void MemoryCopyD2D(void* dst, const void* src, size_t size,
const stream::Stream* stream = nullptr);
void MemoryCopyP2P(const Place& dst_place, void* dst, const void* src,
size_t size, const stream::Stream* stream = nullptr);
// Allocation helpers for device, host and unified memory.
void* MemoryAllocate(size_t size);
void MemoryDeallocate(void* ptr, size_t size);
void* MemoryAllocateHost(size_t size);
void MemoryDeallocateHost(void* ptr, size_t size);
void* MemoryAllocateUnified(size_t size);
void MemoryDeallocateUnified(void* ptr, size_t size);
void MemorySet(void* ptr, uint8_t value, size_t size);
// Type name reported by the backend implementation.
std::string Type();
private:
// Id passed to the backend with every call.
size_t dev_id_;
// Backend implementation; not owned.
DeviceInterface* impl_;
};
// Process-wide registry of device backends. All public members are static
// and operate on a single private instance; backends are looked up by their
// type name, devices by a Place.
class DeviceManager {
public:
// Registers a backend and takes ownership of it; returns whether it was
// accepted.
static bool Register(std::unique_ptr<DeviceInterface> device);
// NOTE(review): only declared here; no definition is visible in this
// part of the source.
static bool RegisterPinnedDevice(DeviceInterface* device);
// Returns the Device matching the type and id of `place` (non-owning).
static Device* GetDeviceWithPlace(const Place& place);
// Type names of all / custom-only backends.
static std::vector<std::string> GetAllDeviceTypes();
static std::vector<std::string> GetAllCustomDeviceTypes();
// Per-device names of all / custom-only backends.
static std::vector<std::string> GetAllDeviceList();
static std::vector<std::string> GetAllCustomDeviceList();
static bool HasDeviceType(const std::string& device_type);
static bool IsCustom(const std::string& device_type);
// platform & device
static void Initialize(const std::string& device_type);
static void Finalize(const std::string& device_type);
static void SynchronizeDevice(const Place& place);
static void InitDevice(const Place& place);
static void DeInitDevice(const Place& place);
static void SetDevice(const std::string& device_type, size_t device_id);
static void SetDevice(const Place& place);
static int GetDevice(const std::string& device_type);
// Allocation size queries, forwarded to the backend of `place`.
static size_t GetMinChunkSize(const Place& place);
static size_t GetMaxChunkSize(const Place& place);
static size_t GetMaxAllocSize(const Place& place);
static size_t GetInitAllocSize(const Place& place);
static size_t GetReallocSize(const Place& place);
static size_t GetExtraPaddingSize(const Place& place);
static void MemoryStats(const Place& place, size_t* total, size_t* free);
static size_t GetDeviceCount(const std::string& device_type);
static std::vector<size_t> GetDeviceList(const std::string& device_type);
private:
DISABLE_COPY_AND_ASSIGN(DeviceManager);
// Only Instance() creates the manager.
DeviceManager() {}
static DeviceManager& Instance();
// Returns the backend registered for `device_type` (non-owning).
static DeviceInterface* GetDeviceInterfaceWithType(
const std::string& device_type);
// Backend implementations, keyed by device type name; owned here.
std::unordered_map<std::string, std::unique_ptr<DeviceInterface>>
device_impl_map_;
// Device objects of each backend, keyed by device type name; owned here.
std::unordered_map<std::string, std::vector<std::unique_ptr<Device>>>
device_map_;
};
// Loads a custom runtime from an already opened shared library handle.
// Only declared here; the definition is not in this header.
bool LoadCustomRuntimeLib(void* dso_handle);
// Overload that additionally receives the runtime parameters and takes
// ownership of a C_DeviceInterface. Only declared here.
bool LoadCustomRuntimeLib(const CustomRuntimeParams& runtime_params,
std::unique_ptr<C_DeviceInterface> device_interface,
void* dso_handle);
// Scans a directory for shared libraries and loads each one through
// LoadCustomRuntimeLib(void*); `library_path` is used as the directory to
// scan.
bool LoadCustomDevice(const std::string& library_path);
class Registrar {
public:
template <typename DeviceT>
explicit Registrar(DeviceT* device_ptr) {
DeviceManager::Register(std::unique_ptr<DeviceT>(device_ptr));
}
void Touch() {}
};
} // namespace platform
} // namespace paddle
#endif
...@@ -38,3 +38,12 @@ limitations under the License. */ ...@@ -38,3 +38,12 @@ limitations under the License. */
#ifdef PADDLE_WITH_IPU #ifdef PADDLE_WITH_IPU
#include "paddle/fluid/platform/device/ipu/ipu_info.h" #include "paddle/fluid/platform/device/ipu/ipu_info.h"
#endif #endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/fluid/platform/device/callback_manager.h"
#include "paddle/fluid/platform/device/custom/enforce_custom.h"
#include "paddle/fluid/platform/device/device_guard.h"
#include "paddle/fluid/platform/device/device_manager.h"
#include "paddle/fluid/platform/device/event.h"
#include "paddle/fluid/platform/device/stream.h"
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/event.h"
#include "paddle/fluid/platform/device/device_guard.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device/stream.h"
namespace paddle {
namespace platform {
namespace event {
// Returns the opaque handle currently stored in this Event.
event_t Event::raw_event() const {
  return this->event_;
}

// Overwrites the stored opaque handle with `event`; nothing else is updated.
void Event::set_event(event_t event) {
  this->event_ = event;
}
// Wraps an already existing handle. The device is looked up from `place`, and
// own_data_ is cleared so that Destroy() leaves the wrapped handle alone.
Event::Event(const Place& place, event_t event)
    : place_(place),
      device_(platform::DeviceManager::GetDeviceWithPlace(place)),
      event_(event),
      own_data_(false) {
}

// Releases the handle through Destroy(), which is a no-op for non-owning
// events.
Event::~Event() {
  Destroy();
}
bool Event::Init(const Place& place, Flag flags) {
place_ = place;
DeviceGuard guard(place_);
device_->CreateEvent(this, flags);
VLOG(3) << "Init Event: " << event_ << ", place: " << place_
<< ", flag:" << static_cast<int>(flags);
own_data_ = true;
return true;
}
// Destroys the device event if this object owns it; otherwise does nothing.
// Clearing own_data_ afterwards makes repeated calls harmless.
void Event::Destroy() {
  if (!own_data_) {
    return;
  }

  DeviceGuard guard(place_);
  device_->DestroyEvent(this);
  own_data_ = false;
}
// Records this event on `stream` by delegating to Stream::RecordEvent.
void Event::Record(const stream::Stream* stream) {
  stream->RecordEvent(this);
}

// Forwards to Device::QueryEvent and returns its result.
bool Event::Query() const {
  const bool query_result = device_->QueryEvent(this);
  return query_result;
}

// Forwards to Device::SynchronizeEvent. The misspelled name is part of the
// public interface and is kept as is.
void Event::Synchonrize() const {
  device_->SynchronizeEvent(this);
}

// Returns the place this event is bound to.
const Place& Event::GetPlace() const {
  return this->place_;
}
} // namespace event
} // namespace platform
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace platform {
class Device;
namespace stream {
class Stream;
} // namespace stream
namespace event {
using event_t = void*;
// Wrapper around an opaque device event handle (event_t) that is bound to a
// Place and to the Device that serves that place.
class Event {
public:
// Creation flags; Init() forwards the chosen value to Device::CreateEvent.
enum Flag {
Default = 0x0,
BlockingSync = 0x1,
DisableTiming = 0x2,
Interprocess = 0x4,
};
// Compatibility constructor: wraps an existing handle without owning it, so
// Destroy() and the destructor will not release `event`.
Event(const Place& place, event_t event);
// Calls Destroy().
~Event();
// Accessors for the raw handle.
event_t raw_event() const;
void set_event(event_t event);
// Creates a new event on `place` and takes ownership of it.
bool Init(const Place& place, Flag flags = Flag::Default);
// Destroys the event on the device when this object owns it.
void Destroy();
// Records this event on `stream`.
void Record(const stream::Stream* stream);
// Forwards to Device::QueryEvent.
bool Query() const;
// Forwards to Device::SynchronizeEvent (the name is misspelled in the
// public API).
void Synchonrize() const;
const Place& GetPlace() const;
private:
DISABLE_COPY_AND_ASSIGN(Event);
Place place_;
// Non-owning pointer obtained from DeviceManager::GetDeviceWithPlace.
Device* device_;
event_t event_;
// True when this object created event_ and must destroy it.
bool own_data_ = true;
};
} // namespace event
} // namespace platform
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/stream.h"
#include "paddle/fluid/platform/device/device_guard.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device/event.h"
namespace paddle {
namespace platform {
namespace stream {
// Releases the stream through Destroy(), which only acts on owned streams.
Stream::~Stream() {
  Destroy();
}

// Returns a reference to the stored opaque stream handle.
const stream_t& Stream::raw_stream() const {
  return this->stream_;
}

// Overwrites the stored opaque handle with `stream`; nothing else is updated.
void Stream::set_stream(stream_t stream) {
  this->stream_ = stream;
}
// For compatibility: wraps an already existing stream handle. own_data_ is
// cleared so that Destroy() leaves the wrapped handle alone.
Stream::Stream(const Place& place, stream_t stream)
    : place_(place),
      device_(platform::DeviceManager::GetDeviceWithPlace(place)),
      stream_(stream),
      callback_manager_(new CallbackManager(this)),
      own_data_(false) {
}
// Creates a new device stream on `place`, installs a fresh CallbackManager
// and marks this object as the owner of the stream. Always returns true.
bool Stream::Init(const Place& place, const Priority& priority,
                  const Flag& flag) {
  place_ = place;
  device_ = platform::DeviceManager::GetDeviceWithPlace(place);

  // Make `place_` the current device while the stream is being created.
  DeviceGuard guard(place_);
  device_->CreateStream(this, priority, flag);
  callback_manager_.reset(new CallbackManager(this));

  const int priority_value = static_cast<int>(priority);
  const int flag_value = static_cast<int>(flag);
  VLOG(3) << "Init Stream: " << stream_ << ", place: " << place_
          << ", priority: " << priority_value << ", flag:" << flag_value;

  own_data_ = true;
  return true;
}
// Invokes `callback` synchronously on the calling thread and then records
// `event` on this stream.
void Stream::RecordEvent(event::Event* event, Callback callback) const {
  callback();
  this->device_->RecordEvent(event, this);
}

// Records `event` on this stream.
void Stream::RecordEvent(event::Event* event) const {
  this->device_->RecordEvent(event, this);
}

// Forwards to Device::StreamWaitEvent for this stream and `event`.
void Stream::WaitEvent(event::Event* event) const {
  this->device_->StreamWaitEvent(this, event);
}
// Blocks until the stream is idle. On Windows this polls QueryStream in a
// busy loop; on every other platform it calls SynchronizeStream once.
void Stream::Wait() const {
#if defined(_WIN32)
  while (!device_->QueryStream(this)) {
    // Keep polling until the device reports completion.
  }
#else
  device_->SynchronizeStream(this);
#endif
}
// Delegates to CallbackManager::Wait on this stream's callback manager.
void Stream::WaitCallback() const {
  callback_manager_->Wait();
}
// Destroys the device stream if this object owns it; otherwise does nothing.
// Clearing own_data_ afterwards makes repeated calls harmless.
void Stream::Destroy() {
  if (!own_data_) {
    return;
  }

  DeviceGuard guard(place_);
  device_->DestroyStream(this);
  own_data_ = false;
}
// Forwards to Device::QueryStream and returns its result.
bool Stream::Query() const {
  const bool query_result = device_->QueryStream(this);
  return query_result;
}

// Forwards to Device::SynchronizeStream.
void Stream::Synchronize() const {
  device_->SynchronizeStream(this);
}

// Returns the place this stream is bound to.
const Place& Stream::GetPlace() const {
  return this->place_;
}
} // namespace stream
} // namespace platform
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/platform/device/callback_manager.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace platform {
class Device;
namespace event {
class Event;
} // namespace event
namespace stream {
using stream_t = void*;
// Wrapper around an opaque device stream handle (stream_t) that is bound to a
// Place and to the Device that serves that place.
class Stream {
 public:
  // Scheduling priority forwarded to Device::CreateStream by Init().
  enum class Priority : uint8_t {
    kNull = 0x0,
    kHigh = 0x1,
    kNormal = 0x2,
  };

  // Creation flag forwarded to Device::CreateStream by Init().
  enum class Flag : uint8_t {
    kDefaultFlag = 0x0,
    kStreamNonBlocking = 0x1,
  };

  using Callback = std::function<void()>;

  // Creates an empty, non-owning stream object. Call Init() before using it;
  // destroying an object that was never initialized is a no-op.
  Stream() = default;

  // For compatibility: wraps an existing handle without owning it, so
  // Destroy() and the destructor will not release `stream`.
  Stream(const Place& place, stream_t stream);

  // Calls Destroy().
  ~Stream();

  // Accessors for the raw handle.
  const stream_t& raw_stream() const;
  void set_stream(stream_t stream);

  // Creates a new stream on `place` and takes ownership of it.
  bool Init(const Place& place, const Priority& priority = Priority::kNormal,
            const Flag& flag = Flag::kDefaultFlag);

  // Hands `callback` to the stream's CallbackManager. Requires that the
  // stream was set up through Init() or the compatibility constructor.
  template <typename Callback>
  void AddCallback(Callback&& callback) const {
    callback_manager_->AddCallback(callback);
  }

  // Runs `callback` immediately on the calling thread, then records `event`.
  void RecordEvent(event::Event* event, Callback callback) const;
  // Records `event` on this stream.
  void RecordEvent(event::Event* event) const;
  // Forwards to Device::StreamWaitEvent for this stream and `event`.
  void WaitEvent(event::Event* event) const;
  // Blocks until the stream is idle.
  void Wait() const;
  // Delegates to CallbackManager::Wait.
  void WaitCallback() const;
  // Destroys the stream on the device when this object owns it.
  void Destroy();
  // Forwards to Device::QueryStream.
  bool Query() const;
  // Forwards to Device::SynchronizeStream.
  void Synchronize() const;
  const Place& GetPlace() const;

 private:
  DISABLE_COPY_AND_ASSIGN(Stream);

  Place place_;
  // Non-owning pointer obtained from DeviceManager::GetDeviceWithPlace.
  // Initialized to nullptr so a default-constructed Stream never carries an
  // indeterminate pointer.
  Device* device_ = nullptr;
  stream_t stream_ = nullptr;
  std::unique_ptr<CallbackManager> callback_manager_;
  // True only after Init() created the stream. It defaults to false so that
  // destroying a default-constructed, never-initialized Stream does not call
  // DestroyStream through an unset device pointer.
  bool own_data_ = false;
};
} // namespace stream
} // namespace platform
} // namespace paddle
...@@ -30,6 +30,7 @@ limitations under the License. */ ...@@ -30,6 +30,7 @@ limitations under the License. */
#include "paddle/fluid/framework/expect.h" #include "paddle/fluid/framework/expect.h"
#include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/generator.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
...@@ -256,6 +257,15 @@ DeviceContextPool::DeviceContextPool( ...@@ -256,6 +257,15 @@ DeviceContextPool::DeviceContextPool(
"NPUPinnedPlace is not supported. Please re-compile with " "NPUPinnedPlace is not supported. Please re-compile with "
"WITH_ASCEND_CL " "WITH_ASCEND_CL "
"option.")); "option."));
#endif
} else if (platform::is_custom_place(p)) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
EmplaceDeviceContext<CustomDeviceContext>(&device_contexts_, p);
#else
PADDLE_THROW(platform::errors::Unimplemented(
"CustomPlace is not supported. Please re-compile with "
"WITH_CUSTOM_DEVICE "
"option."));
#endif #endif
} }
} }
...@@ -885,6 +895,24 @@ MKLDNNDeviceContext::BlobPtr_t<void> MKLDNNDeviceContext::GetBlob( ...@@ -885,6 +895,24 @@ MKLDNNDeviceContext::BlobPtr_t<void> MKLDNNDeviceContext::GetBlob(
return key_it->second; return key_it->second;
} }
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
// Builds the context for `place` and creates the stream it will use, with the
// place selected as the current device while the stream is initialized.
CustomDeviceContext::CustomDeviceContext(CustomPlace place) : place_(place) {
  DeviceGuard guard(place_);

  stream_ = std::shared_ptr<stream::Stream>(new stream::Stream());
  stream_->Init(place_);
}
// Nothing to do explicitly; members clean up through their own destructors.
CustomDeviceContext::~CustomDeviceContext() {
}

// Returns the place this context was created for.
const Place& CustomDeviceContext::GetPlace() const {
  return this->place_;
}
// Blocks until the context's stream is idle.
void CustomDeviceContext::Wait() const {
  // No profiler RecordEvent is emitted here; the original kept one commented
  // out.
  VLOG(4) << "CustomDevice context(" << this << ") Wait";
  this->stream_->Wait();
}
#endif #endif
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -70,6 +70,9 @@ limitations under the License. */ ...@@ -70,6 +70,9 @@ limitations under the License. */
#include "paddle/fluid/platform/device/npu/enforce_npu.h" #include "paddle/fluid/platform/device/npu/enforce_npu.h"
#include "paddle/fluid/platform/device/npu/npu_stream.h" #include "paddle/fluid/platform/device/npu/npu_stream.h"
#endif #endif
#include "paddle/fluid/platform/device/device_ext.h"
#include "paddle/fluid/platform/device/stream.h"
#include "unsupported/Eigen/CXX11/Tensor" #include "unsupported/Eigen/CXX11/Tensor"
namespace Eigen { namespace Eigen {
...@@ -815,6 +818,47 @@ class MKLDNNDeviceContext : public CPUDeviceContext { ...@@ -815,6 +818,47 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
}; };
#endif #endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
// DeviceContext for a plug-in (custom) device. It is bound to one CustomPlace
// and holds the stream created for that place.
class CustomDeviceContext : public DeviceContext {
public:
// Creates the context and initializes a stream on `place`.
explicit CustomDeviceContext(CustomPlace place);
virtual ~CustomDeviceContext();
const Place& GetPlace() const override;
// Blocks until the context's stream is idle.
void Wait() const override;
// This context has no Eigen device; always returns nullptr.
Eigen::DefaultDevice* eigen_device() const { return nullptr; }
// Returns the raw stream handle reinterpreted as the C API stream type.
C_Stream stream() const {
return reinterpret_cast<C_Stream>(stream_->raw_stream());
}
// Hands `callback` to the stream's callback manager.
template <typename Callback>
void AddStreamCallback(Callback&& callback) const {
return stream_->AddCallback(callback);
}
// Delegates to Stream::WaitCallback on the context's stream.
void WaitStreamCallback() const { return stream_->WaitCallback(); }
private:
// NOTE(review): not referenced by any code visible here -- confirm whether
// it is still needed.
std::string device_type_;
CustomPlace place_;
std::shared_ptr<platform::stream::Stream> stream_;
// Declared private so a context cannot be created without a place; no
// definition is visible here.
CustomDeviceContext();
DISABLE_COPY_AND_ASSIGN(CustomDeviceContext);
};
// With custom device support compiled in, CustomPlace maps to
// CustomDeviceContext.
template <>
struct DefaultDeviceContextType<platform::CustomPlace> {
using TYPE = CustomDeviceContext;
};
#else
// Without custom device support, CustomPlace maps to the base DeviceContext
// so code that names the trait still compiles.
template <>
struct DefaultDeviceContextType<platform::CustomPlace> {
using TYPE = DeviceContext;
};
#endif
/*! \brief device context pool singleton */ /*! \brief device context pool singleton */
class DeviceContextPool { class DeviceContextPool {
public: public:
......
...@@ -433,8 +433,9 @@ PADDLE_DEFINE_EXPORTED_double( ...@@ -433,8 +433,9 @@ PADDLE_DEFINE_EXPORTED_double(
// NOTE(zhiqiu): better to share the flags, otherwise we will have too many // NOTE(zhiqiu): better to share the flags, otherwise we will have too many
// flags. // flags.
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU) defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU) || \
defined(PADDLE_WITH_CUSTOM_DEVICE)
/** /**
* Memory related FLAG * Memory related FLAG
......
...@@ -25,6 +25,7 @@ limitations under the License. */ ...@@ -25,6 +25,7 @@ limitations under the License. */
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/cupti.h" #include "paddle/fluid/platform/dynload/cupti.h"
#endif #endif
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
...@@ -234,6 +235,19 @@ void InitDevices(const std::vector<int> devices) { ...@@ -234,6 +235,19 @@ void InitDevices(const std::vector<int> devices) {
if (!custom_kernel_root.empty()) { if (!custom_kernel_root.empty()) {
LOG(INFO) << "ENV [CUSTOM_DEVICE_ROOT]=" << custom_kernel_root; LOG(INFO) << "ENV [CUSTOM_DEVICE_ROOT]=" << custom_kernel_root;
framework::LoadCustomKernel(custom_kernel_root); framework::LoadCustomKernel(custom_kernel_root);
#ifdef PADDLE_WITH_CUSTOM_DEVICE
// Load plug-in devices from the same root and add one CustomPlace per visible
// device of every registered custom device type.
if (platform::LoadCustomDevice(custom_kernel_root)) {
  const auto device_types = platform::DeviceManager::GetAllCustomDeviceTypes();
  for (const auto &dev_type : device_types) {
    // Query the count once per type instead of on every loop iteration.
    const auto dev_count = platform::DeviceManager::GetDeviceCount(dev_type);
    VLOG(1) << "Device type: " << dev_type
            << ", visible devices count: " << dev_count;
    for (size_t i = 0; i < dev_count; i++) {
      places.push_back(platform::CustomPlace(dev_type, i));
    }
  }
}
#endif
} else { } else {
VLOG(3) << "ENV [CUSTOM_DEVICE_ROOT] is empty."; VLOG(3) << "ENV [CUSTOM_DEVICE_ROOT] is empty.";
} }
......
...@@ -56,7 +56,16 @@ bool is_npu_pinned_place(const Place &p) { ...@@ -56,7 +56,16 @@ bool is_npu_pinned_place(const Place &p) {
return p.GetType() == pten::AllocationType::NPUPINNED; return p.GetType() == pten::AllocationType::NPUPINNED;
} }
// True when `p` carries the CUSTOM allocation type.
bool is_custom_place(const Place &p) {
  const auto allocation_type = p.GetType();
  return allocation_type == pten::AllocationType::CUSTOM;
}
bool places_are_same_class(const Place &p1, const Place &p2) { bool places_are_same_class(const Place &p1, const Place &p2) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
if (is_custom_place(p1) && is_custom_place(p2)) {
return p1.GetDeviceType() == p2.GetDeviceType();
}
#endif
return p1.GetType() == p2.GetType(); return p1.GetType() == p2.GetType();
} }
...@@ -73,6 +82,8 @@ bool is_same_place(const Place &p1, const Place &p2) { ...@@ -73,6 +82,8 @@ bool is_same_place(const Place &p1, const Place &p2) {
return p1 == p2; return p1 == p2;
} else if (is_ipu_place(p1)) { } else if (is_ipu_place(p1)) {
return p1 == p2; return p1 == p2;
} else if (is_custom_place(p1)) {
return p1 == p2;
} else { } else {
return p1 == p2; return p1 == p2;
} }
...@@ -81,5 +92,43 @@ bool is_same_place(const Place &p1, const Place &p2) { ...@@ -81,5 +92,43 @@ bool is_same_place(const Place &p1, const Place &p2) {
} }
} }
#ifdef PADDLE_WITH_CUSTOM_DEVICE
// Maps a place to its device type name: "cpu", "gpu", "npu" or "xpu" for the
// built-in places, and the place's own device type string for custom places.
// Any other place raises a Fatal error.
std::string PlaceHelper::GetDeviceType(const Place &place) {
  if (is_cpu_place(place)) {
    return "cpu";
  }
  if (is_gpu_place(place)) {
    return "gpu";
  }
  if (is_npu_place(place)) {
    return "npu";
  }
  if (is_xpu_place(place)) {
    return "xpu";
  }
  if (is_custom_place(place)) {
    return place.GetDeviceType();
  }
  PADDLE_THROW(platform::errors::Fatal(
      "Unknown device type. Please check available devices by "
      "paddle.device.get_available_device()"));
}
// Returns the device id stored in `place`, converted to size_t.
size_t PlaceHelper::GetDeviceId(const Place &place) {
  return static_cast<size_t>(place.GetDeviceId());
}
// Builds a Place from a device type name and a device id. "cpu", "gpu", "npu"
// and "xpu" select the built-in places (the id is ignored for "cpu"); every
// other name yields a CustomPlace of that type.
Place PlaceHelper::CreatePlace(const std::string &dev_type, size_t dev_id) {
  if (dev_type == "cpu") {
    return platform::CPUPlace();
  }
  if (dev_type == "gpu") {
    return platform::CUDAPlace(dev_id);
  }
  if (dev_type == "npu") {
    return platform::NPUPlace(dev_id);
  }
  if (dev_type == "xpu") {
    return platform::XPUPlace(dev_id);
  }
  return platform::CustomPlace(dev_type, dev_id);
}
#endif
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -36,9 +36,19 @@ using NPUPinnedPlace = pten::NPUPinnedPlace; ...@@ -36,9 +36,19 @@ using NPUPinnedPlace = pten::NPUPinnedPlace;
using XPUPlace = pten::XPUPlace; using XPUPlace = pten::XPUPlace;
using IPUPlace = pten::IPUPlace; using IPUPlace = pten::IPUPlace;
using MLUPlace = pten::MLUPlace; using MLUPlace = pten::MLUPlace;
using CustomPlace = pten::CustomPlace;
using PlaceList = std::vector<Place>; using PlaceList = std::vector<Place>;
#ifdef PADDLE_WITH_CUSTOM_DEVICE
// Static helpers that convert between Place objects and
// (device type name, device id) pairs.
class PlaceHelper {
public:
// Returns the device type name of `place`.
static std::string GetDeviceType(const Place &place);
// Returns the device id stored in `place`.
static size_t GetDeviceId(const Place &place);
// Builds a Place from a device type name and an optional device id.
static Place CreatePlace(const std::string &dev_type, size_t dev_id = 0);
};
#endif
bool is_gpu_place(const Place &); bool is_gpu_place(const Place &);
bool is_xpu_place(const Place &); bool is_xpu_place(const Place &);
bool is_npu_place(const Place &); bool is_npu_place(const Place &);
...@@ -47,6 +57,7 @@ bool is_ipu_place(const Place &); ...@@ -47,6 +57,7 @@ bool is_ipu_place(const Place &);
bool is_cpu_place(const Place &); bool is_cpu_place(const Place &);
bool is_cuda_pinned_place(const Place &); bool is_cuda_pinned_place(const Place &);
bool is_npu_pinned_place(const Place &); bool is_npu_pinned_place(const Place &);
bool is_custom_place(const Place &p);
bool places_are_same_class(const Place &, const Place &); bool places_are_same_class(const Place &, const Place &);
bool is_same_place(const Place &, const Place &); bool is_same_place(const Place &, const Place &);
...@@ -121,6 +132,15 @@ typename Visitor::result_type VisitPlace(const Place &place, ...@@ -121,6 +132,15 @@ typename Visitor::result_type VisitPlace(const Place &place,
#else #else
PADDLE_THROW(platform::errors::Unavailable( PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with MLU. Cannot visit mlu device")); "Paddle is not compiled with MLU. Cannot visit mlu device"));
#endif
}
case pten::AllocationType::CUSTOM: {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
platform::CustomPlace p(place.GetDeviceType(), place.GetDeviceId());
return visitor(p);
#else
PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with CUSTOM. Cannot visit custom device"));
#endif #endif
} }
default: { default: {
......
...@@ -284,7 +284,7 @@ if(WITH_PYTHON) ...@@ -284,7 +284,7 @@ if(WITH_PYTHON)
cc_library(paddle_pybind SHARED cc_library(paddle_pybind SHARED
SRCS ${PYBIND_SRCS} SRCS ${PYBIND_SRCS}
DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ${GLOB_DEV_LIB})
if(NOT APPLE AND NOT WIN32) if(NOT APPLE AND NOT WIN32)
target_link_libraries(paddle_pybind rt) target_link_libraries(paddle_pybind rt)
......
...@@ -136,10 +136,13 @@ static const platform::Place PyObjectToPlace(const py::object &place_obj) { ...@@ -136,10 +136,13 @@ static const platform::Place PyObjectToPlace(const py::object &place_obj) {
return place_obj.cast<platform::Place>(); return place_obj.cast<platform::Place>();
} else if (py::isinstance<platform::MLUPlace>(place_obj)) { } else if (py::isinstance<platform::MLUPlace>(place_obj)) {
return place_obj.cast<platform::MLUPlace>(); return place_obj.cast<platform::MLUPlace>();
} else if (py::isinstance<platform::CustomPlace>(place_obj)) {
return place_obj.cast<platform::CustomPlace>();
} else { } else {
PADDLE_THROW(platform::errors::InvalidArgument( PADDLE_THROW(platform::errors::InvalidArgument(
"Place should be one of " "Place should be one of "
"Place/CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace/NPUPlace/MLUPlace")); "Place/CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace/NPUPlace/MLUPlace/"
"CustomPlace"));
} }
} }
...@@ -183,6 +186,9 @@ static void InitVarBaseAndTensor( ...@@ -183,6 +186,9 @@ static void InitVarBaseAndTensor(
SetTensorFromPyArray<platform::NPUPlace>(tensor, array, place, zero_copy); SetTensorFromPyArray<platform::NPUPlace>(tensor, array, place, zero_copy);
} else if (platform::is_mlu_place(place)) { } else if (platform::is_mlu_place(place)) {
SetTensorFromPyArray<platform::MLUPlace>(tensor, array, place, zero_copy); SetTensorFromPyArray<platform::MLUPlace>(tensor, array, place, zero_copy);
} else if (platform::is_custom_place(place)) {
SetTensorFromPyArray<platform::CustomPlace>(tensor, array, place,
zero_copy);
} else { } else {
PADDLE_THROW(platform::errors::InvalidArgument( PADDLE_THROW(platform::errors::InvalidArgument(
"Place should be one of " "Place should be one of "
...@@ -941,6 +947,10 @@ void BindImperative(py::module *m_ptr) { ...@@ -941,6 +947,10 @@ void BindImperative(py::module *m_ptr) {
py::arg("value"), py::arg("place"), py::arg("persistable") = false, py::arg("value"), py::arg("place"), py::arg("persistable") = false,
py::arg("zero_copy") = false, py::arg("name") = "", py::arg("zero_copy") = false, py::arg("name") = "",
py::arg("stop_gradient") = -1) py::arg("stop_gradient") = -1)
.def("__init__", &InitVarBaseFromNumpyWithArg<platform::CustomPlace>,
py::arg("value"), py::arg("place"), py::arg("persistable") = false,
py::arg("zero_copy") = false, py::arg("name") = "",
py::arg("stop_gradient") = -1)
.def("__init__", &InitVarBaseFromNumpyWithArgDefault, py::arg("value")) .def("__init__", &InitVarBaseFromNumpyWithArgDefault, py::arg("value"))
.def("__init__", &InitVarBaseFromTensorWithArgDefault, py::arg("tensor"), .def("__init__", &InitVarBaseFromTensorWithArgDefault, py::arg("tensor"),
py::arg("name") = "") py::arg("name") = "")
...@@ -956,6 +966,8 @@ void BindImperative(py::module *m_ptr) { ...@@ -956,6 +966,8 @@ void BindImperative(py::module *m_ptr) {
py::arg("tensor"), py::arg("place"), py::arg("name") = "") py::arg("tensor"), py::arg("place"), py::arg("name") = "")
.def("__init__", &InitVarBaseFromTensorWithArg<platform::MLUPlace>, .def("__init__", &InitVarBaseFromTensorWithArg<platform::MLUPlace>,
py::arg("tensor"), py::arg("place"), py::arg("name") = "") py::arg("tensor"), py::arg("place"), py::arg("name") = "")
.def("__init__", &InitVarBaseFromTensorWithArg<platform::CustomPlace>,
py::arg("tensor"), py::arg("place"), py::arg("name") = "")
.def("__init__", &InitVarBaseFromNumpyWithKwargs) .def("__init__", &InitVarBaseFromNumpyWithKwargs)
.def( .def(
"__setitem_varbase__", "__setitem_varbase__",
...@@ -2258,6 +2270,11 @@ void BindImperative(py::module *m_ptr) { ...@@ -2258,6 +2270,11 @@ void BindImperative(py::module *m_ptr) {
self.SetExpectedPlace(*p); self.SetExpectedPlace(*p);
VLOG(4) << "Tracer(" << &self << ")" VLOG(4) << "Tracer(" << &self << ")"
<< " set expected place " << *p; << " set expected place " << *p;
} else if (py::isinstance<platform::CustomPlace>(obj)) {
auto p = obj.cast<platform::CustomPlace *>();
self.SetExpectedPlace(*p);
VLOG(4) << "Tracer(" << &self << ")"
<< " set expected place " << *p;
} else if (py::isinstance<platform::Place>(obj)) { } else if (py::isinstance<platform::Place>(obj)) {
auto p = obj.cast<platform::Place *>(); auto p = obj.cast<platform::Place *>();
self.SetExpectedPlace(*p); self.SetExpectedPlace(*p);
...@@ -2301,6 +2318,21 @@ void BindImperative(py::module *m_ptr) { ...@@ -2301,6 +2318,21 @@ void BindImperative(py::module *m_ptr) {
*(imperative::AmpOperators::Instance().GetMutableAllowOps()), *(imperative::AmpOperators::Instance().GetMutableAllowOps()),
*(imperative::AmpOperators::Instance().GetMutableBlockOps())); *(imperative::AmpOperators::Instance().GetMutableBlockOps()));
}) })
// Tracer.trace overload for ops placed on a CustomPlace.
.def("trace",
[](imperative::Tracer &self, const std::string &type,
const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs,
framework::AttributeMap attrs, const platform::CustomPlace &place,
bool trace_backward,
const std::map<std::string, std::string> &inplace_map = {}) {
// Convert the Python-side maps before the GIL is released below.
auto ins_map = ConvertToNameVarBaseMap(ins);
auto outs_map = ConvertToNameVarBaseMap(outs);
{
// Release the GIL for the duration of the TraceOp call.
py::gil_scoped_release release;
self.TraceOp<imperative::VarBase>(
type, std::move(ins_map), std::move(outs_map),
std::move(attrs), place, trace_backward, inplace_map);
}
})
.def("trace", .def("trace",
[](imperative::Tracer &self, const std::string &type, [](imperative::Tracer &self, const std::string &type,
const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs,
......
...@@ -69,6 +69,7 @@ limitations under the License. */ ...@@ -69,6 +69,7 @@ limitations under the License. */
#include "paddle/fluid/operators/py_func_op.h" #include "paddle/fluid/operators/py_func_op.h"
#include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
...@@ -1667,6 +1668,139 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1667,6 +1668,139 @@ All parameter, weight, gradient are variables in Paddle.
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
py::class_<platform::Communicator>(m, "Communicator").def(py::init<>()); py::class_<platform::Communicator>(m, "Communicator").def(py::init<>());
#endif #endif
// Returns every device type known to the DeviceManager. In builds without
// custom device support it logs a warning and returns an empty list.
m.def("get_all_device_type", []() {
  std::vector<std::string> device_types;
#ifdef PADDLE_WITH_CUSTOM_DEVICE
  device_types = platform::DeviceManager::GetAllDeviceTypes();
#else
  // Adjacent literals are concatenated verbatim, so each fragment that ends
  // mid-sentence needs its own trailing space.
  LOG(WARNING) << string::Sprintf(
      "Cannot use get_all_device_type because you have installed "
      "CPU/GPU version PaddlePaddle.\n"
      "If you want to use get_all_device_type, please try to install "
      "CustomDevice version "
      "PaddlePaddle by: pip install paddlepaddle-core\n");
#endif
  return device_types;
});
// Returns the registered custom device types. In builds without custom device
// support it logs a warning and returns an empty list.
m.def("get_all_custom_device_type", []() {
  std::vector<std::string> device_types;
#ifdef PADDLE_WITH_CUSTOM_DEVICE
  device_types = platform::DeviceManager::GetAllCustomDeviceTypes();
#else
  // Adjacent literals are concatenated verbatim, so each fragment that ends
  // mid-sentence needs its own trailing space.
  LOG(WARNING) << string::Sprintf(
      "Cannot use get_all_custom_device_type because you have installed "
      "CPU/GPU version PaddlePaddle.\n"
      "If you want to use get_all_custom_device_type, please try to "
      "install CustomDevice version "
      "PaddlePaddle by: pip install paddlepaddle-core\n");
#endif
  return device_types;
});
// Returns the list reported by DeviceManager::GetAllDeviceList. In builds
// without custom device support it logs a warning and returns an empty list.
m.def("get_available_device", [] {
  std::vector<std::string> devices;
#ifdef PADDLE_WITH_CUSTOM_DEVICE
  devices = platform::DeviceManager::GetAllDeviceList();
#else
  // Adjacent literals are concatenated verbatim, so each fragment that ends
  // mid-sentence needs its own trailing space.
  LOG(WARNING) << string::Sprintf(
      "Cannot use get_available_device because you have installed "
      "CPU/GPU version PaddlePaddle.\n"
      "If you want to use get_available_device, please try to install "
      "CustomDevice version "
      "PaddlePaddle by: pip install paddlepaddle-core\n");
#endif
  return devices;
});
// Returns the list reported by DeviceManager::GetAllCustomDeviceList. In
// builds without custom device support it logs a warning and returns an empty
// list.
m.def("get_available_custom_device", [] {
  std::vector<std::string> devices;
#ifdef PADDLE_WITH_CUSTOM_DEVICE
  devices = platform::DeviceManager::GetAllCustomDeviceList();
#else
  // Adjacent literals are concatenated verbatim, so each fragment that ends
  // mid-sentence needs its own trailing space.
  LOG(WARNING) << string::Sprintf(
      "Cannot use get_available_custom_device because you have "
      "installed "
      "CPU/GPU version PaddlePaddle.\n"
      "If you want to use get_available_custom_device, please try to "
      "install "
      "CustomDevice version "
      "PaddlePaddle by: pip install paddlepaddle-core\n");
#endif
  return devices;
});
py::class_<platform::CustomPlace>(m, "CustomPlace",
                                  R"DOC(
CustomPlace is a descriptor of a device.
It represents a custom device on which a tensor will be allocated and a model will run.
Examples:
.. code-block:: python
import paddle
fake_cpu_place = paddle.CustomPlace("FakeCPU", 0)
)DOC")
    // Constructor binding. It validates the arguments before constructing the
    // place in-place; every validation failure logs an error and terminates
    // the process with std::exit(-1).
    .def("__init__",
         [](platform::CustomPlace &self, const std::string &device_type,
            int dev_id) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
           // Device ids are zero-based, so negative ids are never valid.
           if (UNLIKELY(dev_id < 0)) {
             LOG(ERROR) << string::Sprintf(
                 "Invalid CustomPlace(%s, %d), device id must be 0 "
                 "or "
                 "positive integer",
                 device_type, dev_id);
             std::exit(-1);
           }

           // The type must be registered and must be a custom device type.
           if (LIKELY(platform::DeviceManager::HasDeviceType(device_type) &&
                      platform::DeviceManager::IsCustom(device_type))) {
             int dev_count = static_cast<int>(
                 platform::DeviceManager::GetDeviceCount(device_type));
             if (UNLIKELY(dev_id >= dev_count)) {
               if (dev_count == 0) {
                 LOG(ERROR) << "Cannot use " << device_type
                            << " because there is no " << device_type
                            << " detected on your "
                               "machine.";
                 std::exit(-1);
               } else {
                 LOG(ERROR) << string::Sprintf(
                     "Invalid CustomPlace(%s, %d), dev_id must "
                     "inside "
                     "[0, %d), because %s "
                     "number on your machine is %d",
                     device_type, dev_id, dev_count, device_type, dev_count);
                 std::exit(-1);
               }
             }
             // Construct the object in the storage supplied for `self`.
             new (&self) platform::CustomPlace(device_type, dev_id);
           } else {
             LOG(ERROR) << string::Sprintf(
                 "Invalid CustomPlace(%s, %d), the device type is "
                 "not registered "
                 "as a custom device.",
                 device_type, dev_id);
             std::exit(-1);
           }
#else
           // Adjacent literals are concatenated verbatim, so each fragment
           // that ends mid-sentence needs its own trailing space.
           LOG(ERROR) << string::Sprintf(
               "Cannot use CustomDevice because you have installed CPU/GPU "
               "version PaddlePaddle.\n"
               "If you want to use CustomDevice, please try to install "
               "CustomDevice version "
               "PaddlePaddle by: pip install paddlepaddle-core\n"
               "If you only have CPU, please change "
               "CustomPlace(%s, %d) to be CPUPlace().\n",
               device_type, dev_id);
           std::exit(-1);
#endif
         })
    .def("get_device_id",
         [](const platform::CustomPlace &self) { return self.GetDeviceId(); })
    .def("get_device_type",
         [](const platform::CustomPlace &self) {
           return self.GetDeviceType();
         })
    .def("__repr__", string::to_string<const platform::CustomPlace &>)
    .def("__str__", string::to_string<const platform::CustomPlace &>);
py::class_<platform::CUDAPlace> cudaplace(m, "CUDAPlace", R"DOC( py::class_<platform::CUDAPlace> cudaplace(m, "CUDAPlace", R"DOC(
CUDAPlace is a descriptor of a device. CUDAPlace is a descriptor of a device.
...@@ -2118,11 +2252,16 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -2118,11 +2252,16 @@ All parameter, weight, gradient are variables in Paddle.
}) })
.def("is_mlu_place", .def("is_mlu_place",
[](platform::Place &self) { return platform::is_mlu_place(self); }) [](platform::Place &self) { return platform::is_mlu_place(self); })
.def(
"is_custom_place",
[](platform::Place &self) { return platform::is_custom_place(self); })
.def("gpu_device_id", [](platform::Place &self) { return self.device; }) .def("gpu_device_id", [](platform::Place &self) { return self.device; })
.def("xpu_device_id", [](platform::Place &self) { return self.device; }) .def("xpu_device_id", [](platform::Place &self) { return self.device; })
.def("npu_device_id", [](platform::Place &self) { return self.device; }) .def("npu_device_id", [](platform::Place &self) { return self.device; })
.def("ipu_device_id", [](platform::Place &self) { return self.device; }) .def("ipu_device_id", [](platform::Place &self) { return self.device; })
.def("mlu_device_id", [](platform::Place &self) { return self.device; }) .def("mlu_device_id", [](platform::Place &self) { return self.device; })
.def("custom_device_id",
[](platform::Place &self) { return self.device; })
.def("set_place", [](platform::Place &self, .def("set_place", [](platform::Place &self,
const platform::Place &other) { self = other; }) const platform::Place &other) { self = other; })
.def("set_place", .def("set_place",
...@@ -2154,6 +2293,10 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -2154,6 +2293,10 @@ All parameter, weight, gradient are variables in Paddle.
[](platform::Place &self, const platform::MLUPlace &mlu_place) { [](platform::Place &self, const platform::MLUPlace &mlu_place) {
self = mlu_place; self = mlu_place;
}) })
.def("set_place",
[](platform::Place &self, const platform::CustomPlace &plug_place) {
self = plug_place;
})
.def("__repr__", string::to_string<const platform::Place &>) .def("__repr__", string::to_string<const platform::Place &>)
.def("__str__", string::to_string<const platform::Place &>); .def("__str__", string::to_string<const platform::Place &>);
......
...@@ -28,6 +28,7 @@ limitations under the License. */ ...@@ -28,6 +28,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/fluid/operators/strided_memcpy.h"
#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/cuda_device_guard.h"
#endif #endif
...@@ -247,6 +248,13 @@ T TensorGetElement(const framework::Tensor &self, size_t offset) { ...@@ -247,6 +248,13 @@ T TensorGetElement(const framework::Tensor &self, size_t offset) {
auto p = self.place(); auto p = self.place();
paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T), paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T),
nullptr); nullptr);
#endif
} else if (platform::is_custom_place(self.place())) {
#if defined(PADDLE_WITH_CUSTOM_DEVICE)
const T *a = self.data<T>();
auto p = self.place();
paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T),
nullptr);
#endif #endif
} }
VLOG(10) << "TensorGetElement, place: " << self.place() VLOG(10) << "TensorGetElement, place: " << self.place()
...@@ -289,6 +297,13 @@ void TensorSetElement(framework::Tensor *self, size_t offset, T elem) { ...@@ -289,6 +297,13 @@ void TensorSetElement(framework::Tensor *self, size_t offset, T elem) {
T *a = self->mutable_data<T>(p); T *a = self->mutable_data<T>(p);
paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T), paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T),
nullptr); nullptr);
#endif
} else if (platform::is_custom_place(self->place())) {
#if defined(PADDLE_WITH_CUSTOM_DEVICE)
auto p = self->place();
T *a = self->mutable_data<T>(p);
paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T),
nullptr);
#endif #endif
} }
} }
...@@ -368,6 +383,24 @@ void SetTensorFromPyArrayT( ...@@ -368,6 +383,24 @@ void SetTensorFromPyArrayT(
PADDLE_THROW(platform::errors::PermissionDenied( PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use MLUPlace in CPU/GPU version, " "Cannot use MLUPlace in CPU/GPU version, "
"Please recompile or reinstall Paddle with MLU support.")); "Please recompile or reinstall Paddle with MLU support."));
#endif
} else if (paddle::platform::is_custom_place(place)) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
platform::Place tmp_place = place;
platform::DeviceGuard guard(tmp_place);
auto dst = self->mutable_data<T>(place);
platform::DeviceManager::GetDeviceWithPlace(tmp_place)->MemoryCopyH2D(
reinterpret_cast<void *>(dst),
const_cast<void *>(reinterpret_cast<const void *>(array.data())),
array.nbytes());
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &ctx = *pool.Get(place);
ctx.Wait();
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use CustomDevice in CPU/GPU/XPU version. "
"Please recompile or reinstall Paddle with CustomDevice support."));
#endif #endif
} else { } else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
...@@ -757,6 +790,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, ...@@ -757,6 +790,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor,
bool is_xpu_tensor = platform::is_xpu_place(tensor.place()); bool is_xpu_tensor = platform::is_xpu_place(tensor.place());
bool is_npu_tensor = platform::is_npu_place(tensor.place()); bool is_npu_tensor = platform::is_npu_place(tensor.place());
bool is_mlu_tensor = platform::is_mlu_place(tensor.place()); bool is_mlu_tensor = platform::is_mlu_place(tensor.place());
bool is_custom_device_tensor = platform::is_custom_place(tensor.place());
const auto &tensor_dims = tensor.dims(); const auto &tensor_dims = tensor.dims();
auto tensor_dtype = framework::TransToProtoVarType(tensor.dtype()); auto tensor_dtype = framework::TransToProtoVarType(tensor.dtype());
size_t sizeof_dtype = framework::SizeOfType(tensor_dtype); size_t sizeof_dtype = framework::SizeOfType(tensor_dtype);
...@@ -776,7 +810,8 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, ...@@ -776,7 +810,8 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor,
std::string py_dtype_str = details::TensorDTypeToPyDTypeStr( std::string py_dtype_str = details::TensorDTypeToPyDTypeStr(
framework::TransToProtoVarType(tensor.dtype())); framework::TransToProtoVarType(tensor.dtype()));
if (!is_gpu_tensor && !is_xpu_tensor && !is_npu_tensor && !is_mlu_tensor) { if (!is_gpu_tensor && !is_xpu_tensor && !is_npu_tensor && !is_mlu_tensor &&
!is_custom_device_tensor) {
if (!need_deep_copy) { if (!need_deep_copy) {
auto base = py::cast(std::move(tensor)); auto base = py::cast(std::move(tensor));
return py::array(py::dtype(py_dtype_str.c_str()), py_dims, py_strides, return py::array(py::dtype(py_dtype_str.c_str()), py_dims, py_strides,
...@@ -900,6 +935,34 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, ...@@ -900,6 +935,34 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor,
PADDLE_THROW(platform::errors::PermissionDenied( PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use MLUPlace in CPU/GPU/XPU/NPU version, " "Cannot use MLUPlace in CPU/GPU/XPU/NPU version, "
"Please recompile or reinstall Paddle with MLU support.")); "Please recompile or reinstall Paddle with MLU support."));
#endif
} else if (is_custom_device_tensor) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides);
PADDLE_ENFORCE_EQ(py_arr.writeable(), true,
platform::errors::InvalidArgument(
"PyArray is not writable, in which case memory leak "
"or double free would occur"));
PADDLE_ENFORCE_EQ(
py_arr.owndata(), true,
platform::errors::InvalidArgument(
"PyArray does not own data, in which case memory leak "
"or double free would occur"));
size_t copy_bytes = sizeof_dtype * numel;
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &ctx = *pool.Get(tensor.place());
paddle::memory::Copy(
platform::CPUPlace(), py_arr.mutable_data(), tensor.place(),
tensor_buf_ptr, copy_bytes,
reinterpret_cast<const platform::CustomDeviceContext &>(ctx).stream());
ctx.Wait();
return py_arr;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use CustomPlace in CPU/GPU/XPU/NPU version, "
"Please recompile or reinstall Paddle with CustomPlace "
"support."));
#endif #endif
} }
PADDLE_THROW(platform::errors::Unimplemented("Place is not supported")); PADDLE_THROW(platform::errors::Unimplemented("Place is not supported"));
......
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include <sstream> #include <sstream>
#include <string> #include <string>
#include <unordered_map>
#include "paddle/pten/api/ext/exception.h" #include "paddle/pten/api/ext/exception.h"
...@@ -50,7 +51,11 @@ const char *AllocationTypeStr(AllocationType type) { ...@@ -50,7 +51,11 @@ const char *AllocationTypeStr(AllocationType type) {
std::string Place::DebugString() const { std::string Place::DebugString() const {
std::ostringstream os; std::ostringstream os;
os << "Place("; os << "Place(";
os << AllocationTypeStr(alloc_type_); if (alloc_type_ == AllocationType::CUSTOM) {
os << GetGlobalDeviceType(device_type_id_);
} else {
os << AllocationTypeStr(alloc_type_);
}
if (alloc_type_ == AllocationType::GPUPINNED || if (alloc_type_ == AllocationType::GPUPINNED ||
alloc_type_ == AllocationType::NPUPINNED || alloc_type_ == AllocationType::NPUPINNED ||
alloc_type_ == AllocationType::CPU) { alloc_type_ == AllocationType::CPU) {
...@@ -66,4 +71,23 @@ std::ostream &operator<<(std::ostream &os, const Place &p) { ...@@ -66,4 +71,23 @@ std::ostream &operator<<(std::ostream &os, const Place &p) {
return os; return os;
} }
// Registry that maps custom device type names to integer ids and back.
// Id 0 is reserved for "no device type" (the empty name), so the ids handed
// out by GetOrRegisterGlobalDeviceTypeId start at 1.
static std::unordered_map<std::string, size_t> global_registered_device_type_id;
static std::unordered_map<size_t, std::string> global_registered_device_type;

/// \brief Return the id registered for `device_type`, registering the name
/// first if it has not been seen before.
/// \param device_type Device type name. An empty name is never registered.
/// \return 0 for an empty name, otherwise the id associated with the name.
size_t GetOrRegisterGlobalDeviceTypeId(const std::string &device_type) {
  if (device_type.empty()) return 0;

  // Single lookup: reuse the iterator instead of find() followed by
  // operator[] on the same key.
  const auto known = global_registered_device_type_id.find(device_type);
  if (known != global_registered_device_type_id.end()) {
    return known->second;
  }

  // Ids are assigned in registration order: 1, 2, 3, ...
  const size_t device_type_id = global_registered_device_type_id.size() + 1;
  global_registered_device_type_id.emplace(device_type, device_type_id);
  global_registered_device_type[device_type_id] = device_type;
  return device_type_id;
}

/// \brief Look up the device type name registered under `device_type_id`.
/// \param device_type_id Id previously returned by
/// GetOrRegisterGlobalDeviceTypeId.
/// \return The registered name, or an empty string when the id is 0 or has
/// never been registered.
std::string GetGlobalDeviceType(size_t device_type_id) {
  if (device_type_id == 0) return "";

  // find() instead of operator[]: a read of an unknown id must not insert an
  // empty entry into the registry.
  const auto known = global_registered_device_type.find(device_type_id);
  if (known == global_registered_device_type.end()) return "";
  return known->second;
}
} // namespace pten } // namespace pten
...@@ -28,29 +28,49 @@ enum class AllocationType : int8_t { ...@@ -28,29 +28,49 @@ enum class AllocationType : int8_t {
NPUPINNED = 6, NPUPINNED = 6,
IPU = 7, IPU = 7,
MLU = 8, MLU = 8,
CUSTOM = 9,
}; };
const char* AllocationTypeStr(AllocationType type); const char* AllocationTypeStr(AllocationType type);
size_t GetOrRegisterGlobalDeviceTypeId(const std::string& device_type);
std::string GetGlobalDeviceType(size_t device_type_id_);
/// \brief The place is used to specify where the data is stored. /// \brief The place is used to specify where the data is stored.
class Place { class Place {
public: public:
Place() : device(0), alloc_type_(AllocationType::UNDEFINED) {} Place() : device(0), alloc_type_(AllocationType::UNDEFINED) {}
explicit Place(AllocationType type, int8_t id) explicit Place(AllocationType type,
: device(id), alloc_type_(type) {} int8_t id,
const std::string& dev_type = "")
explicit Place(AllocationType type) : device(0), alloc_type_(type) {} : device(id),
alloc_type_(type),
void Reset(AllocationType type, int8_t device_id = 0) noexcept { device_type_id_(GetOrRegisterGlobalDeviceTypeId(dev_type)) {}
explicit Place(AllocationType type, const std::string& dev_type = "")
: device(0),
alloc_type_(type),
device_type_id_(GetOrRegisterGlobalDeviceTypeId(dev_type)) {}
void Reset(AllocationType type,
int8_t device_id = 0,
const std::string& dev_type = "") noexcept {
alloc_type_ = type; alloc_type_ = type;
device = device_id; device = device_id;
if (!dev_type.empty()) {
device_type_id_ = GetOrRegisterGlobalDeviceTypeId(dev_type);
}
} }
AllocationType GetType() const { return alloc_type_; } AllocationType GetType() const { return alloc_type_; }
int8_t GetDeviceId() const { return device; } int8_t GetDeviceId() const { return device; }
std::string GetDeviceType() const {
return GetGlobalDeviceType(device_type_id_);
}
std::string DebugString() const; std::string DebugString() const;
inline bool operator==(const Place& rhs) const { inline bool operator==(const Place& rhs) const {
...@@ -62,6 +82,10 @@ class Place { ...@@ -62,6 +82,10 @@ class Place {
alloc_type_ == AllocationType::NPUPINNED) { alloc_type_ == AllocationType::NPUPINNED) {
return true; return true;
} }
if (alloc_type_ == AllocationType::CUSTOM) {
return device_type_id_ == rhs.device_type_id_ &&
device == rhs.GetDeviceId();
}
return device == rhs.GetDeviceId(); return device == rhs.GetDeviceId();
} }
inline bool operator!=(const Place& rhs) const { return !(*this == rhs); } inline bool operator!=(const Place& rhs) const { return !(*this == rhs); }
...@@ -69,6 +93,10 @@ class Place { ...@@ -69,6 +93,10 @@ class Place {
if (alloc_type_ != rhs.GetType()) { if (alloc_type_ != rhs.GetType()) {
return static_cast<int>(alloc_type_) < static_cast<int>(rhs.GetType()); return static_cast<int>(alloc_type_) < static_cast<int>(rhs.GetType());
} }
if (alloc_type_ == AllocationType::CUSTOM &&
device_type_id_ != rhs.device_type_id_) {
return device_type_id_ < rhs.device_type_id_;
}
return device < rhs.GetDeviceId(); return device < rhs.GetDeviceId();
} }
...@@ -79,6 +107,7 @@ class Place { ...@@ -79,6 +107,7 @@ class Place {
private: private:
AllocationType alloc_type_{AllocationType::UNDEFINED}; AllocationType alloc_type_{AllocationType::UNDEFINED};
size_t device_type_id_;
}; };
class CPUPlace : public Place { class CPUPlace : public Place {
...@@ -157,6 +186,22 @@ class MLUPlace : public Place { ...@@ -157,6 +186,22 @@ class MLUPlace : public Place {
: Place(AllocationType::MLU, place.GetDeviceId()) {} : Place(AllocationType::MLU, place.GetDeviceId()) {}
}; };
/// \brief Place of AllocationType::CUSTOM, identified by a device type name
/// together with a device id.
class CustomPlace : public Place {
 public:
  /// \brief Device 0 of the custom device type named `dev_type`.
  explicit CustomPlace(const std::string& dev_type)
      : Place(AllocationType::CUSTOM, 0, dev_type) {}

  /// \brief Device `device_id` of the custom device type named `dev_type`.
  CustomPlace(const std::string& dev_type, int device_id)
      : Place(AllocationType::CUSTOM, device_id, dev_type) {}

  CustomPlace(const CustomPlace&) = default;

  /// \brief Implicit conversion from a generic Place.
  ///
  /// The device id and device type are copied only when `place` has type
  /// CUSTOM. For any other type nothing is copied and this object keeps the
  /// state produced by Place's default constructor.
  CustomPlace(const Place& place) {  // NOLINT
    if (place.GetType() == AllocationType::CUSTOM) {
      this->Reset(
          AllocationType::CUSTOM, place.GetDeviceId(), place.GetDeviceType());
    }
  }
};
std::ostream& operator<<(std::ostream&, const Place&); std::ostream& operator<<(std::ostream&, const Place&);
} // namespace pten } // namespace pten
...@@ -215,6 +215,15 @@ void set_constant_with_place<paddle::platform::IPUPlace>( ...@@ -215,6 +215,15 @@ void set_constant_with_place<paddle::platform::IPUPlace>(
paddle::platform::errors::Unimplemented("IPUPlace is not supported")); paddle::platform::errors::Unimplemented("IPUPlace is not supported"));
} }
// Specialization for CustomPlace: constant fill is not implemented for this
// place type, so the call always raises an Unimplemented error and none of
// the arguments are used.
template <>
void set_constant_with_place<paddle::platform::CustomPlace>(
    const paddle::platform::DeviceContext& context,
    paddle::framework::Tensor* tensor,
    float value) {
  PADDLE_THROW(
      paddle::platform::errors::Unimplemented("CustomPlace is not supported"));
}
template <> template <>
void set_constant_with_place<paddle::platform::CPUPlace>( void set_constant_with_place<paddle::platform::CPUPlace>(
const paddle::platform::DeviceContext& context, const paddle::platform::DeviceContext& context,
......
...@@ -293,6 +293,7 @@ from .framework import CUDAPlace # noqa: F401 ...@@ -293,6 +293,7 @@ from .framework import CUDAPlace # noqa: F401
from .framework import NPUPlace # noqa: F401 from .framework import NPUPlace # noqa: F401
from .framework import CUDAPinnedPlace # noqa: F401 from .framework import CUDAPinnedPlace # noqa: F401
from .framework import MLUPlace # noqa: F401 from .framework import MLUPlace # noqa: F401
from .framework import CustomPlace # noqa: F401
from .autograd import grad # noqa: F401 from .autograd import grad # noqa: F401
from .autograd import no_grad # noqa: F401 from .autograd import no_grad # noqa: F401
......
...@@ -36,7 +36,11 @@ __all__ = [ # noqa ...@@ -36,7 +36,11 @@ __all__ = [ # noqa
'is_compiled_with_cuda', 'is_compiled_with_cuda',
'is_compiled_with_rocm', 'is_compiled_with_rocm',
'is_compiled_with_npu', 'is_compiled_with_npu',
'is_compiled_with_mlu' 'is_compiled_with_mlu',
'get_all_device_type',
'get_all_custom_device_type',
'get_available_device',
'get_available_custom_device',
] ]
_cudnn_version = None _cudnn_version = None
...@@ -225,15 +229,26 @@ def _convert_to_place(device): ...@@ -225,15 +229,26 @@ def _convert_to_place(device):
selected_mlus = os.getenv("FLAGS_selected_mlus", "0").split(",") selected_mlus = os.getenv("FLAGS_selected_mlus", "0").split(",")
device_id = int(selected_mlus[0]) device_id = int(selected_mlus[0])
place = core.MLUPlace(device_id) place = core.MLUPlace(device_id)
elif device in core.get_all_custom_device_type():
place = core.CustomPlace(device, 0)
else: else:
avaliable_gpu_device = re.match(r'gpu:\d+', lower_device) avaliable_gpu_device = re.match(r'gpu:\d+', lower_device)
avaliable_xpu_device = re.match(r'xpu:\d+', lower_device) avaliable_xpu_device = re.match(r'xpu:\d+', lower_device)
avaliable_npu_device = re.match(r'npu:\d+', lower_device) avaliable_npu_device = re.match(r'npu:\d+', lower_device)
avaliable_mlu_device = re.match(r'mlu:\d+', lower_device) avaliable_mlu_device = re.match(r'mlu:\d+', lower_device)
if not avaliable_gpu_device and not avaliable_xpu_device and not avaliable_npu_device and not avaliable_mlu_device: if not avaliable_gpu_device and not avaliable_xpu_device and not avaliable_npu_device and not avaliable_mlu_device:
raise ValueError( device_info_list = device.split(':', 1)
"The device must be a string which is like 'cpu', 'gpu', 'gpu:x', 'xpu', 'xpu:x', 'mlu', 'mlu:x', 'npu', 'npu:x' or ipu" device_type = device_info_list[0]
) if device_type in core.get_all_custom_device_type():
device_id = device_info_list[1]
device_id = int(device_id)
place = core.CustomPlace(device_type, device_id)
else:
raise ValueError(
"The device must be a string which is like 'cpu', {}".
format(', '.join("'{}', '{}:x'".format(x, x)
for x in ['gpu', 'xpu', 'npu', 'mlu'] +
core.get_all_custom_device_type())))
if avaliable_gpu_device: if avaliable_gpu_device:
if not core.is_compiled_with_cuda(): if not core.is_compiled_with_cuda():
raise ValueError( raise ValueError(
...@@ -338,3 +353,103 @@ def get_device(): ...@@ -338,3 +353,103 @@ def get_device():
raise ValueError("The device specification {} is invalid".format(place)) raise ValueError("The device specification {} is invalid".format(place))
return device return device
def get_all_device_type():
    """
    Get all available device types.

    Returns:
        A list of all available device types.

    Examples:
        .. code-block:: python

            import paddle
            paddle.device.get_all_device_type()

            # Case 1: paddlepaddle-cpu package installed, and no custom device registered.
            # Output: ['cpu']

            # Case 2: paddlepaddle-gpu package installed, and no custom device registered.
            # Output: ['cpu', 'gpu']

            # Case 3: paddlepaddle-cpu package installed, and custom device 'CustomCPU' is registered.
            # Output: ['cpu', 'CustomCPU']

            # Case 4: paddlepaddle-gpu package installed, and custom devices 'CustomCPU' and 'CustomGPU' are registered.
            # Output: ['cpu', 'gpu', 'CustomCPU', 'CustomGPU']
    """
    # Thin wrapper: the result comes straight from the core binding.
    return core.get_all_device_type()
def get_all_custom_device_type():
    """
    Get all available custom device types.

    Returns:
        A list of all available custom device types.

    Examples:
        .. code-block:: python

            import paddle
            paddle.device.get_all_custom_device_type()

            # Case 1: paddlepaddle-gpu package installed, and no custom device registered.
            # Output: None

            # Case 2: paddlepaddle-gpu package installed, and custom devices 'CustomCPU' and 'CustomGPU' are registered.
            # Output: ['CustomCPU', 'CustomGPU']
    """
    # Thin wrapper: the result comes straight from the core binding.
    return core.get_all_custom_device_type()
def get_available_device():
    """
    Get all available devices.

    Returns:
        A list of all available devices.

    Examples:
        .. code-block:: python

            import paddle
            paddle.device.get_available_device()

            # Case 1: paddlepaddle-cpu package installed, and no custom device registered.
            # Output: ['cpu']

            # Case 2: paddlepaddle-gpu package installed, and no custom device registered.
            # Output: ['cpu', 'gpu:0', 'gpu:1']

            # Case 3: paddlepaddle-cpu package installed, and custom device 'CustomCPU' is registered.
            # Output: ['cpu', 'CustomCPU']

            # Case 4: paddlepaddle-gpu package installed, and custom devices 'CustomCPU' and 'CustomGPU' are registered.
            # Output: ['cpu', 'gpu:0', 'gpu:1', 'CustomCPU', 'CustomGPU:0', 'CustomGPU:1']
    """
    # Thin wrapper: the result comes straight from the core binding.
    return core.get_available_device()
def get_available_custom_device():
    """
    Get all available custom devices.

    Returns:
        A list of all available custom devices.

    Examples:
        .. code-block:: python

            import paddle
            paddle.device.get_available_custom_device()

            # Case 1: paddlepaddle-gpu package installed, and no custom device registered.
            # Output: None

            # Case 2: paddlepaddle-gpu package installed, and custom devices 'CustomCPU' and 'CustomGPU' are registered.
            # Output: ['CustomCPU', 'CustomGPU:0', 'CustomGPU:1']
    """
    # Thin wrapper: the result comes straight from the core binding.
    return core.get_available_custom_device()
...@@ -71,7 +71,7 @@ from .param_attr import ParamAttr, WeightNormParamAttr ...@@ -71,7 +71,7 @@ from .param_attr import ParamAttr, WeightNormParamAttr
from .data_feeder import DataFeeder from .data_feeder import DataFeeder
from .core import LoDTensor, LoDTensorArray, Scope, _Scope from .core import LoDTensor, LoDTensorArray, Scope, _Scope
from .core import CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, NPUPlace, IPUPlace, MLUPlace from .core import CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, NPUPlace, IPUPlace, MLUPlace, CustomPlace
from .incubate import fleet from .incubate import fleet
from .transpiler import DistributeTranspiler, \ from .transpiler import DistributeTranspiler, \
memory_optimize, release_memory, DistributeTranspilerConfig memory_optimize, release_memory, DistributeTranspilerConfig
......
...@@ -6918,7 +6918,7 @@ def _get_paddle_place(place): ...@@ -6918,7 +6918,7 @@ def _get_paddle_place(place):
return place return place
if isinstance(place, (core.Place, core.XPUPlace, core.CPUPlace, if isinstance(place, (core.Place, core.XPUPlace, core.CPUPlace,
core.CUDAPinnedPlace, core.CUDAPlace, core.NPUPlace, core.CUDAPinnedPlace, core.CUDAPlace, core.NPUPlace,
core.IPUPlace, core.MLUPlace)): core.IPUPlace, core.MLUPlace, core.CustomPlace)):
return place return place
if not isinstance(place, str): if not isinstance(place, str):
......
...@@ -29,6 +29,7 @@ from ..fluid.core import CUDAPlace # noqa: F401 ...@@ -29,6 +29,7 @@ from ..fluid.core import CUDAPlace # noqa: F401
from ..fluid.core import CUDAPinnedPlace # noqa: F401 from ..fluid.core import CUDAPinnedPlace # noqa: F401
from ..fluid.core import NPUPlace # noqa: F401 from ..fluid.core import NPUPlace # noqa: F401
from ..fluid.core import MLUPlace # noqa: F401 from ..fluid.core import MLUPlace # noqa: F401
from ..fluid.core import CustomPlace # noqa: F401
from ..fluid.core import VarBase # noqa: F401 from ..fluid.core import VarBase # noqa: F401
from paddle.fluid import core # noqa: F401 from paddle.fluid import core # noqa: F401
......
...@@ -106,9 +106,10 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): ...@@ -106,9 +106,10 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True):
if place is None: if place is None:
place = _current_expected_place() place = _current_expected_place()
elif not isinstance(place, (core.Place, core.CPUPlace, core.CUDAPinnedPlace, elif not isinstance(place, (core.Place, core.CPUPlace, core.CUDAPinnedPlace,
core.CUDAPlace, core.NPUPlace, core.XPUPlace)): core.CUDAPlace, core.NPUPlace, core.XPUPlace,
core.CustomPlace)):
raise ValueError( raise ValueError(
"'place' must be any of paddle.Place, paddle.CPUPlace, paddle.CUDAPinnedPlace, paddle.CUDAPlace, paddle.NPUPlace, paddle.XPUPlace" "'place' must be any of paddle.Place, paddle.CPUPlace, paddle.CUDAPinnedPlace, paddle.CUDAPlace, paddle.NPUPlace, paddle.XPUPlace, paddle.CustomPlace"
) )
#Todo(zhouwei): Support allocate tensor on any other specified card #Todo(zhouwei): Support allocate tensor on any other specified card
......
...@@ -579,7 +579,8 @@ headers = ( ...@@ -579,7 +579,8 @@ headers = (
list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/pten/core', recursive=True)) + # pten core headers list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/pten/core', recursive=True)) + # pten core headers
# utila api headers # utila api headers
['@PADDLE_SOURCE_DIR@/paddle/utils/any.h'] + ['@PADDLE_SOURCE_DIR@/paddle/utils/any.h'] +
['@PADDLE_SOURCE_DIR@/paddle/utils/small_vector.h']) ['@PADDLE_SOURCE_DIR@/paddle/utils/small_vector.h'] +
['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/device/device_ext.h'])
if '${WITH_MKLDNN}' == 'ON': if '${WITH_MKLDNN}' == 'ON':
headers += list(find_files('*', '${MKLDNN_INSTALL_DIR}/include')) # mkldnn headers += list(find_files('*', '${MKLDNN_INSTALL_DIR}/include')) # mkldnn
...@@ -624,6 +625,8 @@ class InstallHeaders(Command): ...@@ -624,6 +625,8 @@ class InstallHeaders(Command):
elif 'third_party' not in header: elif 'third_party' not in header:
# paddle headers # paddle headers
install_dir = re.sub('@PADDLE_SOURCE_DIR@/', '', header) install_dir = re.sub('@PADDLE_SOURCE_DIR@/', '', header)
if 'device_ext.h' in header:
install_dir = "paddle/"
else: else:
# third_party # third_party
install_dir = re.sub('${THIRD_PARTY_PATH}', 'third_party', header) install_dir = re.sub('${THIRD_PARTY_PATH}', 'third_party', header)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册