Unverified commit 3e7825f3, authored by ronnywang, committed by GitHub

[PluggableDevice] Add custom runtime support (#38740)

* [CustomRuntime] Add DeviceManager

* [CustomRuntime] Add DeviceInterface

* [CustomRuntime] Add Stream, Event, DeviceGuard, CallbackManager

* [CustomRuntime] Add plug-in device

* [CustomRuntime] Memory module support PluggableDevice

* [CustomRuntime] Add WITH_PLUGGABLE_DEVICE cmake option

* update

* [API] update API doc based on comments, test=develop
Co-authored-by: qili93 <qili93@qq.com>
Parent 0d46a108
......@@ -243,6 +243,7 @@ option(NEW_RELEASE_JIT "PaddlePaddle next-level release strategy for backup ji
option(WITH_ASCEND_INT64 "Compile with int64 kernel for ascend NPU" OFF)
option(WITH_POCKETFFT "Compile with pocketfft support" ON)
option(WITH_RECORD_BUILDTIME "Compile PaddlePaddle with record all targets build time" OFF)
option(WITH_CUSTOM_DEVICE "Compile with custom device support" OFF)
if(WITH_RECORD_BUILDTIME)
set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CMAKE_CURRENT_SOURCE_DIR}/tools/get_build_time.sh")
......@@ -265,6 +266,10 @@ if(SANITIZER_TYPE AND NOT "${SANITIZER_TYPE}" MATCHES "^(Address|Leak|Memory|Thr
return()
endif()
if (LINUX AND NOT WITH_CUSTOM_DEVICE AND NOT ON_INFER)
set(WITH_CUSTOM_DEVICE ON)
endif()
if(WIN32)
if(WITH_DISTRIBUTE)
MESSAGE(WARNING
......
......@@ -219,3 +219,7 @@ endif(ON_INFER)
if(WITH_CRYPTO)
add_definitions(-DPADDLE_WITH_CRYPTO)
endif(WITH_CRYPTO)
if(WITH_CUSTOM_DEVICE AND NOT WIN32)
add_definitions(-DPADDLE_WITH_CUSTOM_DEVICE)
endif()
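
The two build-system hunks above wire a WITH_CUSTOM_DEVICE CMake option (switched on by default for non-inference Linux builds) to a PADDLE_WITH_CUSTOM_DEVICE compile definition. A minimal sketch, assuming only that definition, of how the rest of the patch consumes it; the function below is hypothetical:

#include <cstdio>

// Hypothetical helper: reports at runtime whether this binary was configured
// with -DWITH_CUSTOM_DEVICE=ON (which defines PADDLE_WITH_CUSTOM_DEVICE).
void ReportBuildFlavor() {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
  std::puts("custom-device build: plug-in runtimes can be registered");
#else
  std::puts("standard build: only built-in device backends are compiled in");
#endif
}
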
......@@ -100,6 +100,11 @@ struct DLDeviceVisitor : public boost::static_visitor<::DLDevice> {
platform::errors::Unimplemented("platform::MLUPlace is not supported"));
}
inline ::DLDevice operator()(const platform::CustomPlace &place) const {
PADDLE_THROW(platform::errors::Unimplemented(
"platform::CustomPlace is not supported"));
}
inline ::DLDevice operator()(const platform::CUDAPlace &place) const {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
::DLDevice device;
......
......@@ -494,6 +494,20 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
#else
PADDLE_THROW(
platform::errors::Unimplemented("No MLU gc found in CPU/MLU paddle"));
#endif
} else if (platform::is_custom_place(place_)) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
if (IsFastEagerDeletionModeEnabled()) {
VLOG(4) << "Use unsafe fast gc for " << place_ << ".";
gc.reset(new CustomDeviceUnsafeFastGarbageCollector(place_,
max_memory_size));
} else {
VLOG(4) << "Use default stream gc for " << place_ << ".";
gc.reset(
new CustomDefaultStreamGarbageCollector(place_, max_memory_size));
}
#else
PADDLE_THROW(platform::errors::Unimplemented("No CustomDevice gc found"));
#endif
}
}
......
......@@ -18,6 +18,7 @@
#endif
#include "gflags/gflags.h"
#include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
DECLARE_double(eager_delete_tensor_gb);
DECLARE_double(memory_fraction_of_eager_deletion);
......@@ -202,6 +203,58 @@ void MLUStreamGarbageCollector::ClearCallback(
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
CustomDefaultStreamGarbageCollector::CustomDefaultStreamGarbageCollector(
const platform::CustomPlace &place, size_t max_memory_size)
: GarbageCollector(place, max_memory_size) {}
void CustomDefaultStreamGarbageCollector::Wait() const {
static_cast<platform::CustomDeviceContext *>(this->dev_ctx_)
->WaitStreamCallback();
}
void CustomDefaultStreamGarbageCollector::ClearCallback(
const std::function<void()> &callback) {
static_cast<platform::CustomDeviceContext *>(this->dev_ctx_)
->AddStreamCallback(callback);
}
CustomDeviceUnsafeFastGarbageCollector::CustomDeviceUnsafeFastGarbageCollector(
const platform::CustomPlace &place, size_t max_memory_size)
: GarbageCollector(place, max_memory_size) {}
void CustomDeviceUnsafeFastGarbageCollector::ClearCallback(
const std::function<void()> &callback) {
callback();
}
CustomStreamGarbageCollector::CustomStreamGarbageCollector(
const platform::CustomPlace &place, size_t max_memory_size)
: GarbageCollector(place, max_memory_size) {
platform::DeviceGuard guard(place);
stream_.reset(new platform::stream::Stream);
stream_->Init(place);
callback_manager_.reset(new platform::CallbackManager(stream_.get()));
}
CustomStreamGarbageCollector::~CustomStreamGarbageCollector() {
platform::DeviceGuard guard(this->dev_ctx_->GetPlace());
stream_->Synchronize();
stream_->Destroy();
}
platform::stream::Stream *CustomStreamGarbageCollector::stream() const {
return stream_.get();
}
void CustomStreamGarbageCollector::Wait() const { callback_manager_->Wait(); }
void CustomStreamGarbageCollector::ClearCallback(
const std::function<void()> &callback) {
callback_manager_->AddCallback(callback);
}
#endif
int64_t GetEagerDeletionThreshold() {
return FLAGS_eager_delete_tensor_gb < 0
? -1
......
......@@ -200,6 +200,47 @@ class MLUStreamGarbageCollector : public GarbageCollector {
};
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
class CustomDefaultStreamGarbageCollector : public GarbageCollector {
public:
CustomDefaultStreamGarbageCollector(const platform::CustomPlace &place,
size_t max_memory_size);
void Wait() const override;
protected:
void ClearCallback(const std::function<void()> &callback) override;
};
class CustomDeviceUnsafeFastGarbageCollector : public GarbageCollector {
public:
CustomDeviceUnsafeFastGarbageCollector(const platform::CustomPlace &place,
size_t max_memory_size);
protected:
void ClearCallback(const std::function<void()> &callback) override;
};
class CustomStreamGarbageCollector : public GarbageCollector {
public:
CustomStreamGarbageCollector(const platform::CustomPlace &place,
size_t max_memory_size);
~CustomStreamGarbageCollector();
void Wait() const override;
platform::stream::Stream *stream() const;
protected:
void ClearCallback(const std::function<void()> &callback) override;
private:
std::unique_ptr<platform::stream::Stream> stream_;
std::unique_ptr<platform::CallbackManager> callback_manager_;
};
#endif
template <typename Container>
void GarbageCollector::Add(Container &&objs) {
Add(std::forward<Container>(objs), []() {});
......
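
A hedged usage sketch for the three collectors declared above (not part of the patch): the unsafe-fast variant runs callbacks immediately, which is only safe when no stream can still be reading the memory, while the default-stream variant defers them behind stream callbacks. The fast_eager_deletion flag mirrors IsFastEagerDeletionModeEnabled() from the executor hunk.

#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include <memory>
#include "paddle/fluid/framework/garbage_collector.h"

std::unique_ptr<paddle::framework::GarbageCollector> MakeCustomGC(
    const paddle::platform::CustomPlace& place, size_t max_memory_size,
    bool fast_eager_deletion) {
  using namespace paddle::framework;  // NOLINT
  if (fast_eager_deletion) {
    // Frees eagerly, without waiting for in-flight device work.
    return std::make_unique<CustomDeviceUnsafeFastGarbageCollector>(
        place, max_memory_size);
  }
  // Frees only after the default stream reaches the registered callback.
  return std::make_unique<CustomDefaultStreamGarbageCollector>(
      place, max_memory_size);
}
#endif
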
......@@ -47,10 +47,20 @@ size_t OpKernelType::Hash::operator()(const OpKernelType& key) const {
"Too many OpKernel attribute values, expected maximum "
"value is 64, received value is %d.",
cur_loc));
#ifdef PADDLE_WITH_CUSTOM_DEVICE
std::hash<int> hasher;
size_t seed =
hasher(place + data_type + data_layout + library_type + customized_value);
if (platform::is_custom_place(key.place_)) {
seed ^= std::hash<std::string>{}(key.place_.GetDeviceType()) + 0x9e3779b9 +
(seed << 6) + (seed >> 2) + 4;
}
return seed;
#else
std::hash<int> hasher;
return hasher(place + data_type + data_layout + library_type +
customized_value);
#endif
}
bool OpKernelType::operator==(const OpKernelType& o) const {
......
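
The custom-place branch above mixes the device-type string into the hash with the boost-style combine constant; the extra "+ 4" in the hunk appears to be a fixed salt for this branch. A standalone restatement of the idiom:

#include <cstddef>
#include <functional>
#include <string>

// Golden-ratio hash combine: XORing the shifted seed with the new value's
// hash spreads bits so that distinct (seed, value) pairs rarely collide.
inline void HashCombine(std::size_t* seed, const std::string& value) {
  *seed ^= std::hash<std::string>{}(value) + 0x9e3779b9 + (*seed << 6) +
           (*seed >> 2);
}
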
......@@ -29,6 +29,7 @@ limitations under the License. */
#include "paddle/fluid/framework/transfer_scope_cache.h"
#include "paddle/fluid/framework/unused_var_check.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/pten/common/scalar.h"
......@@ -244,6 +245,15 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
#else
auto dev_id = place.device;
platform::SetMLUDeviceId(dev_id);
#endif
} else if (platform::is_custom_place(place)) {
#ifndef PADDLE_WITH_CUSTOM_DEVICE
PADDLE_THROW(platform::errors::Unavailable(
"Cannot run operator on place %s, please recompile paddle or "
"reinstall Paddle with CustomDevice support.",
place));
#else
platform::DeviceManager::SetDevice(place);
#endif
}
......
......@@ -532,6 +532,21 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't use XPU device since it's not compiled with XPU,"
"Please recompile or reinstall Paddle with XPU support."));
#endif
} else if (platform::is_custom_place(place)) {
#if defined(PADDLE_WITH_CUSTOM_DEVICE)
if (IsFastEagerDeletionModeEnabled()) {
gc.reset(
new CustomDeviceUnsafeFastGarbageCollector(place, max_memory_size));
} else {
gc.reset(new CustomStreamGarbageCollector(place, max_memory_size));
}
VLOG(10) << "Created " << i << "-th GarbageCollector at " << place;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't use custom device since it's not compiled with "
"CustomDevice,"
"Please recompile or reinstall Paddle with CustomDevice support."));
#endif
} else if (platform::is_cpu_place(place)) {
gc.reset(new CPUGarbageCollector(place, max_memory_size));
......
......@@ -91,7 +91,29 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place,
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (platform::is_custom_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
auto stream =
reinterpret_cast<const platform::CustomDeviceContext&>(ctx).stream();
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
} else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_custom_place(dst_place)) {
auto stream =
reinterpret_cast<const platform::CustomDeviceContext&>(ctx).stream();
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
} else if (platform::is_custom_place(src_place) && // NOLINT
platform::is_custom_place(dst_place)) {
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
<< dst_place;
return;
}
auto stream =
reinterpret_cast<const platform::CustomDeviceContext&>(ctx).stream();
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
}
#endif
#ifdef PADDLE_WITH_XPU
else if (platform::is_xpu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
......@@ -376,7 +398,8 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place,
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
const platform::DeviceContext* dev_ctx;
if (platform::is_gpu_place(dst_place) || platform::is_npu_place(dst_place) ||
platform::is_mlu_place(dst_place)) {
platform::is_mlu_place(dst_place) ||
platform::is_custom_place(dst_place)) {
dev_ctx = pool.Get(dst_place);
} else {
dev_ctx = pool.Get(src.place());
......@@ -436,6 +459,26 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
"Copy from %s to %s is not supported.", src_place, dst_place));
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (platform::is_custom_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) { /* custom_device -> cpu*/
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
}
else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_custom_place(dst_place)) { /* cpu -> custom_device*/
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
}
else if (platform::is_custom_place(src_place) && // NOLINT
platform::is_custom_place(
dst_place)) { /* custom_device -> custom_device*/
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data sync from " << src_place << " to "
<< dst_place;
return;
}
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
}
#endif
#ifdef PADDLE_WITH_XPU
else if (platform::is_xpu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
......@@ -664,6 +707,13 @@ class AnyVisitor : public boost::static_visitor<bool> {
const platform::CUDAPinnedPlace& cpu) const {
return *out.data<bool>();
}
bool GetResult(const framework::Tensor& out,
const platform::CustomPlace& custom_dev) const {
PADDLE_THROW(platform::errors::Unimplemented("Not supported on place (%s) ",
custom_dev));
return false;
}
};
template <typename Predicate>
......@@ -903,6 +953,11 @@ struct BothFalseVisitor : public boost::static_visitor<> {
out_ptr[i] = lhs && rhs;
}
}
void VisitorImpl(const platform::CustomPlace& custom_dev) const {
PADDLE_THROW(
platform::errors::Unimplemented("CustomPlace is not supported"));
}
};
void TensorIsfinite(const framework::Tensor& tensor, framework::Tensor* out) {
......@@ -1036,6 +1091,29 @@ void TensorToStream(std::ostream& os, const Tensor& tensor,
#else
PADDLE_THROW(platform::errors::Unimplemented(
"NPUPlace is not supported when not compiled with NPU"));
#endif
} else if (platform::is_custom_place(tensor.place())) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB
std::unique_ptr<char[]> buf(new char[kBufSize]);
auto& custom_device_context =
static_cast<const platform::CustomDeviceContext&>(dev_ctx);
platform::CPUPlace cpu;
uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
while (size != 0) {
size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
memory::Copy(cpu, buf.get(), tensor.place(),
reinterpret_cast<const void*>(data), size_to_write,
custom_device_context.stream());
custom_device_context.Wait();
os.write(buf.get(), size_to_write);
data += size_to_write;
size -= size_to_write;
}
#else
PADDLE_THROW(platform::errors::Unimplemented(
"CustomPlace is not supported when not compiled with "
"CustomDevice"));
#endif
} else {
os.write(static_cast<const char*>(data_ptr),
......@@ -1093,10 +1171,11 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
platform::is_xpu_place(dev_ctx.GetPlace()) ||
platform::is_mlu_place(dev_ctx.GetPlace()) ||
platform::is_npu_place(dev_ctx.GetPlace())) {
platform::is_npu_place(dev_ctx.GetPlace()) ||
platform::is_custom_place(dev_ctx.GetPlace())) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \
defined(PADDLE_WITH_ASCEND_CL)
defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CUSTOM_DEVICE)
Tensor cpu_tensor;
cpu_tensor.Resize(framework::make_ddim(shape));
framework::VisitDataType(
......@@ -1105,7 +1184,8 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
is.read(static_cast<char*>(buf), size);
auto dst_place = dev_ctx.GetPlace();
framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor);
if (platform::is_npu_place(dev_ctx.GetPlace())) {
if (platform::is_npu_place(dev_ctx.GetPlace()) ||
platform::is_custom_place(dev_ctx.GetPlace())) {
dev_ctx.Wait();
}
#else
......@@ -1163,10 +1243,11 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
platform::is_xpu_place(dev_ctx.GetPlace()) ||
platform::is_mlu_place(dev_ctx.GetPlace()) ||
platform::is_npu_place(dev_ctx.GetPlace())) {
platform::is_npu_place(dev_ctx.GetPlace()) ||
platform::is_custom_place(dev_ctx.GetPlace())) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \
defined(PADDLE_WITH_ASCEND_CL)
defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CUSTOM_DEVICE)
Tensor cpu_tensor;
cpu_tensor.Resize(framework::make_ddim(dims));
framework::VisitDataType(
......@@ -1175,7 +1256,8 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
is.read(static_cast<char*>(buf), size);
auto dst_place = dev_ctx.GetPlace();
framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor);
if (platform::is_npu_place(dev_ctx.GetPlace())) {
if (platform::is_npu_place(dev_ctx.GetPlace()) ||
platform::is_custom_place(dev_ctx.GetPlace())) {
dev_ctx.Wait();
}
#else
......@@ -1188,9 +1270,12 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
} else if (platform::is_mlu_place(dev_ctx.GetPlace())) {
PADDLE_THROW(platform::errors::Unimplemented(
"MLUPlace is not supported when not compiled with MLU"));
} else {
} else if (platform::is_npu_place(dev_ctx.GetPlace())) {
PADDLE_THROW(platform::errors::Unimplemented(
"NPUPlace is not supported when not compiled with NPU"));
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"CutomPlace is not supported when not compiled with CustomDevice"));
}
#endif
} else {
......
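
The TensorToStream hunk above drains device memory through a fixed 64 MB host staging buffer. A generic, self-contained restatement of that loop, with memcpy standing in for the device-to-host memory::Copy plus Wait() pair:

#include <algorithm>
#include <cstring>
#include <memory>
#include <ostream>

void WriteDeviceBytes(std::ostream& os, const char* device_data, size_t size) {
  constexpr size_t kBufSize = 64 << 20;  // 64 MB staging buffer, as above
  std::unique_ptr<char[]> buf(new char[kBufSize]);
  while (size != 0) {
    size_t chunk = std::min(kBufSize, size);
    std::memcpy(buf.get(), device_data, chunk);  // device -> host stand-in
    os.write(buf.get(), chunk);                  // host -> stream
    device_data += chunk;
    size -= chunk;
  }
}
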
......@@ -180,6 +180,17 @@ void TensorFromArray(const T* src, const size_t& array_size,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (platform::is_custom_place(dst_place)) { // NOLINT
memory::Copy(
dst_place, dst_ptr, src_place, src_ptr, size,
reinterpret_cast<const platform::CustomDeviceContext&>(ctx).stream());
}
#endif
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"TensorFromArray on %s is not supported.", dst_place));
}
}
template <typename T>
......@@ -241,6 +252,17 @@ void TensorFromVector(const std::vector<T>& src,
reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream());
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (platform::is_custom_place(dst_place)) { // NOLINT
memory::Copy(
dst_place, dst_ptr, src_place, src_ptr, size,
reinterpret_cast<const platform::CustomDeviceContext&>(ctx).stream());
}
#endif
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"TensorFromVector on %s is not supported.", dst_place));
}
}
// The fully specialized function should be inline to avoid
......@@ -300,6 +322,17 @@ inline void TensorFromVector(const std::vector<bool>& src,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (platform::is_custom_place(dst_place)) { // NOLINT
auto stream =
reinterpret_cast<const platform::CustomDeviceContext&>(ctx).stream();
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
}
#endif
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"TensorFromVector on %s is not supported.", dst_place));
}
delete[] array;
}
......@@ -369,6 +402,15 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx,
reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream());
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (platform::is_custom_place(src.place())) { // NOLINT
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
}
#endif
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"TensorToVector on %s is not supported.", src.place()));
}
}
template <>
......@@ -410,6 +452,11 @@ inline void TensorToVector(const Tensor& src,
dst_place, dst_ptr, src.place(), src_ptr, size,
reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream());
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (platform::is_custom_place(src.place())) { // NOLINT
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
}
#endif
for (unsigned int i = 0; i < src.numel(); i++) {
(*dst)[i] = static_cast<bool>(array[i]);
......
......@@ -180,6 +180,12 @@ class TensorAddFunctor : public boost::static_visitor<> {
"is not supported in imperative mode",
place));
}
void operator()(const platform::CustomPlace& place) const {
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
private:
int64_t numel_;
......@@ -331,7 +337,14 @@ void TensorAdd(const VarType& src, VarType* dst) {
return;
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
if (platform::is_custom_place(place)) {
PADDLE_THROW(platform::errors::Unimplemented(
"Gradient accumulation of data type (%s) on place (%s) is not "
"supported in imperative mode",
framework::DataTypeToString(data_type), place));
}
#endif
#ifdef PADDLE_WITH_XPU
if (platform::is_xpu_place(place)) {
if (data_type == framework::DataTypeTrait<float>::DataType()) {
......
......@@ -278,6 +278,16 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
if (kernel_iter == kernels.end() &&
paddle::platform::is_custom_place(expected_kernel_key.place_)) {
VLOG(3) << "missing " << place.GetDeviceType() << " kernel: " << op.Type()
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
// TODO(jiabin): Add operator.cc's line 1000 part back when we need that
// case
......
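
The kernel-selection hunk above retries the lookup with the place rewritten to CPUPlace when no custom-device kernel is registered. The same pattern, reduced to a plain map lookup with simplified stand-in types:

#include <map>
#include <string>
#include <utility>

using KernelKey = std::pair<std::string /*op*/, std::string /*place*/>;
using KernelFn = void (*)();

// Returns the kernel for `key`, falling back to the CPU registration when
// the requested place has none (mirroring the expected_kernel_key rewrite).
KernelFn FindKernelWithCpuFallback(const std::map<KernelKey, KernelFn>& kernels,
                                   KernelKey key) {
  auto it = kernels.find(key);
  if (it == kernels.end() && key.second != "CPU") {
    key.second = "CPU";
    it = kernels.find(key);
  }
  return it == kernels.end() ? nullptr : it->second;
}
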
......@@ -20,6 +20,7 @@
#include "paddle/fluid/imperative/amp_auto_cast.h"
#include "paddle/fluid/imperative/op_base.h"
#include "paddle/fluid/platform/denormal.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/string_helper.h"
......@@ -138,6 +139,17 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists(
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't use MLU device since it's not compiled with MLU,"
"Please recompile or reinstall Paddle with MLU support."));
#endif
} else if (platform::is_custom_place(place)) {
#if defined(PADDLE_WITH_CUSTOM_DEVICE)
gc.reset(new framework::CustomDefaultStreamGarbageCollector(place, 0));
VLOG(10) << "Created GarbageCollector at " << place;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't use CustomDevice since it's not compiled with "
"CustomDevice,"
"Please recompile or reinstall Paddle with CustomDevice "
"support."));
#endif
} else {
PADDLE_THROW(platform::errors::PreconditionNotMet(
......@@ -222,6 +234,14 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap<VarType>& ins,
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with MLU if use MLUPlace."));
#endif
} else if (platform::is_custom_place(place)) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
platform::DeviceManager::SetDevice(place);
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with CustomDevice if use "
"CustomPlace."));
#endif
}
if (!override_default_attr_map) {
......
......@@ -58,6 +58,11 @@ else ()
set(AllocatorFacadeDeps)
endif()
if (WITH_CUSTOM_DEVICE)
cc_library(custom_allocator SRCS custom_allocator.cc DEPS allocator device_manager)
set(AllocatorFacadeDeps ${AllocatorFacadeDeps} custom_allocator)
endif()
if (WITH_GPU)
nv_test(best_fit_allocator_test
SRCS best_fit_allocator_test.cc
......
......@@ -62,6 +62,11 @@
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/fluid/memory/allocation/custom_allocator.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#endif
PADDLE_DEFINE_EXPORTED_int64(
gpu_allocator_retry_time, 10000,
"The retry time (milliseconds) when allocator fails "
......@@ -186,6 +191,17 @@ class AllocatorFacadePrivate {
for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) {
InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto device_types = platform::DeviceManager::GetAllCustomDeviceTypes();
for (const auto& dev_type : device_types) {
for (size_t dev_id = 0;
dev_id < platform::DeviceManager::GetDeviceCount(dev_type);
++dev_id) {
InitNaiveBestFitCustomDeviceAllocator(
platform::CustomPlace(dev_type, dev_id));
}
}
#endif
break;
}
......@@ -222,6 +238,17 @@ class AllocatorFacadePrivate {
for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) {
InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto device_types = platform::DeviceManager::GetAllCustomDeviceTypes();
for (const auto& dev_type : device_types) {
for (size_t dev_id = 0;
dev_id < platform::DeviceManager::GetDeviceCount(dev_type);
++dev_id) {
InitAutoGrowthCustomDeviceAllocator(
platform::CustomPlace(dev_type, dev_id), allow_free_idle_chunk);
}
}
#endif
break;
}
......@@ -700,6 +727,21 @@ class AllocatorFacadePrivate {
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
void InitNaiveBestFitCustomDeviceAllocator(platform::CustomPlace p) {
allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
}
void InitAutoGrowthCustomDeviceAllocator(platform::CustomPlace p,
bool allow_free_idle_chunk) {
auto custom_allocator =
std::make_shared<paddle::memory::allocation::CustomAllocator>(p);
allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
custom_allocator, platform::DeviceManager::GetMinChunkSize(p),
allow_free_idle_chunk);
}
#endif
void InitSystemAllocators() {
if (!system_allocators_.empty()) return;
system_allocators_[platform::CPUPlace()] = std::make_shared<CPUAllocator>();
......@@ -770,6 +812,16 @@ class AllocatorFacadePrivate {
places.emplace_back(platform::MLUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto device_types = platform::DeviceManager::GetAllCustomDeviceTypes();
for (const auto& dev_type : device_types) {
for (size_t dev_id = 0;
dev_id < platform::DeviceManager::GetDeviceCount(dev_type);
dev_id++) {
places.emplace_back(platform::CustomPlace(dev_type, dev_id));
}
}
#endif
for (auto& p : places) {
zero_size_allocators_[p] = std::make_shared<ZeroSizeAllocator>(p);
......@@ -1005,7 +1057,6 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
"Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
}
#endif
platform::CUDAPlace p(place.GetDeviceId());
if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) {
return m_->GetAllocator(p, stream, /* create_if_not_found = */ true)
......
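
Both allocator-initialization hunks above enumerate places the same way: every registered custom device type contributes one CustomPlace per visible device. A hedged sketch of that enumeration factored into a helper (the helper itself is hypothetical; DeviceManager and CustomPlace are from the patch):

#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/fluid/platform/device/device_wrapper.h"

template <typename Fn>
void ForEachCustomPlace(Fn&& fn) {
  namespace plat = paddle::platform;
  for (const auto& dev_type : plat::DeviceManager::GetAllCustomDeviceTypes()) {
    size_t n = plat::DeviceManager::GetDeviceCount(dev_type);
    for (size_t dev_id = 0; dev_id < n; ++dev_id) {
      fn(plat::CustomPlace(dev_type, dev_id));
    }
  }
}
#endif
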
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/custom_allocator.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace memory {
namespace allocation {
bool CustomAllocator::IsAllocThreadSafe() const { return true; }
void CustomAllocator::FreeImpl(pten::Allocation* allocation) {
PADDLE_ENFORCE_EQ(
allocation->place(), place_,
platform::errors::PermissionDenied("CustomDevice memory is "
"freed in incorrect device. "
"This may be a bug"));
delete allocation;
}
pten::Allocation* CustomAllocator::AllocateImpl(size_t size) {
std::call_once(once_flag_,
[this] { platform::DeviceManager::SetDevice(place_); });
void* ptr =
platform::DeviceManager::GetDeviceWithPlace(place_)->MemoryAllocate(size);
if (LIKELY(ptr)) {
return new Allocation(ptr, size, place_);
}
size_t avail, total;
platform::DeviceManager::MemoryStats(place_, &total, &avail);
auto dev_type = platform::PlaceHelper::GetDeviceType(place_);
auto dev_id = platform::PlaceHelper::GetDeviceId(place_);
PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
"\n\nOut of memory error on %s:%d. "
"Cannot allocate %s memory on %s:%d, "
"available memory is only %s.\n\n"
"Please check whether there is any other process using %s:%d.\n"
"1. If yes, please stop them, or start PaddlePaddle on another %s.\n"
"2. If no, please decrease the batch size of your model.\n\n",
dev_type, dev_id, string::HumanReadableSize(size), dev_type, dev_id,
string::HumanReadableSize(avail), dev_type, dev_id, dev_type));
}
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <mutex> // NOLINT
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace memory {
namespace allocation {
class CustomAllocator : public Allocator {
public:
explicit CustomAllocator(const platform::CustomPlace& place)
: place_(place) {}
bool IsAllocThreadSafe() const override;
protected:
void FreeImpl(pten::Allocation* allocation) override;
pten::Allocation* AllocateImpl(size_t size) override;
private:
platform::Place place_;
std::once_flag once_flag_;
};
} // namespace allocation
} // namespace memory
} // namespace paddle
......@@ -20,6 +20,7 @@
#include "glog/logging.h"
#include "paddle/fluid/memory/detail/buddy_allocator.h"
#include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -30,7 +31,6 @@
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
#include "paddle/fluid/platform/device/device_wrapper.h"
PADDLE_DEFINE_EXPORTED_bool(
init_allocated_mem, false,
......@@ -733,6 +733,136 @@ uint64_t Release<platform::MLUPlace>(const platform::MLUPlace &place) {
#endif
}
// For CustomDevice
#ifdef PADDLE_WITH_CUSTOM_DEVICE
class BuddyAllocatorList {
private:
explicit BuddyAllocatorList(const std::string &device_type)
: device_type_(device_type) {
auto devices = platform::DeviceManager::GetDeviceList(device_type);
for (auto dev_id : devices) {
init_flags_[dev_id].reset(new std::once_flag());
}
}
static BuddyAllocatorList *CreateNewInstance(const std::string &device_type) {
return new BuddyAllocatorList(device_type);
}
public:
static BuddyAllocatorList *Instance(const std::string &device_type) {
// DeviceType -> AllocatorList
static std::unordered_map<std::string, BuddyAllocatorList *> pool;
if (pool.find(device_type) == pool.end()) {
pool[device_type] = CreateNewInstance(device_type);
}
return pool[device_type];
}
BuddyAllocator *Get(int dev_id) {
PADDLE_ENFORCE_NE(init_flags_.find(dev_id), init_flags_.end(),
platform::errors::OutOfRange(
"Cannot find %s %d, please check visible devices.",
device_type_, dev_id));
std::call_once(*init_flags_[dev_id], [this, dev_id] {
platform::DeviceManager::SetDevice(device_type_, dev_id);
platform::CustomPlace place(device_type_, dev_id);
allocators_[dev_id].reset(new BuddyAllocator(
std::unique_ptr<detail::SystemAllocator>(
new detail::CustomAllocator(device_type_, dev_id)),
platform::DeviceManager::GetMinChunkSize(place),
platform::DeviceManager::GetMaxChunkSize(place),
platform::DeviceManager::GetExtraPaddingSize(place), device_type_));
});
return allocators_[dev_id].get();
}
private:
std::string device_type_;
std::unordered_map<size_t, std::unique_ptr<std::once_flag>> init_flags_;
std::unordered_map<size_t, std::unique_ptr<BuddyAllocator>> allocators_;
};
BuddyAllocator *GetBuddyAllocator(const platform::Place &place) {
VLOG(10) << "GetBuddyAllocator place = " << place;
if (platform::is_custom_place(place)) {
return BuddyAllocatorList::Instance(
platform::PlaceHelper::GetDeviceType(place))
->Get(platform::PlaceHelper::GetDeviceId(place));
} else {
PADDLE_THROW(
platform::errors::InvalidArgument("place must be CustomPlace"));
}
}
#endif
template <>
void *Alloc<platform::CustomPlace>(const platform::CustomPlace &place,
size_t size) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
auto *buddy_allocator = GetBuddyAllocator(place);
auto *ptr = buddy_allocator->Alloc(size);
if (ptr == nullptr) {
platform::DeviceGuard guard(place);
size_t avail, total;
platform::DeviceManager::MemoryStats(place, &total, &avail);
PADDLE_THROW(platform::errors::ResourceExhausted(
"Cannot allocate %s in %s:%d, avaliable %s, total %s, used "
"%s. ",
string::HumanReadableSize(size), place.GetDeviceType(), place.device,
string::HumanReadableSize(avail), string::HumanReadableSize(total),
string::HumanReadableSize(total - avail)));
} else {
if (FLAGS_init_allocated_mem) {
platform::DeviceManager::GetDeviceWithPlace(place)->MemorySet(ptr, 0xEF,
size);
}
}
VLOG(10) << " pointer=" << ptr;
return ptr;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'CustomPlace' is not supported in CPU only device."));
#endif
}
template <>
void Free<platform::CustomPlace>(const platform::CustomPlace &place, void *p,
size_t size) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
GetBuddyAllocator(place)->Free(p);
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'CustomPlace' is not supported in CPU only device."));
#endif
}
template <>
uint64_t Release<platform::CustomPlace>(const platform::CustomPlace &place) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
return GetBuddyAllocator(place)->Release();
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'CustomPlace' is not supported in CPU only device."));
#endif
}
template <>
size_t Used<platform::CustomPlace>(const platform::CustomPlace &place) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
return GetBuddyAllocator(place)->Used();
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'CustomPlace' is not supported in CPU only device."));
#endif
}
struct AllocVisitor : public boost::static_visitor<void *> {
inline explicit AllocVisitor(size_t size) : size_(size) {}
......
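
BuddyAllocatorList above builds one BuddyAllocator per device, lazily and thread-safely, by keeping a std::once_flag per device id. The same pattern in isolation (simplified: the real class also pins the device inside the call_once body):

#include <memory>
#include <mutex>
#include <vector>

template <typename T>
class PerKeyLazy {
 public:
  explicit PerKeyLazy(size_t n) : flags_(n), objs_(n) {}

  // Thread-safe: the object for `key` is constructed exactly once, even
  // under concurrent Get() calls, and the containers are never resized.
  T* Get(size_t key) {
    std::call_once(flags_[key].flag, [&] { objs_[key].reset(new T()); });
    return objs_[key].get();
  }

 private:
  struct Flag { std::once_flag flag; };
  std::vector<Flag> flags_;
  std::vector<std::unique_ptr<T>> objs_;
};
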
......@@ -25,9 +25,7 @@ limitations under the License. */
DECLARE_uint64(reallocate_gpu_memory_in_mb);
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#include "paddle/fluid/platform/device/device_wrapper.h"
namespace paddle {
namespace memory {
......@@ -35,12 +33,37 @@ namespace detail {
BuddyAllocator::BuddyAllocator(
std::unique_ptr<SystemAllocator> system_allocator, size_t min_chunk_size,
size_t max_chunk_size, size_t extra_padding_size)
size_t max_chunk_size, size_t extra_padding_size,
const std::string dev_type)
: min_chunk_size_(min_chunk_size),
max_chunk_size_(max_chunk_size),
extra_padding_size_(extra_padding_size),
cache_(system_allocator->UseGpu()),
system_allocator_(std::move(system_allocator)) {}
system_allocator_(std::move(system_allocator)) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
if (!dev_type.empty()) {
init_allocate_size_func_ = [dev_type]() {
return platform::DeviceManager::GetInitAllocSize(
platform::PlaceHelper::CreatePlace(dev_type));
};
re_allocate_size_func_ = [dev_type]() {
return platform::DeviceManager::GetReallocSize(
platform::PlaceHelper::CreatePlace(dev_type));
};
} else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
init_allocate_size_func_ = &platform::GpuInitAllocSize;
re_allocate_size_func_ = &platform::GpuReallocSize;
#elif defined(PADDLE_WITH_ASCEND_CL)
init_allocate_size_func_ = &platform::NPUInitAllocSize;
re_allocate_size_func_ = &platform::NPUReallocSize;
#elif defined(PADDLE_WITH_MLU)
init_allocate_size_func_ = &platform::MLUInitAllocSize;
re_allocate_size_func_ = &platform::MLUReallocSize;
#endif
}
#endif
}
BuddyAllocator::~BuddyAllocator() {
VLOG(10) << "BuddyAllocator Disconstructor makes sure that all of these "
......@@ -224,6 +247,10 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
size_t allocate_bytes = max_chunk_size_;
size_t index = 0;
#ifdef PADDLE_WITH_CUSTOM_DEVICE
allocate_bytes = DeviceAllocateSize(init_allocate_size_func_,
re_allocate_size_func_, request_bytes);
#else
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
allocate_bytes = DeviceAllocateSize(&platform::GpuInitAllocSize,
&platform::GpuReallocSize, request_bytes);
......@@ -233,6 +260,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
#elif defined(PADDLE_WITH_MLU)
allocate_bytes = DeviceAllocateSize(&platform::MLUInitAllocSize,
&platform::MLUReallocSize, request_bytes);
#endif
#endif
// Allocate a new block
......
......@@ -39,7 +39,8 @@ class BuddyAllocator {
public:
BuddyAllocator(std::unique_ptr<SystemAllocator> system_allocator,
size_t min_chunk_size, size_t max_chunk_size,
size_t extra_padding_size = 0);
size_t extra_padding_size = 0,
const std::string dev_type = "");
~BuddyAllocator();
......@@ -123,6 +124,9 @@ class BuddyAllocator {
/*! Allocate CPU/GPU memory from system */
std::unique_ptr<SystemAllocator> system_allocator_;
std::mutex mutex_;
#ifdef PADDLE_WITH_CUSTOM_DEVICE
std::function<size_t()> init_allocate_size_func_, re_allocate_size_func_;
#endif
};
} // namespace detail
......
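
The constructor change above turns the per-backend #ifdef chains in RefillPool into std::function hooks bound once at construction. A minimal restatement of that strategy choice:

#include <cstddef>
#include <functional>

struct ChunkPolicy {
  std::function<size_t()> init_size;  // first allocation from the system
  std::function<size_t()> grow_size;  // subsequent pool refills
  size_t NextChunk(bool first_refill) const {
    return first_refill ? init_size() : grow_size();
  }
};

// Usage: a custom device could bind DeviceManager::GetInitAllocSize /
// GetReallocSize here, exactly as the constructor above does.
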
......@@ -38,6 +38,8 @@ limitations under the License. */
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
#include "paddle/fluid/platform/device/device_wrapper.h"
DECLARE_bool(use_pinned_memory);
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb);
......@@ -430,6 +432,51 @@ void MLUAllocator::Free(void* p, size_t size, size_t index) {
bool MLUAllocator::UseGpu() const { return true; }
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
void* CustomAllocator::Alloc(size_t* index, size_t size) {
if (size <= 0) return nullptr;
void* p;
auto place = platform::CustomPlace(dev_type_, dev_id_);
auto device = platform::DeviceManager::GetDeviceWithPlace(place);
p = device->MemoryAllocate(size);
if (LIKELY(p)) {
VLOG(4) << "CustomAllocator::Alloc " << p << " size " << size;
*index = 0;
plug_alloc_size += size;
} else {
size_t avail, total;
platform::DeviceManager::MemoryStats(place, &total, &avail);
PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
"\n\nOut of memory error on %s %d. "
"total memory is %s, used memory is %s, "
"available memory is only %s.\n\n",
dev_type_, dev_id_, string::HumanReadableSize(total),
string::HumanReadableSize(total - avail),
string::HumanReadableSize(avail)));
}
return p;
}
void CustomAllocator::Free(void* p, size_t size, size_t index) {
VLOG(4) << "CustomAllocator::Free " << p << " size " << size;
PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument(
"The index should be 0, index is %d", index));
PADDLE_ENFORCE_GE(plug_alloc_size, size,
platform::errors::InvalidArgument(
"The size of memory (%d) to free exceeds the size of "
"allocated gpu memory (%d)",
size, plug_alloc_size));
plug_alloc_size -= size;
auto place = platform::CustomPlace(dev_type_, dev_id_);
auto device = platform::DeviceManager::GetDeviceWithPlace(place);
device->MemoryDeallocate(p, size);
}
bool CustomAllocator::UseGpu() const { return true; }
#endif
} // namespace detail
} // namespace memory
} // namespace paddle
......@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <stddef.h> // for size_t
#include <string>
namespace paddle {
namespace memory {
......@@ -107,6 +108,23 @@ class MLUAllocator : public SystemAllocator {
};
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
class CustomAllocator : public SystemAllocator {
public:
explicit CustomAllocator(const std::string& device_type, size_t dev_id)
: dev_type_(device_type), dev_id_(dev_id) {}
virtual void* Alloc(size_t* index, size_t size);
virtual void Free(void* p, size_t size, size_t index);
virtual bool UseGpu() const;
private:
size_t plug_alloc_size = 0;
std::string dev_type_;
size_t dev_id_;
};
#endif
} // namespace detail
} // namespace memory
} // namespace paddle
This diff has been collapsed.
......@@ -36,66 +36,25 @@ namespace memory {
template <typename DstPlace, typename SrcPlace>
void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
/**
* \brief Copy memory from one place to another place.
*
* \param[in] DstPlace Destination allocation place (CPU or GPU).
* \param[in] dst Destination memory address.
* \param[in] SrcPlace Source allocation place (CPU or GPU).
* \param[in] src Source memory address.
* \param[in] num memory size in bytes to copy.
* \param[in] stream CUDA stream.
*
* \note For GPU memory copy, CUDA stream need to be specified
* for asynchronously memory copy.
*
*/
template <typename DstPlace, typename SrcPlace>
void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num,
gpuStream_t stream);
#endif
#ifdef PADDLE_WITH_ASCEND_CL
/**
* \brief Copy memory from one place to another place.
*
* \param[in] DstPlace Destination allocation place (CPU or NPU).
* \param[in] dst Destination memory address.
* \param[in] SrcPlace Source allocation place (CPU or NPU).
* \param[in] src Source memory address.
* \param[in] num memory size in bytes to copy.
* \param[in] stream NPU stream.
*
* \note For NPU memory copy, NPU stream need to be specified
* for asynchronously memory copy.
*
*/
template <typename DstPlace, typename SrcPlace>
void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num,
aclrtStream stream);
#endif
#ifdef PADDLE_WITH_MLU
/**
* \brief Copy memory from one place to another place.
*
* \param[in] DstPlace Destination allocation place (CPU or MLU).
* \param[in] DstPlace Destination allocation place (CPU or GPU or XPU or
* CustomDevice).
* \param[in] dst Destination memory address.
* \param[in] SrcPlace Source allocation place (CPU or MLU).
* \param[in] SrcPlace Source allocation place (CPU or GPU or XPU or
* CustomDevice).
* \param[in] src Source memory address.
* \param[in] num memory size in bytes to copy.
* \param[in] stream MLU stream.
* \param[in] stream stream for asynchronous memory copy.
*
* \note For MLU memory copy, MLU stream need to be specified
* for asynchronously memory copy.
* \note For GPU/XPU/CustomDevice memory copy, a stream needs to be
* specified for asynchronous memory copy, and its concrete type is
* restored in the implementation.
*
*/
template <typename DstPlace, typename SrcPlace>
void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num,
mluStream stream);
#endif
void* stream);
} // namespace memory
} // namespace paddle
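
The header rewrite above collapses the per-backend Copy overloads (gpuStream_t, aclrtStream, mluStream) into a single signature whose stream argument is type-erased to void*, with nullptr meaning a synchronous copy. A self-contained sketch of that convention, with memcpy standing in for backend copies:

#include <cstring>

// Backend-agnostic copy: each implementation reinterpret_casts `stream`
// back to its own stream type; nullptr requests a synchronous copy (the
// convention the TensorCopySync hunks rely on).
void CopyBytes(void* dst, const void* src, size_t num, void* stream) {
  if (stream == nullptr) {
    std::memcpy(dst, src, num);  // synchronous path
    return;
  }
  // A real backend would enqueue an async copy on the recovered stream.
  std::memcpy(dst, src, num);
}
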
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/math_function.h"
#ifdef PADDLE_WITH_MKLML
#include "paddle/fluid/platform/dynload/mklml.h"
#endif
#ifdef PADDLE_USE_OPENBLAS
#include <cblas.h>
#endif
#include <memory>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/operators/math/math_function_impl.h"
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/pten/backends/cpu/cpu_context.h"
#include "paddle/pten/kernels/funcs/eigen/common.h"
#include "unsupported/Eigen/CXX11/Tensor"
namespace paddle {
namespace operators {
namespace math {
using float16 = paddle::platform::float16;
template struct SetConstant<platform::CPUDeviceContext, platform::float16>;
template struct SetConstant<platform::CPUDeviceContext, platform::bfloat16>;
template struct SetConstant<platform::CPUDeviceContext, float>;
template struct SetConstant<platform::CPUDeviceContext, double>;
template struct SetConstant<platform::CPUDeviceContext, int16_t>;
template struct SetConstant<platform::CPUDeviceContext, int>;
template struct SetConstant<platform::CPUDeviceContext, int64_t>;
template struct SetConstant<platform::CPUDeviceContext, bool>;
template struct SetConstant<platform::CPUDeviceContext, uint8_t>;
template struct SetConstant<platform::CPUDeviceContext,
platform::complex<float>>;
template struct SetConstant<platform::CPUDeviceContext,
platform::complex<double>>;
template struct SetConstant<pten::CPUContext, platform::float16>;
template struct SetConstant<pten::CPUContext, platform::bfloat16>;
template struct SetConstant<pten::CPUContext, float>;
template struct SetConstant<pten::CPUContext, double>;
template struct SetConstant<pten::CPUContext, int16_t>;
template struct SetConstant<pten::CPUContext, int>;
template struct SetConstant<pten::CPUContext, int64_t>;
template struct SetConstant<pten::CPUContext, bool>;
template struct SetConstant<pten::CPUContext, uint8_t>;
template struct SetConstant<pten::CPUContext, platform::complex<float>>;
template struct SetConstant<pten::CPUContext, platform::complex<double>>;
#ifdef PADDLE_WITH_XPU
template struct SetConstant<platform::XPUDeviceContext, platform::float16>;
template struct SetConstant<platform::XPUDeviceContext, platform::bfloat16>;
template struct SetConstant<platform::XPUDeviceContext, float>;
template struct SetConstant<platform::XPUDeviceContext, double>;
template struct SetConstant<platform::XPUDeviceContext, uint8_t>;
template struct SetConstant<platform::XPUDeviceContext, int16_t>;
template struct SetConstant<platform::XPUDeviceContext, int>;
template struct SetConstant<platform::XPUDeviceContext, int64_t>;
template struct SetConstant<platform::XPUDeviceContext, bool>;
template struct SetConstant<platform::XPUDeviceContext,
platform::complex<float>>;
template struct SetConstant<platform::XPUDeviceContext,
platform::complex<double>>;
#endif
#define DEFINE_CPU_TRANS(RANK) \
template struct Transpose<platform::CPUDeviceContext, platform::float16, \
RANK>; \
template struct Transpose<platform::CPUDeviceContext, platform::bfloat16, \
RANK>; \
template struct Transpose<platform::CPUDeviceContext, float, RANK>; \
template struct Transpose<platform::CPUDeviceContext, double, RANK>; \
template struct Transpose<platform::CPUDeviceContext, int, RANK>; \
template struct Transpose<platform::CPUDeviceContext, int64_t, RANK>; \
template struct Transpose<platform::CPUDeviceContext, bool, RANK>; \
template struct Transpose<platform::CPUDeviceContext, int16_t, RANK>; \
template struct Transpose<platform::CPUDeviceContext, uint8_t, RANK>; \
template struct Transpose<platform::CPUDeviceContext, int8_t, RANK>; \
template struct Transpose<platform::CPUDeviceContext, \
platform::complex<float>, RANK>; \
template struct Transpose<platform::CPUDeviceContext, \
platform::complex<double>, RANK>;
DEFINE_CPU_TRANS(1);
DEFINE_CPU_TRANS(2);
DEFINE_CPU_TRANS(3);
DEFINE_CPU_TRANS(4);
DEFINE_CPU_TRANS(5);
DEFINE_CPU_TRANS(6);
template <typename T>
struct TransposeNormal<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& in, framework::Tensor* out,
const std::vector<int>& axis) {
const int rank = axis.size();
auto in_stride = framework::stride(in.dims());
auto out_stride = framework::stride(out->dims());
const T* in_ptr = in.data<T>();
T* out_ptr = out->data<T>();
auto transpose_helper = [&](int64_t beg, int64_t end) {
for (int64_t out_idx = beg; out_idx < end; ++out_idx) {
int64_t in_idx = 0;
int64_t tmp_idx = out_idx;
// calculate the input index
for (int i = 0; i < rank; ++i) {
const int64_t coordinate = tmp_idx / out_stride[i];
tmp_idx -= coordinate * out_stride[i];
in_idx += coordinate * in_stride[axis[i]];
}
out_ptr[out_idx] = in_ptr[in_idx];
}
};
transpose_helper(0, out->numel());
}
};
// define transpose normal
#define DEFINE_CPU_TRANS_NORMAL(TYPE) \
template struct TransposeNormal<platform::CPUDeviceContext, TYPE>
DEFINE_CPU_TRANS_NORMAL(platform::float16);
DEFINE_CPU_TRANS_NORMAL(platform::bfloat16);
DEFINE_CPU_TRANS_NORMAL(float);
DEFINE_CPU_TRANS_NORMAL(double);
DEFINE_CPU_TRANS_NORMAL(int);
DEFINE_CPU_TRANS_NORMAL(int64_t);
DEFINE_CPU_TRANS_NORMAL(bool);
DEFINE_CPU_TRANS_NORMAL(int16_t);
DEFINE_CPU_TRANS_NORMAL(uint8_t);
DEFINE_CPU_TRANS_NORMAL(int8_t);
DEFINE_CPU_TRANS_NORMAL(platform::complex<float>);
DEFINE_CPU_TRANS_NORMAL(platform::complex<double>);
struct TensorSetConstantCPU {
TensorSetConstantCPU(framework::Tensor* tensor, float value)
: tensor_(tensor), value_(value) {}
template <typename T>
void apply() const {
auto cpu = platform::CPUPlace();
auto* begin = tensor_->mutable_data<T>(cpu);
std::fill(begin, begin + tensor_->numel(), static_cast<T>(value_));
}
framework::Tensor* tensor_;
float value_;
};
template <>
void set_constant_with_place<platform::XPUPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
float value) {
PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported"));
}
template <>
void set_constant_with_place<platform::NPUPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
float value) {
PADDLE_THROW(platform::errors::Unimplemented("NPUPlace is not supported"));
}
template <>
void set_constant_with_place<platform::NPUPinnedPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
float value) {
PADDLE_THROW(
platform::errors::Unimplemented("NPUPinnedPlace is not supported"));
}
template <>
void set_constant_with_place<platform::IPUPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
float value) {
PADDLE_THROW(platform::errors::Unimplemented("IPUPlace is not supported"));
}
template <>
void set_constant_with_place<platform::CPUPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
float value) {
framework::VisitDataType(tensor->type(), TensorSetConstantCPU(tensor, value));
}
template <>
void set_constant_with_place<platform::MLUPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
float value) {
PADDLE_THROW(platform::errors::Unimplemented("MLUPlace is not supported"));
}
template <>
void set_constant_with_place<platform::CustomPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
float value) {
PADDLE_THROW(platform::errors::Unimplemented("CustomPlace is not supported"));
}
template <>
void set_constant_with_place<platform::CUDAPinnedPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
float value) {
framework::VisitDataType(tensor->type(), TensorSetConstantCPU(tensor, value));
}
struct TensorSetConstantWithPlace : public boost::static_visitor<void> {
TensorSetConstantWithPlace(const platform::DeviceContext& context,
framework::Tensor* tensor, float value)
: context_(context), tensor_(tensor), value_(value) {}
template <typename Place>
void operator()(Place place) const {
set_constant_with_place<Place>(context_, tensor_, value_);
}
const platform::DeviceContext& context_;
framework::Tensor* tensor_;
float value_;
};
void set_constant(const platform::DeviceContext& context,
framework::Tensor* tensor, float value) {
TensorSetConstantWithPlace func(context, tensor, value);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// tensor->place().apply_visitor(func);
paddle::platform::VisitPlace(tensor->place(), func);
#else
func(platform::CPUPlace());
#endif
}
template <typename T>
struct RowwiseAdd<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input,
const framework::Tensor& vector, framework::Tensor* output) {
auto in_dims = input.dims();
auto out_dims = output->dims();
auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(
vector.numel(), size,
platform::errors::InvalidArgument(
"The input vector size"
" should be equal to the size of each row of input tensor."
" Expected vector size=%d, but received %d",
size, vector.numel()));
const char* in_dims_cstr = in_dims.to_str().c_str();
const char* out_dims_cstr = out_dims.to_str().c_str();
PADDLE_ENFORCE_EQ(out_dims, in_dims,
platform::errors::InvalidArgument(
"The output tensor shape should be same as the input"
" tensor shape. Expected output tensor shape: %s,"
" but received %s",
in_dims_cstr, out_dims_cstr));
auto in = framework::EigenMatrix<T>::From(input);
auto vec = framework::EigenVector<T>::Flatten(vector);
auto out = framework::EigenMatrix<T>::From(*output);
for (int64_t i = 0; i < in_dims[0]; ++i) {
out.chip(i, 0) = in.chip(i, 0) + vec;
}
}
};
template struct RowwiseAdd<platform::CPUDeviceContext, float>;
template struct RowwiseAdd<platform::CPUDeviceContext, double>;
template struct ColwiseSum<platform::CPUDeviceContext, float>;
template struct ColwiseSum<platform::CPUDeviceContext, double>;
template struct ColwiseSum<platform::CPUDeviceContext, int>;
template struct ColwiseSum<platform::CPUDeviceContext, int64_t>;
template struct RowwiseSum<platform::CPUDeviceContext, float>;
template struct RowwiseSum<platform::CPUDeviceContext, double>;
template struct RowwiseMean<platform::CPUDeviceContext, float>;
template struct RowwiseMean<platform::CPUDeviceContext, double>;
template <typename T>
struct ElementwiseAddTo<platform::CPUDeviceContext, T> {
void operator()(platform::CPUDeviceContext* ctx, const framework::Tensor& src,
framework::Tensor* dst) {
auto in = framework::EigenVector<T>::Flatten(src);
auto out = framework::EigenVector<T>::Flatten(*dst);
auto& place = *(ctx->eigen_device());
out.device(place) = out + in;
}
};
template struct ElementwiseAddTo<platform::CPUDeviceContext, platform::float16>;
} // namespace math
} // namespace operators
} // namespace paddle
IF(WITH_CUSTOM_DEVICE)
cc_library(callback_manager SRCS callback_manager.cc DEPS enforce place)
cc_library(device_guard SRCS device_guard.cc DEPS enforce place)
cc_library(stream SRCS stream.cc DEPS callback_manager)
cc_library(event SRCS event.cc DEPS enforce place)
cc_library(device_base SRCS device_base.cc DEPS stream event callback_manager device_guard device_context flags)
ENDIF()
set(DEV_LIBS custom_device)
# GPU
IF(WITH_GPU OR WITH_ROCM)
add_subdirectory(gpu)
......@@ -22,3 +37,11 @@ ENDIF()
IF(WITH_MLU)
add_subdirectory(mlu)
ENDIF()
# CUSTOM
IF(WITH_CUSTOM_DEVICE)
add_subdirectory(custom)
cc_library(device_manager SRCS device_manager.cc DEPS custom_device)
set(GLOB_DEV_LIB device_manager custom_device CACHE INTERNAL "Global DEV library")
ENDIF()
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/callback_manager.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
CallbackManager::CallbackManager(stream::Stream *stream)
: stream_(stream), thread_pool_(1) {}
void CallbackManager::AddCallback(std::function<void()> callback) const {
auto *callback_func = new std::function<void()>(std::move(callback));
auto *func = new std::function<void()>([this, callback_func] {
std::lock_guard<std::mutex> lock(mtx_);
last_future_ = thread_pool_.enqueue([callback_func] {
std::unique_ptr<std::function<void()>> releaser(callback_func);
(*callback_func)();
});
});
platform::DeviceManager::GetDeviceWithPlace(stream_->GetPlace())
->AddCallback(stream_, func);
}
void CallbackManager::Wait() const {
platform::DeviceManager::GetDeviceWithPlace(stream_->GetPlace())
->SynchronizeStream(stream_);
{
std::lock_guard<std::mutex> lock(mtx_);
if (last_future_.valid()) {
last_future_.wait();
}
}
}
} // namespace platform
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <ThreadPool.h>
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#include <cuda_runtime.h>
#endif
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif
#include <functional>
#include <future> // NOLINT
#include <memory>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
namespace stream {
class Stream;
} // namespace stream
// NOTE(zjl): clean CallbackManager to make compilation faster
// Make CallbackManager thread-safe
class CallbackManager {
public:
explicit CallbackManager(stream::Stream* stream);
~CallbackManager() = default;
void AddCallback(std::function<void()> callback) const;
void Wait() const;
private:
stream::Stream* stream_;
mutable ::ThreadPool thread_pool_;
mutable std::mutex mtx_;
mutable std::future<void> last_future_;
};
} // namespace platform
} // namespace paddle
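// A minimal usage sketch for CallbackManager, assuming a custom device is
// already registered for `place` and that stream::Stream exposes an
// Init(place) helper symmetric to Event::Init further below (an assumption
// here, not shown in this file). AddCallback forwards the callback to the
// plugin's stream_add_callback hook via DeviceManager; Wait() synchronizes
// the stream first and then blocks until the last enqueued callback has run.
#include "paddle/fluid/platform/device/callback_manager.h"
#include "paddle/fluid/platform/device/stream.h"
void CallbackManagerSketch(const paddle::platform::Place& place) {
  paddle::platform::stream::Stream stream;
  stream.Init(place);  // creates the underlying plugin stream
  paddle::platform::CallbackManager manager(&stream);
  manager.AddCallback([] { VLOG(3) << "stream work finished"; });
  manager.Wait();  // stream sync, then drain the callback thread pool
}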
IF(WITH_CUSTOM_DEVICE)
cc_library(custom_device SRCS custom_device.cc DEPS device_base device_context)
cc_test(custom_device_test SRCS custom_device_test.cc DEPS device_manager device_context)
ENDIF()
This diff has been collapsed.
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <string>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/device/custom/fake_cpu_device.h"
#include "paddle/fluid/platform/device/device_manager.h"
#include "paddle/fluid/platform/device_context.h"
void RegisterDevice() {
CustomRuntimeParams runtime_params;
runtime_params.size = sizeof(CustomRuntimeParams);
auto device_interface = std::make_unique<C_DeviceInterface>();
runtime_params.interface = device_interface.get();
std::memset(runtime_params.interface, 0, sizeof(C_DeviceInterface));
runtime_params.interface->size = sizeof(C_DeviceInterface);
InitFakeCPUDevice(&runtime_params);
EXPECT_TRUE(paddle::platform::LoadCustomRuntimeLib(
runtime_params, std::move(device_interface), nullptr));
}
void InitDevice() {
RegisterDevice();
EXPECT_GT(static_cast<int>(
paddle::platform::DeviceManager::GetAllDeviceTypes().size()),
0);
auto place = paddle::platform::CustomPlace(DEVICE_TYPE, 0);
auto device = paddle::platform::DeviceManager::GetDeviceWithPlace(place);
EXPECT_NE(device, nullptr);
std::vector<paddle::platform::Place> places;
auto device_types = paddle::platform::DeviceManager::GetAllDeviceTypes();
for (auto dev_type : device_types) {
auto devices = paddle::platform::DeviceManager::GetDeviceList(dev_type);
for (auto dev_id : devices) {
places.push_back(
paddle::platform::PlaceHelper::CreatePlace(dev_type, dev_id));
}
}
EXPECT_GT(static_cast<int>(places.size()), 0);
paddle::platform::DeviceContextPool::Init(places);
}
void TestDeviceInterface(const paddle::platform::Place& place) {
std::cout << "TestDeviceInterface on " << place << std::endl;
if (paddle::platform::is_custom_place(place)) {
auto device = paddle::platform::DeviceManager::GetDeviceWithPlace(place);
auto dev_type = paddle::platform::PlaceHelper::GetDeviceType(place);
auto p1 = device->MemoryAllocate(
paddle::platform::DeviceManager::GetMinChunkSize(place));
EXPECT_NE(p1, nullptr);
paddle::platform::DeviceManager::SetDevice(place);
auto dev_id = paddle::platform::DeviceManager::GetDevice(dev_type);
EXPECT_EQ(dev_id, place.GetDeviceId());
}
}
void TestTensorMutableData(const paddle::platform::Place& place) {
std::cout << "TestTensorInitialization on " << place << std::endl;
paddle::framework::Tensor src_tensor;
float* p1 = nullptr;
float* p2 = nullptr;
// initialization
p1 = src_tensor.mutable_data<float>(paddle::framework::make_ddim({1, 2, 3}),
place);
auto p1_holder = src_tensor.Holder();
EXPECT_NE(p1, nullptr);
// set src_tensor a new dim with large size
  // memory is supposed to be re-allocated
p2 = src_tensor.mutable_data<float>(paddle::framework::make_ddim({3, 1024}),
place);
auto p2_holder = src_tensor.Holder();
EXPECT_NE(p2, nullptr);
EXPECT_NE(p1_holder.get(), p2_holder.get());
// set src_tensor a new dim with same size
  // memory block is supposed to be unchanged
p1 = src_tensor.mutable_data<float>(paddle::framework::make_ddim({2, 2, 3}),
place);
EXPECT_EQ(p1, p2);
// set src_tensor a new dim with smaller size
  // memory block is supposed to be unchanged
p2 = src_tensor.mutable_data<float>(paddle::framework::make_ddim({2, 2}),
place);
EXPECT_EQ(p1, p2);
}
void TestTensorShareDataWith(const paddle::platform::Place& place) {
std::cout << "TestTensorShareDataWith on " << place << std::endl;
paddle::framework::Tensor src_tensor;
paddle::framework::Tensor dst_tensor;
src_tensor.mutable_data<int>(paddle::framework::make_ddim({2, 3, 4}), place);
dst_tensor.ShareDataWith(src_tensor);
ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
}
void TestTensorUtils(const paddle::platform::Place& place) {
if (paddle::platform::is_custom_place(place) == false) {
return;
}
paddle::framework::Tensor src_tensor;
paddle::framework::Tensor gpu_tensor;
paddle::framework::Tensor dst_tensor;
int* src_ptr = src_tensor.mutable_data<int>(
paddle::framework::make_ddim({3, 3}), paddle::platform::CPUPlace());
int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
memcpy(src_ptr, arr, 9 * sizeof(int));
// CPU Tensor to GPU Tensor
paddle::platform::CustomDeviceContext gpu_ctx(place);
paddle::framework::TensorCopy(src_tensor, place, gpu_ctx, &gpu_tensor);
#if 0
// GPU Tensor to CPU Tensor
auto cpu_place = new paddle::platform::CPUPlace();
paddle::framework::TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
// Sync before Compare Tensors
gpu_ctx.Wait();
const int* dst_ptr = dst_tensor.data<int>();
EXPECT_NE(src_ptr, dst_ptr);
for (size_t i = 0; i < 9; ++i) {
EXPECT_EQ(src_ptr[i], dst_ptr[i]);
}
// Copy the same tensor
paddle::framework::TensorCopy(gpu_tensor, place, gpu_ctx, &gpu_tensor);
gpu_ctx.Wait();
const int* dst_ptr_tmp = dst_tensor.data<int>();
EXPECT_NE(src_ptr, dst_ptr_tmp);
for (size_t i = 0; i < 9; ++i) {
EXPECT_EQ(src_ptr[i], dst_ptr_tmp[i]);
}
paddle::framework::Tensor slice_tensor = src_tensor.Slice(1, 2);
// CPU Slice Tensor to GPU Tensor
paddle::framework::TensorCopy(slice_tensor, place, gpu_ctx, &gpu_tensor);
// GPU Tensor to CPU Tensor
paddle::framework::TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
// Sync before Compare Slice Tensors
gpu_ctx.Wait();
const int* slice_ptr = slice_tensor.data<int>();
dst_ptr = dst_tensor.data<int>();
EXPECT_NE(dst_ptr, slice_ptr);
for (size_t i = 0; i < 3; ++i) {
EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
}
EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout());
#endif
}
TEST(CustomDevice, Tensor) {
InitDevice();
auto dev_types = paddle::platform::DeviceManager::GetAllDeviceTypes();
for (const auto& dev_type : dev_types) {
std::cout << "Test on " << dev_type << std::endl;
EXPECT_GT(static_cast<int>(
paddle::platform::DeviceManager::GetDeviceCount(dev_type)),
0);
auto place = paddle::platform::PlaceHelper::CreatePlace(dev_type);
TestDeviceInterface(place);
TestTensorMutableData(place);
TestTensorShareDataWith(place);
TestTensorUtils(place);
}
}
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/fluid/platform/device/device_ext.h"
namespace paddle {
namespace platform {
namespace details {
template <typename T>
struct CustomDeviceStatusType {};
#define DEFINE_CUSTOM_DEVICE_STATUS_TYPE(type, success_value) \
template <> \
struct CustomDeviceStatusType<type> { \
using Type = type; \
static constexpr Type kSuccess = success_value; \
}
DEFINE_CUSTOM_DEVICE_STATUS_TYPE(C_Status, C_SUCCESS);
} // namespace details
inline std::string build_custom_device_error_msg(C_Status stat) {
std::ostringstream sout;
sout << " CustomDevice error, the error code is : " << stat << ". ";
return sout.str();
}
#define PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(COND) \
do { \
auto __cond__ = (COND); \
using __CUSTOM_DEVICE_STATUS_TYPE__ = decltype(__cond__); \
constexpr auto __success_type__ = \
::paddle::platform::details::CustomDeviceStatusType< \
__CUSTOM_DEVICE_STATUS_TYPE__>::kSuccess; \
if (UNLIKELY(__cond__ != __success_type__)) { \
auto __summary__ = ::paddle::platform::errors::External( \
::paddle::platform::build_custom_device_error_msg(__cond__)); \
__THROW_ERROR_INTERNAL__(__summary__); \
} \
} while (0)
} // namespace platform
} // namespace paddle
#endif // PADDLE_WITH_CUSTOM_DEVICE
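// A minimal sketch of how PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS is meant to
// wrap calls into a plugin's C interface; `dev_api` and `dev` stand for a
// filled C_DeviceInterface and a valid C_Device (hypothetical names). On a
// non-C_SUCCESS status the macro throws errors::External carrying the raw
// status code.
#include "paddle/fluid/platform/device/custom/enforce_custom.h"
#include "paddle/fluid/platform/device/device_ext.h"
void EnforceCustomSketch(C_DeviceInterface* dev_api, C_Device dev) {
  size_t total_mem = 0;
  size_t free_mem = 0;
  PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
      dev_api->device_memory_stats(dev, &total_mem, &free_mem));
}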
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/platform/device/device_ext.h"
constexpr size_t global_total_memory = 1024 * 1024UL;
static size_t global_free_memory = global_total_memory;
C_Status Init() { return C_SUCCESS; }
C_Status InitDevice(const C_Device device) { return C_SUCCESS; }
C_Status SetDevice(const C_Device device) { return C_SUCCESS; }
C_Status GetDevice(const C_Device device) {
device->id = 0;
return C_SUCCESS;
}
C_Status DestroyDevice(const C_Device device) { return C_SUCCESS; }
C_Status Finalize() { return C_SUCCESS; }
C_Status GetDevicesCount(size_t *count) {
*count = 1;
return C_SUCCESS;
}
C_Status GetDevicesList(size_t *device) {
*device = 0;
return C_SUCCESS;
}
C_Status MemCpy(const C_Device device, void *dst, const void *src,
size_t size) {
memcpy(dst, src, size);
return C_SUCCESS;
}
C_Status AsyncMemCpy(const C_Device device, C_Stream stream, void *dst,
const void *src, size_t size) {
memcpy(dst, src, size);
return C_SUCCESS;
}
C_Status Allocate(const C_Device device, void **ptr, size_t size) {
if (global_free_memory >= size) {
*ptr = malloc(size);
global_free_memory -= size;
return C_SUCCESS;
} else {
*ptr = nullptr;
return C_FAILED;
}
}
C_Status Deallocate(const C_Device device, void *ptr, size_t size) {
free(ptr);
global_free_memory += size;
return C_SUCCESS;
}
C_Status CreateStream(const C_Device device, C_Stream *stream) {
return C_SUCCESS;
}
C_Status DestroyStream(const C_Device device, C_Stream stream) {
return C_SUCCESS;
}
C_Status CreateEvent(const C_Device device, C_Event *event) {
return C_SUCCESS;
}
C_Status RecordEvent(const C_Device device, C_Stream stream, C_Event event) {
return C_SUCCESS;
}
C_Status DestroyEvent(const C_Device device, C_Event event) {
return C_SUCCESS;
}
C_Status SyncDevice(const C_Device device) { return C_SUCCESS; }
C_Status SyncStream(const C_Device device, C_Stream stream) {
return C_SUCCESS;
}
C_Status SyncEvent(const C_Device device, C_Event event) { return C_SUCCESS; }
C_Status StreamWaitEvent(const C_Device device, C_Stream stream,
C_Event event) {
return C_SUCCESS;
}
C_Status VisibleDevices(size_t *devices) { return C_SUCCESS; }
C_Status DeviceMemStats(const C_Device device, size_t *total_memory,
size_t *free_memory) {
*total_memory = global_total_memory;
*free_memory = global_free_memory;
return C_SUCCESS;
}
C_Status DeviceMinChunkSize(const C_Device device, size_t *size) {
*size = 4 * 1024;
return C_SUCCESS;
}
C_Status DeviceMaxChunkSize(const C_Device device, size_t *size) {
*size = 64 * 1024;
return C_SUCCESS;
}
C_Status DeviceMaxAllocSize(const C_Device device, size_t *size) {
*size = global_total_memory * 0.95;
return C_SUCCESS;
}
#define DEVICE_TYPE "FakeCPU"
#define SUB_DEVICE_TYPE "V100"
void InitFakeCPUDevice(CustomRuntimeParams *params) {
params->device_type = const_cast<char *>(DEVICE_TYPE);
params->sub_device_type = const_cast<char *>(SUB_DEVICE_TYPE);
params->version.major = PADDLE_CUSTOM_RUNTIME_MAJOR_VERSION;
params->version.minor = PADDLE_CUSTOM_RUNTIME_MINOR_VERSION;
params->version.patch = PADDLE_CUSTOM_RUNTIME_PATCH_VERSION;
memset(reinterpret_cast<void *>(params->interface), 0,
sizeof(C_DeviceInterface));
params->interface->initialize = Init;
params->interface->finalize = Finalize;
params->interface->init_device = InitDevice;
params->interface->set_device = SetDevice;
params->interface->get_device = GetDevice;
params->interface->deinit_device = DestroyDevice;
params->interface->create_stream = CreateStream;
params->interface->destroy_stream = DestroyStream;
params->interface->create_event = CreateEvent;
params->interface->destroy_event = DestroyEvent;
params->interface->record_event = RecordEvent;
params->interface->synchronize_device = SyncDevice;
params->interface->synchronize_stream = SyncStream;
params->interface->synchronize_event = SyncEvent;
params->interface->stream_wait_event = StreamWaitEvent;
params->interface->memory_copy_h2d = MemCpy;
params->interface->memory_copy_d2d = MemCpy;
params->interface->memory_copy_d2h = MemCpy;
params->interface->async_memory_copy_h2d = AsyncMemCpy;
params->interface->async_memory_copy_d2d = AsyncMemCpy;
params->interface->async_memory_copy_d2h = AsyncMemCpy;
params->interface->device_memory_allocate = Allocate;
params->interface->host_memory_allocate = Allocate;
params->interface->unified_memory_allocate = Allocate;
params->interface->device_memory_deallocate = Deallocate;
params->interface->host_memory_deallocate = Deallocate;
params->interface->unified_memory_deallocate = Deallocate;
params->interface->get_device_count = GetDevicesCount;
params->interface->get_device_list = GetDevicesList;
params->interface->device_memory_stats = DeviceMemStats;
params->interface->device_max_chunk_size = DeviceMaxChunkSize;
params->interface->device_min_chunk_size = DeviceMinChunkSize;
params->interface->device_max_alloc_size = DeviceMaxAllocSize;
}
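// A real plugin library would export the C entry point declared in
// device_ext.h rather than a helper like InitFakeCPUDevice; a minimal
// sketch reusing the fake device above (the test instead registers the
// runtime directly through LoadCustomRuntimeLib):
extern "C" void InitPlugin(CustomRuntimeParams *params) {
  // Core fills params->size; the plugin checks it before writing any field.
  if (params == nullptr || params->size < sizeof(CustomRuntimeParams)) {
    return;
  }
  InitFakeCPUDevice(params);
}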
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/device_base.h"
#include "gflags/gflags.h"
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb);
DECLARE_uint64(reallocate_gpu_memory_in_mb);
constexpr static float fraction_reserve_gpu_memory = 0.05f;
namespace paddle {
namespace platform {
#define INTERFACE_UNIMPLEMENT \
PADDLE_THROW(platform::errors::Unimplemented( \
"%s is not implemented on %s device.", __func__, Type()));
// info
size_t DeviceInterface::GetComputeCapability() {
VLOG(10) << Type() + " get compute capability " << 0;
return 0;
}
size_t DeviceInterface::GetRuntimeVersion() {
VLOG(10) << Type() + " get runtime version " << 0;
return 0;
}
size_t DeviceInterface::GetDriverVersion() {
VLOG(10) << Type() + " get driver version " << 0;
return 0;
}
// device manage
void DeviceInterface::Initialize() { INTERFACE_UNIMPLEMENT; }
void DeviceInterface::Finalize() { INTERFACE_UNIMPLEMENT; }
void DeviceInterface::SynchronizeDevice(size_t dev_id) {
INTERFACE_UNIMPLEMENT;
}
void DeviceInterface::InitDevice(size_t dev_id) { INTERFACE_UNIMPLEMENT; }
void DeviceInterface::DeInitDevice(size_t dev_id) { INTERFACE_UNIMPLEMENT; }
void DeviceInterface::SetDevice(size_t dev_id) { INTERFACE_UNIMPLEMENT; }
int DeviceInterface::GetDevice() { INTERFACE_UNIMPLEMENT; }
// stream manage
void DeviceInterface::CreateStream(size_t dev_id, stream::Stream* stream,
const stream::Stream::Priority& priority,
const stream::Stream::Flag& flag) {
INTERFACE_UNIMPLEMENT;
}
void DeviceInterface::DestroyStream(size_t dev_id, stream::Stream* stream) {
INTERFACE_UNIMPLEMENT;
}
void DeviceInterface::SynchronizeStream(size_t dev_id,
const stream::Stream* stream) {
INTERFACE_UNIMPLEMENT;
}
bool DeviceInterface::QueryStream(size_t dev_id, const stream::Stream* stream) {
INTERFACE_UNIMPLEMENT;
return true;
}
void DeviceInterface::AddCallback(size_t dev_id, stream::Stream* stream,
stream::Stream::Callback* callback) {
INTERFACE_UNIMPLEMENT;
}
void DeviceInterface::StreamWaitEvent(size_t dev_id,
const stream::Stream* stream,
const event::Event* event) {
INTERFACE_UNIMPLEMENT;
}
// event manage
void DeviceInterface::CreateEvent(size_t dev_id, event::Event* event,
event::Event::Flag flags) {
INTERFACE_UNIMPLEMENT;
}
void DeviceInterface::DestroyEvent(size_t dev_id, event::Event* event) {
INTERFACE_UNIMPLEMENT;
}
void DeviceInterface::RecordEvent(size_t dev_id, const event::Event* event,
const stream::Stream* stream) {
INTERFACE_UNIMPLEMENT;
}
void DeviceInterface::SynchronizeEvent(size_t dev_id,
const event::Event* event) {
INTERFACE_UNIMPLEMENT;
}
bool DeviceInterface::QueryEvent(size_t dev_id, const event::Event* event) {
INTERFACE_UNIMPLEMENT;
return true;
}
// memory manage
void DeviceInterface::MemoryCopyH2D(size_t dev_id, void* dst, const void* src,
size_t size, const stream::Stream* stream) {
INTERFACE_UNIMPLEMENT;
}
void DeviceInterface::MemoryCopyD2H(size_t dev_id, void* dst, const void* src,
size_t size, const stream::Stream* stream) {
INTERFACE_UNIMPLEMENT;
}
void DeviceInterface::MemoryCopyD2D(size_t dev_id, void* dst, const void* src,
size_t size, const stream::Stream* stream) {
INTERFACE_UNIMPLEMENT;
}
void DeviceInterface::MemoryCopyP2P(const Place& dst_place, void* dst,
size_t src_id, const void* src, size_t size,
const stream::Stream* stream) {
INTERFACE_UNIMPLEMENT;
}
void* DeviceInterface::MemoryAllocate(size_t dev_id, size_t size) {
INTERFACE_UNIMPLEMENT;
return nullptr;
}
void DeviceInterface::MemoryDeallocate(size_t dev_id, void* ptr, size_t size) {
INTERFACE_UNIMPLEMENT;
}
void* DeviceInterface::MemoryAllocateHost(size_t dev_id, size_t size) {
INTERFACE_UNIMPLEMENT;
return nullptr;
}
void DeviceInterface::MemoryDeallocateHost(size_t dev_id, void* ptr,
size_t size) {
INTERFACE_UNIMPLEMENT;
}
void* DeviceInterface::MemoryAllocateUnified(size_t dev_id, size_t size) {
INTERFACE_UNIMPLEMENT;
return nullptr;
}
void DeviceInterface::MemoryDeallocateUnified(size_t dev_id, void* ptr,
size_t size) {
INTERFACE_UNIMPLEMENT;
}
void DeviceInterface::MemorySet(size_t dev_id, void* ptr, uint8_t value,
size_t size) {
INTERFACE_UNIMPLEMENT;
}
void DeviceInterface::MemoryStats(size_t dev_id, size_t* total, size_t* free) {
INTERFACE_UNIMPLEMENT;
}
size_t DeviceInterface::GetMinChunkSize(size_t dev_id) {
INTERFACE_UNIMPLEMENT;
}
size_t DeviceInterface::AllocSize(size_t dev_id, bool realloc) {
size_t available_to_alloc = AvailableAllocSize(dev_id);
PADDLE_ENFORCE_GT(available_to_alloc, 0,
platform::errors::ResourceExhausted(
"Not enough available %s memory.", Type()));
  // If FLAGS_initial_gpu_memory_in_mb is 0, the initial memory size is
  // computed as a fraction of the available memory
size_t flag_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb
: FLAGS_initial_gpu_memory_in_mb;
size_t alloc_bytes =
(flag_mb > 0ul ? flag_mb << 20 : available_to_alloc *
FLAGS_fraction_of_gpu_memory_to_use);
PADDLE_ENFORCE_GE(available_to_alloc, alloc_bytes,
platform::errors::ResourceExhausted(
"Not enough available %s memory.", Type()));
return alloc_bytes;
}
size_t DeviceInterface::AvailableAllocSize(size_t dev_id) {
size_t total = 0;
size_t available = 0;
MemoryStats(dev_id, &total, &available);
size_t reserving =
static_cast<size_t>(fraction_reserve_gpu_memory * available);
// If available size is less than minimum chunk size, no usable memory exists
size_t available_to_alloc = available - reserving;
size_t min_chunk_size = GetMinChunkSize(dev_id);
if (available_to_alloc < min_chunk_size) {
available_to_alloc = 0;
}
return available_to_alloc;
}
size_t DeviceInterface::GetInitAllocSize(size_t dev_id) {
size_t init_alloc_size = AllocSize(dev_id, false);
VLOG(10) << Type() + " init alloc size " << (init_alloc_size >> 20) << "M";
return init_alloc_size;
}
size_t DeviceInterface::GetReallocSize(size_t dev_id) {
size_t realloc_size = AllocSize(dev_id, true);
VLOG(10) << Type() + " realloc size " << (realloc_size >> 20) << "M";
return realloc_size;
}
size_t DeviceInterface::GetMaxAllocSize(size_t dev_id) {
size_t max_alloc_size =
std::max(GetInitAllocSize(dev_id), GetReallocSize(dev_id));
VLOG(10) << Type() + " max alloc size " << (max_alloc_size >> 20) << "M";
return max_alloc_size;
}
size_t DeviceInterface::GetMaxChunkSize(size_t dev_id) {
size_t max_chunk_size = GetMaxAllocSize(dev_id);
VLOG(10) << Type() + " max chunk size " << (max_chunk_size >> 20) << "M";
return max_chunk_size;
}
size_t DeviceInterface::GetExtraPaddingSize(size_t dev_id) {
VLOG(10) << Type() + " extra padding size " << 0;
return 0;
}
} // namespace platform
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/fluid/platform/device/event.h"
#include "paddle/fluid/platform/device/stream.h"
namespace paddle {
namespace platform {
class DeviceInterface { // Driver / Runtime
public:
DeviceInterface(const std::string& type, uint8_t priority, bool is_custom)
: type_(type), priority_(priority), is_custom_(is_custom) {}
uint8_t Priority() { return priority_; }
std::string Type() { return type_; }
bool IsCustom() { return is_custom_; }
virtual ~DeviceInterface() {}
// Info
virtual size_t GetComputeCapability();
virtual size_t GetRuntimeVersion();
virtual size_t GetDriverVersion();
// Platform
//! Initialize
virtual void Initialize();
//! Finalize
virtual void Finalize();
// Device
virtual size_t GetDeviceCount() = 0;
virtual std::vector<size_t> GetDeviceList() = 0;
//! Wait for compute device to finish.
virtual void SynchronizeDevice(size_t dev_id);
//! Initialize device.
virtual void InitDevice(size_t dev_id);
//! Deinitialize device.
virtual void DeInitDevice(size_t dev_id);
// ! Set device to be used.
virtual void SetDevice(size_t dev_id);
// ! Returns which device is currently being used.
virtual int GetDevice();
// Stream
// ! Create an asynchronous stream
virtual void CreateStream(
size_t dev_id, stream::Stream* stream,
const stream::Stream::Priority& priority =
stream::Stream::Priority::kNormal,
const stream::Stream::Flag& flag = stream::Stream::Flag::kDefaultFlag);
// ! Destroys an asynchronous stream.
virtual void DestroyStream(size_t dev_id, stream::Stream* stream);
// ! Waits for stream tasks to complete.
virtual void SynchronizeStream(size_t dev_id, const stream::Stream* stream);
// ! Queries an asynchronous stream for completion status.
virtual bool QueryStream(size_t dev_id, const stream::Stream* stream);
// ! Add a callback to a compute stream.
virtual void AddCallback(size_t dev_id, stream::Stream* stream,
stream::Stream::Callback* callback);
// Event
// ! Create an event.
virtual void CreateEvent(size_t dev_id, event::Event* event,
event::Event::Flag flags);
// ! Destroy an event.
virtual void DestroyEvent(size_t dev_id, event::Event* event);
// ! Records an event.
virtual void RecordEvent(size_t dev_id, const event::Event* event,
const stream::Stream* stream);
// ! Waits for event to complete.
virtual void SynchronizeEvent(size_t dev_id, const event::Event* event);
// ! Queries an event for completion status.
virtual bool QueryEvent(size_t dev_id, const event::Event* event);
// ! Make a compute stream wait on an event
virtual void StreamWaitEvent(size_t dev_id, const stream::Stream* stream,
const event::Event* event);
// Memory
virtual void MemoryCopyH2D(size_t dev_id, void* dst, const void* src,
size_t size,
const stream::Stream* stream = nullptr);
virtual void MemoryCopyD2H(size_t dev_id, void* dst, const void* src,
size_t size,
const stream::Stream* stream = nullptr);
virtual void MemoryCopyD2D(size_t dev_id, void* dst, const void* src,
size_t size,
const stream::Stream* stream = nullptr);
virtual void MemoryCopyP2P(const Place& dst_place, void* dst, size_t src_id,
const void* src, size_t size,
const stream::Stream* stream = nullptr);
virtual void* MemoryAllocate(size_t dev_id, size_t size);
virtual void MemoryDeallocate(size_t dev_id, void* ptr, size_t size);
virtual void* MemoryAllocateHost(size_t dev_id, size_t size);
virtual void MemoryDeallocateHost(size_t dev_id, void* ptr, size_t size);
virtual void* MemoryAllocateUnified(size_t dev_id, size_t size);
virtual void MemoryDeallocateUnified(size_t dev_id, void* ptr, size_t size);
virtual void MemorySet(size_t dev_id, void* ptr, uint8_t value, size_t size);
virtual void MemoryStats(size_t dev_id, size_t* total, size_t* free);
virtual size_t GetMinChunkSize(size_t dev_id);
virtual size_t GetInitAllocSize(size_t dev_id);
virtual size_t GetReallocSize(size_t dev_id);
virtual size_t GetMaxAllocSize(size_t dev_id);
virtual size_t GetMaxChunkSize(size_t dev_id);
virtual size_t GetExtraPaddingSize(size_t dev_id);
private:
const std::string type_;
const uint8_t priority_;
const bool is_custom_;
size_t AllocSize(size_t dev_id, bool realloc);
size_t AvailableAllocSize(size_t dev_id);
};
} // namespace platform
} // namespace paddle
#endif
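// A minimal sketch of a DeviceInterface backend (names hypothetical): only
// GetDeviceCount and GetDeviceList are pure virtual, so a backend compiles
// once those are overridden; every other hook falls back to the
// Unimplemented-throwing defaults in device_base.cc.
#include "paddle/fluid/platform/device/device_base.h"
class DummyDeviceInterface : public paddle::platform::DeviceInterface {
 public:
  DummyDeviceInterface()
      : DeviceInterface("dummy", /*priority=*/255, /*is_custom=*/true) {}
  size_t GetDeviceCount() override { return 1; }
  std::vector<size_t> GetDeviceList() override { return {0}; }
};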
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#if !defined(_WIN32) && !defined(__APPLE__)
#include <cstddef>
#include <cstring>
#ifdef __cplusplus
extern "C" {
#endif
#define PADDLE_CUSTOM_RUNTIME_MAJOR_VERSION 0
#define PADDLE_CUSTOM_RUNTIME_MINOR_VERSION 1
#define PADDLE_CUSTOM_RUNTIME_PATCH_VERSION 1
typedef enum {
C_SUCCESS = 0, // success
C_WARNING, // results may not meet expectation (such as an asynchronous
// interface is actually synchronous)
C_FAILED, // resource exhausted/query failed
C_ERROR, // invalid argument/wrong usage/uninitialized
C_INTERNAL_ERROR // plugin error
} C_Status;
typedef struct C_Device_st { int id; } * C_Device;
typedef struct C_Stream_st* C_Stream;
typedef struct C_Event_st* C_Event;
typedef void (*C_Callback)(C_Device device, C_Stream stream, void* user_data,
C_Status* status);
struct C_DeviceInterface {
  // Core fills it and the plugin must check it
size_t size;
///////////////////////
// device manage api //
///////////////////////
/**
* @brief Initialize hardware
*
*/
C_Status (*initialize)();
/**
* @brief Deinitialize hardware
*
*/
C_Status (*finalize)();
/**
* @brief Initialize device
*
* @param[C_Device] device Core fill it with a logical id, and then plugin
* must replace it with a physical id
*/
C_Status (*init_device)(const C_Device device);
/**
* @brief Set current device
*
* @param[C_Device] device Core fill it with a physical id
*/
C_Status (*set_device)(const C_Device device);
/**
* @brief Get current device
*
* @param[C_Device] device Plugin fill it with a physical id
*/
C_Status (*get_device)(const C_Device device);
/**
* @brief Deinitialize device
*
* @param[C_Device] device Core fill it with a physical id
*/
C_Status (*deinit_device)(const C_Device device);
/**
* @brief Create a stream
*
* @param[C_Device] device Core fill it with a physical id
* @param[C_Stream*] stream Plugin create a stream and fill it
*/
C_Status (*create_stream)(const C_Device device, C_Stream* stream);
/**
* @brief Destroy a stream
*
* @param[C_Device] device Core fill it with a physical id
* @param[C_Stream] stream
*/
C_Status (*destroy_stream)(const C_Device device, C_Stream stream);
/**
* @brief Query a stream
*
* @param[C_Device] device Core fill it with a physical id
* @param[C_Stream] stream
*/
C_Status (*query_stream)(const C_Device device, C_Stream stream);
/**
* @brief Add a callback to stream
*
* @param[C_Device] device Core fill it with a physical id
* @param[C_Stream] stream
* @param[C_Callback] callback
* @param[void*] user_data
*/
C_Status (*stream_add_callback)(const C_Device device, C_Stream stream,
C_Callback callback, void* user_data);
/**
* @brief Create an event
*
* @param[C_Device] device Core fill it with a physical id
* @param[C_Event*] event Plugin create an event and fill it
*/
C_Status (*create_event)(const C_Device device, C_Event* event);
/**
* @brief Record an event
*
* @param[C_Device] device Core fill it with a physical id
* @param[C_Stream] stream
* @param[C_Event] event
*/
C_Status (*record_event)(const C_Device device, C_Stream stream,
C_Event event);
/**
* @brief Destroy an event
*
* @param[C_Device] device Core fill it with a physical id
* @param[C_Event] event
*/
C_Status (*destroy_event)(const C_Device device, C_Event event);
/**
* @brief Query an event
*
* @param[C_Device] device Core fill it with a physical id
* @param[C_Event] event
*/
C_Status (*query_event)(const C_Device device, C_Event event);
/**
* @brief Synchronize a device
*
* @param[C_Device] device Core fill it with a physical id
*/
C_Status (*synchronize_device)(const C_Device device);
/**
* @brief Synchronize a stream
*
* @param[C_Device] device Core fill it with a physical id
* @param[C_Stream] stream
*/
C_Status (*synchronize_stream)(const C_Device device, C_Stream stream);
/**
* @brief Synchronize an event
*
* @param[C_Device] device Core fill it with a physical id
* @param[C_Event] event
*/
C_Status (*synchronize_event)(const C_Device device, C_Event event);
/**
* @brief Make a stream wait on an event
*
* @param[C_Device] device Core fill it with a physical id
* @param[C_Stream] stream
* @param[C_Event] event
*/
C_Status (*stream_wait_event)(const C_Device device, C_Stream stream,
C_Event event);
void* reserved_dev_api[8];
///////////////////////
// memory manage api //
///////////////////////
/**
* @brief Device memory allocate
*
* @param[C_Device] device Core fill it with a physical id
* @param[void**] ptr Plugin allocate an address and fill it
* @param[size_t] size
*/
C_Status (*device_memory_allocate)(const C_Device device, void** ptr,
size_t size);
/**
* @brief Device memory deallocate
*
* @param[C_Device] device Core fill it with a physical id
* @param[void*] ptr
* @param[size_t] size
*/
C_Status (*device_memory_deallocate)(const C_Device device, void* ptr,
size_t size);
/**
* @brief Device memory set
*
* @param[C_Device] device Core fill it with a physical id
* @param[void*] ptr
* @param[unsigned char] value
* @param[size_t] size
*/
C_Status (*device_memory_set)(const C_Device device, void* ptr,
unsigned char value, size_t size);
/**
* @brief Host memory allocate
*
* @param[C_Device] device Core fill it with a physical id
* @param[void**] ptr Plugin allocate an address and fill it
* @param[size_t] size
*/
C_Status (*host_memory_allocate)(const C_Device device, void** ptr,
size_t size);
/**
* @brief Host memory deallocate
*
* @param[C_Device] device Core fill it with a physical id
* @param[void*] ptr
* @param[size_t] size
*/
C_Status (*host_memory_deallocate)(const C_Device device, void* ptr,
size_t size);
/**
* @brief Unified memory allocate
*
* @param[C_Device] device Core fill it with a physical id
* @param[void**] ptr Plugin allocate an address and fill it
* @param[size_t] size
*/
C_Status (*unified_memory_allocate)(const C_Device device, void** ptr,
size_t size);
/**
* @brief Unified memory deallocate
*
* @param[C_Device] device Core fill it with a physical id
* @param[void*] ptr
* @param[size_t] size
*/
C_Status (*unified_memory_deallocate)(const C_Device device, void* ptr,
size_t size);
/**
* @brief Memory copy from host to device
*
* @param[C_Device] device Core fill it with a physical id
* @param[void*] dst
* @param[void*] src
* @param[size_t] size
*/
C_Status (*memory_copy_h2d)(const C_Device device, void* dst, const void* src,
size_t size);
/**
* @brief Memory copy from device to host
*
* @param[C_Device] device Core fill it with a physical id
* @param[void*] dst
* @param[void*] src
* @param[size_t] size
*/
C_Status (*memory_copy_d2h)(const C_Device device, void* dst, const void* src,
size_t size);
/**
* @brief Memory copy from device to device
*
* @param[C_Device] device Core fill it with a physical id
* @param[void*] dst
* @param[void*] src
* @param[size_t] size
*/
C_Status (*memory_copy_d2d)(const C_Device device, void* dst, const void* src,
size_t size);
/**
* @brief Peer memory copy from device to device
*
* @param[C_Device] dst_device Core fill it with a physical id
* @param[C_Device] src_device Core fill it with a physical id
* @param[void*] dst
* @param[void*] src
* @param[size_t] size
*/
C_Status (*memory_copy_p2p)(const C_Device dst_device,
const C_Device src_device, void* dst,
const void* src, size_t size);
/**
* @brief Asynchronous memory copy from host to device
*
* @param[C_Device] device Core fill it with a physical id
* @param[C_Stream] stream
* @param[void*] dst
* @param[void*] src
* @param[size_t] size
*/
C_Status (*async_memory_copy_h2d)(const C_Device device, C_Stream stream,
void* dst, const void* src, size_t size);
/**
* @brief Asynchronous memory copy from device to host
*
* @param[C_Device] device Core fill it with a physical id
* @param[C_Stream] stream
* @param[void*] dst
* @param[void*] src
* @param[size_t] size
*/
C_Status (*async_memory_copy_d2h)(const C_Device device, C_Stream stream,
void* dst, const void* src, size_t size);
/**
* @brief Asynchronous memory copy from device to device
*
* @param[C_Device] device Core fill it with a physical id
* @param[C_Stream] stream
* @param[void*] dst
* @param[void*] src
* @param[size_t] size
*/
C_Status (*async_memory_copy_d2d)(const C_Device device, C_Stream stream,
void* dst, const void* src, size_t size);
/**
* @brief Asynchronous peer memory copy from device to device
*
* @param[C_Device] dst_device Core fill it with a physical id
* @param[C_Device] src_device Core fill it with a physical id
* @param[C_Stream] stream
* @param[void*] dst
* @param[void*] src
* @param[size_t] size
*/
C_Status (*async_memory_copy_p2p)(const C_Device dst_device,
const C_Device src_device, C_Stream stream,
void* dst, const void* src, size_t size);
void* reserved_mem_api[8];
//////////////
// info api //
//////////////
/**
* @brief Get visible device count
*
* @param[size_t*] count Plugin fill it
*/
C_Status (*get_device_count)(size_t* count);
/**
* @brief Get visible device list
*
* @param[size_t*] devices Plugin fill it
*/
C_Status (*get_device_list)(size_t* devices);
/**
* @brief Device memory statistics
*
* @param[C_Device] device Core fill it with a physical id
* @param[size_t*] total_memory
* @param[size_t*] free_memory
*/
C_Status (*device_memory_stats)(const C_Device device, size_t* total_memory,
size_t* free_memory);
/**
* @brief Device minimum chunk size
*
* @param[size_t*] count
*/
C_Status (*device_min_chunk_size)(const C_Device device, size_t* count);
/**
* @brief Device maximum chunk size
*
* @param[size_t*] count
*/
C_Status (*device_max_chunk_size)(const C_Device device, size_t* count);
/**
* @brief Device maximum alloc size
*
* @param[size_t*] count
*/
C_Status (*device_max_alloc_size)(const C_Device device, size_t* count);
/**
* @brief Device extra padding size
*
* @param[size_t*] size
*/
C_Status (*device_extra_padding_size)(const C_Device device, size_t* size);
/**
* @brief Device initial allocated size
*
* @param[size_t*] size
*/
C_Status (*device_init_alloc_size)(const C_Device device, size_t* size);
/**
* @brief Device reallocated size
*
* @param[size_t*] size
*/
C_Status (*device_realloc_size)(const C_Device device, size_t* size);
/**
* @brief Get compute capability
*
* @param[size_t*] compute_capability
*/
C_Status (*get_compute_capability)(size_t* compute_capability);
/**
* @brief Get runtime version
*
* @param[size_t*] version
*/
C_Status (*get_runtime_version)(size_t* version);
/**
* @brief Get driver version
*
* @param[size_t*] version
*/
C_Status (*get_driver_version)(size_t* version);
void* reserved_info_api[8];
///////////////
// other api //
///////////////
void* reserved_other_api[8];
};
struct CustomRuntimeVersion {
size_t major, minor, patch;
};
struct CustomRuntimeParams {
  // Core fills it and the plugin must check it
  size_t size;
  // Plugin fills it
  C_DeviceInterface* interface;
  // Plugin fills it and Core will check it
  CustomRuntimeVersion version;
  // Plugin fills it
  char* device_type;
  // Plugin fills it
  char* sub_device_type;
char reserved[32];
};
// The plugin implements it and fills CustomRuntimeParams
void InitPlugin(CustomRuntimeParams*);
#ifdef __cplusplus
} // extern "C"
#endif
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/device_guard.h"
namespace paddle {
namespace platform {
// Even though this source file does not contain any code, it is better to
// keep it for the cmake dependency.
} // namespace platform
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/platform/device/device_manager.h"
namespace paddle {
namespace platform {
class DeviceGuard {
public:
explicit inline DeviceGuard(const Place& place)
: dev_type_(PlaceHelper::GetDeviceType(place)) {
prev_id = DeviceManager::GetDevice(dev_type_);
cur_id = PlaceHelper::GetDeviceId(place);
if (cur_id != prev_id) {
DeviceManager::SetDevice(dev_type_, cur_id);
}
}
inline ~DeviceGuard() {
if (cur_id != prev_id) {
DeviceManager::SetDevice(dev_type_, prev_id);
}
}
DeviceGuard(const DeviceGuard& o) = delete;
DeviceGuard& operator=(const DeviceGuard& o) = delete;
private:
size_t prev_id, cur_id;
std::string dev_type_;
};
} // namespace platform
} // namespace paddle
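// A minimal usage sketch: DeviceGuard is an RAII scope switch. It reads the
// current device id of `place`'s device type, switches to `place`'s id if
// it differs, and restores the previous id on destruction (`place` is
// assumed to be a registered CustomPlace).
#include "paddle/fluid/platform/device/device_guard.h"
void DeviceGuardSketch(const paddle::platform::Place& place) {
  paddle::platform::DeviceGuard guard(place);
  // ... launch work that must run with `place` as the active device ...
}  // previous device id is restored here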
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/fluid/platform/device/device_manager.h"
#if !defined(_WIN32)
#include <dirent.h>
#else
#endif
#include <functional>
#include <regex>
namespace paddle {
namespace platform {
void Device::CreateStream(stream::Stream* stream,
const stream::Stream::Priority& priority,
const stream::Stream::Flag& flag) {
impl_->CreateStream(dev_id_, stream, priority, flag);
}
void Device::DestroyStream(stream::Stream* stream) {
impl_->DestroyStream(dev_id_, stream);
}
void Device::SynchronizeStream(const stream::Stream* stream) {
impl_->SynchronizeStream(dev_id_, stream);
}
bool Device::QueryStream(const stream::Stream* stream) {
return impl_->QueryStream(dev_id_, stream);
}
void Device::AddCallback(stream::Stream* stream,
stream::Stream::Callback* callback) {
impl_->AddCallback(dev_id_, stream, callback);
}
void Device::CreateEvent(event::Event* event, event::Event::Flag flags) {
impl_->CreateEvent(dev_id_, event, flags);
}
void Device::DestroyEvent(event::Event* event) {
impl_->DestroyEvent(dev_id_, event);
}
void Device::RecordEvent(const event::Event* event,
const stream::Stream* stream) {
impl_->RecordEvent(dev_id_, event, stream);
}
void Device::SynchronizeEvent(const event::Event* event) {
impl_->SynchronizeEvent(dev_id_, event);
}
bool Device::QueryEvent(const event::Event* event) {
return impl_->QueryEvent(dev_id_, event);
}
void Device::StreamWaitEvent(const stream::Stream* stream,
const event::Event* event) {
impl_->StreamWaitEvent(dev_id_, stream, event);
}
void Device::MemoryCopyH2D(void* dst, const void* src, size_t size,
const stream::Stream* stream) {
impl_->MemoryCopyH2D(dev_id_, dst, src, size, stream);
}
void Device::MemoryCopyD2H(void* dst, const void* src, size_t size,
const stream::Stream* stream) {
impl_->MemoryCopyD2H(dev_id_, dst, src, size, stream);
}
void Device::MemoryCopyD2D(void* dst, const void* src, size_t size,
const stream::Stream* stream) {
impl_->MemoryCopyD2D(dev_id_, dst, src, size, stream);
}
void Device::MemoryCopyP2P(const Place& dst_place, void* dst, const void* src,
size_t size, const stream::Stream* stream) {
impl_->MemoryCopyP2P(dst_place, dst, dev_id_, src, size, stream);
}
void* Device::MemoryAllocate(size_t size) {
return impl_->MemoryAllocate(dev_id_, size);
}
void Device::MemoryDeallocate(void* ptr, size_t size) {
impl_->MemoryDeallocate(dev_id_, ptr, size);
}
void* Device::MemoryAllocateHost(size_t size) {
return impl_->MemoryAllocateHost(dev_id_, size);
}
void Device::MemoryDeallocateHost(void* ptr, size_t size) {
impl_->MemoryDeallocateHost(dev_id_, ptr, size);
}
void* Device::MemoryAllocateUnified(size_t size) {
return impl_->MemoryAllocateUnified(dev_id_, size);
}
void Device::MemoryDeallocateUnified(void* ptr, size_t size) {
impl_->MemoryDeallocateUnified(dev_id_, ptr, size);
}
void Device::MemorySet(void* ptr, uint8_t value, size_t size) {
impl_->MemorySet(dev_id_, ptr, value, size);
}
std::string Device::Type() { return impl_->Type(); }
static pten::RWLock _global_device_manager_rw_lock;
bool DeviceManager::Register(std::unique_ptr<DeviceInterface> device_impl) {
pten::AutoWRLock lock(&_global_device_manager_rw_lock);
VLOG(4) << "Register Device - " << device_impl->Type();
auto device_type = device_impl->Type();
auto& dev_impl_map = Instance().device_impl_map_;
auto& dev_map = Instance().device_map_;
if (dev_impl_map.find(device_type) == dev_impl_map.end()) {
dev_impl_map.insert(
std::pair<std::string, std::unique_ptr<DeviceInterface>>(
device_type, std::move(device_impl)));
auto& dev_impl = dev_impl_map[device_type];
auto& dev_vec = dev_map[device_type];
VLOG(4) << "GetDeviceCount is " << dev_impl->GetDeviceCount();
for (size_t i = 0; i < dev_impl->GetDeviceCount(); ++i) {
dev_vec.emplace_back(new Device(i, dev_impl.get()));
}
} else {
auto& plat = dev_impl_map[device_type];
if (plat->IsCustom() && plat->Priority() > device_impl->Priority()) {
dev_impl_map[device_type] = std::move(device_impl);
auto& dev_impl = dev_impl_map[device_type];
auto& dev_vec = dev_map[device_type];
dev_vec.clear();
VLOG(4) << "GetDeviceCount is " << dev_impl->GetDeviceCount();
for (size_t i = 0; i < dev_impl->GetDeviceCount(); ++i) {
dev_vec.emplace_back(new Device(i, dev_impl.get()));
}
} else {
return false;
}
}
return true;
}
DeviceInterface* DeviceManager::GetDeviceInterfaceWithType(
const std::string& device_type) {
pten::AutoRDLock lock(&_global_device_manager_rw_lock);
auto& dev_impl_map = Instance().device_impl_map_;
if (dev_impl_map.find(device_type) != dev_impl_map.end()) {
return dev_impl_map.at(device_type).get();
} else {
LOG(ERROR) << "GetDeviceInterfaceWithType - " << device_type << " Failed\n";
PADDLE_THROW(
platform::errors::Fatal("Unregistered device type %s.", device_type));
return nullptr;
}
}
Device* DeviceManager::GetDeviceWithPlace(const Place& place) {
pten::AutoRDLock lock(&_global_device_manager_rw_lock);
auto& dev_map = Instance().device_map_;
auto dev_type = PlaceHelper::GetDeviceType(place);
auto dev_id = PlaceHelper::GetDeviceId(place);
PADDLE_ENFORCE_NE(dev_map.find(dev_type), dev_map.end(),
platform::errors::NotFound(
"Unable to find Device with type %s.", dev_type));
auto& dev_vec = dev_map[dev_type];
PADDLE_ENFORCE_LT(
dev_id, dev_vec.size(),
platform::errors::OutOfRange(
"The visible devices count of type %s is %d, but dev_id is %d.",
dev_type, dev_vec.size(), dev_id));
return dev_vec[dev_id].get();
}
std::vector<std::string> DeviceManager::GetAllDeviceTypes() {
pten::AutoRDLock lock(&_global_device_manager_rw_lock);
auto& dev_impl_map = Instance().device_impl_map_;
std::vector<std::string> devices;
for (auto iter = dev_impl_map.cbegin(); iter != dev_impl_map.cend(); ++iter) {
devices.push_back(iter->first);
}
return devices;
}
std::vector<std::string> DeviceManager::GetAllCustomDeviceTypes() {
pten::AutoRDLock lock(&_global_device_manager_rw_lock);
auto& dev_impl_map = Instance().device_impl_map_;
std::vector<std::string> devices;
for (auto iter = dev_impl_map.cbegin(); iter != dev_impl_map.cend(); ++iter) {
if (iter->second->IsCustom()) {
devices.push_back(iter->first);
}
}
return devices;
}
std::vector<std::string> DeviceManager::GetAllDeviceList() {
pten::AutoRDLock lock(&_global_device_manager_rw_lock);
auto& dev_impl_map = Instance().device_impl_map_;
std::vector<std::string> devices;
for (auto iter = dev_impl_map.cbegin(); iter != dev_impl_map.cend(); ++iter) {
size_t device_count = iter->second->GetDeviceCount();
std::string dev_type = iter->second->Type();
if (device_count == 1) {
devices.push_back(dev_type);
} else {
for (size_t i = 0; i < device_count; ++i) {
devices.push_back(dev_type + ":" + std::to_string(i));
}
}
}
return devices;
}
std::vector<std::string> DeviceManager::GetAllCustomDeviceList() {
pten::AutoRDLock lock(&_global_device_manager_rw_lock);
auto& dev_impl_map = Instance().device_impl_map_;
std::vector<std::string> devices;
for (auto iter = dev_impl_map.cbegin(); iter != dev_impl_map.cend(); ++iter) {
size_t device_count = iter->second->GetDeviceCount();
std::string dev_type = iter->second->Type();
if (iter->second->IsCustom()) {
if (device_count == 1) {
devices.push_back(dev_type);
} else {
for (size_t i = 0; i < device_count; ++i) {
devices.push_back(dev_type + ":" + std::to_string(i));
}
}
}
}
return devices;
}
bool DeviceManager::HasDeviceType(const std::string& device_type) {
auto dev_impl = GetDeviceInterfaceWithType(device_type);
return dev_impl != nullptr;
}
bool DeviceManager::IsCustom(const std::string& device_type) {
auto dev_impl = GetDeviceInterfaceWithType(device_type);
return dev_impl->IsCustom();
}
void DeviceManager::Initialize(const std::string& device_type) {
auto dev_impl = GetDeviceInterfaceWithType(device_type);
dev_impl->Initialize();
}
void DeviceManager::Finalize(const std::string& device_type) {
auto dev_impl = GetDeviceInterfaceWithType(device_type);
dev_impl->Finalize();
}
void DeviceManager::SynchronizeDevice(const Place& place) {
auto device_type = PlaceHelper::GetDeviceType(place);
auto device_id = PlaceHelper::GetDeviceId(place);
auto dev_impl = GetDeviceInterfaceWithType(device_type);
dev_impl->SynchronizeDevice(device_id);
}
void DeviceManager::InitDevice(const Place& place) {
auto device_type = PlaceHelper::GetDeviceType(place);
auto device_id = PlaceHelper::GetDeviceId(place);
auto dev_impl = GetDeviceInterfaceWithType(device_type);
dev_impl->InitDevice(device_id);
}
void DeviceManager::DeInitDevice(const Place& place) {
auto device_type = PlaceHelper::GetDeviceType(place);
auto device_id = PlaceHelper::GetDeviceId(place);
auto dev_impl = GetDeviceInterfaceWithType(device_type);
dev_impl->DeInitDevice(device_id);
}
void DeviceManager::SetDevice(const std::string& device_type,
size_t device_id) {
auto dev_impl = GetDeviceInterfaceWithType(device_type);
dev_impl->SetDevice(device_id);
}
void DeviceManager::SetDevice(const Place& place) {
auto device_type = PlaceHelper::GetDeviceType(place);
auto device_id = PlaceHelper::GetDeviceId(place);
DeviceManager::SetDevice(device_type, device_id);
}
int DeviceManager::GetDevice(const std::string& device_type) {
auto dev_impl = GetDeviceInterfaceWithType(device_type);
return dev_impl->GetDevice();
}
size_t DeviceManager::GetMinChunkSize(const Place& place) {
auto device_type = PlaceHelper::GetDeviceType(place);
auto device_id = PlaceHelper::GetDeviceId(place);
auto dev_impl = GetDeviceInterfaceWithType(device_type);
return dev_impl->GetMinChunkSize(device_id);
}
size_t DeviceManager::GetMaxChunkSize(const Place& place) {
auto device_type = PlaceHelper::GetDeviceType(place);
auto device_id = PlaceHelper::GetDeviceId(place);
auto dev_impl = GetDeviceInterfaceWithType(device_type);
return dev_impl->GetMaxChunkSize(device_id);
}
size_t DeviceManager::GetMaxAllocSize(const Place& place) {
auto device_type = PlaceHelper::GetDeviceType(place);
auto device_id = PlaceHelper::GetDeviceId(place);
auto dev_impl = GetDeviceInterfaceWithType(device_type);
return dev_impl->GetMaxAllocSize(device_id);
}
size_t DeviceManager::GetInitAllocSize(const Place& place) {
auto device_type = PlaceHelper::GetDeviceType(place);
auto device_id = PlaceHelper::GetDeviceId(place);
auto dev_impl = GetDeviceInterfaceWithType(device_type);
return dev_impl->GetInitAllocSize(device_id);
}
size_t DeviceManager::GetReallocSize(const Place& place) {
auto device_type = PlaceHelper::GetDeviceType(place);
auto device_id = PlaceHelper::GetDeviceId(place);
auto dev_impl = GetDeviceInterfaceWithType(device_type);
return dev_impl->GetReallocSize(device_id);
}
size_t DeviceManager::GetExtraPaddingSize(const Place& place) {
auto device_type = PlaceHelper::GetDeviceType(place);
auto device_id = PlaceHelper::GetDeviceId(place);
auto dev_impl = GetDeviceInterfaceWithType(device_type);
return dev_impl->GetExtraPaddingSize(device_id);
}
void DeviceManager::MemoryStats(const Place& place, size_t* total,
size_t* free) {
auto device_type = PlaceHelper::GetDeviceType(place);
auto device_id = PlaceHelper::GetDeviceId(place);
auto dev_impl = GetDeviceInterfaceWithType(device_type);
dev_impl->MemoryStats(device_id, total, free);
}
size_t DeviceManager::GetDeviceCount(const std::string& device_type) {
auto dev_impl = GetDeviceInterfaceWithType(device_type);
return dev_impl->GetDeviceCount();
}
std::vector<size_t> DeviceManager::GetDeviceList(
const std::string& device_type) {
auto dev_impl = GetDeviceInterfaceWithType(device_type);
return dev_impl->GetDeviceList();
}
DeviceManager& DeviceManager::Instance() {
static DeviceManager platform_manager;
return platform_manager;
}
std::vector<std::string> ListAllLibraries(const std::string& library_dir) {
std::vector<std::string> libraries;
std::regex express(".*\\.so");
std::match_results<std::string::iterator> results;
DIR* dir = nullptr;
dirent* ptr = nullptr;
dir = opendir(library_dir.c_str());
if (dir == nullptr) {
VLOG(4) << "open CustomDevice library_dir: " << library_dir << " failed";
} else {
while ((ptr = readdir(dir)) != nullptr) {
std::string filename(ptr->d_name);
if (std::regex_match(filename.begin(), filename.end(), results,
express)) {
libraries.push_back(library_dir + '/' + filename);
VLOG(4) << "found CustomDevice library: " << libraries.back()
<< std::endl;
}
}
closedir(dir);
}
return libraries;
}
bool LoadCustomDevice(const std::string& library_dir) {
std::vector<std::string> libs = ListAllLibraries(library_dir);
for (const auto& lib_path : libs) {
auto dso_handle = dlopen(lib_path.c_str(), RTLD_NOW);
LoadCustomRuntimeLib(dso_handle);
}
return true;
}
} // namespace platform
} // namespace paddle
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/fluid/platform/device/device_base.h"
#include "paddle/fluid/platform/device/device_ext.h"
#include "paddle/fluid/platform/device/event.h"
#include "paddle/fluid/platform/device/stream.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/pten/backends/dynload/port.h"
#include "paddle/pten/core/utils/rw_lock.h"
namespace paddle {
namespace platform {
class Device final {
public:
Device(size_t dev_id, DeviceInterface* impl) : dev_id_(dev_id), impl_(impl) {}
// Stream
// ! Create an asynchronous stream
void CreateStream(
stream::Stream* stream, const stream::Stream::Priority& priority =
stream::Stream::Priority::kNormal,
const stream::Stream::Flag& flag = stream::Stream::Flag::kDefaultFlag);
// ! Destroys an asynchronous stream.
void DestroyStream(stream::Stream* stream);
// ! Waits for stream tasks to complete.
void SynchronizeStream(const stream::Stream* stream);
// ! Queries an asynchronous stream for completion status.
bool QueryStream(const stream::Stream* stream);
// ! Add a callback to a compute stream.
void AddCallback(stream::Stream* stream, stream::Stream::Callback* callback);
// Event
// ! Create an event.
void CreateEvent(event::Event* event, event::Event::Flag flags);
// ! Destroy an event.
void DestroyEvent(event::Event* event);
// ! Records an event.
void RecordEvent(const event::Event* event, const stream::Stream* stream);
// ! Waits for event to complete.
void SynchronizeEvent(const event::Event* event);
// ! Queries an event for completion status.
bool QueryEvent(const event::Event* event);
// ! Make a compute stream wait on an event
void StreamWaitEvent(const stream::Stream* stream, const event::Event* event);
// Memory
void MemoryCopyH2D(void* dst, const void* src, size_t size,
const stream::Stream* stream = nullptr);
void MemoryCopyD2H(void* dst, const void* src, size_t size,
const stream::Stream* stream = nullptr);
void MemoryCopyD2D(void* dst, const void* src, size_t size,
const stream::Stream* stream = nullptr);
void MemoryCopyP2P(const Place& dst_place, void* dst, const void* src,
size_t size, const stream::Stream* stream = nullptr);
void* MemoryAllocate(size_t size);
void MemoryDeallocate(void* ptr, size_t size);
void* MemoryAllocateHost(size_t size);
void MemoryDeallocateHost(void* ptr, size_t size);
void* MemoryAllocateUnified(size_t size);
void MemoryDeallocateUnified(void* ptr, size_t size);
void MemorySet(void* ptr, uint8_t value, size_t size);
std::string Type();
private:
size_t dev_id_;
DeviceInterface* impl_;
};
class DeviceManager {
public:
static bool Register(std::unique_ptr<DeviceInterface> device);
static bool RegisterPinnedDevice(DeviceInterface* device);
static Device* GetDeviceWithPlace(const Place& place);
static std::vector<std::string> GetAllDeviceTypes();
static std::vector<std::string> GetAllCustomDeviceTypes();
static std::vector<std::string> GetAllDeviceList();
static std::vector<std::string> GetAllCustomDeviceList();
static bool HasDeviceType(const std::string& device_type);
static bool IsCustom(const std::string& device_type);
// platform & device
static void Initialize(const std::string& device_type);
static void Finalize(const std::string& device_type);
static void SynchronizeDevice(const Place& place);
static void InitDevice(const Place& place);
static void DeInitDevice(const Place& place);
static void SetDevice(const std::string& device_type, size_t device_id);
static void SetDevice(const Place& place);
static int GetDevice(const std::string& device_type);
static size_t GetMinChunkSize(const Place& place);
static size_t GetMaxChunkSize(const Place& place);
static size_t GetMaxAllocSize(const Place& place);
static size_t GetInitAllocSize(const Place& place);
static size_t GetReallocSize(const Place& place);
static size_t GetExtraPaddingSize(const Place& place);
static void MemoryStats(const Place& place, size_t* total, size_t* free);
static size_t GetDeviceCount(const std::string& device_type);
static std::vector<size_t> GetDeviceList(const std::string& device_type);
private:
DISABLE_COPY_AND_ASSIGN(DeviceManager);
DeviceManager() {}
static DeviceManager& Instance();
static DeviceInterface* GetDeviceInterfaceWithType(
const std::string& device_type);
std::unordered_map<std::string, std::unique_ptr<DeviceInterface>>
device_impl_map_;
std::unordered_map<std::string, std::vector<std::unique_ptr<Device>>>
device_map_;
};
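// A hedged sketch of how a caller might enumerate plug-in devices through
// DeviceManager (the iteration pattern is an assumption for illustration;
// only the static methods themselves are declared above):
//
//   for (const auto& dev_type : DeviceManager::GetAllCustomDeviceTypes()) {
//     size_t n = DeviceManager::GetDeviceCount(dev_type);
//     for (size_t i = 0; i < n; ++i) {
//       DeviceManager::SetDevice(dev_type, i);
//       // ... dispatch work to device i of dev_type ...
//     }
//   }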
bool LoadCustomRuntimeLib(void* dso_handle);
bool LoadCustomRuntimeLib(const CustomRuntimeParams& runtime_params,
std::unique_ptr<C_DeviceInterface> device_interface,
void* dso_handle);
bool LoadCustomDevice(const std::string& library_path);
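// A sketch of the expected call pattern (the path below is a placeholder,
// and whether `library_path` names a single shared object or a directory to
// scan is not specified by the declaration above):
//
//   if (!LoadCustomDevice("/path/to/custom_runtime_libs")) {
//     // no custom runtime was registered; custom places stay unavailable
//   }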
class Registrar {
public:
template <typename DeviceT>
explicit Registrar(DeviceT* device_ptr) {
DeviceManager::Register(std::unique_ptr<DeviceT>(device_ptr));
}
void Touch() {}
};
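// Registrar lets a plug-in register its DeviceInterface during static
// initialization. A hedged sketch (MyDeviceInterface and its constructor
// arguments are hypothetical):
//
//   static Registrar my_registrar(new MyDeviceInterface(/* ... */));
//   // Touch() gives other translation units a symbol to call so the
//   // registrar object is not stripped by the linker (a common registry
//   // idiom; presumed to be its purpose here).
//   void TouchMyDeviceRegistrar() { my_registrar.Touch(); }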
} // namespace platform
} // namespace paddle
#endif
......@@ -38,3 +38,12 @@ limitations under the License. */
#ifdef PADDLE_WITH_IPU
#include "paddle/fluid/platform/device/ipu/ipu_info.h"
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/fluid/platform/device/callback_manager.h"
#include "paddle/fluid/platform/device/custom/enforce_custom.h"
#include "paddle/fluid/platform/device/device_guard.h"
#include "paddle/fluid/platform/device/device_manager.h"
#include "paddle/fluid/platform/device/event.h"
#include "paddle/fluid/platform/device/stream.h"
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/event.h"
#include "paddle/fluid/platform/device/device_guard.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device/stream.h"
namespace paddle {
namespace platform {
namespace event {
event_t Event::raw_event() const { return event_; }
void Event::set_event(event_t event) { event_ = event; }
Event::Event(const Place& place, event_t event)
: place_(place),
device_(platform::DeviceManager::GetDeviceWithPlace(place)),
event_(event),
own_data_(false) {}
Event::~Event() { Destroy(); }
bool Event::Init(const Place& place, Flag flags) {
place_ = place;
DeviceGuard guard(place_);
device_->CreateEvent(this, flags);
VLOG(3) << "Init Event: " << event_ << ", place: " << place_
<< ", flag:" << static_cast<int>(flags);
own_data_ = true;
return true;
}
void Event::Destroy() {
if (own_data_) {
DeviceGuard guard(place_);
device_->DestroyEvent(this);
own_data_ = false;
}
}
void Event::Record(const stream::Stream* stream) { stream->RecordEvent(this); }
bool Event::Query() const { return device_->QueryEvent(this); }
void Event::Synchronize() const { device_->SynchronizeEvent(this); }
const Place& Event::GetPlace() const { return place_; }
} // namespace event
} // namespace platform
} // namespace paddle
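// A usage sketch for the Event API defined above (illustrative; it assumes
// `place` refers to an initialized custom device, `stream` was created on
// that place, and `flags` is a valid event::Event::Flag value, whose
// enumerators are not shown in this diff):
//
//   event::Event event(place, /*event=*/nullptr);
//   event.Init(place, flags);      // creates the underlying event handle
//   event.Record(stream);          // capture the stream's current progress
//   if (!event.Query()) {
//     event.Synchronize();         // block until the recorded work finishes
//   }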
......@@ -433,8 +433,9 @@ PADDLE_DEFINE_EXPORTED_double(
// NOTE(zhiqiu): better to share the flags, otherwise we will have too many
// flags.
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU) || \
defined(PADDLE_WITH_CUSTOM_DEVICE)
/**
* Memory related FLAG
......
......@@ -284,7 +284,7 @@ if(WITH_PYTHON)
cc_library(paddle_pybind SHARED
SRCS ${PYBIND_SRCS}
DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ${GLOB_DEV_LIB})
if(NOT APPLE AND NOT WIN32)
target_link_libraries(paddle_pybind rt)
......
......@@ -293,6 +293,7 @@ from .framework import CUDAPlace # noqa: F401
from .framework import NPUPlace # noqa: F401
from .framework import CUDAPinnedPlace # noqa: F401
from .framework import MLUPlace # noqa: F401
from .framework import CustomPlace # noqa: F401
from .autograd import grad # noqa: F401
from .autograd import no_grad # noqa: F401
......