Unverified commit c48a9ad5, authored by W Wilber, committed by GitHub

[Pten] Replace platform::Place with pten::Place. (#38899)

* add pten::Place data structure.

* update ci problem

* fix ci problem

* update

* using platform::Place = pten::Place

* remove BOOST_GET_CONST for CPUPlace and GPUPlace

* compile pass 25%.

* compile pass 45%

* compile pass 60%

* remove boost_get for xpu npu mlu and ipu

* compile pass on cpu and gpu.

* fix compile problem

* fix compile error.

* update

* fix ci problem

* update

* ci approve

* fix ci problem

* fix ci eager test problem

* remove BOOST_GET_CONST

* fix npu compile
Parent commit 1dbc8632
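
For readers of this diff: the essence of the change is that platform::Place stops being a boost::variant over per-device place structs (which forced call sites to write BOOST_GET_CONST(platform::CUDAPlace, place).device) and becomes an alias of the new pten::Place, a plain class carrying an allocation type plus a device id. The self-contained sketch below only illustrates that shape; the enum values, member names, and the is_gpu_place helper are simplified assumptions modeled on the call sites visible in the hunks (place.GetType(), place.GetDeviceId(), place.device), not the real pten::Place implementation.

    #include <cstdint>
    #include <iostream>

    // Simplified stand-in for pten::Place; names are assumptions for illustration only.
    enum class AllocationType : int8_t { UNDEFINED = 0, CPU, GPU, XPU, NPU, MLU, IPU };

    class Place {
     public:
      Place() = default;
      explicit Place(AllocationType type, int8_t device_id = 0)
          : device(device_id), alloc_type_(type) {}

      AllocationType GetType() const { return alloc_type_; }
      int8_t GetDeviceId() const { return device; }

      int8_t device{0};  // public, so call sites can read `place.device` directly

     private:
      AllocationType alloc_type_{AllocationType::UNDEFINED};
    };

    inline bool is_gpu_place(const Place& p) {
      return p.GetType() == AllocationType::GPU;
    }

    int main() {
      Place place(AllocationType::GPU, /*device_id=*/3);

      // Before this PR (conceptually):
      //   int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
      // After this PR, such call sites collapse to:
      int dev_id = place.device;

      std::cout << "is_gpu_place: " << is_gpu_place(place)
                << ", dev_id: " << dev_id << std::endl;
      return 0;
    }

With a single concrete Place type, the per-device BOOST_GET_CONST extractions throughout the diff below become plain member accesses, which is what most of the hunks consist of.
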
......@@ -221,8 +221,8 @@ static std::shared_ptr<framework::GarbageCollector> GetGC(
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(place)) {
if (framework::IsFastEagerDeletionModeEnabled()) {
gc.reset(new framework::UnsafeFastGPUGarbageCollector(
BOOST_GET_CONST(platform::CUDAPlace, place), max_memory_size));
gc.reset(new framework::UnsafeFastGPUGarbageCollector(place,
max_memory_size));
}
}
#endif
......
......@@ -106,13 +106,12 @@ void SerializeLodTensor(framework::Variable* var,
iobuf->append(reinterpret_cast<const char*>(tensor->data()), data_len);
} else {
#ifdef PADDLE_WITH_CUDA
char* temp_ptr =
new char[tensor->numel() * framework::SizeOfType(tensor->type())];
char* temp_ptr = new char[tensor->numel() *
framework::SizeOfType(tensor->type())]; // NOLINT
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy(
platform::CPUPlace(), temp_ptr,
BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), tensor->data(),
platform::CPUPlace(), temp_ptr, tensor->place(), tensor->data(),
tensor->numel() * framework::SizeOfType(tensor->type()), stream);
auto data_len = tensor->numel() * framework::SizeOfType(tensor->type());
iobuf->append(reinterpret_cast<const char*>(&data_len), 8);
......@@ -148,13 +147,12 @@ void SerializeSelectedRows(framework::Variable* var,
iobuf->append(reinterpret_cast<const char*>(tensor->data()), data_len);
} else {
#ifdef PADDLE_WITH_CUDA
char* temp_ptr =
new char[tensor->numel() * framework::SizeOfType(tensor->type())];
char* temp_ptr = new char[tensor->numel() *
framework::SizeOfType(tensor->type())]; // NOLINT
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy(
platform::CPUPlace(), temp_ptr,
BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), tensor->data(),
platform::CPUPlace(), temp_ptr, tensor->place(), tensor->data(),
tensor->numel() * framework::SizeOfType(tensor->type()), stream);
auto data_len = tensor->numel() * framework::SizeOfType(tensor->type());
iobuf->append(reinterpret_cast<const char*>(&data_len), 8);
......@@ -204,7 +202,7 @@ void DeserializeFromMultiVarMsgAndIOBuf(const MultiVarMsg& multi_msg,
}
void DeserializeLodTensor(framework::Variable* var, const VarMsg& msg,
butil::IOBufBytesIterator& io_buffer_itr,
butil::IOBufBytesIterator& io_buffer_itr, // NOLINT
const platform::DeviceContext& ctx) {
const auto place = ctx.GetPlace();
framework::LoDTensor* tensor = var->GetMutable<framework::LoDTensor>();
......@@ -229,30 +227,30 @@ void DeserializeLodTensor(framework::Variable* var, const VarMsg& msg,
// IO Buffer
if (platform::is_cpu_place(place)) {
unsigned long data_len;
io_buffer_itr.copy_and_forward((void*)(&data_len), 8);
unsigned long data_len; // NOLINT
io_buffer_itr.copy_and_forward((void*)(&data_len), 8); // NOLINT
io_buffer_itr.copy_and_forward(tensor_data, data_len);
} else if (platform::is_gpu_place(place)) {
#ifdef PADDLE_WITH_CUDA
unsigned long data_len;
char* temp_ptr =
new char[tensor->numel() * framework::SizeOfType(tensor->type())];
io_buffer_itr.copy_and_forward((void*)(&data_len), 8);
io_buffer_itr.copy_and_forward((void*)temp_ptr, data_len);
unsigned long data_len; // NOLINT
char* temp_ptr = new char[tensor->numel() *
framework::SizeOfType(tensor->type())]; // NOLINT
io_buffer_itr.copy_and_forward((void*)(&data_len), 8); // NOLINT
io_buffer_itr.copy_and_forward((void*)temp_ptr, data_len); // NOLINT
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data,
platform::CPUPlace(), (void*)temp_ptr,
tensor->numel() * framework::SizeOfType(tensor->type()),
stream);
memory::Copy(
place, tensor_data, platform::CPUPlace(), (void*)temp_ptr, // NOLINT
tensor->numel() * framework::SizeOfType(tensor->type()), stream);
delete[] temp_ptr;
#endif
}
}
void DeserializeSelectedRows(framework::Variable* var, const VarMsg& msg,
butil::IOBufBytesIterator& io_buffer_itr,
const platform::DeviceContext& ctx) {
void DeserializeSelectedRows(
framework::Variable* var, const VarMsg& msg,
butil::IOBufBytesIterator& io_buffer_itr, // NOLINT
const platform::DeviceContext& ctx) {
const auto place = ctx.GetPlace();
auto* slr = var->GetMutable<framework::SelectedRows>();
framework::Tensor* tensor = slr->mutable_value();
......@@ -269,20 +267,19 @@ void DeserializeSelectedRows(framework::Variable* var, const VarMsg& msg,
tensor->mutable_data(place, VarMessageToVarType(msg.data_type()));
// IO Buffer
if (platform::is_cpu_place(place)) {
unsigned long data_len;
io_buffer_itr.copy_and_forward((void*)(&data_len), 8);
unsigned long data_len; // NOLINT
io_buffer_itr.copy_and_forward((void*)(&data_len), 8); // NOLINT
io_buffer_itr.copy_and_forward(tensor_data, data_len);
} else if (platform::is_gpu_place(place)) {
#ifdef PADDLE_WITH_CUDA
char* temp_ptr =
new char[tensor->numel() * framework::SizeOfType(tensor->type())];
unsigned long data_len;
io_buffer_itr.copy_and_forward((void*)(&data_len), 8);
char* temp_ptr = new char[tensor->numel() *
framework::SizeOfType(tensor->type())]; // NOLINT
unsigned long data_len; // NOLINT
io_buffer_itr.copy_and_forward((void*)(&data_len), 8); // NOLINT
io_buffer_itr.copy_and_forward(temp_ptr, data_len);
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data,
platform::CPUPlace(), temp_ptr,
memory::Copy(place, tensor_data, platform::CPUPlace(), temp_ptr,
tensor->numel() * framework::SizeOfType(tensor->type()),
stream);
delete[] temp_ptr;
......
......@@ -44,8 +44,7 @@ int GetMicroId(const platform::DeviceContext& ctx,
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy(
platform::CPUPlace(), temp_ptr,
BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), tensor->data(),
platform::CPUPlace(), temp_ptr, tensor->place(), tensor->data(),
tensor->numel() * framework::SizeOfType(tensor->type()), stream);
float* temp_ptr_float = reinterpret_cast<float*>(temp_ptr);
micro_id = static_cast<int>(temp_ptr_float[0]);
......
......@@ -43,7 +43,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
TensorAddFunctor(int64_t numel, const T* x, T* y)
: numel_(numel), x_(x), y_(y) {}
void operator()(const paddle::platform::CPUPlace& place) {
void operator()(const paddle::platform::CPUPlace& place) const {
paddle::platform::CPUDeviceContext* ctx =
dynamic_cast<paddle::platform::CPUDeviceContext*>(
paddle::platform::DeviceContextPool::Instance().Get(place));
......@@ -56,7 +56,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
// TODO(jiabin): Support xpu here from gradient_accumulator.cc
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void operator()(const paddle::platform::CUDAPlace& place) {
void operator()(const paddle::platform::CUDAPlace& place) const {
paddle::platform::CUDADeviceContext* ctx =
dynamic_cast<paddle::platform::CUDADeviceContext*>(
paddle::platform::DeviceContextPool::Instance().Get(place));
......@@ -66,7 +66,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
blas.AXPY(numel_, 1., x_, y_);
}
#else
void operator()(const paddle::platform::CUDAPlace& place) {
void operator()(const paddle::platform::CUDAPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
......@@ -76,7 +76,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
// TODO(jiabin): Support Npu here from gradient_accumulator.cc
// there is NO blas in CUDAPinnedPlace
void operator()(const paddle::platform::CUDAPinnedPlace& place) {
void operator()(const paddle::platform::CUDAPinnedPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
......@@ -84,14 +84,14 @@ class TensorAddFunctor : public boost::static_visitor<> {
}
#ifdef PADDLE_WITH_ASCEND_CL
void operator()(const paddle::platform::NPUPlace& place) {
void operator()(const paddle::platform::NPUPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#else
void operator()(const paddle::platform::NPUPlace& place) {
void operator()(const paddle::platform::NPUPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
......@@ -100,14 +100,14 @@ class TensorAddFunctor : public boost::static_visitor<> {
#endif
#ifdef PADDLE_WITH_XPU
void operator()(const paddle::platform::XPUPlace& place) {
void operator()(const paddle::platform::XPUPlace& place) const {
paddle::platform::XPUDeviceContext* ctx =
dynamic_cast<paddle::platform::XPUDeviceContext*>(
paddle::platform::DeviceContextPool::Instance().Get(place));
xpu::add<T>(ctx->x_context(), x_, y_, y_, static_cast<int>(numel_));
}
#else
void operator()(const paddle::platform::XPUPlace& place) {
void operator()(const paddle::platform::XPUPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
......@@ -116,14 +116,14 @@ class TensorAddFunctor : public boost::static_visitor<> {
#endif
#ifdef PADDLE_WITH_MLU
void operator()(const paddle::platform::MLUPlace& place) {
void operator()(const paddle::platform::MLUPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#else
void operator()(const paddle::platform::MLUPlace& place) {
void operator()(const paddle::platform::MLUPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
......@@ -132,14 +132,14 @@ class TensorAddFunctor : public boost::static_visitor<> {
#endif
#ifdef PADDLE_WITH_IPU
void operator()(const paddle::platform::IPUPlace& place) {
void operator()(const paddle::platform::IPUPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#else
void operator()(const paddle::platform::IPUPlace& place) {
void operator()(const paddle::platform::IPUPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
......@@ -147,7 +147,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
}
#endif
void operator()(const paddle::platform::NPUPinnedPlace& place) {
void operator()(const paddle::platform::NPUPinnedPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
......@@ -157,7 +157,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
private:
int64_t numel_;
const T* x_;
T* y_;
mutable T* y_;
};
template <typename DeviceContext, typename T>
......@@ -218,7 +218,7 @@ void TensorAdd(const egr::EagerTensor& src, egr::EagerTensor* dst) {
if (data_type == paddle::framework::DataTypeTrait<cpp_type>::DataType()) { \
TensorAddFunctor<cpp_type> func(numel, src_tensor->data<cpp_type>(), \
dst_tensor->mutable_data<cpp_type>()); \
boost::apply_visitor(func, place); \
paddle::platform::VisitPlace(place, func); \
return; \
}
......@@ -294,7 +294,7 @@ void VariableAdd(const egr::EagerTensor& src, egr::EagerTensor* dst) {
TensorAddFunctor<cpp_type> func( \
numel, src_tensor.data<cpp_type>(), \
dst_tensor->mutable_data<cpp_type>(place)); \
boost::apply_visitor(func, place); \
paddle::platform::VisitPlace(place, func); \
return; \
}
......
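
The gradient_accumulator hunks above swap boost::apply_visitor(func, place) for paddle::platform::VisitPlace(place, func), which is why every operator()(const ...Place&) overload gains a const qualifier and y_ becomes mutable: the functor is now passed by const reference into a switch-style dispatcher instead of a boost variant visitor. The sketch below shows, under simplified assumed types, how such a dispatcher can switch on Place::GetType() and forward to the matching overload; it is an illustration, not the actual VisitPlace implementation.

    #include <cstdio>
    #include <stdexcept>

    // Minimal stand-in types; names are simplified assumptions, not the real
    // paddle/fluid/platform definitions.
    enum class AllocationType { CPU, GPU };
    struct CPUPlace {};
    struct CUDAPlace { int device = 0; };

    struct Place {
      AllocationType type = AllocationType::CPU;
      int device = 0;
      AllocationType GetType() const { return type; }
    };

    // Dispatch on the runtime place type and call the matching const operator()
    // overload. Because the visitor is taken by const reference, the functor's
    // overloads must be const and any written-through member (like y_) mutable.
    template <typename Visitor>
    void VisitPlace(const Place& place, const Visitor& visitor) {
      switch (place.GetType()) {
        case AllocationType::CPU: {
          visitor(CPUPlace{});
          return;
        }
        case AllocationType::GPU: {
          CUDAPlace gpu;
          gpu.device = place.device;
          visitor(gpu);
          return;
        }
      }
      throw std::runtime_error("unsupported place type in this sketch");
    }

    struct PrintVisitor {
      void operator()(const CPUPlace&) const { std::puts("cpu"); }
      void operator()(const CUDAPlace& p) const { std::printf("gpu:%d\n", p.device); }
    };

    int main() {
      Place p;
      p.type = AllocationType::GPU;
      p.device = 1;
      VisitPlace(p, PrintVisitor{});  // prints "gpu:1"
      return 0;
    }
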
......@@ -150,24 +150,21 @@ void RunOp(const std::string& type, const NameTensorMap& ins,
VLOG(6) << "Get Device id";
if (paddle::platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
paddle::platform::SetDeviceId(
BOOST_GET_CONST(paddle::platform::CUDAPlace, place).device);
paddle::platform::SetDeviceId(place.device);
#else
PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with GPU if use CUDAPlace."));
#endif
} else if (paddle::platform::is_xpu_place(place)) {
#ifdef PADDLE_WITH_XPU
paddle::platform::SetXPUDeviceId(
BOOST_GET_CONST(paddle::platform::XPUPlace, place).device);
paddle::platform::SetXPUDeviceId(place.device);
#else
PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with XPU if use XPUPlace."));
#endif
} else if (paddle::platform::is_npu_place(place)) {
#ifdef PADDLE_WITH_ASCEND_CL
paddle::platform::SetNPUDeviceId(
BOOST_GET_CONST(paddle::platform::NPUPlace, place).device);
paddle::platform::SetNPUDeviceId(place.device);
#else
PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU if use NPUPlace."));
......
......@@ -116,7 +116,7 @@ PreparedOp PrepareImpl(const NameTensorMap& ins, const NameTensorMap& outs,
auto& kernels = kernels_iter->second;
auto kernel_iter = kernels.find(expected_kernel_key);
#ifdef PADDLE_WITH_XPU
if (is_xpu_place(expected_kernel_key.place_) &&
if (paddle::platform::is_xpu_place(expected_kernel_key.place_) &&
(kernel_iter == kernels.end() ||
!paddle::platform::is_xpu_support_op(op.Type(), expected_kernel_key) ||
paddle::platform::is_in_xpu_black_list(op.Type()))) {
......@@ -129,7 +129,7 @@ PreparedOp PrepareImpl(const NameTensorMap& ins, const NameTensorMap& outs,
#endif
#ifdef PADDLE_WITH_ASCEND_CL
if (kernel_iter == kernels.end() &&
is_npu_place(expected_kernel_key.place_)) {
paddle::platform::is_npu_place(expected_kernel_key.place_)) {
VLOG(3) << "missing NPU kernel: " << op.Type()
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
......
......@@ -22,7 +22,7 @@ void TransDataDevice(const Tensor &in, const platform::Place &dst_place,
<< " dst_place: " << dst_place;
PADDLE_ENFORCE_NE(
in.place().which(), dst_place.which(),
in.place().GetType(), dst_place.GetType(),
platform::errors::Unavailable("Currently, model parallelism is only "
"supported between CPU and CUDA."));
......
......@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
......@@ -181,7 +182,7 @@ void AllReduceOpHandle::AllReduceFunc(
const framework::proto::VarType::Type &dtype, int64_t numel,
const std::vector<platform::Place> &places,
const std::vector<std::string> &out_var_names) {
if (is_gpu_place(places[0])) {
if (platform::is_gpu_place(places[0])) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_,
platform::errors::InvalidArgument(
......@@ -200,7 +201,7 @@ void AllReduceOpHandle::AllReduceFunc(
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with GPU."));
#endif
} else if (is_xpu_place(places[0])) {
} else if (platform::is_xpu_place(places[0])) {
#if defined(PADDLE_WITH_XPU_BKCL)
PADDLE_ENFORCE_NOT_NULL(bkcl_ctxs_,
platform::errors::InvalidArgument(
......@@ -286,7 +287,7 @@ void AllReduceOpHandle::NCCLAllReduceFunc(
void AllReduceOpHandle::SyncNCCLAllReduce() {
if (FLAGS_sync_nccl_allreduce) {
for (auto &p : places_) {
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p).device;
int dev_id = p.device;
auto *nccl_ctxs =
nccl_ctxs_->GetRunEnvNCCLCtx(run_order_, use_hierarchical_allreduce_);
auto &nccl_ctx = nccl_ctxs->at(dev_id);
......
......@@ -46,7 +46,7 @@ BindThreadedSSAGraphExecutor::BindThreadedSSAGraphExecutor(
}
int index = 0;
for (uint32_t i = 0; i < places.size(); i++) {
int id = BOOST_GET_CONST(platform::XPUPlace, places_[i]).device;
int id = places_[i].device;
if (place_to_index_.find(id) == place_to_index_.end()) {
place_to_index_[id] = index;
index++;
......@@ -145,8 +145,7 @@ FetchResultType BindThreadedSSAGraphExecutor::RunMainStream(
RunMultiDeviceOpAsync(cur_op, op_deps.get(), ready_ops);
continue;
} else {
cur_place =
BOOST_GET_CONST(platform::XPUPlace, dev_ctxes_.begin()->first);
cur_place = dev_ctxes_.begin()->first;
int cur_index = place_to_index_[cur_place.device];
RunOpAsyncMainStream(cur_op, op_deps.get(), ready_ops, cur_index);
}
......
......@@ -85,7 +85,7 @@ class BKCLOpHandleBase : public OpHandleBase {
platform::errors::InvalidArgument(
"The argument run_order_ must be >= 0, but got %d.", run_order_));
auto flat_bkcl_ctxs = bkcl_ctxs_->GetFlatCtx(run_order_);
int dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device;
int dev_id = place.device;
auto& bkcl_ctx = flat_bkcl_ctxs->at(dev_id);
auto comm = bkcl_ctx.comm_;
......
......@@ -16,6 +16,7 @@
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
......@@ -83,8 +84,7 @@ void BroadcastOpHandle::BroadcastOneVar(
} else if (platform::is_gpu_place(in_tensor.place())) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
VarHandle *out_handle = nullptr;
int root_id =
BOOST_GET_CONST(platform::CUDAPlace, in_tensor.place()).device;
int root_id = in_tensor.place().device;
std::vector<std::function<void()>> broadcast_calls;
int type = platform::ToNCCLDataType(in_tensor.type());
......@@ -94,8 +94,7 @@ void BroadcastOpHandle::BroadcastOneVar(
Variable *out_var = var_scopes.at(out_var_handle->scope_idx())
->FindVar(out_var_handle->name());
int dst_id =
BOOST_GET_CONST(platform::CUDAPlace, out_var_handle->place()).device;
int dst_id = out_var_handle->place().device;
auto &nccl_ctx = nccl_ctxs_->at(dst_id);
......@@ -145,7 +144,7 @@ void BroadcastOpHandle::BroadcastOneVar(
} else {
#if defined(PADDLE_WITH_XPU_BKCL)
VarHandle *out_handle = nullptr;
int root_id = BOOST_GET_CONST(platform::XPUPlace, in_tensor.place()).device;
int root_id = in_tensor.place().device;
std::vector<std::function<void()>> broadcast_calls;
int type = platform::ToBKCLDataType(in_tensor.type());
......@@ -155,8 +154,7 @@ void BroadcastOpHandle::BroadcastOneVar(
Variable *out_var = var_scopes.at(out_var_handle->scope_idx())
->FindVar(out_var_handle->name());
int dst_id =
BOOST_GET_CONST(platform::XPUPlace, out_var_handle->place()).device;
int dst_id = out_var_handle->place().device;
auto &bkcl_ctx = bkcl_ctxs_->at(dst_id);
......@@ -232,7 +230,7 @@ void BroadcastOpHandle::InitOutputValue(
PADDLE_ENFORCE_NOT_NULL(out_var, platform::errors::NotFound(
"Variable %s is not found in scopes.",
out_var_handle->name()));
if (is_gpu_place(in_tensor.place())) {
if (platform::is_gpu_place(in_tensor.place())) {
PADDLE_ENFORCE_EQ(platform::is_gpu_place(t_out_p), true,
platform::errors::PreconditionNotMet(
"Places of input and output must be all on GPU."));
......
......@@ -46,8 +46,7 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
dev_ctx_ = reinterpret_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(place));
if (dynamic_cast<StreamGarbageCollector *>(gc_)) {
platform::CUDADeviceGuard guard(
BOOST_GET_CONST(platform::CUDAPlace, place).device);
platform::CUDADeviceGuard guard(place.device);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
hipEventCreateWithFlags(&event_, hipEventDisableTiming));
......@@ -72,7 +71,7 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
EagerDeletionOpHandle::~EagerDeletionOpHandle() {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (event_) {
auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx_->GetPlace());
auto gpu_place = dev_ctx_->GetPlace();
platform::CUDADeviceGuard guard(gpu_place.device);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event_));
......@@ -85,8 +84,7 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() {
void EagerDeletionOpHandle::InitCUDA() {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
int dev_id =
BOOST_GET_CONST(platform::CUDAPlace, dev_ctxes_.begin()->first).device;
int dev_id = dev_ctxes_.begin()->first.device;
events_[dev_id] = nullptr;
#endif
}
......
......@@ -16,6 +16,7 @@
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/device_memory_aligment.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_bool(skip_fused_all_reduce_check, false, "");
......@@ -102,7 +103,7 @@ void FusedAllReduceOpHandle::RunImpl() {
gpuStream_t compute_stream{nullptr};
if (FLAGS_allreduce_record_one_event) {
auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, places_[0]);
auto gpu_place = platform::CUDAPlace(places_[0].GetDeviceId());
compute_stream =
platform::DeviceContextPool::Instance().GetByPlace(gpu_place)->stream();
auto flat_nccl_ctxs = nccl_ctxs_->GetFlatCtx(run_order_);
......@@ -291,7 +292,7 @@ bool FusedAllReduceOpHandle::InputIsInDifferentPlace(
var, platform::errors::NotFound(
"The variable '%s' is not found in local scope.", var_name));
auto &lod_tensor = var->Get<LoDTensor>();
if (!is_same_place(lod_tensor.place(), places_.at(scope_idx))) {
if (!platform::is_same_place(lod_tensor.place(), places_.at(scope_idx))) {
return true;
}
}
......
......@@ -354,7 +354,7 @@ void CheckVarHasNanOrInf(const std::string& op_type,
float* cpu_data = new float[tensor->numel()];
memory::Copy(platform::CPUPlace(), static_cast<void*>(cpu_data),
BOOST_GET_CONST(platform::XPUPlace, tensor->place()),
tensor->place(),
static_cast<const void*>(tensor->data<float>()),
tensor->numel() * sizeof(float));
bool flag = false;
......
......@@ -132,7 +132,7 @@ void TensorCheckerVisitor<platform::CUDADeviceContext>::apply(
auto* dev_ctx = reinterpret_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(tensor_.place()));
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, tensor_.place()).device;
int dev_id = tensor_.place().device;
PADDLE_ENFORCE_EQ(
(dev_id >= 0 && dev_id < multi_op_var2gpu_str_mutex().size()), true,
platform::errors::OutOfRange("GPU dev_id must >=0 and < dev_count=%d",
......
......@@ -102,7 +102,7 @@ class NCCLOpHandleBase : public OpHandleBase {
}
for (auto& p : dev_ctxes_) {
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p.first).device;
int dev_id = p.first.device;
if (inter_events_.find(dev_id) != inter_events_.end()) {
continue;
}
......@@ -133,7 +133,7 @@ class NCCLOpHandleBase : public OpHandleBase {
platform::errors::InvalidArgument(
"The argument run_order_ must be >= 0, but got %d.", run_order_));
auto flat_nccl_ctxs = nccl_ctxs_->GetFlatCtx(run_order_);
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
int dev_id = place.device;
auto& nccl_ctx = flat_nccl_ctxs->at(dev_id);
auto stream = nccl_ctx.stream();
auto comm = nccl_ctx.comm_;
......@@ -181,7 +181,7 @@ class NCCLOpHandleBase : public OpHandleBase {
void InterReduce(platform::Place place, const void* sendbuff, void* recvbuff,
size_t count, ncclDataType_t datatype, ncclRedOp_t op) {
auto nccl_ctxs = nccl_ctxs_->GetHierarchicalInterCtx(run_order_);
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
int dev_id = place.device;
auto& nccl_ctx = nccl_ctxs->at(dev_id);
auto stream = nccl_ctx.stream();
auto comm = nccl_ctx.comm_;
......@@ -213,7 +213,7 @@ class NCCLOpHandleBase : public OpHandleBase {
PADDLE_ENFORCE_NOT_NULL(
nccl_ctxs_, platform::errors::NotFound(
"Can't get exter %d nccl contexts.", run_order_));
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
int dev_id = place.device;
auto& nccl_ctx = nccl_ctxs->at(dev_id);
auto stream = nccl_ctx.stream();
auto comm = nccl_ctx.comm_;
......@@ -246,7 +246,7 @@ class NCCLOpHandleBase : public OpHandleBase {
void InterBroadCast(platform::Place place, void* sendbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op) {
auto nccl_ctxs = nccl_ctxs_->GetHierarchicalInterCtx(run_order_);
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
int dev_id = place.device;
auto& nccl_ctx = nccl_ctxs->at(dev_id);
auto stream = nccl_ctx.stream();
auto comm = nccl_ctx.comm_;
......
......@@ -47,7 +47,7 @@ OpHandleBase::~OpHandleBase() PADDLE_MAY_THROW {
void OpHandleBase::InitCUDA() {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
for (auto &p : dev_ctxes_) {
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p.first).device;
int dev_id = p.first.device;
platform::SetDeviceId(dev_id);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
......@@ -61,9 +61,7 @@ void OpHandleBase::InitCUDA() {
for (auto &out_var : outputs_) {
auto *out_var_handle = dynamic_cast<VarHandle *>(out_var);
if (out_var_handle) {
int dev_id =
BOOST_GET_CONST(platform::CUDAPlace, out_var_handle->place())
.device;
int dev_id = out_var_handle->place().device;
out_var_handle->SetGenerateEvent(events_.at(dev_id));
}
}
......@@ -74,7 +72,7 @@ void OpHandleBase::InitCUDA() {
"Operator %s should have only one dev_ctx, but got %d.", Name(),
dev_ctxes_.size()));
auto &place = dev_ctxes_.begin()->first;
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
int dev_id = place.device;
for (auto &out_var : outputs_) {
auto *out_var_handle = dynamic_cast<VarHandle *>(out_var);
if (out_var_handle) {
......@@ -109,7 +107,7 @@ void OpHandleBase::InitXPU() {
platform::errors::InvalidArgument(
"%s should have only one dev_ctx.", Name()));
auto &place = dev_ctxes_.begin()->first;
int dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device;
int dev_id = place.device;
platform::SetXPUDeviceId(dev_id);
for (auto &out_var : outputs_) {
auto *out_var_handle = dynamic_cast<VarHandle *>(out_var);
......@@ -309,7 +307,7 @@ void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (!events_.empty()) { // Use event
for (auto &p : dev_ctxes_) {
auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, p.first).device;
auto dev_id = p.first.device;
auto *cuda_dev_ctx = static_cast<platform::CUDADeviceContext *>(p.second);
VLOG(10) << "cudadevicecontext:" << cuda_dev_ctx << ", dev_id:" << dev_id;
#ifdef PADDLE_WITH_HIP
......@@ -332,8 +330,7 @@ void OpHandleBase::RunAndRecordEvent(platform::Place p,
} else {
auto *ctx = dev_ctxes_.at(p);
auto *cuda_ctx = static_cast<platform::CUDADeviceContext *>(ctx);
cuda_ctx->RecordEvent(
events_.at(BOOST_GET_CONST(platform::CUDAPlace, p).device), callback);
cuda_ctx->RecordEvent(events_.at(p.device), callback);
}
#else
callback();
......
......@@ -45,7 +45,7 @@ static std::vector<std::unique_ptr<ir::Graph>> SeparateMultiDevicesGraph(
for (auto &op : op_handles) {
auto &dev_ctx = op->DeviceContext();
auto &p = dev_ctx.begin()->first;
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p).device;
int dev_id = p.device;
auto &dev_dummys = graphs[dev_id]->Get<GraphDepVars>(kGraphDepVars);
graphs[dev_id]->AddNode(graph->RemoveNode(op->Node()).release());
......
......@@ -17,6 +17,7 @@
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
PADDLE_DEFINE_EXPORTED_bool(
......@@ -125,7 +126,8 @@ void ReduceOpHandle::RunImpl() {
// TODO(gongwb): add cpu support
if (collective_context.endpoints_.size() <= 1 ||
is_cpu_place(in_places[0]) || is_cpu_place(t_out_p)) {
platform::is_cpu_place(in_places[0]) ||
platform::is_cpu_place(t_out_p)) {
GatherLocalSelectedRowsFunctor functor(
in_selected_rows, in_places, dev_ctxes_, t_out_p,
out_var->GetMutable<framework::SelectedRows>());
......@@ -172,13 +174,13 @@ void ReduceOpHandle::RunImpl() {
out_var_handle->place(), pre_in.type());
auto out_p = out_var_handle->place();
int root_id = BOOST_GET_CONST(platform::CUDAPlace, out_p).device;
int root_id = out_p.device;
std::vector<std::function<void()>> all_reduce_calls;
for (size_t i = 0; i < var_scopes.size(); ++i) {
auto &p = in_places[i];
auto &lod_tensor = *lod_tensors[i];
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p).device;
int dev_id = p.device;
auto &nccl_ctx = nccl_ctxs_->at(dev_id);
void *buffer = const_cast<void *>(lod_tensor.data());
......@@ -218,13 +220,13 @@ void ReduceOpHandle::RunImpl() {
out_var_handle->place(), pre_in.type());
auto out_p = out_var_handle->place();
int root_id = BOOST_GET_CONST(platform::XPUPlace, out_p).device;
int root_id = out_p.device;
std::vector<std::function<void()>> all_reduce_calls;
for (size_t i = 0; i < var_scopes.size(); ++i) {
auto &p = in_places[i];
auto &lod_tensor = *lod_tensors[i];
int dev_id = BOOST_GET_CONST(platform::XPUPlace, p).device;
int dev_id = p.device;
auto &bkcl_ctx = bkcl_ctxs_->at(dev_id);
void *buffer = const_cast<void *>(lod_tensor.data());
......
......@@ -61,8 +61,8 @@ struct ScaleLossGradFunctor {
} else if (platform::is_xpu_place(place_)) {
#if defined(PADDLE_WITH_XPU)
OutT cast_coeff = static_cast<OutT>(coeff_);
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, place_), out_data,
platform::CPUPlace(), &cast_coeff, SizeOfType(out_dtype_));
memory::Copy(place_, out_data, platform::CPUPlace(), &cast_coeff,
SizeOfType(out_dtype_));
VLOG(10) << place_ << "RUN Scale loss grad op";
#else
PADDLE_THROW(platform::errors::PermissionDenied(
......@@ -73,9 +73,8 @@ struct ScaleLossGradFunctor {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
OutT cast_coeff = static_cast<OutT>(coeff_);
auto stream = static_cast<platform::CUDADeviceContext *>(ctx_)->stream();
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place_), out_data,
platform::CPUPlace(), &cast_coeff, SizeOfType(out_dtype_),
stream);
memory::Copy(place_, out_data, platform::CPUPlace(), &cast_coeff,
SizeOfType(out_dtype_), stream);
VLOG(10) << place_ << "RUN Scale loss grad op";
#else
PADDLE_THROW(platform::errors::PermissionDenied(
......
......@@ -86,8 +86,7 @@ void ShareTensorBufferOpHandle::SetShareDimsAndDtype(
void ShareTensorBufferOpHandle::InitCUDA() {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
int dev_id =
BOOST_GET_CONST(platform::CUDAPlace, dev_ctxes_.begin()->first).device;
int dev_id = dev_ctxes_.begin()->first.device;
events_[dev_id] = nullptr;
#endif
}
......
......@@ -165,7 +165,7 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
in_numel));
out_numel = (out_numel == 0) ? static_cast<size_t>(out.numel()) : out_numel;
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
int dev_id = place.device;
auto *nccl_ctxs = nccl_ctxs_->GetRunEnvNCCLCtx(run_order_, false);
auto &nccl_ctx = nccl_ctxs->at(dev_id);
auto stream = nccl_ctx.stream();
......
......@@ -106,9 +106,12 @@ struct EnforceShapeAndDTypeEQVisitor {
void operator()(const LoDTensor& src) {
auto& tensor = dst_->Get<LoDTensor>();
PADDLE_ENFORCE_EQ(src.place().which(), tensor.place().which(),
platform::errors::PreconditionNotMet(
"The place type of the two variables is not equal."));
PADDLE_ENFORCE_EQ(
src.place().GetType(), tensor.place().GetType(),
platform::errors::PreconditionNotMet(
"The place type of the two variables is not equal. The src place "
"is %s, but the dst place is %s",
src.place().DebugString(), tensor.place().DebugString()));
PADDLE_ENFORCE_EQ(src.type(), tensor.type(),
platform::errors::PreconditionNotMet(
"The dtype of the two variables is not equal."));
......@@ -127,9 +130,12 @@ struct EnforceShapeAndDTypeEQVisitor {
void operator()(const SelectedRows& src) {
auto& selected_rows = dst_->Get<SelectedRows>();
PADDLE_ENFORCE_EQ(src.place().which(), selected_rows.place().which(),
platform::errors::PreconditionNotMet(
"The place type of the two variables is not equal."));
PADDLE_ENFORCE_EQ(
src.place().GetType(), selected_rows.place().GetType(),
platform::errors::PreconditionNotMet(
"The place type of the two variables is not equal. The src place "
"is %s, but the dst place is %s",
src.place().DebugString(), selected_rows.place().DebugString()));
PADDLE_ENFORCE_EQ(src.value().type(), selected_rows.value().type(),
platform::errors::PreconditionNotMet(
"The dtype of the two variables is not equal."));
......
......@@ -138,7 +138,7 @@ DLPackTensor::DLPackTensor(const Tensor &tensor, LaneType lanes) {
// init device, DLDevice type with device_type and device_id
auto place = tensor.place();
t_.device = boost::apply_visitor(internal::DLDeviceVisitor(), place);
t_.device = paddle::platform::VisitPlace(place, internal::DLDeviceVisitor());
// init dtype
t_.dtype = internal::GetDLDataTypeFromTypeIndex(tensor.type());
......
......@@ -63,8 +63,7 @@ void TestMain(const platform::Place &place, uint16_t lanes) {
CHECK_EQ(0, dl_tensor.device.device_id);
} else if (platform::is_gpu_place(place)) {
CHECK_EQ(kDLGPU, dl_tensor.device.device_type);
CHECK_EQ(BOOST_GET_CONST(platform::CUDAPlace, place).device,
dl_tensor.device.device_id);
CHECK_EQ(place.device, dl_tensor.device.device_id);
} else if (platform::is_cuda_pinned_place(place)) {
CHECK_EQ(kDLCPUPinned, dl_tensor.device.device_type);
CHECK_EQ(0, dl_tensor.device.device_id);
......
......@@ -72,7 +72,7 @@ Executor::~Executor() {
#ifdef PADDLE_WITH_MKLDNN
// Clear mkl-dnn cache,
// this is needed to have mkl-dnn unit tests working
ClearMKLDNNCache(place_, this);
platform::ClearMKLDNNCache(place_, this);
#endif
}
......@@ -443,31 +443,26 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
if (platform::is_gpu_place(place_)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (IsFastEagerDeletionModeEnabled()) {
gc.reset(new UnsafeFastGPUGarbageCollector(
BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size));
gc.reset(new UnsafeFastGPUGarbageCollector(place_, max_memory_size));
} else {
gc.reset(new DefaultStreamGarbageCollector(
BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size));
gc.reset(new DefaultStreamGarbageCollector(place_, max_memory_size));
}
#else
PADDLE_THROW(
platform::errors::Unimplemented("No GPU gc found in CPU/XPU paddle"));
#endif
} else if (platform::is_cpu_place(place_)) {
gc.reset(new CPUGarbageCollector(
BOOST_GET_CONST(platform::CPUPlace, place_), max_memory_size));
gc.reset(new CPUGarbageCollector(place_, max_memory_size));
} else if (platform::is_xpu_place(place_)) {
#ifdef PADDLE_WITH_XPU
gc.reset(new XPUGarbageCollector(
BOOST_GET_CONST(platform::XPUPlace, place_), max_memory_size));
gc.reset(new XPUGarbageCollector(place_, max_memory_size));
#else
PADDLE_THROW(
platform::errors::Unimplemented("No XPU gc found in CPU/GPU paddle"));
#endif
} else if (platform::is_ipu_place(place_)) {
#ifdef PADDLE_WITH_IPU
gc.reset(new IPUGarbageCollector(
BOOST_GET_CONST(platform::IPUPlace, place_), max_memory_size));
gc.reset(new IPUGarbageCollector(place_, max_memory_size));
#else
PADDLE_THROW(
platform::errors::Unimplemented("No IPU gc found in CPU/IPU paddle"));
......@@ -476,16 +471,14 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
#ifdef PADDLE_WITH_ASCEND_CL
if (IsFastEagerDeletionModeEnabled()) {
VLOG(4) << "Use unsafe fast gc for NPU.";
gc.reset(new NPUUnsafeFastGarbageCollector(
BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size));
gc.reset(new NPUUnsafeFastGarbageCollector(place_, max_memory_size));
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Please set FLAGS_fast_eager_deletion_mode=true to use "
"GarbageCollector on NPU."));
// TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector.
VLOG(4) << "Use default stream gc for NPU.";
gc.reset(new NPUDefaultStreamGarbageCollector(
BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size));
gc.reset(new NPUDefaultStreamGarbageCollector(place_, max_memory_size));
}
#else
PADDLE_THROW(
......@@ -494,11 +487,9 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
} else if (platform::is_mlu_place(place_)) {
#ifdef PADDLE_WITH_MLU
if (IsFastEagerDeletionModeEnabled()) {
gc.reset(new MLUUnsafeFastGarbageCollector(
BOOST_GET_CONST(platform::MLUPlace, place_), max_memory_size));
gc.reset(new MLUUnsafeFastGarbageCollector(place_, max_memory_size));
} else {
gc.reset(new MLUDefaultStreamGarbageCollector(
BOOST_GET_CONST(platform::MLUPlace, place_), max_memory_size));
gc.reset(new MLUDefaultStreamGarbageCollector(place_, max_memory_size));
}
#else
PADDLE_THROW(
......
......@@ -137,8 +137,7 @@ void BoxWrapper::CopyForPull(const paddle::platform::Place& place,
const int expand_embed_dim,
const int64_t total_length) {
auto stream = dynamic_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(
BOOST_GET_CONST(platform::CUDAPlace, place)))
platform::DeviceContextPool::Instance().Get(place))
->stream();
auto buf_value = memory::Alloc(place, values.size() * sizeof(float*));
float** gpu_values = reinterpret_cast<float**>(buf_value->ptr());
......@@ -203,8 +202,7 @@ void BoxWrapper::CopyKeys(const paddle::platform::Place& place,
uint64_t** origin_keys, uint64_t* total_keys,
const int64_t* gpu_len, int slot_num, int total_len) {
auto stream = dynamic_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(
BOOST_GET_CONST(platform::CUDAPlace, place)))
platform::DeviceContextPool::Instance().Get(place))
->stream();
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(CopyKeysKernel, dim3((total_len + 512 - 1) / 512),
......@@ -225,8 +223,7 @@ void BoxWrapper::CopyForPush(const paddle::platform::Place& place,
const int hidden_size, const int expand_embed_dim,
const int64_t total_length, const int batch_size) {
auto stream = dynamic_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(
BOOST_GET_CONST(platform::CUDAPlace, place)))
platform::DeviceContextPool::Instance().Get(place))
->stream();
auto slot_lengths_lod = slot_lengths;
for (int i = 1; i < slot_lengths_lod.size(); i++) {
......
......@@ -45,7 +45,7 @@ void BoxWrapper::PullSparseCase(const paddle::platform::Place& place,
} else if (platform::is_gpu_place(place)) {
#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32)
VLOG(3) << "Begin copy keys, key_num[" << total_length << "]";
int device_id = BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId();
int device_id = place.GetDeviceId();
LoDTensor& total_keys_tensor = keys_tensor[device_id];
uint64_t* total_keys = reinterpret_cast<uint64_t*>(
total_keys_tensor.mutable_data<int64_t>({total_length, 1}, place));
......@@ -131,7 +131,7 @@ void BoxWrapper::PushSparseGradCase(
"Warning:: CPUPlace is not supported in PaddleBox now."));
} else if (platform::is_gpu_place(place)) {
#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32)
int device_id = BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId();
int device_id = place.GetDeviceId();
LoDTensor& cached_total_keys_tensor = keys_tensor[device_id];
uint64_t* total_keys =
reinterpret_cast<uint64_t*>(cached_total_keys_tensor.data<int64_t>());
......@@ -143,8 +143,7 @@ void BoxWrapper::PushSparseGradCase(
push_boxps_timer.Start();
int ret = boxps_ptr_->PushSparseGPU(
total_keys, reinterpret_cast<void*>(total_grad_values_gpu),
static_cast<int>(total_length),
BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId());
static_cast<int>(total_length), place.GetDeviceId());
PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet(
"PushSparseGPU failed in BoxPS."));
push_boxps_timer.Pause();
......
......@@ -764,8 +764,7 @@ void FleetWrapper::PushDenseVarsAsync(
LoDTensor* pin_tensor = pin_var->GetMutable<LoDTensor>();
float* pin_g = pin_tensor->mutable_data<float>(tensor->dims(),
platform::CUDAPinnedPlace());
memory::Copy(platform::CUDAPinnedPlace(), pin_g,
BOOST_GET_CONST(platform::CUDAPlace, place), g_data,
memory::Copy(platform::CUDAPinnedPlace(), pin_g, place, g_data,
sizeof(float) * count, stream);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, stream));
......@@ -821,8 +820,7 @@ void FleetWrapper::PushDenseVarsAsync(
LoDTensor* pin_tensor = pin_var->GetMutable<LoDTensor>();
float* pin_g =
pin_tensor->mutable_data<float>(tensor->dims(), platform::CPUPlace());
memory::Copy(platform::CPUPlace(), pin_g,
BOOST_GET_CONST(platform::XPUPlace, place), g_data,
memory::Copy(platform::CPUPlace(), pin_g, place, g_data,
sizeof(float) * count);
float* g = pin_g;
......
......@@ -116,14 +116,12 @@ void HeterWrapper::SerializeToReq(const std::string& varname, Scope* scope,
tensor->numel() * SizeOfType(tensor->type()));
} else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
memory::Copy(platform::CPUPlace(), data_ptr,
BOOST_GET_CONST(platform::CUDAPlace, tensor->place()),
memory::Copy(platform::CPUPlace(), data_ptr, tensor->place(),
tensor->data(), tensor->numel() * SizeOfType(tensor->type()),
nullptr);
#endif
#ifdef PADDLE_WITH_XPU
memory::Copy(platform::CPUPlace(), data_ptr,
BOOST_GET_CONST(platform::XPUPlace, tensor->place()),
memory::Copy(platform::CPUPlace(), data_ptr, tensor->place(),
tensor->data(), tensor->numel() * SizeOfType(tensor->type()));
#endif
}
......@@ -158,8 +156,7 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope,
tensor->mutable_data(place, ToVarType(req_var.data_type()));
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data,
platform::CPUPlace(), req_var.data().data(),
memory::Copy(place, tensor_data, platform::CPUPlace(), req_var.data().data(),
tensor->numel() * SizeOfType(tensor->type()), stream);
#else
memcpy(tensor_data, req_var.data().data(),
......@@ -197,8 +194,7 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope,
tensor->mutable_data(place, ToVarType(req_var.data_type()));
#ifdef PADDLE_WITH_XPU
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, place), tensor_data,
platform::CPUPlace(), req_var.data().data(),
memory::Copy(place, tensor_data, platform::CPUPlace(), req_var.data().data(),
tensor->numel() * SizeOfType(tensor->type()));
#else
memcpy(tensor_data, req_var.data().data(),
......
......@@ -791,7 +791,7 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place,
"Warning:: CPUPlace is not supported in GpuPs now."));
} else if (platform::is_gpu_place(place)) {
VLOG(3) << "Begin copy keys, key_num[" << total_length << "]";
int device_id = BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId();
int device_id = place.GetDeviceId();
int devid_2_index = HeterPs_->get_index_by_devid(device_id);
LoDTensor& total_keys_tensor = keys_tensor[devid_2_index];
uint64_t* total_keys = reinterpret_cast<uint64_t*>(
......@@ -859,7 +859,7 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place,
PADDLE_THROW(platform::errors::Unimplemented(
"Warning:: CPUPlace is not supported in GPUPS now."));
} else if (platform::is_gpu_place(place)) {
int device_id = BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId();
int device_id = place.GetDeviceId();
int devid_2_index = HeterPs_->get_index_by_devid(device_id);
LoDTensor& cached_total_keys_tensor = keys_tensor[devid_2_index];
uint64_t* total_keys =
......
......@@ -113,8 +113,7 @@ void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place,
const int hidden_size,
const int64_t total_length) {
auto stream = dynamic_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(
BOOST_GET_CONST(platform::CUDAPlace, place)))
platform::DeviceContextPool::Instance().Get(place))
->stream();
auto buf_value = memory::Alloc(place, values.size() * sizeof(float*));
float** gpu_values = reinterpret_cast<float**>(buf_value->ptr());
......@@ -132,8 +131,7 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place,
const int64_t* gpu_len, int slot_num,
int total_len) {
auto stream = dynamic_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(
BOOST_GET_CONST(platform::CUDAPlace, place)))
platform::DeviceContextPool::Instance().Get(place))
->stream();
CopyKeysKernel<<<(total_len + 1024 - 1) / 1024, 1024, 0, stream>>>(
origin_keys, total_keys, gpu_len, slot_num, total_len);
......@@ -148,8 +146,7 @@ void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place,
const int64_t total_length,
const int batch_size) {
auto stream = dynamic_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(
BOOST_GET_CONST(platform::CUDAPlace, place)))
platform::DeviceContextPool::Instance().Get(place))
->stream();
auto slot_lengths_lod = slot_lengths;
for (int i = 1; i < slot_lengths_lod.size(); i++) {
......
......@@ -101,7 +101,7 @@ StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place,
}
StreamGarbageCollector::~StreamGarbageCollector() {
auto place = BOOST_GET_CONST(platform::CUDAPlace, this->dev_ctx_->GetPlace());
auto place = this->dev_ctx_->GetPlace();
platform::CUDADeviceGuard guard(place.device);
platform::GpuStreamSync(stream_);
platform::GpuDestroyStream(stream_);
......@@ -186,7 +186,7 @@ MLUStreamGarbageCollector::MLUStreamGarbageCollector(
}
MLUStreamGarbageCollector::~MLUStreamGarbageCollector() {
auto place = BOOST_GET_CONST(platform::MLUPlace, this->dev_ctx_->GetPlace());
auto place = this->dev_ctx_->GetPlace();
platform::MLUDeviceGuard guard(place.device);
PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueSync(stream_));
PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueDestroy(stream_));
......
......@@ -46,8 +46,8 @@ void SetMicroId(paddle::framework::Scope* scope,
temp_ptr_float[0] = micro_id;
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(*dev_ctx).stream();
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data,
platform::CPUPlace(), reinterpret_cast<void*>(temp_ptr),
memory::Copy(place, tensor_data, platform::CPUPlace(),
reinterpret_cast<void*>(temp_ptr),
tensor->numel() * framework::SizeOfType(tensor->type()),
stream);
#endif
......
......@@ -117,12 +117,12 @@ void HeterXpuTrainer::CreateThreadParam(const ProgramDesc& program, int num) {
#ifdef PADDLE_WITH_CUDA
auto stream = copy_streams_[num];
auto event = events_[num];
auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
auto dev_id = place.device;
platform::CUDADeviceGuard guard(dev_id);
#endif
#ifdef PADDLE_WITH_XPU
auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device;
auto dev_id = place.device;
platform::XPUDeviceGuard guard(dev_id);
#endif
......@@ -173,13 +173,11 @@ void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor,
thread_tensor->mutable_data<T>(root_tensor->dims(), thread_place);
T* root_ptr = root_tensor->data<T>();
if (platform::is_cpu_place(root_tensor->place())) {
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, thread_place), thread_ptr,
platform::CPUPlace(), root_ptr,
memory::Copy(thread_place, thread_ptr, platform::CPUPlace(), root_ptr,
sizeof(T) * root_tensor->numel(), stream);
} else {
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, thread_place), thread_ptr,
BOOST_GET_CONST(platform::CUDAPlace, root_tensor->place()),
root_ptr, sizeof(T) * root_tensor->numel(), stream);
memory::Copy(thread_place, thread_ptr, root_tensor->place(), root_ptr,
sizeof(T) * root_tensor->numel(), stream);
}
}
#endif
......@@ -193,13 +191,11 @@ void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor,
thread_tensor->mutable_data<T>(root_tensor->dims(), thread_place);
T* root_ptr = root_tensor->data<T>();
if (platform::is_cpu_place(root_tensor->place())) {
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, thread_place), thread_ptr,
platform::CPUPlace(), root_ptr,
memory::Copy(thread_place, thread_ptr, platform::CPUPlace(), root_ptr,
sizeof(T) * root_tensor->numel());
} else {
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, thread_place), thread_ptr,
BOOST_GET_CONST(platform::XPUPlace, root_tensor->place()),
root_ptr, sizeof(T) * root_tensor->numel());
memory::Copy(thread_place, thread_ptr, root_tensor->place(), root_ptr,
sizeof(T) * root_tensor->numel());
}
}
#endif
......@@ -286,7 +282,7 @@ void HeterXpuTrainer::InitOtherEnv(const ProgramDesc& main_program) {
(context->ops_).push_back(local_op_ptr);
}
#ifdef PADDLE_WITH_CUDA
auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
auto dev_id = place.device;
platform::CUDADeviceGuard guard(dev_id);
PADDLE_ENFORCE_GPU_SUCCESS(
cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming));
......@@ -336,15 +332,14 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request,
_ForEachDataType_(MergeCallback);
if (!platform::is_cpu_place(thread_tensor->place())) {
#ifdef PADDLE_WITH_CUDA
auto dev_id =
BOOST_GET_CONST(platform::CUDAPlace, thread_tensor->place()).device;
auto dev_id = thread_tensor->place().device;
platform::CUDADeviceGuard guard(dev_id);
cudaMemset(thread_tensor->data(), 0,
thread_tensor->numel() * SizeOfType(thread_tensor->type()));
#endif
#ifdef PADDLE_WITH_XPU
auto place = thread_tensor->place();
auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device;
auto dev_id = place.device;
platform::XPUDeviceGuard guard(dev_id);
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
......@@ -364,15 +359,14 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request,
merge_var);
if (!platform::is_cpu_place(root_tensor->place())) {
#ifdef PADDLE_WITH_CUDA
auto dev_id =
BOOST_GET_CONST(platform::CUDAPlace, root_tensor->place()).device;
auto dev_id = root_tensor->place().device;
platform::CUDADeviceGuard guard(dev_id);
cudaMemset(root_tensor->data(), 0,
root_tensor->numel() * SizeOfType(root_tensor->type()));
#endif
#ifdef PADDLE_WITH_XPU
auto place = root_tensor->place();
auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device;
auto dev_id = place.device;
platform::XPUDeviceGuard guard(dev_id);
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
......@@ -442,7 +436,7 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
(context->ops_).push_back(local_op_ptr);
}
#ifdef PADDLE_WITH_CUDA
auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
auto dev_id = place.device;
platform::CUDADeviceGuard guard(dev_id);
PADDLE_ENFORCE_GPU_SUCCESS(
cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming));
......
......@@ -67,7 +67,7 @@ Graph *Pass::Apply(Graph *graph) const {
#ifdef PADDLE_WITH_MKLDNN
// Clear mkl-dnn cache,
// Passes can change params, tensors, so caching need to be discarded
ClearMKLDNNCache(paddle::platform::CPUPlace());
platform::ClearMKLDNNCache(paddle::platform::CPUPlace());
#endif
VLOG(10) << "finish to apply pass " << Type() << " to graph";
return graph;
......
......@@ -32,10 +32,8 @@ namespace framework {
inline paddle::optional<platform::CUDAPlace> OptionalCUDAPlace(
const paddle::memory::allocation::AllocationPtr &gpu_) {
return gpu_ == nullptr
? paddle::none
: paddle::optional<platform::CUDAPlace>(
BOOST_GET_CONST(platform::CUDAPlace, gpu_->place()));
return gpu_ == nullptr ? paddle::none
: paddle::optional<platform::CUDAPlace>(gpu_->place());
}
// Vector<T> implements the std::vector interface, and can get Data or
......@@ -369,11 +367,11 @@ class Vector {
// get cuda ptr. immutable
const T *CUDAData(platform::Place place) const {
{
platform::CUDAPlace p(place.GetDeviceId());
auto &mtx = m_.Data().Mutex();
std::lock_guard<std::mutex> guard(mtx);
auto cuda_place = m_.Data().CUDAPlace();
if (cuda_place == paddle::none ||
cuda_place == BOOST_GET(platform::CUDAPlace, place)) {
if (cuda_place == paddle::none || cuda_place == p) {
return m_.Data().CUDAData(place);
}
}
......@@ -385,11 +383,11 @@ class Vector {
// get cuda ptr. mutable
T *CUDAMutableData(platform::Place place) {
{
platform::CUDAPlace p(place.GetDeviceId());
auto &mtx = m_.Data().Mutex();
std::lock_guard<std::mutex> guard(mtx);
auto cuda_place = m_.Data().CUDAPlace();
if (cuda_place == paddle::none ||
cuda_place == BOOST_GET(platform::CUDAPlace, place)) {
if (cuda_place == paddle::none || cuda_place == p) {
return m_.MutableData()->CUDAMutableData(place);
}
}
......
......@@ -131,7 +131,7 @@ NaiveExecutor::~NaiveExecutor() {
#ifdef PADDLE_WITH_MKLDNN
// Clear mkl-dnn cache,
// this is needed to have mkl-dnn unit tests working
ClearMKLDNNCache(place_, this);
platform::ClearMKLDNNCache(place_, this);
#endif
}
......
......@@ -43,7 +43,7 @@ class ProfilerGuard {
void TotalCUDAAllocatedMemorySize(const platform::Place& place) {
if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto cuda_place = BOOST_GET_CONST(platform::CUDAPlace, place);
auto cuda_place = place;
cost_info_->device_memory_bytes =
platform::RecordedGpuMallocSize(cuda_place.device);
#endif
......
......@@ -22,7 +22,7 @@ namespace framework {
size_t OpKernelType::Hash::operator()(const OpKernelType& key) const {
int cur_loc = 0;
int place = key.place_.which();
int place = static_cast<int>(key.place_.GetType());
cur_loc += OpKernelType::kPlaceBits;
int data_type = static_cast<int>(key.data_type_) << cur_loc;
......
......@@ -27,7 +27,7 @@ TEST(OpKernelType, ToString) {
LibraryType::kCUDNN);
ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type),
"data_type[float]:data_layout[NCHW]:place[CPUPlace]:library_type["
"data_type[float]:data_layout[NCHW]:place[Place(cpu)]:library_type["
"CUDNN]");
using CUDAPlace = paddle::platform::CUDAPlace;
......@@ -35,7 +35,7 @@ TEST(OpKernelType, ToString) {
LibraryType::kCUDNN);
ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type2),
"data_type[::paddle::platform::float16]:data_layout[NCHW]:place["
"CUDAPlace(0)]:library_"
"Place(gpu:0)]:library_"
"type[CUDNN]");
}
......
......@@ -210,7 +210,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
"reinstall Paddle with CUDA support.",
place));
#else
auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
auto dev_id = place.device;
platform::SetDeviceId(dev_id);
#endif
} else if (platform::is_xpu_place(place)) {
......@@ -220,7 +220,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
"reinstall Paddle with XPU support.",
place));
#else
auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device;
auto dev_id = place.device;
platform::SetXPUDeviceId(dev_id);
#endif
} else if (platform::is_npu_place(place)) {
......@@ -230,7 +230,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
"reinstall Paddle with NPU support.",
place));
#else
auto dev_id = BOOST_GET_CONST(platform::NPUPlace, place).device;
auto dev_id = place.device;
platform::SetNPUDeviceId(dev_id);
#endif
} else if (platform::is_mlu_place(place)) {
......@@ -240,7 +240,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
"reinstall Paddle with MLU support.",
place));
#else
auto dev_id = BOOST_GET_CONST(platform::MLUPlace, place).device;
auto dev_id = place.device;
platform::SetMLUDeviceId(dev_id);
#endif
}
......@@ -1330,7 +1330,7 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
}
#endif
#ifdef PADDLE_WITH_XPU
if (is_xpu_place(expected_kernel_key.place_) &&
if (platform::is_xpu_place(expected_kernel_key.place_) &&
(kernel_iter == kernels.end() ||
!paddle::platform::is_xpu_support_op(type_, expected_kernel_key) ||
paddle::platform::is_in_xpu_black_list(type_))) {
......@@ -1343,7 +1343,7 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
#endif
#ifdef PADDLE_WITH_ASCEND_CL
if (kernel_iter == kernels.end() &&
is_npu_place(expected_kernel_key.place_)) {
platform::is_npu_place(expected_kernel_key.place_)) {
VLOG(3) << "missing NPU kernel: " << type_
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
......@@ -1353,7 +1353,7 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
#endif
#ifdef PADDLE_WITH_MLU
if (kernel_iter == kernels.end() &&
is_mlu_place(expected_kernel_key.place_)) {
platform::is_mlu_place(expected_kernel_key.place_)) {
VLOG(3) << "missing MLU kernel: " << type_
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
......
......@@ -500,11 +500,9 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (IsFastEagerDeletionModeEnabled()) {
gc.reset(new UnsafeFastGPUGarbageCollector(
BOOST_GET_CONST(platform::CUDAPlace, place), max_memory_size));
gc.reset(new UnsafeFastGPUGarbageCollector(place, max_memory_size));
} else {
gc.reset(new StreamGarbageCollector(
BOOST_GET_CONST(platform::CUDAPlace, place), max_memory_size));
gc.reset(new StreamGarbageCollector(place, max_memory_size));
}
VLOG(10) << "Created " << i << "-th GarbageCollector at " << place;
#else
......@@ -515,11 +513,9 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
} else if (platform::is_mlu_place(place)) {
#ifdef PADDLE_WITH_MLU
if (IsFastEagerDeletionModeEnabled()) {
gc.reset(new MLUUnsafeFastGarbageCollector(
BOOST_GET_CONST(platform::MLUPlace, place), max_memory_size));
gc.reset(new MLUUnsafeFastGarbageCollector(place, max_memory_size));
} else {
gc.reset(new MLUStreamGarbageCollector(
BOOST_GET_CONST(platform::MLUPlace, place), max_memory_size));
gc.reset(new MLUStreamGarbageCollector(place, max_memory_size));
}
VLOG(10) << "Created " << i << "-th GarbageCollector at " << place;
#else
......@@ -529,8 +525,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
#endif
} else if (platform::is_xpu_place(place)) {
#if defined(PADDLE_WITH_XPU)
gc.reset(new XPUGarbageCollector(
BOOST_GET_CONST(platform::XPUPlace, place), max_memory_size));
gc.reset(new XPUGarbageCollector(place, max_memory_size));
VLOG(10) << "Created " << i << "-th GarbageCollector at " << place;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
......@@ -538,8 +533,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
"Please recompile or reinstall Paddle with XPU support."));
#endif
} else if (platform::is_cpu_place(place)) {
gc.reset(new CPUGarbageCollector(
BOOST_GET_CONST(platform::CPUPlace, place), max_memory_size));
gc.reset(new CPUGarbageCollector(place, max_memory_size));
VLOG(10) << "Created GarbageCollector at " << place;
} else {
PADDLE_THROW(platform::errors::PreconditionNotMet(
......@@ -609,10 +603,9 @@ void InitP2P(const std::vector<platform::Place> &places) {
std::vector<int> devices;
for (int i = 0; i < count; i++) {
if (!is_gpu_place(places[i])) return;
if (!platform::is_gpu_place(places[i])) return;
platform::CUDAPlace device =
BOOST_GET_CONST(platform::CUDAPlace, places[i]);
platform::CUDAPlace device = places[i];
devices.push_back(device.GetDeviceId());
}
......@@ -655,9 +648,9 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
const BuildStrategy &build_strategy,
ir::Graph *graph)
: member_(new ParallelExecutorPrivate(places, scope)) {
PADDLE_ENFORCE(places.size() > 0 && !is_npu_place(places[0]),
platform::errors::Unavailable(
"NPU is not supported in ParallelExecutor"));
PADDLE_ENFORCE_EQ(places.size() > 0 && !platform::is_npu_place(places[0]),
true, platform::errors::Unavailable(
"NPU is not supported in ParallelExecutor."));
InitP2P(places);
ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_),
member_->places_.size());
......
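The garbage-collector constructors used by ParallelExecutor (and, further down, by SectionWorker and Tracer) now take the generic place directly instead of a BOOST_GET_CONST-extracted CUDAPlace/XPUPlace/CPUPlace. Below is a sketch of a constructor widened this way, with a mock GarbageCollector hierarchy standing in for Paddle's classes.

```cpp
#include <cstddef>
#include <iostream>
#include <memory>

enum class AllocationType { CPU, GPU };

struct Place {
  AllocationType type;
  int device = 0;
};

// Mock base class: one constructor signature that accepts the generic place,
// so call sites can forward `place` for any backend.
class GarbageCollector {
 public:
  GarbageCollector(const Place& place, std::size_t max_memory_size)
      : place_(place), max_memory_size_(max_memory_size) {}
  virtual ~GarbageCollector() = default;

 protected:
  Place place_;
  std::size_t max_memory_size_;
};

class UnsafeFastGPUGarbageCollector : public GarbageCollector {
 public:
  using GarbageCollector::GarbageCollector;  // inherit the Place-based ctor
};

int main() {
  Place gpu0{AllocationType::GPU, 0};
  std::unique_ptr<GarbageCollector> gc;
  // Same call shape as the diff:
  //   gc.reset(new UnsafeFastGPUGarbageCollector(place, max_memory_size));
  gc.reset(new UnsafeFastGPUGarbageCollector(gpu0, /*max_memory_size=*/0));
  std::cout << "created gc for gpu:0\n";
}
```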
......@@ -135,13 +135,11 @@ void PullDenseWorker::Wait(std::vector<::std::future<int32_t>>* status_vec) {
LoDTensor* tensor = var->GetMutable<LoDTensor>();
float* w = tensor->data<float>();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, places_[i]), w,
platform::CUDAPinnedPlace(), pin_w,
memory::Copy(places_[i], w, platform::CUDAPinnedPlace(), pin_w,
sizeof(float) * tensor->numel(), copy_streams_[i]);
#endif
#ifdef PADDLE_WITH_XPU
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, places_[i]), w,
platform::CPUPlace(), pin_w,
memory::Copy(places_[i], w, platform::CPUPlace(), pin_w,
sizeof(float) * tensor->numel());
#endif
}
......
......@@ -224,23 +224,20 @@ void SectionWorker::TrainFiles() {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(place_)) {
if (IsFastEagerDeletionModeEnabled()) {
gc.reset(new UnsafeFastGPUGarbageCollector(
BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size));
gc.reset(new UnsafeFastGPUGarbageCollector(place_, max_memory_size));
}
}
#elif defined(PADDLE_WITH_ASCEND_CL)
if (IsFastEagerDeletionModeEnabled()) {
VLOG(4) << "Use unsafe fast gc for NPU.";
gc.reset(new NPUUnsafeFastGarbageCollector(
BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size));
gc.reset(new NPUUnsafeFastGarbageCollector(place_, max_memory_size));
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Please set FLAGS_fast_eager_deletion_mode=true to use "
"GarbageCollector on NPU."));
// TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector.
VLOG(4) << "Use default stream gc for NPU.";
gc.reset(new NPUDefaultStreamGarbageCollector(
BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size));
gc.reset(new NPUDefaultStreamGarbageCollector(place_, max_memory_size));
}
#endif
} // max_memory_size >= 0
......
......@@ -25,13 +25,7 @@ limitations under the License. */
#include "paddle/fluid/framework/rw_lock.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/memory/memcpy.h"
namespace paddle {
namespace platform {
class DeviceContext;
class Place;
} // namespace platform
} // namespace paddle
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace framework {
......
......@@ -153,14 +153,12 @@ void TensorFromArray(const T* src, const size_t& array_size,
auto size = array_size * sizeof(T);
if (platform::is_cpu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
src_place, src_ptr, size);
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_gpu_place(dst_place)) { // NOLINT
memory::Copy(
BOOST_GET_CONST(platform::CUDAPlace, dst_place), dst_ptr, src_place,
src_ptr, size,
dst_place, dst_ptr, src_place, src_ptr, size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
}
#endif
......@@ -176,8 +174,7 @@ void TensorFromArray(const T* src, const size_t& array_size,
// 2. async copy npu pinned tensor -> npu tensor
memory::Copy(
BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
npu_pinned_place, npu_pinned_ptr, size,
dst_place, dst_ptr, npu_pinned_place, npu_pinned_ptr, size,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
// 3. record event
......@@ -205,14 +202,12 @@ void TensorFromVector(const std::vector<T>& src,
auto size = src.size() * sizeof(T);
if (platform::is_cpu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
src_place, src_ptr, size);
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_gpu_place(dst_place)) { // NOLINT
memory::Copy(
BOOST_GET_CONST(platform::CUDAPlace, dst_place), dst_ptr, src_place,
src_ptr, size,
dst_place, dst_ptr, src_place, src_ptr, size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
}
#endif
......@@ -233,8 +228,7 @@ void TensorFromVector(const std::vector<T>& src,
// 2. async copy npu pinned tensor -> npu tensor
memory::Copy(
BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
npu_pinned_place, npu_pinned_ptr, size,
dst_place, dst_ptr, npu_pinned_place, npu_pinned_ptr, size,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
// 3. record event
......@@ -252,8 +246,7 @@ void TensorFromVector(const std::vector<T>& src,
#ifdef PADDLE_WITH_MLU
if (platform::is_mlu_place(dst_place)) {
memory::Copy(
BOOST_GET_CONST(platform::MLUPlace, dst_place), dst_ptr, src_place,
src_ptr, size,
dst_place, dst_ptr, src_place, src_ptr, size,
reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream());
}
#endif
......@@ -280,14 +273,12 @@ inline void TensorFromVector(const std::vector<bool>& src,
auto size = src.size() * sizeof(bool);
if (platform::is_cpu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
src_place, src_ptr, size);
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
#ifdef PADDLE_WITH_CUDA
else if (platform::is_gpu_place(dst_place)) { // NOLINT
memory::Copy(
BOOST_GET_CONST(platform::CUDAPlace, dst_place), dst_ptr, src_place,
src_ptr, size,
dst_place, dst_ptr, src_place, src_ptr, size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
}
#endif
......@@ -303,8 +294,7 @@ inline void TensorFromVector(const std::vector<bool>& src,
// 2. async copy npu pinned tensor -> npu tensor
memory::Copy(
BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
npu_pinned_place, npu_pinned_ptr, size,
dst_place, dst_ptr, npu_pinned_place, npu_pinned_ptr, size,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
// 3. record event
......@@ -362,37 +352,29 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx,
auto dst_ptr = static_cast<void*>(dst->data());
if (platform::is_cpu_place(src.place())) {
memory::Copy(dst_place, dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr,
size);
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_gpu_place(src.place())) { // NOLINT
memory::Copy(
dst_place, dst_ptr, BOOST_GET_CONST(platform::CUDAPlace, src.place()),
src_ptr, size,
dst_place, dst_ptr, src.place(), src_ptr, size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
}
#endif
#if defined(PADDLE_WITH_XPU)
else if (platform::is_xpu_place(src.place())) { // NOLINT
memory::Copy(dst_place, dst_ptr,
BOOST_GET_CONST(platform::XPUPlace, src.place()), src_ptr,
size);
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
else if (platform::is_npu_place(src.place())) { // NOLINT
memory::Copy(dst_place, dst_ptr,
BOOST_GET_CONST(platform::NPUPlace, src.place()), src_ptr,
size, nullptr);
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
}
#endif
#ifdef PADDLE_WITH_MLU
else if (platform::is_mlu_place(src.place())) { // NOLINT
memory::Copy(
dst_place, dst_ptr, BOOST_GET_CONST(platform::MLUPlace, src.place()),
src_ptr, size,
dst_place, dst_ptr, src.place(), src_ptr, size,
reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream());
}
#endif
......@@ -412,37 +394,29 @@ inline void TensorToVector(const Tensor& src,
auto dst_ptr = static_cast<void*>(array);
if (platform::is_cpu_place(src.place())) {
memory::Copy(dst_place, dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr,
size);
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_gpu_place(src.place())) { // NOLINT
memory::Copy(
dst_place, dst_ptr, BOOST_GET_CONST(platform::CUDAPlace, src.place()),
src_ptr, size,
dst_place, dst_ptr, src.place(), src_ptr, size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
}
#endif
#if defined(PADDLE_WITH_XPU)
else if (platform::is_xpu_place(src.place())) { // NOLINT
memory::Copy(dst_place, dst_ptr,
BOOST_GET_CONST(platform::XPUPlace, src.place()), src_ptr,
size);
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
else if (platform::is_npu_place(src.place())) { // NOLINT
memory::Copy(dst_place, dst_ptr,
BOOST_GET_CONST(platform::NPUPlace, src.place()), src_ptr,
size, nullptr);
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
}
#endif
#ifdef PADDLE_WITH_MLU
else if (platform::is_mlu_place(src.place())) { // NOLINT
memory::Copy(
dst_place, dst_ptr, BOOST_GET_CONST(platform::MLUPlace, src.place()),
src_ptr, size,
dst_place, dst_ptr, src.place(), src_ptr, size,
reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream());
}
#endif
......@@ -467,8 +441,7 @@ void TensorToVector(const Tensor& src, std::vector<T>* dst) {
"The input tensor should be CPU device, but actually it is in %s.",
src.place()));
memory::Copy(dst_place, dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr, size);
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
}
template <>
......@@ -488,8 +461,7 @@ inline void TensorToVector(const Tensor& src, std::vector<bool>* dst) {
"The input tensor should be CPU device, but actually it is in %s.",
src.place()));
memory::Copy(dst_place, dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr, size);
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
for (unsigned int i = 0; i < src.numel(); i++) {
(*dst)[i] = static_cast<bool>(array[i]);
......
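Throughout tensor_util.h the memory::Copy calls now hand over the generic src/dst places, so a single call shape serves the CPU, GPU, XPU, NPU and MLU branches. The sketch below mimics that unified signature with a host-only mock Copy; the real paddle::memory::Copy also has stream-taking overloads for asynchronous device transfers.

```cpp
#include <cstring>
#include <iostream>
#include <vector>

enum class AllocationType { CPU, GPU };

struct Place {
  AllocationType type;
  int device = 0;
};

// Mock of the unified copy entry point: one signature taking two generic places.
void Copy(const Place& dst_place, void* dst, const Place& src_place,
          const void* src, std::size_t n) {
  if (dst_place.type == AllocationType::CPU &&
      src_place.type == AllocationType::CPU) {
    std::memcpy(dst, src, n);  // host path of this sketch
  }
  // A real implementation would branch to cudaMemcpyAsync or the XPU/NPU/MLU
  // equivalents for the device-side place types.
}

int main() {
  std::vector<float> src{1.f, 2.f, 3.f}, dst(3);
  Place cpu{AllocationType::CPU};
  // Call sites no longer spell out BOOST_GET_CONST(platform::CPUPlace, ...).
  Copy(cpu, dst.data(), cpu, src.data(), src.size() * sizeof(float));
  std::cout << dst[0] << " " << dst[1] << " " << dst[2] << "\n";  // 1 2 3
}
```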
......@@ -86,7 +86,7 @@ void BKCLParallelContext::Init() {
}
BcastBKCLId(bkcl_ids, 0);
int xpu_id = BOOST_GET_CONST(platform::XPUPlace, place_).device;
int xpu_id = place_.device;
for (int ring_id = 0; ring_id < strategy_.nrings_; ring_id++) {
VLOG(0) << "init BKCL context nranks: " << strategy_.nranks_
<< " local rank: " << strategy_.local_rank_ << " xpu id: " << xpu_id
......@@ -111,7 +111,7 @@ void BKCLParallelContext::InitWithRingID(int ring_id) {
}
BcastBKCLId(bkcl_ids, 0);
int xpu_id = BOOST_GET_CONST(platform::XPUPlace, place_).device;
int xpu_id = place_.device;
VLOG(0) << "init BKCL context nranks: " << strategy_.nranks_
<< " local rank: " << strategy_.local_rank_ << " xpu id: " << xpu_id
<< " ring id: " << ring_id;
......
......@@ -78,7 +78,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
TensorAddFunctor(int64_t numel, const T* x, T* y)
: numel_(numel), x_(x), y_(y) {}
void operator()(const platform::CPUPlace& place) {
void operator()(const platform::CPUPlace& place) const {
platform::CPUDeviceContext* ctx = dynamic_cast<platform::CPUDeviceContext*>(
platform::DeviceContextPool::Instance().Get(place));
auto blas = operators::math::GetBlas<platform::CPUDeviceContext, T>(*ctx);
......@@ -86,7 +86,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
}
#ifdef PADDLE_WITH_XPU
void operator()(const platform::XPUPlace& place) {
void operator()(const platform::XPUPlace& place) const {
using XPUType = typename XPUTypeTrait<T>::Type;
platform::XPUDeviceContext* ctx = dynamic_cast<platform::XPUDeviceContext*>(
platform::DeviceContextPool::Instance().Get(place));
......@@ -100,7 +100,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
r, XPUAPIErrorMsg[r]));
}
#else
void operator()(const platform::XPUPlace& place) {
void operator()(const platform::XPUPlace& place) const {
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
......@@ -109,7 +109,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void operator()(const platform::CUDAPlace& place) {
void operator()(const platform::CUDAPlace& place) const {
platform::CUDADeviceContext* ctx =
dynamic_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(place));
......@@ -117,7 +117,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
blas.AXPY(numel_, 1., x_, y_);
}
#else
void operator()(const platform::CUDAPlace& place) {
void operator()(const platform::CUDAPlace& place) const {
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
......@@ -126,7 +126,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
#endif
#ifdef PADDLE_WITH_MLU
void operator()(const platform::MLUPlace& place) {
void operator()(const platform::MLUPlace& place) const {
// TODO(fwg): SUPPORT it
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
......@@ -134,7 +134,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
place));
}
#else
void operator()(const platform::MLUPlace& place) {
void operator()(const platform::MLUPlace& place) const {
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
......@@ -143,7 +143,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
#endif
#ifdef PADDLE_WITH_ASCEND_CL
void operator()(const platform::NPUPlace& place) {
void operator()(const platform::NPUPlace& place) const {
// TODO(zhiqiu): SUPPORT it
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
......@@ -151,7 +151,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
place));
}
#else
void operator()(const platform::NPUPlace& place) {
void operator()(const platform::NPUPlace& place) const {
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
......@@ -159,21 +159,21 @@ class TensorAddFunctor : public boost::static_visitor<> {
}
#endif
void operator()(const platform::NPUPinnedPlace& place) {
void operator()(const platform::NPUPinnedPlace& place) const {
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
// there is NO blas in CUDAPinnedPlace
void operator()(const platform::CUDAPinnedPlace& place) {
void operator()(const platform::CUDAPinnedPlace& place) const {
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
// there is NO support in IPUPlace
void operator()(const platform::IPUPlace& place) {
void operator()(const platform::IPUPlace& place) const {
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
......@@ -183,7 +183,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
private:
int64_t numel_;
const T* x_;
T* y_;
mutable T* y_;
};
#ifdef PADDLE_WITH_XPU
......@@ -248,7 +248,7 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) {
TensorAddFunctor<cpp_type> func( \
numel, src_tensor.data<cpp_type>(), \
dst_tensor->mutable_data<cpp_type>(place)); \
boost::apply_visitor(func, place); \
platform::VisitPlace(place, func); \
return; \
}
......
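TensorAddFunctor's operator() overloads become const (with y_ made mutable) because dispatch moves from boost::apply_visitor over the old variant place to platform::VisitPlace over the unified place. The sketch below reproduces that shape with a hand-rolled VisitPlace stand-in; it is not the real helper, only the same const-visitor pattern.

```cpp
#include <iostream>

enum class AllocationType { CPU, GPU };

struct CPUPlace {};
struct CUDAPlace { int device = 0; };

struct Place {
  AllocationType type;
  int device = 0;
};

// Hand-rolled stand-in for platform::VisitPlace: it maps the unified place
// back to a typed place and invokes the matching const overload.
template <typename Visitor>
void VisitPlace(const Place& place, const Visitor& visitor) {
  switch (place.type) {
    case AllocationType::CPU:
      visitor(CPUPlace{});
      break;
    case AllocationType::GPU:
      visitor(CUDAPlace{place.device});
      break;
  }
}

// Same shape as TensorAddFunctor after the diff: const call operators,
// with mutable members for any state the functor must still modify.
struct TensorAddSketch {
  void operator()(const CPUPlace&) const {
    std::cout << "would run the CPU blas AXPY path\n";
  }
  void operator()(const CUDAPlace& p) const {
    std::cout << "would run the CUDA blas AXPY path on device " << p.device << "\n";
  }
};

int main() {
  TensorAddSketch func;
  VisitPlace(Place{AllocationType::GPU, 0}, func);  // was boost::apply_visitor(func, place)
}
```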
......@@ -86,7 +86,7 @@ void HCCLParallelContext::Init() {
}
BcastHCCLId(hccl_ids, 0, server_fd);
int npu_id = BOOST_GET_CONST(platform::NPUPlace, place_).device;
int npu_id = place_.device;
for (int ring_id = 0; ring_id < strategy_.nrings_; ring_id++) {
VLOG(0) << "init hccl context nranks: " << strategy_.nranks_
<< " local rank: " << strategy_.local_rank_ << " npu id: " << npu_id
......@@ -96,10 +96,10 @@ void HCCLParallelContext::Init() {
&hccl_ids[ring_id], strategy_.nranks_, strategy_.local_rank_, npu_id,
ring_id);
compute_events_.emplace_back(platform::NpuEventResourcePool::Instance().New(
BOOST_GET_CONST(platform::NPUPlace, place_).device));
comm_events_.emplace_back(platform::NpuEventResourcePool::Instance().New(
BOOST_GET_CONST(platform::NPUPlace, place_).device));
compute_events_.emplace_back(
platform::NpuEventResourcePool::Instance().New(place_.device));
comm_events_.emplace_back(
platform::NpuEventResourcePool::Instance().New(place_.device));
}
}
......@@ -117,7 +117,7 @@ void HCCLParallelContext::InitWithRingID(int ring_id) {
}
BcastHCCLId(hccl_ids, 0, server_fd);
int npu_id = BOOST_GET_CONST(platform::NPUPlace, place_).device;
int npu_id = place_.device;
VLOG(0) << "init hccl context nranks: " << strategy_.nranks_
<< " local rank: " << strategy_.local_rank_ << " npu id: " << npu_id
<< " ring id: " << ring_id;
......@@ -125,10 +125,10 @@ void HCCLParallelContext::InitWithRingID(int ring_id) {
platform::HCCLCommContext::Instance().CreateHCCLComm(
&hccl_ids[0], strategy_.nranks_, strategy_.local_rank_, npu_id, ring_id);
compute_events_.emplace_back(platform::NpuEventResourcePool::Instance().New(
BOOST_GET_CONST(platform::NPUPlace, place_).device));
comm_events_.emplace_back(platform::NpuEventResourcePool::Instance().New(
BOOST_GET_CONST(platform::NPUPlace, place_).device));
compute_events_.emplace_back(
platform::NpuEventResourcePool::Instance().New(place_.device));
comm_events_.emplace_back(
platform::NpuEventResourcePool::Instance().New(place_.device));
}
void HCCLParallelContext::AllReduceByStream(const framework::Variable &src,
......
......@@ -193,7 +193,7 @@ void VarBase::ClearGradient(bool set_to_zero) {
grad_var_->MutableVar()->GetMutable<framework::SelectedRows>();
if (grad_t->mutable_value()->IsInitialized()) {
#ifdef PADDLE_WITH_MKLDNN
if (FLAGS_use_mkldnn) ClearMKLDNNCache(grad_t->place());
if (FLAGS_use_mkldnn) platform::ClearMKLDNNCache(grad_t->place());
#endif
grad_t->mutable_rows()->clear();
grad_t->mutable_value()->clear();
......@@ -211,7 +211,7 @@ void VarBase::ClearGradient(bool set_to_zero) {
grad_t->clear();
}
#ifdef PADDLE_WITH_MKLDNN
if (FLAGS_use_mkldnn) ClearMKLDNNCache(grad_t->place());
if (FLAGS_use_mkldnn) platform::ClearMKLDNNCache(grad_t->place());
#endif
}
}
......
......@@ -77,7 +77,7 @@ void NCCLParallelContext::Init() {
}
BcastNCCLId(nccl_ids, 0, server_fd);
int gpu_id = BOOST_GET_CONST(platform::CUDAPlace, place_).device;
int gpu_id = place_.device;
for (int ring_id = 0; ring_id < strategy_.nrings_; ring_id++) {
VLOG(0) << "init nccl context nranks: " << strategy_.nranks_
<< " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id
......@@ -88,10 +88,9 @@ void NCCLParallelContext::Init() {
ring_id);
compute_events_.emplace_back(
platform::CudaEventResourcePool::Instance().New(
BOOST_GET_CONST(platform::CUDAPlace, place_).device));
comm_events_.emplace_back(platform::CudaEventResourcePool::Instance().New(
BOOST_GET_CONST(platform::CUDAPlace, place_).device));
platform::CudaEventResourcePool::Instance().New(place_.device));
comm_events_.emplace_back(
platform::CudaEventResourcePool::Instance().New(place_.device));
}
}
......@@ -111,7 +110,7 @@ void NCCLParallelContext::InitWithRingID(int ring_id) {
}
BcastNCCLId(nccl_ids, 0, server_fd);
int gpu_id = BOOST_GET_CONST(platform::CUDAPlace, place_).device;
int gpu_id = place_.device;
VLOG(0) << "init nccl context nranks: " << strategy_.nranks_
<< " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id
<< " ring id: " << ring_id;
......@@ -119,10 +118,10 @@ void NCCLParallelContext::InitWithRingID(int ring_id) {
platform::NCCLCommContext::Instance().CreateComm(
&nccl_ids[0], strategy_.nranks_, strategy_.local_rank_, gpu_id, ring_id);
compute_events_.emplace_back(platform::CudaEventResourcePool::Instance().New(
BOOST_GET_CONST(platform::CUDAPlace, place_).device));
comm_events_.emplace_back(platform::CudaEventResourcePool::Instance().New(
BOOST_GET_CONST(platform::CUDAPlace, place_).device));
compute_events_.emplace_back(
platform::CudaEventResourcePool::Instance().New(place_.device));
comm_events_.emplace_back(
platform::CudaEventResourcePool::Instance().New(place_.device));
}
void NCCLParallelContext::AllReduceByStream(const framework::Variable &src,
......
......@@ -194,7 +194,7 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
auto& kernels = kernels_iter->second;
auto kernel_iter = kernels.find(expected_kernel_key);
#ifdef PADDLE_WITH_XPU
if (is_xpu_place(expected_kernel_key.place_) &&
if (paddle::platform::is_xpu_place(expected_kernel_key.place_) &&
(kernel_iter == kernels.end() ||
!paddle::platform::is_xpu_support_op(op.Type(), expected_kernel_key) ||
paddle::platform::is_in_xpu_black_list(op.Type()))) {
......@@ -207,7 +207,7 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
#endif
#ifdef PADDLE_WITH_ASCEND_CL
if (kernel_iter == kernels.end() &&
is_npu_place(expected_kernel_key.place_)) {
paddle::platform::is_npu_place(expected_kernel_key.place_)) {
VLOG(3) << "missing NPU kernel: " << op.Type()
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
......@@ -217,7 +217,7 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
#endif
#ifdef PADDLE_WITH_MLU
if (kernel_iter == kernels.end() &&
is_mlu_place(expected_kernel_key.place_)) {
paddle::platform::is_mlu_place(expected_kernel_key.place_)) {
VLOG(3) << "missing MLU kernel: " << op.Type()
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
......
......@@ -835,7 +835,7 @@ void Reducer::MarkGroupReady(size_t group_index) {
// thrown in comm_pool_.
auto next_group = next_group_;
comm_pool_->enqueue([this, run_order, next_group, &group] {
auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place_).device;
auto dev_id = place_.device;
platform::SetXPUDeviceId(dev_id);
FusedAllReduceSchedule(run_order, group, next_group);
{
......
......@@ -87,8 +87,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists(
std::unique_ptr<framework::GarbageCollector> gc;
if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
gc.reset(new framework::DefaultStreamGarbageCollector(
BOOST_GET_CONST(platform::CUDAPlace, place), 0));
gc.reset(new framework::DefaultStreamGarbageCollector(place, 0));
VLOG(10) << "Created GarbageCollector at " << place;
#else
......@@ -98,8 +97,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists(
#endif
} else if (platform::is_cuda_pinned_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
gc.reset(new framework::CUDAPinnedGarbageCollector(
BOOST_GET_CONST(platform::CUDAPinnedPlace, place), 0));
gc.reset(new framework::CUDAPinnedGarbageCollector(place, 0));
VLOG(10) << "Created GarbageCollector at " << place;
#else
......@@ -110,8 +108,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists(
#endif
} else if (platform::is_xpu_place(place)) {
#if defined(PADDLE_WITH_XPU)
gc.reset(new framework::XPUGarbageCollector(
BOOST_GET_CONST(platform::XPUPlace, place), 0));
gc.reset(new framework::XPUGarbageCollector(place, 0));
VLOG(10) << "Created GarbageCollector at " << place;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
......@@ -119,14 +116,12 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists(
"Please recompile or reinstall Paddle with XPU support."));
#endif
} else if (platform::is_cpu_place(place)) {
gc.reset(new framework::CPUGarbageCollector(
BOOST_GET_CONST(platform::CPUPlace, place), 0));
gc.reset(new framework::CPUGarbageCollector(place, 0));
VLOG(10) << "Created GarbageCollector at " << place;
} else if (platform::is_npu_place(place)) {
#if defined(PADDLE_WITH_ASCEND_CL)
// TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector.
gc.reset(new framework::NPUUnsafeFastGarbageCollector(
BOOST_GET_CONST(platform::NPUPlace, place), 0));
gc.reset(new framework::NPUUnsafeFastGarbageCollector(place, 0));
VLOG(10) << "Created GarbageCollector at " << place;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
......@@ -135,8 +130,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists(
#endif
} else if (platform::is_mlu_place(place)) {
#if defined(PADDLE_WITH_MLU)
gc.reset(new framework::MLUDefaultStreamGarbageCollector(
BOOST_GET_CONST(platform::MLUPlace, place), 0));
gc.reset(new framework::MLUDefaultStreamGarbageCollector(place, 0));
VLOG(10) << "Created GarbageCollector at " << place;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
......@@ -197,31 +191,28 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins,
try {
if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
platform::SetDeviceId(BOOST_GET_CONST(platform::CUDAPlace, place).device);
platform::SetDeviceId(place.device);
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with GPU if use CUDAPlace."));
#endif
} else if (platform::is_xpu_place(place)) {
#ifdef PADDLE_WITH_XPU
platform::SetXPUDeviceId(
BOOST_GET_CONST(platform::XPUPlace, place).device);
platform::SetXPUDeviceId(place.device);
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with XPU if use XPUPlace."));
#endif
} else if (platform::is_npu_place(place)) {
#ifdef PADDLE_WITH_ASCEND_CL
platform::SetNPUDeviceId(
BOOST_GET_CONST(platform::NPUPlace, place).device);
platform::SetNPUDeviceId(place.device);
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU if use NPUPlace."));
#endif
} else if (platform::is_mlu_place(place)) {
#ifdef PADDLE_WITH_MLU
platform::SetMLUDeviceId(
BOOST_GET_CONST(platform::MLUPlace, place).device);
platform::SetMLUDeviceId(place.device);
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with MLU if use MLUPlace."));
......
......@@ -127,7 +127,7 @@ bool PaddleTensorToLoDTensor(const PaddleTensor &pt, framework::LoDTensor *t,
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto *dev_ctx =
static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place);
auto dst_gpu_place = place;
memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
platform::CPUPlace(), pt.data.data(), pt.data.length(),
dev_ctx->stream());
......@@ -137,7 +137,7 @@ bool PaddleTensorToLoDTensor(const PaddleTensor &pt, framework::LoDTensor *t,
#endif
} else if (platform::is_xpu_place(place)) {
#ifdef PADDLE_WITH_XPU
auto dst_xpu_place = BOOST_GET_CONST(platform::XPUPlace, place);
auto dst_xpu_place = place;
memory::Copy(dst_xpu_place, static_cast<void *>(input_ptr),
platform::CPUPlace(), pt.data.data(), pt.data.length());
#else
......@@ -954,14 +954,14 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
// model.
res->SetPlace(PaddlePlace::kCPU);
} else {
auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_);
auto xpu_place = place_;
res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId());
}
} else if (platform::is_npu_place(place_)) {
auto npu_place = BOOST_GET_CONST(platform::NPUPlace, place_);
auto npu_place = place_;
res->SetPlace(PaddlePlace::kNPU, npu_place.GetDeviceId());
} else {
auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place_);
auto gpu_place = place_;
res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
}
return res;
......@@ -993,14 +993,14 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
// model.
res->SetPlace(PaddlePlace::kCPU);
} else {
auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_);
auto xpu_place = place_;
res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId());
}
} else if (platform::is_npu_place(place_)) {
auto npu_place = BOOST_GET_CONST(platform::NPUPlace, place_);
auto npu_place = place_;
res->SetPlace(PaddlePlace::kNPU, npu_place.GetDeviceId());
} else {
auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place_);
auto gpu_place = place_;
res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
}
return res;
......@@ -1050,7 +1050,7 @@ bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) {
if (stream != nullptr) {
paddle::platform::DeviceContextPool &pool =
paddle::platform::DeviceContextPool::Instance();
auto gpu_place = BOOST_GET_CONST(paddle::platform::CUDAPlace, place_);
auto gpu_place = place_;
auto *dev_ctx = reinterpret_cast<paddle::platform::CUDADeviceContext *>(
pool.Get(gpu_place));
dev_ctx->SetThreadLocalStream(stream);
......@@ -1065,7 +1065,7 @@ void AnalysisPredictor::CollectShapeRangeInfo() {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
paddle::platform::DeviceContextPool &pool =
paddle::platform::DeviceContextPool::Instance();
auto gpu_place = BOOST_GET_CONST(paddle::platform::CUDAPlace, place_);
auto gpu_place = place_;
auto *dev_ctx = static_cast<const paddle::platform::CUDADeviceContext *>(
pool.Get(gpu_place));
#ifdef PADDLE_WITH_HIP
......
......@@ -243,7 +243,7 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
platform::DeviceContextPool::Instance();
auto *dev_ctx =
static_cast<const platform::CUDADeviceContext *>(pool.Get(place_));
auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place_);
auto dst_gpu_place = place_;
memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
platform::CPUPlace(), inputs[i].data.data(),
inputs[i].data.length(), dev_ctx->stream());
......@@ -253,7 +253,7 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
#endif
} else if (platform::is_xpu_place(place_)) {
#ifdef PADDLE_WITH_XPU
auto dst_xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_);
auto dst_xpu_place = place_;
memory::Copy(dst_xpu_place, static_cast<void *>(input_ptr),
platform::CPUPlace(), inputs[i].data.data(),
inputs[i].data.length());
......@@ -267,7 +267,7 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
platform::DeviceContextPool::Instance();
auto *dev_ctx =
static_cast<const platform::NPUDeviceContext *>(pool.Get(place_));
auto dst_npu_place = BOOST_GET_CONST(platform::NPUPlace, place_);
auto dst_npu_place = place_;
memory::Copy(dst_npu_place, static_cast<void *>(input_ptr),
platform::CPUPlace(), inputs[i].data.data(),
inputs[i].data.length(), dev_ctx->stream());
......
......@@ -253,7 +253,7 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb,
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
paddle::platform::DeviceContextPool &pool =
paddle::platform::DeviceContextPool::Instance();
auto gpu_place = BOOST_GET_CONST(paddle::platform::CUDAPlace, t_place);
auto gpu_place = t_place;
auto *dev_ctx = static_cast<const paddle::platform::CUDADeviceContext *>(
pool.Get(gpu_place));
paddle::memory::Copy(paddle::platform::CPUPlace(),
......@@ -280,7 +280,7 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb,
#endif
} else if (place_ == PlaceType::kXPU) {
#ifdef PADDLE_WITH_XPU
auto xpu_place = BOOST_GET_CONST(paddle::platform::XPUPlace, t_place);
auto xpu_place = t_place;
paddle::memory::Copy(paddle::platform::CPUPlace(),
static_cast<void *>(data), xpu_place, t_data,
ele_num * sizeof(T));
......@@ -293,7 +293,7 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb,
#ifdef PADDLE_WITH_ASCEND_CL
paddle::platform::DeviceContextPool &pool =
paddle::platform::DeviceContextPool::Instance();
auto npu_place = BOOST_GET_CONST(paddle::platform::NPUPlace, t_place);
auto npu_place = t_place;
auto *dev_ctx = static_cast<const paddle::platform::NPUDeviceContext *>(
pool.Get(npu_place));
paddle::memory::Copy(paddle::platform::CPUPlace(),
......
......@@ -134,7 +134,7 @@ void MemoryCopyAsync(const platform::Place& dst_place, void* dst_data,
"Lite::MemoryCopy CPU->GPU is not yet implemented."));
} else if (platform::is_gpu_place(dst_place) &&
platform::is_gpu_place(src_place)) {
auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place);
auto gpu_place = src_place;
memory::Copy(
gpu_place, dst_data, gpu_place, src_data, size,
static_cast<const platform::CUDADeviceContext&>(ctx).stream());
......
......@@ -813,8 +813,7 @@ const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
}
#endif
platform::CUDAPlace cuda_place =
BOOST_GET_CONST(platform::CUDAPlace, place);
platform::CUDAPlace cuda_place(place.GetDeviceId());
return m_->GetAllocator(cuda_place, m_->GetDefaultStream(cuda_place));
}
#endif
......@@ -838,8 +837,7 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
}
#endif
platform::CUDAPlace cuda_place =
BOOST_GET_CONST(platform::CUDAPlace, place);
platform::CUDAPlace cuda_place(place.GetDeviceId());
return Alloc(cuda_place, size, m_->GetDefaultStream(cuda_place));
}
#endif
......@@ -859,8 +857,7 @@ uint64_t AllocatorFacade::Release(const platform::Place& place) {
}
#endif
platform::CUDAPlace cuda_place =
BOOST_GET_CONST(platform::CUDAPlace, place);
platform::CUDAPlace cuda_place(place.GetDeviceId());
return Release(cuda_place, m_->GetDefaultStream(cuda_place));
}
#endif
......@@ -935,7 +932,7 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
}
#endif
platform::CUDAPlace p = BOOST_GET_CONST(platform::CUDAPlace, place);
platform::CUDAPlace p(place.GetDeviceId());
if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) {
return m_->GetAllocator(p, stream, /* create_if_not_found = */ true)
->Allocate(size);
......
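Inside AllocatorFacade the typed place is no longer extracted with BOOST_GET_CONST but rebuilt as platform::CUDAPlace(place.GetDeviceId()), since the stream-keyed allocator lookups still want a concrete CUDA place. A small sketch of that conversion, using mock types rather than the real allocator code:

```cpp
#include <iostream>
#include <map>

enum class AllocationType { CPU, GPU };

// Mock of the unified place: keeps only the backend tag and the device id.
struct Place {
  AllocationType type;
  int device = 0;
  int GetDeviceId() const { return device; }
};

// Mock typed place, still used where containers are keyed by CUDAPlace.
struct CUDAPlace {
  explicit CUDAPlace(int id) : device(id) {}
  int device;
  bool operator<(const CUDAPlace& other) const { return device < other.device; }
};

int main() {
  std::map<CUDAPlace, const char*> allocators;
  allocators.emplace(CUDAPlace(0), "default-stream allocator for gpu:0");

  Place place{AllocationType::GPU, 0};
  // Mirrors the diff: rebuild the typed place from the generic one instead
  // of extracting it with BOOST_GET_CONST.
  CUDAPlace cuda_place(place.GetDeviceId());
  std::cout << allocators.at(cuda_place) << "\n";
}
```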
......@@ -19,12 +19,7 @@
#include <map>
#include "paddle/fluid/memory/allocation/allocator.h"
namespace paddle {
namespace platform {
class Place;
} // namespace platform
} // namespace paddle
#include "paddle/fluid/platform//place.h"
namespace paddle {
namespace memory {
......
......@@ -34,7 +34,7 @@ namespace allocation {
bool CUDAAllocator::IsAllocThreadSafe() const { return true; }
void CUDAAllocator::FreeImpl(pten::Allocation* allocation) {
PADDLE_ENFORCE_EQ(
BOOST_GET_CONST(platform::CUDAPlace, allocation->place()), place_,
allocation->place(), place_,
platform::errors::PermissionDenied(
"GPU memory is freed in incorrect device. This may be a bug"));
platform::RecordedGpuFree(allocation->ptr(), allocation->size(),
......
......@@ -144,8 +144,8 @@ class CUDADeviceContextAllocatorPool {
}
AllocationPtr Alloc(const platform::CUDADeviceContext &dev_ctx, size_t size) {
auto iter = allocators_.find(
BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()));
auto iter =
allocators_.find(platform::CUDAPlace(dev_ctx.GetPlace().GetDeviceId()));
PADDLE_ENFORCE_NE(
iter, allocators_.end(),
platform::errors::NotFound("No allocator found for CUDAPlace."));
......
......@@ -103,7 +103,7 @@ bool CUDAVirtualMemAllocator::IsAllocThreadSafe() const { return false; }
void CUDAVirtualMemAllocator::FreeImpl(pten::Allocation* allocation) {
PADDLE_ENFORCE_EQ(
BOOST_GET_CONST(platform::CUDAPlace, allocation->place()), place_,
allocation->place(), place_,
platform::errors::PermissionDenied(
"GPU memory is freed in incorrect device. This may be a bug"));
......
......@@ -26,6 +26,7 @@
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/string/split.h"
#include "paddle/pten/common/place.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
......@@ -791,7 +792,7 @@ size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const {
namespace allocation {
pten::Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) {
void *ptr = boost::apply_visitor(legacy::AllocVisitor(size), place_);
void *ptr = paddle::platform::VisitPlace(place_, legacy::AllocVisitor(size));
auto *tmp_alloc = new Allocation(ptr, size, place_);
platform::MemEvenRecorder::Instance().PushMemRecord(
static_cast<void *>(tmp_alloc), place_, size);
......@@ -799,16 +800,16 @@ pten::Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) {
}
void NaiveBestFitAllocator::FreeImpl(pten::Allocation *allocation) {
boost::apply_visitor(
legacy::FreeVisitor(allocation->ptr(), allocation->size()),
allocation->place());
paddle::platform::VisitPlace(
allocation->place(),
legacy::FreeVisitor(allocation->ptr(), allocation->size()));
platform::MemEvenRecorder::Instance().PopMemRecord(
static_cast<void *>(allocation), place_);
delete allocation;
}
uint64_t NaiveBestFitAllocator::ReleaseImpl(const platform::Place &place) {
return boost::apply_visitor(legacy::ReleaseVisitor(), place);
return paddle::platform::VisitPlace(place, legacy::ReleaseVisitor());
}
} // namespace allocation
......
......@@ -24,7 +24,7 @@ namespace allocation {
bool NPUAllocator::IsAllocThreadSafe() const { return true; }
void NPUAllocator::FreeImpl(pten::Allocation* allocation) {
PADDLE_ENFORCE_EQ(
BOOST_GET_CONST(platform::NPUPlace, allocation->place()), place_,
allocation->place(), place_,
platform::errors::PermissionDenied(
"NPU memory is freed in incorrect device. This may be a bug"));
platform::RecordedNPUFree(allocation->ptr(), allocation->size(),
......
......@@ -164,8 +164,7 @@ void StreamSafeCUDAAllocator::FreeImpl(pten::Allocation* allocation) {
uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) {
std::lock_guard<SpinLock> lock_guard(allocator_map_lock_);
std::vector<StreamSafeCUDAAllocator*>& allocators =
allocator_map_[BOOST_GET_CONST(platform::CUDAPlace, place)];
std::vector<StreamSafeCUDAAllocator*>& allocators = allocator_map_[place];
uint64_t released_size = 0;
for (StreamSafeCUDAAllocator* allocator : allocators) {
released_size += allocator->ProcessUnfreedAllocationsWithRelease();
......@@ -192,7 +191,7 @@ uint64_t StreamSafeCUDAAllocator::ProcessUnfreedAllocationsWithRelease() {
return underlying_allocator_->Release(place_);
}
std::map<platform::CUDAPlace, std::vector<StreamSafeCUDAAllocator*>>
std::map<platform::Place, std::vector<StreamSafeCUDAAllocator*>>
StreamSafeCUDAAllocator::allocator_map_;
SpinLock StreamSafeCUDAAllocator::allocator_map_lock_;
......
......@@ -65,7 +65,7 @@ class StreamSafeCUDAAllocator : public Allocator {
void ProcessUnfreedAllocations();
uint64_t ProcessUnfreedAllocationsWithRelease();
static std::map<platform::CUDAPlace, std::vector<StreamSafeCUDAAllocator *>>
static std::map<platform::Place, std::vector<StreamSafeCUDAAllocator *>>
allocator_map_;
static SpinLock allocator_map_lock_;
......
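StreamSafeCUDAAllocator's static allocator_map_ switches its key type from platform::CUDAPlace to platform::Place, which requires the unified place to be usable as a std::map key. The sketch below assumes an ordering by (type, device id); the real comparison operator may be defined differently.

```cpp
#include <iostream>
#include <map>
#include <tuple>
#include <vector>

enum class AllocationType { CPU, GPU };

struct Place {
  AllocationType type;
  int device = 0;
  // Assumed ordering so the unified place can key a std::map.
  bool operator<(const Place& other) const {
    return std::tie(type, device) < std::tie(other.type, other.device);
  }
};

struct AllocatorStub { const char* name; };

int main() {
  // Mirrors: std::map<platform::Place, std::vector<StreamSafeCUDAAllocator*>>
  std::map<Place, std::vector<AllocatorStub>> allocator_map;
  allocator_map[Place{AllocationType::GPU, 0}].push_back({"stream-safe gpu:0"});
  allocator_map[Place{AllocationType::GPU, 1}].push_back({"stream-safe gpu:1"});

  for (const auto& kv : allocator_map) {
    std::cout << "gpu:" << kv.first.device << " -> " << kv.second.size()
              << " allocator(s)\n";
  }
}
```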
......@@ -23,8 +23,7 @@ ThreadLocalAllocatorImpl::ThreadLocalAllocatorImpl(const platform::Place& p)
if (platform::is_gpu_place(place_)) {
buddy_allocator_.reset(new memory::detail::BuddyAllocator(
std::unique_ptr<memory::detail::SystemAllocator>(
new memory::detail::GPUAllocator(
BOOST_GET_CONST(platform::CUDAPlace, place_).device)),
new memory::detail::GPUAllocator(place_.device)),
platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()));
} else {
PADDLE_THROW(platform::errors::Unavailable(
......
......@@ -16,12 +16,6 @@
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
namespace paddle {
namespace platform {
struct CUDAPlace;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace operators {
using framework::Tensor;
......
......@@ -25,8 +25,7 @@ struct GetTensorValue<platform::CUDADeviceContext, T> {
const framework::Tensor& tensor) const {
const T* data = tensor.data<T>();
T value;
const auto gpu_place =
BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace());
const auto gpu_place = dev_ctx.GetPlace();
memory::Copy(platform::CPUPlace(), &value, gpu_place, data, sizeof(T),
dev_ctx.stream());
return value;
......
......@@ -117,9 +117,8 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel<T> {
h_starts[i] = h_starts[i - 1] + xs[i - 1]->numel();
}
int64_t total_num = h_starts[xs_size];
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()),
d_starts, cpu_place, h_starts, (xs_size + 1) * sizeof(int64_t),
dev_ctx.stream());
memory::Copy(dev_ctx.GetPlace(), d_starts, cpu_place, h_starts,
(xs_size + 1) * sizeof(int64_t), dev_ctx.stream());
// copy each tensor's data address to device
auto h_mem = memory::Alloc(cpu_place, 2 * xs_size * sizeof(T*));
......@@ -134,8 +133,8 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel<T> {
h_xs[i] = xs[i]->data<T>();
h_outs[i] = outs[i]->mutable_data<T>(dev_ctx.GetPlace());
}
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), d_xs,
cpu_place, h_xs, 2 * xs_size * sizeof(T*), dev_ctx.stream());
memory::Copy(dev_ctx.GetPlace(), d_xs, cpu_place, h_xs,
2 * xs_size * sizeof(T*), dev_ctx.stream());
// Launch Kernel
int threads_per_block = std::min(static_cast<int64_t>(1024), total_num);
......
......@@ -41,8 +41,8 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel<T> {
MPDType cpu_scale_data;
if (platform::is_xpu_place(scale->place())) {
memory::Copy(platform::CPUPlace(), static_cast<void*>(&cpu_scale_data),
BOOST_GET_CONST(platform::XPUPlace, scale->place()),
static_cast<const void*>(scale_data), sizeof(MPDType));
scale->place(), static_cast<const void*>(scale_data),
sizeof(MPDType));
} else {
cpu_scale_data = (*scale_data);
......@@ -87,8 +87,7 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel<T> {
dev_ctx.Wait();
}
memory::Copy(platform::CPUPlace(), &cpu_found_inf_data,
BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()),
found_inf_data, sizeof(bool));
dev_ctx.GetPlace(), found_inf_data, sizeof(bool));
}
if (cpu_found_inf_data) {
......@@ -142,9 +141,8 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel<T> {
if (dev_ctx.x_context()->xpu_stream) {
dev_ctx.Wait();
}
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()),
found_inf_data, platform::CPUPlace(), &cpu_found_inf_data,
sizeof(bool));
memory::Copy(dev_ctx.GetPlace(), found_inf_data, platform::CPUPlace(),
&cpu_found_inf_data, sizeof(bool));
}
};
......
......@@ -114,9 +114,8 @@ class LazyZeros<platform::CUDADeviceContext, T> {
for (int i = 0; i < xs_size; i++) {
h_starts[i + 1] = h_starts[i] + outs[i]->numel();
}
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()),
d_starts, cpu_place, h_starts, (xs_size + 1) * sizeof(int64_t),
dev_ctx.stream());
memory::Copy(dev_ctx.GetPlace(), d_starts, cpu_place, h_starts,
(xs_size + 1) * sizeof(int64_t), dev_ctx.stream());
// copy each tensor of "outs" data address array to device
auto h_out_addrs_mem = memory::Alloc(cpu_place, xs_size * sizeof(T*));
......@@ -128,9 +127,8 @@ class LazyZeros<platform::CUDADeviceContext, T> {
for (size_t i = 0; i < xs_size; ++i) {
h_out_addrs[i] = outs[i]->mutable_data<T>(dev_ctx.GetPlace());
}
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()),
d_out_addrs, cpu_place, h_out_addrs, xs_size * sizeof(T*),
dev_ctx.stream());
memory::Copy(dev_ctx.GetPlace(), d_out_addrs, cpu_place, h_out_addrs,
xs_size * sizeof(T*), dev_ctx.stream());
// launch cuda kernel
int64_t total_num = h_starts[xs_size];
......
......@@ -187,9 +187,7 @@ class LazyZerosNPU {
framework::TensorCopy(*x, place, dev_ctx, out);
} else if (zero_ptr != dst_ptr) {
auto size = out->numel() * framework::SizeOfType(out->type());
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, place), dst_ptr,
BOOST_GET_CONST(platform::NPUPlace, place), zero_ptr, size,
stream);
memory::Copy(place, dst_ptr, place, zero_ptr, size, stream);
}
}
}
......
......@@ -43,8 +43,7 @@ class UpdateLossScalingXPUKernel : public framework::OpKernel<T> {
bool cpu_found_inf_data = false;
if (platform::is_xpu_place(found_inf->place())) {
memory::Copy(platform::CPUPlace(),
static_cast<void*>(&cpu_found_inf_data),
BOOST_GET_CONST(platform::XPUPlace, found_inf->place()),
static_cast<void*>(&cpu_found_inf_data), found_inf->place(),
static_cast<const void*>(found_inf_data), sizeof(bool));
} else {
cpu_found_inf_data = (*found_inf_data);
......@@ -97,16 +96,16 @@ class UpdateLossScalingXPUKernel : public framework::OpKernel<T> {
MPDType cpu_pre_loss_scaling_data;
if (platform::is_xpu_place(bad_in->place())) {
memory::Copy(platform::CPUPlace(), static_cast<void*>(&cpu_bad_in_data),
BOOST_GET_CONST(platform::XPUPlace, bad_in->place()),
static_cast<const void*>(bad_in_data), sizeof(int));
bad_in->place(), static_cast<const void*>(bad_in_data),
sizeof(int));
} else {
cpu_bad_in_data = (*bad_in_data);
}
if (platform::is_xpu_place(good_in->place())) {
memory::Copy(platform::CPUPlace(), static_cast<void*>(&cpu_good_in_data),
BOOST_GET_CONST(platform::XPUPlace, good_in->place()),
static_cast<const void*>(good_in_data), sizeof(int));
good_in->place(), static_cast<const void*>(good_in_data),
sizeof(int));
} else {
cpu_good_in_data = (*good_in_data);
}
......@@ -114,7 +113,7 @@ class UpdateLossScalingXPUKernel : public framework::OpKernel<T> {
if (platform::is_xpu_place(pre_loss_scaling->place())) {
memory::Copy(
platform::CPUPlace(), static_cast<void*>(&cpu_pre_loss_scaling_data),
BOOST_GET_CONST(platform::XPUPlace, pre_loss_scaling->place()),
pre_loss_scaling->place(),
static_cast<const void*>(pre_loss_scaling_data), sizeof(MPDType));
} else {
cpu_pre_loss_scaling_data = (*pre_loss_scaling_data);
......@@ -146,15 +145,13 @@ class UpdateLossScalingXPUKernel : public framework::OpKernel<T> {
}
}
// copy to device
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()),
bad_out_data, platform::CPUPlace(), &cpu_bad_out_data,
sizeof(int));
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()),
good_out_data, platform::CPUPlace(), &cpu_good_out_data,
sizeof(int));
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()),
updated_loss_scaling_data, platform::CPUPlace(),
&cpu_updated_loss_scaling_data, sizeof(MPDType));
memory::Copy(dev_ctx.GetPlace(), bad_out_data, platform::CPUPlace(),
&cpu_bad_out_data, sizeof(int));
memory::Copy(dev_ctx.GetPlace(), good_out_data, platform::CPUPlace(),
&cpu_good_out_data, sizeof(int));
memory::Copy(dev_ctx.GetPlace(), updated_loss_scaling_data,
platform::CPUPlace(), &cpu_updated_loss_scaling_data,
sizeof(MPDType));
}
};
......
......@@ -25,8 +25,6 @@ namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
struct CPUPlace;
struct CUDAPlace;
struct float16;
} // namespace platform
} // namespace paddle
......
......@@ -27,8 +27,6 @@ namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
struct CPUPlace;
struct CUDAPlace;
struct float16;
} // namespace platform
} // namespace paddle
......
......@@ -26,8 +26,6 @@ namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
struct CPUPlace;
struct CUDAPlace;
struct float16;
} // namespace platform
} // namespace paddle
......
......@@ -26,9 +26,6 @@ class EmptyGradOpMaker;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
struct CPUPlace;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -25,8 +25,7 @@ void GetAccumulators<paddle::platform::CUDADeviceContext>(
auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
auto stream = ctx.cuda_device_context().stream();
auto cuda_place =
BOOST_GET_CONST(platform::CUDAPlace, in_old_num_accumulates->place());
auto cuda_place = in_old_num_accumulates->place();
memory::Copy(platform::CPUPlace(), old_num_accumulates_, cuda_place,
in_old_num_accumulates->data<int64_t>(), sizeof(int64_t),
stream);
......@@ -44,8 +43,7 @@ void SetAccumulators<paddle::platform::CUDADeviceContext>(
auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates");
auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates");
auto* out_num_updates = ctx.Output<Tensor>("out_num_updates");
auto cuda_place =
BOOST_GET_CONST(platform::CUDAPlace, out_old_num_accumulates->place());
auto cuda_place = out_old_num_accumulates->place();
memory::Copy(cuda_place, out_old_num_accumulates->data<int64_t>(),
platform::CPUPlace(), &old_num_accumulates_, sizeof(int64_t),
......
......@@ -57,8 +57,7 @@ class BernoulliOpKernel<platform::CUDADeviceContext, T>
auto* out_data = out->mutable_data<T>(ctx.GetPlace());
int64_t size = x->numel();
int device_id =
BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId();
int device_id = ctx.GetPlace().GetDeviceId();
auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);
auto seed_offset = gen_cuda->IncrementOffset(1);
int64_t gen_offset = size * seed_offset.second;
......
......@@ -102,8 +102,7 @@ class CholeskyGPUKernel : public framework::OpKernel<T> {
std::vector<int> error_info; // only for checking positive matrix
error_info.resize(batch_count);
memory::Copy(platform::CPUPlace(), error_info.data(),
BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()),
memory::Copy(platform::CPUPlace(), error_info.data(), dev_ctx.GetPlace(),
info_ptr, sizeof(int) * batch_count, dev_ctx.stream());
for (int i = 0; i < batch_count; ++i) {
......
......@@ -306,7 +306,7 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel<T> {
num_classes, num_samples));
auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace());
auto place = dev_ctx.GetPlace();
int batch_size = label->numel();
// Algorithm:
......@@ -397,8 +397,7 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel<T> {
(NumBlocks(num_classes) * kNumCUDAThreads * vec_size) +
1) *
vec_size;
int device_id =
BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId();
int device_id = ctx.GetPlace().GetDeviceId();
auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);
if (gen_cuda->GetIsInitPy() && (!fix_seed)) {
auto seed_offset = gen_cuda->IncrementOffset(offset);
......
......@@ -33,7 +33,7 @@ class AllReduceOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto place = ctx.GetPlace();
PADDLE_ENFORCE_EQ(is_gpu_place(place), true,
PADDLE_ENFORCE_EQ(platform::is_gpu_place(place), true,
platform::errors::PreconditionNotMet(
"AllReduce op can run on gpu place only for now."));
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
......
......@@ -34,7 +34,7 @@ class NCCLBroadcastOpKernel : public framework::OpKernel<T> {
"The place of ExecutionContext should be CUDAPlace."));
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).device;
int dev_id = ctx.GetPlace().device;
int root_dev_id = ctx.Attr<int>("root");
auto in = ctx.Input<framework::Tensor>("X");
......
......@@ -40,7 +40,7 @@ class BKCLBroadcastOpKernel : public framework::OpKernel<T> {
"The place of ExecutionContext should be XPUPlace."));
#if defined(PADDLE_WITH_XPU_BKCL)
int dev_id = BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()).device;
int dev_id = ctx.GetPlace().device;
int root_dev_id = ctx.Attr<int>("root");
auto in = ctx.Input<framework::Tensor>("X");
......
......@@ -24,7 +24,6 @@ namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
struct CPUPlace;
struct float16;
} // namespace platform
} // namespace paddle
......
......@@ -16,7 +16,7 @@ limitations under the License. */
namespace paddle {
namespace platform {
struct CUDAPlace;
struct float16;
} // namespace platform
} // namespace paddle
......
......@@ -16,7 +16,7 @@ limitations under the License. */
namespace paddle {
namespace platform {
struct XPUPlace;
struct float16;
} // namespace platform
} // namespace paddle
......
......@@ -24,7 +24,6 @@ namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
struct CPUPlace;
struct float16;
} // namespace platform
} // namespace paddle
......
......@@ -16,7 +16,7 @@ limitations under the License. */
namespace paddle {
namespace platform {
struct CUDAPlace;
struct float16;
} // namespace platform
} // namespace paddle
......
......@@ -16,7 +16,7 @@ limitations under the License. */
namespace paddle {
namespace platform {
struct XPUPlace;
struct float16;
} // namespace platform
} // namespace paddle
......
......@@ -24,7 +24,6 @@ namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
struct CPUPlace;
struct float16;
} // namespace platform
} // namespace paddle
......
......@@ -16,7 +16,7 @@ limitations under the License. */
namespace paddle {
namespace platform {
struct CUDAPlace;
struct float16;
} // namespace platform
} // namespace paddle
......
......@@ -16,7 +16,7 @@ limitations under the License. */
namespace paddle {
namespace platform {
struct XPUPlace;
struct float16;
} // namespace platform
} // namespace paddle
......
......@@ -22,7 +22,6 @@ namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
struct CPUPlace;
struct float16;
} // namespace platform
} // namespace paddle
......
......@@ -16,7 +16,7 @@ limitations under the License. */
namespace paddle {
namespace platform {
struct CUDAPlace;
struct float16;
} // namespace platform
} // namespace paddle
......