Unverified · Commit c48a9ad5 authored by Wilber, committed by GitHub

[Pten] Replace platform::Place with pten::Place. (#38899)

* add pten::Place data structure.

* update ci problem

* fix ci problem

* update

* using platform::Place=pten::Place

* remove BOOST_GET_CONST for CPUPlace and GPUPlace

* compile pass 25%.

* compile pass 45%

* compile pass 60%

* remove boost_get for xpu npu mlu and ipu

* compile pass on cpu and gpu.

* fix compile problem

* fix compile error.

* update

* fix ci problem

* update

* ci approve

* fix ci problem

* fix ci eager test problem

* remove BOOST_GET_CONST

* fix npu compile
Parent 1dbc8632
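The change below is mechanical and repeats the same pattern across every touched file: with platform::Place aliased to pten::Place, call sites no longer unwrap a boost variant with BOOST_GET_CONST(platform::CUDAPlace, place) (or the CPU/XPU/NPU/MLU/IPU equivalents); they pass the place object directly and read place.device or place.GetType() from it. A minimal sketch of the idea, using a simplified stand-in for pten::Place (the enum and field names below are illustrative, not the actual Paddle definitions):

#include <iostream>

// Simplified stand-in for the unified place type. The real pten::Place
// carries an allocation type plus a device id, which is all the call
// sites in this diff need.
enum class AllocationType { CPU, GPU, XPU };

struct Place {
  AllocationType alloc_type{AllocationType::CPU};
  int device{0};  // device id; meaningful for GPU/XPU/... places

  AllocationType GetType() const { return alloc_type; }
  int GetDeviceId() const { return device; }
};

// Before: int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
// After:  int dev_id = place.device;   (no variant unwrapping needed)
int DeviceIdOf(const Place& place) { return place.device; }

int main() {
  Place gpu_place{AllocationType::GPU, /*device=*/3};
  std::cout << DeviceIdOf(gpu_place) << "\n";  // prints 3
  return 0;
}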
@@ -221,8 +221,8 @@ static std::shared_ptr<framework::GarbageCollector> GetGC(
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (platform::is_gpu_place(place)) {
     if (framework::IsFastEagerDeletionModeEnabled()) {
-      gc.reset(new framework::UnsafeFastGPUGarbageCollector(
-          BOOST_GET_CONST(platform::CUDAPlace, place), max_memory_size));
+      gc.reset(new framework::UnsafeFastGPUGarbageCollector(place,
+                                                            max_memory_size));
     }
   }
 #endif
...
@@ -106,13 +106,12 @@ void SerializeLodTensor(framework::Variable* var,
     iobuf->append(reinterpret_cast<const char*>(tensor->data()), data_len);
   } else {
 #ifdef PADDLE_WITH_CUDA
-    char* temp_ptr =
-        new char[tensor->numel() * framework::SizeOfType(tensor->type())];
+    char* temp_ptr = new char[tensor->numel() *
+                              framework::SizeOfType(tensor->type())];  // NOLINT
     auto stream =
         reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
     memory::Copy(
-        platform::CPUPlace(), temp_ptr,
-        BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), tensor->data(),
+        platform::CPUPlace(), temp_ptr, tensor->place(), tensor->data(),
         tensor->numel() * framework::SizeOfType(tensor->type()), stream);
     auto data_len = tensor->numel() * framework::SizeOfType(tensor->type());
     iobuf->append(reinterpret_cast<const char*>(&data_len), 8);
@@ -148,13 +147,12 @@ void SerializeSelectedRows(framework::Variable* var,
     iobuf->append(reinterpret_cast<const char*>(tensor->data()), data_len);
   } else {
 #ifdef PADDLE_WITH_CUDA
-    char* temp_ptr =
-        new char[tensor->numel() * framework::SizeOfType(tensor->type())];
+    char* temp_ptr = new char[tensor->numel() *
+                              framework::SizeOfType(tensor->type())];  // NOLINT
     auto stream =
         reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
     memory::Copy(
-        platform::CPUPlace(), temp_ptr,
-        BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), tensor->data(),
+        platform::CPUPlace(), temp_ptr, tensor->place(), tensor->data(),
         tensor->numel() * framework::SizeOfType(tensor->type()), stream);
     auto data_len = tensor->numel() * framework::SizeOfType(tensor->type());
     iobuf->append(reinterpret_cast<const char*>(&data_len), 8);
@@ -204,7 +202,7 @@ void DeserializeFromMultiVarMsgAndIOBuf(const MultiVarMsg& multi_msg,
 }
 void DeserializeLodTensor(framework::Variable* var, const VarMsg& msg,
-                          butil::IOBufBytesIterator& io_buffer_itr,
+                          butil::IOBufBytesIterator& io_buffer_itr,  // NOLINT
                           const platform::DeviceContext& ctx) {
   const auto place = ctx.GetPlace();
   framework::LoDTensor* tensor = var->GetMutable<framework::LoDTensor>();
@@ -229,30 +227,30 @@ void DeserializeLodTensor(framework::Variable* var, const VarMsg& msg,
   // IO Buffer
   if (platform::is_cpu_place(place)) {
-    unsigned long data_len;
-    io_buffer_itr.copy_and_forward((void*)(&data_len), 8);
+    unsigned long data_len;  // NOLINT
+    io_buffer_itr.copy_and_forward((void*)(&data_len), 8);  // NOLINT
     io_buffer_itr.copy_and_forward(tensor_data, data_len);
   } else if (platform::is_gpu_place(place)) {
 #ifdef PADDLE_WITH_CUDA
-    unsigned long data_len;
-    char* temp_ptr =
-        new char[tensor->numel() * framework::SizeOfType(tensor->type())];
-    io_buffer_itr.copy_and_forward((void*)(&data_len), 8);
-    io_buffer_itr.copy_and_forward((void*)temp_ptr, data_len);
+    unsigned long data_len;  // NOLINT
+    char* temp_ptr = new char[tensor->numel() *
+                              framework::SizeOfType(tensor->type())];  // NOLINT
+    io_buffer_itr.copy_and_forward((void*)(&data_len), 8);  // NOLINT
+    io_buffer_itr.copy_and_forward((void*)temp_ptr, data_len);  // NOLINT
     auto stream =
         reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
-    memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data,
-                 platform::CPUPlace(), (void*)temp_ptr,
-                 tensor->numel() * framework::SizeOfType(tensor->type()),
-                 stream);
+    memory::Copy(
+        place, tensor_data, platform::CPUPlace(), (void*)temp_ptr,  // NOLINT
+        tensor->numel() * framework::SizeOfType(tensor->type()), stream);
     delete[] temp_ptr;
 #endif
   }
 }
-void DeserializeSelectedRows(framework::Variable* var, const VarMsg& msg,
-                             butil::IOBufBytesIterator& io_buffer_itr,
-                             const platform::DeviceContext& ctx) {
+void DeserializeSelectedRows(
+    framework::Variable* var, const VarMsg& msg,
+    butil::IOBufBytesIterator& io_buffer_itr,  // NOLINT
+    const platform::DeviceContext& ctx) {
   const auto place = ctx.GetPlace();
   auto* slr = var->GetMutable<framework::SelectedRows>();
   framework::Tensor* tensor = slr->mutable_value();
@@ -269,20 +267,19 @@ void DeserializeSelectedRows(framework::Variable* var, const VarMsg& msg,
   tensor->mutable_data(place, VarMessageToVarType(msg.data_type()));
   // IO Buffer
   if (platform::is_cpu_place(place)) {
-    unsigned long data_len;
-    io_buffer_itr.copy_and_forward((void*)(&data_len), 8);
+    unsigned long data_len;  // NOLINT
+    io_buffer_itr.copy_and_forward((void*)(&data_len), 8);  // NOLINT
     io_buffer_itr.copy_and_forward(tensor_data, data_len);
   } else if (platform::is_gpu_place(place)) {
 #ifdef PADDLE_WITH_CUDA
-    char* temp_ptr =
-        new char[tensor->numel() * framework::SizeOfType(tensor->type())];
-    unsigned long data_len;
-    io_buffer_itr.copy_and_forward((void*)(&data_len), 8);
+    char* temp_ptr = new char[tensor->numel() *
+                              framework::SizeOfType(tensor->type())];  // NOLINT
+    unsigned long data_len;  // NOLINT
+    io_buffer_itr.copy_and_forward((void*)(&data_len), 8);  // NOLINT
     io_buffer_itr.copy_and_forward(temp_ptr, data_len);
     auto stream =
         reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
-    memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data,
-                 platform::CPUPlace(), temp_ptr,
+    memory::Copy(place, tensor_data, platform::CPUPlace(), temp_ptr,
                  tensor->numel() * framework::SizeOfType(tensor->type()),
                  stream);
     delete[] temp_ptr;
...
@@ -44,8 +44,7 @@ int GetMicroId(const platform::DeviceContext& ctx,
   auto stream =
       reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
   memory::Copy(
-      platform::CPUPlace(), temp_ptr,
-      BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), tensor->data(),
+      platform::CPUPlace(), temp_ptr, tensor->place(), tensor->data(),
      tensor->numel() * framework::SizeOfType(tensor->type()), stream);
   float* temp_ptr_float = reinterpret_cast<float*>(temp_ptr);
   micro_id = static_cast<int>(temp_ptr_float[0]);
...
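A side effect visible in the serialization and GetMicroId hunks above is that memory::Copy now receives the tensor's place directly instead of a BOOST_GET_CONST-extracted platform::CUDAPlace, so the copy routine has to branch on the place's type tag rather than on statically typed place arguments. A rough sketch of what such a tag-dispatched copy can look like (the names and the CPU-only fallback are assumptions for illustration, not Paddle's actual implementation):

#include <cstddef>
#include <cstring>
#include <stdexcept>

enum class AllocationType { CPU, GPU };

struct Place {
  AllocationType type{AllocationType::CPU};
  int device{0};
  AllocationType GetType() const { return type; }
};

// Hypothetical generic copy: the direction is decided from the two place
// tags at runtime instead of from CPUPlace/CUDAPlace overloads.
void Copy(const Place& dst_place, void* dst, const Place& src_place,
          const void* src, size_t num_bytes) {
  if (dst_place.GetType() == AllocationType::CPU &&
      src_place.GetType() == AllocationType::CPU) {
    std::memcpy(dst, src, num_bytes);
    return;
  }
  // A real implementation would pick cudaMemcpyHostToDevice /
  // DeviceToHost / DeviceToDevice here based on the two tags.
  throw std::runtime_error("device copy not available in this sketch");
}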
@@ -43,7 +43,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
   TensorAddFunctor(int64_t numel, const T* x, T* y)
       : numel_(numel), x_(x), y_(y) {}
-  void operator()(const paddle::platform::CPUPlace& place) {
+  void operator()(const paddle::platform::CPUPlace& place) const {
     paddle::platform::CPUDeviceContext* ctx =
         dynamic_cast<paddle::platform::CPUDeviceContext*>(
             paddle::platform::DeviceContextPool::Instance().Get(place));
@@ -56,7 +56,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
   // TODO(jiabin): Support xpu here from gradient_accumulator.cc
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  void operator()(const paddle::platform::CUDAPlace& place) {
+  void operator()(const paddle::platform::CUDAPlace& place) const {
     paddle::platform::CUDADeviceContext* ctx =
         dynamic_cast<paddle::platform::CUDADeviceContext*>(
             paddle::platform::DeviceContextPool::Instance().Get(place));
@@ -66,7 +66,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
     blas.AXPY(numel_, 1., x_, y_);
   }
 #else
-  void operator()(const paddle::platform::CUDAPlace& place) {
+  void operator()(const paddle::platform::CUDAPlace& place) const {
     PADDLE_THROW(paddle::platform::errors::PermissionDenied(
         "Gradient accumulation on place (%s) "
         "is not supported in imperative mode",
@@ -76,7 +76,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
   // TODO(jiabin): Support Npu here from gradient_accumulator.cc
   // there is NO blas in CUDAPinnedPlace
-  void operator()(const paddle::platform::CUDAPinnedPlace& place) {
+  void operator()(const paddle::platform::CUDAPinnedPlace& place) const {
     PADDLE_THROW(paddle::platform::errors::PermissionDenied(
         "Gradient accumulation on place (%s) "
         "is not supported in imperative mode",
@@ -84,14 +84,14 @@ class TensorAddFunctor : public boost::static_visitor<> {
   }
 #ifdef PADDLE_WITH_ASCEND_CL
-  void operator()(const paddle::platform::NPUPlace& place) {
+  void operator()(const paddle::platform::NPUPlace& place) const {
     PADDLE_THROW(paddle::platform::errors::PermissionDenied(
         "Gradient accumulation on place (%s) "
         "is not supported in imperative mode",
         place));
   }
 #else
-  void operator()(const paddle::platform::NPUPlace& place) {
+  void operator()(const paddle::platform::NPUPlace& place) const {
     PADDLE_THROW(paddle::platform::errors::PermissionDenied(
         "Gradient accumulation on place (%s) "
         "is not supported in imperative mode",
@@ -100,14 +100,14 @@ class TensorAddFunctor : public boost::static_visitor<> {
 #endif
 #ifdef PADDLE_WITH_XPU
-  void operator()(const paddle::platform::XPUPlace& place) {
+  void operator()(const paddle::platform::XPUPlace& place) const {
     paddle::platform::XPUDeviceContext* ctx =
         dynamic_cast<paddle::platform::XPUDeviceContext*>(
             paddle::platform::DeviceContextPool::Instance().Get(place));
     xpu::add<T>(ctx->x_context(), x_, y_, y_, static_cast<int>(numel_));
   }
 #else
-  void operator()(const paddle::platform::XPUPlace& place) {
+  void operator()(const paddle::platform::XPUPlace& place) const {
     PADDLE_THROW(paddle::platform::errors::PermissionDenied(
         "Gradient accumulation on place (%s) "
        "is not supported in imperative mode",
@@ -116,14 +116,14 @@ class TensorAddFunctor : public boost::static_visitor<> {
 #endif
 #ifdef PADDLE_WITH_MLU
-  void operator()(const paddle::platform::MLUPlace& place) {
+  void operator()(const paddle::platform::MLUPlace& place) const {
     PADDLE_THROW(paddle::platform::errors::PermissionDenied(
         "Gradient accumulation on place (%s) "
         "is not supported in imperative mode",
         place));
   }
 #else
-  void operator()(const paddle::platform::MLUPlace& place) {
+  void operator()(const paddle::platform::MLUPlace& place) const {
     PADDLE_THROW(paddle::platform::errors::PermissionDenied(
         "Gradient accumulation on place (%s) "
         "is not supported in imperative mode",
@@ -132,14 +132,14 @@ class TensorAddFunctor : public boost::static_visitor<> {
 #endif
 #ifdef PADDLE_WITH_IPU
-  void operator()(const paddle::platform::IPUPlace& place) {
+  void operator()(const paddle::platform::IPUPlace& place) const {
     PADDLE_THROW(paddle::platform::errors::PermissionDenied(
         "Gradient accumulation on place (%s) "
         "is not supported in imperative mode",
         place));
   }
 #else
-  void operator()(const paddle::platform::IPUPlace& place) {
+  void operator()(const paddle::platform::IPUPlace& place) const {
     PADDLE_THROW(paddle::platform::errors::PermissionDenied(
         "Gradient accumulation on place (%s) "
         "is not supported in imperative mode",
@@ -147,7 +147,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
   }
 #endif
-  void operator()(const paddle::platform::NPUPinnedPlace& place) {
+  void operator()(const paddle::platform::NPUPinnedPlace& place) const {
     PADDLE_THROW(paddle::platform::errors::PermissionDenied(
         "Gradient accumulation on place (%s) "
         "is not supported in imperative mode",
@@ -157,7 +157,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
  private:
   int64_t numel_;
   const T* x_;
-  T* y_;
+  mutable T* y_;
 };
 template <typename DeviceContext, typename T>
@@ -218,7 +218,7 @@ void TensorAdd(const egr::EagerTensor& src, egr::EagerTensor* dst) {
   if (data_type == paddle::framework::DataTypeTrait<cpp_type>::DataType()) { \
     TensorAddFunctor<cpp_type> func(numel, src_tensor->data<cpp_type>(),     \
                                     dst_tensor->mutable_data<cpp_type>());   \
-    boost::apply_visitor(func, place);                                       \
+    paddle::platform::VisitPlace(place, func);                               \
     return;                                                                  \
   }
@@ -294,7 +294,7 @@ void VariableAdd(const egr::EagerTensor& src, egr::EagerTensor* dst) {
     TensorAddFunctor<cpp_type> func(                                  \
         numel, src_tensor.data<cpp_type>(),                           \
         dst_tensor->mutable_data<cpp_type>(place));                   \
-    boost::apply_visitor(func, place);                                \
+    paddle::platform::VisitPlace(place, func);                        \
     return;                                                           \
   }
...
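The gradient-accumulator hunks above make every operator() of TensorAddFunctor const, mark the output pointer y_ as mutable, and replace boost::apply_visitor(func, place) with paddle::platform::VisitPlace(place, func). A plausible reading is that the new visitation helper dispatches on the place's type tag and takes the functor by const reference, which forces the const call operators. A minimal sketch of that shape (the VisitPlace signature and member names here are assumptions for illustration, not the actual Paddle API):

#include <iostream>

enum class AllocationType { CPU, GPU };

struct Place {
  AllocationType type{AllocationType::CPU};
  AllocationType GetType() const { return type; }
};

// Hypothetical visitation helper: the functor is taken by const reference,
// so its call operators have to be const.
template <typename Visitor>
void VisitPlace(const Place& place, const Visitor& visitor) {
  switch (place.GetType()) {
    case AllocationType::CPU:
      visitor.OnCPU();
      break;
    case AllocationType::GPU:
      visitor.OnGPU();
      break;
  }
}

struct AddFunctor {
  float* y_{nullptr};  // mirrors `mutable T* y_;` in the diff above
  float x_{0.f};

  void OnCPU() const { *y_ += x_; }
  void OnGPU() const { std::cout << "would launch a device kernel\n"; }
};

int main() {
  float y = 1.f;
  const AddFunctor func{&y, 2.f};
  VisitPlace(Place{AllocationType::CPU}, func);
  std::cout << y << "\n";  // prints 3
  return 0;
}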
@@ -150,24 +150,21 @@ void RunOp(const std::string& type, const NameTensorMap& ins,
   VLOG(6) << "Get Device id";
   if (paddle::platform::is_gpu_place(place)) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-    paddle::platform::SetDeviceId(
-        BOOST_GET_CONST(paddle::platform::CUDAPlace, place).device);
+    paddle::platform::SetDeviceId(place.device);
 #else
     PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
         "PaddlePaddle should compile with GPU if use CUDAPlace."));
 #endif
   } else if (paddle::platform::is_xpu_place(place)) {
 #ifdef PADDLE_WITH_XPU
-    paddle::platform::SetXPUDeviceId(
-        BOOST_GET_CONST(paddle::platform::XPUPlace, place).device);
+    paddle::platform::SetXPUDeviceId(place.device);
 #else
     PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
         "PaddlePaddle should compile with XPU if use XPUPlace."));
 #endif
   } else if (paddle::platform::is_npu_place(place)) {
 #ifdef PADDLE_WITH_ASCEND_CL
-    paddle::platform::SetNPUDeviceId(
-        BOOST_GET_CONST(paddle::platform::NPUPlace, place).device);
+    paddle::platform::SetNPUDeviceId(place.device);
 #else
     PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
         "PaddlePaddle should compile with NPU if use NPUPlace."));
...
@@ -116,7 +116,7 @@ PreparedOp PrepareImpl(const NameTensorMap& ins, const NameTensorMap& outs,
   auto& kernels = kernels_iter->second;
   auto kernel_iter = kernels.find(expected_kernel_key);
 #ifdef PADDLE_WITH_XPU
-  if (is_xpu_place(expected_kernel_key.place_) &&
+  if (paddle::platform::is_xpu_place(expected_kernel_key.place_) &&
       (kernel_iter == kernels.end() ||
        !paddle::platform::is_xpu_support_op(op.Type(), expected_kernel_key) ||
        paddle::platform::is_in_xpu_black_list(op.Type()))) {
@@ -129,7 +129,7 @@ PreparedOp PrepareImpl(const NameTensorMap& ins, const NameTensorMap& outs,
 #endif
 #ifdef PADDLE_WITH_ASCEND_CL
   if (kernel_iter == kernels.end() &&
-      is_npu_place(expected_kernel_key.place_)) {
+      paddle::platform::is_npu_place(expected_kernel_key.place_)) {
     VLOG(3) << "missing NPU kernel: " << op.Type()
             << ", expected_kernel_key:" << expected_kernel_key
             << ", fallbacking to CPU one!";
...
@@ -22,7 +22,7 @@ void TransDataDevice(const Tensor &in, const platform::Place &dst_place,
           << " dst_place: " << dst_place;
   PADDLE_ENFORCE_NE(
-      in.place().which(), dst_place.which(),
+      in.place().GetType(), dst_place.GetType(),
       platform::errors::Unavailable("Currently, model parallelism is only "
                                     "supported between CPU and CUDA."));
...
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
+#include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
@@ -181,7 +182,7 @@ void AllReduceOpHandle::AllReduceFunc(
     const framework::proto::VarType::Type &dtype, int64_t numel,
     const std::vector<platform::Place> &places,
     const std::vector<std::string> &out_var_names) {
-  if (is_gpu_place(places[0])) {
+  if (platform::is_gpu_place(places[0])) {
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_,
                             platform::errors::InvalidArgument(
@@ -200,7 +201,7 @@
     PADDLE_THROW(
         platform::errors::PreconditionNotMet("Not compiled with GPU."));
 #endif
-  } else if (is_xpu_place(places[0])) {
+  } else if (platform::is_xpu_place(places[0])) {
 #if defined(PADDLE_WITH_XPU_BKCL)
     PADDLE_ENFORCE_NOT_NULL(bkcl_ctxs_,
                             platform::errors::InvalidArgument(
@@ -286,7 +287,7 @@ void AllReduceOpHandle::NCCLAllReduceFunc(
 void AllReduceOpHandle::SyncNCCLAllReduce() {
   if (FLAGS_sync_nccl_allreduce) {
     for (auto &p : places_) {
-      int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p).device;
+      int dev_id = p.device;
       auto *nccl_ctxs =
           nccl_ctxs_->GetRunEnvNCCLCtx(run_order_, use_hierarchical_allreduce_);
       auto &nccl_ctx = nccl_ctxs->at(dev_id);
...
@@ -46,7 +46,7 @@ BindThreadedSSAGraphExecutor::BindThreadedSSAGraphExecutor(
   }
   int index = 0;
   for (uint32_t i = 0; i < places.size(); i++) {
-    int id = BOOST_GET_CONST(platform::XPUPlace, places_[i]).device;
+    int id = places_[i].device;
     if (place_to_index_.find(id) == place_to_index_.end()) {
       place_to_index_[id] = index;
       index++;
@@ -145,8 +145,7 @@ FetchResultType BindThreadedSSAGraphExecutor::RunMainStream(
       RunMultiDeviceOpAsync(cur_op, op_deps.get(), ready_ops);
       continue;
     } else {
-      cur_place =
-          BOOST_GET_CONST(platform::XPUPlace, dev_ctxes_.begin()->first);
+      cur_place = dev_ctxes_.begin()->first;
       int cur_index = place_to_index_[cur_place.device];
       RunOpAsyncMainStream(cur_op, op_deps.get(), ready_ops, cur_index);
     }
...
@@ -85,7 +85,7 @@ class BKCLOpHandleBase : public OpHandleBase {
                       platform::errors::InvalidArgument(
                           "The argument run_order_ must be >= 0, but got %d.", run_order_));
     auto flat_bkcl_ctxs = bkcl_ctxs_->GetFlatCtx(run_order_);
-    int dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device;
+    int dev_id = place.device;
     auto& bkcl_ctx = flat_bkcl_ctxs->at(dev_id);
     auto comm = bkcl_ctx.comm_;
...
@@ -16,6 +16,7 @@
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
+#include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
 namespace paddle {
@@ -83,8 +84,7 @@ void BroadcastOpHandle::BroadcastOneVar(
   } else if (platform::is_gpu_place(in_tensor.place())) {
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     VarHandle *out_handle = nullptr;
-    int root_id =
-        BOOST_GET_CONST(platform::CUDAPlace, in_tensor.place()).device;
+    int root_id = in_tensor.place().device;
     std::vector<std::function<void()>> broadcast_calls;
     int type = platform::ToNCCLDataType(in_tensor.type());
@@ -94,8 +94,7 @@
       Variable *out_var = var_scopes.at(out_var_handle->scope_idx())
                               ->FindVar(out_var_handle->name());
-      int dst_id =
-          BOOST_GET_CONST(platform::CUDAPlace, out_var_handle->place()).device;
+      int dst_id = out_var_handle->place().device;
       auto &nccl_ctx = nccl_ctxs_->at(dst_id);
@@ -145,7 +144,7 @@ void BroadcastOpHandle::BroadcastOneVar(
   } else {
 #if defined(PADDLE_WITH_XPU_BKCL)
     VarHandle *out_handle = nullptr;
-    int root_id = BOOST_GET_CONST(platform::XPUPlace, in_tensor.place()).device;
+    int root_id = in_tensor.place().device;
     std::vector<std::function<void()>> broadcast_calls;
     int type = platform::ToBKCLDataType(in_tensor.type());
@@ -155,8 +154,7 @@
       Variable *out_var = var_scopes.at(out_var_handle->scope_idx())
                               ->FindVar(out_var_handle->name());
-      int dst_id =
-          BOOST_GET_CONST(platform::XPUPlace, out_var_handle->place()).device;
+      int dst_id = out_var_handle->place().device;
       auto &bkcl_ctx = bkcl_ctxs_->at(dst_id);
@@ -232,7 +230,7 @@ void BroadcastOpHandle::InitOutputValue(
   PADDLE_ENFORCE_NOT_NULL(out_var, platform::errors::NotFound(
                                        "Variable %s is not found in scopes.",
                                        out_var_handle->name()));
-  if (is_gpu_place(in_tensor.place())) {
+  if (platform::is_gpu_place(in_tensor.place())) {
     PADDLE_ENFORCE_EQ(platform::is_gpu_place(t_out_p), true,
                       platform::errors::PreconditionNotMet(
                           "Places of input and output must be all on GPU."));
...
@@ -46,8 +46,7 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
     dev_ctx_ = reinterpret_cast<platform::CUDADeviceContext *>(
         platform::DeviceContextPool::Instance().Get(place));
     if (dynamic_cast<StreamGarbageCollector *>(gc_)) {
-      platform::CUDADeviceGuard guard(
-          BOOST_GET_CONST(platform::CUDAPlace, place).device);
+      platform::CUDADeviceGuard guard(place.device);
 #ifdef PADDLE_WITH_HIP
       PADDLE_ENFORCE_GPU_SUCCESS(
           hipEventCreateWithFlags(&event_, hipEventDisableTiming));
@@ -72,7 +71,7 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
 EagerDeletionOpHandle::~EagerDeletionOpHandle() {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (event_) {
-    auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx_->GetPlace());
+    auto gpu_place = dev_ctx_->GetPlace();
     platform::CUDADeviceGuard guard(gpu_place.device);
 #ifdef PADDLE_WITH_HIP
     PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event_));
@@ -85,8 +84,7 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() {
 void EagerDeletionOpHandle::InitCUDA() {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  int dev_id =
-      BOOST_GET_CONST(platform::CUDAPlace, dev_ctxes_.begin()->first).device;
+  int dev_id = dev_ctxes_.begin()->first.device;
   events_[dev_id] = nullptr;
 #endif
 }
...
@@ -16,6 +16,7 @@
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
 #include "paddle/fluid/platform/device_memory_aligment.h"
+#include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
 DEFINE_bool(skip_fused_all_reduce_check, false, "");
@@ -102,7 +103,7 @@ void FusedAllReduceOpHandle::RunImpl() {
   gpuStream_t compute_stream{nullptr};
   if (FLAGS_allreduce_record_one_event) {
-    auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, places_[0]);
+    auto gpu_place = platform::CUDAPlace(places_[0].GetDeviceId());
     compute_stream =
         platform::DeviceContextPool::Instance().GetByPlace(gpu_place)->stream();
     auto flat_nccl_ctxs = nccl_ctxs_->GetFlatCtx(run_order_);
@@ -291,7 +292,7 @@ bool FusedAllReduceOpHandle::InputIsInDifferentPlace(
         var, platform::errors::NotFound(
                  "The variable '%s' is not found in local scope.", var_name));
     auto &lod_tensor = var->Get<LoDTensor>();
-    if (!is_same_place(lod_tensor.place(), places_.at(scope_idx))) {
+    if (!platform::is_same_place(lod_tensor.place(), places_.at(scope_idx))) {
       return true;
     }
   }
...
@@ -354,7 +354,7 @@ void CheckVarHasNanOrInf(const std::string& op_type,
     float* cpu_data = new float[tensor->numel()];
     memory::Copy(platform::CPUPlace(), static_cast<void*>(cpu_data),
-                 BOOST_GET_CONST(platform::XPUPlace, tensor->place()),
+                 tensor->place(),
                  static_cast<const void*>(tensor->data<float>()),
                  tensor->numel() * sizeof(float));
     bool flag = false;
...
@@ -132,7 +132,7 @@ void TensorCheckerVisitor<platform::CUDADeviceContext>::apply(
   auto* dev_ctx = reinterpret_cast<platform::CUDADeviceContext*>(
       platform::DeviceContextPool::Instance().Get(tensor_.place()));
-  int dev_id = BOOST_GET_CONST(platform::CUDAPlace, tensor_.place()).device;
+  int dev_id = tensor_.place().device;
   PADDLE_ENFORCE_EQ(
       (dev_id >= 0 && dev_id < multi_op_var2gpu_str_mutex().size()), true,
       platform::errors::OutOfRange("GPU dev_id must >=0 and < dev_count=%d",
...
@@ -102,7 +102,7 @@ class NCCLOpHandleBase : public OpHandleBase {
     }
     for (auto& p : dev_ctxes_) {
-      int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p.first).device;
+      int dev_id = p.first.device;
       if (inter_events_.find(dev_id) != inter_events_.end()) {
         continue;
       }
@@ -133,7 +133,7 @@ class NCCLOpHandleBase : public OpHandleBase {
                       platform::errors::InvalidArgument(
                           "The argument run_order_ must be >= 0, but got %d.", run_order_));
     auto flat_nccl_ctxs = nccl_ctxs_->GetFlatCtx(run_order_);
-    int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
+    int dev_id = place.device;
     auto& nccl_ctx = flat_nccl_ctxs->at(dev_id);
     auto stream = nccl_ctx.stream();
     auto comm = nccl_ctx.comm_;
@@ -181,7 +181,7 @@ class NCCLOpHandleBase : public OpHandleBase {
   void InterReduce(platform::Place place, const void* sendbuff, void* recvbuff,
                    size_t count, ncclDataType_t datatype, ncclRedOp_t op) {
     auto nccl_ctxs = nccl_ctxs_->GetHierarchicalInterCtx(run_order_);
-    int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
+    int dev_id = place.device;
     auto& nccl_ctx = nccl_ctxs->at(dev_id);
     auto stream = nccl_ctx.stream();
     auto comm = nccl_ctx.comm_;
@@ -213,7 +213,7 @@ class NCCLOpHandleBase : public OpHandleBase {
     PADDLE_ENFORCE_NOT_NULL(
         nccl_ctxs_, platform::errors::NotFound(
                         "Can't get exter %d nccl contexts.", run_order_));
-    int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
+    int dev_id = place.device;
     auto& nccl_ctx = nccl_ctxs->at(dev_id);
     auto stream = nccl_ctx.stream();
     auto comm = nccl_ctx.comm_;
@@ -246,7 +246,7 @@ class NCCLOpHandleBase : public OpHandleBase {
   void InterBroadCast(platform::Place place, void* sendbuff, size_t count,
                       ncclDataType_t datatype, ncclRedOp_t op) {
     auto nccl_ctxs = nccl_ctxs_->GetHierarchicalInterCtx(run_order_);
-    int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
+    int dev_id = place.device;
     auto& nccl_ctx = nccl_ctxs->at(dev_id);
     auto stream = nccl_ctx.stream();
     auto comm = nccl_ctx.comm_;
...
@@ -47,7 +47,7 @@ OpHandleBase::~OpHandleBase() PADDLE_MAY_THROW {
 void OpHandleBase::InitCUDA() {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   for (auto &p : dev_ctxes_) {
-    int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p.first).device;
+    int dev_id = p.first.device;
     platform::SetDeviceId(dev_id);
 #ifdef PADDLE_WITH_HIP
     PADDLE_ENFORCE_GPU_SUCCESS(
@@ -61,9 +61,7 @@ void OpHandleBase::InitCUDA() {
     for (auto &out_var : outputs_) {
       auto *out_var_handle = dynamic_cast<VarHandle *>(out_var);
       if (out_var_handle) {
-        int dev_id =
-            BOOST_GET_CONST(platform::CUDAPlace, out_var_handle->place())
-                .device;
+        int dev_id = out_var_handle->place().device;
         out_var_handle->SetGenerateEvent(events_.at(dev_id));
       }
     }
@@ -74,7 +72,7 @@ void OpHandleBase::InitCUDA() {
                           "Operator %s should have only one dev_ctx, but got %d.", Name(),
                           dev_ctxes_.size()));
     auto &place = dev_ctxes_.begin()->first;
-    int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
+    int dev_id = place.device;
     for (auto &out_var : outputs_) {
       auto *out_var_handle = dynamic_cast<VarHandle *>(out_var);
       if (out_var_handle) {
@@ -109,7 +107,7 @@ void OpHandleBase::InitXPU() {
                     platform::errors::InvalidArgument(
                         "%s should have only one dev_ctx.", Name()));
   auto &place = dev_ctxes_.begin()->first;
-  int dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device;
+  int dev_id = place.device;
   platform::SetXPUDeviceId(dev_id);
   for (auto &out_var : outputs_) {
     auto *out_var_handle = dynamic_cast<VarHandle *>(out_var);
@@ -309,7 +307,7 @@ void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (!events_.empty()) {  // Use event
     for (auto &p : dev_ctxes_) {
-      auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, p.first).device;
+      auto dev_id = p.first.device;
       auto *cuda_dev_ctx = static_cast<platform::CUDADeviceContext *>(p.second);
       VLOG(10) << "cudadevicecontext:" << cuda_dev_ctx << ", dev_id:" << dev_id;
 #ifdef PADDLE_WITH_HIP
@@ -332,8 +330,7 @@ void OpHandleBase::RunAndRecordEvent(platform::Place p,
   } else {
     auto *ctx = dev_ctxes_.at(p);
     auto *cuda_ctx = static_cast<platform::CUDADeviceContext *>(ctx);
-    cuda_ctx->RecordEvent(
-        events_.at(BOOST_GET_CONST(platform::CUDAPlace, p).device), callback);
+    cuda_ctx->RecordEvent(events_.at(p.device), callback);
   }
 #else
   callback();
...
@@ -45,7 +45,7 @@ static std::vector<std::unique_ptr<ir::Graph>> SeparateMultiDevicesGraph(
   for (auto &op : op_handles) {
     auto &dev_ctx = op->DeviceContext();
     auto &p = dev_ctx.begin()->first;
-    int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p).device;
+    int dev_id = p.device;
     auto &dev_dummys = graphs[dev_id]->Get<GraphDepVars>(kGraphDepVars);
     graphs[dev_id]->AddNode(graph->RemoveNode(op->Node()).release());
...
@@ -17,6 +17,7 @@
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
+#include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
 PADDLE_DEFINE_EXPORTED_bool(
@@ -125,7 +126,8 @@ void ReduceOpHandle::RunImpl() {
     // TODO(gongwb): add cpu support
     if (collective_context.endpoints_.size() <= 1 ||
-        is_cpu_place(in_places[0]) || is_cpu_place(t_out_p)) {
+        platform::is_cpu_place(in_places[0]) ||
+        platform::is_cpu_place(t_out_p)) {
       GatherLocalSelectedRowsFunctor functor(
           in_selected_rows, in_places, dev_ctxes_, t_out_p,
           out_var->GetMutable<framework::SelectedRows>());
@@ -172,13 +174,13 @@
                                  out_var_handle->place(), pre_in.type());
       auto out_p = out_var_handle->place();
-      int root_id = BOOST_GET_CONST(platform::CUDAPlace, out_p).device;
+      int root_id = out_p.device;
       std::vector<std::function<void()>> all_reduce_calls;
       for (size_t i = 0; i < var_scopes.size(); ++i) {
         auto &p = in_places[i];
         auto &lod_tensor = *lod_tensors[i];
-        int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p).device;
+        int dev_id = p.device;
         auto &nccl_ctx = nccl_ctxs_->at(dev_id);
         void *buffer = const_cast<void *>(lod_tensor.data());
@@ -218,13 +220,13 @@
                                  out_var_handle->place(), pre_in.type());
       auto out_p = out_var_handle->place();
-      int root_id = BOOST_GET_CONST(platform::XPUPlace, out_p).device;
+      int root_id = out_p.device;
       std::vector<std::function<void()>> all_reduce_calls;
       for (size_t i = 0; i < var_scopes.size(); ++i) {
         auto &p = in_places[i];
         auto &lod_tensor = *lod_tensors[i];
-        int dev_id = BOOST_GET_CONST(platform::XPUPlace, p).device;
+        int dev_id = p.device;
         auto &bkcl_ctx = bkcl_ctxs_->at(dev_id);
         void *buffer = const_cast<void *>(lod_tensor.data());
...
@@ -61,8 +61,8 @@ struct ScaleLossGradFunctor {
   } else if (platform::is_xpu_place(place_)) {
 #if defined(PADDLE_WITH_XPU)
     OutT cast_coeff = static_cast<OutT>(coeff_);
-    memory::Copy(BOOST_GET_CONST(platform::XPUPlace, place_), out_data,
-                 platform::CPUPlace(), &cast_coeff, SizeOfType(out_dtype_));
+    memory::Copy(place_, out_data, platform::CPUPlace(), &cast_coeff,
+                 SizeOfType(out_dtype_));
     VLOG(10) << place_ << "RUN Scale loss grad op";
 #else
     PADDLE_THROW(platform::errors::PermissionDenied(
@@ -73,9 +73,8 @@ struct ScaleLossGradFunctor {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     OutT cast_coeff = static_cast<OutT>(coeff_);
     auto stream = static_cast<platform::CUDADeviceContext *>(ctx_)->stream();
-    memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place_), out_data,
-                 platform::CPUPlace(), &cast_coeff, SizeOfType(out_dtype_),
-                 stream);
+    memory::Copy(place_, out_data, platform::CPUPlace(), &cast_coeff,
+                 SizeOfType(out_dtype_), stream);
     VLOG(10) << place_ << "RUN Scale loss grad op";
 #else
     PADDLE_THROW(platform::errors::PermissionDenied(
...
@@ -86,8 +86,7 @@ void ShareTensorBufferOpHandle::SetShareDimsAndDtype(
 void ShareTensorBufferOpHandle::InitCUDA() {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  int dev_id =
-      BOOST_GET_CONST(platform::CUDAPlace, dev_ctxes_.begin()->first).device;
+  int dev_id = dev_ctxes_.begin()->first.device;
   events_[dev_id] = nullptr;
 #endif
 }
...
@@ -165,7 +165,7 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
                         in_numel));
   out_numel = (out_numel == 0) ? static_cast<size_t>(out.numel()) : out_numel;
-  int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
+  int dev_id = place.device;
   auto *nccl_ctxs = nccl_ctxs_->GetRunEnvNCCLCtx(run_order_, false);
   auto &nccl_ctx = nccl_ctxs->at(dev_id);
   auto stream = nccl_ctx.stream();
...
@@ -106,9 +106,12 @@ struct EnforceShapeAndDTypeEQVisitor {
   void operator()(const LoDTensor& src) {
     auto& tensor = dst_->Get<LoDTensor>();
-    PADDLE_ENFORCE_EQ(src.place().which(), tensor.place().which(),
-                      platform::errors::PreconditionNotMet(
-                          "The place type of the two variables is not equal."));
+    PADDLE_ENFORCE_EQ(
+        src.place().GetType(), tensor.place().GetType(),
+        platform::errors::PreconditionNotMet(
+            "The place type of the two variables is not equal. The src place "
+            "is %s, but the dst place is %s",
+            src.place().DebugString(), tensor.place().DebugString()));
     PADDLE_ENFORCE_EQ(src.type(), tensor.type(),
                       platform::errors::PreconditionNotMet(
                           "The dtype of the two variables is not equal."));
@@ -127,9 +130,12 @@ struct EnforceShapeAndDTypeEQVisitor {
   void operator()(const SelectedRows& src) {
     auto& selected_rows = dst_->Get<SelectedRows>();
-    PADDLE_ENFORCE_EQ(src.place().which(), selected_rows.place().which(),
-                      platform::errors::PreconditionNotMet(
-                          "The place type of the two variables is not equal."));
+    PADDLE_ENFORCE_EQ(
+        src.place().GetType(), selected_rows.place().GetType(),
+        platform::errors::PreconditionNotMet(
+            "The place type of the two variables is not equal. The src place "
+            "is %s, but the dst place is %s",
+            src.place().DebugString(), selected_rows.place().DebugString()));
     PADDLE_ENFORCE_EQ(src.value().type(), selected_rows.value().type(),
                       platform::errors::PreconditionNotMet(
                           "The dtype of the two variables is not equal."));
...
@@ -138,7 +138,7 @@ DLPackTensor::DLPackTensor(const Tensor &tensor, LaneType lanes) {
   // init device, DLDevice type with device_type and device_id
   auto place = tensor.place();
-  t_.device = boost::apply_visitor(internal::DLDeviceVisitor(), place);
+  t_.device = paddle::platform::VisitPlace(place, internal::DLDeviceVisitor());
   // init dtype
   t_.dtype = internal::GetDLDataTypeFromTypeIndex(tensor.type());
...
@@ -63,8 +63,7 @@ void TestMain(const platform::Place &place, uint16_t lanes) {
     CHECK_EQ(0, dl_tensor.device.device_id);
   } else if (platform::is_gpu_place(place)) {
     CHECK_EQ(kDLGPU, dl_tensor.device.device_type);
-    CHECK_EQ(BOOST_GET_CONST(platform::CUDAPlace, place).device,
-             dl_tensor.device.device_id);
+    CHECK_EQ(place.device, dl_tensor.device.device_id);
   } else if (platform::is_cuda_pinned_place(place)) {
     CHECK_EQ(kDLCPUPinned, dl_tensor.device.device_type);
     CHECK_EQ(0, dl_tensor.device.device_id);
...
...@@ -72,7 +72,7 @@ Executor::~Executor() { ...@@ -72,7 +72,7 @@ Executor::~Executor() {
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
// Clear mkl-dnn cache, // Clear mkl-dnn cache,
// this is needed to have mkl-dnn unit tests working // this is needed to have mkl-dnn unit tests working
ClearMKLDNNCache(place_, this); platform::ClearMKLDNNCache(place_, this);
#endif #endif
} }
...@@ -443,31 +443,26 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx, ...@@ -443,31 +443,26 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
if (platform::is_gpu_place(place_)) { if (platform::is_gpu_place(place_)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (IsFastEagerDeletionModeEnabled()) { if (IsFastEagerDeletionModeEnabled()) {
gc.reset(new UnsafeFastGPUGarbageCollector( gc.reset(new UnsafeFastGPUGarbageCollector(place_, max_memory_size));
BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size));
} else { } else {
gc.reset(new DefaultStreamGarbageCollector( gc.reset(new DefaultStreamGarbageCollector(place_, max_memory_size));
BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size));
} }
#else #else
PADDLE_THROW( PADDLE_THROW(
platform::errors::Unimplemented("No GPU gc found in CPU/XPU paddle")); platform::errors::Unimplemented("No GPU gc found in CPU/XPU paddle"));
#endif #endif
} else if (platform::is_cpu_place(place_)) { } else if (platform::is_cpu_place(place_)) {
gc.reset(new CPUGarbageCollector( gc.reset(new CPUGarbageCollector(place_, max_memory_size));
BOOST_GET_CONST(platform::CPUPlace, place_), max_memory_size));
} else if (platform::is_xpu_place(place_)) { } else if (platform::is_xpu_place(place_)) {
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
gc.reset(new XPUGarbageCollector( gc.reset(new XPUGarbageCollector(place_, max_memory_size));
BOOST_GET_CONST(platform::XPUPlace, place_), max_memory_size));
#else #else
PADDLE_THROW( PADDLE_THROW(
platform::errors::Unimplemented("No XPU gc found in CPU/GPU paddle")); platform::errors::Unimplemented("No XPU gc found in CPU/GPU paddle"));
#endif #endif
} else if (platform::is_ipu_place(place_)) { } else if (platform::is_ipu_place(place_)) {
#ifdef PADDLE_WITH_IPU #ifdef PADDLE_WITH_IPU
gc.reset(new IPUGarbageCollector( gc.reset(new IPUGarbageCollector(place_, max_memory_size));
BOOST_GET_CONST(platform::IPUPlace, place_), max_memory_size));
#else #else
PADDLE_THROW( PADDLE_THROW(
platform::errors::Unimplemented("No IPU gc found in CPU/IPU paddle")); platform::errors::Unimplemented("No IPU gc found in CPU/IPU paddle"));
...@@ -476,16 +471,14 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx, ...@@ -476,16 +471,14 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
#ifdef PADDLE_WITH_ASCEND_CL #ifdef PADDLE_WITH_ASCEND_CL
if (IsFastEagerDeletionModeEnabled()) { if (IsFastEagerDeletionModeEnabled()) {
VLOG(4) << "Use unsafe fast gc for NPU."; VLOG(4) << "Use unsafe fast gc for NPU.";
gc.reset(new NPUUnsafeFastGarbageCollector( gc.reset(new NPUUnsafeFastGarbageCollector(place_, max_memory_size));
BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size));
} else { } else {
PADDLE_THROW(platform::errors::Unimplemented( PADDLE_THROW(platform::errors::Unimplemented(
"Please set FLAGS_fast_eager_deletion_mode=true to use " "Please set FLAGS_fast_eager_deletion_mode=true to use "
"GarbageCollector on NPU.")); "GarbageCollector on NPU."));
// TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector. // TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector.
VLOG(4) << "Use default stream gc for NPU."; VLOG(4) << "Use default stream gc for NPU.";
gc.reset(new NPUDefaultStreamGarbageCollector( gc.reset(new NPUDefaultStreamGarbageCollector(place_, max_memory_size));
BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size));
} }
#else #else
PADDLE_THROW( PADDLE_THROW(
...@@ -494,11 +487,9 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx, ...@@ -494,11 +487,9 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
} else if (platform::is_mlu_place(place_)) { } else if (platform::is_mlu_place(place_)) {
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
if (IsFastEagerDeletionModeEnabled()) { if (IsFastEagerDeletionModeEnabled()) {
gc.reset(new MLUUnsafeFastGarbageCollector( gc.reset(new MLUUnsafeFastGarbageCollector(place_, max_memory_size));
BOOST_GET_CONST(platform::MLUPlace, place_), max_memory_size));
} else { } else {
gc.reset(new MLUDefaultStreamGarbageCollector( gc.reset(new MLUDefaultStreamGarbageCollector(place_, max_memory_size));
BOOST_GET_CONST(platform::MLUPlace, place_), max_memory_size));
} }
#else #else
PADDLE_THROW( PADDLE_THROW(
......
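Across the garbage-collector call sites in Executor::RunPartialPreparedContext above, the per-backend BOOST_GET_CONST(platform::XxxPlace, place_) casts disappear because each collector now accepts the unified place directly. A rough sketch of that constructor shape follows; DemoGarbageCollector and DemoPlace are invented for illustration and do not model the real framework::GarbageCollector hierarchy.

#include <cstddef>
#include <iostream>

enum class AllocationType { kCPU, kGPU, kXPU };

struct DemoPlace {
  AllocationType type{AllocationType::kCPU};
  int device{0};
};

class DemoGarbageCollector {
 public:
  // One constructor signature for every device kind: callers pass the place
  // they already hold, no variant extraction needed at the call site.
  DemoGarbageCollector(const DemoPlace& place, std::size_t max_memory_size)
      : place_(place), max_memory_size_(max_memory_size) {}

  void Describe() const {
    std::cout << "gc on device " << place_.device << ", threshold "
              << max_memory_size_ << " bytes\n";
  }

 private:
  DemoPlace place_;
  std::size_t max_memory_size_;
};

int main() {
  DemoPlace gpu{AllocationType::kGPU, 0};
  // Same call shape as gc.reset(new UnsafeFastGPUGarbageCollector(place_, size)).
  DemoGarbageCollector gc(gpu, 1 << 20);
  gc.Describe();
  return 0;
}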
...@@ -137,8 +137,7 @@ void BoxWrapper::CopyForPull(const paddle::platform::Place& place, ...@@ -137,8 +137,7 @@ void BoxWrapper::CopyForPull(const paddle::platform::Place& place,
const int expand_embed_dim, const int expand_embed_dim,
const int64_t total_length) { const int64_t total_length) {
auto stream = dynamic_cast<platform::CUDADeviceContext*>( auto stream = dynamic_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get( platform::DeviceContextPool::Instance().Get(place))
BOOST_GET_CONST(platform::CUDAPlace, place)))
->stream(); ->stream();
auto buf_value = memory::Alloc(place, values.size() * sizeof(float*)); auto buf_value = memory::Alloc(place, values.size() * sizeof(float*));
float** gpu_values = reinterpret_cast<float**>(buf_value->ptr()); float** gpu_values = reinterpret_cast<float**>(buf_value->ptr());
...@@ -203,8 +202,7 @@ void BoxWrapper::CopyKeys(const paddle::platform::Place& place, ...@@ -203,8 +202,7 @@ void BoxWrapper::CopyKeys(const paddle::platform::Place& place,
uint64_t** origin_keys, uint64_t* total_keys, uint64_t** origin_keys, uint64_t* total_keys,
const int64_t* gpu_len, int slot_num, int total_len) { const int64_t* gpu_len, int slot_num, int total_len) {
auto stream = dynamic_cast<platform::CUDADeviceContext*>( auto stream = dynamic_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get( platform::DeviceContextPool::Instance().Get(place))
BOOST_GET_CONST(platform::CUDAPlace, place)))
->stream(); ->stream();
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(CopyKeysKernel, dim3((total_len + 512 - 1) / 512), hipLaunchKernelGGL(CopyKeysKernel, dim3((total_len + 512 - 1) / 512),
...@@ -225,8 +223,7 @@ void BoxWrapper::CopyForPush(const paddle::platform::Place& place, ...@@ -225,8 +223,7 @@ void BoxWrapper::CopyForPush(const paddle::platform::Place& place,
const int hidden_size, const int expand_embed_dim, const int hidden_size, const int expand_embed_dim,
const int64_t total_length, const int batch_size) { const int64_t total_length, const int batch_size) {
auto stream = dynamic_cast<platform::CUDADeviceContext*>( auto stream = dynamic_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get( platform::DeviceContextPool::Instance().Get(place))
BOOST_GET_CONST(platform::CUDAPlace, place)))
->stream(); ->stream();
auto slot_lengths_lod = slot_lengths; auto slot_lengths_lod = slot_lengths;
for (int i = 1; i < slot_lengths_lod.size(); i++) { for (int i = 1; i < slot_lengths_lod.size(); i++) {
......
...@@ -45,7 +45,7 @@ void BoxWrapper::PullSparseCase(const paddle::platform::Place& place, ...@@ -45,7 +45,7 @@ void BoxWrapper::PullSparseCase(const paddle::platform::Place& place,
} else if (platform::is_gpu_place(place)) { } else if (platform::is_gpu_place(place)) {
#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) #if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32)
VLOG(3) << "Begin copy keys, key_num[" << total_length << "]"; VLOG(3) << "Begin copy keys, key_num[" << total_length << "]";
int device_id = BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId(); int device_id = place.GetDeviceId();
LoDTensor& total_keys_tensor = keys_tensor[device_id]; LoDTensor& total_keys_tensor = keys_tensor[device_id];
uint64_t* total_keys = reinterpret_cast<uint64_t*>( uint64_t* total_keys = reinterpret_cast<uint64_t*>(
total_keys_tensor.mutable_data<int64_t>({total_length, 1}, place)); total_keys_tensor.mutable_data<int64_t>({total_length, 1}, place));
...@@ -131,7 +131,7 @@ void BoxWrapper::PushSparseGradCase( ...@@ -131,7 +131,7 @@ void BoxWrapper::PushSparseGradCase(
"Warning:: CPUPlace is not supported in PaddleBox now.")); "Warning:: CPUPlace is not supported in PaddleBox now."));
} else if (platform::is_gpu_place(place)) { } else if (platform::is_gpu_place(place)) {
#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) #if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32)
int device_id = BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId(); int device_id = place.GetDeviceId();
LoDTensor& cached_total_keys_tensor = keys_tensor[device_id]; LoDTensor& cached_total_keys_tensor = keys_tensor[device_id];
uint64_t* total_keys = uint64_t* total_keys =
reinterpret_cast<uint64_t*>(cached_total_keys_tensor.data<int64_t>()); reinterpret_cast<uint64_t*>(cached_total_keys_tensor.data<int64_t>());
...@@ -143,8 +143,7 @@ void BoxWrapper::PushSparseGradCase( ...@@ -143,8 +143,7 @@ void BoxWrapper::PushSparseGradCase(
push_boxps_timer.Start(); push_boxps_timer.Start();
int ret = boxps_ptr_->PushSparseGPU( int ret = boxps_ptr_->PushSparseGPU(
total_keys, reinterpret_cast<void*>(total_grad_values_gpu), total_keys, reinterpret_cast<void*>(total_grad_values_gpu),
static_cast<int>(total_length), static_cast<int>(total_length), place.GetDeviceId());
BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId());
PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet(
"PushSparseGPU failed in BoxPS.")); "PushSparseGPU failed in BoxPS."));
push_boxps_timer.Pause(); push_boxps_timer.Pause();
......
...@@ -764,8 +764,7 @@ void FleetWrapper::PushDenseVarsAsync( ...@@ -764,8 +764,7 @@ void FleetWrapper::PushDenseVarsAsync(
LoDTensor* pin_tensor = pin_var->GetMutable<LoDTensor>(); LoDTensor* pin_tensor = pin_var->GetMutable<LoDTensor>();
float* pin_g = pin_tensor->mutable_data<float>(tensor->dims(), float* pin_g = pin_tensor->mutable_data<float>(tensor->dims(),
platform::CUDAPinnedPlace()); platform::CUDAPinnedPlace());
memory::Copy(platform::CUDAPinnedPlace(), pin_g, memory::Copy(platform::CUDAPinnedPlace(), pin_g, place, g_data,
BOOST_GET_CONST(platform::CUDAPlace, place), g_data,
sizeof(float) * count, stream); sizeof(float) * count, stream);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, stream)); PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, stream));
...@@ -821,8 +820,7 @@ void FleetWrapper::PushDenseVarsAsync( ...@@ -821,8 +820,7 @@ void FleetWrapper::PushDenseVarsAsync(
LoDTensor* pin_tensor = pin_var->GetMutable<LoDTensor>(); LoDTensor* pin_tensor = pin_var->GetMutable<LoDTensor>();
float* pin_g = float* pin_g =
pin_tensor->mutable_data<float>(tensor->dims(), platform::CPUPlace()); pin_tensor->mutable_data<float>(tensor->dims(), platform::CPUPlace());
memory::Copy(platform::CPUPlace(), pin_g, memory::Copy(platform::CPUPlace(), pin_g, place, g_data,
BOOST_GET_CONST(platform::XPUPlace, place), g_data,
sizeof(float) * count); sizeof(float) * count);
float* g = pin_g; float* g = pin_g;
......
...@@ -116,14 +116,12 @@ void HeterWrapper::SerializeToReq(const std::string& varname, Scope* scope, ...@@ -116,14 +116,12 @@ void HeterWrapper::SerializeToReq(const std::string& varname, Scope* scope,
tensor->numel() * SizeOfType(tensor->type())); tensor->numel() * SizeOfType(tensor->type()));
} else { } else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
memory::Copy(platform::CPUPlace(), data_ptr, memory::Copy(platform::CPUPlace(), data_ptr, tensor->place(),
BOOST_GET_CONST(platform::CUDAPlace, tensor->place()),
tensor->data(), tensor->numel() * SizeOfType(tensor->type()), tensor->data(), tensor->numel() * SizeOfType(tensor->type()),
nullptr); nullptr);
#endif #endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
memory::Copy(platform::CPUPlace(), data_ptr, memory::Copy(platform::CPUPlace(), data_ptr, tensor->place(),
BOOST_GET_CONST(platform::XPUPlace, tensor->place()),
tensor->data(), tensor->numel() * SizeOfType(tensor->type())); tensor->data(), tensor->numel() * SizeOfType(tensor->type()));
#endif #endif
} }
...@@ -158,8 +156,7 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope, ...@@ -158,8 +156,7 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope,
tensor->mutable_data(place, ToVarType(req_var.data_type())); tensor->mutable_data(place, ToVarType(req_var.data_type()));
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data, memory::Copy(place, tensor_data, platform::CPUPlace(), req_var.data().data(),
platform::CPUPlace(), req_var.data().data(),
tensor->numel() * SizeOfType(tensor->type()), stream); tensor->numel() * SizeOfType(tensor->type()), stream);
#else #else
memcpy(tensor_data, req_var.data().data(), memcpy(tensor_data, req_var.data().data(),
...@@ -197,8 +194,7 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope, ...@@ -197,8 +194,7 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope,
tensor->mutable_data(place, ToVarType(req_var.data_type())); tensor->mutable_data(place, ToVarType(req_var.data_type()));
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, place), tensor_data, memory::Copy(place, tensor_data, platform::CPUPlace(), req_var.data().data(),
platform::CPUPlace(), req_var.data().data(),
tensor->numel() * SizeOfType(tensor->type())); tensor->numel() * SizeOfType(tensor->type()));
#else #else
memcpy(tensor_data, req_var.data().data(), memcpy(tensor_data, req_var.data().data(),
......
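The serialization and copy hunks above all collapse memory::Copy(BOOST_GET_CONST(platform::XxxPlace, p), ...) into memory::Copy(p, ...), with the copy routine itself deciding the transfer path. The standalone sketch below shows that call shape under toy types; DemoPlace and DemoCopy are invented, and std::memcpy stands in for the real cudaMemcpy/xpu_memcpy branches.

#include <cstring>
#include <iostream>
#include <vector>

enum class AllocationType { kCPU, kGPU };

struct DemoPlace {
  AllocationType type{AllocationType::kCPU};
  int device{0};
  AllocationType GetType() const { return type; }
};

// One entry point for every direction: the copy routine inspects the place
// tags instead of forcing every caller to down-cast to a concrete place type.
void DemoCopy(const DemoPlace& dst_place, void* dst, const DemoPlace& src_place,
              const void* src, size_t num_bytes) {
  if (dst_place.GetType() == AllocationType::kCPU &&
      src_place.GetType() == AllocationType::kCPU) {
    std::memcpy(dst, src, num_bytes);  // host-to-host path
  } else {
    // Device paths would dispatch to cudaMemcpyAsync / xpu_memcpy here.
    std::memcpy(dst, src, num_bytes);
  }
}

int main() {
  std::vector<float> src(4, 1.5f), dst(4, 0.f);
  DemoCopy(DemoPlace{}, dst.data(), DemoPlace{}, src.data(),
           src.size() * sizeof(float));
  std::cout << dst[0] << "\n";  // 1.5
  return 0;
}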
...@@ -791,7 +791,7 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, ...@@ -791,7 +791,7 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place,
"Warning:: CPUPlace is not supported in GpuPs now.")); "Warning:: CPUPlace is not supported in GpuPs now."));
} else if (platform::is_gpu_place(place)) { } else if (platform::is_gpu_place(place)) {
VLOG(3) << "Begin copy keys, key_num[" << total_length << "]"; VLOG(3) << "Begin copy keys, key_num[" << total_length << "]";
int device_id = BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId(); int device_id = place.GetDeviceId();
int devid_2_index = HeterPs_->get_index_by_devid(device_id); int devid_2_index = HeterPs_->get_index_by_devid(device_id);
LoDTensor& total_keys_tensor = keys_tensor[devid_2_index]; LoDTensor& total_keys_tensor = keys_tensor[devid_2_index];
uint64_t* total_keys = reinterpret_cast<uint64_t*>( uint64_t* total_keys = reinterpret_cast<uint64_t*>(
...@@ -859,7 +859,7 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, ...@@ -859,7 +859,7 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place,
PADDLE_THROW(platform::errors::Unimplemented( PADDLE_THROW(platform::errors::Unimplemented(
"Warning:: CPUPlace is not supported in GPUPS now.")); "Warning:: CPUPlace is not supported in GPUPS now."));
} else if (platform::is_gpu_place(place)) { } else if (platform::is_gpu_place(place)) {
int device_id = BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId(); int device_id = place.GetDeviceId();
int devid_2_index = HeterPs_->get_index_by_devid(device_id); int devid_2_index = HeterPs_->get_index_by_devid(device_id);
LoDTensor& cached_total_keys_tensor = keys_tensor[devid_2_index]; LoDTensor& cached_total_keys_tensor = keys_tensor[devid_2_index];
uint64_t* total_keys = uint64_t* total_keys =
......
...@@ -113,8 +113,7 @@ void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, ...@@ -113,8 +113,7 @@ void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place,
const int hidden_size, const int hidden_size,
const int64_t total_length) { const int64_t total_length) {
auto stream = dynamic_cast<platform::CUDADeviceContext*>( auto stream = dynamic_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get( platform::DeviceContextPool::Instance().Get(place))
BOOST_GET_CONST(platform::CUDAPlace, place)))
->stream(); ->stream();
auto buf_value = memory::Alloc(place, values.size() * sizeof(float*)); auto buf_value = memory::Alloc(place, values.size() * sizeof(float*));
float** gpu_values = reinterpret_cast<float**>(buf_value->ptr()); float** gpu_values = reinterpret_cast<float**>(buf_value->ptr());
...@@ -132,8 +131,7 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, ...@@ -132,8 +131,7 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place,
const int64_t* gpu_len, int slot_num, const int64_t* gpu_len, int slot_num,
int total_len) { int total_len) {
auto stream = dynamic_cast<platform::CUDADeviceContext*>( auto stream = dynamic_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get( platform::DeviceContextPool::Instance().Get(place))
BOOST_GET_CONST(platform::CUDAPlace, place)))
->stream(); ->stream();
CopyKeysKernel<<<(total_len + 1024 - 1) / 1024, 1024, 0, stream>>>( CopyKeysKernel<<<(total_len + 1024 - 1) / 1024, 1024, 0, stream>>>(
origin_keys, total_keys, gpu_len, slot_num, total_len); origin_keys, total_keys, gpu_len, slot_num, total_len);
...@@ -148,8 +146,7 @@ void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place, ...@@ -148,8 +146,7 @@ void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place,
const int64_t total_length, const int64_t total_length,
const int batch_size) { const int batch_size) {
auto stream = dynamic_cast<platform::CUDADeviceContext*>( auto stream = dynamic_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get( platform::DeviceContextPool::Instance().Get(place))
BOOST_GET_CONST(platform::CUDAPlace, place)))
->stream(); ->stream();
auto slot_lengths_lod = slot_lengths; auto slot_lengths_lod = slot_lengths;
for (int i = 1; i < slot_lengths_lod.size(); i++) { for (int i = 1; i < slot_lengths_lod.size(); i++) {
......
...@@ -101,7 +101,7 @@ StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place, ...@@ -101,7 +101,7 @@ StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place,
} }
StreamGarbageCollector::~StreamGarbageCollector() { StreamGarbageCollector::~StreamGarbageCollector() {
auto place = BOOST_GET_CONST(platform::CUDAPlace, this->dev_ctx_->GetPlace()); auto place = this->dev_ctx_->GetPlace();
platform::CUDADeviceGuard guard(place.device); platform::CUDADeviceGuard guard(place.device);
platform::GpuStreamSync(stream_); platform::GpuStreamSync(stream_);
platform::GpuDestroyStream(stream_); platform::GpuDestroyStream(stream_);
...@@ -186,7 +186,7 @@ MLUStreamGarbageCollector::MLUStreamGarbageCollector( ...@@ -186,7 +186,7 @@ MLUStreamGarbageCollector::MLUStreamGarbageCollector(
} }
MLUStreamGarbageCollector::~MLUStreamGarbageCollector() { MLUStreamGarbageCollector::~MLUStreamGarbageCollector() {
auto place = BOOST_GET_CONST(platform::MLUPlace, this->dev_ctx_->GetPlace()); auto place = this->dev_ctx_->GetPlace();
platform::MLUDeviceGuard guard(place.device); platform::MLUDeviceGuard guard(place.device);
PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueSync(stream_)); PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueSync(stream_));
PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueDestroy(stream_)); PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueDestroy(stream_));
......
...@@ -46,8 +46,8 @@ void SetMicroId(paddle::framework::Scope* scope, ...@@ -46,8 +46,8 @@ void SetMicroId(paddle::framework::Scope* scope,
temp_ptr_float[0] = micro_id; temp_ptr_float[0] = micro_id;
auto stream = auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(*dev_ctx).stream(); reinterpret_cast<const platform::CUDADeviceContext&>(*dev_ctx).stream();
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data, memory::Copy(place, tensor_data, platform::CPUPlace(),
platform::CPUPlace(), reinterpret_cast<void*>(temp_ptr), reinterpret_cast<void*>(temp_ptr),
tensor->numel() * framework::SizeOfType(tensor->type()), tensor->numel() * framework::SizeOfType(tensor->type()),
stream); stream);
#endif #endif
......
...@@ -117,12 +117,12 @@ void HeterXpuTrainer::CreateThreadParam(const ProgramDesc& program, int num) { ...@@ -117,12 +117,12 @@ void HeterXpuTrainer::CreateThreadParam(const ProgramDesc& program, int num) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
auto stream = copy_streams_[num]; auto stream = copy_streams_[num];
auto event = events_[num]; auto event = events_[num];
auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; auto dev_id = place.device;
platform::CUDADeviceGuard guard(dev_id); platform::CUDADeviceGuard guard(dev_id);
#endif #endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device; auto dev_id = place.device;
platform::XPUDeviceGuard guard(dev_id); platform::XPUDeviceGuard guard(dev_id);
#endif #endif
...@@ -173,13 +173,11 @@ void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor, ...@@ -173,13 +173,11 @@ void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor,
thread_tensor->mutable_data<T>(root_tensor->dims(), thread_place); thread_tensor->mutable_data<T>(root_tensor->dims(), thread_place);
T* root_ptr = root_tensor->data<T>(); T* root_ptr = root_tensor->data<T>();
if (platform::is_cpu_place(root_tensor->place())) { if (platform::is_cpu_place(root_tensor->place())) {
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, thread_place), thread_ptr, memory::Copy(thread_place, thread_ptr, platform::CPUPlace(), root_ptr,
platform::CPUPlace(), root_ptr,
sizeof(T) * root_tensor->numel(), stream); sizeof(T) * root_tensor->numel(), stream);
} else { } else {
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, thread_place), thread_ptr, memory::Copy(thread_place, thread_ptr, root_tensor->place(), root_ptr,
BOOST_GET_CONST(platform::CUDAPlace, root_tensor->place()), sizeof(T) * root_tensor->numel(), stream);
root_ptr, sizeof(T) * root_tensor->numel(), stream);
} }
} }
#endif #endif
...@@ -193,13 +191,11 @@ void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor, ...@@ -193,13 +191,11 @@ void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor,
thread_tensor->mutable_data<T>(root_tensor->dims(), thread_place); thread_tensor->mutable_data<T>(root_tensor->dims(), thread_place);
T* root_ptr = root_tensor->data<T>(); T* root_ptr = root_tensor->data<T>();
if (platform::is_cpu_place(root_tensor->place())) { if (platform::is_cpu_place(root_tensor->place())) {
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, thread_place), thread_ptr, memory::Copy(thread_place, thread_ptr, platform::CPUPlace(), root_ptr,
platform::CPUPlace(), root_ptr,
sizeof(T) * root_tensor->numel()); sizeof(T) * root_tensor->numel());
} else { } else {
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, thread_place), thread_ptr, memory::Copy(thread_place, thread_ptr, root_tensor->place(), root_ptr,
BOOST_GET_CONST(platform::XPUPlace, root_tensor->place()), sizeof(T) * root_tensor->numel());
root_ptr, sizeof(T) * root_tensor->numel());
} }
} }
#endif #endif
...@@ -286,7 +282,7 @@ void HeterXpuTrainer::InitOtherEnv(const ProgramDesc& main_program) { ...@@ -286,7 +282,7 @@ void HeterXpuTrainer::InitOtherEnv(const ProgramDesc& main_program) {
(context->ops_).push_back(local_op_ptr); (context->ops_).push_back(local_op_ptr);
} }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; auto dev_id = place.device;
platform::CUDADeviceGuard guard(dev_id); platform::CUDADeviceGuard guard(dev_id);
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming)); cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming));
...@@ -336,15 +332,14 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request, ...@@ -336,15 +332,14 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request,
_ForEachDataType_(MergeCallback); _ForEachDataType_(MergeCallback);
if (!platform::is_cpu_place(thread_tensor->place())) { if (!platform::is_cpu_place(thread_tensor->place())) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
auto dev_id = auto dev_id = thread_tensor->place().device;
BOOST_GET_CONST(platform::CUDAPlace, thread_tensor->place()).device;
platform::CUDADeviceGuard guard(dev_id); platform::CUDADeviceGuard guard(dev_id);
cudaMemset(thread_tensor->data(), 0, cudaMemset(thread_tensor->data(), 0,
thread_tensor->numel() * SizeOfType(thread_tensor->type())); thread_tensor->numel() * SizeOfType(thread_tensor->type()));
#endif #endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
auto place = thread_tensor->place(); auto place = thread_tensor->place();
auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device; auto dev_id = place.device;
platform::XPUDeviceGuard guard(dev_id); platform::XPUDeviceGuard guard(dev_id);
platform::DeviceContextPool& pool = platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance(); platform::DeviceContextPool::Instance();
...@@ -364,15 +359,14 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request, ...@@ -364,15 +359,14 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request,
merge_var); merge_var);
if (!platform::is_cpu_place(root_tensor->place())) { if (!platform::is_cpu_place(root_tensor->place())) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
auto dev_id = auto dev_id = root_tensor->place().device;
BOOST_GET_CONST(platform::CUDAPlace, root_tensor->place()).device;
platform::CUDADeviceGuard guard(dev_id); platform::CUDADeviceGuard guard(dev_id);
cudaMemset(root_tensor->data(), 0, cudaMemset(root_tensor->data(), 0,
root_tensor->numel() * SizeOfType(root_tensor->type())); root_tensor->numel() * SizeOfType(root_tensor->type()));
#endif #endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
auto place = root_tensor->place(); auto place = root_tensor->place();
auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device; auto dev_id = place.device;
platform::XPUDeviceGuard guard(dev_id); platform::XPUDeviceGuard guard(dev_id);
platform::DeviceContextPool& pool = platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance(); platform::DeviceContextPool::Instance();
...@@ -442,7 +436,7 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request, ...@@ -442,7 +436,7 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
(context->ops_).push_back(local_op_ptr); (context->ops_).push_back(local_op_ptr);
} }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; auto dev_id = place.device;
platform::CUDADeviceGuard guard(dev_id); platform::CUDADeviceGuard guard(dev_id);
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming)); cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming));
......
...@@ -67,7 +67,7 @@ Graph *Pass::Apply(Graph *graph) const { ...@@ -67,7 +67,7 @@ Graph *Pass::Apply(Graph *graph) const {
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
// Clear mkl-dnn cache, // Clear mkl-dnn cache,
// Passes can change params, tensors, so caching need to be discarded // Passes can change params, tensors, so caching need to be discarded
ClearMKLDNNCache(paddle::platform::CPUPlace()); platform::ClearMKLDNNCache(paddle::platform::CPUPlace());
#endif #endif
VLOG(10) << "finish to apply pass " << Type() << " to graph"; VLOG(10) << "finish to apply pass " << Type() << " to graph";
return graph; return graph;
......
...@@ -32,10 +32,8 @@ namespace framework { ...@@ -32,10 +32,8 @@ namespace framework {
inline paddle::optional<platform::CUDAPlace> OptionalCUDAPlace( inline paddle::optional<platform::CUDAPlace> OptionalCUDAPlace(
const paddle::memory::allocation::AllocationPtr &gpu_) { const paddle::memory::allocation::AllocationPtr &gpu_) {
return gpu_ == nullptr return gpu_ == nullptr ? paddle::none
? paddle::none : paddle::optional<platform::CUDAPlace>(gpu_->place());
: paddle::optional<platform::CUDAPlace>(
BOOST_GET_CONST(platform::CUDAPlace, gpu_->place()));
} }
// Vector<T> implements the std::vector interface, and can get Data or // Vector<T> implements the std::vector interface, and can get Data or
...@@ -369,11 +367,11 @@ class Vector { ...@@ -369,11 +367,11 @@ class Vector {
// get cuda ptr. immutable // get cuda ptr. immutable
const T *CUDAData(platform::Place place) const { const T *CUDAData(platform::Place place) const {
{ {
platform::CUDAPlace p(place.GetDeviceId());
auto &mtx = m_.Data().Mutex(); auto &mtx = m_.Data().Mutex();
std::lock_guard<std::mutex> guard(mtx); std::lock_guard<std::mutex> guard(mtx);
auto cuda_place = m_.Data().CUDAPlace(); auto cuda_place = m_.Data().CUDAPlace();
if (cuda_place == paddle::none || if (cuda_place == paddle::none || cuda_place == p) {
cuda_place == BOOST_GET(platform::CUDAPlace, place)) {
return m_.Data().CUDAData(place); return m_.Data().CUDAData(place);
} }
} }
...@@ -385,11 +383,11 @@ class Vector { ...@@ -385,11 +383,11 @@ class Vector {
// get cuda ptr. mutable // get cuda ptr. mutable
T *CUDAMutableData(platform::Place place) { T *CUDAMutableData(platform::Place place) {
{ {
platform::CUDAPlace p(place.GetDeviceId());
auto &mtx = m_.Data().Mutex(); auto &mtx = m_.Data().Mutex();
std::lock_guard<std::mutex> guard(mtx); std::lock_guard<std::mutex> guard(mtx);
auto cuda_place = m_.Data().CUDAPlace(); auto cuda_place = m_.Data().CUDAPlace();
if (cuda_place == paddle::none || if (cuda_place == paddle::none || cuda_place == p) {
cuda_place == BOOST_GET(platform::CUDAPlace, place)) {
return m_.MutableData()->CUDAMutableData(place); return m_.MutableData()->CUDAMutableData(place);
} }
} }
......
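In the Vector hunks above, the caller now builds a typed CUDA place from place.GetDeviceId() and compares it against the cached paddle::optional<platform::CUDAPlace>. A rough standalone analogue of that check is sketched below, with an invented DemoCUDAPlace and std::optional (C++17) standing in for paddle::optional.

#include <iostream>
#include <optional>

// Invented stand-in for platform::CUDAPlace; equality is by device id, which
// is what the cached-place check in Vector::CUDAData relies on.
struct DemoCUDAPlace {
  int device{0};
  bool operator==(const DemoCUDAPlace& other) const {
    return device == other.device;
  }
};

bool CanReuseCachedBuffer(const std::optional<DemoCUDAPlace>& cached,
                          int requested_device_id) {
  DemoCUDAPlace p{requested_device_id};
  // Matches the rewritten condition: reuse when nothing is cached yet or the
  // cached buffer already lives on the requested device.
  return !cached.has_value() || *cached == p;
}

int main() {
  std::optional<DemoCUDAPlace> cached;                            // nothing cached yet
  std::cout << CanReuseCachedBuffer(cached, 0)                    // true
            << CanReuseCachedBuffer(DemoCUDAPlace{1}, 0) << "\n"; // false
  return 0;
}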
...@@ -131,7 +131,7 @@ NaiveExecutor::~NaiveExecutor() { ...@@ -131,7 +131,7 @@ NaiveExecutor::~NaiveExecutor() {
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
// Clear mkl-dnn cache, // Clear mkl-dnn cache,
// this is needed to have mkl-dnn unit tests working // this is needed to have mkl-dnn unit tests working
ClearMKLDNNCache(place_, this); platform::ClearMKLDNNCache(place_, this);
#endif #endif
} }
......
...@@ -43,7 +43,7 @@ class ProfilerGuard { ...@@ -43,7 +43,7 @@ class ProfilerGuard {
void TotalCUDAAllocatedMemorySize(const platform::Place& place) { void TotalCUDAAllocatedMemorySize(const platform::Place& place) {
if (platform::is_gpu_place(place)) { if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto cuda_place = BOOST_GET_CONST(platform::CUDAPlace, place); auto cuda_place = place;
cost_info_->device_memory_bytes = cost_info_->device_memory_bytes =
platform::RecordedGpuMallocSize(cuda_place.device); platform::RecordedGpuMallocSize(cuda_place.device);
#endif #endif
......
...@@ -22,7 +22,7 @@ namespace framework { ...@@ -22,7 +22,7 @@ namespace framework {
size_t OpKernelType::Hash::operator()(const OpKernelType& key) const { size_t OpKernelType::Hash::operator()(const OpKernelType& key) const {
int cur_loc = 0; int cur_loc = 0;
int place = key.place_.which(); int place = static_cast<int>(key.place_.GetType());
cur_loc += OpKernelType::kPlaceBits; cur_loc += OpKernelType::kPlaceBits;
int data_type = static_cast<int>(key.data_type_) << cur_loc; int data_type = static_cast<int>(key.data_type_) << cur_loc;
......
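The OpKernelType::Hash hunk swaps the boost::variant which() index for static_cast<int>(place_.GetType()), so the place contributes its allocation-type tag to the hash instead of a variant index. The sketch below compresses that bit-packing idea; DemoKernelKey and the field widths are made up for illustration and are not the real kPlaceBits constants.

#include <cstddef>
#include <functional>
#include <iostream>

enum class AllocationType { kCPU = 0, kGPU = 1, kXPU = 2 };

struct DemoKernelKey {
  AllocationType place_type;
  int data_type;    // e.g. an index into a dtype table
  int data_layout;  // e.g. NCHW = 0, NHWC = 1
};

std::size_t HashDemoKernelKey(const DemoKernelKey& key) {
  constexpr int kPlaceBits = 4;  // assumed width, not the real constant
  constexpr int kDtypeBits = 8;  // assumed width, not the real constant
  int packed = static_cast<int>(key.place_type);           // tag from GetType()
  packed |= key.data_type << kPlaceBits;                    // next field
  packed |= key.data_layout << (kPlaceBits + kDtypeBits);   // next field
  return std::hash<int>()(packed);
}

int main() {
  DemoKernelKey key{AllocationType::kGPU, 5, 0};
  std::cout << HashDemoKernelKey(key) << "\n";
  return 0;
}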
...@@ -27,7 +27,7 @@ TEST(OpKernelType, ToString) { ...@@ -27,7 +27,7 @@ TEST(OpKernelType, ToString) {
LibraryType::kCUDNN); LibraryType::kCUDNN);
ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type), ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type),
"data_type[float]:data_layout[NCHW]:place[CPUPlace]:library_type[" "data_type[float]:data_layout[NCHW]:place[Place(cpu)]:library_type["
"CUDNN]"); "CUDNN]");
using CUDAPlace = paddle::platform::CUDAPlace; using CUDAPlace = paddle::platform::CUDAPlace;
...@@ -35,7 +35,7 @@ TEST(OpKernelType, ToString) { ...@@ -35,7 +35,7 @@ TEST(OpKernelType, ToString) {
LibraryType::kCUDNN); LibraryType::kCUDNN);
ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type2), ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type2),
"data_type[::paddle::platform::float16]:data_layout[NCHW]:place[" "data_type[::paddle::platform::float16]:data_layout[NCHW]:place["
"CUDAPlace(0)]:library_" "Place(gpu:0)]:library_"
"type[CUDNN]"); "type[CUDNN]");
} }
......
...@@ -210,7 +210,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { ...@@ -210,7 +210,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
"reinstall Paddle with CUDA support.", "reinstall Paddle with CUDA support.",
place)); place));
#else #else
auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; auto dev_id = place.device;
platform::SetDeviceId(dev_id); platform::SetDeviceId(dev_id);
#endif #endif
} else if (platform::is_xpu_place(place)) { } else if (platform::is_xpu_place(place)) {
...@@ -220,7 +220,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { ...@@ -220,7 +220,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
"reinstall Paddle with XPU support.", "reinstall Paddle with XPU support.",
place)); place));
#else #else
auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device; auto dev_id = place.device;
platform::SetXPUDeviceId(dev_id); platform::SetXPUDeviceId(dev_id);
#endif #endif
} else if (platform::is_npu_place(place)) { } else if (platform::is_npu_place(place)) {
...@@ -230,7 +230,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { ...@@ -230,7 +230,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
"reinstall Paddle with NPU support.", "reinstall Paddle with NPU support.",
place)); place));
#else #else
auto dev_id = BOOST_GET_CONST(platform::NPUPlace, place).device; auto dev_id = place.device;
platform::SetNPUDeviceId(dev_id); platform::SetNPUDeviceId(dev_id);
#endif #endif
} else if (platform::is_mlu_place(place)) { } else if (platform::is_mlu_place(place)) {
...@@ -240,7 +240,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { ...@@ -240,7 +240,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
"reinstall Paddle with MLU support.", "reinstall Paddle with MLU support.",
place)); place));
#else #else
auto dev_id = BOOST_GET_CONST(platform::MLUPlace, place).device; auto dev_id = place.device;
platform::SetMLUDeviceId(dev_id); platform::SetMLUDeviceId(dev_id);
#endif #endif
} }
...@@ -1330,7 +1330,7 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { ...@@ -1330,7 +1330,7 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
} }
#endif #endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
if (is_xpu_place(expected_kernel_key.place_) && if (platform::is_xpu_place(expected_kernel_key.place_) &&
(kernel_iter == kernels.end() || (kernel_iter == kernels.end() ||
!paddle::platform::is_xpu_support_op(type_, expected_kernel_key) || !paddle::platform::is_xpu_support_op(type_, expected_kernel_key) ||
paddle::platform::is_in_xpu_black_list(type_))) { paddle::platform::is_in_xpu_black_list(type_))) {
...@@ -1343,7 +1343,7 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { ...@@ -1343,7 +1343,7 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL #ifdef PADDLE_WITH_ASCEND_CL
if (kernel_iter == kernels.end() && if (kernel_iter == kernels.end() &&
is_npu_place(expected_kernel_key.place_)) { platform::is_npu_place(expected_kernel_key.place_)) {
VLOG(3) << "missing NPU kernel: " << type_ VLOG(3) << "missing NPU kernel: " << type_
<< ", expected_kernel_key:" << expected_kernel_key << ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!"; << ", fallbacking to CPU one!";
...@@ -1353,7 +1353,7 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { ...@@ -1353,7 +1353,7 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
#endif #endif
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
if (kernel_iter == kernels.end() && if (kernel_iter == kernels.end() &&
is_mlu_place(expected_kernel_key.place_)) { platform::is_mlu_place(expected_kernel_key.place_)) {
VLOG(3) << "missing MLU kernel: " << type_ VLOG(3) << "missing MLU kernel: " << type_
<< ", expected_kernel_key:" << expected_kernel_key << ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!"; << ", fallbacking to CPU one!";
......
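In OperatorBase::Run above, every backend branch now reads place.device because the unified place exposes one device-id field regardless of whether it names a GPU, XPU, NPU or MLU. A small standalone sketch of that uniform access follows; SetDemoDeviceId, IsGpuPlace and IsXpuPlace are invented stand-ins for the per-backend setters and predicates.

#include <cstdint>
#include <iostream>

enum class AllocationType { kCPU, kGPU, kXPU, kNPU, kMLU };

struct DemoPlace {
  AllocationType type{AllocationType::kCPU};
  int8_t device{0};  // the unified place keeps the id in one small integer field
  AllocationType GetType() const { return type; }
};

bool IsGpuPlace(const DemoPlace& p) { return p.GetType() == AllocationType::kGPU; }
bool IsXpuPlace(const DemoPlace& p) { return p.GetType() == AllocationType::kXPU; }

void SetDemoDeviceId(const DemoPlace& place) {
  // Stand-in for platform::SetDeviceId / SetXPUDeviceId / ...; each branch
  // reads the same field instead of extracting a concrete place type first.
  if (IsGpuPlace(place)) {
    std::cout << "switch to GPU " << static_cast<int>(place.device) << "\n";
  } else if (IsXpuPlace(place)) {
    std::cout << "switch to XPU " << static_cast<int>(place.device) << "\n";
  } else {
    std::cout << "no device switch needed\n";
  }
}

int main() {
  SetDemoDeviceId(DemoPlace{AllocationType::kGPU, 2});
  SetDemoDeviceId(DemoPlace{AllocationType::kXPU, 1});
  return 0;
}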
...@@ -500,11 +500,9 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { ...@@ -500,11 +500,9 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
if (platform::is_gpu_place(place)) { if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (IsFastEagerDeletionModeEnabled()) { if (IsFastEagerDeletionModeEnabled()) {
gc.reset(new UnsafeFastGPUGarbageCollector( gc.reset(new UnsafeFastGPUGarbageCollector(place, max_memory_size));
BOOST_GET_CONST(platform::CUDAPlace, place), max_memory_size));
} else { } else {
gc.reset(new StreamGarbageCollector( gc.reset(new StreamGarbageCollector(place, max_memory_size));
BOOST_GET_CONST(platform::CUDAPlace, place), max_memory_size));
} }
VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; VLOG(10) << "Created " << i << "-th GarbageCollector at " << place;
#else #else
...@@ -515,11 +513,9 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { ...@@ -515,11 +513,9 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
} else if (platform::is_mlu_place(place)) { } else if (platform::is_mlu_place(place)) {
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
if (IsFastEagerDeletionModeEnabled()) { if (IsFastEagerDeletionModeEnabled()) {
gc.reset(new MLUUnsafeFastGarbageCollector( gc.reset(new MLUUnsafeFastGarbageCollector(place, max_memory_size));
BOOST_GET_CONST(platform::MLUPlace, place), max_memory_size));
} else { } else {
gc.reset(new MLUStreamGarbageCollector( gc.reset(new MLUStreamGarbageCollector(place, max_memory_size));
BOOST_GET_CONST(platform::MLUPlace, place), max_memory_size));
} }
VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; VLOG(10) << "Created " << i << "-th GarbageCollector at " << place;
#else #else
...@@ -529,8 +525,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { ...@@ -529,8 +525,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
#endif #endif
} else if (platform::is_xpu_place(place)) { } else if (platform::is_xpu_place(place)) {
#if defined(PADDLE_WITH_XPU) #if defined(PADDLE_WITH_XPU)
gc.reset(new XPUGarbageCollector( gc.reset(new XPUGarbageCollector(place, max_memory_size));
BOOST_GET_CONST(platform::XPUPlace, place), max_memory_size));
VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; VLOG(10) << "Created " << i << "-th GarbageCollector at " << place;
#else #else
PADDLE_THROW(platform::errors::PermissionDenied( PADDLE_THROW(platform::errors::PermissionDenied(
...@@ -538,8 +533,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { ...@@ -538,8 +533,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
"Please recompile or reinstall Paddle with XPU support.")); "Please recompile or reinstall Paddle with XPU support."));
#endif #endif
} else if (platform::is_cpu_place(place)) { } else if (platform::is_cpu_place(place)) {
gc.reset(new CPUGarbageCollector( gc.reset(new CPUGarbageCollector(place, max_memory_size));
BOOST_GET_CONST(platform::CPUPlace, place), max_memory_size));
VLOG(10) << "Created GarbageCollector at " << place; VLOG(10) << "Created GarbageCollector at " << place;
} else { } else {
PADDLE_THROW(platform::errors::PreconditionNotMet( PADDLE_THROW(platform::errors::PreconditionNotMet(
...@@ -609,10 +603,9 @@ void InitP2P(const std::vector<platform::Place> &places) { ...@@ -609,10 +603,9 @@ void InitP2P(const std::vector<platform::Place> &places) {
std::vector<int> devices; std::vector<int> devices;
for (int i = 0; i < count; i++) { for (int i = 0; i < count; i++) {
if (!is_gpu_place(places[i])) return; if (!platform::is_gpu_place(places[i])) return;
platform::CUDAPlace device = platform::CUDAPlace device = places[i];
BOOST_GET_CONST(platform::CUDAPlace, places[i]);
devices.push_back(device.GetDeviceId()); devices.push_back(device.GetDeviceId());
} }
...@@ -655,9 +648,9 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places, ...@@ -655,9 +648,9 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
const BuildStrategy &build_strategy, const BuildStrategy &build_strategy,
ir::Graph *graph) ir::Graph *graph)
: member_(new ParallelExecutorPrivate(places, scope)) { : member_(new ParallelExecutorPrivate(places, scope)) {
PADDLE_ENFORCE(places.size() > 0 && !is_npu_place(places[0]), PADDLE_ENFORCE_EQ(places.size() > 0 && !platform::is_npu_place(places[0]),
platform::errors::Unavailable( true, platform::errors::Unavailable(
"NPU is not supported in ParallelExecutor")); "NPU is not supported in ParallelExecutor."));
InitP2P(places); InitP2P(places);
ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_), ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_),
member_->places_.size()); member_->places_.size());
......
...@@ -135,13 +135,11 @@ void PullDenseWorker::Wait(std::vector<::std::future<int32_t>>* status_vec) { ...@@ -135,13 +135,11 @@ void PullDenseWorker::Wait(std::vector<::std::future<int32_t>>* status_vec) {
LoDTensor* tensor = var->GetMutable<LoDTensor>(); LoDTensor* tensor = var->GetMutable<LoDTensor>();
float* w = tensor->data<float>(); float* w = tensor->data<float>();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, places_[i]), w, memory::Copy(places_[i], w, platform::CUDAPinnedPlace(), pin_w,
platform::CUDAPinnedPlace(), pin_w,
sizeof(float) * tensor->numel(), copy_streams_[i]); sizeof(float) * tensor->numel(), copy_streams_[i]);
#endif #endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, places_[i]), w, memory::Copy(places_[i], w, platform::CPUPlace(), pin_w,
platform::CPUPlace(), pin_w,
sizeof(float) * tensor->numel()); sizeof(float) * tensor->numel());
#endif #endif
} }
......
...@@ -224,23 +224,20 @@ void SectionWorker::TrainFiles() { ...@@ -224,23 +224,20 @@ void SectionWorker::TrainFiles() {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(place_)) { if (platform::is_gpu_place(place_)) {
if (IsFastEagerDeletionModeEnabled()) { if (IsFastEagerDeletionModeEnabled()) {
gc.reset(new UnsafeFastGPUGarbageCollector( gc.reset(new UnsafeFastGPUGarbageCollector(place_, max_memory_size));
BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size));
} }
} }
#elif defined(PADDLE_WITH_ASCEND_CL) #elif defined(PADDLE_WITH_ASCEND_CL)
if (IsFastEagerDeletionModeEnabled()) { if (IsFastEagerDeletionModeEnabled()) {
VLOG(4) << "Use unsafe fast gc for NPU."; VLOG(4) << "Use unsafe fast gc for NPU.";
gc.reset(new NPUUnsafeFastGarbageCollector( gc.reset(new NPUUnsafeFastGarbageCollector(place_, max_memory_size));
BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size));
} else { } else {
PADDLE_THROW(platform::errors::Unimplemented( PADDLE_THROW(platform::errors::Unimplemented(
"Please set FLAGS_fast_eager_deletion_mode=true to use " "Please set FLAGS_fast_eager_deletion_mode=true to use "
"GarbageCollector on NPU.")); "GarbageCollector on NPU."));
// TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector. // TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector.
VLOG(4) << "Use default stream gc for NPU."; VLOG(4) << "Use default stream gc for NPU.";
gc.reset(new NPUDefaultStreamGarbageCollector( gc.reset(new NPUDefaultStreamGarbageCollector(place_, max_memory_size));
BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size));
} }
#endif #endif
} // max_memory_size >= 0 } // max_memory_size >= 0
......
...@@ -25,13 +25,7 @@ limitations under the License. */ ...@@ -25,13 +25,7 @@ limitations under the License. */
#include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/framework/rw_lock.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace platform {
class DeviceContext;
class Place;
} // namespace platform
} // namespace paddle
namespace paddle { namespace paddle {
namespace framework { namespace framework {
......
...@@ -153,14 +153,12 @@ void TensorFromArray(const T* src, const size_t& array_size, ...@@ -153,14 +153,12 @@ void TensorFromArray(const T* src, const size_t& array_size,
auto size = array_size * sizeof(T); auto size = array_size * sizeof(T);
if (platform::is_cpu_place(dst_place)) { if (platform::is_cpu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
src_place, src_ptr, size);
} }
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_gpu_place(dst_place)) { // NOLINT else if (platform::is_gpu_place(dst_place)) { // NOLINT
memory::Copy( memory::Copy(
BOOST_GET_CONST(platform::CUDAPlace, dst_place), dst_ptr, src_place, dst_place, dst_ptr, src_place, src_ptr, size,
src_ptr, size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()); reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
} }
#endif #endif
...@@ -176,8 +174,7 @@ void TensorFromArray(const T* src, const size_t& array_size, ...@@ -176,8 +174,7 @@ void TensorFromArray(const T* src, const size_t& array_size,
// 2. async copy npu pinned tensor -> npu tensor // 2. async copy npu pinned tensor -> npu tensor
memory::Copy( memory::Copy(
BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, dst_place, dst_ptr, npu_pinned_place, npu_pinned_ptr, size,
npu_pinned_place, npu_pinned_ptr, size,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream()); reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
// 3. record event // 3. record event
...@@ -205,14 +202,12 @@ void TensorFromVector(const std::vector<T>& src, ...@@ -205,14 +202,12 @@ void TensorFromVector(const std::vector<T>& src,
auto size = src.size() * sizeof(T); auto size = src.size() * sizeof(T);
if (platform::is_cpu_place(dst_place)) { if (platform::is_cpu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
src_place, src_ptr, size);
} }
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_gpu_place(dst_place)) { // NOLINT else if (platform::is_gpu_place(dst_place)) { // NOLINT
memory::Copy( memory::Copy(
BOOST_GET_CONST(platform::CUDAPlace, dst_place), dst_ptr, src_place, dst_place, dst_ptr, src_place, src_ptr, size,
src_ptr, size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()); reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
} }
#endif #endif
...@@ -233,8 +228,7 @@ void TensorFromVector(const std::vector<T>& src, ...@@ -233,8 +228,7 @@ void TensorFromVector(const std::vector<T>& src,
// 2. async copy npu pinned tensor -> npu tensor // 2. async copy npu pinned tensor -> npu tensor
memory::Copy( memory::Copy(
BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, dst_place, dst_ptr, npu_pinned_place, npu_pinned_ptr, size,
npu_pinned_place, npu_pinned_ptr, size,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream()); reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
// 3. record event // 3. record event
...@@ -252,8 +246,7 @@ void TensorFromVector(const std::vector<T>& src, ...@@ -252,8 +246,7 @@ void TensorFromVector(const std::vector<T>& src,
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
if (platform::is_mlu_place(dst_place)) { if (platform::is_mlu_place(dst_place)) {
memory::Copy( memory::Copy(
BOOST_GET_CONST(platform::MLUPlace, dst_place), dst_ptr, src_place, dst_place, dst_ptr, src_place, src_ptr, size,
src_ptr, size,
reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream()); reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream());
} }
#endif #endif
...@@ -280,14 +273,12 @@ inline void TensorFromVector(const std::vector<bool>& src, ...@@ -280,14 +273,12 @@ inline void TensorFromVector(const std::vector<bool>& src,
auto size = src.size() * sizeof(bool); auto size = src.size() * sizeof(bool);
if (platform::is_cpu_place(dst_place)) { if (platform::is_cpu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
src_place, src_ptr, size);
} }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
else if (platform::is_gpu_place(dst_place)) { // NOLINT else if (platform::is_gpu_place(dst_place)) { // NOLINT
memory::Copy( memory::Copy(
BOOST_GET_CONST(platform::CUDAPlace, dst_place), dst_ptr, src_place, dst_place, dst_ptr, src_place, src_ptr, size,
src_ptr, size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()); reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
} }
#endif #endif
...@@ -303,8 +294,7 @@ inline void TensorFromVector(const std::vector<bool>& src, ...@@ -303,8 +294,7 @@ inline void TensorFromVector(const std::vector<bool>& src,
// 2. async copy npu pinned tensor -> npu tensor // 2. async copy npu pinned tensor -> npu tensor
memory::Copy( memory::Copy(
BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, dst_place, dst_ptr, npu_pinned_place, npu_pinned_ptr, size,
npu_pinned_place, npu_pinned_ptr, size,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream()); reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
// 3. record event // 3. record event
...@@ -362,37 +352,29 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx, ...@@ -362,37 +352,29 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx,
auto dst_ptr = static_cast<void*>(dst->data()); auto dst_ptr = static_cast<void*>(dst->data());
if (platform::is_cpu_place(src.place())) { if (platform::is_cpu_place(src.place())) {
memory::Copy(dst_place, dst_ptr, memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr,
size);
} }
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_gpu_place(src.place())) { // NOLINT else if (platform::is_gpu_place(src.place())) { // NOLINT
memory::Copy( memory::Copy(
dst_place, dst_ptr, BOOST_GET_CONST(platform::CUDAPlace, src.place()), dst_place, dst_ptr, src.place(), src_ptr, size,
src_ptr, size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()); reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
} }
#endif #endif
#if defined(PADDLE_WITH_XPU) #if defined(PADDLE_WITH_XPU)
else if (platform::is_xpu_place(src.place())) { // NOLINT else if (platform::is_xpu_place(src.place())) { // NOLINT
memory::Copy(dst_place, dst_ptr, memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
BOOST_GET_CONST(platform::XPUPlace, src.place()), src_ptr,
size);
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL #ifdef PADDLE_WITH_ASCEND_CL
else if (platform::is_npu_place(src.place())) { // NOLINT else if (platform::is_npu_place(src.place())) { // NOLINT
memory::Copy(dst_place, dst_ptr, memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
BOOST_GET_CONST(platform::NPUPlace, src.place()), src_ptr,
size, nullptr);
} }
#endif #endif
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
else if (platform::is_mlu_place(src.place())) { // NOLINT else if (platform::is_mlu_place(src.place())) { // NOLINT
memory::Copy( memory::Copy(
dst_place, dst_ptr, BOOST_GET_CONST(platform::MLUPlace, src.place()), dst_place, dst_ptr, src.place(), src_ptr, size,
src_ptr, size,
reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream()); reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream());
} }
#endif #endif
...@@ -412,37 +394,29 @@ inline void TensorToVector(const Tensor& src, ...@@ -412,37 +394,29 @@ inline void TensorToVector(const Tensor& src,
auto dst_ptr = static_cast<void*>(array); auto dst_ptr = static_cast<void*>(array);
if (platform::is_cpu_place(src.place())) { if (platform::is_cpu_place(src.place())) {
memory::Copy(dst_place, dst_ptr, memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr,
size);
} }
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_gpu_place(src.place())) { // NOLINT else if (platform::is_gpu_place(src.place())) { // NOLINT
memory::Copy( memory::Copy(
dst_place, dst_ptr, BOOST_GET_CONST(platform::CUDAPlace, src.place()), dst_place, dst_ptr, src.place(), src_ptr, size,
src_ptr, size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()); reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
} }
#endif #endif
#if defined(PADDLE_WITH_XPU) #if defined(PADDLE_WITH_XPU)
else if (platform::is_xpu_place(src.place())) { // NOLINT else if (platform::is_xpu_place(src.place())) { // NOLINT
memory::Copy(dst_place, dst_ptr, memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
BOOST_GET_CONST(platform::XPUPlace, src.place()), src_ptr,
size);
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL #ifdef PADDLE_WITH_ASCEND_CL
else if (platform::is_npu_place(src.place())) { // NOLINT else if (platform::is_npu_place(src.place())) { // NOLINT
memory::Copy(dst_place, dst_ptr, memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
BOOST_GET_CONST(platform::NPUPlace, src.place()), src_ptr,
size, nullptr);
} }
#endif #endif
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
else if (platform::is_mlu_place(src.place())) { // NOLINT else if (platform::is_mlu_place(src.place())) { // NOLINT
memory::Copy( memory::Copy(
dst_place, dst_ptr, BOOST_GET_CONST(platform::MLUPlace, src.place()), dst_place, dst_ptr, src.place(), src_ptr, size,
src_ptr, size,
reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream()); reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream());
} }
#endif #endif
...@@ -467,8 +441,7 @@ void TensorToVector(const Tensor& src, std::vector<T>* dst) { ...@@ -467,8 +441,7 @@ void TensorToVector(const Tensor& src, std::vector<T>* dst) {
"The input tensor should be CPU device, but actually it is in %s.", "The input tensor should be CPU device, but actually it is in %s.",
src.place())); src.place()));
memory::Copy(dst_place, dst_ptr, memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr, size);
} }
template <> template <>
...@@ -488,8 +461,7 @@ inline void TensorToVector(const Tensor& src, std::vector<bool>* dst) { ...@@ -488,8 +461,7 @@ inline void TensorToVector(const Tensor& src, std::vector<bool>* dst) {
"The input tensor should be CPU device, but actually it is in %s.", "The input tensor should be CPU device, but actually it is in %s.",
src.place())); src.place()));
memory::Copy(dst_place, dst_ptr, memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr, size);
for (unsigned int i = 0; i < src.numel(); i++) { for (unsigned int i = 0; i < src.numel(); i++) {
(*dst)[i] = static_cast<bool>(array[i]); (*dst)[i] = static_cast<bool>(array[i]);
......
...@@ -86,7 +86,7 @@ void BKCLParallelContext::Init() { ...@@ -86,7 +86,7 @@ void BKCLParallelContext::Init() {
} }
BcastBKCLId(bkcl_ids, 0); BcastBKCLId(bkcl_ids, 0);
int xpu_id = BOOST_GET_CONST(platform::XPUPlace, place_).device; int xpu_id = place_.device;
for (int ring_id = 0; ring_id < strategy_.nrings_; ring_id++) { for (int ring_id = 0; ring_id < strategy_.nrings_; ring_id++) {
VLOG(0) << "init BKCL context nranks: " << strategy_.nranks_ VLOG(0) << "init BKCL context nranks: " << strategy_.nranks_
<< " local rank: " << strategy_.local_rank_ << " xpu id: " << xpu_id << " local rank: " << strategy_.local_rank_ << " xpu id: " << xpu_id
...@@ -111,7 +111,7 @@ void BKCLParallelContext::InitWithRingID(int ring_id) { ...@@ -111,7 +111,7 @@ void BKCLParallelContext::InitWithRingID(int ring_id) {
} }
BcastBKCLId(bkcl_ids, 0); BcastBKCLId(bkcl_ids, 0);
int xpu_id = BOOST_GET_CONST(platform::XPUPlace, place_).device; int xpu_id = place_.device;
VLOG(0) << "init BKCL context nranks: " << strategy_.nranks_ VLOG(0) << "init BKCL context nranks: " << strategy_.nranks_
<< " local rank: " << strategy_.local_rank_ << " xpu id: " << xpu_id << " local rank: " << strategy_.local_rank_ << " xpu id: " << xpu_id
<< " ring id: " << ring_id; << " ring id: " << ring_id;
......
...@@ -78,7 +78,7 @@ class TensorAddFunctor : public boost::static_visitor<> { ...@@ -78,7 +78,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
TensorAddFunctor(int64_t numel, const T* x, T* y) TensorAddFunctor(int64_t numel, const T* x, T* y)
: numel_(numel), x_(x), y_(y) {} : numel_(numel), x_(x), y_(y) {}
void operator()(const platform::CPUPlace& place) { void operator()(const platform::CPUPlace& place) const {
platform::CPUDeviceContext* ctx = dynamic_cast<platform::CPUDeviceContext*>( platform::CPUDeviceContext* ctx = dynamic_cast<platform::CPUDeviceContext*>(
platform::DeviceContextPool::Instance().Get(place)); platform::DeviceContextPool::Instance().Get(place));
auto blas = operators::math::GetBlas<platform::CPUDeviceContext, T>(*ctx); auto blas = operators::math::GetBlas<platform::CPUDeviceContext, T>(*ctx);
...@@ -86,7 +86,7 @@ class TensorAddFunctor : public boost::static_visitor<> { ...@@ -86,7 +86,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
} }
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
void operator()(const platform::XPUPlace& place) { void operator()(const platform::XPUPlace& place) const {
using XPUType = typename XPUTypeTrait<T>::Type; using XPUType = typename XPUTypeTrait<T>::Type;
platform::XPUDeviceContext* ctx = dynamic_cast<platform::XPUDeviceContext*>( platform::XPUDeviceContext* ctx = dynamic_cast<platform::XPUDeviceContext*>(
platform::DeviceContextPool::Instance().Get(place)); platform::DeviceContextPool::Instance().Get(place));
...@@ -100,7 +100,7 @@ class TensorAddFunctor : public boost::static_visitor<> { ...@@ -100,7 +100,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
r, XPUAPIErrorMsg[r])); r, XPUAPIErrorMsg[r]));
} }
#else #else
void operator()(const platform::XPUPlace& place) { void operator()(const platform::XPUPlace& place) const {
PADDLE_THROW(platform::errors::PermissionDenied( PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) " "Gradient accumulation on place (%s) "
"is not supported in imperative mode", "is not supported in imperative mode",
...@@ -109,7 +109,7 @@ class TensorAddFunctor : public boost::static_visitor<> { ...@@ -109,7 +109,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
#endif #endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void operator()(const platform::CUDAPlace& place) { void operator()(const platform::CUDAPlace& place) const {
platform::CUDADeviceContext* ctx = platform::CUDADeviceContext* ctx =
dynamic_cast<platform::CUDADeviceContext*>( dynamic_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(place)); platform::DeviceContextPool::Instance().Get(place));
...@@ -117,7 +117,7 @@ class TensorAddFunctor : public boost::static_visitor<> { ...@@ -117,7 +117,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
blas.AXPY(numel_, 1., x_, y_); blas.AXPY(numel_, 1., x_, y_);
} }
#else #else
void operator()(const platform::CUDAPlace& place) { void operator()(const platform::CUDAPlace& place) const {
PADDLE_THROW(platform::errors::PermissionDenied( PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) " "Gradient accumulation on place (%s) "
"is not supported in imperative mode", "is not supported in imperative mode",
...@@ -126,7 +126,7 @@ class TensorAddFunctor : public boost::static_visitor<> { ...@@ -126,7 +126,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
#endif #endif
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
void operator()(const platform::MLUPlace& place) { void operator()(const platform::MLUPlace& place) const {
// TODO(fwg): SUPPORT it // TODO(fwg): SUPPORT it
PADDLE_THROW(platform::errors::PermissionDenied( PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) " "Gradient accumulation on place (%s) "
...@@ -134,7 +134,7 @@ class TensorAddFunctor : public boost::static_visitor<> { ...@@ -134,7 +134,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
place)); place));
} }
#else #else
void operator()(const platform::MLUPlace& place) { void operator()(const platform::MLUPlace& place) const {
PADDLE_THROW(platform::errors::PermissionDenied( PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) " "Gradient accumulation on place (%s) "
"is not supported in imperative mode", "is not supported in imperative mode",
...@@ -143,7 +143,7 @@ class TensorAddFunctor : public boost::static_visitor<> { ...@@ -143,7 +143,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL #ifdef PADDLE_WITH_ASCEND_CL
void operator()(const platform::NPUPlace& place) { void operator()(const platform::NPUPlace& place) const {
// TODO(zhiqiu): SUPPORT it // TODO(zhiqiu): SUPPORT it
PADDLE_THROW(platform::errors::PermissionDenied( PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) " "Gradient accumulation on place (%s) "
...@@ -151,7 +151,7 @@ class TensorAddFunctor : public boost::static_visitor<> { ...@@ -151,7 +151,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
place)); place));
} }
#else #else
void operator()(const platform::NPUPlace& place) { void operator()(const platform::NPUPlace& place) const {
PADDLE_THROW(platform::errors::PermissionDenied( PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) " "Gradient accumulation on place (%s) "
"is not supported in imperative mode", "is not supported in imperative mode",
...@@ -159,21 +159,21 @@ class TensorAddFunctor : public boost::static_visitor<> { ...@@ -159,21 +159,21 @@ class TensorAddFunctor : public boost::static_visitor<> {
} }
#endif #endif
void operator()(const platform::NPUPinnedPlace& place) { void operator()(const platform::NPUPinnedPlace& place) const {
PADDLE_THROW(platform::errors::PermissionDenied( PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) " "Gradient accumulation on place (%s) "
"is not supported in imperative mode", "is not supported in imperative mode",
place)); place));
} }
// there is NO blas in CUDAPinnedPlace // there is NO blas in CUDAPinnedPlace
void operator()(const platform::CUDAPinnedPlace& place) { void operator()(const platform::CUDAPinnedPlace& place) const {
PADDLE_THROW(platform::errors::PermissionDenied( PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) " "Gradient accumulation on place (%s) "
"is not supported in imperative mode", "is not supported in imperative mode",
place)); place));
} }
// there is NO support in IPUPlace // there is NO support in IPUPlace
void operator()(const platform::IPUPlace& place) { void operator()(const platform::IPUPlace& place) const {
PADDLE_THROW(platform::errors::PermissionDenied( PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) " "Gradient accumulation on place (%s) "
"is not supported in imperative mode", "is not supported in imperative mode",
...@@ -183,7 +183,7 @@ class TensorAddFunctor : public boost::static_visitor<> { ...@@ -183,7 +183,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
private: private:
int64_t numel_; int64_t numel_;
const T* x_; const T* x_;
T* y_; mutable T* y_;
}; };
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
...@@ -248,7 +248,7 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) { ...@@ -248,7 +248,7 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) {
TensorAddFunctor<cpp_type> func( \ TensorAddFunctor<cpp_type> func( \
numel, src_tensor.data<cpp_type>(), \ numel, src_tensor.data<cpp_type>(), \
dst_tensor->mutable_data<cpp_type>(place)); \ dst_tensor->mutable_data<cpp_type>(place)); \
boost::apply_visitor(func, place); \ platform::VisitPlace(place, func); \
return; \ return; \
} }
......
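The TensorAddFunctor hunk above swaps boost::apply_visitor(func, place) for platform::VisitPlace(place, func) and marks every operator() overload const (with y_ made mutable). A minimal sketch of why, assuming stand-in types (Place, CPUPlace, GPUPlace, VisitPlace, AxpyFunctor) that are not Paddle's real declarations:

#include <cstdint>
#include <cstdio>

struct CPUPlace {};
struct GPUPlace { int device; };

// Simplified stand-in for the unified place type.
struct Place {
  enum class Kind { kCPU, kGPU };
  Kind kind;
  int device;
};

// Stand-in for platform::VisitPlace: takes the visitor by const reference
// and forwards to the overload matching the runtime place kind.
template <typename Visitor>
void VisitPlace(const Place& place, const Visitor& visitor) {
  if (place.kind == Place::Kind::kCPU) {
    visitor(CPUPlace{});
  } else {
    visitor(GPUPlace{place.device});
  }
}

// Mirrors the TensorAddFunctor change: the overloads are const so the functor
// can be invoked through the const reference above.
struct AxpyFunctor {
  AxpyFunctor(int64_t numel, const float* x, float* y)
      : numel_(numel), x_(x), y_(y) {}

  void operator()(const CPUPlace&) const {
    for (int64_t i = 0; i < numel_; ++i) y_[i] += x_[i];
  }
  void operator()(const GPUPlace& p) const {
    std::printf("would launch an AXPY kernel on device %d\n", p.device);
  }

 private:
  int64_t numel_;
  const float* x_;
  // mutable mirrors the diff: it keeps the pointer assignable from the const
  // overloads (writing through it would be allowed either way).
  mutable float* y_;
};

int main() {
  float x[3] = {1.f, 2.f, 3.f};
  float y[3] = {1.f, 1.f, 1.f};
  VisitPlace(Place{Place::Kind::kCPU, 0}, AxpyFunctor(3, x, y));
  std::printf("%g %g %g\n", y[0], y[1], y[2]);  // prints 2 3 4
  return 0;
}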
...@@ -86,7 +86,7 @@ void HCCLParallelContext::Init() { ...@@ -86,7 +86,7 @@ void HCCLParallelContext::Init() {
} }
BcastHCCLId(hccl_ids, 0, server_fd); BcastHCCLId(hccl_ids, 0, server_fd);
int npu_id = BOOST_GET_CONST(platform::NPUPlace, place_).device; int npu_id = place_.device;
for (int ring_id = 0; ring_id < strategy_.nrings_; ring_id++) { for (int ring_id = 0; ring_id < strategy_.nrings_; ring_id++) {
VLOG(0) << "init hccl context nranks: " << strategy_.nranks_ VLOG(0) << "init hccl context nranks: " << strategy_.nranks_
<< " local rank: " << strategy_.local_rank_ << " npu id: " << npu_id << " local rank: " << strategy_.local_rank_ << " npu id: " << npu_id
...@@ -96,10 +96,10 @@ void HCCLParallelContext::Init() { ...@@ -96,10 +96,10 @@ void HCCLParallelContext::Init() {
&hccl_ids[ring_id], strategy_.nranks_, strategy_.local_rank_, npu_id, &hccl_ids[ring_id], strategy_.nranks_, strategy_.local_rank_, npu_id,
ring_id); ring_id);
compute_events_.emplace_back(platform::NpuEventResourcePool::Instance().New( compute_events_.emplace_back(
BOOST_GET_CONST(platform::NPUPlace, place_).device)); platform::NpuEventResourcePool::Instance().New(place_.device));
comm_events_.emplace_back(platform::NpuEventResourcePool::Instance().New( comm_events_.emplace_back(
BOOST_GET_CONST(platform::NPUPlace, place_).device)); platform::NpuEventResourcePool::Instance().New(place_.device));
} }
} }
...@@ -117,7 +117,7 @@ void HCCLParallelContext::InitWithRingID(int ring_id) { ...@@ -117,7 +117,7 @@ void HCCLParallelContext::InitWithRingID(int ring_id) {
} }
BcastHCCLId(hccl_ids, 0, server_fd); BcastHCCLId(hccl_ids, 0, server_fd);
int npu_id = BOOST_GET_CONST(platform::NPUPlace, place_).device; int npu_id = place_.device;
VLOG(0) << "init hccl context nranks: " << strategy_.nranks_ VLOG(0) << "init hccl context nranks: " << strategy_.nranks_
<< " local rank: " << strategy_.local_rank_ << " npu id: " << npu_id << " local rank: " << strategy_.local_rank_ << " npu id: " << npu_id
<< " ring id: " << ring_id; << " ring id: " << ring_id;
...@@ -125,10 +125,10 @@ void HCCLParallelContext::InitWithRingID(int ring_id) { ...@@ -125,10 +125,10 @@ void HCCLParallelContext::InitWithRingID(int ring_id) {
platform::HCCLCommContext::Instance().CreateHCCLComm( platform::HCCLCommContext::Instance().CreateHCCLComm(
&hccl_ids[0], strategy_.nranks_, strategy_.local_rank_, npu_id, ring_id); &hccl_ids[0], strategy_.nranks_, strategy_.local_rank_, npu_id, ring_id);
compute_events_.emplace_back(platform::NpuEventResourcePool::Instance().New( compute_events_.emplace_back(
BOOST_GET_CONST(platform::NPUPlace, place_).device)); platform::NpuEventResourcePool::Instance().New(place_.device));
comm_events_.emplace_back(platform::NpuEventResourcePool::Instance().New( comm_events_.emplace_back(
BOOST_GET_CONST(platform::NPUPlace, place_).device)); platform::NpuEventResourcePool::Instance().New(place_.device));
} }
void HCCLParallelContext::AllReduceByStream(const framework::Variable &src, void HCCLParallelContext::AllReduceByStream(const framework::Variable &src,
......
...@@ -193,7 +193,7 @@ void VarBase::ClearGradient(bool set_to_zero) { ...@@ -193,7 +193,7 @@ void VarBase::ClearGradient(bool set_to_zero) {
grad_var_->MutableVar()->GetMutable<framework::SelectedRows>(); grad_var_->MutableVar()->GetMutable<framework::SelectedRows>();
if (grad_t->mutable_value()->IsInitialized()) { if (grad_t->mutable_value()->IsInitialized()) {
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
if (FLAGS_use_mkldnn) ClearMKLDNNCache(grad_t->place()); if (FLAGS_use_mkldnn) platform::ClearMKLDNNCache(grad_t->place());
#endif #endif
grad_t->mutable_rows()->clear(); grad_t->mutable_rows()->clear();
grad_t->mutable_value()->clear(); grad_t->mutable_value()->clear();
...@@ -211,7 +211,7 @@ void VarBase::ClearGradient(bool set_to_zero) { ...@@ -211,7 +211,7 @@ void VarBase::ClearGradient(bool set_to_zero) {
grad_t->clear(); grad_t->clear();
} }
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
if (FLAGS_use_mkldnn) ClearMKLDNNCache(grad_t->place()); if (FLAGS_use_mkldnn) platform::ClearMKLDNNCache(grad_t->place());
#endif #endif
} }
} }
......
...@@ -77,7 +77,7 @@ void NCCLParallelContext::Init() { ...@@ -77,7 +77,7 @@ void NCCLParallelContext::Init() {
} }
BcastNCCLId(nccl_ids, 0, server_fd); BcastNCCLId(nccl_ids, 0, server_fd);
int gpu_id = BOOST_GET_CONST(platform::CUDAPlace, place_).device; int gpu_id = place_.device;
for (int ring_id = 0; ring_id < strategy_.nrings_; ring_id++) { for (int ring_id = 0; ring_id < strategy_.nrings_; ring_id++) {
VLOG(0) << "init nccl context nranks: " << strategy_.nranks_ VLOG(0) << "init nccl context nranks: " << strategy_.nranks_
<< " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id << " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id
...@@ -88,10 +88,9 @@ void NCCLParallelContext::Init() { ...@@ -88,10 +88,9 @@ void NCCLParallelContext::Init() {
ring_id); ring_id);
compute_events_.emplace_back( compute_events_.emplace_back(
platform::CudaEventResourcePool::Instance().New( platform::CudaEventResourcePool::Instance().New(place_.device));
BOOST_GET_CONST(platform::CUDAPlace, place_).device)); comm_events_.emplace_back(
comm_events_.emplace_back(platform::CudaEventResourcePool::Instance().New( platform::CudaEventResourcePool::Instance().New(place_.device));
BOOST_GET_CONST(platform::CUDAPlace, place_).device));
} }
} }
...@@ -111,7 +110,7 @@ void NCCLParallelContext::InitWithRingID(int ring_id) { ...@@ -111,7 +110,7 @@ void NCCLParallelContext::InitWithRingID(int ring_id) {
} }
BcastNCCLId(nccl_ids, 0, server_fd); BcastNCCLId(nccl_ids, 0, server_fd);
int gpu_id = BOOST_GET_CONST(platform::CUDAPlace, place_).device; int gpu_id = place_.device;
VLOG(0) << "init nccl context nranks: " << strategy_.nranks_ VLOG(0) << "init nccl context nranks: " << strategy_.nranks_
<< " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id << " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id
<< " ring id: " << ring_id; << " ring id: " << ring_id;
...@@ -119,10 +118,10 @@ void NCCLParallelContext::InitWithRingID(int ring_id) { ...@@ -119,10 +118,10 @@ void NCCLParallelContext::InitWithRingID(int ring_id) {
platform::NCCLCommContext::Instance().CreateComm( platform::NCCLCommContext::Instance().CreateComm(
&nccl_ids[0], strategy_.nranks_, strategy_.local_rank_, gpu_id, ring_id); &nccl_ids[0], strategy_.nranks_, strategy_.local_rank_, gpu_id, ring_id);
compute_events_.emplace_back(platform::CudaEventResourcePool::Instance().New( compute_events_.emplace_back(
BOOST_GET_CONST(platform::CUDAPlace, place_).device)); platform::CudaEventResourcePool::Instance().New(place_.device));
comm_events_.emplace_back(platform::CudaEventResourcePool::Instance().New( comm_events_.emplace_back(
BOOST_GET_CONST(platform::CUDAPlace, place_).device)); platform::CudaEventResourcePool::Instance().New(place_.device));
} }
void NCCLParallelContext::AllReduceByStream(const framework::Variable &src, void NCCLParallelContext::AllReduceByStream(const framework::Variable &src,
......
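In the BKCL/HCCL/NCCL context hunks, BOOST_GET_CONST(platform::XPUPlace/NPUPlace/CUDAPlace, place_).device collapses to place_.device because the unified place carries the device id as an ordinary field. A tiny sketch of that shape, with Place as a simplified stand-in rather than the real pten::Place:

#include <cstdio>

// One place type for every backend, so the device id is a plain field instead
// of something recovered from a boost::variant.
struct Place {
  enum class Type { kCPU, kGPU, kXPU, kNPU };
  Type type;
  int device;
  int GetDeviceId() const { return device; }
};

int main() {
  Place gpu_place{Place::Type::kGPU, 3};
  Place npu_place{Place::Type::kNPU, 1};
  // Before: BOOST_GET_CONST(platform::CUDAPlace, place).device
  // After:  place.device (or place.GetDeviceId()), the same for every backend.
  std::printf("gpu id %d, npu id %d\n", gpu_place.device,
              npu_place.GetDeviceId());
  return 0;
}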
...@@ -194,7 +194,7 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins, ...@@ -194,7 +194,7 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
auto& kernels = kernels_iter->second; auto& kernels = kernels_iter->second;
auto kernel_iter = kernels.find(expected_kernel_key); auto kernel_iter = kernels.find(expected_kernel_key);
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
if (is_xpu_place(expected_kernel_key.place_) && if (paddle::platform::is_xpu_place(expected_kernel_key.place_) &&
(kernel_iter == kernels.end() || (kernel_iter == kernels.end() ||
!paddle::platform::is_xpu_support_op(op.Type(), expected_kernel_key) || !paddle::platform::is_xpu_support_op(op.Type(), expected_kernel_key) ||
paddle::platform::is_in_xpu_black_list(op.Type()))) { paddle::platform::is_in_xpu_black_list(op.Type()))) {
...@@ -207,7 +207,7 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins, ...@@ -207,7 +207,7 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL #ifdef PADDLE_WITH_ASCEND_CL
if (kernel_iter == kernels.end() && if (kernel_iter == kernels.end() &&
is_npu_place(expected_kernel_key.place_)) { paddle::platform::is_npu_place(expected_kernel_key.place_)) {
VLOG(3) << "missing NPU kernel: " << op.Type() VLOG(3) << "missing NPU kernel: " << op.Type()
<< ", expected_kernel_key:" << expected_kernel_key << ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!"; << ", fallbacking to CPU one!";
...@@ -217,7 +217,7 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins, ...@@ -217,7 +217,7 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
#endif #endif
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
if (kernel_iter == kernels.end() && if (kernel_iter == kernels.end() &&
is_mlu_place(expected_kernel_key.place_)) { paddle::platform::is_mlu_place(expected_kernel_key.place_)) {
VLOG(3) << "missing MLU kernel: " << op.Type() VLOG(3) << "missing MLU kernel: " << op.Type()
<< ", expected_kernel_key:" << expected_kernel_key << ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!"; << ", fallbacking to CPU one!";
......
...@@ -835,7 +835,7 @@ void Reducer::MarkGroupReady(size_t group_index) { ...@@ -835,7 +835,7 @@ void Reducer::MarkGroupReady(size_t group_index) {
// thrown in comm_pool_. // thrown in comm_pool_.
auto next_group = next_group_; auto next_group = next_group_;
comm_pool_->enqueue([this, run_order, next_group, &group] { comm_pool_->enqueue([this, run_order, next_group, &group] {
auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place_).device; auto dev_id = place_.device;
platform::SetXPUDeviceId(dev_id); platform::SetXPUDeviceId(dev_id);
FusedAllReduceSchedule(run_order, group, next_group); FusedAllReduceSchedule(run_order, group, next_group);
{ {
......
...@@ -87,8 +87,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( ...@@ -87,8 +87,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists(
std::unique_ptr<framework::GarbageCollector> gc; std::unique_ptr<framework::GarbageCollector> gc;
if (platform::is_gpu_place(place)) { if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
gc.reset(new framework::DefaultStreamGarbageCollector( gc.reset(new framework::DefaultStreamGarbageCollector(place, 0));
BOOST_GET_CONST(platform::CUDAPlace, place), 0));
VLOG(10) << "Created GarbageCollector at " << place; VLOG(10) << "Created GarbageCollector at " << place;
#else #else
...@@ -98,8 +97,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( ...@@ -98,8 +97,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists(
#endif #endif
} else if (platform::is_cuda_pinned_place(place)) { } else if (platform::is_cuda_pinned_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
gc.reset(new framework::CUDAPinnedGarbageCollector( gc.reset(new framework::CUDAPinnedGarbageCollector(place, 0));
BOOST_GET_CONST(platform::CUDAPinnedPlace, place), 0));
VLOG(10) << "Created GarbageCollector at " << place; VLOG(10) << "Created GarbageCollector at " << place;
#else #else
...@@ -110,8 +108,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( ...@@ -110,8 +108,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists(
#endif #endif
} else if (platform::is_xpu_place(place)) { } else if (platform::is_xpu_place(place)) {
#if defined(PADDLE_WITH_XPU) #if defined(PADDLE_WITH_XPU)
gc.reset(new framework::XPUGarbageCollector( gc.reset(new framework::XPUGarbageCollector(place, 0));
BOOST_GET_CONST(platform::XPUPlace, place), 0));
VLOG(10) << "Created GarbageCollector at " << place; VLOG(10) << "Created GarbageCollector at " << place;
#else #else
PADDLE_THROW(platform::errors::PermissionDenied( PADDLE_THROW(platform::errors::PermissionDenied(
...@@ -119,14 +116,12 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( ...@@ -119,14 +116,12 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists(
"Please recompile or reinstall Paddle with XPU support.")); "Please recompile or reinstall Paddle with XPU support."));
#endif #endif
} else if (platform::is_cpu_place(place)) { } else if (platform::is_cpu_place(place)) {
gc.reset(new framework::CPUGarbageCollector( gc.reset(new framework::CPUGarbageCollector(place, 0));
BOOST_GET_CONST(platform::CPUPlace, place), 0));
VLOG(10) << "Created GarbageCollector at " << place; VLOG(10) << "Created GarbageCollector at " << place;
} else if (platform::is_npu_place(place)) { } else if (platform::is_npu_place(place)) {
#if defined(PADDLE_WITH_ASCEND_CL) #if defined(PADDLE_WITH_ASCEND_CL)
// TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector. // TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector.
gc.reset(new framework::NPUUnsafeFastGarbageCollector( gc.reset(new framework::NPUUnsafeFastGarbageCollector(place, 0));
BOOST_GET_CONST(platform::NPUPlace, place), 0));
VLOG(10) << "Created GarbageCollector at " << place; VLOG(10) << "Created GarbageCollector at " << place;
#else #else
PADDLE_THROW(platform::errors::PermissionDenied( PADDLE_THROW(platform::errors::PermissionDenied(
...@@ -135,8 +130,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( ...@@ -135,8 +130,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists(
#endif #endif
} else if (platform::is_mlu_place(place)) { } else if (platform::is_mlu_place(place)) {
#if defined(PADDLE_WITH_MLU) #if defined(PADDLE_WITH_MLU)
gc.reset(new framework::MLUDefaultStreamGarbageCollector( gc.reset(new framework::MLUDefaultStreamGarbageCollector(place, 0));
BOOST_GET_CONST(platform::MLUPlace, place), 0));
VLOG(10) << "Created GarbageCollector at " << place; VLOG(10) << "Created GarbageCollector at " << place;
#else #else
PADDLE_THROW(platform::errors::PermissionDenied( PADDLE_THROW(platform::errors::PermissionDenied(
...@@ -197,31 +191,28 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins, ...@@ -197,31 +191,28 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins,
try { try {
if (platform::is_gpu_place(place)) { if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
platform::SetDeviceId(BOOST_GET_CONST(platform::CUDAPlace, place).device); platform::SetDeviceId(place.device);
#else #else
PADDLE_THROW(platform::errors::PreconditionNotMet( PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with GPU if use CUDAPlace.")); "PaddlePaddle should compile with GPU if use CUDAPlace."));
#endif #endif
} else if (platform::is_xpu_place(place)) { } else if (platform::is_xpu_place(place)) {
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
platform::SetXPUDeviceId( platform::SetXPUDeviceId(place.device);
BOOST_GET_CONST(platform::XPUPlace, place).device);
#else #else
PADDLE_THROW(platform::errors::PreconditionNotMet( PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with XPU if use XPUPlace.")); "PaddlePaddle should compile with XPU if use XPUPlace."));
#endif #endif
} else if (platform::is_npu_place(place)) { } else if (platform::is_npu_place(place)) {
#ifdef PADDLE_WITH_ASCEND_CL #ifdef PADDLE_WITH_ASCEND_CL
platform::SetNPUDeviceId( platform::SetNPUDeviceId(place.device);
BOOST_GET_CONST(platform::NPUPlace, place).device);
#else #else
PADDLE_THROW(platform::errors::PreconditionNotMet( PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU if use NPUPlace.")); "PaddlePaddle should compile with NPU if use NPUPlace."));
#endif #endif
} else if (platform::is_mlu_place(place)) { } else if (platform::is_mlu_place(place)) {
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
platform::SetMLUDeviceId( platform::SetMLUDeviceId(place.device);
BOOST_GET_CONST(platform::MLUPlace, place).device);
#else #else
PADDLE_THROW(platform::errors::PreconditionNotMet( PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with MLU if use MLUPlace.")); "PaddlePaddle should compile with MLU if use MLUPlace."));
......
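The Tracer::TraceOp hunk keeps the per-backend #ifdef structure but, once a place predicate matches, feeds place.device straight into that backend's set-device call. A rough standalone sketch of the dispatch, where Place, is_gpu_place, is_xpu_place, and SetDeviceForPlace are illustrative stand-ins and the printed lines only mark where the real driver calls would go:

#include <cstdio>
#include <stdexcept>

struct Place {
  enum class Type { kCPU, kGPU, kXPU, kNPU, kMLU };
  Type type;
  int device;
};

bool is_gpu_place(const Place& p) { return p.type == Place::Type::kGPU; }
bool is_xpu_place(const Place& p) { return p.type == Place::Type::kXPU; }

// Sketch of the TraceOp pattern: pick the backend from the place predicate,
// then pass place.device to that backend's set-device call.
void SetDeviceForPlace(const Place& place) {
  if (is_gpu_place(place)) {
    std::printf("cudaSetDevice(%d)\n", place.device);   // stand-in for the real call
  } else if (is_xpu_place(place)) {
    std::printf("xpu_set_device(%d)\n", place.device);  // stand-in for the real call
  } else if (place.type == Place::Type::kCPU) {
    // nothing to select on CPU
  } else {
    throw std::runtime_error("backend not compiled in");
  }
}

int main() {
  SetDeviceForPlace(Place{Place::Type::kGPU, 2});
  return 0;
}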
...@@ -127,7 +127,7 @@ bool PaddleTensorToLoDTensor(const PaddleTensor &pt, framework::LoDTensor *t, ...@@ -127,7 +127,7 @@ bool PaddleTensorToLoDTensor(const PaddleTensor &pt, framework::LoDTensor *t,
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto *dev_ctx = auto *dev_ctx =
static_cast<const platform::CUDADeviceContext *>(pool.Get(place)); static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place); auto dst_gpu_place = place;
memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr), memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
platform::CPUPlace(), pt.data.data(), pt.data.length(), platform::CPUPlace(), pt.data.data(), pt.data.length(),
dev_ctx->stream()); dev_ctx->stream());
...@@ -137,7 +137,7 @@ bool PaddleTensorToLoDTensor(const PaddleTensor &pt, framework::LoDTensor *t, ...@@ -137,7 +137,7 @@ bool PaddleTensorToLoDTensor(const PaddleTensor &pt, framework::LoDTensor *t,
#endif #endif
} else if (platform::is_xpu_place(place)) { } else if (platform::is_xpu_place(place)) {
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
auto dst_xpu_place = BOOST_GET_CONST(platform::XPUPlace, place); auto dst_xpu_place = place;
memory::Copy(dst_xpu_place, static_cast<void *>(input_ptr), memory::Copy(dst_xpu_place, static_cast<void *>(input_ptr),
platform::CPUPlace(), pt.data.data(), pt.data.length()); platform::CPUPlace(), pt.data.data(), pt.data.length());
#else #else
...@@ -954,14 +954,14 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor( ...@@ -954,14 +954,14 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
// model. // model.
res->SetPlace(PaddlePlace::kCPU); res->SetPlace(PaddlePlace::kCPU);
} else { } else {
auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_); auto xpu_place = place_;
res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId()); res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId());
} }
} else if (platform::is_npu_place(place_)) { } else if (platform::is_npu_place(place_)) {
auto npu_place = BOOST_GET_CONST(platform::NPUPlace, place_); auto npu_place = place_;
res->SetPlace(PaddlePlace::kNPU, npu_place.GetDeviceId()); res->SetPlace(PaddlePlace::kNPU, npu_place.GetDeviceId());
} else { } else {
auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place_); auto gpu_place = place_;
res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
} }
return res; return res;
...@@ -993,14 +993,14 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor( ...@@ -993,14 +993,14 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
// model. // model.
res->SetPlace(PaddlePlace::kCPU); res->SetPlace(PaddlePlace::kCPU);
} else { } else {
auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_); auto xpu_place = place_;
res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId()); res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId());
} }
} else if (platform::is_npu_place(place_)) { } else if (platform::is_npu_place(place_)) {
auto npu_place = BOOST_GET_CONST(platform::NPUPlace, place_); auto npu_place = place_;
res->SetPlace(PaddlePlace::kNPU, npu_place.GetDeviceId()); res->SetPlace(PaddlePlace::kNPU, npu_place.GetDeviceId());
} else { } else {
auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place_); auto gpu_place = place_;
res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
} }
return res; return res;
...@@ -1050,7 +1050,7 @@ bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) { ...@@ -1050,7 +1050,7 @@ bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) {
if (stream != nullptr) { if (stream != nullptr) {
paddle::platform::DeviceContextPool &pool = paddle::platform::DeviceContextPool &pool =
paddle::platform::DeviceContextPool::Instance(); paddle::platform::DeviceContextPool::Instance();
auto gpu_place = BOOST_GET_CONST(paddle::platform::CUDAPlace, place_); auto gpu_place = place_;
auto *dev_ctx = reinterpret_cast<paddle::platform::CUDADeviceContext *>( auto *dev_ctx = reinterpret_cast<paddle::platform::CUDADeviceContext *>(
pool.Get(gpu_place)); pool.Get(gpu_place));
dev_ctx->SetThreadLocalStream(stream); dev_ctx->SetThreadLocalStream(stream);
...@@ -1065,7 +1065,7 @@ void AnalysisPredictor::CollectShapeRangeInfo() { ...@@ -1065,7 +1065,7 @@ void AnalysisPredictor::CollectShapeRangeInfo() {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
paddle::platform::DeviceContextPool &pool = paddle::platform::DeviceContextPool &pool =
paddle::platform::DeviceContextPool::Instance(); paddle::platform::DeviceContextPool::Instance();
auto gpu_place = BOOST_GET_CONST(paddle::platform::CUDAPlace, place_); auto gpu_place = place_;
auto *dev_ctx = static_cast<const paddle::platform::CUDADeviceContext *>( auto *dev_ctx = static_cast<const paddle::platform::CUDADeviceContext *>(
pool.Get(gpu_place)); pool.Get(gpu_place));
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
......
...@@ -243,7 +243,7 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs, ...@@ -243,7 +243,7 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
platform::DeviceContextPool::Instance(); platform::DeviceContextPool::Instance();
auto *dev_ctx = auto *dev_ctx =
static_cast<const platform::CUDADeviceContext *>(pool.Get(place_)); static_cast<const platform::CUDADeviceContext *>(pool.Get(place_));
auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place_); auto dst_gpu_place = place_;
memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr), memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
platform::CPUPlace(), inputs[i].data.data(), platform::CPUPlace(), inputs[i].data.data(),
inputs[i].data.length(), dev_ctx->stream()); inputs[i].data.length(), dev_ctx->stream());
...@@ -253,7 +253,7 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs, ...@@ -253,7 +253,7 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
#endif #endif
} else if (platform::is_xpu_place(place_)) { } else if (platform::is_xpu_place(place_)) {
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
auto dst_xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_); auto dst_xpu_place = place_;
memory::Copy(dst_xpu_place, static_cast<void *>(input_ptr), memory::Copy(dst_xpu_place, static_cast<void *>(input_ptr),
platform::CPUPlace(), inputs[i].data.data(), platform::CPUPlace(), inputs[i].data.data(),
inputs[i].data.length()); inputs[i].data.length());
...@@ -267,7 +267,7 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs, ...@@ -267,7 +267,7 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
platform::DeviceContextPool::Instance(); platform::DeviceContextPool::Instance();
auto *dev_ctx = auto *dev_ctx =
static_cast<const platform::NPUDeviceContext *>(pool.Get(place_)); static_cast<const platform::NPUDeviceContext *>(pool.Get(place_));
auto dst_npu_place = BOOST_GET_CONST(platform::NPUPlace, place_); auto dst_npu_place = place_;
memory::Copy(dst_npu_place, static_cast<void *>(input_ptr), memory::Copy(dst_npu_place, static_cast<void *>(input_ptr),
platform::CPUPlace(), inputs[i].data.data(), platform::CPUPlace(), inputs[i].data.data(),
inputs[i].data.length(), dev_ctx->stream()); inputs[i].data.length(), dev_ctx->stream());
......
...@@ -253,7 +253,7 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, ...@@ -253,7 +253,7 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb,
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
paddle::platform::DeviceContextPool &pool = paddle::platform::DeviceContextPool &pool =
paddle::platform::DeviceContextPool::Instance(); paddle::platform::DeviceContextPool::Instance();
auto gpu_place = BOOST_GET_CONST(paddle::platform::CUDAPlace, t_place); auto gpu_place = t_place;
auto *dev_ctx = static_cast<const paddle::platform::CUDADeviceContext *>( auto *dev_ctx = static_cast<const paddle::platform::CUDADeviceContext *>(
pool.Get(gpu_place)); pool.Get(gpu_place));
paddle::memory::Copy(paddle::platform::CPUPlace(), paddle::memory::Copy(paddle::platform::CPUPlace(),
...@@ -280,7 +280,7 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, ...@@ -280,7 +280,7 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb,
#endif #endif
} else if (place_ == PlaceType::kXPU) { } else if (place_ == PlaceType::kXPU) {
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
auto xpu_place = BOOST_GET_CONST(paddle::platform::XPUPlace, t_place); auto xpu_place = t_place;
paddle::memory::Copy(paddle::platform::CPUPlace(), paddle::memory::Copy(paddle::platform::CPUPlace(),
static_cast<void *>(data), xpu_place, t_data, static_cast<void *>(data), xpu_place, t_data,
ele_num * sizeof(T)); ele_num * sizeof(T));
...@@ -293,7 +293,7 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, ...@@ -293,7 +293,7 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb,
#ifdef PADDLE_WITH_ASCEND_CL #ifdef PADDLE_WITH_ASCEND_CL
paddle::platform::DeviceContextPool &pool = paddle::platform::DeviceContextPool &pool =
paddle::platform::DeviceContextPool::Instance(); paddle::platform::DeviceContextPool::Instance();
auto npu_place = BOOST_GET_CONST(paddle::platform::NPUPlace, t_place); auto npu_place = t_place;
auto *dev_ctx = static_cast<const paddle::platform::NPUDeviceContext *>( auto *dev_ctx = static_cast<const paddle::platform::NPUDeviceContext *>(
pool.Get(npu_place)); pool.Get(npu_place));
paddle::memory::Copy(paddle::platform::CPUPlace(), paddle::memory::Copy(paddle::platform::CPUPlace(),
......
...@@ -134,7 +134,7 @@ void MemoryCopyAsync(const platform::Place& dst_place, void* dst_data, ...@@ -134,7 +134,7 @@ void MemoryCopyAsync(const platform::Place& dst_place, void* dst_data,
"Lite::MemoryCopy CPU->GPU is not yet implemented.")); "Lite::MemoryCopy CPU->GPU is not yet implemented."));
} else if (platform::is_gpu_place(dst_place) && } else if (platform::is_gpu_place(dst_place) &&
platform::is_gpu_place(src_place)) { platform::is_gpu_place(src_place)) {
auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place); auto gpu_place = src_place;
memory::Copy( memory::Copy(
gpu_place, dst_data, gpu_place, src_data, size, gpu_place, dst_data, gpu_place, src_data, size,
static_cast<const platform::CUDADeviceContext&>(ctx).stream()); static_cast<const platform::CUDADeviceContext&>(ctx).stream());
......
...@@ -813,8 +813,7 @@ const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator( ...@@ -813,8 +813,7 @@ const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
} }
#endif #endif
platform::CUDAPlace cuda_place = platform::CUDAPlace cuda_place(place.GetDeviceId());
BOOST_GET_CONST(platform::CUDAPlace, place);
return m_->GetAllocator(cuda_place, m_->GetDefaultStream(cuda_place)); return m_->GetAllocator(cuda_place, m_->GetDefaultStream(cuda_place));
} }
#endif #endif
...@@ -838,8 +837,7 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, ...@@ -838,8 +837,7 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
} }
#endif #endif
platform::CUDAPlace cuda_place = platform::CUDAPlace cuda_place(place.GetDeviceId());
BOOST_GET_CONST(platform::CUDAPlace, place);
return Alloc(cuda_place, size, m_->GetDefaultStream(cuda_place)); return Alloc(cuda_place, size, m_->GetDefaultStream(cuda_place));
} }
#endif #endif
...@@ -859,8 +857,7 @@ uint64_t AllocatorFacade::Release(const platform::Place& place) { ...@@ -859,8 +857,7 @@ uint64_t AllocatorFacade::Release(const platform::Place& place) {
} }
#endif #endif
platform::CUDAPlace cuda_place = platform::CUDAPlace cuda_place(place.GetDeviceId());
BOOST_GET_CONST(platform::CUDAPlace, place);
return Release(cuda_place, m_->GetDefaultStream(cuda_place)); return Release(cuda_place, m_->GetDefaultStream(cuda_place));
} }
#endif #endif
...@@ -935,7 +932,7 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size, ...@@ -935,7 +932,7 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
} }
#endif #endif
platform::CUDAPlace p = BOOST_GET_CONST(platform::CUDAPlace, place); platform::CUDAPlace p(place.GetDeviceId());
if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) { if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) {
return m_->GetAllocator(p, stream, /* create_if_not_found = */ true) return m_->GetAllocator(p, stream, /* create_if_not_found = */ true)
->Allocate(size); ->Allocate(size);
......
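In the AllocatorFacade hunks, the generic place is converted back into a concrete CUDAPlace through its device id (platform::CUDAPlace cuda_place(place.GetDeviceId())) wherever an internal API is still typed on the concrete place. A small sketch of that conversion, with Place, CUDAPlace, and ReleaseOnStream as hypothetical stand-ins:

#include <cstdio>

struct Place {
  enum class Type { kCPU, kGPU };
  Type type;
  int device;
  int GetDeviceId() const { return device; }
};

// Concrete place type that some internal APIs still expect.
struct CUDAPlace {
  explicit CUDAPlace(int dev) : device(dev) {}
  int device;
};

// An API still keyed on the concrete CUDAPlace.
void ReleaseOnStream(const CUDAPlace& p) {
  std::printf("release cached blocks on GPU %d\n", p.device);
}

int main() {
  Place place{Place::Type::kGPU, 0};
  // The diff's pattern: rebuild the concrete place from the generic one
  // instead of BOOST_GET_CONST-ing it out of a variant.
  CUDAPlace cuda_place(place.GetDeviceId());
  ReleaseOnStream(cuda_place);
  return 0;
}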
...@@ -19,12 +19,7 @@ ...@@ -19,12 +19,7 @@
#include <map> #include <map>
#include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform//place.h"
namespace paddle {
namespace platform {
class Place;
} // namespace platform
} // namespace paddle
namespace paddle { namespace paddle {
namespace memory { namespace memory {
......
...@@ -34,7 +34,7 @@ namespace allocation { ...@@ -34,7 +34,7 @@ namespace allocation {
bool CUDAAllocator::IsAllocThreadSafe() const { return true; } bool CUDAAllocator::IsAllocThreadSafe() const { return true; }
void CUDAAllocator::FreeImpl(pten::Allocation* allocation) { void CUDAAllocator::FreeImpl(pten::Allocation* allocation) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
BOOST_GET_CONST(platform::CUDAPlace, allocation->place()), place_, allocation->place(), place_,
platform::errors::PermissionDenied( platform::errors::PermissionDenied(
"GPU memory is freed in incorrect device. This may be a bug")); "GPU memory is freed in incorrect device. This may be a bug"));
platform::RecordedGpuFree(allocation->ptr(), allocation->size(), platform::RecordedGpuFree(allocation->ptr(), allocation->size(),
......
...@@ -144,8 +144,8 @@ class CUDADeviceContextAllocatorPool { ...@@ -144,8 +144,8 @@ class CUDADeviceContextAllocatorPool {
} }
AllocationPtr Alloc(const platform::CUDADeviceContext &dev_ctx, size_t size) { AllocationPtr Alloc(const platform::CUDADeviceContext &dev_ctx, size_t size) {
auto iter = allocators_.find( auto iter =
BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace())); allocators_.find(platform::CUDAPlace(dev_ctx.GetPlace().GetDeviceId()));
PADDLE_ENFORCE_NE( PADDLE_ENFORCE_NE(
iter, allocators_.end(), iter, allocators_.end(),
platform::errors::NotFound("No allocator found for CUDAPlace.")); platform::errors::NotFound("No allocator found for CUDAPlace."));
......
...@@ -103,7 +103,7 @@ bool CUDAVirtualMemAllocator::IsAllocThreadSafe() const { return false; } ...@@ -103,7 +103,7 @@ bool CUDAVirtualMemAllocator::IsAllocThreadSafe() const { return false; }
void CUDAVirtualMemAllocator::FreeImpl(pten::Allocation* allocation) { void CUDAVirtualMemAllocator::FreeImpl(pten::Allocation* allocation) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
BOOST_GET_CONST(platform::CUDAPlace, allocation->place()), place_, allocation->place(), place_,
platform::errors::PermissionDenied( platform::errors::PermissionDenied(
"GPU memory is freed in incorrect device. This may be a bug")); "GPU memory is freed in incorrect device. This may be a bug"));
......
...@@ -26,6 +26,7 @@ ...@@ -26,6 +26,7 @@
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
#include "paddle/fluid/string/split.h" #include "paddle/fluid/string/split.h"
#include "paddle/pten/common/place.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/cuda_device_guard.h"
#endif #endif
...@@ -791,7 +792,7 @@ size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const { ...@@ -791,7 +792,7 @@ size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const {
namespace allocation { namespace allocation {
pten::Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) { pten::Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) {
void *ptr = boost::apply_visitor(legacy::AllocVisitor(size), place_); void *ptr = paddle::platform::VisitPlace(place_, legacy::AllocVisitor(size));
auto *tmp_alloc = new Allocation(ptr, size, place_); auto *tmp_alloc = new Allocation(ptr, size, place_);
platform::MemEvenRecorder::Instance().PushMemRecord( platform::MemEvenRecorder::Instance().PushMemRecord(
static_cast<void *>(tmp_alloc), place_, size); static_cast<void *>(tmp_alloc), place_, size);
...@@ -799,16 +800,16 @@ pten::Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) { ...@@ -799,16 +800,16 @@ pten::Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) {
} }
void NaiveBestFitAllocator::FreeImpl(pten::Allocation *allocation) { void NaiveBestFitAllocator::FreeImpl(pten::Allocation *allocation) {
boost::apply_visitor( paddle::platform::VisitPlace(
legacy::FreeVisitor(allocation->ptr(), allocation->size()), allocation->place(),
allocation->place()); legacy::FreeVisitor(allocation->ptr(), allocation->size()));
platform::MemEvenRecorder::Instance().PopMemRecord( platform::MemEvenRecorder::Instance().PopMemRecord(
static_cast<void *>(allocation), place_); static_cast<void *>(allocation), place_);
delete allocation; delete allocation;
} }
uint64_t NaiveBestFitAllocator::ReleaseImpl(const platform::Place &place) { uint64_t NaiveBestFitAllocator::ReleaseImpl(const platform::Place &place) {
return boost::apply_visitor(legacy::ReleaseVisitor(), place); return paddle::platform::VisitPlace(place, legacy::ReleaseVisitor());
} }
} // namespace allocation } // namespace allocation
......
...@@ -24,7 +24,7 @@ namespace allocation { ...@@ -24,7 +24,7 @@ namespace allocation {
bool NPUAllocator::IsAllocThreadSafe() const { return true; } bool NPUAllocator::IsAllocThreadSafe() const { return true; }
void NPUAllocator::FreeImpl(pten::Allocation* allocation) { void NPUAllocator::FreeImpl(pten::Allocation* allocation) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
BOOST_GET_CONST(platform::NPUPlace, allocation->place()), place_, allocation->place(), place_,
platform::errors::PermissionDenied( platform::errors::PermissionDenied(
"NPU memory is freed in incorrect device. This may be a bug")); "NPU memory is freed in incorrect device. This may be a bug"));
platform::RecordedNPUFree(allocation->ptr(), allocation->size(), platform::RecordedNPUFree(allocation->ptr(), allocation->size(),
......
...@@ -164,8 +164,7 @@ void StreamSafeCUDAAllocator::FreeImpl(pten::Allocation* allocation) { ...@@ -164,8 +164,7 @@ void StreamSafeCUDAAllocator::FreeImpl(pten::Allocation* allocation) {
uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) { uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) {
std::lock_guard<SpinLock> lock_guard(allocator_map_lock_); std::lock_guard<SpinLock> lock_guard(allocator_map_lock_);
std::vector<StreamSafeCUDAAllocator*>& allocators = std::vector<StreamSafeCUDAAllocator*>& allocators = allocator_map_[place];
allocator_map_[BOOST_GET_CONST(platform::CUDAPlace, place)];
uint64_t released_size = 0; uint64_t released_size = 0;
for (StreamSafeCUDAAllocator* allocator : allocators) { for (StreamSafeCUDAAllocator* allocator : allocators) {
released_size += allocator->ProcessUnfreedAllocationsWithRelease(); released_size += allocator->ProcessUnfreedAllocationsWithRelease();
...@@ -192,7 +191,7 @@ uint64_t StreamSafeCUDAAllocator::ProcessUnfreedAllocationsWithRelease() { ...@@ -192,7 +191,7 @@ uint64_t StreamSafeCUDAAllocator::ProcessUnfreedAllocationsWithRelease() {
return underlying_allocator_->Release(place_); return underlying_allocator_->Release(place_);
} }
std::map<platform::CUDAPlace, std::vector<StreamSafeCUDAAllocator*>> std::map<platform::Place, std::vector<StreamSafeCUDAAllocator*>>
StreamSafeCUDAAllocator::allocator_map_; StreamSafeCUDAAllocator::allocator_map_;
SpinLock StreamSafeCUDAAllocator::allocator_map_lock_; SpinLock StreamSafeCUDAAllocator::allocator_map_lock_;
......
...@@ -65,7 +65,7 @@ class StreamSafeCUDAAllocator : public Allocator { ...@@ -65,7 +65,7 @@ class StreamSafeCUDAAllocator : public Allocator {
void ProcessUnfreedAllocations(); void ProcessUnfreedAllocations();
uint64_t ProcessUnfreedAllocationsWithRelease(); uint64_t ProcessUnfreedAllocationsWithRelease();
static std::map<platform::CUDAPlace, std::vector<StreamSafeCUDAAllocator *>> static std::map<platform::Place, std::vector<StreamSafeCUDAAllocator *>>
allocator_map_; allocator_map_;
static SpinLock allocator_map_lock_; static SpinLock allocator_map_lock_;
......
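The StreamSafeCUDAAllocator hunks re-key allocator_map_ from platform::CUDAPlace to the generic platform::Place, which only works because the unified place can serve as a std::map key. A sketch of the idea, assuming a lexicographic (type, device) ordering; the stand-in Place, operator<, and Allocator here are illustrative, not Paddle's actual definitions:

#include <cstdio>
#include <map>
#include <tuple>
#include <vector>

struct Place {
  enum class Type { kCPU, kGPU, kXPU };
  Type type;
  int device;
};

// Strict weak ordering so Place can be a std::map key.
bool operator<(const Place& a, const Place& b) {
  return std::tie(a.type, a.device) < std::tie(b.type, b.device);
}

struct Allocator {};  // stand-in for StreamSafeCUDAAllocator

int main() {
  // Mirrors allocator_map_ being re-keyed on the generic Place.
  std::map<Place, std::vector<Allocator*>> allocator_map;
  allocator_map[Place{Place::Type::kGPU, 0}].push_back(nullptr);
  allocator_map[Place{Place::Type::kGPU, 1}].push_back(nullptr);
  std::printf("tracked places: %zu\n", allocator_map.size());
  return 0;
}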
...@@ -23,8 +23,7 @@ ThreadLocalAllocatorImpl::ThreadLocalAllocatorImpl(const platform::Place& p) ...@@ -23,8 +23,7 @@ ThreadLocalAllocatorImpl::ThreadLocalAllocatorImpl(const platform::Place& p)
if (platform::is_gpu_place(place_)) { if (platform::is_gpu_place(place_)) {
buddy_allocator_.reset(new memory::detail::BuddyAllocator( buddy_allocator_.reset(new memory::detail::BuddyAllocator(
std::unique_ptr<memory::detail::SystemAllocator>( std::unique_ptr<memory::detail::SystemAllocator>(
new memory::detail::GPUAllocator( new memory::detail::GPUAllocator(place_.device)),
BOOST_GET_CONST(platform::CUDAPlace, place_).device)),
platform::GpuMinChunkSize(), platform::GpuMaxChunkSize())); platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()));
} else { } else {
PADDLE_THROW(platform::errors::Unavailable( PADDLE_THROW(platform::errors::Unavailable(
......
This diff has been collapsed.
...@@ -16,12 +16,6 @@ ...@@ -16,12 +16,6 @@
#include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
namespace paddle {
namespace platform {
struct CUDAPlace;
} // namespace platform
} // namespace paddle
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using framework::Tensor; using framework::Tensor;
......
...@@ -25,8 +25,7 @@ struct GetTensorValue<platform::CUDADeviceContext, T> { ...@@ -25,8 +25,7 @@ struct GetTensorValue<platform::CUDADeviceContext, T> {
const framework::Tensor& tensor) const { const framework::Tensor& tensor) const {
const T* data = tensor.data<T>(); const T* data = tensor.data<T>();
T value; T value;
const auto gpu_place = const auto gpu_place = dev_ctx.GetPlace();
BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace());
memory::Copy(platform::CPUPlace(), &value, gpu_place, data, sizeof(T), memory::Copy(platform::CPUPlace(), &value, gpu_place, data, sizeof(T),
dev_ctx.stream()); dev_ctx.stream());
return value; return value;
......
...@@ -117,9 +117,8 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel<T> { ...@@ -117,9 +117,8 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel<T> {
h_starts[i] = h_starts[i - 1] + xs[i - 1]->numel(); h_starts[i] = h_starts[i - 1] + xs[i - 1]->numel();
} }
int64_t total_num = h_starts[xs_size]; int64_t total_num = h_starts[xs_size];
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), memory::Copy(dev_ctx.GetPlace(), d_starts, cpu_place, h_starts,
d_starts, cpu_place, h_starts, (xs_size + 1) * sizeof(int64_t), (xs_size + 1) * sizeof(int64_t), dev_ctx.stream());
dev_ctx.stream());
// copy each tensor's data address to device // copy each tensor's data address to device
auto h_mem = memory::Alloc(cpu_place, 2 * xs_size * sizeof(T*)); auto h_mem = memory::Alloc(cpu_place, 2 * xs_size * sizeof(T*));
...@@ -134,8 +133,8 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel<T> { ...@@ -134,8 +133,8 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel<T> {
h_xs[i] = xs[i]->data<T>(); h_xs[i] = xs[i]->data<T>();
h_outs[i] = outs[i]->mutable_data<T>(dev_ctx.GetPlace()); h_outs[i] = outs[i]->mutable_data<T>(dev_ctx.GetPlace());
} }
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), d_xs, memory::Copy(dev_ctx.GetPlace(), d_xs, cpu_place, h_xs,
cpu_place, h_xs, 2 * xs_size * sizeof(T*), dev_ctx.stream()); 2 * xs_size * sizeof(T*), dev_ctx.stream());
// Launch Kernel // Launch Kernel
int threads_per_block = std::min(static_cast<int64_t>(1024), total_num); int threads_per_block = std::min(static_cast<int64_t>(1024), total_num);
......
...@@ -41,8 +41,8 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel<T> { ...@@ -41,8 +41,8 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel<T> {
MPDType cpu_scale_data; MPDType cpu_scale_data;
if (platform::is_xpu_place(scale->place())) { if (platform::is_xpu_place(scale->place())) {
memory::Copy(platform::CPUPlace(), static_cast<void*>(&cpu_scale_data), memory::Copy(platform::CPUPlace(), static_cast<void*>(&cpu_scale_data),
BOOST_GET_CONST(platform::XPUPlace, scale->place()), scale->place(), static_cast<const void*>(scale_data),
static_cast<const void*>(scale_data), sizeof(MPDType)); sizeof(MPDType));
} else { } else {
cpu_scale_data = (*scale_data); cpu_scale_data = (*scale_data);
...@@ -87,8 +87,7 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel<T> { ...@@ -87,8 +87,7 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel<T> {
dev_ctx.Wait(); dev_ctx.Wait();
} }
memory::Copy(platform::CPUPlace(), &cpu_found_inf_data, memory::Copy(platform::CPUPlace(), &cpu_found_inf_data,
BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()), dev_ctx.GetPlace(), found_inf_data, sizeof(bool));
found_inf_data, sizeof(bool));
} }
if (cpu_found_inf_data) { if (cpu_found_inf_data) {
...@@ -142,9 +141,8 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel<T> { ...@@ -142,9 +141,8 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel<T> {
if (dev_ctx.x_context()->xpu_stream) { if (dev_ctx.x_context()->xpu_stream) {
dev_ctx.Wait(); dev_ctx.Wait();
} }
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()), memory::Copy(dev_ctx.GetPlace(), found_inf_data, platform::CPUPlace(),
found_inf_data, platform::CPUPlace(), &cpu_found_inf_data, &cpu_found_inf_data, sizeof(bool));
sizeof(bool));
} }
}; };
......
...@@ -114,9 +114,8 @@ class LazyZeros<platform::CUDADeviceContext, T> { ...@@ -114,9 +114,8 @@ class LazyZeros<platform::CUDADeviceContext, T> {
for (int i = 0; i < xs_size; i++) { for (int i = 0; i < xs_size; i++) {
h_starts[i + 1] = h_starts[i] + outs[i]->numel(); h_starts[i + 1] = h_starts[i] + outs[i]->numel();
} }
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), memory::Copy(dev_ctx.GetPlace(), d_starts, cpu_place, h_starts,
d_starts, cpu_place, h_starts, (xs_size + 1) * sizeof(int64_t), (xs_size + 1) * sizeof(int64_t), dev_ctx.stream());
dev_ctx.stream());
// copy each tensor of "outs" data address array to device // copy each tensor of "outs" data address array to device
auto h_out_addrs_mem = memory::Alloc(cpu_place, xs_size * sizeof(T*)); auto h_out_addrs_mem = memory::Alloc(cpu_place, xs_size * sizeof(T*));
...@@ -128,9 +127,8 @@ class LazyZeros<platform::CUDADeviceContext, T> { ...@@ -128,9 +127,8 @@ class LazyZeros<platform::CUDADeviceContext, T> {
for (size_t i = 0; i < xs_size; ++i) { for (size_t i = 0; i < xs_size; ++i) {
h_out_addrs[i] = outs[i]->mutable_data<T>(dev_ctx.GetPlace()); h_out_addrs[i] = outs[i]->mutable_data<T>(dev_ctx.GetPlace());
} }
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), memory::Copy(dev_ctx.GetPlace(), d_out_addrs, cpu_place, h_out_addrs,
d_out_addrs, cpu_place, h_out_addrs, xs_size * sizeof(T*), xs_size * sizeof(T*), dev_ctx.stream());
dev_ctx.stream());
// launch cuda kernel // launch cuda kernel
int64_t total_num = h_starts[xs_size]; int64_t total_num = h_starts[xs_size];
......
...@@ -187,9 +187,7 @@ class LazyZerosNPU { ...@@ -187,9 +187,7 @@ class LazyZerosNPU {
framework::TensorCopy(*x, place, dev_ctx, out); framework::TensorCopy(*x, place, dev_ctx, out);
} else if (zero_ptr != dst_ptr) { } else if (zero_ptr != dst_ptr) {
auto size = out->numel() * framework::SizeOfType(out->type()); auto size = out->numel() * framework::SizeOfType(out->type());
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, place), dst_ptr, memory::Copy(place, dst_ptr, place, zero_ptr, size, stream);
BOOST_GET_CONST(platform::NPUPlace, place), zero_ptr, size,
stream);
} }
} }
} }
......
...@@ -43,8 +43,7 @@ class UpdateLossScalingXPUKernel : public framework::OpKernel<T> { ...@@ -43,8 +43,7 @@ class UpdateLossScalingXPUKernel : public framework::OpKernel<T> {
bool cpu_found_inf_data = false; bool cpu_found_inf_data = false;
if (platform::is_xpu_place(found_inf->place())) { if (platform::is_xpu_place(found_inf->place())) {
memory::Copy(platform::CPUPlace(), memory::Copy(platform::CPUPlace(),
static_cast<void*>(&cpu_found_inf_data), static_cast<void*>(&cpu_found_inf_data), found_inf->place(),
BOOST_GET_CONST(platform::XPUPlace, found_inf->place()),
static_cast<const void*>(found_inf_data), sizeof(bool)); static_cast<const void*>(found_inf_data), sizeof(bool));
} else { } else {
cpu_found_inf_data = (*found_inf_data); cpu_found_inf_data = (*found_inf_data);
...@@ -97,16 +96,16 @@ class UpdateLossScalingXPUKernel : public framework::OpKernel<T> { ...@@ -97,16 +96,16 @@ class UpdateLossScalingXPUKernel : public framework::OpKernel<T> {
MPDType cpu_pre_loss_scaling_data; MPDType cpu_pre_loss_scaling_data;
if (platform::is_xpu_place(bad_in->place())) { if (platform::is_xpu_place(bad_in->place())) {
memory::Copy(platform::CPUPlace(), static_cast<void*>(&cpu_bad_in_data), memory::Copy(platform::CPUPlace(), static_cast<void*>(&cpu_bad_in_data),
BOOST_GET_CONST(platform::XPUPlace, bad_in->place()), bad_in->place(), static_cast<const void*>(bad_in_data),
static_cast<const void*>(bad_in_data), sizeof(int)); sizeof(int));
} else { } else {
cpu_bad_in_data = (*bad_in_data); cpu_bad_in_data = (*bad_in_data);
} }
if (platform::is_xpu_place(good_in->place())) { if (platform::is_xpu_place(good_in->place())) {
memory::Copy(platform::CPUPlace(), static_cast<void*>(&cpu_good_in_data), memory::Copy(platform::CPUPlace(), static_cast<void*>(&cpu_good_in_data),
BOOST_GET_CONST(platform::XPUPlace, good_in->place()), good_in->place(), static_cast<const void*>(good_in_data),
static_cast<const void*>(good_in_data), sizeof(int)); sizeof(int));
} else { } else {
cpu_good_in_data = (*good_in_data); cpu_good_in_data = (*good_in_data);
} }
...@@ -114,7 +113,7 @@ class UpdateLossScalingXPUKernel : public framework::OpKernel<T> { ...@@ -114,7 +113,7 @@ class UpdateLossScalingXPUKernel : public framework::OpKernel<T> {
if (platform::is_xpu_place(pre_loss_scaling->place())) { if (platform::is_xpu_place(pre_loss_scaling->place())) {
memory::Copy( memory::Copy(
platform::CPUPlace(), static_cast<void*>(&cpu_pre_loss_scaling_data), platform::CPUPlace(), static_cast<void*>(&cpu_pre_loss_scaling_data),
BOOST_GET_CONST(platform::XPUPlace, pre_loss_scaling->place()), pre_loss_scaling->place(),
static_cast<const void*>(pre_loss_scaling_data), sizeof(MPDType)); static_cast<const void*>(pre_loss_scaling_data), sizeof(MPDType));
} else { } else {
cpu_pre_loss_scaling_data = (*pre_loss_scaling_data); cpu_pre_loss_scaling_data = (*pre_loss_scaling_data);
...@@ -146,15 +145,13 @@ class UpdateLossScalingXPUKernel : public framework::OpKernel<T> { ...@@ -146,15 +145,13 @@ class UpdateLossScalingXPUKernel : public framework::OpKernel<T> {
} }
} }
// copy to device // copy to device
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()), memory::Copy(dev_ctx.GetPlace(), bad_out_data, platform::CPUPlace(),
bad_out_data, platform::CPUPlace(), &cpu_bad_out_data, &cpu_bad_out_data, sizeof(int));
sizeof(int)); memory::Copy(dev_ctx.GetPlace(), good_out_data, platform::CPUPlace(),
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()), &cpu_good_out_data, sizeof(int));
good_out_data, platform::CPUPlace(), &cpu_good_out_data, memory::Copy(dev_ctx.GetPlace(), updated_loss_scaling_data,
sizeof(int)); platform::CPUPlace(), &cpu_updated_loss_scaling_data,
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()), sizeof(MPDType));
updated_loss_scaling_data, platform::CPUPlace(),
&cpu_updated_loss_scaling_data, sizeof(MPDType));
} }
}; };
......
...@@ -25,8 +25,6 @@ namespace imperative { ...@@ -25,8 +25,6 @@ namespace imperative {
class OpBase; class OpBase;
} // namespace imperative } // namespace imperative
namespace platform { namespace platform {
struct CPUPlace;
struct CUDAPlace;
struct float16; struct float16;
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
......
...@@ -27,8 +27,6 @@ namespace imperative { ...@@ -27,8 +27,6 @@ namespace imperative {
class OpBase; class OpBase;
} // namespace imperative } // namespace imperative
namespace platform { namespace platform {
struct CPUPlace;
struct CUDAPlace;
struct float16; struct float16;
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
......
@@ -26,8 +26,6 @@ namespace imperative {
 class OpBase;
 }  // namespace imperative
 namespace platform {
-struct CPUPlace;
-struct CUDAPlace;
 struct float16;
 }  // namespace platform
 }  // namespace paddle
......
@@ -26,9 +26,6 @@ class EmptyGradOpMaker;
 namespace imperative {
 class OpBase;
 }  // namespace imperative
-namespace platform {
-struct CPUPlace;
-}  // namespace platform
 }  // namespace paddle
 namespace paddle {
......
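The header hunks above all delete forward declarations of the old per-backend place structs (struct CPUPlace;, struct CUDAPlace;) from namespace platform, leaving only struct float16;. If the platform place names are now aliases of a single place class rather than independently defined structs (an assumption consistent with the rest of this diff), those forward declarations no longer match any definition and would conflict with the aliases. A hedged sketch of that conflict; the class names and hierarchy below are assumptions, not the real headers.

// Sketch only: assumed shape of the new place types.
namespace pten {
class Place {
 public:
  // backend tag + device id would live here
};
class CPUPlace : public Place {};
class GPUPlace : public Place {};
}  // namespace pten

namespace paddle {
namespace platform {
// If these names are now aliases of one class hierarchy...
using Place = pten::Place;
using CPUPlace = pten::CPUPlace;
using CUDAPlace = pten::GPUPlace;
// ...then the old forward declarations would clash with the aliases:
// struct CPUPlace;   // error: conflicts with the using-alias above
// struct CUDAPlace;  // error: redeclared as a different kind of entity
}  // namespace platform
}  // namespace paddle

int main() {
  paddle::platform::CUDAPlace gpu;  // resolves to pten::GPUPlace in this sketch
  (void)gpu;
  return 0;
}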
@@ -25,8 +25,7 @@ void GetAccumulators<paddle::platform::CUDADeviceContext>(
   auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
   auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
   auto stream = ctx.cuda_device_context().stream();
-  auto cuda_place =
-      BOOST_GET_CONST(platform::CUDAPlace, in_old_num_accumulates->place());
+  auto cuda_place = in_old_num_accumulates->place();
   memory::Copy(platform::CPUPlace(), old_num_accumulates_, cuda_place,
                in_old_num_accumulates->data<int64_t>(), sizeof(int64_t),
                stream);
@@ -44,8 +43,7 @@ void SetAccumulators<paddle::platform::CUDADeviceContext>(
   auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates");
   auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates");
   auto* out_num_updates = ctx.Output<Tensor>("out_num_updates");
-  auto cuda_place =
-      BOOST_GET_CONST(platform::CUDAPlace, out_old_num_accumulates->place());
+  auto cuda_place = out_old_num_accumulates->place();
   memory::Copy(cuda_place, out_old_num_accumulates->data<int64_t>(),
                platform::CPUPlace(), &old_num_accumulates_, sizeof(int64_t),
......
@@ -57,8 +57,7 @@ class BernoulliOpKernel<platform::CUDADeviceContext, T>
     auto* out_data = out->mutable_data<T>(ctx.GetPlace());
     int64_t size = x->numel();
-    int device_id =
-        BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId();
+    int device_id = ctx.GetPlace().GetDeviceId();
     auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);
     auto seed_offset = gen_cuda->IncrementOffset(1);
     int64_t gen_offset = size * seed_offset.second;
......
@@ -102,8 +102,7 @@ class CholeskyGPUKernel : public framework::OpKernel<T> {
     std::vector<int> error_info;  // only for checking positive matrix
     error_info.resize(batch_count);
-    memory::Copy(platform::CPUPlace(), error_info.data(),
-                 BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()),
+    memory::Copy(platform::CPUPlace(), error_info.data(), dev_ctx.GetPlace(),
                  info_ptr, sizeof(int) * batch_count, dev_ctx.stream());
     for (int i = 0; i < batch_count; ++i) {
......
@@ -306,7 +306,7 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel<T> {
                           num_classes, num_samples));
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    auto place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace());
+    auto place = dev_ctx.GetPlace();
     int batch_size = label->numel();
     // Algorithm:
@@ -397,8 +397,7 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel<T> {
                  (NumBlocks(num_classes) * kNumCUDAThreads * vec_size) +
                  1) *
                 vec_size;
-    int device_id =
-        BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId();
+    int device_id = ctx.GetPlace().GetDeviceId();
     auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);
     if (gen_cuda->GetIsInitPy() && (!fix_seed)) {
       auto seed_offset = gen_cuda->IncrementOffset(offset);
......
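The bernoulli and class_center_sample hunks show the same simplification for device ids: instead of extracting a concrete platform::CUDAPlace just to call GetDeviceId(), the kernels ask the place returned by ctx.GetPlace() directly. A small self-contained sketch of that call-site shape; ExecutionContext, Place and SelectGenerator below are simplified stand-ins, not the framework API.

// Sketch only: minimal types to show the call-site change.
#include <cassert>

enum class AllocationType { CPU, GPU };

struct Place {
  AllocationType alloc_type{AllocationType::CPU};
  int device{0};  // exposed both as a field and via GetDeviceId()
  int GetDeviceId() const { return device; }
};

struct ExecutionContext {
  Place place;
  const Place& GetPlace() const { return place; }
};

int SelectGenerator(const ExecutionContext& ctx) {
  // Before: int device_id =
  //     BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId();
  // After: the unified place answers directly.
  return ctx.GetPlace().GetDeviceId();
}

int main() {
  ExecutionContext ctx{Place{AllocationType::GPU, 3}};
  assert(SelectGenerator(ctx) == 3);
  return 0;
}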
@@ -33,7 +33,7 @@ class AllReduceOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto place = ctx.GetPlace();
-    PADDLE_ENFORCE_EQ(is_gpu_place(place), true,
+    PADDLE_ENFORCE_EQ(platform::is_gpu_place(place), true,
                       platform::errors::PreconditionNotMet(
                           "AllReduce op can run on gpu place only for now."));
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
......
@@ -34,7 +34,7 @@ class NCCLBroadcastOpKernel : public framework::OpKernel<T> {
                           "The place of ExecutionContext should be CUDAPlace."));
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-    int dev_id = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).device;
+    int dev_id = ctx.GetPlace().device;
     int root_dev_id = ctx.Attr<int>("root");
     auto in = ctx.Input<framework::Tensor>("X");
......
@@ -40,7 +40,7 @@ class BKCLBroadcastOpKernel : public framework::OpKernel<T> {
                           "The place of ExecutionContext should be XPUPlace."));
 #if defined(PADDLE_WITH_XPU_BKCL)
-    int dev_id = BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()).device;
+    int dev_id = ctx.GetPlace().device;
     int root_dev_id = ctx.Attr<int>("root");
     auto in = ctx.Input<framework::Tensor>("X");
......
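Two related fixes share the hunks above: in allreduce_op.h the check is spelled platform::is_gpu_place(place), presumably because once the place type is defined outside namespace platform the unqualified call is no longer found through argument-dependent lookup; in the NCCL/BKCL broadcast kernels the device ordinal is read straight from the place's public device field instead of unpacking a CUDAPlace/XPUPlace first. A sketch of the lookup point under those assumptions; namespaces and members below are illustrative, not the real headers.

// Sketch only: why the unqualified call stops compiling.
namespace pten {
struct Place {
  int device{0};
  bool gpu{false};
};
}  // namespace pten

namespace paddle {
namespace platform {
using Place = pten::Place;
inline bool is_gpu_place(const Place& p) { return p.gpu; }
}  // namespace platform

namespace operators {
inline int DeviceOf(const platform::Place& place) {
  // is_gpu_place(place);  // not found here: ADL searches pten, not platform
  if (platform::is_gpu_place(place)) {  // qualified, as in the hunk above
    return place.device;                // ordinal read off the unified place
  }
  return -1;
}
}  // namespace operators
}  // namespace paddle

int main() {
  pten::Place p{/*device=*/1, /*gpu=*/true};
  return paddle::operators::DeviceOf(p) == 1 ? 0 : 1;
}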
@@ -24,7 +24,6 @@ namespace imperative {
 class OpBase;
 }  // namespace imperative
 namespace platform {
-struct CPUPlace;
 struct float16;
 }  // namespace platform
 }  // namespace paddle
......
@@ -16,7 +16,7 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
-struct CUDAPlace;
 struct float16;
 }  // namespace platform
 }  // namespace paddle
......
@@ -16,7 +16,7 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
-struct XPUPlace;
 struct float16;
 }  // namespace platform
 }  // namespace paddle
......
@@ -24,7 +24,6 @@ namespace imperative {
 class OpBase;
 }  // namespace imperative
 namespace platform {
-struct CPUPlace;
 struct float16;
 }  // namespace platform
 }  // namespace paddle
......
@@ -16,7 +16,7 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
-struct CUDAPlace;
 struct float16;
 }  // namespace platform
 }  // namespace paddle
......
@@ -16,7 +16,7 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
-struct XPUPlace;
 struct float16;
 }  // namespace platform
 }  // namespace paddle
......
@@ -24,7 +24,6 @@ namespace imperative {
 class OpBase;
 }  // namespace imperative
 namespace platform {
-struct CPUPlace;
 struct float16;
 }  // namespace platform
 }  // namespace paddle
......
@@ -16,7 +16,7 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
-struct CUDAPlace;
 struct float16;
 }  // namespace platform
 }  // namespace paddle
......
@@ -16,7 +16,7 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
-struct XPUPlace;
 struct float16;
 }  // namespace platform
 }  // namespace paddle
......
@@ -22,7 +22,6 @@ namespace imperative {
 class OpBase;
 }  // namespace imperative
 namespace platform {
-struct CPUPlace;
 struct float16;
 }  // namespace platform
 }  // namespace paddle
......
@@ -16,7 +16,7 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
-struct CUDAPlace;
 struct float16;
 }  // namespace platform
 }  // namespace paddle
......