Unverified · Commit a1006b2b authored by Huang Jiyi, committed by GitHub

[phi decoupling] decouple dependency to device_context in phi (Part 1) (#50865)

* move DeviceContextPool to phi

* add EmplaceExternalContextFunc

* update namespace

* update cmake

* fix bugs and create context_pool_impl.h

* replace platform::is_xxx_place

* fix bugs

* update generator

* fix bugs

* fix bugs

* fix bugs

* fix bugs

* fix bugs

* fix bugs

* fix bugs

* fix enforce usage

* Revert "fix enforce usage"

This reverts commit 5f521f08a69713cee506e64a00ec6d9fba709e27.

* fix bugs

* rm XPUDeviceContext and CustomDeviceContext

* fix bugs

* fix context init bug

* fix bugs after merge

* fix bugs

* fix name

* fix mutable_data

* update and fix bugs

* fix bugs

* update

* fix bugs

* fix name

* fix bugs

* merge

* fix bugs

* create context_pool in phi/backends

* create context_pool in phi/backends

* fix bugs

* fix xpu bugs

* fix rocm bugs

* fix bugs

* fix bugs

* fix bugs

* fix xpu bugs

* update

* update

* fix bugs

* fix bugs
Parent 203a62b8
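The patch below moves DeviceContextPool out of paddle::platform and into phi, and lets fluid inject its extra device contexts through an EmplaceExternalContextFunc hook at initialization time. A minimal sketch of the resulting call pattern, assuming the wrapper name InitPoolWithFluidContexts is illustrative and not part of this patch:

#include <vector>
#include "paddle/fluid/platform/device_context.h"  // platform::EmplaceExternalContext
#include "paddle/phi/backends/context_pool.h"      // phi::DeviceContextPool

void InitPoolWithFluidContexts(const std::vector<phi::Place>& places) {
  // phi emplaces its native CPU/GPU/XPU/Custom contexts itself; the hook is
  // invoked once per place so fluid can add the contexts phi does not know
  // about (IPU/MLU/NPU/CUDAPinned).
  paddle::platform::DeviceContextPool::Init(
      places, paddle::platform::EmplaceExternalContext);
}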
@@ -94,4 +94,6 @@ paddle/fluid/pybind/op_function_impl.h
 paddle/fluid/pybind/*final_state_op_function_impl.h
 paddle/fluid/prim/api/generated/prim_api/*
 paddle/fluid/framework/__init__.py
+paddle/phi/api/profiler/__init__.py
 python/paddle/incubate/fleet/parameter_server/pslib/ps_pb2.py
+python/paddle/fluid/incubate/fleet/parameter_server/pslib/ps_pb2.py
@@ -165,7 +165,7 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
       PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
           conv_desc[i], CUDNN_DEFAULT_MATH));
 #if CUDA_VERSION >= 11000 && CUDNN_VERSION >= 8000
-      if (!platform::allow_tf32_cudnn) {
+      if (!phi::allow_tf32_cudnn) {
         PADDLE_ENFORCE_GPU_SUCCESS(
             platform::dynload::cudnnSetConvolutionMathType(conv_desc[i],
                                                            CUDNN_FMA_MATH));
......
@@ -19,13 +19,6 @@ namespace phi {
 class DenseTensor;
 }  // namespace phi
-namespace paddle {
-namespace framework {}  // namespace framework
-namespace platform {
-class XPUDeviceContext;
-}  // namespace platform
-}  // namespace paddle
 namespace paddle {
 namespace operators {
 namespace math {
......
@@ -159,7 +159,6 @@ cc_library(
   cudnn_workspace_helper
   ${XPU_CTX_DEPS}
   ${MLU_CTX_DEPS}
-  eigen3
   phi_backends
   phi_device_context
   generator)
......
@@ -15,7 +15,7 @@
 #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
-#include "paddle/phi/backends/all_context.h"
+#include "paddle/phi/backends/context_pool.h"
 DECLARE_bool(use_stream_safe_cuda_allocator);
 DECLARE_bool(new_executor_use_cuda_graph);
......
@@ -54,7 +54,8 @@ void InitDevice() {
   }
   EXPECT_GT(static_cast<int>(places.size()), 0);
-  paddle::platform::DeviceContextPool::Init(places);
+  paddle::platform::DeviceContextPool::Init(
+      places, paddle::platform::EmplaceExternalContext);
 }
 void TestDeviceInterface(const paddle::platform::Place& place) {
......
@@ -19,7 +19,6 @@ limitations under the License. */
 #include <set>
 #include "glog/logging.h"
-#include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/fluid/platform/device/device_wrapper.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -39,19 +38,11 @@ limitations under the License. */
 #include "paddle/fluid/platform/device/mlu/device_context_allocator.h"
 #endif
+#include "paddle/phi/backends/context_pool_utils.h"
 namespace paddle {
 namespace platform {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-bool allow_tf32_cublas = true;
-void SetAllowTF32Cublas(bool active) { allow_tf32_cublas = active; }
-bool AllowTF32Cublas() { return allow_tf32_cublas; }
-bool allow_tf32_cudnn = true;
-void SetAllowTF32Cudnn(bool active) { allow_tf32_cudnn = active; }
-bool AllowTF32Cudnn() { return allow_tf32_cudnn; }
-#endif  // PADDLE_WITH_CUDA
 DeviceType Place2DeviceType(const platform::Place& place) {
   if (platform::is_cpu_place(place)) {
     return platform::DeviceType::CPU;
@@ -73,312 +64,76 @@ DeviceType Place2DeviceType(const platform::Place& place) {
   }
 }
static DeviceContextPool* pool = nullptr; void EmplaceExternalContext(
DeviceContextPool& DeviceContextPool::Instance() {
PADDLE_ENFORCE_NOT_NULL(pool,
phi::errors::PreconditionNotMet(
"Need to Create DeviceContextPool firstly!"));
return *pool;
}
/*! \brief Create should only called by Init function */
DeviceContextPool& DeviceContextPool::Init(
const std::vector<platform::Place>& places) {
if (pool == nullptr) {
pool = new DeviceContextPool(places);
}
return *pool;
}
bool DeviceContextPool::IsInitialized() { return pool != nullptr; }
void DeviceContextPool::SetPool(DeviceContextPool* dev_pool) {
pool = dev_pool;
}
thread_local const std::map<Place,
std::shared_future<std::unique_ptr<DeviceContext>>>*
DeviceContextPool::external_device_contexts_ = nullptr;
platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
VLOG(6) << "DeviceContextPool Get: " << place;
const std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
ptr;
if (external_device_contexts_ && external_device_contexts_->count(place)) {
ptr = external_device_contexts_;
} else {
ptr = &device_contexts_;
}
auto it = ptr->find(place);
if (it == ptr->end()) {
PADDLE_THROW(platform::errors::Unimplemented(
"Place %s is not supported. Please check that your paddle compiles "
"with WITH_GPU, WITH_XPU, WITH_IPU, WITH_MLU or WITH_ASCEND_CL option "
"or check "
"that your train process set the correct device id if you use "
"Executor.",
place));
}
return it->second.get().get();
}
size_t DeviceContextPool::size() const {
if (external_device_contexts_) {
return external_device_contexts_->size();
}
return device_contexts_.size();
}
const std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>&
DeviceContextPool::device_contexts() const {
if (external_device_contexts_) {
return *external_device_contexts_;
}
return device_contexts_;
}
void DeviceContextPool::SetDeviceContexts(
const std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
dev_ctxs) {
external_device_contexts_ = dev_ctxs;
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <typename DevCtx>
typename std::enable_if<!std::is_same<DevCtx, phi::GPUContext>::value,
DevCtx*>::type
ConstructDevCtx(const platform::Place& p, /*unused*/ int stream_priority = 0) {
return new DevCtx(p);
}
template <typename DevCtx>
typename std::enable_if<std::is_same<DevCtx, phi::GPUContext>::value,
DevCtx*>::type
ConstructDevCtx(const platform::Place& p, int stream_priority) {
return new DevCtx(p, /*init=*/true, stream_priority);
}
#else
template <typename DevCtx>
DevCtx* ConstructDevCtx(const platform::Place& p,
/*unused*/ int stream_priority) {
return new DevCtx(p);
}
#endif
template <typename DevCtx>
std::unique_ptr<DeviceContext> CreateDeviceContext(
const platform::Place& p,
bool disable_setting_default_stream_for_allocator = false,
int stream_priority = 0) {
using PtrType = std::unique_ptr<DeviceContext>;
DevCtx* dev_ctx = ConstructDevCtx<DevCtx>(p, stream_priority);
if (is_gpu_place(p)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto* cuda_ctx = dynamic_cast<phi::GPUContext*>(dev_ctx);
PADDLE_ENFORCE_NOT_NULL(
cuda_ctx,
platform::errors::InvalidArgument(
"Failed to dynamic_cast dev_ctx into phi::GPUContext."));
auto& instance = memory::allocation::AllocatorFacade::Instance();
if (!disable_setting_default_stream_for_allocator) {
instance.SetDefaultStream(CUDAPlace(p.GetDeviceId()), cuda_ctx->stream());
}
dev_ctx->SetAllocator(instance.GetAllocator(p, cuda_ctx->stream()).get());
dev_ctx->SetPinnedAllocator(
instance.GetAllocator(paddle::platform::CUDAPinnedPlace()).get());
cuda_ctx->PartialInitWithAllocator();
dev_ctx->SetGenerator(phi::DefaultCUDAGenerator(p.GetDeviceId()).get());
#endif
} else if (is_xpu_place(p)) {
#if defined(PADDLE_WITH_XPU)
dev_ctx->SetAllocator(
memory::allocation::AllocatorFacade::Instance().GetAllocator(p).get());
dev_ctx->SetGenerator(phi::DefaultXPUGenerator(p.GetDeviceId()).get());
#endif
} else {
dev_ctx->SetAllocator(
memory::allocation::AllocatorFacade::Instance().GetAllocator(p).get());
dev_ctx->SetGenerator(phi::DefaultCPUGenerator().get());
}
dev_ctx->SetHostGenerator(phi::DefaultCPUGenerator().get());
dev_ctx->SetHostAllocator(memory::allocation::AllocatorFacade::Instance()
.GetAllocator(platform::CPUPlace())
.get());
dev_ctx->SetZeroAllocator(memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(p)
.get());
dev_ctx->SetHostZeroAllocator(memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(platform::CPUPlace())
.get());
return PtrType(dev_ctx);
}
template <typename DevCtx>
inline void EmplaceDeviceContext(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>* std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context, place_to_device_context,
platform::Place place, const platform::Place& place,
bool disable_setting_default_stream_for_allocator, bool disable_setting_default_stream_for_allocator,
int stream_priority) { int stream_priority) {
// lazy evaluation. i.e., only create device context at first `Get` if (platform::is_cuda_pinned_place(place)) {
place_to_device_context->emplace(
place,
std::async(std::launch::deferred,
CreateDeviceContext<DevCtx>,
place,
disable_setting_default_stream_for_allocator,
stream_priority));
}
void EmplaceDeviceContexts(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const std::vector<platform::Place>& places,
bool disable_setting_default_stream_for_allocator,
int stream_priority) {
PADDLE_ENFORCE_GT(
places.size(),
0,
platform::errors::InvalidArgument("The number of platform places should "
"be larger than 0. But received %d.",
places.size()));
std::set<Place> set;
for (auto& p : places) {
set.insert(p);
}
for (auto& p : set) {
if (platform::is_cpu_place(p)) {
#ifdef PADDLE_WITH_MKLDNN
EmplaceDeviceContext<phi::OneDNNContext>(
place_to_device_context,
p,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
EmplaceDeviceContext<phi::CPUContext>(
place_to_device_context,
p,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#endif
} else if (platform::is_gpu_place(p)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
EmplaceDeviceContext<phi::GPUContext>(
place_to_device_context,
p,
disable_setting_default_stream_for_allocator,
stream_priority);
#else
PADDLE_THROW(
platform::errors::Unimplemented("CUDAPlace is not supported. Please "
"re-compile with WITH_GPU option."));
#endif
} else if (platform::is_cuda_pinned_place(p)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
EmplaceDeviceContext<CUDAPinnedDeviceContext>( phi::EmplaceDeviceContext<CUDAPinnedDeviceContext>(
place_to_device_context, place_to_device_context,
p, place,
disable_setting_default_stream_for_allocator, disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority); /*unused*/ stream_priority);
#else
PADDLE_THROW(platform::errors::Unimplemented(
"CUDAPlace is not supported. Please re-compile with WITH_GPU "
"option."));
#endif
} else if (platform::is_xpu_place(p)) {
#ifdef PADDLE_WITH_XPU
EmplaceDeviceContext<XPUDeviceContext>(
place_to_device_context,
p,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else #else
PADDLE_THROW( PADDLE_THROW(platform::errors::Unimplemented(
platform::errors::Unimplemented("XPUPlace is not supported. Please " "CUDAPlace is not supported. Please re-compile with WITH_GPU "
"re-compile with WITH_XPU option.")); "option."));
#endif #endif
} else if (platform::is_mlu_place(p)) { } else if (platform::is_mlu_place(place)) {
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
EmplaceDeviceContext<MLUDeviceContext>( phi::EmplaceDeviceContext<MLUDeviceContext>(
place_to_device_context, place_to_device_context,
p, place,
disable_setting_default_stream_for_allocator, disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority); /*unused*/ stream_priority);
#else #else
PADDLE_THROW( PADDLE_THROW(
platform::errors::Unimplemented("MLUPlace is not supported. Please " platform::errors::Unimplemented("MLUPlace is not supported. Please "
"re-compile with WITH_MLU option.")); "re-compile with WITH_MLU option."));
#endif #endif
} else if (platform::is_ipu_place(p)) { } else if (platform::is_ipu_place(place)) {
#ifdef PADDLE_WITH_IPU #ifdef PADDLE_WITH_IPU
EmplaceDeviceContext<IPUDeviceContext>( phi::EmplaceDeviceContext<IPUDeviceContext>(
place_to_device_context, place_to_device_context,
p, place,
disable_setting_default_stream_for_allocator, disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority); /*unused*/ stream_priority);
#else #else
PADDLE_THROW( PADDLE_THROW(
platform::errors::Unimplemented("IPUPlace is not supported. Please " platform::errors::Unimplemented("IPUPlace is not supported. Please "
"re-compile with WITH_IPU option.")); "re-compile with WITH_IPU option."));
#endif #endif
} else if (platform::is_npu_place(p)) { } else if (platform::is_npu_place(place)) {
#ifdef PADDLE_WITH_ASCEND_CL #ifdef PADDLE_WITH_ASCEND_CL
EmplaceDeviceContext<NPUDeviceContext>( phi::EmplaceDeviceContext<NPUDeviceContext>(
place_to_device_context, place_to_device_context,
p, place,
disable_setting_default_stream_for_allocator, disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority); /*unused*/ stream_priority);
#else #else
PADDLE_THROW(platform::errors::Unimplemented( PADDLE_THROW(platform::errors::Unimplemented(
"NPUPlace is not supported. Please " "NPUPlace is not supported. Please "
"re-compile with WITH_ASCEND_CL option.")); "re-compile with WITH_ASCEND_CL option."));
#endif #endif
} else if (platform::is_npu_pinned_place(p)) { } else if (platform::is_npu_pinned_place(place)) {
#ifdef PADDLE_WITH_ASCEND_CL #ifdef PADDLE_WITH_ASCEND_CL
EmplaceDeviceContext<NPUPinnedDeviceContext>( phi::EmplaceDeviceContext<NPUPinnedDeviceContext>(
place_to_device_context, place_to_device_context,
p, place,
disable_setting_default_stream_for_allocator, disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority); /*unused*/ stream_priority);
#else
PADDLE_THROW(platform::errors::Unimplemented(
"NPUPinnedPlace is not supported. Please re-compile with "
"WITH_ASCEND_CL "
"option."));
#endif
} else if (platform::is_custom_place(p)) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
EmplaceDeviceContext<CustomDeviceContext>(
place_to_device_context,
p,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else #else
PADDLE_THROW(platform::errors::Unimplemented( PADDLE_THROW(platform::errors::Unimplemented(
"CustomPlace is not supported. Please re-compile with " "NPUPinnedPlace is not supported. Please re-compile with "
"WITH_CUSTOM_DEVICE " "WITH_ASCEND_CL "
"option.")); "option."));
#endif #endif
}
} }
} }
DeviceContextPool::DeviceContextPool(
const std::vector<platform::Place>& places) {
EmplaceDeviceContexts(&device_contexts_,
places,
/*disable_setting_default_stream_for_allocator=*/false,
/*stream_priority=*/0);
}
#ifdef PADDLE_WITH_IPU
IPUDeviceContext::IPUDeviceContext(IPUPlace place) : place_(place) {}
@@ -390,19 +145,6 @@ void IPUDeviceContext::Wait() const {
IPUDeviceContext::~IPUDeviceContext() {}
#endif
#ifdef PADDLE_WITH_XPU
XPUDeviceContext::XPUDeviceContext() : phi::XPUContext() {
phi::XPUContext::Init();
}
XPUDeviceContext::~XPUDeviceContext() {}
XPUDeviceContext::XPUDeviceContext(XPUPlace place) : phi::XPUContext(place) {
phi::XPUContext::Init();
LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: "
<< static_cast<int>(place.device);
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
@@ -469,14 +211,5 @@ Eigen::DefaultDevice* CUDAPinnedDeviceContext::eigen_device() const {
const Place& CUDAPinnedDeviceContext::GetPlace() const { return place_; }
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
CustomDeviceContext::CustomDeviceContext(CustomPlace place)
: phi::CustomContext(place) {
Init();
stream_.reset(new phi::stream::Stream(place, stream()));
}
CustomDeviceContext::~CustomDeviceContext() {}
#endif
}  // namespace platform
}  // namespace paddle
...@@ -23,6 +23,7 @@ limitations under the License. */ ...@@ -23,6 +23,7 @@ limitations under the License. */
#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/fluid/platform/device/gpu/gpu_types.h"
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/backends/custom/custom_context.h" #include "paddle/phi/backends/custom/custom_context.h"
#include "paddle/phi/backends/gpu/gpu_decls.h" #include "paddle/phi/backends/gpu/gpu_decls.h"
...@@ -98,18 +99,6 @@ struct GpuDevice; ...@@ -98,18 +99,6 @@ struct GpuDevice;
namespace paddle { namespace paddle {
namespace platform { namespace platform {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
/*Set the value of the global variable allow_tf32_cublas*/
void SetAllowTF32Cublas(bool active);
/*Get the global variable allow_tf32_cublas value*/
bool AllowTF32Cublas();
extern bool allow_tf32_cudnn;
/*Set the value of the global variable allow_tf32_cudnn*/
void SetAllowTF32Cudnn(bool active);
/*Get the global variable allow_tf32_cudnn value*/
bool AllowTF32Cudnn();
#endif // PADDLE_WITH_CUDA
enum DeviceType { enum DeviceType {
CPU = 0, CPU = 0,
CUDA = 1, CUDA = 1,
...@@ -134,14 +123,6 @@ constexpr DeviceType kCUSTOM_DEVICE = DeviceType::CUSTOM_DEVICE; ...@@ -134,14 +123,6 @@ constexpr DeviceType kCUSTOM_DEVICE = DeviceType::CUSTOM_DEVICE;
using DeviceContext = phi::DeviceContext; using DeviceContext = phi::DeviceContext;
template <typename Place>
struct DefaultDeviceContextType;
template <>
struct DefaultDeviceContextType<platform::CPUPlace> {
using TYPE = phi::CPUContext;
};
// Graphcore IPU // Graphcore IPU
#ifdef PADDLE_WITH_IPU #ifdef PADDLE_WITH_IPU
class IPUDeviceContext class IPUDeviceContext
...@@ -161,35 +142,15 @@ class IPUDeviceContext ...@@ -161,35 +142,15 @@ class IPUDeviceContext
private: private:
IPUPlace place_; IPUPlace place_;
}; };
template <>
struct DefaultDeviceContextType<platform::IPUPlace> {
using TYPE = IPUDeviceContext;
};
#endif #endif
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
class MLUDeviceContext; class MLUDeviceContext;
template <>
struct DefaultDeviceContextType<platform::MLUPlace>;
#endif #endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
namespace xpu = baidu::xpu::api; namespace xpu = baidu::xpu::api;
class XPUDeviceContext : public phi::XPUContext { using XPUDeviceContext = phi::XPUContext;
public:
XPUDeviceContext();
explicit XPUDeviceContext(XPUPlace place);
virtual ~XPUDeviceContext();
Eigen::DefaultDevice* eigen_device() const { return nullptr; }
xpuStream stream() const { return XPUContext::x_context()->xpu_stream; }
void CreateStream() { XPUContext::CreateStream(); }
};
template <>
struct DefaultDeviceContextType<platform::XPUPlace> {
using TYPE = XPUDeviceContext;
};
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL #ifdef PADDLE_WITH_ASCEND_CL
...@@ -251,11 +212,6 @@ class NPUDeviceContext ...@@ -251,11 +212,6 @@ class NPUDeviceContext
DISABLE_COPY_AND_ASSIGN(NPUDeviceContext); DISABLE_COPY_AND_ASSIGN(NPUDeviceContext);
}; };
template <>
struct DefaultDeviceContextType<platform::NPUPlace> {
using TYPE = NPUDeviceContext;
};
// Currently, NPUPinnedDeviceContext is only used to data copying. // Currently, NPUPinnedDeviceContext is only used to data copying.
class NPUPinnedDeviceContext class NPUPinnedDeviceContext
: public DeviceContext, : public DeviceContext,
...@@ -275,19 +231,9 @@ class NPUPinnedDeviceContext ...@@ -275,19 +231,9 @@ class NPUPinnedDeviceContext
std::unique_ptr<Eigen::DefaultDevice> eigen_device_; std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
}; };
template <>
struct DefaultDeviceContextType<platform::NPUPinnedPlace> {
using TYPE = NPUPinnedDeviceContext;
};
#endif #endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <>
struct DefaultDeviceContextType<platform::CUDAPlace> {
using TYPE = phi::GPUContext;
};
// Currently, CUDAPinnedDeviceContext is only used to data copying. // Currently, CUDAPinnedDeviceContext is only used to data copying.
class CUDAPinnedDeviceContext class CUDAPinnedDeviceContext
: public DeviceContext, : public DeviceContext,
...@@ -306,90 +252,57 @@ class CUDAPinnedDeviceContext ...@@ -306,90 +252,57 @@ class CUDAPinnedDeviceContext
CUDAPinnedPlace place_; CUDAPinnedPlace place_;
std::unique_ptr<Eigen::DefaultDevice> eigen_device_; std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
}; };
template <>
struct DefaultDeviceContextType<platform::CUDAPinnedPlace> {
using TYPE = CUDAPinnedDeviceContext;
};
#endif #endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE #ifdef PADDLE_WITH_CUSTOM_DEVICE
class CustomDeviceContext : public phi::CustomContext { using CustomDeviceContext = phi::CustomContext;
public:
explicit CustomDeviceContext(CustomPlace place);
virtual ~CustomDeviceContext();
Eigen::DefaultDevice* eigen_device() const { return nullptr; }
template <typename Callback>
void AddStreamCallback(Callback&& callback) const {
return stream_->AddCallback(callback);
}
void WaitStreamCallback() const { return stream_->WaitCallback(); }
private:
std::shared_ptr<phi::stream::Stream> stream_;
};
template <>
struct DefaultDeviceContextType<platform::CustomPlace> {
using TYPE = CustomDeviceContext;
};
#else
template <>
struct DefaultDeviceContextType<platform::CustomPlace> {
using TYPE = DeviceContext;
};
#endif #endif
void EmplaceDeviceContexts( void EmplaceExternalContext(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>* std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context, place_to_device_context,
const std::vector<platform::Place>& places, const platform::Place& place,
bool disable_setting_default_stream_for_allocator, bool disable_setting_default_stream_for_allocator,
int stream_priority); int stream_priority);
/*! \brief device context pool singleton */ using phi::EmplaceDeviceContexts;
class DeviceContextPool {
public:
static DeviceContextPool& Instance();
/*! \brief Create should only called by Init function */
static DeviceContextPool& Init(const std::vector<platform::Place>& places);
static bool IsInitialized(); using DeviceContextPool = phi::DeviceContextPool;
static void SetPool(DeviceContextPool* dev_pool); } // namespace platform
} // namespace paddle
/*! \brief Return handle of single device context. */ namespace phi {
platform::DeviceContext* Get(const platform::Place& place);
template <typename Place> #ifdef PADDLE_WITH_IPU
const typename DefaultDeviceContextType<Place>::TYPE* GetByPlace( template <>
const Place& place) { struct DefaultDeviceContextType<phi::IPUPlace> {
return reinterpret_cast< using TYPE = paddle::platform::IPUDeviceContext;
const typename DefaultDeviceContextType<Place>::TYPE*>(Get(place)); };
} #endif
size_t size() const; #ifdef PADDLE_WITH_MLU
template <>
struct DefaultDeviceContextType<phi::MLUPlace>;
#endif
const std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>& #ifdef PADDLE_WITH_ASCEND_CL
device_contexts() const; template <>
struct DefaultDeviceContextType<phi::NPUPlace> {
using TYPE = paddle::platform::NPUDeviceContext;
};
static void SetDeviceContexts( template <>
const std::map<Place, struct DefaultDeviceContextType<phi::NPUPinnedPlace> {
std::shared_future<std::unique_ptr<DeviceContext>>>*); using TYPE = paddle::platform::NPUPinnedDeviceContext;
};
#endif
private: #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
explicit DeviceContextPool(const std::vector<platform::Place>& places); template <>
struct DefaultDeviceContextType<phi::GPUPinnedPlace> {
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>> using TYPE = paddle::platform::CUDAPinnedDeviceContext;
device_contexts_;
static thread_local const std::
map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
external_device_contexts_; // not owned
DISABLE_COPY_AND_ASSIGN(DeviceContextPool);
}; };
#endif
} // namespace platform } // namespace phi
} // namespace paddle
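With the class definitions removed above, XPUDeviceContext and CustomDeviceContext become plain aliases of phi::XPUContext and phi::CustomContext, so fluid code that still names them compiles unchanged. A small sketch under that assumption (the helper name UseXpuContextAlias is illustrative):

#include "paddle/fluid/platform/device_context.h"

#ifdef PADDLE_WITH_XPU
void UseXpuContextAlias(const phi::XPUPlace& place) {
  auto& pool = paddle::platform::DeviceContextPool::Instance();
  // XPUDeviceContext now resolves to phi::XPUContext.
  auto* xpu_ctx =
      static_cast<paddle::platform::XPUDeviceContext*>(pool.Get(place));
  xpu_ctx->Wait();  // the phi::DeviceContext interface is used directly
}
#endif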
@@ -119,59 +119,6 @@ using namespace ::phi::enforce;  // NOLINT
#define PADDLE_MAY_THROW noexcept(false)
#endif
/*
* Summary: This macro is used to get Variable or internal type
* data (such as LoDTensor or SelectedRows) of the Input and
* Output in op, generally used when call scope.FindVar(Input/
* Output("Name")) or ctx.Input<LoDTensor>().
* Firstly this macro check whether the obtained pointer is null,
* and then return data if it is not null.
*
* Note: This macro is only suitable for specific scenarios and
* does not intended to be widely used. If it cannot meet the
* requirements, please use other PADDLE_ENFORCE** check macro.
*
* Parameters:
*     __PTR: pointer
* __ROLE: (string), Input or Output
* __NAME: (string), Input or Output name
* __OP_TYPE: (string), the op type
*
* Return: The data pointed to by the pointer.
*
* Examples:
* GET_DATA_SAFELY(ctx.Input<LoDTensor>("X"), "Input", "X", "Mul");
*/
#define GET_DATA_SAFELY(__PTR, __ROLE, __NAME, __OP_TYPE) \
(([&]() -> std::add_lvalue_reference<decltype(*(__PTR))>::type { \
auto* __ptr = (__PTR); \
if (UNLIKELY(nullptr == __ptr)) { \
auto __summary__ = phi::errors::NotFound( \
"Unable to get %s data of %s %s in operator %s. " \
"Possible reasons are:\n" \
" 1. The %s is not the %s of operator %s;\n" \
" 2. The %s has no corresponding variable passed in;\n" \
" 3. The %s corresponding variable is not initialized.", \
phi::demangle( \
typeid(std::add_lvalue_reference<decltype(*__ptr)>::type) \
.name()), \
__ROLE, \
__NAME, \
__OP_TYPE, \
__NAME, \
__ROLE, \
__OP_TYPE, \
__NAME, \
__NAME); \
auto __message__ = ::paddle::string::Sprintf( \
"%s\n [Hint: pointer " #__PTR " should not be null.]", \
__summary__.error_message()); \
__THROW_ERROR_INTERNAL__( \
phi::ErrorSummary(__summary__.code(), __message__)); \
} \
return *__ptr; \
})())
/*
 * Summary: This macro is used to check whether op has specified
 * Input or Output Variables. Because op's Input and Output
......
@@ -282,7 +282,7 @@ void InitDevices(const std::vector<int> devices) {
     }
   }
 #endif
-  platform::DeviceContextPool::Init(places);
+  platform::DeviceContextPool::Init(places, platform::EmplaceExternalContext);
 #ifndef PADDLE_WITH_MKLDNN
   platform::SetNumThreads(FLAGS_paddle_num_threads);
......
@@ -27,7 +27,7 @@ TEST(InitDevices, CPU) {
     !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MLU)
   InitDevices();
   DeviceContextPool& pool = DeviceContextPool::Instance();
-  ASSERT_EQ(pool.size(), 1U);
+  ASSERT_EQ(pool.Size(), 1U);
 #endif
 }
@@ -39,7 +39,7 @@ TEST(InitDevices, CUDA) {
   int count = paddle::platform::GetGPUDeviceCount();
   InitDevices();
   DeviceContextPool& pool = DeviceContextPool::Instance();
-  ASSERT_EQ(pool.size(), 2U + static_cast<unsigned>(count));
+  ASSERT_EQ(pool.Size(), 2U + static_cast<unsigned>(count));
 #endif
 }
@@ -51,7 +51,7 @@ TEST(InitDevices, XPU) {
   int count = paddle::platform::GetXPUDeviceCount();
   InitDevices();
   DeviceContextPool& pool = DeviceContextPool::Instance();
-  ASSERT_EQ(pool.size(), 1U + static_cast<unsigned>(count));
+  ASSERT_EQ(pool.Size(), 1U + static_cast<unsigned>(count));
 #endif
 }
@@ -63,7 +63,7 @@ TEST(InitDevices, MLU) {
   int count = paddle::platform::GetMLUDeviceCount();
   InitDevices();
   DeviceContextPool& pool = DeviceContextPool::Instance();
-  ASSERT_EQ(pool.size(), 1U + static_cast<unsigned>(count));
+  ASSERT_EQ(pool.Size(), 1U + static_cast<unsigned>(count));
 #endif
 }
......
@@ -2526,10 +2526,10 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("disable_op_info_recorder", &phi::DisableOpInfoRecorder);
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  m.def("set_cublas_switch", platform::SetAllowTF32Cublas);
-  m.def("get_cublas_switch", platform::AllowTF32Cublas);
-  m.def("set_cudnn_switch", platform::SetAllowTF32Cudnn);
-  m.def("get_cudnn_switch", platform::AllowTF32Cudnn);
+  m.def("set_cublas_switch", phi::SetAllowTF32Cublas);
+  m.def("get_cublas_switch", phi::AllowTF32Cublas);
+  m.def("set_cudnn_switch", phi::SetAllowTF32Cudnn);
+  m.def("get_cudnn_switch", phi::AllowTF32Cudnn);
 #endif  // PADDLE_WITH_CUDA
   m.def("clear_executor_cache", []() {
     pybind11::gil_scoped_release release;
......
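The TF32 switches bound above now resolve to the phi namespace; they are declared in paddle/phi/backends/context_pool.h later in this diff. A hedged C++ sketch of the new call sites (the function name DisableTF32ForDebugging is illustrative):

#include "paddle/phi/backends/context_pool.h"

void DisableTF32ForDebugging() {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  phi::SetAllowTF32Cublas(false);        // cuBLAS falls back to full FP32 math
  phi::SetAllowTF32Cudnn(false);         // cuDNN convolutions use CUDNN_FMA_MATH
  bool tf32_on = phi::AllowTF32Cudnn();  // query the current switch
  (void)tf32_on;
#endif
}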
@@ -91,7 +91,7 @@ Tensor add_n_impl(const std::vector<Tensor>& x) {
     phi::AddNInferMeta(x_metas, &meta_out);
     using kernel_signature =
-        void (*)(const platform::DeviceContext&,
+        void (*)(const phi::DeviceContext&,
                  const std::vector<const phi::SelectedRows*>&,
                  phi::SelectedRows*);
     auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
@@ -119,7 +119,7 @@ Tensor add_n_impl(const std::vector<Tensor>& x) {
     phi::AddNInferMeta(x_metas, &meta_out);
     using kernel_signature =
-        void (*)(const platform::DeviceContext&,
+        void (*)(const phi::DeviceContext&,
                  const std::vector<const phi::TensorBase*>&,
                  phi::DenseTensor*);
     auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
@@ -177,7 +177,7 @@ void embedding_grad_impl(const Tensor& x,
       meta_out.set_dtype(input_weight->dtype());
       kernel_out->set_height(input_weight->dims()[0]);
-      using kernel_signature = void (*)(const platform::DeviceContext&,
+      using kernel_signature = void (*)(const phi::DeviceContext&,
                                         const phi::DenseTensor&,
                                         const phi::DenseTensor&,
                                         const phi::DenseTensor&,
@@ -194,7 +194,7 @@ void embedding_grad_impl(const Tensor& x,
       auto* kernel_out = SetKernelOutput(weight_grad);
       phi::MetaTensor meta_out(kernel_out);
       phi::UnchangedInferMeta(MakeMetaTensor(*input_weight), &meta_out);
-      using kernel_signature = void (*)(const platform::DeviceContext&,
+      using kernel_signature = void (*)(const phi::DeviceContext&,
                                         const phi::DenseTensor&,
                                         const phi::DenseTensor&,
                                         const phi::DenseTensor&,
@@ -229,7 +229,7 @@ void embedding_grad_impl(const Tensor& x,
       auto* kernel_out = SetSelectedRowsKernelOutput(weight_grad);
       phi::MetaTensor meta_out(kernel_out);
       phi::UnchangedInferMeta(MakeMetaTensor(*input_weight), &meta_out);
-      using kernel_signature = void (*)(const platform::DeviceContext&,
+      using kernel_signature = void (*)(const phi::DeviceContext&,
                                         const phi::DenseTensor&,
                                         const phi::SelectedRows&,
                                         const phi::DenseTensor&,
@@ -247,7 +247,7 @@ void embedding_grad_impl(const Tensor& x,
       phi::MetaTensor meta_out(kernel_out);
       meta_out.set_dims(input_weight->GetCompleteDims());
       meta_out.set_dtype(input_weight->dtype());
-      using kernel_signature = void (*)(const platform::DeviceContext&,
+      using kernel_signature = void (*)(const phi::DeviceContext&,
                                         const phi::DenseTensor&,
                                         const phi::SelectedRows&,
                                         const phi::DenseTensor&,
......
@@ -14,7 +14,7 @@ limitations under the License. */
 #include "paddle/phi/api/include/context_pool.h"
-#include "paddle/phi/backends/all_context.h"
+#include "paddle/phi/backends/context_pool.h"
 #include "paddle/phi/core/allocator.h"
 #include "paddle/phi/core/enforce.h"
@@ -35,11 +35,11 @@ DeviceContextPool& DeviceContextPool::Instance() {
 const phi::DeviceContext* DeviceContextPool::Get(const Place& place) {
   auto it = context_map_.find(place);
   if (it == context_map_.end()) {
-    if (!paddle::platform::DeviceContextPool::IsInitialized()) {
+    if (!phi::DeviceContextPool::IsInitialized()) {
       paddle::framework::InitDevices();
     }
     // only when we need the specific DeviceContext, get and cache it
-    auto* dev_ctx = paddle::platform::DeviceContextPool::Instance().Get(place);
+    auto* dev_ctx = phi::DeviceContextPool::Instance().Get(place);
     {
       std::lock_guard<std::mutex> lock(mutex_);
       context_map_[place] = dev_ctx;
......
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "paddle/phi/api/lib/kernel_dispatch.h"
 #include "paddle/phi/api/lib/utils/allocator.h"
-#include "paddle/phi/backends/all_context.h"
+#include "paddle/phi/backends/context_pool.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_utils.h"
 #include "paddle/phi/kernels/cast_kernel.h"
@@ -33,7 +33,7 @@ inline bool NeedTransformDataType(const DataType& input,
          target == DataType::COMPLEX64 || target == DataType::COMPLEX128);
 }
-inline bool NeedTransformPlace(const paddle::platform::Place& input,
+inline bool NeedTransformPlace(const phi::Place& input,
                                const Backend& target,
                                const TransformFlag& transform_flag) {
   // NOTE(dev): The default value of TransformFlag is True, if it is set with
@@ -52,12 +52,12 @@ inline bool NeedTransformPlace(const paddle::platform::Place& input,
 inline bool NeedTransformLayout(const DataLayout& input,
                                 const DataLayout& target,
-                                const paddle::platform::Place& place,
+                                const phi::Place& place,
                                 const TransformFlag& transform_flag) {
   bool ret = transform_flag.need_trans_layout() &&
              (input != DataLayout::ALL_LAYOUT &&
              target != DataLayout::ALL_LAYOUT && input != target);
-  if (platform::is_gpu_place(place)) {
+  if (place.GetType() == phi::AllocationType::GPU) {
     return false;
   }
   return ret;
@@ -65,10 +65,10 @@ inline bool NeedTransformLayout(const DataLayout& input,
 inline phi::DenseTensor TransDataLayout(const phi::DenseTensor& tensor,
                                         DataLayout layout) {
-  auto& pool = paddle::platform::DeviceContextPool::Instance();
+  auto& pool = phi::DeviceContextPool::Instance();
   VLOG(3) << "DataLayoutTransform src_layout: " << tensor.layout()
           << " dst_layout: " << layout;
-  if (platform::is_cpu_place(tensor.place())) {
+  if (tensor.place().GetType() == phi::AllocationType::CPU) {
     auto* dev_ctx = static_cast<phi::CPUContext*>(pool.Get(tensor.place()));
     return phi::TransferLayout(*dev_ctx, tensor, layout);
   } else {
@@ -139,7 +139,7 @@ phi::DenseTensor CastDataType(const phi::GPUContext& dev_ctx,
 inline phi::DenseTensor TransDataType(const phi::DenseTensor& tensor,
                                       DataType dtype) {
-  auto& pool = paddle::platform::DeviceContextPool::Instance();
+  auto& pool = phi::DeviceContextPool::Instance();
   VLOG(3) << "DataTypeTransform src_dtype: " << tensor.dtype()
           << " dst_dtype: " << dtype;
@@ -147,11 +147,11 @@ inline phi::DenseTensor TransDataType(const phi::DenseTensor& tensor,
   DefaultAllocator alloc(tensor.place());
   phi::DenseTensor out(&alloc, {dtype, tensor.dims(), tensor.layout()});
-  if (platform::is_cpu_place(tensor.place())) {
+  if (tensor.place().GetType() == phi::AllocationType::CPU) {
     auto* dev_ctx = static_cast<phi::CPUContext*>(pool.Get(tensor.place()));
     return CastDataType(*dev_ctx, tensor, dtype);
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  } else if (platform::is_gpu_place(tensor.place())) {
+  } else if (tensor.place().GetType() == phi::AllocationType::GPU) {
     auto* dev_ctx = static_cast<phi::GPUContext*>(pool.Get(tensor.place()));
     return CastDataType(*dev_ctx, tensor, dtype);
 #endif
@@ -170,7 +170,7 @@ inline phi::DenseTensor TransDataPlace(const phi::DenseTensor& tensor,
   auto& pool = phi::DeviceContextPool::Instance();
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   // NOTE(yy): TransDataPlace should wait for computation of input.
-  if (!platform::is_cuda_pinned_place(tensor.place())) {
+  if (tensor.place().GetType() != phi::AllocationType::GPUPINNED) {
     pool.Get(tensor.place())->Wait();
     pool.Get(dst_place)->Wait();
   }
......
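The replacements above swap the platform::is_xxx_place helpers for direct comparisons against phi::AllocationType, which keeps phi free of fluid's place helpers. A minimal sketch of the pattern (the helper name IsDeviceTensorPlace is illustrative):

#include "paddle/phi/common/place.h"

bool IsDeviceTensorPlace(const phi::Place& place) {
  // The same checks that used to go through platform::is_gpu_place /
  // platform::is_cuda_pinned_place, expressed on phi::Place directly.
  return place.GetType() == phi::AllocationType::GPU ||
         place.GetType() == phi::AllocationType::GPUPINNED;
}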
@@ -112,7 +112,7 @@ DataType ParseDataType(const std::vector<Tensor>& tensors) {
   auto n = tensors.size();
   for (size_t i = 1; i < n; ++i) {
     if (tensors[i].type() != dtype) {
-      PADDLE_THROW(platform::errors::InvalidArgument(
+      PADDLE_THROW(phi::errors::InvalidArgument(
           "The data_type of input tensor in list isn't consistent, "
           "the first tensor is %s, but %dth tensor is %s.",
           dtype,
......
@@ -872,7 +872,7 @@ PADDLE_API {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_d
         sr_out_trans_map = {'Tensor': 'phi::SelectedRows*'}
         input_names = self.inputs['names']
         input_infos = self.inputs['input_info']
-        kernel_args_type_list = ['const platform::DeviceContext&']
+        kernel_args_type_list = ['const phi::DeviceContext&']
         attr_names = self.attrs['names']
         kernel_param = self.kernel['param']
......
@@ -127,7 +127,7 @@ class StringsAPI(ForwardAPI):
         }
         input_names = self.inputs['names']
         input_infos = self.inputs['input_info']
-        kernel_args_type_list = ['const platform::DeviceContext&']
+        kernel_args_type_list = ['const phi::DeviceContext&']
         attr_names = self.attrs['names']
         kernel_param = self.kernel['param']
......
@@ -3,6 +3,7 @@ add_subdirectory(gpu)
 set(BACKENDS_SRCS all_context.cc cpu/cpu_context.cc cpu/cpu_info.cc)
 set(BACKENDS_DEPS enforce place flags eigen3 phi_device_context)
+set(BACKENDS_DEPS allocator generator)
 if(WITH_XBYAK)
   list(APPEND BACKENDS_DEPS xbyak)
 endif()
@@ -45,7 +46,8 @@ list(
     stream.cc
     event.cc
     device_base.cc
-    device_manager.cc)
+    device_manager.cc
+    context_pool.cc)
 if(WITH_CUSTOM_DEVICE)
   list(APPEND BACKENDS_SRCS custom/custom_context.cc custom/custom_device.cc
@@ -54,7 +56,6 @@ endif()
 add_library(phi_backends "${BACKENDS_SRCS}")
 target_link_libraries(phi_backends ${BACKENDS_DEPS})
-add_dependencies(phi_backends eigen3)
 # for inference library
 get_property(phi_modules GLOBAL PROPERTY PHI_MODULES)
......
@@ -26,11 +26,4 @@ limitations under the License. */
 #include "paddle/phi/backends/onednn/onednn_context.h"
 #include "paddle/phi/backends/xpu/xpu_context.h"
-#ifndef PADDLE_WITH_CUSTOM_KERNEL
-// TODO(wilber): DeviceContextPool nees include fluid file.
-#include "paddle/fluid/platform/device_context.h"
-namespace phi {
-using DeviceContextPool = paddle::platform::DeviceContextPool;
-}  // namespace phi
-#endif
+namespace phi {}  // namespace phi
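Since all_context.h no longer re-exports the fluid pool, code that needs the pool now includes paddle/phi/backends/context_pool.h directly. A short sketch (GetCtx is an illustrative name):

#include "paddle/phi/backends/context_pool.h"

phi::DeviceContext* GetCtx(const phi::Place& place) {
  // Resolves through the phi-side pool introduced by this patch.
  return phi::DeviceContextPool::Instance().Get(place);
}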
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/context_pool_utils.h"
namespace phi {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
bool allow_tf32_cublas = true;
void SetAllowTF32Cublas(bool active) { allow_tf32_cublas = active; }
bool AllowTF32Cublas() { return allow_tf32_cublas; }
bool allow_tf32_cudnn = true;
void SetAllowTF32Cudnn(bool active) { allow_tf32_cudnn = active; }
bool AllowTF32Cudnn() { return allow_tf32_cudnn; }
#endif // PADDLE_WITH_CUDA
static DeviceContextPool* pool = nullptr;
DeviceContextPool& DeviceContextPool::Instance() {
PADDLE_ENFORCE_NOT_NULL(pool,
phi::errors::PreconditionNotMet(
"Need to Create DeviceContextPool firstly!"));
return *pool;
}
EmplaceExternalContextFunc DeviceContextPool::emplace_external_context_func_ =
nullptr;
/*! \brief Create should only called by Init function */
DeviceContextPool& DeviceContextPool::Init(
const std::vector<phi::Place>& places, EmplaceExternalContextFunc func) {
emplace_external_context_func_ = func;
if (pool == nullptr) {
pool = new DeviceContextPool(places);
}
return *pool;
}
bool DeviceContextPool::IsInitialized() { return pool != nullptr; }
void DeviceContextPool::SetPool(DeviceContextPool* dev_pool) {
pool = dev_pool;
}
thread_local const std::map<Place,
std::shared_future<std::unique_ptr<DeviceContext>>>*
DeviceContextPool::external_device_contexts_ = nullptr;
phi::DeviceContext* DeviceContextPool::Get(const phi::Place& place) {
VLOG(6) << "DeviceContextPool Get: " << place;
const std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
ptr;
if (external_device_contexts_ && external_device_contexts_->count(place)) {
ptr = external_device_contexts_;
} else {
ptr = &device_contexts_;
}
auto it = ptr->find(place);
if (it == ptr->end()) {
PADDLE_THROW(phi::errors::Unimplemented(
"Place %s is not supported. Please check that your paddle compiles "
"with WITH_GPU, WITH_XPU, WITH_IPU, WITH_MLU or WITH_ASCEND_CL option "
"or check "
"that your train process set the correct device id if you use "
"Executor.",
place));
}
return it->second.get().get();
}
size_t DeviceContextPool::Size() const {
if (external_device_contexts_) {
return external_device_contexts_->size();
}
return device_contexts_.size();
}
const std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>&
DeviceContextPool::device_contexts() const {
if (external_device_contexts_) {
return *external_device_contexts_;
}
return device_contexts_;
}
void DeviceContextPool::SetDeviceContexts(
const std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
dev_ctxs) {
external_device_contexts_ = dev_ctxs;
}
inline void EmplaceNativeContext(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const phi::Place& place,
bool disable_setting_default_stream_for_allocator,
int stream_priority) {
if (place.GetType() == phi::AllocationType::CPU) {
#ifdef PADDLE_WITH_MKLDNN
EmplaceDeviceContext<phi::OneDNNContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
EmplaceDeviceContext<phi::CPUContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#endif
} else if (place.GetType() == phi::AllocationType::GPU) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
EmplaceDeviceContext<phi::GPUContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
stream_priority);
#else
PADDLE_THROW(
phi::errors::Unimplemented("GPUPlace is not supported. Please "
"re-compile with WITH_GPU option."));
#endif
} else if (place.GetType() == phi::AllocationType::XPU) {
#ifdef PADDLE_WITH_XPU
EmplaceDeviceContext<phi::XPUContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(
phi::errors::Unimplemented("XPUPlace is not supported. Please "
"re-compile with WITH_XPU option."));
#endif
} else if (place.GetType() == phi::AllocationType::CUSTOM) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
EmplaceDeviceContext<phi::CustomContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(phi::errors::Unimplemented(
"CustomPlace is not supported. Please re-compile with "
"WITH_CUSTOM_DEVICE "
"option."));
#endif
}
}
void EmplaceDeviceContexts(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const std::vector<phi::Place>& places,
bool disable_setting_default_stream_for_allocator,
int stream_priority,
EmplaceExternalContextFunc emplace_external_context_func) {
PADDLE_ENFORCE_GT(
places.size(),
0,
phi::errors::InvalidArgument("The number of platform places should "
"be larger than 0. But received %d.",
places.size()));
std::set<Place> set;
for (auto& p : places) {
set.insert(p);
}
for (auto& p : set) {
EmplaceNativeContext(place_to_device_context,
p,
disable_setting_default_stream_for_allocator,
stream_priority);
if (emplace_external_context_func != nullptr) {
(*emplace_external_context_func)(
place_to_device_context,
p,
disable_setting_default_stream_for_allocator,
stream_priority);
}
}
}
DeviceContextPool::DeviceContextPool(const std::vector<phi::Place>& places) {
EmplaceDeviceContexts(&device_contexts_,
places,
/*disable_setting_default_stream_for_allocator=*/false,
/*stream_priority=*/0,
emplace_external_context_func_);
}
} // namespace phi
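EmplaceDeviceContexts above first emplaces phi's native contexts and then, if provided, calls the external hook for the same place. Below is a sketch of a hook matching phi::EmplaceExternalContextFunc; the name MyEmplaceExternalContext and the place check are illustrative, while fluid's real hook is platform::EmplaceExternalContext shown earlier in this diff.

#include "paddle/phi/backends/context_pool.h"

void MyEmplaceExternalContext(
    std::map<phi::Place,
             std::shared_future<std::unique_ptr<phi::DeviceContext>>>*
        place_to_device_context,
    const phi::Place& place,
    bool disable_setting_default_stream_for_allocator,
    int stream_priority) {
  if (place.GetType() == phi::AllocationType::GPUPINNED) {
    // Emplace a context type phi itself does not construct, e.g. via
    // phi::EmplaceDeviceContext<SomeExternalContext>(...).
  }
  (void)place_to_device_context;
  (void)disable_setting_default_stream_for_allocator;
  (void)stream_priority;
}

// Registered once at startup:
//   phi::DeviceContextPool::Init(places, MyEmplaceExternalContext);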
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <future> // NOLINT
#include <map>
#include <memory>
#include <mutex> // NOLINT
#include <set>
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/device_context.h"
#include "paddle/phi/core/macros.h"
namespace phi {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void SetAllowTF32Cublas(bool active);
/*Get the global variable allow_tf32_cublas value*/
bool AllowTF32Cublas();
extern bool allow_tf32_cudnn;
/*Set the value of the global variable allow_tf32_cudnn*/
void SetAllowTF32Cudnn(bool active);
/*Get the global variable allow_tf32_cudnn value*/
bool AllowTF32Cudnn();
#endif // PADDLE_WITH_CUDA
template <typename Place>
struct DefaultDeviceContextType;
template <>
struct DefaultDeviceContextType<phi::CPUPlace> {
using TYPE = phi::CPUContext;
};
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <>
struct DefaultDeviceContextType<phi::GPUPlace> {
using TYPE = phi::GPUContext;
};
#endif
#ifdef PADDLE_WITH_XPU
template <>
struct DefaultDeviceContextType<phi::XPUPlace> {
using TYPE = phi::XPUContext;
};
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
template <>
struct DefaultDeviceContextType<phi::CustomPlace> {
using TYPE = phi::CustomContext;
};
#else
template <>
struct DefaultDeviceContextType<phi::CustomPlace> {
using TYPE = DeviceContext;
};
#endif
using EmplaceExternalContextFunc = void (*)(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*,
const phi::Place&,
bool,
int);
void EmplaceDeviceContexts(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const std::vector<phi::Place>& places,
bool disable_setting_default_stream_for_allocator,
int stream_priority,
EmplaceExternalContextFunc emplace_external_context_func = nullptr);
/*! \brief device context pool singleton */
class DeviceContextPool {
public:
static DeviceContextPool& Instance();
/*! \brief Create should only called by Init function */
static DeviceContextPool& Init(const std::vector<phi::Place>& places,
EmplaceExternalContextFunc func = nullptr);
static bool IsInitialized();
static void SetPool(DeviceContextPool* dev_pool);
/*! \brief Return handle of single device context. */
phi::DeviceContext* Get(const phi::Place& place);
template <typename Place>
const typename DefaultDeviceContextType<Place>::TYPE* GetByPlace(
const Place& place) {
return reinterpret_cast<
const typename DefaultDeviceContextType<Place>::TYPE*>(Get(place));
}
size_t Size() const;
const std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>&
device_contexts() const;
static void SetDeviceContexts(
const std::map<Place,
std::shared_future<std::unique_ptr<DeviceContext>>>*);
private:
explicit DeviceContextPool(const std::vector<phi::Place>& places);
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>
device_contexts_;
static thread_local const std::
map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
external_device_contexts_; // not owned
static EmplaceExternalContextFunc emplace_external_context_func_;
DISABLE_COPY_AND_ASSIGN(DeviceContextPool);
};
} // namespace phi
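For orientation, a minimal usage sketch of the pool declared above (illustrative only, not part of the patch; the function name is hypothetical, and it assumes the pool was already initialized by the framework, e.g. via DeviceContextPool::Init):

#include "paddle/phi/backends/context_pool.h"

// Sketch: fetching device contexts from the phi-side pool.
void ExamplePoolUsage() {
  auto& pool = phi::DeviceContextPool::Instance();
  // Generic access returns the base phi::DeviceContext*.
  phi::DeviceContext* base_ctx = pool.Get(phi::CPUPlace());
  // Typed access maps the place to its default context type through
  // DefaultDeviceContextType (phi::CPUPlace -> phi::CPUContext here).
  const phi::CPUContext* cpu_ctx = pool.GetByPlace(phi::CPUPlace());
  (void)base_ctx;
  (void)cpu_ctx;
}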
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/phi/backends/all_context.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/phi/core/generator.h"
namespace phi {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <typename DevCtx>
typename std::enable_if<!std::is_same<DevCtx, phi::GPUContext>::value,
DevCtx*>::type
ConstructDevCtx(const phi::Place& p, /*unused*/ int stream_priority = 0) {
return new DevCtx(p);
}
template <typename DevCtx>
typename std::enable_if<std::is_same<DevCtx, phi::GPUContext>::value,
DevCtx*>::type
ConstructDevCtx(const phi::Place& p, int stream_priority) {
return new DevCtx(p, /*init=*/true, stream_priority);
}
#else
template <typename DevCtx>
DevCtx* ConstructDevCtx(const phi::Place& p,
/*unused*/ int stream_priority) {
return new DevCtx(p);
}
#endif
template <typename DevCtx>
inline std::unique_ptr<DeviceContext> CreateDeviceContext(
const phi::Place& p,
bool disable_setting_default_stream_for_allocator,
int stream_priority) {
using PtrType = std::unique_ptr<DeviceContext>;
DevCtx* dev_ctx = ConstructDevCtx<DevCtx>(p, stream_priority);
if (p.GetType() == phi::AllocationType::GPU) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto* cuda_ctx = dynamic_cast<phi::GPUContext*>(dev_ctx);
PADDLE_ENFORCE_NOT_NULL(
cuda_ctx,
phi::errors::InvalidArgument(
"Failed to dynamic_cast dev_ctx into phi::GPUContext."));
auto& instance = paddle::memory::allocation::AllocatorFacade::Instance();
if (!disable_setting_default_stream_for_allocator) {
instance.SetDefaultStream(GPUPlace(p.GetDeviceId()), cuda_ctx->stream());
}
dev_ctx->SetAllocator(instance.GetAllocator(p, cuda_ctx->stream()).get());
dev_ctx->SetPinnedAllocator(
instance.GetAllocator(phi::GPUPinnedPlace()).get());
cuda_ctx->PartialInitWithAllocator();
dev_ctx->SetGenerator(phi::DefaultCUDAGenerator(p.GetDeviceId()).get());
#endif
} else if (p.GetType() == phi::AllocationType::XPU) {
#if defined(PADDLE_WITH_XPU)
dev_ctx->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(p)
.get());
dev_ctx->SetGenerator(phi::DefaultXPUGenerator(p.GetDeviceId()).get());
#endif
} else {
dev_ctx->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(p)
.get());
dev_ctx->SetGenerator(phi::DefaultCPUGenerator().get());
}
dev_ctx->SetHostGenerator(phi::DefaultCPUGenerator().get());
dev_ctx->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(phi::CPUPlace())
.get());
dev_ctx->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(p)
.get());
dev_ctx->SetHostZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(phi::CPUPlace())
.get());
return PtrType(dev_ctx);
}
template <typename DevCtx>
inline void EmplaceDeviceContext(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const phi::Place& place,
bool disable_setting_default_stream_for_allocator,
int stream_priority) {
// lazy evaluation. i.e., only create device context at first `Get`
place_to_device_context->emplace(
place,
std::async(std::launch::deferred,
CreateDeviceContext<DevCtx>,
place,
disable_setting_default_stream_for_allocator,
stream_priority));
}
} // namespace phi
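A minimal sketch of how the helper above can register a lazily created CPU context (illustrative only, not part of the patch; the function name is hypothetical). Because the future is created with std::launch::deferred, the context is not constructed until the future is first waited on, i.e. on the first pool Get:

#include <future>
#include <map>
#include <memory>

void ExampleLazyEmplace() {
  std::map<phi::Place,
           std::shared_future<std::unique_ptr<phi::DeviceContext>>>
      ctxs;
  phi::CPUPlace cpu;
  phi::EmplaceDeviceContext<phi::CPUContext>(
      &ctxs,
      cpu,
      /*disable_setting_default_stream_for_allocator=*/false,
      /*stream_priority=*/0);
  // Nothing has been constructed yet; the first access triggers creation.
  phi::DeviceContext* ctx = ctxs.at(cpu).get().get();
  (void)ctx;
}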
...@@ -66,8 +66,10 @@ void CustomContext::SetStream(std::shared_ptr<phi::stream::Stream> stream) { ...@@ -66,8 +66,10 @@ void CustomContext::SetStream(std::shared_ptr<phi::stream::Stream> stream) {
void CustomContext::Wait() const { return impl_->Wait(); } void CustomContext::Wait() const { return impl_->Wait(); }
CustomContext::CustomContext(const CustomPlace& place) CustomContext::CustomContext(const CustomPlace& place)
: DeviceContext(), impl_(std::make_unique<Impl>(place)) {} : DeviceContext(), impl_(std::make_unique<Impl>(place)) {
impl_->Init();
}
CustomContext::~CustomContext() {} CustomContext::~CustomContext() { impl_->Init(); }
} // namespace phi } // namespace phi
...@@ -20,6 +20,10 @@ limitations under the License. */ ...@@ -20,6 +20,10 @@ limitations under the License. */
#include "paddle/phi/common/place.h" #include "paddle/phi/common/place.h"
#include "paddle/phi/core/device_context.h" #include "paddle/phi/core/device_context.h"
namespace Eigen {
struct DefaultDevice;
} // namespace Eigen
namespace phi { namespace phi {
class CustomContext : public DeviceContext, class CustomContext : public DeviceContext,
...@@ -42,6 +46,15 @@ class CustomContext : public DeviceContext, ...@@ -42,6 +46,15 @@ class CustomContext : public DeviceContext,
// Wait for all operations completion in the stream. // Wait for all operations completion in the stream.
void Wait() const override; void Wait() const override;
template <typename Callback>
void AddStreamCallback(Callback&& callback) const {
return GetStream()->AddCallback(callback);
}
void WaitStreamCallback() const { return GetStream()->WaitCallback(); }
Eigen::DefaultDevice* eigen_device() const { return nullptr; }
static const char* name() { return "CustomContext"; } static const char* name() { return "CustomContext"; }
public: public:
......
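A small sketch of the stream-callback helpers added above (illustrative, not part of the patch; the lambda body and function name are hypothetical):

void ExampleStreamCallback(const phi::CustomContext& ctx) {
  // Enqueue a host callback that runs after the work already submitted to
  // the custom-device stream, then block until all callbacks have finished.
  ctx.AddStreamCallback([]() { /* host-side bookkeeping would go here */ });
  ctx.WaitStreamCallback();
}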
...@@ -12,9 +12,9 @@ ...@@ -12,9 +12,9 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/profiler/trace_event_collector.h"
#include "paddle/phi/backends/callback_manager.h" #include "paddle/phi/backends/callback_manager.h"
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/custom/enforce_custom.h" #include "paddle/phi/backends/custom/enforce_custom.h"
#include "paddle/phi/backends/device_base.h" #include "paddle/phi/backends/device_base.h"
#include "paddle/phi/backends/device_guard.h" #include "paddle/phi/backends/device_guard.h"
...@@ -285,8 +285,7 @@ class CustomDevice : public DeviceInterface { ...@@ -285,8 +285,7 @@ class CustomDevice : public DeviceInterface {
PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
pimpl_->async_memory_copy_h2d(device, c_stream, dst, src, size)); pimpl_->async_memory_copy_h2d(device, c_stream, dst, src, size));
} else if (pimpl_->memory_copy_h2d) { } else if (pimpl_->memory_copy_h2d) {
paddle::platform::DeviceContextPool& pool = phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
paddle::platform::DeviceContextPool::Instance();
pool.Get(place)->Wait(); pool.Get(place)->Wait();
PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
pimpl_->memory_copy_h2d(device, dst, src, size)); pimpl_->memory_copy_h2d(device, dst, src, size));
...@@ -306,8 +305,7 @@ class CustomDevice : public DeviceInterface { ...@@ -306,8 +305,7 @@ class CustomDevice : public DeviceInterface {
PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
pimpl_->async_memory_copy_d2h(device, c_stream, dst, src, size)); pimpl_->async_memory_copy_d2h(device, c_stream, dst, src, size));
} else if (pimpl_->memory_copy_d2h) { } else if (pimpl_->memory_copy_d2h) {
paddle::platform::DeviceContextPool& pool = phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
paddle::platform::DeviceContextPool::Instance();
pool.Get(place)->Wait(); pool.Get(place)->Wait();
PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
pimpl_->memory_copy_d2h(device, dst, src, size)); pimpl_->memory_copy_d2h(device, dst, src, size));
...@@ -327,8 +325,7 @@ class CustomDevice : public DeviceInterface { ...@@ -327,8 +325,7 @@ class CustomDevice : public DeviceInterface {
PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
pimpl_->async_memory_copy_d2d(device, c_stream, dst, src, size)); pimpl_->async_memory_copy_d2d(device, c_stream, dst, src, size));
} else if (pimpl_->memory_copy_d2d) { } else if (pimpl_->memory_copy_d2d) {
paddle::platform::DeviceContextPool& pool = phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
paddle::platform::DeviceContextPool::Instance();
pool.Get(place)->Wait(); pool.Get(place)->Wait();
PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
pimpl_->memory_copy_d2d(device, dst, src, size)); pimpl_->memory_copy_d2d(device, dst, src, size));
...@@ -364,8 +361,7 @@ class CustomDevice : public DeviceInterface { ...@@ -364,8 +361,7 @@ class CustomDevice : public DeviceInterface {
MemoryCopyH2D(dst_dev_id, dst, tmp.get(), size); MemoryCopyH2D(dst_dev_id, dst, tmp.get(), size);
} else { } else {
auto src_place = CustomPlace(Type(), src_dev_id); auto src_place = CustomPlace(Type(), src_dev_id);
paddle::platform::DeviceContextPool& pool = phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
paddle::platform::DeviceContextPool::Instance();
pool.Get(src_place)->Wait(); pool.Get(src_place)->Wait();
PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
pimpl_->memory_copy_p2p(dst_device, src_device, dst, src, size)); pimpl_->memory_copy_p2p(dst_device, src_device, dst, src, size));
......
...@@ -18,7 +18,7 @@ ...@@ -18,7 +18,7 @@
#include <utility> #include <utility>
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "paddle/phi/backends/all_context.h" #include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/gpu/cuda/cuda_graph.h" #include "paddle/phi/backends/gpu/cuda/cuda_graph.h"
#include "paddle/phi/kernels/funcs/dropout_impl_util.h" #include "paddle/phi/kernels/funcs/dropout_impl_util.h"
#endif #endif
......
...@@ -18,7 +18,7 @@ ...@@ -18,7 +18,7 @@
#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/enforce.h"
#include "paddle/utils/flat_hash_map.h" #include "paddle/utils/flat_hash_map.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/core/expect.h" #include "paddle/phi/core/expect.h"
namespace phi { namespace phi {
...@@ -42,8 +42,7 @@ OneDNNContextThreadLocals::Body::~Body() { ...@@ -42,8 +42,7 @@ OneDNNContextThreadLocals::Body::~Body() {
auto cpu_place = phi::CPUPlace(); auto cpu_place = phi::CPUPlace();
// TODO(YuanRisheng): we need remove the dependency on fluid device context // TODO(YuanRisheng): we need remove the dependency on fluid device context
// here // here
paddle::platform::DeviceContextPool& pool = phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
paddle::platform::DeviceContextPool::Instance();
OneDNNContext* dev_ctx = static_cast<OneDNNContext*>(pool.Get(cpu_place)); OneDNNContext* dev_ctx = static_cast<OneDNNContext*>(pool.Get(cpu_place));
dev_ctx->ResetBlobMap(exec_ptr_); dev_ctx->ResetBlobMap(exec_ptr_);
} }
......
...@@ -44,14 +44,14 @@ ccl::CCLComm GetCCLComm(const Place& place, int global_gid) { ...@@ -44,14 +44,14 @@ ccl::CCLComm GetCCLComm(const Place& place, int global_gid) {
return nullptr; return nullptr;
} }
#endif #endif
if (paddle::platform::is_gpu_place(place)) { if (place.GetType() == phi::AllocationType::GPU) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
return static_cast<paddle::distributed::ProcessGroupNCCL*>(pg)->NCCLComm( return static_cast<paddle::distributed::ProcessGroupNCCL*>(pg)->NCCLComm(
place); place);
#else #else
return nullptr; return nullptr;
#endif #endif
} else if (paddle::platform::is_custom_place(place)) { } else if (place.GetType() == phi::AllocationType::CUSTOM) {
#if defined(PADDLE_WITH_CUSTOM_DEVICE) #if defined(PADDLE_WITH_CUSTOM_DEVICE)
return static_cast<paddle::distributed::ProcessGroupCustom*>(pg) return static_cast<paddle::distributed::ProcessGroupCustom*>(pg)
->CustomCCLComm(place); ->CustomCCLComm(place);
......
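The rewrite above is representative of a change applied throughout this PR: fluid's place predicates (paddle::platform::is_gpu_place and friends) are replaced with direct phi::AllocationType comparisons so phi code no longer depends on paddle::platform. A minimal sketch of the new idiom (the helper name is hypothetical):

bool IsOnGpu(const phi::Place& place) {
  // Before: paddle::platform::is_gpu_place(place)
  return place.GetType() == phi::AllocationType::GPU;
}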
...@@ -125,10 +125,14 @@ struct XPUContext::Impl { ...@@ -125,10 +125,14 @@ struct XPUContext::Impl {
xpu::BKCLContext_t bkcl_context_{nullptr}; xpu::BKCLContext_t bkcl_context_{nullptr};
}; };
XPUContext::XPUContext() : DeviceContext(), impl_(std::make_unique<Impl>()) {} XPUContext::XPUContext() : DeviceContext(), impl_(std::make_unique<Impl>()) {
impl_->Init();
}
XPUContext::XPUContext(const XPUPlace& place) XPUContext::XPUContext(const XPUPlace& place)
: DeviceContext(), impl_(std::make_unique<Impl>(place)) {} : DeviceContext(), impl_(std::make_unique<Impl>(place)) {
impl_->Init();
}
XPUContext::~XPUContext() = default; XPUContext::~XPUContext() = default;
......
...@@ -24,6 +24,10 @@ limitations under the License. */ ...@@ -24,6 +24,10 @@ limitations under the License. */
#include "paddle/phi/core/device_context.h" #include "paddle/phi/core/device_context.h"
#include "xpu/runtime.h" #include "xpu/runtime.h"
namespace Eigen {
struct DefaultDevice;
} // namespace Eigen
namespace xpu = baidu::xpu::api; namespace xpu = baidu::xpu::api;
namespace phi { namespace phi {
...@@ -65,6 +69,8 @@ class XPUContext : public DeviceContext, ...@@ -65,6 +69,8 @@ class XPUContext : public DeviceContext,
void SetL3Cache(int l3_size = 14155776); void SetL3Cache(int l3_size = 14155776);
Eigen::DefaultDevice* eigen_device() const { return nullptr; }
XPUStream stream() const; XPUStream stream() const;
static const char* name() { return "XPUContext"; } static const char* name() { return "XPUContext"; }
......
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#include "paddle/phi/common/int_array.h" #include "paddle/phi/common/int_array.h"
#include "paddle/phi/backends/all_context.h" #include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/place.h" #include "paddle/phi/common/place.h"
#include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/core/tensor_utils.h"
......
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#include "paddle/phi/common/scalar.h" #include "paddle/phi/common/scalar.h"
#include "paddle/phi/backends/all_context.h" #include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/place.h" #include "paddle/phi/common/place.h"
#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/enforce.h"
......
...@@ -118,7 +118,7 @@ cc_library( ...@@ -118,7 +118,7 @@ cc_library(
cc_library( cc_library(
mixed_vector mixed_vector
SRCS mixed_vector.cc SRCS mixed_vector.cc
DEPS device_context place memory) DEPS phi_backends place memory)
cc_library( cc_library(
generator generator
...@@ -135,24 +135,20 @@ if(WITH_GPU) ...@@ -135,24 +135,20 @@ if(WITH_GPU)
nv_library( nv_library(
phi_tensor_utils phi_tensor_utils
SRCS tensor_utils.cc SRCS tensor_utils.cc
DEPS phi_backends dense_tensor selected_rows memcpy device_context DEPS phi_backends dense_tensor selected_rows memcpy memory_utils)
memory_utils)
elseif(WITH_ROCM) elseif(WITH_ROCM)
hip_library( hip_library(
phi_tensor_utils phi_tensor_utils
SRCS tensor_utils.cc SRCS tensor_utils.cc
DEPS phi_backends dense_tensor selected_rows memcpy device_context DEPS phi_backends dense_tensor selected_rows memcpy memory_utils)
memory_utils)
elseif(WITH_XPU_KP) elseif(WITH_XPU_KP)
xpu_library( xpu_library(
phi_tensor_utils phi_tensor_utils
SRCS tensor_utils.cc SRCS tensor_utils.cc
DEPS phi_backends dense_tensor selected_rows memcpy device_context DEPS phi_backends dense_tensor selected_rows memcpy memory_utils)
memory_utils)
else() else()
cc_library( cc_library(
phi_tensor_utils phi_tensor_utils
SRCS tensor_utils.cc SRCS tensor_utils.cc
DEPS dense_tensor selected_rows memcpy device_context phi_backends DEPS dense_tensor selected_rows memcpy phi_backends memory_utils)
memory_utils)
endif() endif()
...@@ -468,6 +468,59 @@ struct EnforceNotMet : public std::exception { ...@@ -468,6 +468,59 @@ struct EnforceNotMet : public std::exception {
/** EXTENDED TOOL FUNCTIONS WITH CHECKING **/ /** EXTENDED TOOL FUNCTIONS WITH CHECKING **/
/*
* Summary: This macro is used to get Variable or internal type
* data (such as LoDTensor or SelectedRows) of the Input and
 * Output in op, generally used when calling scope.FindVar(Input/
 * Output("Name")) or ctx.Input<LoDTensor>().
 * First, this macro checks whether the obtained pointer is null,
 * and then returns the data if it is not null.
 *
 * Note: This macro is only suitable for specific scenarios and is
 * not intended to be widely used. If it cannot meet the
 * requirements, please use other PADDLE_ENFORCE** check macros.
*
* Parameters:
 * __PTR: pointer
* __ROLE: (string), Input or Output
* __NAME: (string), Input or Output name
* __OP_TYPE: (string), the op type
*
* Return: The data pointed to by the pointer.
*
* Examples:
* GET_DATA_SAFELY(ctx.Input<LoDTensor>("X"), "Input", "X", "Mul");
*/
#define GET_DATA_SAFELY(__PTR, __ROLE, __NAME, __OP_TYPE) \
(([&]() -> std::add_lvalue_reference<decltype(*(__PTR))>::type { \
auto* __ptr = (__PTR); \
if (UNLIKELY(nullptr == __ptr)) { \
auto __summary__ = phi::errors::NotFound( \
"Unable to get %s data of %s %s in operator %s. " \
"Possible reasons are:\n" \
" 1. The %s is not the %s of operator %s;\n" \
" 2. The %s has no corresponding variable passed in;\n" \
" 3. The %s corresponding variable is not initialized.", \
phi::demangle( \
typeid(std::add_lvalue_reference<decltype(*__ptr)>::type) \
.name()), \
__ROLE, \
__NAME, \
__OP_TYPE, \
__NAME, \
__ROLE, \
__OP_TYPE, \
__NAME, \
__NAME); \
auto __message__ = ::paddle::string::Sprintf( \
"%s\n [Hint: pointer " #__PTR " should not be null.]", \
__summary__.error_message()); \
__THROW_ERROR_INTERNAL__( \
phi::ErrorSummary(__summary__.code(), __message__)); \
} \
return *__ptr; \
})())
/* /*
* Summary: This PADDLE_GET(_**) series macros are used to call paddle::get * Summary: This PADDLE_GET(_**) series macros are used to call paddle::get
* safely. paddle::get is not a completely safe api, although it will not * safely. paddle::get is not a completely safe api, although it will not
......
...@@ -22,7 +22,7 @@ limitations under the License. */ ...@@ -22,7 +22,7 @@ limitations under the License. */
#include <vector> #include <vector>
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/phi/backends/all_context.h" #include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/common/memory_utils.h" #include "paddle/phi/common/memory_utils.h"
#include "paddle/utils/none.h" #include "paddle/utils/none.h"
#include "paddle/utils/optional.h" #include "paddle/utils/optional.h"
......
...@@ -14,14 +14,13 @@ limitations under the License. */ ...@@ -14,14 +14,13 @@ limitations under the License. */
#include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/data_type.h" #include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/memory_utils.h" #include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/fluid/platform/device_context.h"
namespace phi { namespace phi {
template <typename Context> template <typename Context>
......
...@@ -62,10 +62,10 @@ void IndexSelectInner(const Context& ctx, ...@@ -62,10 +62,10 @@ void IndexSelectInner(const Context& ctx,
auto index_size = index.dims()[0]; auto index_size = index.dims()[0];
DenseTensor index_cpu_copy; DenseTensor index_cpu_copy;
if (!paddle::platform::is_cpu_place(index.place())) { if (index.place().GetType() != phi::AllocationType::CPU) {
phi::Copy(ctx, index, phi::CPUPlace(), true, &index_cpu_copy); phi::Copy(ctx, index, phi::CPUPlace(), true, &index_cpu_copy);
} }
const IndexT* index_data = paddle::platform::is_cpu_place(index.place()) const IndexT* index_data = index.place().GetType() == phi::AllocationType::CPU
? index.data<IndexT>() ? index.data<IndexT>()
: index_cpu_copy.data<IndexT>(); : index_cpu_copy.data<IndexT>();
ctx.template Alloc<T>(output); ctx.template Alloc<T>(output);
......
...@@ -160,5 +160,5 @@ PD_REGISTER_KERNEL(overlap_add_grad, ...@@ -160,5 +160,5 @@ PD_REGISTER_KERNEL(overlap_add_grad,
int64_t, int64_t,
float, float,
double, double,
paddle::platform::complex<float>, phi::dtype::complex<float>,
paddle::platform::complex<double>) {} phi::dtype::complex<double>) {}
...@@ -146,5 +146,5 @@ PD_REGISTER_KERNEL(overlap_add, ...@@ -146,5 +146,5 @@ PD_REGISTER_KERNEL(overlap_add,
int64_t, int64_t,
float, float,
double, double,
paddle::platform::complex<float>, phi::dtype::complex<float>,
paddle::platform::complex<double>) {} phi::dtype::complex<double>) {}
...@@ -33,7 +33,7 @@ void PutAlongAxisGradKernel(const Context& dev_ctx, ...@@ -33,7 +33,7 @@ void PutAlongAxisGradKernel(const Context& dev_ctx,
DenseTensor* x_grad, DenseTensor* x_grad,
DenseTensor* value_grad) { DenseTensor* value_grad) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
paddle::platform::is_cpu_place(dev_ctx.GetPlace()), dev_ctx.GetPlace().GetType() == phi::AllocationType::CPU,
true, true,
errors::PreconditionNotMet("PutAlongAxisGradOpKernel only runs on CPU.")); errors::PreconditionNotMet("PutAlongAxisGradOpKernel only runs on CPU."));
......
...@@ -32,7 +32,7 @@ void PutAlongAxisKernel(const Context& dev_ctx, ...@@ -32,7 +32,7 @@ void PutAlongAxisKernel(const Context& dev_ctx,
const std::string& reduce, const std::string& reduce,
DenseTensor* out) { DenseTensor* out) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
paddle::platform::is_cpu_place(dev_ctx.GetPlace()), dev_ctx.GetPlace().GetType() == phi::AllocationType::CPU,
true, true,
errors::PreconditionNotMet("PutAlongAxisOpKernel only runs on CPU.")); errors::PreconditionNotMet("PutAlongAxisOpKernel only runs on CPU."));
......
...@@ -62,7 +62,7 @@ void RepeatInterleaveWithTensorIndexGradKernel( ...@@ -62,7 +62,7 @@ void RepeatInterleaveWithTensorIndexGradKernel(
paddle::framework::DataTypeToString( paddle::framework::DataTypeToString(
paddle::framework::proto::VarType::INT64))); paddle::framework::proto::VarType::INT64)));
paddle::platform::DeviceContextPool::Instance().Get(repeats_tensor.place()); phi::DeviceContextPool::Instance().Get(repeats_tensor.place());
if (index_type == paddle::framework::proto::VarType::INT32) { if (index_type == paddle::framework::proto::VarType::INT32) {
phi::funcs::RepeatsTensor2IndexTensor<Context, int>( phi::funcs::RepeatsTensor2IndexTensor<Context, int>(
ctx, repeats_tensor, &index); ctx, repeats_tensor, &index);
......
...@@ -30,7 +30,7 @@ void TakeAlongAxisGradKernel(const Context& dev_ctx, ...@@ -30,7 +30,7 @@ void TakeAlongAxisGradKernel(const Context& dev_ctx,
int axis, int axis,
DenseTensor* x_grad) { DenseTensor* x_grad) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
paddle::platform::is_cpu_place(dev_ctx.GetPlace()), dev_ctx.GetPlace().GetType() == phi::AllocationType::CPU,
true, true,
errors::PreconditionNotMet("This kernel only runs on CPU.")); errors::PreconditionNotMet("This kernel only runs on CPU."));
......
...@@ -29,7 +29,7 @@ void TakeAlongAxisKernel(const Context& dev_ctx, ...@@ -29,7 +29,7 @@ void TakeAlongAxisKernel(const Context& dev_ctx,
int axis, int axis,
DenseTensor* out) { DenseTensor* out) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
paddle::platform::is_cpu_place(dev_ctx.GetPlace()), dev_ctx.GetPlace().GetType() == phi::AllocationType::CPU,
true, true,
errors::PreconditionNotMet("This kernel only runs on CPU.")); errors::PreconditionNotMet("This kernel only runs on CPU."));
......
cc_library( cc_library(
blas blas
SRCS blas.cc SRCS blas.cc
DEPS cblas framework_proto device_context) DEPS cblas framework_proto phi_backends)
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" #include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/memory_utils.h" #include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h" #include "paddle/phi/common/place.h"
......
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/phi/backends/all_context.h" #include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/onednn/onednn_context.h" #include "paddle/phi/backends/onednn/onednn_context.h"
#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/layout.h" #include "paddle/phi/common/layout.h"
......
...@@ -15,7 +15,7 @@ limitations under the License. */ ...@@ -15,7 +15,7 @@ limitations under the License. */
#pragma once #pragma once
#include <type_traits> #include <type_traits>
#include "paddle/fluid/platform/device_context.h" #include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/kernels/funcs/detail/activation_functions.h" #include "paddle/phi/kernels/funcs/detail/activation_functions.h"
#include "paddle/phi/kernels/funcs/gru_compute.h" #include "paddle/phi/kernels/funcs/gru_compute.h"
......
...@@ -15,7 +15,7 @@ limitations under the License. */ ...@@ -15,7 +15,7 @@ limitations under the License. */
#pragma once #pragma once
#include <type_traits> #include <type_traits>
#include "paddle/fluid/platform/device_context.h" #include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/kernels/funcs/detail/activation_functions.h" #include "paddle/phi/kernels/funcs/detail/activation_functions.h"
#include "paddle/phi/kernels/funcs/lstm_compute.h" #include "paddle/phi/kernels/funcs/lstm_compute.h"
...@@ -218,7 +218,7 @@ __global__ void KeLstmBackward(Op op, ...@@ -218,7 +218,7 @@ __global__ void KeLstmBackward(Op op,
} }
template <class T, class Op> template <class T, class Op>
void gpu_lstm_forward(const paddle::platform::DeviceContext& context, void gpu_lstm_forward(const phi::DeviceContext& context,
Op op, Op op,
phi::funcs::LstmMetaValue<T> value, phi::funcs::LstmMetaValue<T> value,
int frame_size, int frame_size,
...@@ -269,7 +269,7 @@ void gpu_lstm_forward(const paddle::platform::DeviceContext& context, ...@@ -269,7 +269,7 @@ void gpu_lstm_forward(const paddle::platform::DeviceContext& context,
} }
template <class T, class Op> template <class T, class Op>
void gpu_lstm_backward(const paddle::platform::DeviceContext& context, void gpu_lstm_backward(const phi::DeviceContext& context,
Op op, Op op,
phi::funcs::LstmMetaValue<T> value, phi::funcs::LstmMetaValue<T> value,
phi::funcs::LstmMetaGrad<T> grad, phi::funcs::LstmMetaGrad<T> grad,
......
...@@ -35,8 +35,8 @@ inline std::vector<size_t> GetLodFromRoisNum(const Context& dev_ctx, ...@@ -35,8 +35,8 @@ inline std::vector<size_t> GetLodFromRoisNum(const Context& dev_ctx,
std::vector<size_t> rois_lod; std::vector<size_t> rois_lod;
auto* rois_num_data = rois_num->data<int>(); auto* rois_num_data = rois_num->data<int>();
DenseTensor cpu_tensor; DenseTensor cpu_tensor;
if (paddle::platform::is_gpu_place(rois_num->place()) || if (rois_num->place().GetType() == phi::AllocationType::GPU ||
paddle::platform::is_xpu_place(rois_num->place())) { rois_num->place().GetType() == phi::AllocationType::XPU) {
Copy<Context>(dev_ctx, *rois_num, phi::CPUPlace(), true, &cpu_tensor); Copy<Context>(dev_ctx, *rois_num, phi::CPUPlace(), true, &cpu_tensor);
rois_num_data = cpu_tensor.data<int>(); rois_num_data = cpu_tensor.data<int>();
} }
......
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/phi/backends/all_context.h" #include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/common/memory_utils.h" #include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
...@@ -986,7 +986,7 @@ static void ElemwiseGradBroadcast1CUDA(gpuStream_t stream, ...@@ -986,7 +986,7 @@ static void ElemwiseGradBroadcast1CUDA(gpuStream_t stream,
dim3 grid_size = dim3((w + BLOCK_X - 1) / BLOCK_X); dim3 grid_size = dim3((w + BLOCK_X - 1) / BLOCK_X);
auto gplace = phi::GPUPlace(phi::backends::gpu::GetCurrentDeviceId()); auto gplace = phi::GPUPlace(phi::backends::gpu::GetCurrentDeviceId());
auto *ctx = static_cast<GPUContext *>( auto *ctx = static_cast<GPUContext *>(
paddle::platform::DeviceContextPool::Instance().Get(gplace)); phi::DeviceContextPool::Instance().Get(gplace));
phi::backends::gpu::LimitGridDim(*ctx, &grid_size); phi::backends::gpu::LimitGridDim(*ctx, &grid_size);
FastElemwiseGradBroadcast1CUDAKernel<<<grid_size, block_size, 0, stream>>>( FastElemwiseGradBroadcast1CUDAKernel<<<grid_size, block_size, 0, stream>>>(
x, y, out, dout, h, w, is_xsize_larger, dx_op, dy_op, dx, dy); x, y, out, dout, h, w, is_xsize_larger, dx_op, dy_op, dx, dy);
...@@ -1010,8 +1010,8 @@ static void ElemwiseGradBroadcast2CUDA(gpuStream_t stream, ...@@ -1010,8 +1010,8 @@ static void ElemwiseGradBroadcast2CUDA(gpuStream_t stream,
int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, pre * post); int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, pre * post);
dim3 grid_size = dim3(n); dim3 grid_size = dim3(n);
auto gplace = phi::GPUPlace(phi::backends::gpu::GetCurrentDeviceId()); auto gplace = phi::GPUPlace(phi::backends::gpu::GetCurrentDeviceId());
auto *ctx = static_cast<GPUContext *>( auto *ctx =
paddle::platform::DeviceContextPool::Instance().Get(gplace)); static_cast<GPUContext *>(phi::DeviceContextPool::Instance().Get(gplace));
phi::backends::gpu::LimitGridDim(*ctx, &grid_size); phi::backends::gpu::LimitGridDim(*ctx, &grid_size);
ElemwiseGradBroadcast2CUDAKernel<<<grid_size, block_size, 0, stream>>>( ElemwiseGradBroadcast2CUDAKernel<<<grid_size, block_size, 0, stream>>>(
x, y, out, dout, pre, n, post, is_xsize_larger, dx_op, dy_op, dx, dy); x, y, out, dout, pre, n, post, is_xsize_larger, dx_op, dy_op, dx, dy);
......
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/fc_functor.h" #include "paddle/phi/kernels/funcs/fc_functor.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/phi/backends/all_context.h"
#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/jit/kernels.h" #include "paddle/phi/kernels/funcs/jit/kernels.h"
......
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include "paddle/fluid/platform/device_context.h" #include "paddle/phi/backends/all_context.h"
#include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/aligned_vector.h"
#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/fc_functor.h" #include "paddle/phi/kernels/funcs/fc_functor.h"
......
...@@ -16,7 +16,7 @@ limitations under the License. */ ...@@ -16,7 +16,7 @@ limitations under the License. */
#include <string> #include <string>
#include "paddle/fluid/platform/device_context.h" #include "paddle/phi/backends/all_context.h"
namespace phi { namespace phi {
namespace funcs { namespace funcs {
......
...@@ -9,12 +9,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -9,12 +9,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <paddle/fluid/platform/device_context.h> #include "paddle/phi/kernels/funcs/gru_compute.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h" #include "paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h"
#include "paddle/phi/kernels/funcs/detail/gru_kernel.h" #include "paddle/phi/kernels/funcs/detail/gru_kernel.h"
#include "paddle/phi/kernels/funcs/gru_compute.h"
namespace phi { namespace phi {
namespace funcs { namespace funcs {
......
...@@ -11,7 +11,7 @@ limitations under the License. */ ...@@ -11,7 +11,7 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/platform/device_context.h" #include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/enforce.h"
#include "paddle/phi/kernels/funcs/detail/activation_functions.h" #include "paddle/phi/kernels/funcs/detail/activation_functions.h"
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#pragma once #pragma once
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/common/layout.h" #include "paddle/phi/common/layout.h"
#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/ddim.h"
#include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/common.h"
...@@ -122,20 +123,20 @@ inline std::vector<T> get_new_data_from_tensor( ...@@ -122,20 +123,20 @@ inline std::vector<T> get_new_data_from_tensor(
DenseTensor cpu_starts_tensor; DenseTensor cpu_starts_tensor;
auto& pool = phi::DeviceContextPool::Instance(); auto& pool = phi::DeviceContextPool::Instance();
phi::DeviceContext* dev_ctx = pool.Get(new_data_tensor->place()); phi::DeviceContext* dev_ctx = pool.Get(new_data_tensor->place());
if (paddle::platform::is_gpu_place(new_data_tensor->place())) { if (new_data_tensor->place().GetType() == phi::AllocationType::GPU) {
phi::Copy( phi::Copy(
*dev_ctx, *new_data_tensor, phi::CPUPlace(), true, &cpu_starts_tensor); *dev_ctx, *new_data_tensor, phi::CPUPlace(), true, &cpu_starts_tensor);
new_data = cpu_starts_tensor.data<T>(); new_data = cpu_starts_tensor.data<T>();
} }
#ifdef PADDLE_WITH_ASCEND_CL #ifdef PADDLE_WITH_ASCEND_CL
if (paddle::platform::is_npu_place(new_data_tensor->place())) { if (new_data_tensor->place().GetType() == phi::AllocationType::NPU) {
phi::Copy( phi::Copy(
*dev_ctx, *new_data_tensor, phi::CPUPlace(), true, &cpu_starts_tensor); *dev_ctx, *new_data_tensor, phi::CPUPlace(), true, &cpu_starts_tensor);
new_data = cpu_starts_tensor.data<T>(); new_data = cpu_starts_tensor.data<T>();
} }
#endif #endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
if (paddle::platform::is_xpu_place(new_data_tensor->place())) { if (new_data_tensor->place().GetType() == phi::AllocationType::XPU) {
phi::Copy( phi::Copy(
*dev_ctx, *new_data_tensor, phi::CPUPlace(), true, &cpu_starts_tensor); *dev_ctx, *new_data_tensor, phi::CPUPlace(), true, &cpu_starts_tensor);
new_data = cpu_starts_tensor.data<T>(); new_data = cpu_starts_tensor.data<T>();
......
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/platform/device_context.h" #include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/enforce.h"
#include "paddle/phi/kernels/funcs/detail/activation_functions.h" #include "paddle/phi/kernels/funcs/detail/activation_functions.h"
......
...@@ -26,6 +26,7 @@ limitations under the License. */ ...@@ -26,6 +26,7 @@ limitations under the License. */
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/data_type.h" #include "paddle/phi/common/data_type.h"
...@@ -52,22 +53,6 @@ template struct SetConstant<phi::CPUContext, phi::dtype::complex<float>>; ...@@ -52,22 +53,6 @@ template struct SetConstant<phi::CPUContext, phi::dtype::complex<float>>;
template struct SetConstant<phi::CPUContext, phi::dtype::complex<double>>; template struct SetConstant<phi::CPUContext, phi::dtype::complex<double>>;
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
template struct SetConstant<paddle::platform::XPUDeviceContext,
phi::dtype::float16>;
template struct SetConstant<paddle::platform::XPUDeviceContext,
phi::dtype::bfloat16>;
template struct SetConstant<paddle::platform::XPUDeviceContext, float>;
template struct SetConstant<paddle::platform::XPUDeviceContext, double>;
template struct SetConstant<paddle::platform::XPUDeviceContext, uint8_t>;
template struct SetConstant<paddle::platform::XPUDeviceContext, int16_t>;
template struct SetConstant<paddle::platform::XPUDeviceContext, int>;
template struct SetConstant<paddle::platform::XPUDeviceContext, int64_t>;
template struct SetConstant<paddle::platform::XPUDeviceContext, bool>;
template struct SetConstant<paddle::platform::XPUDeviceContext,
phi::dtype::complex<float>>;
template struct SetConstant<paddle::platform::XPUDeviceContext,
phi::dtype::complex<double>>;
template struct SetConstant<phi::XPUContext, phi::dtype::float16>; template struct SetConstant<phi::XPUContext, phi::dtype::float16>;
template struct SetConstant<phi::XPUContext, phi::dtype::bfloat16>; template struct SetConstant<phi::XPUContext, phi::dtype::bfloat16>;
template struct SetConstant<phi::XPUContext, float>; template struct SetConstant<phi::XPUContext, float>;
...@@ -164,10 +149,9 @@ struct TensorSetConstantCPU { ...@@ -164,10 +149,9 @@ struct TensorSetConstantCPU {
}; };
template <> template <>
void set_constant_with_place<paddle::platform::XPUPlace>( void set_constant_with_place<phi::XPUPlace>(const phi::DeviceContext& context,
const paddle::platform::DeviceContext& context, phi::DenseTensor* tensor,
phi::DenseTensor* tensor, float value) {
float value) {
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
phi::VisitDataType( phi::VisitDataType(
tensor->dtype(), tensor->dtype(),
...@@ -178,64 +162,54 @@ void set_constant_with_place<paddle::platform::XPUPlace>( ...@@ -178,64 +162,54 @@ void set_constant_with_place<paddle::platform::XPUPlace>(
} }
template <> template <>
void set_constant_with_place<paddle::platform::NPUPlace>( void set_constant_with_place<phi::NPUPlace>(const phi::DeviceContext& context,
const paddle::platform::DeviceContext& context, phi::DenseTensor* tensor,
phi::DenseTensor* tensor, float value) {
float value) {
PADDLE_THROW(phi::errors::Unimplemented("NPUPlace is not supported")); PADDLE_THROW(phi::errors::Unimplemented("NPUPlace is not supported"));
} }
template <> template <>
void set_constant_with_place<paddle::platform::NPUPinnedPlace>( void set_constant_with_place<phi::NPUPinnedPlace>(
const paddle::platform::DeviceContext& context, const phi::DeviceContext& context, phi::DenseTensor* tensor, float value) {
phi::DenseTensor* tensor,
float value) {
PADDLE_THROW(phi::errors::Unimplemented("NPUPinnedPlace is not supported")); PADDLE_THROW(phi::errors::Unimplemented("NPUPinnedPlace is not supported"));
} }
template <> template <>
void set_constant_with_place<paddle::platform::IPUPlace>( void set_constant_with_place<phi::IPUPlace>(const phi::DeviceContext& context,
const paddle::platform::DeviceContext& context, phi::DenseTensor* tensor,
phi::DenseTensor* tensor, float value) {
float value) {
PADDLE_THROW(phi::errors::Unimplemented("IPUPlace is not supported")); PADDLE_THROW(phi::errors::Unimplemented("IPUPlace is not supported"));
} }
template <> template <>
void set_constant_with_place<paddle::platform::CustomPlace>( void set_constant_with_place<phi::CustomPlace>(
const paddle::platform::DeviceContext& context, const phi::DeviceContext& context, phi::DenseTensor* tensor, float value) {
phi::DenseTensor* tensor,
float value) {
PADDLE_THROW(phi::errors::Unimplemented("CustomPlace is not supported")); PADDLE_THROW(phi::errors::Unimplemented("CustomPlace is not supported"));
} }
template <> template <>
void set_constant_with_place<phi::CPUPlace>( void set_constant_with_place<phi::CPUPlace>(const phi::DeviceContext& context,
const paddle::platform::DeviceContext& context, phi::DenseTensor* tensor,
phi::DenseTensor* tensor, float value) {
float value) {
phi::VisitDataType(tensor->dtype(), TensorSetConstantCPU(tensor, value)); phi::VisitDataType(tensor->dtype(), TensorSetConstantCPU(tensor, value));
} }
template <> template <>
void set_constant_with_place<paddle::platform::MLUPlace>( void set_constant_with_place<phi::MLUPlace>(const phi::DeviceContext& context,
const paddle::platform::DeviceContext& context, phi::DenseTensor* tensor,
phi::DenseTensor* tensor, float value) {
float value) {
PADDLE_THROW(phi::errors::Unimplemented("MLUPlace is not supported")); PADDLE_THROW(phi::errors::Unimplemented("MLUPlace is not supported"));
} }
template <> template <>
void set_constant_with_place<paddle::platform::CUDAPinnedPlace>( void set_constant_with_place<phi::GPUPinnedPlace>(
const paddle::platform::DeviceContext& context, const phi::DeviceContext& context, phi::DenseTensor* tensor, float value) {
phi::DenseTensor* tensor,
float value) {
phi::VisitDataType(tensor->dtype(), TensorSetConstantCPU(tensor, value)); phi::VisitDataType(tensor->dtype(), TensorSetConstantCPU(tensor, value));
} }
struct TensorSetConstantWithPlace struct TensorSetConstantWithPlace
: public std::unary_function<paddle::platform::Place, void> { : public std::unary_function<phi::Place, void> {
TensorSetConstantWithPlace(const paddle::platform::DeviceContext& context, TensorSetConstantWithPlace(const phi::DeviceContext& context,
phi::DenseTensor* tensor, phi::DenseTensor* tensor,
float value) float value)
: context_(context), tensor_(tensor), value_(value) {} : context_(context), tensor_(tensor), value_(value) {}
...@@ -245,17 +219,17 @@ struct TensorSetConstantWithPlace ...@@ -245,17 +219,17 @@ struct TensorSetConstantWithPlace
set_constant_with_place<Place>(context_, tensor_, value_); set_constant_with_place<Place>(context_, tensor_, value_);
} }
const paddle::platform::DeviceContext& context_; const phi::DeviceContext& context_;
phi::DenseTensor* tensor_; phi::DenseTensor* tensor_;
float value_; float value_;
}; };
void set_constant(const paddle::platform::DeviceContext& context, void set_constant(const phi::DeviceContext& context,
phi::DenseTensor* tensor, phi::DenseTensor* tensor,
float value) { float value) {
TensorSetConstantWithPlace func(context, tensor, value); TensorSetConstantWithPlace func(context, tensor, value);
#ifdef PADDLE_WITH_CUSTOM_DEVICE #ifdef PADDLE_WITH_CUSTOM_DEVICE
if (paddle::platform::is_custom_place(context.GetPlace())) { if (context.GetPlace().GetType() == phi::AllocationType::CUSTOM) {
func(phi::CPUPlace()); func(phi::CPUPlace());
return; return;
} }
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include <vector> #include <vector>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/data_type.h" #include "paddle/phi/common/data_type.h"
...@@ -187,8 +188,8 @@ void TransposeNormal<DeviceContext, T>::operator()( ...@@ -187,8 +188,8 @@ void TransposeNormal<DeviceContext, T>::operator()(
auto* out_ptr = out->data<T>(); auto* out_ptr = out->data<T>();
// copy in_stride, out_stride, axis to gpu device // copy in_stride, out_stride, axis to gpu device
const paddle::platform::CUDAPlace& cuda_place = context.GetPlace(); const phi::GPUPlace& cuda_place = context.GetPlace();
paddle::platform::CPUPlace cpu_place = paddle::platform::CPUPlace(); phi::CPUPlace cpu_place = phi::CPUPlace();
size_t size = 3 * rank * sizeof(int64_t); size_t size = 3 * rank * sizeof(int64_t);
auto cpu_buf_holder = phi::memory_utils::Alloc(cpu_place, size); auto cpu_buf_holder = phi::memory_utils::Alloc(cpu_place, size);
auto cuda_buf_holder = phi::memory_utils::Alloc(cuda_place, size); auto cuda_buf_holder = phi::memory_utils::Alloc(cuda_place, size);
...@@ -231,7 +232,7 @@ struct TransposeNormal<phi::GPUContext, T> { ...@@ -231,7 +232,7 @@ struct TransposeNormal<phi::GPUContext, T> {
// copy in_stride, out_stride, axis to gpu device // copy in_stride, out_stride, axis to gpu device
const phi::GPUPlace& cuda_place = context.GetPlace(); const phi::GPUPlace& cuda_place = context.GetPlace();
phi::CPUPlace cpu_place = paddle::platform::CPUPlace(); phi::CPUPlace cpu_place = phi::CPUPlace();
size_t size = 3 * rank * sizeof(int64_t); size_t size = 3 * rank * sizeof(int64_t);
auto cpu_buf_holder = phi::memory_utils::Alloc(cpu_place, size); auto cpu_buf_holder = phi::memory_utils::Alloc(cpu_place, size);
auto cuda_buf_holder = phi::memory_utils::Alloc(cuda_place, size); auto cuda_buf_holder = phi::memory_utils::Alloc(cuda_place, size);
...@@ -286,7 +287,7 @@ DEFINE_GPU_TRANS_NORMAL(phi::dtype::complex<float>); ...@@ -286,7 +287,7 @@ DEFINE_GPU_TRANS_NORMAL(phi::dtype::complex<float>);
DEFINE_GPU_TRANS_NORMAL(phi::dtype::complex<double>); DEFINE_GPU_TRANS_NORMAL(phi::dtype::complex<double>);
struct TensorSetConstantGPU { struct TensorSetConstantGPU {
TensorSetConstantGPU(const paddle::platform::DeviceContext& context, TensorSetConstantGPU(const phi::DeviceContext& context,
phi::DenseTensor* tensor, phi::DenseTensor* tensor,
float value) float value)
: context_(context), tensor_(tensor), value_(value) {} : context_(context), tensor_(tensor), value_(value) {}
...@@ -299,16 +300,15 @@ struct TensorSetConstantGPU { ...@@ -299,16 +300,15 @@ struct TensorSetConstantGPU {
static_cast<T>(value_)); static_cast<T>(value_));
} }
const paddle::platform::DeviceContext& context_; const phi::DeviceContext& context_;
phi::DenseTensor* tensor_; phi::DenseTensor* tensor_;
float value_; float value_;
}; };
template <> template <>
void set_constant_with_place<paddle::platform::CUDAPlace>( void set_constant_with_place<phi::GPUPlace>(const phi::DeviceContext& context,
const paddle::platform::DeviceContext& context, phi::DenseTensor* tensor,
phi::DenseTensor* tensor, float value) {
float value) {
phi::VisitDataType(tensor->dtype(), phi::VisitDataType(tensor->dtype(),
TensorSetConstantGPU(context, tensor, value)); TensorSetConstantGPU(context, tensor, value));
} }
......
...@@ -18,7 +18,7 @@ limitations under the License. */ ...@@ -18,7 +18,7 @@ limitations under the License. */
#include <vector> #include <vector>
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/core/utils/data_type.h"
...@@ -56,24 +56,19 @@ struct SetConstant { ...@@ -56,24 +56,19 @@ struct SetConstant {
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
template <typename T> template <typename T>
struct SetConstant<XPUContext, T> { struct SetConstant<phi::XPUContext, T> {
void operator()(const XPUContext& context, phi::DenseTensor* tensor, T num); void operator()(const phi::XPUContext& context,
};
template <typename T>
struct SetConstant<paddle::platform::XPUDeviceContext, T> {
void operator()(const paddle::platform::XPUDeviceContext& context,
phi::DenseTensor* tensor, phi::DenseTensor* tensor,
T num); T num);
}; };
#endif #endif
template <typename Place> template <typename Place>
void set_constant_with_place(const paddle::platform::DeviceContext& context, void set_constant_with_place(const phi::DeviceContext& context,
phi::DenseTensor* tensor, phi::DenseTensor* tensor,
float value); float value);
void set_constant(const paddle::platform::DeviceContext& context, void set_constant(const phi::DeviceContext& context,
phi::DenseTensor* tensor, phi::DenseTensor* tensor,
float value); float value);
...@@ -109,9 +104,7 @@ struct RowwiseMean { ...@@ -109,9 +104,7 @@ struct RowwiseMean {
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
template <typename U> template <typename U>
struct TensorSetConstantXPU { struct TensorSetConstantXPU {
TensorSetConstantXPU(phi::DenseTensor* tensor, TensorSetConstantXPU(phi::DenseTensor* tensor, U value, phi::Place place)
U value,
paddle::platform::Place place)
: tensor_(tensor), value_(value), place_(place) {} : tensor_(tensor), value_(value), place_(place) {}
template <typename T> template <typename T>
void apply() const { void apply() const {
...@@ -127,7 +120,7 @@ struct TensorSetConstantXPU { ...@@ -127,7 +120,7 @@ struct TensorSetConstantXPU {
} }
phi::DenseTensor* tensor_; phi::DenseTensor* tensor_;
U value_; U value_;
paddle::platform::Place place_; phi::Place place_;
}; };
#endif #endif
......
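A brief usage sketch of the functors declared above after the switch to phi types (illustrative only, not part of the patch; assumes dev_ctx is a live phi::CPUContext and out is an already-allocated DenseTensor):

void FillWithZero(const phi::CPUContext& dev_ctx, phi::DenseTensor* out) {
  // Context-typed path: the functor is specialized per device context.
  phi::funcs::SetConstant<phi::CPUContext, float> set_zero;
  set_zero(dev_ctx, out, 0.0f);
  // Generic path: set_constant dispatches on the place of the context.
  phi::funcs::set_constant(dev_ctx, out, 0.0f);
}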
...@@ -34,17 +34,9 @@ void SetConstant<DeviceContext, T>::operator()(const DeviceContext& context, ...@@ -34,17 +34,9 @@ void SetConstant<DeviceContext, T>::operator()(const DeviceContext& context,
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
template <typename T> template <typename T>
void SetConstant<XPUContext, T>::operator()(const XPUContext& context, void SetConstant<phi::XPUContext, T>::operator()(const phi::XPUContext& context,
phi::DenseTensor* tensor, phi::DenseTensor* tensor,
T num) { T num) {
phi::VisitDataType(tensor->dtype(),
TensorSetConstantXPU<T>(tensor, num, context.GetPlace()));
}
template <typename T>
void SetConstant<paddle::platform::XPUDeviceContext, T>::operator()(
const paddle::platform::XPUDeviceContext& context,
phi::DenseTensor* tensor,
T num) {
phi::VisitDataType(tensor->dtype(), phi::VisitDataType(tensor->dtype(),
TensorSetConstantXPU<T>(tensor, num, context.GetPlace())); TensorSetConstantXPU<T>(tensor, num, context.GetPlace()));
} }
...@@ -65,7 +57,7 @@ void Transpose<DeviceContext, T, Rank>::operator()( ...@@ -65,7 +57,7 @@ void Transpose<DeviceContext, T, Rank>::operator()(
auto* dev = context.eigen_device(); auto* dev = context.eigen_device();
// use 32bit index to speed up computation // use 32bit index to speed up computation
bool use_32bit_index = eigen_out.size() < Eigen::NumTraits<int>::highest(); bool use_32bit_index = eigen_out.size() < Eigen::NumTraits<int>::highest();
bool is_gpu_place = paddle::platform::is_gpu_place(context.GetPlace()); bool is_gpu_place = context.GetPlace().GetType() == phi::AllocationType::GPU;
if (use_32bit_index && is_gpu_place) { if (use_32bit_index && is_gpu_place) {
To32BitIndex(eigen_out).device(*dev) = To32BitIndex(eigen_out).device(*dev) =
To32BitIndex(eigen_in).shuffle(permute); To32BitIndex(eigen_in).shuffle(permute);
......
...@@ -415,7 +415,7 @@ struct ReduceConfig { ...@@ -415,7 +415,7 @@ struct ReduceConfig {
#ifdef PADDLE_WITH_XPU_KP #ifdef PADDLE_WITH_XPU_KP
bool not_higher = x_dim[0] > 1; bool not_higher = x_dim[0] > 1;
#else #else
int device_id = paddle::platform::GetCurrentDeviceId(); int device_id = phi::backends::gpu::GetCurrentDeviceId();
int max_grid_z = phi::backends::gpu::GetGpuMaxGridDimSize(device_id)[2]; int max_grid_z = phi::backends::gpu::GetGpuMaxGridDimSize(device_id)[2];
bool not_higher = x_dim[0] >= max_grid_z; bool not_higher = x_dim[0] >= max_grid_z;
#endif // PADDLE_WITH_XPU_KP #endif // PADDLE_WITH_XPU_KP
...@@ -467,10 +467,10 @@ struct ReduceConfig { ...@@ -467,10 +467,10 @@ struct ReduceConfig {
grid_num = details::CeilingDiv(left_num, block_dim->x); grid_num = details::CeilingDiv(left_num, block_dim->x);
reduce_num_per_thread = details::CeilingDiv(reduce_num, block_dim->y); reduce_num_per_thread = details::CeilingDiv(reduce_num, block_dim->y);
} }
int device_id = paddle::platform::GetCurrentDeviceId(); int device_id = phi::backends::gpu::GetCurrentDeviceId();
int max_mp = paddle::platform::GetGPUMultiProcessors(device_id); int max_mp = phi::backends::gpu::GetGPUMultiProcessors(device_id);
int max_threads_per_mp = int max_threads_per_mp =
paddle::platform::GetGPUMaxThreadsPerMultiProcessor(device_id); phi::backends::gpu::GetGPUMaxThreadsPerMultiProcessor(device_id);
int max_threads = max_threads_per_mp * max_mp; int max_threads = max_threads_per_mp * max_mp;
int num_threads = block_dim->x * block_dim->y; int num_threads = block_dim->x * block_dim->y;
int max_num_blocks = max_threads / num_threads; int max_num_blocks = max_threads / num_threads;
...@@ -509,10 +509,10 @@ struct ReduceConfig { ...@@ -509,10 +509,10 @@ struct ReduceConfig {
int grid_z = left_num / last_dim_num; int grid_z = left_num / last_dim_num;
left_num = last_dim_num; left_num = last_dim_num;
grid_dim->z = grid_z; grid_dim->z = grid_z;
int device_id = paddle::platform::GetCurrentDeviceId(); int device_id = phi::backends::gpu::GetCurrentDeviceId();
int max_mp = paddle::platform::GetGPUMultiProcessors(device_id); int max_mp = phi::backends::gpu::GetGPUMultiProcessors(device_id);
int max_threads_per_mp = int max_threads_per_mp =
paddle::platform::GetGPUMaxThreadsPerMultiProcessor(device_id); phi::backends::gpu::GetGPUMaxThreadsPerMultiProcessor(device_id);
int max_threads = max_threads_per_mp * max_mp; int max_threads = max_threads_per_mp * max_mp;
// init // init
int num_block = (max_threads / left_num); int num_block = (max_threads / left_num);
......
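For reference, a small sketch of the phi-side GPU queries now used above in place of the paddle::platform equivalents (GPU builds only; illustrative, function name hypothetical):

int MaxResidentGpuThreads() {
  int dev_id = phi::backends::gpu::GetCurrentDeviceId();
  return phi::backends::gpu::GetGPUMultiProcessors(dev_id) *
         phi::backends::gpu::GetGPUMaxThreadsPerMultiProcessor(dev_id);
}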
...@@ -22,13 +22,13 @@ void RepeatsTensor2IndexTensor(const Context& ctx, ...@@ -22,13 +22,13 @@ void RepeatsTensor2IndexTensor(const Context& ctx,
const DenseTensor& repeats, const DenseTensor& repeats,
DenseTensor* index) { DenseTensor* index) {
DenseTensor repeats_cpu_copy; DenseTensor repeats_cpu_copy;
if (!paddle::platform::is_cpu_place(repeats.place())) { if (repeats.place().GetType() != phi::AllocationType::CPU) {
phi::Copy( phi::Copy(ctx, repeats, phi::CPUPlace(), true, &repeats_cpu_copy);
ctx, repeats, paddle::platform::CPUPlace(), true, &repeats_cpu_copy);
} }
const RepeatsT* repeats_data = paddle::platform::is_cpu_place(repeats.place()) const RepeatsT* repeats_data =
? repeats.data<RepeatsT>() repeats.place().GetType() == phi::AllocationType::CPU
: repeats_cpu_copy.data<RepeatsT>(); ? repeats.data<RepeatsT>()
: repeats_cpu_copy.data<RepeatsT>();
int64_t index_size = 0; int64_t index_size = 0;
for (int i = 0; i < repeats.dims()[0]; i++) { for (int i = 0; i < repeats.dims()[0]; i++) {
......
...@@ -389,8 +389,8 @@ void SelectKernel(const KPDevice &dev_ctx, ...@@ -389,8 +389,8 @@ void SelectKernel(const KPDevice &dev_ctx,
using CT = int64_t; // set Count_data Type using CT = int64_t; // set Count_data Type
const int t_size = sizeof(CT); const int t_size = sizeof(CT);
const paddle::platform::CUDAPlace &cuda_place = dev_ctx.GetPlace(); const phi::GPUPlace &cuda_place = dev_ctx.GetPlace();
paddle::platform::CPUPlace cpu_place = paddle::platform::CPUPlace(); phi::CPUPlace cpu_place = phi::CPUPlace();
// 1.1 get stored data num of per block // 1.1 get stored data num of per block
const int kVecSize = 4; const int kVecSize = 4;
......
...@@ -76,17 +76,17 @@ struct SelectedRowsAdd<phi::GPUContext, T> { ...@@ -76,17 +76,17 @@ struct SelectedRowsAdd<phi::GPUContext, T> {
auto* in1_data = in1_value.data<T>(); auto* in1_data = in1_value.data<T>();
auto in1_place = input1.place(); auto in1_place = input1.place();
PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(in1_place), PADDLE_ENFORCE_EQ(in1_place.GetType() == phi::AllocationType::GPU,
true, true,
phi::errors::InvalidArgument( phi::errors::InvalidArgument(
"The running environment is not on the GPU place.")); "The running environment is not on the GPU place."));
auto in2_place = input2.place(); auto in2_place = input2.place();
PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(in2_place), PADDLE_ENFORCE_EQ(in2_place.GetType() == phi::AllocationType::GPU,
true, true,
phi::errors::InvalidArgument( phi::errors::InvalidArgument(
"The running environment is not on the GPU place.")); "The running environment is not on the GPU place."));
auto out_place = context.GetPlace(); auto out_place = context.GetPlace();
PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(out_place), PADDLE_ENFORCE_EQ(out_place.GetType() == phi::AllocationType::GPU,
true, true,
phi::errors::InvalidArgument( phi::errors::InvalidArgument(
"The running environment is not on the GPU place.")); "The running environment is not on the GPU place."));
...@@ -237,12 +237,12 @@ struct SelectedRowsAddTo<phi::GPUContext, T> { ...@@ -237,12 +237,12 @@ struct SelectedRowsAddTo<phi::GPUContext, T> {
} }
auto in1_place = input1.place(); auto in1_place = input1.place();
PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(in1_place), PADDLE_ENFORCE_EQ(in1_place.GetType() == phi::AllocationType::GPU,
true, true,
phi::errors::InvalidArgument( phi::errors::InvalidArgument(
"The running environment is not on the GPU place.")); "The running environment is not on the GPU place."));
auto in2_place = input2->place(); auto in2_place = input2->place();
PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(in1_place), PADDLE_ENFORCE_EQ(in1_place.GetType() == phi::AllocationType::GPU,
true, true,
phi::errors::InvalidArgument( phi::errors::InvalidArgument(
"The running environment is not on the GPU place.")); "The running environment is not on the GPU place."));
......
...@@ -16,7 +16,7 @@ limitations under the License. */ ...@@ -16,7 +16,7 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include <vector> #include <vector>
#include "paddle/fluid/platform/device_context.h" #include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/mixed_vector.h" #include "paddle/phi/core/mixed_vector.h"
#include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/common.h"
......
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
#include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/backends/xpu/xpu_context.h"
#endif #endif
namespace phi { namespace phi {
......
...@@ -17,7 +17,6 @@ limitations under the License. */ ...@@ -17,7 +17,6 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include <vector> #include <vector>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/lod_utils.h" #include "paddle/phi/core/lod_utils.h"
#include "paddle/phi/core/mixed_vector.h" #include "paddle/phi/core/mixed_vector.h"
......
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/platform/device_context.h" #include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/lod_utils.h" #include "paddle/phi/core/lod_utils.h"
......
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
#include <sstream> #include <sstream>
#include "paddle/fluid/platform/device_context.h" #include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/memory_utils.h" #include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
...@@ -33,7 +33,7 @@ static const std::vector<T> &ToVector(const std::vector<T> &vec) { ...@@ -33,7 +33,7 @@ static const std::vector<T> &ToVector(const std::vector<T> &vec) {
template <typename T> template <typename T>
static std::vector<T> ToVector(const T *x, size_t n, const phi::Place &place) { static std::vector<T> ToVector(const T *x, size_t n, const phi::Place &place) {
#ifdef __NVCC__ #ifdef __NVCC__
if (paddle::platform::is_gpu_place(place)) { if (place.GetType() == phi::AllocationType::GPU) {
using CopyT = typename std:: using CopyT = typename std::
conditional<std::is_same<T, bool>::value, uint8_t, T>::type; conditional<std::is_same<T, bool>::value, uint8_t, T>::type;
std::vector<CopyT> cpu_x(n); std::vector<CopyT> cpu_x(n);
......
...@@ -64,7 +64,7 @@ void Conv2dFusionKernel(const Context& ctx, ...@@ -64,7 +64,7 @@ void Conv2dFusionKernel(const Context& ctx,
pad_w0 = paddings[2]; pad_w0 = paddings[2];
pad_w1 = paddings[3]; pad_w1 = paddings[3];
} else { } else {
PADDLE_THROW(paddle::platform::errors::InvalidArgument( PADDLE_THROW(phi::errors::InvalidArgument(
"Attr paddins in conv2d_fusion must have 2 or 4 elements, but now have " "Attr paddins in conv2d_fusion must have 2 or 4 elements, but now have "
"%u elements.", "%u elements.",
paddings.size())); paddings.size()));
...@@ -111,7 +111,7 @@ void Conv2dFusionKernel(const Context& ctx, ...@@ -111,7 +111,7 @@ void Conv2dFusionKernel(const Context& ctx,
params.residual = reinterpret_cast<const half*>(residual->data<T>()); params.residual = reinterpret_cast<const half*>(residual->data<T>());
Conv2dBiasAddRelu(params); Conv2dBiasAddRelu(params);
} else { } else {
PADDLE_THROW(paddle::platform::errors::InvalidArgument( PADDLE_THROW(phi::errors::InvalidArgument(
"Cutlass now only support relu activation in a residual block")); "Cutlass now only support relu activation in a residual block"));
} }
} else if (activation == "relu") { } else if (activation == "relu") {
......
...@@ -319,7 +319,7 @@ void InitMoeRoutingKernelLauncher( ...@@ -319,7 +319,7 @@ void InitMoeRoutingKernelLauncher(
ec_route); ec_route);
} }
} else { } else {
PADDLE_THROW(paddle::platform::errors::InvalidArgument( PADDLE_THROW(phi::errors::InvalidArgument(
"Currently only support `ec_route = True`. ")); "Currently only support `ec_route = True`. "));
} }
} }
...@@ -401,7 +401,7 @@ void GenericMoeGemmKernelLauncher(const T* A, ...@@ -401,7 +401,7 @@ void GenericMoeGemmKernelLauncher(const T* A,
int occupancy = GemmGrouped::maximum_active_blocks(); int occupancy = GemmGrouped::maximum_active_blocks();
const int threadblock_count = multi_processor_count * occupancy; const int threadblock_count = multi_processor_count * occupancy;
if (occupancy == 0) { if (occupancy == 0) {
PADDLE_THROW(paddle::platform::errors::Fatal( PADDLE_THROW(phi::errors::Fatal(
"[MoE Runner] GPU lacks the shared memory resources to run GroupedGEMM " "[MoE Runner] GPU lacks the shared memory resources to run GroupedGEMM "
"kernel")); "kernel"));
} }
...@@ -425,21 +425,21 @@ void GenericMoeGemmKernelLauncher(const T* A, ...@@ -425,21 +425,21 @@ void GenericMoeGemmKernelLauncher(const T* A,
if (can_implement != cutlass::Status::kSuccess) { if (can_implement != cutlass::Status::kSuccess) {
std::string err_msg = "MoEFC kernel will fail for params. Error: " + std::string err_msg = "MoEFC kernel will fail for params. Error: " +
std::string(cutlassGetStatusString(can_implement)); std::string(cutlassGetStatusString(can_implement));
PADDLE_THROW(paddle::platform::errors::Fatal("[MoE Runner] " + err_msg)); PADDLE_THROW(phi::errors::Fatal("[MoE Runner] " + err_msg));
} }
auto init_status = gemm.initialize(args); auto init_status = gemm.initialize(args);
if (init_status != cutlass::Status::kSuccess) { if (init_status != cutlass::Status::kSuccess) {
std::string err_msg = std::string err_msg =
"Failed to initialize cutlass variable batched gemm. Error: " + "Failed to initialize cutlass variable batched gemm. Error: " +
std::string(cutlassGetStatusString(init_status)); std::string(cutlassGetStatusString(init_status));
PADDLE_THROW(paddle::platform::errors::Fatal("[MoE Runner] " + err_msg)); PADDLE_THROW(phi::errors::Fatal("[MoE Runner] " + err_msg));
} }
auto run_status = gemm.run(stream); auto run_status = gemm.run(stream);
if (run_status != cutlass::Status::kSuccess) { if (run_status != cutlass::Status::kSuccess) {
std::string err_msg = std::string err_msg =
"Failed to run cutlass variable batched gemm. Error: " + "Failed to run cutlass variable batched gemm. Error: " +
std::string(cutlassGetStatusString(run_status)); std::string(cutlassGetStatusString(run_status));
PADDLE_THROW(paddle::platform::errors::Fatal("[MoE Runner] " + err_msg)); PADDLE_THROW(phi::errors::Fatal("[MoE Runner] " + err_msg));
} }
} }
......
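A small hypothetical illustration (not taken from the commit) of the error-reporting migration visible in the MoE hunks above: the paddle::platform::errors::* factories become phi::errors::*, while PADDLE_THROW and the PADDLE_ENFORCE_* macros stay as they are. The function, threshold, and header path below are made up.

#include "paddle/phi/core/enforce.h"

// Hypothetical check mirroring the occupancy guard above.
void CheckOccupancy(int occupancy) {
  PADDLE_ENFORCE_GT(
      occupancy,
      0,
      phi::errors::Fatal(
          "GPU lacks the shared memory resources to run this kernel."));
  if (occupancy > 32) {
    PADDLE_THROW(phi::errors::InvalidArgument(
        "occupancy is expected to be at most 32, but got %d.", occupancy));
  }
}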
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include <memory> #include <memory>
#include <unordered_map> #include <unordered_map>
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/dynload/cudnn.h" #include "paddle/phi/backends/dynload/cudnn.h"
#include "paddle/phi/backends/gpu/cuda/cudnn_desc.h" #include "paddle/phi/backends/gpu/cuda/cudnn_desc.h"
#include "paddle/phi/common/backend.h" #include "paddle/phi/common/backend.h"
...@@ -313,12 +314,8 @@ class CudnnConvDescManager { ...@@ -313,12 +314,8 @@ class CudnnConvDescManager {
int groups, int groups,
cudnnDataType_t dtype) { cudnnDataType_t dtype) {
auto* desc = new phi::backends::gpu::ConvolutionDescriptor(); auto* desc = new phi::backends::gpu::ConvolutionDescriptor();
desc->set(dtype, desc->set(
paddings, dtype, paddings, strides, dilations, phi::AllowTF32Cudnn(), groups);
strides,
dilations,
paddle::platform::AllowTF32Cudnn(),
groups);
return desc; return desc;
} }
......
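The CudnnConvDescManager hunk above also shows where the TF32 switch ended up: the flag formerly read through paddle::platform::AllowTF32Cudnn() is now phi::AllowTF32Cudnn(). Below is a sketch of the same call wrapped in a hypothetical factory; it assumes a CUDA build inside the Paddle tree, and the declaration of phi::AllowTF32Cudnn() comes from the phi GPU backend headers.

#include <vector>

#include "paddle/phi/backends/gpu/cuda/cudnn_desc.h"

phi::backends::gpu::ConvolutionDescriptor* MakeConvDesc(
    cudnnDataType_t dtype,
    const std::vector<int>& paddings,
    const std::vector<int>& strides,
    const std::vector<int>& dilations,
    int groups) {
  auto* desc = new phi::backends::gpu::ConvolutionDescriptor();
  // TF32 math is switched on or off by the global phi flag.
  desc->set(
      dtype, paddings, strides, dilations, phi::AllowTF32Cudnn(), groups);
  return desc;
}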
...@@ -16,7 +16,6 @@ ...@@ -16,7 +16,6 @@
#include "paddle/phi/kernels/affine_grid_grad_kernel.h" #include "paddle/phi/kernels/affine_grid_grad_kernel.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/backends/gpu/gpu_device_function.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
......
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
#include "paddle/phi/kernels/affine_grid_kernel.h" #include "paddle/phi/kernels/affine_grid_kernel.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/backends/gpu/gpu_device_function.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
......
...@@ -369,10 +369,9 @@ void ClassCenterSampleKernel(const Context& dev_ctx, ...@@ -369,10 +369,9 @@ void ClassCenterSampleKernel(const Context& dev_ctx,
// use global calculate stream // use global calculate stream
const auto calcu_stream = const auto calcu_stream =
static_cast<GPUContext*>( static_cast<GPUContext*>(
paddle::platform::DeviceContextPool::Instance().Get( phi::DeviceContextPool::Instance().Get(dev_ctx.GetPlace()))
dev_ctx.GetPlace()))
->stream(); ->stream();
PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::ncclAllReduce( PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce(
num_classes_per_device_ptr, num_classes_per_device_ptr,
num_classes_per_device_ptr, num_classes_per_device_ptr,
num_classes_per_device.numel(), num_classes_per_device.numel(),
......
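For context, a minimal sketch (not part of the diff) of fetching the global calculation stream through the pool that now lives in phi, as the ClassCenterSample hunk above does. It assumes a GPU build; the wrapper name is illustrative.

#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/gpu/gpu_context.h"

gpuStream_t GlobalCalcStream(const phi::Place& place) {
  // The pool returns a DeviceContext*; downcast to the GPU context to
  // reach its CUDA/HIP stream, exactly as the kernel above does.
  auto* dev_ctx = static_cast<phi::GPUContext*>(
      phi::DeviceContextPool::Instance().Get(place));
  return dev_ctx->stream();
}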
...@@ -15,7 +15,7 @@ limitations under the License. */ ...@@ -15,7 +15,7 @@ limitations under the License. */
#pragma once #pragma once
#include <vector> #include <vector>
#include "paddle/fluid/platform/device_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/hostdevice.h" #include "paddle/phi/core/hostdevice.h"
#ifdef __NVCC__ #ifdef __NVCC__
...@@ -1721,34 +1721,36 @@ class DepthwiseConvFilterGradFunctor<phi::GPUContext, ...@@ -1721,34 +1721,36 @@ class DepthwiseConvFilterGradFunctor<phi::GPUContext,
template class DepthwiseConvFunctor<phi::GPUContext, float, false>; template class DepthwiseConvFunctor<phi::GPUContext, float, false>;
template class DepthwiseConvFunctor<phi::GPUContext, double, false>; template class DepthwiseConvFunctor<phi::GPUContext, double, false>;
template class DepthwiseConvFunctor<phi::GPUContext, platform::float16, false>; template class DepthwiseConvFunctor<phi::GPUContext,
phi::dtype::float16,
false>;
template class DepthwiseConvInputGradFunctor<phi::GPUContext, float, false>; template class DepthwiseConvInputGradFunctor<phi::GPUContext, float, false>;
template class DepthwiseConvInputGradFunctor<phi::GPUContext, double, false>; template class DepthwiseConvInputGradFunctor<phi::GPUContext, double, false>;
template class DepthwiseConvInputGradFunctor<phi::GPUContext, template class DepthwiseConvInputGradFunctor<phi::GPUContext,
platform::float16, phi::dtype::float16,
false>; false>;
template class DepthwiseConvFilterGradFunctor<phi::GPUContext, float, false>; template class DepthwiseConvFilterGradFunctor<phi::GPUContext, float, false>;
template class DepthwiseConvFilterGradFunctor<phi::GPUContext, double, false>; template class DepthwiseConvFilterGradFunctor<phi::GPUContext, double, false>;
template class DepthwiseConvFilterGradFunctor<phi::GPUContext, template class DepthwiseConvFilterGradFunctor<phi::GPUContext,
platform::float16, phi::dtype::float16,
false>; false>;
template class DepthwiseConvFunctor<phi::GPUContext, float, true>; template class DepthwiseConvFunctor<phi::GPUContext, float, true>;
template class DepthwiseConvFunctor<phi::GPUContext, double, true>; template class DepthwiseConvFunctor<phi::GPUContext, double, true>;
template class DepthwiseConvFunctor<phi::GPUContext, platform::float16, true>; template class DepthwiseConvFunctor<phi::GPUContext, phi::dtype::float16, true>;
template class DepthwiseConvInputGradFunctor<phi::GPUContext, float, true>; template class DepthwiseConvInputGradFunctor<phi::GPUContext, float, true>;
template class DepthwiseConvInputGradFunctor<phi::GPUContext, double, true>; template class DepthwiseConvInputGradFunctor<phi::GPUContext, double, true>;
template class DepthwiseConvInputGradFunctor<phi::GPUContext, template class DepthwiseConvInputGradFunctor<phi::GPUContext,
platform::float16, phi::dtype::float16,
true>; true>;
template class DepthwiseConvFilterGradFunctor<phi::GPUContext, float, true>; template class DepthwiseConvFilterGradFunctor<phi::GPUContext, float, true>;
template class DepthwiseConvFilterGradFunctor<phi::GPUContext, double, true>; template class DepthwiseConvFilterGradFunctor<phi::GPUContext, double, true>;
template class DepthwiseConvFilterGradFunctor<phi::GPUContext, template class DepthwiseConvFilterGradFunctor<phi::GPUContext,
platform::float16, phi::dtype::float16,
true>; true>;
} // namespace math } // namespace math
......
...@@ -385,17 +385,17 @@ void InstanceNormGradKernel(const Context &dev_ctx, ...@@ -385,17 +385,17 @@ void InstanceNormGradKernel(const Context &dev_ctx,
miopenTensorDescriptor_t in_param_desc_; miopenTensorDescriptor_t in_param_desc_;
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); phi::dynload::miopenCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenCreateTensorDescriptor(&in_param_desc_)); phi::dynload::miopenCreateTensorDescriptor(&in_param_desc_));
#else #else
cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t data_desc_;
cudnnTensorDescriptor_t in_param_desc_; cudnnTensorDescriptor_t in_param_desc_;
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); phi::dynload::cudnnCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_)); phi::dynload::cudnnCreateTensorDescriptor(&in_param_desc_));
#endif #endif
if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
...@@ -406,27 +406,23 @@ void InstanceNormGradKernel(const Context &dev_ctx, ...@@ -406,27 +406,23 @@ void InstanceNormGradKernel(const Context &dev_ctx,
epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor(
paddle::platform::dynload::miopenSetTensorDescriptor( data_desc_,
data_desc_, CudnnDataType<T>::type,
CudnnDataType<T>::type, x_dims.size() > 3 ? x_dims.size() : 4,
x_dims.size() > 3 ? x_dims.size() : 4, const_cast<int *>(dims.data()),
const_cast<int *>(dims.data()), const_cast<int *>(strides.data())));
const_cast<int *>(strides.data()))); PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDeriveBNTensorDescriptor(
PADDLE_ENFORCE_GPU_SUCCESS( in_param_desc_, data_desc_, miopenBNSpatial));
paddle::platform::dynload::miopenDeriveBNTensorDescriptor(
in_param_desc_, data_desc_, miopenBNSpatial));
#else #else
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor(
paddle::platform::dynload::cudnnSetTensorNdDescriptor( data_desc_,
data_desc_, CudnnDataType<T>::type,
CudnnDataType<T>::type, x_dims.size() > 3 ? x_dims.size() : 4,
x_dims.size() > 3 ? x_dims.size() : 4, dims.data(),
dims.data(), strides.data()));
strides.data())); PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDeriveBNTensorDescriptor(
PADDLE_ENFORCE_GPU_SUCCESS( in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL));
paddle::platform::dynload::cudnnDeriveBNTensorDescriptor(
in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL));
#endif #endif
const auto *saved_mean_data = const auto *saved_mean_data =
...@@ -435,49 +431,47 @@ void InstanceNormGradKernel(const Context &dev_ctx, ...@@ -435,49 +431,47 @@ void InstanceNormGradKernel(const Context &dev_ctx,
saved_variance.template data<BatchNormParamType<T>>(); saved_variance.template data<BatchNormParamType<T>>();
if (d_scale && d_bias) { if (d_scale && d_bias) {
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenBatchNormalizationBackward(
paddle::platform::dynload::miopenBatchNormalizationBackward( dev_ctx.cudnn_handle(),
dev_ctx.cudnn_handle(), miopenBNSpatial,
miopenBNSpatial, CudnnDataType<T>::kOne(),
CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(),
CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(),
CudnnDataType<T>::kZero(), data_desc_,
data_desc_, x_tmp.template data<T>(),
x_tmp.template data<T>(), data_desc_,
data_desc_, d_y_tmp.template data<T>(),
d_y_tmp.template data<T>(), data_desc_,
data_desc_, d_x->template data<T>(),
d_x->template data<T>(), in_param_desc_,
in_param_desc_, scale_tmp.template data<BatchNormParamType<T>>(),
scale_tmp.template data<BatchNormParamType<T>>(), d_scale_tmp.template data<BatchNormParamType<T>>(),
d_scale_tmp.template data<BatchNormParamType<T>>(), d_bias_tmp.template data<BatchNormParamType<T>>(),
d_bias_tmp.template data<BatchNormParamType<T>>(), epsilon,
epsilon, saved_mean_data,
saved_mean_data, saved_var_data));
saved_var_data));
#else #else
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnBatchNormalizationBackward(
paddle::platform::dynload::cudnnBatchNormalizationBackward( dev_ctx.cudnn_handle(),
dev_ctx.cudnn_handle(), CUDNN_BATCHNORM_SPATIAL,
CUDNN_BATCHNORM_SPATIAL, CudnnDataType<T>::kOne(),
CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(),
CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(),
CudnnDataType<T>::kZero(), data_desc_,
data_desc_, x_tmp.template data<T>(),
x_tmp.template data<T>(), data_desc_,
data_desc_, d_y_tmp.template data<T>(),
d_y_tmp.template data<T>(), data_desc_,
data_desc_, d_x->template data<T>(),
d_x->template data<T>(), in_param_desc_,
in_param_desc_, scale_tmp.template data<BatchNormParamType<T>>(),
scale_tmp.template data<BatchNormParamType<T>>(), d_scale_tmp.template data<BatchNormParamType<T>>(),
d_scale_tmp.template data<BatchNormParamType<T>>(), d_bias_tmp.template data<BatchNormParamType<T>>(),
d_bias_tmp.template data<BatchNormParamType<T>>(), epsilon,
epsilon, saved_mean_data,
saved_mean_data, saved_var_data));
saved_var_data));
#endif #endif
} else { } else {
if (d_x) { if (d_x) {
...@@ -502,14 +496,14 @@ void InstanceNormGradKernel(const Context &dev_ctx, ...@@ -502,14 +496,14 @@ void InstanceNormGradKernel(const Context &dev_ctx,
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); phi::dynload::miopenDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenDestroyTensorDescriptor(in_param_desc_)); phi::dynload::miopenDestroyTensorDescriptor(in_param_desc_));
#else #else
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); phi::dynload::cudnnDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_)); phi::dynload::cudnnDestroyTensorDescriptor(in_param_desc_));
#endif #endif
} }
......
...@@ -63,17 +63,17 @@ void InstanceNormKernel(const Context &dev_ctx, ...@@ -63,17 +63,17 @@ void InstanceNormKernel(const Context &dev_ctx,
miopenTensorDescriptor_t in_param_desc_; miopenTensorDescriptor_t in_param_desc_;
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); phi::dynload::miopenCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenCreateTensorDescriptor(&in_param_desc_)); phi::dynload::miopenCreateTensorDescriptor(&in_param_desc_));
#else #else
cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t data_desc_;
cudnnTensorDescriptor_t in_param_desc_; cudnnTensorDescriptor_t in_param_desc_;
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); phi::dynload::cudnnCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_)); phi::dynload::cudnnCreateTensorDescriptor(&in_param_desc_));
#endif #endif
if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
LOG(ERROR) << "Provided epsilon is smaller than " LOG(ERROR) << "Provided epsilon is smaller than "
...@@ -89,27 +89,23 @@ void InstanceNormKernel(const Context &dev_ctx, ...@@ -89,27 +89,23 @@ void InstanceNormKernel(const Context &dev_ctx,
strides = {NxC * H * W * D, H * W * D, W * D, D, 1}; strides = {NxC * H * W * D, H * W * D, W * D, D, 1};
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor(
paddle::platform::dynload::miopenSetTensorDescriptor( data_desc_,
data_desc_, CudnnDataType<T>::type,
CudnnDataType<T>::type, x_dims.size() > 3 ? x_dims.size() : 4,
x_dims.size() > 3 ? x_dims.size() : 4, const_cast<int *>(dims.data()),
const_cast<int *>(dims.data()), const_cast<int *>(strides.data())));
const_cast<int *>(strides.data()))); PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDeriveBNTensorDescriptor(
PADDLE_ENFORCE_GPU_SUCCESS( in_param_desc_, data_desc_, miopenBNSpatial));
paddle::platform::dynload::miopenDeriveBNTensorDescriptor(
in_param_desc_, data_desc_, miopenBNSpatial));
#else #else
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor(
paddle::platform::dynload::cudnnSetTensorNdDescriptor( data_desc_,
data_desc_, CudnnDataType<T>::type,
CudnnDataType<T>::type, x_dims.size() > 3 ? x_dims.size() : 4,
x_dims.size() > 3 ? x_dims.size() : 4, dims.data(),
dims.data(), strides.data()));
strides.data())); PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDeriveBNTensorDescriptor(
PADDLE_ENFORCE_GPU_SUCCESS( in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL));
paddle::platform::dynload::cudnnDeriveBNTensorDescriptor(
in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL));
#endif #endif
const auto scale_ptr = scale.get_ptr(); const auto scale_ptr = scale.get_ptr();
...@@ -170,7 +166,7 @@ void InstanceNormKernel(const Context &dev_ctx, ...@@ -170,7 +166,7 @@ void InstanceNormKernel(const Context &dev_ctx,
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenBatchNormalizationForwardTraining( phi::dynload::miopenBatchNormalizationForwardTraining(
handle, handle,
miopenBNSpatial, miopenBNSpatial,
const_cast<void *>( const_cast<void *>(
...@@ -194,12 +190,12 @@ void InstanceNormKernel(const Context &dev_ctx, ...@@ -194,12 +190,12 @@ void InstanceNormKernel(const Context &dev_ctx,
static_cast<void *>(saved_variance_data))); static_cast<void *>(saved_variance_data)));
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); phi::dynload::miopenDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenDestroyTensorDescriptor(in_param_desc_)); phi::dynload::miopenDestroyTensorDescriptor(in_param_desc_));
#else #else
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnBatchNormalizationForwardTraining( phi::dynload::cudnnBatchNormalizationForwardTraining(
handle, handle,
CUDNN_BATCHNORM_SPATIAL, CUDNN_BATCHNORM_SPATIAL,
CudnnDataType<T>::kOne(), CudnnDataType<T>::kOne(),
...@@ -219,9 +215,9 @@ void InstanceNormKernel(const Context &dev_ctx, ...@@ -219,9 +215,9 @@ void InstanceNormKernel(const Context &dev_ctx,
saved_variance_data)); saved_variance_data));
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); phi::dynload::cudnnDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_)); phi::dynload::cudnnDestroyTensorDescriptor(in_param_desc_));
#endif #endif
} }
......
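A compact, hypothetical sketch of the dynload migration carried out in the instance_norm hunks above: the same cuDNN symbols are now resolved through phi::dynload, still wrapped in PADDLE_ENFORCE_GPU_SUCCESS. CUDA-only; the enforce header path is my assumption.

#include "paddle/phi/backends/dynload/cudnn.h"
#include "paddle/phi/core/enforce.h"

cudnnTensorDescriptor_t CreateTensorDesc() {
  cudnnTensorDescriptor_t desc;
  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnCreateTensorDescriptor(&desc));
  return desc;
}

void DestroyTensorDesc(cudnnTensorDescriptor_t desc) {
  PADDLE_ENFORCE_GPU_SUCCESS(
      phi::dynload::cudnnDestroyTensorDescriptor(desc));
}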
...@@ -89,11 +89,10 @@ void GetClassInterval(const gpuStream_t& stream, ...@@ -89,11 +89,10 @@ void GetClassInterval(const gpuStream_t& stream,
paddle::platform::NCCLCommContext::Instance().Get(rid, place); paddle::platform::NCCLCommContext::Instance().Get(rid, place);
// use global calculate stream // use global calculate stream
const auto calcu_stream = const auto calcu_stream =
static_cast<GPUContext*>( static_cast<GPUContext*>(phi::DeviceContextPool::Instance().Get(place))
paddle::platform::DeviceContextPool::Instance().Get(place))
->stream(); ->stream();
PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::ncclAllReduce( PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce(
num_classes_per_device_ptr, num_classes_per_device_ptr,
num_classes_per_device_ptr, num_classes_per_device_ptr,
num_classes_per_device.numel(), num_classes_per_device.numel(),
......
...@@ -85,11 +85,10 @@ void GetClassInterval(const gpuStream_t& stream, ...@@ -85,11 +85,10 @@ void GetClassInterval(const gpuStream_t& stream,
paddle::platform::NCCLCommContext::Instance().Get(rid, place); paddle::platform::NCCLCommContext::Instance().Get(rid, place);
// use global calculate stream // use global calculate stream
const auto calcu_stream = const auto calcu_stream =
static_cast<GPUContext*>( static_cast<GPUContext*>(phi::DeviceContextPool::Instance().Get(place))
paddle::platform::DeviceContextPool::Instance().Get(place))
->stream(); ->stream();
PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::ncclAllReduce( PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce(
num_classes_per_device_ptr, num_classes_per_device_ptr,
num_classes_per_device_ptr, num_classes_per_device_ptr,
num_classes_per_device.numel(), num_classes_per_device.numel(),
...@@ -247,7 +246,7 @@ void MarginCrossEntropyKernel(const Context& dev_ctx, ...@@ -247,7 +246,7 @@ void MarginCrossEntropyKernel(const Context& dev_ctx,
// use global calculate stream // use global calculate stream
stream = static_cast<GPUContext*>( stream = static_cast<GPUContext*>(
paddle::platform::DeviceContextPool::Instance().Get(place)) phi::DeviceContextPool::Instance().Get(place))
->stream(); ->stream();
} }
} }
...@@ -358,7 +357,7 @@ void MarginCrossEntropyKernel(const Context& dev_ctx, ...@@ -358,7 +357,7 @@ void MarginCrossEntropyKernel(const Context& dev_ctx,
auto task = pg->AllReduce(in_tensor, out_tensor, opts); auto task = pg->AllReduce(in_tensor, out_tensor, opts);
task->Wait(); task->Wait();
} else { } else {
PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::ncclAllReduce( PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce(
logits_max_buff, logits_max_buff,
logits_max_buff, logits_max_buff,
logits_max.numel(), logits_max.numel(),
...@@ -400,7 +399,7 @@ void MarginCrossEntropyKernel(const Context& dev_ctx, ...@@ -400,7 +399,7 @@ void MarginCrossEntropyKernel(const Context& dev_ctx,
auto task = pg->AllReduce(in_tensor, out_tensor, opts); auto task = pg->AllReduce(in_tensor, out_tensor, opts);
task->Wait(); task->Wait();
} else { } else {
PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::ncclAllReduce( PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce(
sum_exp_logits_buff, sum_exp_logits_buff,
sum_exp_logits_buff, sum_exp_logits_buff,
sum_exp_logits.numel(), sum_exp_logits.numel(),
...@@ -459,7 +458,7 @@ void MarginCrossEntropyKernel(const Context& dev_ctx, ...@@ -459,7 +458,7 @@ void MarginCrossEntropyKernel(const Context& dev_ctx,
auto task = pg->AllReduce(in_tensor, out_tensor, opts); auto task = pg->AllReduce(in_tensor, out_tensor, opts);
task->Wait(); task->Wait();
} else { } else {
PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::ncclAllReduce( PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce(
loss_ptr, loss_ptr,
loss_ptr, loss_ptr,
loss->numel(), loss->numel(),
......
...@@ -161,5 +161,5 @@ PD_REGISTER_KERNEL(overlap_add_grad, ...@@ -161,5 +161,5 @@ PD_REGISTER_KERNEL(overlap_add_grad,
float, float,
double, double,
phi::dtype::float16, phi::dtype::float16,
paddle::platform::complex<float>, phi::dtype::complex<float>,
paddle::platform::complex<double>) {} phi::dtype::complex<double>) {}
...@@ -147,5 +147,5 @@ PD_REGISTER_KERNEL(overlap_add, ...@@ -147,5 +147,5 @@ PD_REGISTER_KERNEL(overlap_add,
float, float,
double, double,
phi::dtype::float16, phi::dtype::float16,
paddle::platform::complex<float>, phi::dtype::complex<float>,
paddle::platform::complex<double>) {} phi::dtype::complex<double>) {}
...@@ -32,7 +32,7 @@ void PutAlongAxisGradKernel(const Context& dev_ctx, ...@@ -32,7 +32,7 @@ void PutAlongAxisGradKernel(const Context& dev_ctx,
const std::string& reduce, const std::string& reduce,
DenseTensor* x_grad, DenseTensor* x_grad,
DenseTensor* value_grad) { DenseTensor* value_grad) {
PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(dev_ctx.GetPlace()), PADDLE_ENFORCE_EQ(dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU,
true, true,
errors::PreconditionNotMet( errors::PreconditionNotMet(
"PutAlongAxisGradOpCUDAKernel only runs on GPU.")); "PutAlongAxisGradOpCUDAKernel only runs on GPU."));
......
...@@ -31,7 +31,7 @@ void PutAlongAxisKernel(const Context& dev_ctx, ...@@ -31,7 +31,7 @@ void PutAlongAxisKernel(const Context& dev_ctx,
int axis, int axis,
const std::string& reduce, const std::string& reduce,
DenseTensor* out) { DenseTensor* out) {
PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(dev_ctx.GetPlace()), PADDLE_ENFORCE_EQ(dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU,
true, true,
errors::PreconditionNotMet( errors::PreconditionNotMet(
"PutAlongAxisCUDAKernel only runs on GPU device.")); "PutAlongAxisCUDAKernel only runs on GPU device."));
......
...@@ -260,108 +260,104 @@ void RnnGradKernel(const Context &dev_ctx, ...@@ -260,108 +260,104 @@ void RnnGradKernel(const Context &dev_ctx,
if (!has_seq_length) { if (!has_seq_length) {
if (x_grad) { if (x_grad) {
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenRNNBackwardData(
paddle::platform::dynload::miopenRNNBackwardData( handle,
handle, rnn.rnn_desc(),
rnn.rnn_desc(), seq_length,
seq_length, rnn.y_descs(),
rnn.y_descs(), out_data,
out_data, rnn.y_descs(),
rnn.y_descs(), out_grad_data,
out_grad_data, rnn.last_h_desc(),
rnn.last_h_desc(), last_h_grad_data,
last_h_grad_data, rnn.last_c_desc(),
rnn.last_c_desc(), last_c_grad_data,
last_c_grad_data, rnn.weight_desc(),
rnn.weight_desc(), weight_data,
weight_data, rnn.init_h_desc(),
rnn.init_h_desc(), init_h_data,
init_h_data, rnn.init_c_desc(),
rnn.init_c_desc(), init_c_data,
init_c_data, rnn.x_descs(),
rnn.x_descs(), x_grad_data,
x_grad_data, rnn.init_h_desc(),
rnn.init_h_desc(), init_h_grad_data,
init_h_grad_data, rnn.init_c_desc(),
rnn.init_c_desc(), init_c_grad_data,
init_c_grad_data, workspace_data_.data<uint8_t>(),
workspace_data_.data<uint8_t>(), workspace_size,
workspace_size, const_cast<uint8_t *>(reserve_data),
const_cast<uint8_t *>(reserve_data), reserve_size));
reserve_size));
#else #else
// This interface is used when the input/output is unpadded. // This interface is used when the input/output is unpadded.
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardData(
paddle::platform::dynload::cudnnRNNBackwardData( handle,
handle, rnn.rnn_desc(),
rnn.rnn_desc(), seq_length,
seq_length, rnn.y_descs(),
rnn.y_descs(), out_data,
out_data, rnn.y_descs(),
rnn.y_descs(), out_grad_data,
out_grad_data, rnn.last_h_desc(),
rnn.last_h_desc(), last_h_grad_data,
last_h_grad_data, rnn.last_c_desc(),
rnn.last_c_desc(), last_c_grad_data,
last_c_grad_data, rnn.weight_desc(),
rnn.weight_desc(), weight_data,
weight_data, rnn.init_h_desc(),
rnn.init_h_desc(), init_h_data,
init_h_data, rnn.init_c_desc(),
rnn.init_c_desc(), init_c_data,
init_c_data, rnn.x_descs(),
rnn.x_descs(), x_grad_data,
x_grad_data, rnn.init_h_desc(),
rnn.init_h_desc(), init_h_grad_data,
init_h_grad_data, rnn.init_c_desc(),
rnn.init_c_desc(), init_c_grad_data,
init_c_grad_data, workspace_data_.data<uint8_t>(),
workspace_data_.data<uint8_t>(), workspace_size,
workspace_size, const_cast<uint8_t *>(reserve_data),
const_cast<uint8_t *>(reserve_data), reserve_size));
reserve_size));
#endif #endif
} }
if (!weight_grad_list.empty()) { if (!weight_grad_list.empty()) {
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenRNNBackwardWeights(
paddle::platform::dynload::miopenRNNBackwardWeights( handle,
handle, rnn.rnn_desc(),
rnn.rnn_desc(), seq_length,
seq_length, rnn.x_descs(),
rnn.x_descs(), x.data<T>(),
x.data<T>(), rnn.init_h_desc(),
rnn.init_h_desc(), init_h_data,
init_h_data, rnn.y_descs(),
rnn.y_descs(), out.data<T>(),
out.data<T>(), rnn.weight_desc(),
rnn.weight_desc(), weight_grad_data,
weight_grad_data, workspace_data_.data<uint8_t>(),
workspace_data_.data<uint8_t>(), workspace_size,
workspace_size, const_cast<uint8_t *>(reserve_data),
const_cast<uint8_t *>(reserve_data), reserve_size));
reserve_size));
// permute weight grad list from weight grad tensor // permute weight grad list from weight grad tensor
TensorToPermutedWeight<T>( TensorToPermutedWeight<T>(
place, stream, weight_grad, &weight_grad_list, rnn_mode, is_bidirec); place, stream, weight_grad, &weight_grad_list, rnn_mode, is_bidirec);
#else #else
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardWeights(
paddle::platform::dynload::cudnnRNNBackwardWeights( handle,
handle, rnn.rnn_desc(),
rnn.rnn_desc(), seq_length,
seq_length, rnn.x_descs(),
rnn.x_descs(), x.data<T>(),
x.data<T>(), rnn.init_h_desc(),
rnn.init_h_desc(), init_h_data,
init_h_data, rnn.y_descs(),
rnn.y_descs(), out.data<T>(),
out.data<T>(), workspace_data_.data<uint8_t>(),
workspace_data_.data<uint8_t>(), workspace_size,
workspace_size, rnn.weight_desc(),
rnn.weight_desc(), weight_grad_data,
weight_grad_data, const_cast<uint8_t *>(reserve_data),
const_cast<uint8_t *>(reserve_data), reserve_size));
reserve_size));
#endif #endif
} }
} else { } else {
...@@ -369,57 +365,55 @@ void RnnGradKernel(const Context &dev_ctx, ...@@ -369,57 +365,55 @@ void RnnGradKernel(const Context &dev_ctx,
// for train // for train
// This interface is used when the input/output is padded. // This interface is used when the input/output is padded.
if (x_grad) { if (x_grad) {
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardDataEx(
paddle::platform::dynload::cudnnRNNBackwardDataEx( handle,
handle, rnn.rnn_desc(),
rnn.rnn_desc(), rnn.y_seq_desc(),
rnn.y_seq_desc(), out_data,
out_data, rnn.y_seq_desc(),
rnn.y_seq_desc(), out_grad_data,
out_grad_data, nullptr,
nullptr, nullptr,
nullptr, rnn.last_h_desc(),
rnn.last_h_desc(), last_h_grad_data,
last_h_grad_data, rnn.last_c_desc(),
rnn.last_c_desc(), last_c_grad_data,
last_c_grad_data, rnn.weight_desc(),
rnn.weight_desc(), weight_data,
weight_data, rnn.init_h_desc(),
rnn.init_h_desc(), init_h_data,
init_h_data, rnn.init_c_desc(),
rnn.init_c_desc(), init_c_data,
init_c_data, rnn.x_seq_desc(),
rnn.x_seq_desc(), x_grad_data,
x_grad_data, rnn.init_h_desc(),
rnn.init_h_desc(), init_h_grad_data,
init_h_grad_data, rnn.init_c_desc(),
rnn.init_c_desc(), init_c_grad_data,
init_c_grad_data, nullptr,
nullptr, nullptr,
nullptr, workspace_data_.data<uint8_t>(),
workspace_data_.data<uint8_t>(), workspace_size,
workspace_size, const_cast<uint8_t *>(reserve_data),
const_cast<uint8_t *>(reserve_data), reserve_size));
reserve_size));
} }
if (!weight_grad_list.empty()) { if (!weight_grad_list.empty()) {
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardWeightsEx(
paddle::platform::dynload::cudnnRNNBackwardWeightsEx( handle,
handle, rnn.rnn_desc(),
rnn.rnn_desc(), rnn.x_seq_desc(),
rnn.x_seq_desc(), x.data<T>(),
x.data<T>(), rnn.init_h_desc(),
rnn.init_h_desc(), init_h_data,
init_h_data, rnn.y_seq_desc(),
rnn.y_seq_desc(), out.data<T>(),
out.data<T>(), workspace_data_.data<uint8_t>(),
workspace_data_.data<uint8_t>(), workspace_size,
workspace_size, rnn.weight_desc(),
rnn.weight_desc(), weight_grad_data,
weight_grad_data, const_cast<uint8_t *>(reserve_data),
const_cast<uint8_t *>(reserve_data), reserve_size));
reserve_size));
} }
#else #else
PADDLE_THROW(phi::errors::Unavailable( PADDLE_THROW(phi::errors::Unavailable(
......
...@@ -42,81 +42,78 @@ void RNNInferece(bool has_seq_length, ...@@ -42,81 +42,78 @@ void RNNInferece(bool has_seq_length,
// This interface is used when the input/output is unpadded. // This interface is used when the input/output is unpadded.
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenRNNForwardInference( phi::dynload::miopenRNNForwardInference(handle,
handle, rnn->rnn_desc(),
rnn->rnn_desc(), seq_length,
seq_length, rnn->x_descs(),
rnn->x_descs(), x_data,
x_data, rnn->init_h_desc(),
rnn->init_h_desc(), init_h_data,
init_h_data, rnn->init_c_desc(),
rnn->init_c_desc(), init_c_data,
init_c_data, rnn->weight_desc(),
rnn->weight_desc(), w_data,
w_data, rnn->y_descs(),
rnn->y_descs(), out_data,
out_data, rnn->last_h_desc(),
rnn->last_h_desc(), last_h_data,
last_h_data, rnn->last_c_desc(),
rnn->last_c_desc(), last_c_data,
last_c_data, workspace_data->data<uint8_t>(),
workspace_data->data<uint8_t>(), workspace_size));
workspace_size));
#else #else
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnRNNForwardInference( phi::dynload::cudnnRNNForwardInference(handle,
handle, rnn->rnn_desc(),
rnn->rnn_desc(), seq_length,
seq_length, rnn->x_descs(),
rnn->x_descs(), x_data,
x_data, rnn->init_h_desc(),
rnn->init_h_desc(), init_h_data,
init_h_data, rnn->init_c_desc(),
rnn->init_c_desc(), init_c_data,
init_c_data, rnn->weight_desc(),
rnn->weight_desc(), w_data,
w_data, rnn->y_descs(),
rnn->y_descs(), out_data,
out_data, rnn->last_h_desc(),
rnn->last_h_desc(), last_h_data,
last_h_data, rnn->last_c_desc(),
rnn->last_c_desc(), last_c_data,
last_c_data, workspace_data->data<uint8_t>(),
workspace_data->data<uint8_t>(), workspace_size));
workspace_size));
#endif #endif
} else { } else {
#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION >= 7201 #if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION >= 7201
// for inference // for inference
// This interface is used when the input/output is padded. // This interface is used when the input/output is padded.
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNForwardInferenceEx(
paddle::platform::dynload::cudnnRNNForwardInferenceEx( handle,
handle, rnn->rnn_desc(),
rnn->rnn_desc(), rnn->x_seq_desc(),
rnn->x_seq_desc(), x_data,
x_data, rnn->init_h_desc(),
rnn->init_h_desc(), init_h_data,
init_h_data, rnn->init_c_desc(),
rnn->init_c_desc(), init_c_data,
init_c_data, rnn->weight_desc(),
rnn->weight_desc(), w_data,
w_data, rnn->y_seq_desc(),
rnn->y_seq_desc(), out_data,
out_data, rnn->last_h_desc(),
rnn->last_h_desc(), last_h_data,
last_h_data, rnn->last_c_desc(),
rnn->last_c_desc(), last_c_data,
last_c_data, nullptr,
nullptr, nullptr,
nullptr, nullptr,
nullptr, nullptr,
nullptr, nullptr,
nullptr, nullptr,
nullptr, nullptr,
nullptr, nullptr,
nullptr, workspace_data->data<uint8_t>(),
workspace_data->data<uint8_t>(), workspace_size));
workspace_size));
#else #else
// CUDNN VERSION has to >=7.2.1 // CUDNN VERSION has to >=7.2.1
PADDLE_THROW(phi::errors::Unavailable( PADDLE_THROW(phi::errors::Unavailable(
...@@ -310,88 +307,85 @@ void RnnKernel(const Context &dev_ctx, ...@@ -310,88 +307,85 @@ void RnnKernel(const Context &dev_ctx,
// for train // for train
// This interface is used when the input/output is unpadded. // This interface is used when the input/output is unpadded.
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenRNNForwardTraining(
paddle::platform::dynload::miopenRNNForwardTraining( handle,
handle, rnn.rnn_desc(),
rnn.rnn_desc(), seq_length,
seq_length, rnn.x_descs(),
rnn.x_descs(), x_data,
x_data, rnn.init_h_desc(),
rnn.init_h_desc(), init_h_data,
init_h_data, rnn.init_c_desc(),
rnn.init_c_desc(), init_c_data,
init_c_data, rnn.weight_desc(),
rnn.weight_desc(), w_data,
w_data, rnn.y_descs(),
rnn.y_descs(), out_data,
out_data, rnn.last_h_desc(),
rnn.last_h_desc(), last_h_data,
last_h_data, rnn.last_c_desc(),
rnn.last_c_desc(), last_c_data,
last_c_data, workspace_data_.data<uint8_t>(),
workspace_data_.data<uint8_t>(), workspace_size,
workspace_size, reserve_data,
reserve_data, reserve_size));
reserve_size));
#else #else
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnRNNForwardTraining( phi::dynload::cudnnRNNForwardTraining(handle,
handle, rnn.rnn_desc(),
rnn.rnn_desc(), seq_length,
seq_length, rnn.x_descs(),
rnn.x_descs(), x_data,
x_data, rnn.init_h_desc(),
rnn.init_h_desc(), init_h_data,
init_h_data, rnn.init_c_desc(),
rnn.init_c_desc(), init_c_data,
init_c_data, rnn.weight_desc(),
rnn.weight_desc(), w_data,
w_data, rnn.y_descs(),
rnn.y_descs(), out_data,
out_data, rnn.last_h_desc(),
rnn.last_h_desc(), last_h_data,
last_h_data, rnn.last_c_desc(),
rnn.last_c_desc(), last_c_data,
last_c_data, workspace_data_.data<uint8_t>(),
workspace_data_.data<uint8_t>(), workspace_size,
workspace_size, reserve_data,
reserve_data, reserve_size));
reserve_size));
#endif #endif
} else { } else {
#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION >= 7201 #if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION >= 7201
// for train // for train
// This interface is used when the input/output is padded. // This interface is used when the input/output is padded.
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNForwardTrainingEx(
paddle::platform::dynload::cudnnRNNForwardTrainingEx( handle,
handle, rnn.rnn_desc(),
rnn.rnn_desc(), rnn.x_seq_desc(),
rnn.x_seq_desc(), x_data,
x_data, rnn.init_h_desc(),
rnn.init_h_desc(), init_h_data,
init_h_data, rnn.init_c_desc(),
rnn.init_c_desc(), init_c_data,
init_c_data, rnn.weight_desc(),
rnn.weight_desc(), w_data,
w_data, rnn.y_seq_desc(),
rnn.y_seq_desc(), out_data,
out_data, rnn.last_h_desc(),
rnn.last_h_desc(), last_h_data,
last_h_data, rnn.last_c_desc(),
rnn.last_c_desc(), last_c_data,
last_c_data, nullptr,
nullptr, nullptr,
nullptr, nullptr,
nullptr, nullptr,
nullptr, nullptr,
nullptr, nullptr,
nullptr, nullptr,
nullptr, nullptr,
nullptr, workspace_data_.data<uint8_t>(),
workspace_data_.data<uint8_t>(), workspace_size,
workspace_size, reserve_data,
reserve_data, reserve_size));
reserve_size));
#else #else
PADDLE_THROW(phi::errors::Unavailable( PADDLE_THROW(phi::errors::Unavailable(
"The padded input is supported by " "The padded input is supported by "
......
...@@ -83,7 +83,7 @@ void SyncBatchNormKernel(const Context &ctx, ...@@ -83,7 +83,7 @@ void SyncBatchNormKernel(const Context &ctx,
var_data = variance.template data<BatchNormParamType<T>>(); var_data = variance.template data<BatchNormParamType<T>>();
} else { } else {
// x, x^2, 1, here 1 is used to calc device num // x, x^2, 1, here 1 is used to calc device num
// device num can also be obtained from platform::DeviceContextPool // device num can also be obtained from phi::DeviceContextPool
const int bytes = (C * 2 + 1) * sizeof(BatchNormParamType<T>); const int bytes = (C * 2 + 1) * sizeof(BatchNormParamType<T>);
alloc_ptr = phi::memory_utils::Alloc( alloc_ptr = phi::memory_utils::Alloc(
ctx.GetPlace(), ctx.GetPlace(),
...@@ -111,14 +111,14 @@ void SyncBatchNormKernel(const Context &ctx, ...@@ -111,14 +111,14 @@ void SyncBatchNormKernel(const Context &ctx,
int dtype = paddle::platform::ToNCCLDataType( int dtype = paddle::platform::ToNCCLDataType(
paddle::framework::TransToProtoVarType(mean_out->dtype())); paddle::framework::TransToProtoVarType(mean_out->dtype()));
// In-place operation // In-place operation
PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::ncclAllReduce( PADDLE_ENFORCE_GPU_SUCCESS(
stats, phi::dynload::ncclAllReduce(stats,
stats, stats,
2 * C + 1, 2 * C + 1,
static_cast<ncclDataType_t>(dtype), static_cast<ncclDataType_t>(dtype),
ncclSum, ncclSum,
comm, comm,
stream)); stream));
VLOG(3) << "Sync result using all reduce"; VLOG(3) << "Sync result using all reduce";
} }
#endif #endif
......
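Likewise for the collective call in the sync_batch_norm hunk above: ncclAllReduce is now reached through phi::dynload, still guarded by PADDLE_ENFORCE_GPU_SUCCESS. A sketch with placeholder buffer, communicator, and stream arguments (the dynload/nccl, gpu_decls, and enforce header paths are assumptions):

#include "paddle/phi/backends/dynload/nccl.h"
#include "paddle/phi/backends/gpu/gpu_decls.h"  // for gpuStream_t (assumed path)
#include "paddle/phi/core/enforce.h"

// In-place sum across ranks, mirroring the stats reduction above.
void AllReduceSumInPlace(float* buf,
                         size_t count,
                         ncclComm_t comm,
                         gpuStream_t stream) {
  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce(
      buf, buf, count, ncclFloat, ncclSum, comm, stream));
}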
...@@ -31,7 +31,7 @@ void TakeAlongAxisGradKernel(const Context& dev_ctx, ...@@ -31,7 +31,7 @@ void TakeAlongAxisGradKernel(const Context& dev_ctx,
int axis, int axis,
DenseTensor* x_grad) { DenseTensor* x_grad) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
paddle::platform::is_gpu_place(dev_ctx.GetPlace()), dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU,
true, true,
errors::PreconditionNotMet("This kernel only runs on GPU.")); errors::PreconditionNotMet("This kernel only runs on GPU."));
......
...@@ -29,7 +29,7 @@ void TakeAlongAxisKernel(const Context& dev_ctx, ...@@ -29,7 +29,7 @@ void TakeAlongAxisKernel(const Context& dev_ctx,
int axis, int axis,
DenseTensor* out) { DenseTensor* out) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
paddle::platform::is_gpu_place(dev_ctx.GetPlace()), dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU,
true, true,
errors::PreconditionNotMet("This kernel only runs on GPU device.")); errors::PreconditionNotMet("This kernel only runs on GPU device."));
......
...@@ -97,7 +97,7 @@ void TriangularSolveKernel(const Context& dev_ctx, ...@@ -97,7 +97,7 @@ void TriangularSolveKernel(const Context& dev_ctx,
memory_utils::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
tmp_gpu_ptrs_data->ptr(), tmp_gpu_ptrs_data->ptr(),
paddle::platform::CPUPlace(), phi::CPUPlace(),
static_cast<void*>(cpu_ptrs.data()), static_cast<void*>(cpu_ptrs.data()),
cpu_ptrs.size() * sizeof(T*), cpu_ptrs.size() * sizeof(T*),
dev_ctx.stream()); dev_ctx.stream());
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
#ifndef PADDLE_WITH_HIP #ifndef PADDLE_WITH_HIP
#include "paddle/phi/kernels/affine_grid_grad_kernel.h" #include "paddle/phi/kernels/affine_grid_grad_kernel.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/backends/gpu/gpu_device_function.h"
#include "paddle/phi/backends/gpu/gpu_dnn.h" #include "paddle/phi/backends/gpu/gpu_dnn.h"
...@@ -35,7 +35,7 @@ void AffineGridGradCudnnKernel(const Context& dev_ctx, ...@@ -35,7 +35,7 @@ void AffineGridGradCudnnKernel(const Context& dev_ctx,
bool align_corners, bool align_corners,
DenseTensor* input_grad) { DenseTensor* input_grad) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
paddle::platform::is_gpu_place(dev_ctx.GetPlace()), dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU,
true, true,
phi::errors::InvalidArgument( phi::errors::InvalidArgument(
"Only support for CUDAPlace.Please switch your context from " "Only support for CUDAPlace.Please switch your context from "
...@@ -58,9 +58,8 @@ void AffineGridGradCudnnKernel(const Context& dev_ctx, ...@@ -58,9 +58,8 @@ void AffineGridGradCudnnKernel(const Context& dev_ctx,
const T* output_grad_data = output_grad.data<T>(); const T* output_grad_data = output_grad.data<T>();
T* theta_grad_data = dev_ctx.template Alloc<T>(theta_grad); T* theta_grad_data = dev_ctx.template Alloc<T>(theta_grad);
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSpatialTfGridGeneratorBackward(
paddle::platform::dynload::cudnnSpatialTfGridGeneratorBackward( handle, cudnn_st_desc, output_grad_data, theta_grad_data));
handle, cudnn_st_desc, output_grad_data, theta_grad_data));
} }
} // namespace phi } // namespace phi
......
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
#ifndef PADDLE_WITH_HIP #ifndef PADDLE_WITH_HIP
#include "paddle/phi/kernels/affine_grid_kernel.h" #include "paddle/phi/kernels/affine_grid_kernel.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/backends/gpu/gpu_device_function.h"
#include "paddle/phi/backends/gpu/gpu_dnn.h" #include "paddle/phi/backends/gpu/gpu_dnn.h"
...@@ -35,7 +34,7 @@ void AffineGridCudnnKernel(const Context& dev_ctx, ...@@ -35,7 +34,7 @@ void AffineGridCudnnKernel(const Context& dev_ctx,
bool align_corners, bool align_corners,
DenseTensor* output) { DenseTensor* output) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
paddle::platform::is_gpu_place(dev_ctx.GetPlace()), dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU,
true, true,
phi::errors::InvalidArgument( phi::errors::InvalidArgument(
"Only support for CUDAPlace.Please switch your context from " "Only support for CUDAPlace.Please switch your context from "
...@@ -56,12 +55,11 @@ void AffineGridCudnnKernel(const Context& dev_ctx, ...@@ -56,12 +55,11 @@ void AffineGridCudnnKernel(const Context& dev_ctx,
cudnnSpatialTransformerDescriptor_t cudnn_st_desc = cudnnSpatialTransformerDescriptor_t cudnn_st_desc =
st_desc.descriptor<T>(4, h_size_data); st_desc.descriptor<T>(4, h_size_data);
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(phi::dynload::cudnnSpatialTfGridGeneratorForward(
paddle::platform::dynload::cudnnSpatialTfGridGeneratorForward( handle, cudnn_st_desc, theta_data, output_data),
handle, cudnn_st_desc, theta_data, output_data), 0,
0, phi::errors::Fatal("Some errors have occurred "
phi::errors::Fatal("Some errors have occurred " "during forward computation in cudnn."));
"during forward computation in cudnn."));
} }
} // namespace phi } // namespace phi
......
...@@ -20,6 +20,7 @@ limitations under the License. */ ...@@ -20,6 +20,7 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h" #include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/autotune/cache.h" #include "paddle/phi/kernels/autotune/cache.h"
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "paddle/phi/kernels/conv_grad_kernel.h" #include "paddle/phi/kernels/conv_grad_kernel.h"
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
...@@ -161,7 +162,7 @@ void ConvCudnnGradKernelImplV7( ...@@ -161,7 +162,7 @@ void ConvCudnnGradKernelImplV7(
padding_common, padding_common,
strides, strides,
dilations, dilations,
paddle::platform::AllowTF32Cudnn(), phi::AllowTF32Cudnn(),
c_groups); c_groups);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
...@@ -188,7 +189,7 @@ void ConvCudnnGradKernelImplV7( ...@@ -188,7 +189,7 @@ void ConvCudnnGradKernelImplV7(
padding_common, padding_common,
strides, strides,
dilations, dilations,
paddle::platform::AllowTF32Cudnn(), phi::AllowTF32Cudnn(),
c_groups); c_groups);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
using search2 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>; using search2 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
...@@ -227,39 +228,38 @@ void ConvCudnnGradKernelImplV7( ...@@ -227,39 +228,38 @@ void ConvCudnnGradKernelImplV7(
workspace_handle.RunFunc( workspace_handle.RunFunc(
[&](void* cudnn_workspace_ptr) { [&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionBackwardData( phi::dynload::miopenConvolutionBackwardData(handle,
handle, &alpha,
&alpha, args1.odesc.desc(),
args1.odesc.desc(), output_grad_data,
output_grad_data, args1.wdesc.desc(),
args1.wdesc.desc(), filter_data,
filter_data, args1.cdesc.desc(),
args1.cdesc.desc(), bwd_result.algo,
bwd_result.algo, &beta,
&beta, args1.idesc.desc(),
args1.idesc.desc(), temp_tensor_data,
temp_tensor_data, cudnn_workspace_ptr,
cudnn_workspace_ptr, workspace_size));
workspace_size));
}, },
workspace_size); workspace_size);
PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::miopenOpTensor( PADDLE_ENFORCE_GPU_SUCCESS(
handle, phi::dynload::miopenOpTensor(handle,
miopenTensorOpAdd, miopenTensorOpAdd,
&alpha, &alpha,
args1.idesc.desc(), args1.idesc.desc(),
transformed_input_grad_data, transformed_input_grad_data,
&alpha, &alpha,
args1.idesc.desc(), args1.idesc.desc(),
temp_tensor_data, temp_tensor_data,
&beta, &beta,
args1.idesc.desc(), args1.idesc.desc(),
transformed_input_grad_data)); transformed_input_grad_data));
} else { } else {
workspace_handle.RunFunc( workspace_handle.RunFunc(
[&](void* cudnn_workspace_ptr) { [&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionBackwardData( phi::dynload::miopenConvolutionBackwardData(
handle, handle,
&alpha, &alpha,
args1.odesc.desc(), args1.odesc.desc(),
...@@ -300,7 +300,7 @@ void ConvCudnnGradKernelImplV7( ...@@ -300,7 +300,7 @@ void ConvCudnnGradKernelImplV7(
workspace_handle.RunFunc( workspace_handle.RunFunc(
[&](void* cudnn_workspace_ptr) { [&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionBackwardWeights( phi::dynload::miopenConvolutionBackwardWeights(
handle, handle,
&alpha, &alpha,
args2.odesc.desc(), args2.odesc.desc(),
...@@ -356,7 +356,7 @@ void ConvCudnnGradKernelImplV8( ...@@ -356,7 +356,7 @@ void ConvCudnnGradKernelImplV8(
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
groups, groups,
1, 1,
paddle::platform::errors::Unimplemented( phi::errors::Unimplemented(
"Group concolution using CUDNNv8 API is unsupported for now")); "Group concolution using CUDNNv8 API is unsupported for now"));
cudnnHandle_t handle = const_cast<cudnnHandle_t>(ctx.cudnn_handle()); cudnnHandle_t handle = const_cast<cudnnHandle_t>(ctx.cudnn_handle());
...@@ -1082,7 +1082,7 @@ void ConvCudnnGradGradKernel( ...@@ -1082,7 +1082,7 @@ void ConvCudnnGradGradKernel(
padding_common, padding_common,
strides, strides,
dilations, dilations,
paddle::platform::AllowTF32Cudnn(), phi::AllowTF32Cudnn(),
c_group); c_group);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
...@@ -1106,7 +1106,7 @@ void ConvCudnnGradGradKernel( ...@@ -1106,7 +1106,7 @@ void ConvCudnnGradGradKernel(
padding_common, padding_common,
strides, strides,
dilations, dilations,
paddle::platform::AllowTF32Cudnn(), phi::AllowTF32Cudnn(),
c_group); c_group);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
...@@ -1133,7 +1133,7 @@ void ConvCudnnGradGradKernel( ...@@ -1133,7 +1133,7 @@ void ConvCudnnGradGradKernel(
padding_common, padding_common,
strides, strides,
dilations, dilations,
paddle::platform::AllowTF32Cudnn(), phi::AllowTF32Cudnn(),
c_group); c_group);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
...@@ -1160,7 +1160,7 @@ void ConvCudnnGradGradKernel( ...@@ -1160,7 +1160,7 @@ void ConvCudnnGradGradKernel(
padding_common, padding_common,
strides, strides,
dilations, dilations,
paddle::platform::AllowTF32Cudnn(), phi::AllowTF32Cudnn(),
c_group); c_group);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
...@@ -1210,20 +1210,19 @@ void ConvCudnnGradGradKernel( ...@@ -1210,20 +1210,19 @@ void ConvCudnnGradGradKernel(
workspace_handle.RunFunc( workspace_handle.RunFunc(
[&](void* workspace_ptr) { [&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionForward( phi::dynload::miopenConvolutionForward(handle,
handle, &alpha,
&alpha, args1.idesc.desc(),
args1.idesc.desc(), ddx,
ddx, args1.wdesc.desc(),
args1.wdesc.desc(), w,
w, args1.cdesc.desc(),
args1.cdesc.desc(), fwd_result1.algo,
fwd_result1.algo, &beta,
&beta, args1.odesc.desc(),
args1.odesc.desc(), transformed_ddy_channel,
transformed_ddy_channel, workspace_ptr,
workspace_ptr, workspace_size));
workspace_size));
}, },
workspace_size); workspace_size);
#else #else
...@@ -1248,20 +1247,19 @@ void ConvCudnnGradGradKernel( ...@@ -1248,20 +1247,19 @@ void ConvCudnnGradGradKernel(
workspace_handle.RunFunc( workspace_handle.RunFunc(
[&](void* workspace_ptr) { [&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionForward( phi::dynload::miopenConvolutionForward(handle,
handle, &alpha,
&alpha, args2.idesc.desc(),
args2.idesc.desc(), x,
x, args2.wdesc.desc(),
args2.wdesc.desc(), ddw,
ddw, args2.cdesc.desc(),
args2.cdesc.desc(), fwd_result2.algo,
fwd_result2.algo, &beta,
&beta, args2.odesc.desc(),
args2.odesc.desc(), transformed_ddy_channel,
transformed_ddy_channel, workspace_ptr,
workspace_ptr, workspace_size));
workspace_size));
}, },
workspace_size); workspace_size);
#else #else
...@@ -1291,7 +1289,7 @@ void ConvCudnnGradGradKernel( ...@@ -1291,7 +1289,7 @@ void ConvCudnnGradGradKernel(
workspace_handle.RunFunc( workspace_handle.RunFunc(
[&](void* workspace_ptr) { [&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionBackwardWeights( phi::dynload::miopenConvolutionBackwardWeights(
handle, handle,
&alpha, &alpha,
args3.odesc.desc(), args3.odesc.desc(),
...@@ -1330,7 +1328,7 @@ void ConvCudnnGradGradKernel( ...@@ -1330,7 +1328,7 @@ void ConvCudnnGradGradKernel(
workspace_handle.RunFunc( workspace_handle.RunFunc(
[&](void* workspace_ptr) { [&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionBackwardData( phi::dynload::miopenConvolutionBackwardData(
handle, handle,
&alpha, &alpha,
args4.odesc.desc(), args4.odesc.desc(),
......
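The hunks above replace the paddle::platform::dynload:: wrappers (miopenConvolutionBackwardData, miopenOpTensor, miopenConvolutionBackwardWeights) with their phi::dynload:: counterparts and swap paddle::platform::AllowTF32Cudnn() for phi::AllowTF32Cudnn(), so the conv grad kernels no longer depend on fluid's platform headers. Below is a minimal sketch of the compatibility-alias idea that typically accompanies such a namespace move; the namespace layout and the trivial function body are assumptions for illustration only, not the actual Paddle code.

// Hypothetical sketch: re-exporting a symbol from its new home so legacy
// paddle::platform call sites keep compiling during the migration.
namespace phi {
inline bool AllowTF32Cudnn() { return true; }  // stand-in for the real TF32 switch
}  // namespace phi

namespace paddle {
namespace platform {
using phi::AllowTF32Cudnn;  // old spelling forwards to the phi implementation
}  // namespace platform
}  // namespace paddle

int main() { return paddle::platform::AllowTF32Cudnn() ? 0 : 1; }

With such an alias in place, call sites can be migrated file by file, which is the pattern this diff follows across the conv kernels.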
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "paddle/phi/kernels/conv_kernel.h" #include "paddle/phi/kernels/conv_kernel.h"
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
...@@ -79,18 +80,11 @@ void ConvCudnnKernelImplV7(const DenseTensor* transformed_input, ...@@ -79,18 +80,11 @@ void ConvCudnnKernelImplV7(const DenseTensor* transformed_input,
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
// MIOPEN need to set groups in cdesc in miopen_desc.h // MIOPEN need to set groups in cdesc in miopen_desc.h
args.cdesc.set(dtype, args.cdesc.set(
padding_common, dtype, padding_common, strides, dilations, phi::AllowTF32Cudnn(), groups);
strides,
dilations,
paddle::platform::AllowTF32Cudnn(),
groups);
#else #else
args.cdesc.set(dtype, args.cdesc.set(
padding_common, dtype, padding_common, strides, dilations, phi::AllowTF32Cudnn());
strides,
dilations,
paddle::platform::AllowTF32Cudnn());
#endif #endif
#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1) #if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1)
...@@ -98,8 +92,7 @@ void ConvCudnnKernelImplV7(const DenseTensor* transformed_input, ...@@ -98,8 +92,7 @@ void ConvCudnnKernelImplV7(const DenseTensor* transformed_input,
// FIXME(typhoonzero): find a better way to disable groups // FIXME(typhoonzero): find a better way to disable groups
// rather than setting it to 1. // rather than setting it to 1.
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnSetConvolutionGroupCount( phi::dynload::cudnnSetConvolutionGroupCount(args.cdesc.desc(), groups));
args.cdesc.desc(), groups));
groups = 1; groups = 1;
#endif #endif
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
...@@ -185,20 +178,19 @@ void ConvCudnnKernelImplV7(const DenseTensor* transformed_input, ...@@ -185,20 +178,19 @@ void ConvCudnnKernelImplV7(const DenseTensor* transformed_input,
workspace_handle.RunFunc( workspace_handle.RunFunc(
[&](void* workspace_ptr) { [&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionForward( phi::dynload::miopenConvolutionForward(handle,
handle, &alpha,
&alpha, args.idesc.desc(),
args.idesc.desc(), input_data,
input_data, args.wdesc.desc(),
args.wdesc.desc(), filter_data,
filter_data, args.cdesc.desc(),
args.cdesc.desc(), fwd_result.algo,
fwd_result.algo, &beta,
&beta, args.odesc.desc(),
args.odesc.desc(), output_data,
output_data, workspace_ptr,
workspace_ptr, workspace_size));
workspace_size));
}, },
workspace_size); workspace_size);
#else #else
...@@ -237,7 +229,7 @@ void ConvCudnnKernelImplV8(const DenseTensor* input_tensor, ...@@ -237,7 +229,7 @@ void ConvCudnnKernelImplV8(const DenseTensor* input_tensor,
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
groups, groups,
1, 1,
paddle::platform::errors::Unimplemented( phi::errors::Unimplemented(
"Group concolution using CUDNNv8 API unsupported for now")); "Group concolution using CUDNNv8 API unsupported for now"));
T* input_data = const_cast<T*>(input_tensor->data<T>()); T* input_data = const_cast<T*>(input_tensor->data<T>());
......
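This file follows the same pattern: the cuDNN/MIOpen dynload calls move to phi::dynload::, and the groups check now reports phi::errors::Unimplemented instead of paddle::platform::errors::Unimplemented. The following toy, self-contained rendering shows an enforce-style equality check built on an error factory that lives in a decoupled namespace; the macro and the error type are simplified stand-ins, not Paddle's PADDLE_ENFORCE_EQ machinery.

#include <stdexcept>
#include <string>

namespace toy_errors {  // hypothetical stand-in for phi::errors
inline std::runtime_error Unimplemented(const std::string& msg) {
  return std::runtime_error("Unimplemented: " + msg);
}
}  // namespace toy_errors

// Simplified stand-in for an ENFORCE_EQ-style macro: throws when a != b.
#define TOY_ENFORCE_EQ(a, b, err) \
  do {                            \
    if ((a) != (b)) throw (err);  \
  } while (0)

int main() {
  int groups = 1;
  TOY_ENFORCE_EQ(groups, 1,
                 toy_errors::Unimplemented(
                     "Group convolution using the cuDNN v8 API"));
  return 0;
}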
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/dynload/cudnn.h" #include "paddle/phi/backends/dynload/cudnn.h"
#include "paddle/phi/common/float16.h" #include "paddle/phi/common/float16.h"
#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/ddim.h"
...@@ -219,7 +220,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, ...@@ -219,7 +220,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
padding_common, padding_common,
strides, strides,
dilations_, dilations_,
paddle::platform::AllowTF32Cudnn(), phi::AllowTF32Cudnn(),
c_groups); c_groups);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
using search1 = SearchAlgorithm<miopenConvFwdAlgorithm_t>; using search1 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
...@@ -244,7 +245,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, ...@@ -244,7 +245,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
padding_common, padding_common,
strides, strides,
dilations_, dilations_,
paddle::platform::AllowTF32Cudnn(), phi::AllowTF32Cudnn(),
c_groups); c_groups);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
using search2 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>; using search2 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
...@@ -691,7 +692,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( ...@@ -691,7 +692,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
padding_common, padding_common,
strides, strides,
dilations_, dilations_,
paddle::platform::AllowTF32Cudnn(), phi::AllowTF32Cudnn(),
c_group); c_group);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
using search1 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>; using search1 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
...@@ -713,7 +714,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( ...@@ -713,7 +714,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
padding_common, padding_common,
strides, strides,
dilations_, dilations_,
paddle::platform::AllowTF32Cudnn(), phi::AllowTF32Cudnn(),
c_group); c_group);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
using search2 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>; using search2 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
...@@ -738,7 +739,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( ...@@ -738,7 +739,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
padding_common, padding_common,
strides, strides,
dilations_, dilations_,
paddle::platform::AllowTF32Cudnn(), phi::AllowTF32Cudnn(),
c_group); c_group);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
using search3 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>; using search3 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
...@@ -764,7 +765,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( ...@@ -764,7 +765,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
padding_common, padding_common,
strides, strides,
dilations_, dilations_,
paddle::platform::AllowTF32Cudnn(), phi::AllowTF32Cudnn(),
c_group); c_group);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
using search4 = SearchAlgorithm<miopenConvFwdAlgorithm_t>; using search4 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
......
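In the transpose-conv grad kernels the substantive change is again the source of the TF32 switch passed into the convolution descriptor, cdesc.set(..., phi::AllowTF32Cudnn(), ...), plus the new include of paddle/phi/backends/context_pool.h. A small illustrative sketch of plumbing such a runtime toggle into a descriptor setter follows; the ConvDesc type and its set() signature are invented for the example and only mirror the shape of the real call.

// Hypothetical sketch: threading a process-wide TF32 toggle into a descriptor.
#include <vector>

namespace sketch {
inline bool AllowTF32() { return true; }  // assumed global switch, like AllowTF32Cudnn()

struct ConvDesc {
  std::vector<int> paddings, strides, dilations;
  bool use_tf32 = false;
  int groups = 1;
  // Mirrors cdesc.set(dtype, paddings, strides, dilations, allow_tf32, groups).
  void set(const std::vector<int>& p, const std::vector<int>& s,
           const std::vector<int>& d, bool allow_tf32, int g) {
    paddings = p; strides = s; dilations = d; use_tf32 = allow_tf32; groups = g;
  }
};
}  // namespace sketch

int main() {
  sketch::ConvDesc cdesc;
  cdesc.set({1, 1}, {1, 1}, {1, 1}, sketch::AllowTF32(), /*groups=*/1);
  return cdesc.use_tf32 ? 0 : 1;
}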
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/dynload/cudnn.h" #include "paddle/phi/backends/dynload/cudnn.h"
#include "paddle/phi/common/float16.h" #include "paddle/phi/common/float16.h"
#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/ddim.h"
...@@ -216,7 +217,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, ...@@ -216,7 +217,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
padding_common, padding_common,
strides, strides,
dilations_, dilations_,
paddle::platform::AllowTF32Cudnn(), phi::AllowTF32Cudnn(),
c_groups); c_groups);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
......
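The forward transpose-conv kernel gets the same one-line treatment: add the phi context_pool.h include and switch the TF32 query to phi::AllowTF32Cudnn(). Since the point of this PR is moving DeviceContextPool into phi, here is a minimal, self-contained illustration of the singleton-pool pattern these kernels rely on; Place, the context type, and the pool below are toy stand-ins, not the real phi::DeviceContextPool API.

#include <map>
#include <memory>

namespace sketch {
struct Place { int device_id = 0; };
inline bool operator<(const Place& a, const Place& b) { return a.device_id < b.device_id; }

struct DeviceContext { Place place; };

// Toy singleton pool: one context per place, created lazily on first Get().
class DeviceContextPool {
 public:
  static DeviceContextPool& Instance() {
    static DeviceContextPool pool;
    return pool;
  }
  DeviceContext* Get(const Place& place) {
    auto& slot = contexts_[place];
    if (!slot) slot.reset(new DeviceContext{place});
    return slot.get();
  }
 private:
  std::map<Place, std::unique_ptr<DeviceContext>> contexts_;
};
}  // namespace sketch

int main() {
  auto* ctx = sketch::DeviceContextPool::Instance().Get(sketch::Place{0});
  return ctx->place.device_id;  // the same pointer is returned on later Get() calls
}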
(The diff for this file is collapsed and not shown.)