Unverified commit a1006b2b, authored by Huang Jiyi, committed by GitHub

[phi decoupling] decouple dependency on device_context in phi (Part 1) (#50865)

* move DeviceContextPool to phi

* add EmplaceExternalContextFunc

* update namespace

* update cmake

* fix bugs and create context_pool_impl.h

* replace platform::is_xxx_place

* fix bugs

* update generator

* fix bugs

* fix bugs

* fix bugs

* fix bugs

* fix bugs

* fix bugs

* fix bugs

* fix enforce usage

* Revert "fix enforce usage"

This reverts commit 5f521f08a69713cee506e64a00ec6d9fba709e27.

* fix bugs

* rm XPUDeviceContext and CustomDeviceContext

* fix bugs

* fix context init bug

* fix bugs after merge

* fix bugs

* fix name

* fix mutable_data

* update and fix bugs

* fix bugs

* update

* fix bugs

* fix name

* fix bugs

* merge

* fix bugs

* create context_pool in phi/backends

* create context_pool in phi/backends

* fix bugs

* fix xpu bugs

* fix rocm bugs

* fix bugs

* fix bugs

* fix bugs

* fix xpu bugs

* update

* update

* fix bugs

* fix bugs
Parent: 203a62b8
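Before the diff itself, a minimal usage sketch (editor's addition, not part of the commit): after this change phi owns `DeviceContextPool`, and fluid-only context types are injected through the optional `EmplaceExternalContextFunc` callback; the diff below passes `paddle::platform::EmplaceExternalContext` for that. The CPU-only setup and `main()` wrapper here are illustrative assumptions.

```cpp
#include <vector>

#include "paddle/phi/backends/context_pool.h"

int main() {
  // Initialize the pool that now lives in phi. The second argument is the
  // optional external-context callback; nullptr means "native phi contexts
  // only" (fluid passes paddle::platform::EmplaceExternalContext instead).
  std::vector<phi::Place> places{phi::CPUPlace()};
  phi::DeviceContextPool::Init(places, /*func=*/nullptr);

  // Contexts are created lazily on the first Get().
  phi::DeviceContext* ctx =
      phi::DeviceContextPool::Instance().Get(phi::CPUPlace());
  return ctx != nullptr ? 0 : 1;
}
```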
......@@ -94,4 +94,6 @@ paddle/fluid/pybind/op_function_impl.h
paddle/fluid/pybind/*final_state_op_function_impl.h
paddle/fluid/prim/api/generated/prim_api/*
paddle/fluid/framework/__init__.py
paddle/phi/api/profiler/__init__.py
python/paddle/incubate/fleet/parameter_server/pslib/ps_pb2.py
python/paddle/fluid/incubate/fleet/parameter_server/pslib/ps_pb2.py
......@@ -165,7 +165,7 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
conv_desc[i], CUDNN_DEFAULT_MATH));
#if CUDA_VERSION >= 11000 && CUDNN_VERSION >= 8000
if (!platform::allow_tf32_cudnn) {
if (!phi::allow_tf32_cudnn) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnSetConvolutionMathType(conv_desc[i],
CUDNN_FMA_MATH));
......
......@@ -19,13 +19,6 @@ namespace phi {
class DenseTensor;
} // namespace phi
namespace paddle {
namespace framework {} // namespace framework
namespace platform {
class XPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace operators {
namespace math {
......
......@@ -159,7 +159,6 @@ cc_library(
cudnn_workspace_helper
${XPU_CTX_DEPS}
${MLU_CTX_DEPS}
eigen3
phi_backends
phi_device_context
generator)
......
......@@ -15,7 +15,7 @@
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/context_pool.h"
DECLARE_bool(use_stream_safe_cuda_allocator);
DECLARE_bool(new_executor_use_cuda_graph);
......
......@@ -54,7 +54,8 @@ void InitDevice() {
}
EXPECT_GT(static_cast<int>(places.size()), 0);
paddle::platform::DeviceContextPool::Init(places);
paddle::platform::DeviceContextPool::Init(
places, paddle::platform::EmplaceExternalContext);
}
void TestDeviceInterface(const paddle::platform::Place& place) {
......
......@@ -19,7 +19,6 @@ limitations under the License. */
#include <set>
#include "glog/logging.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -39,19 +38,11 @@ limitations under the License. */
#include "paddle/fluid/platform/device/mlu/device_context_allocator.h"
#endif
#include "paddle/phi/backends/context_pool_utils.h"
namespace paddle {
namespace platform {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
bool allow_tf32_cublas = true;
void SetAllowTF32Cublas(bool active) { allow_tf32_cublas = active; }
bool AllowTF32Cublas() { return allow_tf32_cublas; }
bool allow_tf32_cudnn = true;
void SetAllowTF32Cudnn(bool active) { allow_tf32_cudnn = active; }
bool AllowTF32Cudnn() { return allow_tf32_cudnn; }
#endif // PADDLE_WITH_CUDA
DeviceType Place2DeviceType(const platform::Place& place) {
if (platform::is_cpu_place(place)) {
return platform::DeviceType::CPU;
......@@ -73,312 +64,76 @@ DeviceType Place2DeviceType(const platform::Place& place) {
}
}
static DeviceContextPool* pool = nullptr;
DeviceContextPool& DeviceContextPool::Instance() {
PADDLE_ENFORCE_NOT_NULL(pool,
phi::errors::PreconditionNotMet(
"Need to Create DeviceContextPool firstly!"));
return *pool;
}
/*! \brief Create should only called by Init function */
DeviceContextPool& DeviceContextPool::Init(
const std::vector<platform::Place>& places) {
if (pool == nullptr) {
pool = new DeviceContextPool(places);
}
return *pool;
}
bool DeviceContextPool::IsInitialized() { return pool != nullptr; }
void DeviceContextPool::SetPool(DeviceContextPool* dev_pool) {
pool = dev_pool;
}
thread_local const std::map<Place,
std::shared_future<std::unique_ptr<DeviceContext>>>*
DeviceContextPool::external_device_contexts_ = nullptr;
platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
VLOG(6) << "DeviceContextPool Get: " << place;
const std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
ptr;
if (external_device_contexts_ && external_device_contexts_->count(place)) {
ptr = external_device_contexts_;
} else {
ptr = &device_contexts_;
}
auto it = ptr->find(place);
if (it == ptr->end()) {
PADDLE_THROW(platform::errors::Unimplemented(
"Place %s is not supported. Please check that your paddle compiles "
"with WITH_GPU, WITH_XPU, WITH_IPU, WITH_MLU or WITH_ASCEND_CL option "
"or check "
"that your train process set the correct device id if you use "
"Executor.",
place));
}
return it->second.get().get();
}
size_t DeviceContextPool::size() const {
if (external_device_contexts_) {
return external_device_contexts_->size();
}
return device_contexts_.size();
}
const std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>&
DeviceContextPool::device_contexts() const {
if (external_device_contexts_) {
return *external_device_contexts_;
}
return device_contexts_;
}
void DeviceContextPool::SetDeviceContexts(
const std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
dev_ctxs) {
external_device_contexts_ = dev_ctxs;
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <typename DevCtx>
typename std::enable_if<!std::is_same<DevCtx, phi::GPUContext>::value,
DevCtx*>::type
ConstructDevCtx(const platform::Place& p, /*unused*/ int stream_priority = 0) {
return new DevCtx(p);
}
template <typename DevCtx>
typename std::enable_if<std::is_same<DevCtx, phi::GPUContext>::value,
DevCtx*>::type
ConstructDevCtx(const platform::Place& p, int stream_priority) {
return new DevCtx(p, /*init=*/true, stream_priority);
}
#else
template <typename DevCtx>
DevCtx* ConstructDevCtx(const platform::Place& p,
/*unused*/ int stream_priority) {
return new DevCtx(p);
}
#endif
template <typename DevCtx>
std::unique_ptr<DeviceContext> CreateDeviceContext(
const platform::Place& p,
bool disable_setting_default_stream_for_allocator = false,
int stream_priority = 0) {
using PtrType = std::unique_ptr<DeviceContext>;
DevCtx* dev_ctx = ConstructDevCtx<DevCtx>(p, stream_priority);
if (is_gpu_place(p)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto* cuda_ctx = dynamic_cast<phi::GPUContext*>(dev_ctx);
PADDLE_ENFORCE_NOT_NULL(
cuda_ctx,
platform::errors::InvalidArgument(
"Failed to dynamic_cast dev_ctx into phi::GPUContext."));
auto& instance = memory::allocation::AllocatorFacade::Instance();
if (!disable_setting_default_stream_for_allocator) {
instance.SetDefaultStream(CUDAPlace(p.GetDeviceId()), cuda_ctx->stream());
}
dev_ctx->SetAllocator(instance.GetAllocator(p, cuda_ctx->stream()).get());
dev_ctx->SetPinnedAllocator(
instance.GetAllocator(paddle::platform::CUDAPinnedPlace()).get());
cuda_ctx->PartialInitWithAllocator();
dev_ctx->SetGenerator(phi::DefaultCUDAGenerator(p.GetDeviceId()).get());
#endif
} else if (is_xpu_place(p)) {
#if defined(PADDLE_WITH_XPU)
dev_ctx->SetAllocator(
memory::allocation::AllocatorFacade::Instance().GetAllocator(p).get());
dev_ctx->SetGenerator(phi::DefaultXPUGenerator(p.GetDeviceId()).get());
#endif
} else {
dev_ctx->SetAllocator(
memory::allocation::AllocatorFacade::Instance().GetAllocator(p).get());
dev_ctx->SetGenerator(phi::DefaultCPUGenerator().get());
}
dev_ctx->SetHostGenerator(phi::DefaultCPUGenerator().get());
dev_ctx->SetHostAllocator(memory::allocation::AllocatorFacade::Instance()
.GetAllocator(platform::CPUPlace())
.get());
dev_ctx->SetZeroAllocator(memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(p)
.get());
dev_ctx->SetHostZeroAllocator(memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(platform::CPUPlace())
.get());
return PtrType(dev_ctx);
}
template <typename DevCtx>
inline void EmplaceDeviceContext(
void EmplaceExternalContext(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
platform::Place place,
const platform::Place& place,
bool disable_setting_default_stream_for_allocator,
int stream_priority) {
// lazy evaluation. i.e., only create device context at first `Get`
place_to_device_context->emplace(
place,
std::async(std::launch::deferred,
CreateDeviceContext<DevCtx>,
place,
disable_setting_default_stream_for_allocator,
stream_priority));
}
void EmplaceDeviceContexts(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const std::vector<platform::Place>& places,
bool disable_setting_default_stream_for_allocator,
int stream_priority) {
PADDLE_ENFORCE_GT(
places.size(),
0,
platform::errors::InvalidArgument("The number of platform places should "
"be larger than 0. But received %d.",
places.size()));
std::set<Place> set;
for (auto& p : places) {
set.insert(p);
}
for (auto& p : set) {
if (platform::is_cpu_place(p)) {
#ifdef PADDLE_WITH_MKLDNN
EmplaceDeviceContext<phi::OneDNNContext>(
place_to_device_context,
p,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
EmplaceDeviceContext<phi::CPUContext>(
place_to_device_context,
p,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#endif
} else if (platform::is_gpu_place(p)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
EmplaceDeviceContext<phi::GPUContext>(
place_to_device_context,
p,
disable_setting_default_stream_for_allocator,
stream_priority);
#else
PADDLE_THROW(
platform::errors::Unimplemented("CUDAPlace is not supported. Please "
"re-compile with WITH_GPU option."));
#endif
} else if (platform::is_cuda_pinned_place(p)) {
if (platform::is_cuda_pinned_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
EmplaceDeviceContext<CUDAPinnedDeviceContext>(
place_to_device_context,
p,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(platform::errors::Unimplemented(
"CUDAPlace is not supported. Please re-compile with WITH_GPU "
"option."));
#endif
} else if (platform::is_xpu_place(p)) {
#ifdef PADDLE_WITH_XPU
EmplaceDeviceContext<XPUDeviceContext>(
place_to_device_context,
p,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
phi::EmplaceDeviceContext<CUDAPinnedDeviceContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(
platform::errors::Unimplemented("XPUPlace is not supported. Please "
"re-compile with WITH_XPU option."));
PADDLE_THROW(platform::errors::Unimplemented(
"CUDAPlace is not supported. Please re-compile with WITH_GPU "
"option."));
#endif
} else if (platform::is_mlu_place(p)) {
} else if (platform::is_mlu_place(place)) {
#ifdef PADDLE_WITH_MLU
EmplaceDeviceContext<MLUDeviceContext>(
place_to_device_context,
p,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
phi::EmplaceDeviceContext<MLUDeviceContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(
platform::errors::Unimplemented("MLUPlace is not supported. Please "
"re-compile with WITH_MLU option."));
PADDLE_THROW(
platform::errors::Unimplemented("MLUPlace is not supported. Please "
"re-compile with WITH_MLU option."));
#endif
} else if (platform::is_ipu_place(p)) {
} else if (platform::is_ipu_place(place)) {
#ifdef PADDLE_WITH_IPU
EmplaceDeviceContext<IPUDeviceContext>(
place_to_device_context,
p,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
phi::EmplaceDeviceContext<IPUDeviceContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(
platform::errors::Unimplemented("IPUPlace is not supported. Please "
"re-compile with WITH_IPU option."));
PADDLE_THROW(
platform::errors::Unimplemented("IPUPlace is not supported. Please "
"re-compile with WITH_IPU option."));
#endif
} else if (platform::is_npu_place(p)) {
} else if (platform::is_npu_place(place)) {
#ifdef PADDLE_WITH_ASCEND_CL
EmplaceDeviceContext<NPUDeviceContext>(
place_to_device_context,
p,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
phi::EmplaceDeviceContext<NPUDeviceContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(platform::errors::Unimplemented(
"NPUPlace is not supported. Please "
"re-compile with WITH_ASCEND_CL option."));
PADDLE_THROW(platform::errors::Unimplemented(
"NPUPlace is not supported. Please "
"re-compile with WITH_ASCEND_CL option."));
#endif
} else if (platform::is_npu_pinned_place(p)) {
} else if (platform::is_npu_pinned_place(place)) {
#ifdef PADDLE_WITH_ASCEND_CL
EmplaceDeviceContext<NPUPinnedDeviceContext>(
place_to_device_context,
p,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(platform::errors::Unimplemented(
"NPUPinnedPlace is not supported. Please re-compile with "
"WITH_ASCEND_CL "
"option."));
#endif
} else if (platform::is_custom_place(p)) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
EmplaceDeviceContext<CustomDeviceContext>(
place_to_device_context,
p,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
phi::EmplaceDeviceContext<NPUPinnedDeviceContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(platform::errors::Unimplemented(
"CustomPlace is not supported. Please re-compile with "
"WITH_CUSTOM_DEVICE "
"option."));
PADDLE_THROW(platform::errors::Unimplemented(
"NPUPinnedPlace is not supported. Please re-compile with "
"WITH_ASCEND_CL "
"option."));
#endif
}
}
}
DeviceContextPool::DeviceContextPool(
const std::vector<platform::Place>& places) {
EmplaceDeviceContexts(&device_contexts_,
places,
/*disable_setting_default_stream_for_allocator=*/false,
/*stream_priority=*/0);
}
#ifdef PADDLE_WITH_IPU
IPUDeviceContext::IPUDeviceContext(IPUPlace place) : place_(place) {}
......@@ -390,19 +145,6 @@ void IPUDeviceContext::Wait() const {
IPUDeviceContext::~IPUDeviceContext() {}
#endif
#ifdef PADDLE_WITH_XPU
XPUDeviceContext::XPUDeviceContext() : phi::XPUContext() {
phi::XPUContext::Init();
}
XPUDeviceContext::~XPUDeviceContext() {}
XPUDeviceContext::XPUDeviceContext(XPUPlace place) : phi::XPUContext(place) {
phi::XPUContext::Init();
LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: "
<< static_cast<int>(place.device);
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
......@@ -469,14 +211,5 @@ Eigen::DefaultDevice* CUDAPinnedDeviceContext::eigen_device() const {
const Place& CUDAPinnedDeviceContext::GetPlace() const { return place_; }
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
CustomDeviceContext::CustomDeviceContext(CustomPlace place)
: phi::CustomContext(place) {
Init();
stream_.reset(new phi::stream::Stream(place, stream()));
}
CustomDeviceContext::~CustomDeviceContext() {}
#endif
} // namespace platform
} // namespace paddle
......@@ -23,6 +23,7 @@ limitations under the License. */
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/device/gpu/gpu_types.h"
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/backends/custom/custom_context.h"
#include "paddle/phi/backends/gpu/gpu_decls.h"
......@@ -98,18 +99,6 @@ struct GpuDevice;
namespace paddle {
namespace platform {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
/*Set the value of the global variable allow_tf32_cublas*/
void SetAllowTF32Cublas(bool active);
/*Get the global variable allow_tf32_cublas value*/
bool AllowTF32Cublas();
extern bool allow_tf32_cudnn;
/*Set the value of the global variable allow_tf32_cudnn*/
void SetAllowTF32Cudnn(bool active);
/*Get the global variable allow_tf32_cudnn value*/
bool AllowTF32Cudnn();
#endif // PADDLE_WITH_CUDA
enum DeviceType {
CPU = 0,
CUDA = 1,
......@@ -134,14 +123,6 @@ constexpr DeviceType kCUSTOM_DEVICE = DeviceType::CUSTOM_DEVICE;
using DeviceContext = phi::DeviceContext;
template <typename Place>
struct DefaultDeviceContextType;
template <>
struct DefaultDeviceContextType<platform::CPUPlace> {
using TYPE = phi::CPUContext;
};
// Graphcore IPU
#ifdef PADDLE_WITH_IPU
class IPUDeviceContext
......@@ -161,35 +142,15 @@ class IPUDeviceContext
private:
IPUPlace place_;
};
template <>
struct DefaultDeviceContextType<platform::IPUPlace> {
using TYPE = IPUDeviceContext;
};
#endif
#ifdef PADDLE_WITH_MLU
class MLUDeviceContext;
template <>
struct DefaultDeviceContextType<platform::MLUPlace>;
#endif
#ifdef PADDLE_WITH_XPU
namespace xpu = baidu::xpu::api;
class XPUDeviceContext : public phi::XPUContext {
public:
XPUDeviceContext();
explicit XPUDeviceContext(XPUPlace place);
virtual ~XPUDeviceContext();
Eigen::DefaultDevice* eigen_device() const { return nullptr; }
xpuStream stream() const { return XPUContext::x_context()->xpu_stream; }
void CreateStream() { XPUContext::CreateStream(); }
};
template <>
struct DefaultDeviceContextType<platform::XPUPlace> {
using TYPE = XPUDeviceContext;
};
using XPUDeviceContext = phi::XPUContext;
#endif
#ifdef PADDLE_WITH_ASCEND_CL
......@@ -251,11 +212,6 @@ class NPUDeviceContext
DISABLE_COPY_AND_ASSIGN(NPUDeviceContext);
};
template <>
struct DefaultDeviceContextType<platform::NPUPlace> {
using TYPE = NPUDeviceContext;
};
// Currently, NPUPinnedDeviceContext is only used to data copying.
class NPUPinnedDeviceContext
: public DeviceContext,
......@@ -275,19 +231,9 @@ class NPUPinnedDeviceContext
std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
};
template <>
struct DefaultDeviceContextType<platform::NPUPinnedPlace> {
using TYPE = NPUPinnedDeviceContext;
};
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <>
struct DefaultDeviceContextType<platform::CUDAPlace> {
using TYPE = phi::GPUContext;
};
// Currently, CUDAPinnedDeviceContext is only used to data copying.
class CUDAPinnedDeviceContext
: public DeviceContext,
......@@ -306,90 +252,57 @@ class CUDAPinnedDeviceContext
CUDAPinnedPlace place_;
std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
};
template <>
struct DefaultDeviceContextType<platform::CUDAPinnedPlace> {
using TYPE = CUDAPinnedDeviceContext;
};
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
class CustomDeviceContext : public phi::CustomContext {
public:
explicit CustomDeviceContext(CustomPlace place);
virtual ~CustomDeviceContext();
Eigen::DefaultDevice* eigen_device() const { return nullptr; }
template <typename Callback>
void AddStreamCallback(Callback&& callback) const {
return stream_->AddCallback(callback);
}
void WaitStreamCallback() const { return stream_->WaitCallback(); }
private:
std::shared_ptr<phi::stream::Stream> stream_;
};
template <>
struct DefaultDeviceContextType<platform::CustomPlace> {
using TYPE = CustomDeviceContext;
};
#else
template <>
struct DefaultDeviceContextType<platform::CustomPlace> {
using TYPE = DeviceContext;
};
using CustomDeviceContext = phi::CustomContext;
#endif
void EmplaceDeviceContexts(
void EmplaceExternalContext(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const std::vector<platform::Place>& places,
const platform::Place& place,
bool disable_setting_default_stream_for_allocator,
int stream_priority);
/*! \brief device context pool singleton */
class DeviceContextPool {
public:
static DeviceContextPool& Instance();
/*! \brief Create should only called by Init function */
static DeviceContextPool& Init(const std::vector<platform::Place>& places);
using phi::EmplaceDeviceContexts;
static bool IsInitialized();
using DeviceContextPool = phi::DeviceContextPool;
static void SetPool(DeviceContextPool* dev_pool);
} // namespace platform
} // namespace paddle
/*! \brief Return handle of single device context. */
platform::DeviceContext* Get(const platform::Place& place);
namespace phi {
template <typename Place>
const typename DefaultDeviceContextType<Place>::TYPE* GetByPlace(
const Place& place) {
return reinterpret_cast<
const typename DefaultDeviceContextType<Place>::TYPE*>(Get(place));
}
#ifdef PADDLE_WITH_IPU
template <>
struct DefaultDeviceContextType<phi::IPUPlace> {
using TYPE = paddle::platform::IPUDeviceContext;
};
#endif
size_t size() const;
#ifdef PADDLE_WITH_MLU
template <>
struct DefaultDeviceContextType<phi::MLUPlace>;
#endif
const std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>&
device_contexts() const;
#ifdef PADDLE_WITH_ASCEND_CL
template <>
struct DefaultDeviceContextType<phi::NPUPlace> {
using TYPE = paddle::platform::NPUDeviceContext;
};
static void SetDeviceContexts(
const std::map<Place,
std::shared_future<std::unique_ptr<DeviceContext>>>*);
template <>
struct DefaultDeviceContextType<phi::NPUPinnedPlace> {
using TYPE = paddle::platform::NPUPinnedDeviceContext;
};
#endif
private:
explicit DeviceContextPool(const std::vector<platform::Place>& places);
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>
device_contexts_;
static thread_local const std::
map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
external_device_contexts_; // not owned
DISABLE_COPY_AND_ASSIGN(DeviceContextPool);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <>
struct DefaultDeviceContextType<phi::GPUPinnedPlace> {
using TYPE = paddle::platform::CUDAPinnedDeviceContext;
};
#endif
} // namespace platform
} // namespace paddle
} // namespace phi
......@@ -119,59 +119,6 @@ using namespace ::phi::enforce; // NOLINT
#define PADDLE_MAY_THROW noexcept(false)
#endif
/*
* Summary: This macro is used to get Variable or internal type
* data (such as LoDTensor or SelectedRows) of the Input and
* Output in op, generally used when call scope.FindVar(Input/
* Output("Name")) or ctx.Input<LoDTensor>().
* Firstly this macro check whether the obtained pointer is null,
* and then return data if it is not null.
*
* Note: This macro is only suitable for specific scenarios and
* does not intended to be widely used. If it cannot meet the
* requirements, please use other PADDLE_ENFORCE** check macro.
*
* Parameters:
*     __PTR: pointer
* __ROLE: (string), Input or Output
* __NAME: (string), Input or Output name
* __OP_TYPE: (string), the op type
*
* Return: The data pointed to by the pointer.
*
* Examples:
* GET_DATA_SAFELY(ctx.Input<LoDTensor>("X"), "Input", "X", "Mul");
*/
#define GET_DATA_SAFELY(__PTR, __ROLE, __NAME, __OP_TYPE) \
(([&]() -> std::add_lvalue_reference<decltype(*(__PTR))>::type { \
auto* __ptr = (__PTR); \
if (UNLIKELY(nullptr == __ptr)) { \
auto __summary__ = phi::errors::NotFound( \
"Unable to get %s data of %s %s in operator %s. " \
"Possible reasons are:\n" \
" 1. The %s is not the %s of operator %s;\n" \
" 2. The %s has no corresponding variable passed in;\n" \
" 3. The %s corresponding variable is not initialized.", \
phi::demangle( \
typeid(std::add_lvalue_reference<decltype(*__ptr)>::type) \
.name()), \
__ROLE, \
__NAME, \
__OP_TYPE, \
__NAME, \
__ROLE, \
__OP_TYPE, \
__NAME, \
__NAME); \
auto __message__ = ::paddle::string::Sprintf( \
"%s\n [Hint: pointer " #__PTR " should not be null.]", \
__summary__.error_message()); \
__THROW_ERROR_INTERNAL__( \
phi::ErrorSummary(__summary__.code(), __message__)); \
} \
return *__ptr; \
})())
/*
* Summary: This macro is used to check whether op has specified
* Input or Output Variables. Because op's Input and Output
......
......@@ -282,7 +282,7 @@ void InitDevices(const std::vector<int> devices) {
}
}
#endif
platform::DeviceContextPool::Init(places);
platform::DeviceContextPool::Init(places, platform::EmplaceExternalContext);
#ifndef PADDLE_WITH_MKLDNN
platform::SetNumThreads(FLAGS_paddle_num_threads);
......
......@@ -27,7 +27,7 @@ TEST(InitDevices, CPU) {
!defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MLU)
InitDevices();
DeviceContextPool& pool = DeviceContextPool::Instance();
ASSERT_EQ(pool.size(), 1U);
ASSERT_EQ(pool.Size(), 1U);
#endif
}
......@@ -39,7 +39,7 @@ TEST(InitDevices, CUDA) {
int count = paddle::platform::GetGPUDeviceCount();
InitDevices();
DeviceContextPool& pool = DeviceContextPool::Instance();
ASSERT_EQ(pool.size(), 2U + static_cast<unsigned>(count));
ASSERT_EQ(pool.Size(), 2U + static_cast<unsigned>(count));
#endif
}
......@@ -51,7 +51,7 @@ TEST(InitDevices, XPU) {
int count = paddle::platform::GetXPUDeviceCount();
InitDevices();
DeviceContextPool& pool = DeviceContextPool::Instance();
ASSERT_EQ(pool.size(), 1U + static_cast<unsigned>(count));
ASSERT_EQ(pool.Size(), 1U + static_cast<unsigned>(count));
#endif
}
......@@ -63,7 +63,7 @@ TEST(InitDevices, MLU) {
int count = paddle::platform::GetMLUDeviceCount();
InitDevices();
DeviceContextPool& pool = DeviceContextPool::Instance();
ASSERT_EQ(pool.size(), 1U + static_cast<unsigned>(count));
ASSERT_EQ(pool.Size(), 1U + static_cast<unsigned>(count));
#endif
}
......
......@@ -2526,10 +2526,10 @@ All parameter, weight, gradient are variables in Paddle.
m.def("disable_op_info_recorder", &phi::DisableOpInfoRecorder);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
m.def("set_cublas_switch", platform::SetAllowTF32Cublas);
m.def("get_cublas_switch", platform::AllowTF32Cublas);
m.def("set_cudnn_switch", platform::SetAllowTF32Cudnn);
m.def("get_cudnn_switch", platform::AllowTF32Cudnn);
m.def("set_cublas_switch", phi::SetAllowTF32Cublas);
m.def("get_cublas_switch", phi::AllowTF32Cublas);
m.def("set_cudnn_switch", phi::SetAllowTF32Cudnn);
m.def("get_cudnn_switch", phi::AllowTF32Cudnn);
#endif // PADDLE_WITH_CUDA
m.def("clear_executor_cache", []() {
pybind11::gil_scoped_release release;
......
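The pybind hunk above only re-points the Python bindings; the TF32 switches themselves now live in phi (see the new paddle/phi/backends/context_pool.{h,cc} later in this diff). A hedged C++ sketch of the relocated API, assuming a CUDA or HIP build:

```cpp
#include "paddle/phi/backends/context_pool.h"

// Returns true if TF32 is still enabled for either library after disabling.
bool DisableTF32ForDebugging() {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  // Formerly paddle::platform::SetAllowTF32Cublas / SetAllowTF32Cudnn.
  phi::SetAllowTF32Cublas(false);
  phi::SetAllowTF32Cudnn(false);
  return phi::AllowTF32Cublas() || phi::AllowTF32Cudnn();
#else
  return false;
#endif
}
```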
......@@ -91,7 +91,7 @@ Tensor add_n_impl(const std::vector<Tensor>& x) {
phi::AddNInferMeta(x_metas, &meta_out);
using kernel_signature =
void (*)(const platform::DeviceContext&,
void (*)(const phi::DeviceContext&,
const std::vector<const phi::SelectedRows*>&,
phi::SelectedRows*);
auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
......@@ -119,7 +119,7 @@ Tensor add_n_impl(const std::vector<Tensor>& x) {
phi::AddNInferMeta(x_metas, &meta_out);
using kernel_signature =
void (*)(const platform::DeviceContext&,
void (*)(const phi::DeviceContext&,
const std::vector<const phi::TensorBase*>&,
phi::DenseTensor*);
auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
......@@ -177,7 +177,7 @@ void embedding_grad_impl(const Tensor& x,
meta_out.set_dtype(input_weight->dtype());
kernel_out->set_height(input_weight->dims()[0]);
using kernel_signature = void (*)(const platform::DeviceContext&,
using kernel_signature = void (*)(const phi::DeviceContext&,
const phi::DenseTensor&,
const phi::DenseTensor&,
const phi::DenseTensor&,
......@@ -194,7 +194,7 @@ void embedding_grad_impl(const Tensor& x,
auto* kernel_out = SetKernelOutput(weight_grad);
phi::MetaTensor meta_out(kernel_out);
phi::UnchangedInferMeta(MakeMetaTensor(*input_weight), &meta_out);
using kernel_signature = void (*)(const platform::DeviceContext&,
using kernel_signature = void (*)(const phi::DeviceContext&,
const phi::DenseTensor&,
const phi::DenseTensor&,
const phi::DenseTensor&,
......@@ -229,7 +229,7 @@ void embedding_grad_impl(const Tensor& x,
auto* kernel_out = SetSelectedRowsKernelOutput(weight_grad);
phi::MetaTensor meta_out(kernel_out);
phi::UnchangedInferMeta(MakeMetaTensor(*input_weight), &meta_out);
using kernel_signature = void (*)(const platform::DeviceContext&,
using kernel_signature = void (*)(const phi::DeviceContext&,
const phi::DenseTensor&,
const phi::SelectedRows&,
const phi::DenseTensor&,
......@@ -247,7 +247,7 @@ void embedding_grad_impl(const Tensor& x,
phi::MetaTensor meta_out(kernel_out);
meta_out.set_dims(input_weight->GetCompleteDims());
meta_out.set_dtype(input_weight->dtype());
using kernel_signature = void (*)(const platform::DeviceContext&,
using kernel_signature = void (*)(const phi::DeviceContext&,
const phi::DenseTensor&,
const phi::SelectedRows&,
const phi::DenseTensor&,
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#include "paddle/phi/api/include/context_pool.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/core/allocator.h"
#include "paddle/phi/core/enforce.h"
......@@ -35,11 +35,11 @@ DeviceContextPool& DeviceContextPool::Instance() {
const phi::DeviceContext* DeviceContextPool::Get(const Place& place) {
auto it = context_map_.find(place);
if (it == context_map_.end()) {
if (!paddle::platform::DeviceContextPool::IsInitialized()) {
if (!phi::DeviceContextPool::IsInitialized()) {
paddle::framework::InitDevices();
}
// only when we need the specific DeviceContext, get and cache it
auto* dev_ctx = paddle::platform::DeviceContextPool::Instance().Get(place);
auto* dev_ctx = phi::DeviceContextPool::Instance().Get(place);
{
std::lock_guard<std::mutex> lock(mutex_);
context_map_[place] = dev_ctx;
......
......@@ -16,7 +16,7 @@ limitations under the License. */
#include "paddle/phi/api/lib/kernel_dispatch.h"
#include "paddle/phi/api/lib/utils/allocator.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/cast_kernel.h"
......@@ -33,7 +33,7 @@ inline bool NeedTransformDataType(const DataType& input,
target == DataType::COMPLEX64 || target == DataType::COMPLEX128);
}
inline bool NeedTransformPlace(const paddle::platform::Place& input,
inline bool NeedTransformPlace(const phi::Place& input,
const Backend& target,
const TransformFlag& transform_flag) {
// NOTE(dev): The default value of TransformFlag is True, if it is set with
......@@ -52,12 +52,12 @@ inline bool NeedTransformPlace(const paddle::platform::Place& input,
inline bool NeedTransformLayout(const DataLayout& input,
const DataLayout& target,
const paddle::platform::Place& place,
const phi::Place& place,
const TransformFlag& transform_flag) {
bool ret = transform_flag.need_trans_layout() &&
(input != DataLayout::ALL_LAYOUT &&
target != DataLayout::ALL_LAYOUT && input != target);
if (platform::is_gpu_place(place)) {
if (place.GetType() == phi::AllocationType::GPU) {
return false;
}
return ret;
......@@ -65,10 +65,10 @@ inline bool NeedTransformLayout(const DataLayout& input,
inline phi::DenseTensor TransDataLayout(const phi::DenseTensor& tensor,
DataLayout layout) {
auto& pool = paddle::platform::DeviceContextPool::Instance();
auto& pool = phi::DeviceContextPool::Instance();
VLOG(3) << "DataLayoutTransform src_layout: " << tensor.layout()
<< " dst_layout: " << layout;
if (platform::is_cpu_place(tensor.place())) {
if (tensor.place().GetType() == phi::AllocationType::CPU) {
auto* dev_ctx = static_cast<phi::CPUContext*>(pool.Get(tensor.place()));
return phi::TransferLayout(*dev_ctx, tensor, layout);
} else {
......@@ -139,7 +139,7 @@ phi::DenseTensor CastDataType(const phi::GPUContext& dev_ctx,
inline phi::DenseTensor TransDataType(const phi::DenseTensor& tensor,
DataType dtype) {
auto& pool = paddle::platform::DeviceContextPool::Instance();
auto& pool = phi::DeviceContextPool::Instance();
VLOG(3) << "DataTypeTransform src_dtype: " << tensor.dtype()
<< " dst_dtype: " << dtype;
......@@ -147,11 +147,11 @@ inline phi::DenseTensor TransDataType(const phi::DenseTensor& tensor,
DefaultAllocator alloc(tensor.place());
phi::DenseTensor out(&alloc, {dtype, tensor.dims(), tensor.layout()});
if (platform::is_cpu_place(tensor.place())) {
if (tensor.place().GetType() == phi::AllocationType::CPU) {
auto* dev_ctx = static_cast<phi::CPUContext*>(pool.Get(tensor.place()));
return CastDataType(*dev_ctx, tensor, dtype);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
} else if (platform::is_gpu_place(tensor.place())) {
} else if (tensor.place().GetType() == phi::AllocationType::GPU) {
auto* dev_ctx = static_cast<phi::GPUContext*>(pool.Get(tensor.place()));
return CastDataType(*dev_ctx, tensor, dtype);
#endif
......@@ -170,7 +170,7 @@ inline phi::DenseTensor TransDataPlace(const phi::DenseTensor& tensor,
auto& pool = phi::DeviceContextPool::Instance();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// NOTE(yy): TransDataPlace should wait for computation of input.
if (!platform::is_cuda_pinned_place(tensor.place())) {
if (tensor.place().GetType() != phi::AllocationType::GPUPINNED) {
pool.Get(tensor.place())->Wait();
pool.Get(dst_place)->Wait();
}
......
......@@ -112,7 +112,7 @@ DataType ParseDataType(const std::vector<Tensor>& tensors) {
auto n = tensors.size();
for (size_t i = 1; i < n; ++i) {
if (tensors[i].type() != dtype) {
PADDLE_THROW(platform::errors::InvalidArgument(
PADDLE_THROW(phi::errors::InvalidArgument(
"The data_type of input tensor in list isn't consistent, "
"the first tensor is %s, but %dth tensor is %s.",
dtype,
......
......@@ -872,7 +872,7 @@ PADDLE_API {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_d
sr_out_trans_map = {'Tensor': 'phi::SelectedRows*'}
input_names = self.inputs['names']
input_infos = self.inputs['input_info']
kernel_args_type_list = ['const platform::DeviceContext&']
kernel_args_type_list = ['const phi::DeviceContext&']
attr_names = self.attrs['names']
kernel_param = self.kernel['param']
......
......@@ -127,7 +127,7 @@ class StringsAPI(ForwardAPI):
}
input_names = self.inputs['names']
input_infos = self.inputs['input_info']
kernel_args_type_list = ['const platform::DeviceContext&']
kernel_args_type_list = ['const phi::DeviceContext&']
attr_names = self.attrs['names']
kernel_param = self.kernel['param']
......
......@@ -3,6 +3,7 @@ add_subdirectory(gpu)
set(BACKENDS_SRCS all_context.cc cpu/cpu_context.cc cpu/cpu_info.cc)
set(BACKENDS_DEPS enforce place flags eigen3 phi_device_context)
set(BACKENDS_DEPS allocator generator)
if(WITH_XBYAK)
list(APPEND BACKENDS_DEPS xbyak)
endif()
......@@ -45,7 +46,8 @@ list(
stream.cc
event.cc
device_base.cc
device_manager.cc)
device_manager.cc
context_pool.cc)
if(WITH_CUSTOM_DEVICE)
list(APPEND BACKENDS_SRCS custom/custom_context.cc custom/custom_device.cc
......@@ -54,7 +56,6 @@ endif()
add_library(phi_backends "${BACKENDS_SRCS}")
target_link_libraries(phi_backends ${BACKENDS_DEPS})
add_dependencies(phi_backends eigen3)
# for inference library
get_property(phi_modules GLOBAL PROPERTY PHI_MODULES)
......
......@@ -26,11 +26,4 @@ limitations under the License. */
#include "paddle/phi/backends/onednn/onednn_context.h"
#include "paddle/phi/backends/xpu/xpu_context.h"
#ifndef PADDLE_WITH_CUSTOM_KERNEL
// TODO(wilber): DeviceContextPool nees include fluid file.
#include "paddle/fluid/platform/device_context.h"
namespace phi {
using DeviceContextPool = paddle::platform::DeviceContextPool;
} // namespace phi
#endif
namespace phi {} // namespace phi
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/context_pool_utils.h"
namespace phi {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
bool allow_tf32_cublas = true;
void SetAllowTF32Cublas(bool active) { allow_tf32_cublas = active; }
bool AllowTF32Cublas() { return allow_tf32_cublas; }
bool allow_tf32_cudnn = true;
void SetAllowTF32Cudnn(bool active) { allow_tf32_cudnn = active; }
bool AllowTF32Cudnn() { return allow_tf32_cudnn; }
#endif // PADDLE_WITH_CUDA
static DeviceContextPool* pool = nullptr;
DeviceContextPool& DeviceContextPool::Instance() {
PADDLE_ENFORCE_NOT_NULL(pool,
phi::errors::PreconditionNotMet(
"Need to Create DeviceContextPool firstly!"));
return *pool;
}
EmplaceExternalContextFunc DeviceContextPool::emplace_external_context_func_ =
nullptr;
/*! \brief Create should only called by Init function */
DeviceContextPool& DeviceContextPool::Init(
const std::vector<phi::Place>& places, EmplaceExternalContextFunc func) {
emplace_external_context_func_ = func;
if (pool == nullptr) {
pool = new DeviceContextPool(places);
}
return *pool;
}
bool DeviceContextPool::IsInitialized() { return pool != nullptr; }
void DeviceContextPool::SetPool(DeviceContextPool* dev_pool) {
pool = dev_pool;
}
thread_local const std::map<Place,
std::shared_future<std::unique_ptr<DeviceContext>>>*
DeviceContextPool::external_device_contexts_ = nullptr;
phi::DeviceContext* DeviceContextPool::Get(const phi::Place& place) {
VLOG(6) << "DeviceContextPool Get: " << place;
const std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
ptr;
if (external_device_contexts_ && external_device_contexts_->count(place)) {
ptr = external_device_contexts_;
} else {
ptr = &device_contexts_;
}
auto it = ptr->find(place);
if (it == ptr->end()) {
PADDLE_THROW(phi::errors::Unimplemented(
"Place %s is not supported. Please check that your paddle compiles "
"with WITH_GPU, WITH_XPU, WITH_IPU, WITH_MLU or WITH_ASCEND_CL option "
"or check "
"that your train process set the correct device id if you use "
"Executor.",
place));
}
return it->second.get().get();
}
size_t DeviceContextPool::Size() const {
if (external_device_contexts_) {
return external_device_contexts_->size();
}
return device_contexts_.size();
}
const std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>&
DeviceContextPool::device_contexts() const {
if (external_device_contexts_) {
return *external_device_contexts_;
}
return device_contexts_;
}
void DeviceContextPool::SetDeviceContexts(
const std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
dev_ctxs) {
external_device_contexts_ = dev_ctxs;
}
inline void EmplaceNativeContext(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const phi::Place& place,
bool disable_setting_default_stream_for_allocator,
int stream_priority) {
if (place.GetType() == phi::AllocationType::CPU) {
#ifdef PADDLE_WITH_MKLDNN
EmplaceDeviceContext<phi::OneDNNContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
EmplaceDeviceContext<phi::CPUContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#endif
} else if (place.GetType() == phi::AllocationType::GPU) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
EmplaceDeviceContext<phi::GPUContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
stream_priority);
#else
PADDLE_THROW(
phi::errors::Unimplemented("GPUPlace is not supported. Please "
"re-compile with WITH_GPU option."));
#endif
} else if (place.GetType() == phi::AllocationType::XPU) {
#ifdef PADDLE_WITH_XPU
EmplaceDeviceContext<phi::XPUContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(
phi::errors::Unimplemented("XPUPlace is not supported. Please "
"re-compile with WITH_XPU option."));
#endif
} else if (place.GetType() == phi::AllocationType::CUSTOM) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
EmplaceDeviceContext<phi::CustomContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(phi::errors::Unimplemented(
"CustomPlace is not supported. Please re-compile with "
"WITH_CUSTOM_DEVICE "
"option."));
#endif
}
}
void EmplaceDeviceContexts(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const std::vector<phi::Place>& places,
bool disable_setting_default_stream_for_allocator,
int stream_priority,
EmplaceExternalContextFunc emplace_external_context_func) {
PADDLE_ENFORCE_GT(
places.size(),
0,
phi::errors::InvalidArgument("The number of platform places should "
"be larger than 0. But received %d.",
places.size()));
std::set<Place> set;
for (auto& p : places) {
set.insert(p);
}
for (auto& p : set) {
EmplaceNativeContext(place_to_device_context,
p,
disable_setting_default_stream_for_allocator,
stream_priority);
if (emplace_external_context_func != nullptr) {
(*emplace_external_context_func)(
place_to_device_context,
p,
disable_setting_default_stream_for_allocator,
stream_priority);
}
}
}
DeviceContextPool::DeviceContextPool(const std::vector<phi::Place>& places) {
EmplaceDeviceContexts(&device_contexts_,
places,
/*disable_setting_default_stream_for_allocator=*/false,
/*stream_priority=*/0,
emplace_external_context_func_);
}
} // namespace phi
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <future> // NOLINT
#include <map>
#include <memory>
#include <mutex> // NOLINT
#include <set>
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/device_context.h"
#include "paddle/phi/core/macros.h"
namespace phi {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void SetAllowTF32Cublas(bool active);
/*Get the global variable allow_tf32_cublas value*/
bool AllowTF32Cublas();
extern bool allow_tf32_cudnn;
/*Set the value of the global variable allow_tf32_cudnn*/
void SetAllowTF32Cudnn(bool active);
/*Get the global variable allow_tf32_cudnn value*/
bool AllowTF32Cudnn();
#endif // PADDLE_WITH_CUDA
template <typename Place>
struct DefaultDeviceContextType;
template <>
struct DefaultDeviceContextType<phi::CPUPlace> {
using TYPE = phi::CPUContext;
};
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <>
struct DefaultDeviceContextType<phi::GPUPlace> {
using TYPE = phi::GPUContext;
};
#endif
#ifdef PADDLE_WITH_XPU
template <>
struct DefaultDeviceContextType<phi::XPUPlace> {
using TYPE = phi::XPUContext;
};
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
template <>
struct DefaultDeviceContextType<phi::CustomPlace> {
using TYPE = phi::CustomContext;
};
#else
template <>
struct DefaultDeviceContextType<phi::CustomPlace> {
using TYPE = DeviceContext;
};
#endif
using EmplaceExternalContextFunc = void (*)(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*,
const phi::Place&,
bool,
int);
void EmplaceDeviceContexts(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const std::vector<phi::Place>& places,
bool disable_setting_default_stream_for_allocator,
int stream_priority,
EmplaceExternalContextFunc emplace_external_context_func = nullptr);
/*! \brief device context pool singleton */
class DeviceContextPool {
public:
static DeviceContextPool& Instance();
/*! \brief Create should only called by Init function */
static DeviceContextPool& Init(const std::vector<phi::Place>& places,
EmplaceExternalContextFunc func = nullptr);
static bool IsInitialized();
static void SetPool(DeviceContextPool* dev_pool);
/*! \brief Return handle of single device context. */
phi::DeviceContext* Get(const phi::Place& place);
template <typename Place>
const typename DefaultDeviceContextType<Place>::TYPE* GetByPlace(
const Place& place) {
return reinterpret_cast<
const typename DefaultDeviceContextType<Place>::TYPE*>(Get(place));
}
size_t Size() const;
const std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>&
device_contexts() const;
static void SetDeviceContexts(
const std::map<Place,
std::shared_future<std::unique_ptr<DeviceContext>>>*);
private:
explicit DeviceContextPool(const std::vector<phi::Place>& places);
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>
device_contexts_;
static thread_local const std::
map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
external_device_contexts_; // not owned
static EmplaceExternalContextFunc emplace_external_context_func_;
DISABLE_COPY_AND_ASSIGN(DeviceContextPool);
};
} // namespace phi
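A short usage sketch (editor's addition) of the typed lookup declared in this new header: `GetByPlace` resolves the concrete context type through the `DefaultDeviceContextType` trait, so callers get a `phi::CPUContext*` or `phi::GPUContext*` without a manual cast. It assumes the pool has already been initialized via `Init()`.

```cpp
#include "paddle/phi/backends/context_pool.h"

const phi::CPUContext* GetCpuContext() {
  auto& pool = phi::DeviceContextPool::Instance();
  // DefaultDeviceContextType<phi::CPUPlace>::TYPE is phi::CPUContext,
  // so no static_cast is needed at the call site.
  return pool.GetByPlace(phi::CPUPlace());
}
```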
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/phi/backends/all_context.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/phi/core/generator.h"
namespace phi {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <typename DevCtx>
typename std::enable_if<!std::is_same<DevCtx, phi::GPUContext>::value,
DevCtx*>::type
ConstructDevCtx(const phi::Place& p, /*unused*/ int stream_priority = 0) {
return new DevCtx(p);
}
template <typename DevCtx>
typename std::enable_if<std::is_same<DevCtx, phi::GPUContext>::value,
DevCtx*>::type
ConstructDevCtx(const phi::Place& p, int stream_priority) {
return new DevCtx(p, /*init=*/true, stream_priority);
}
#else
template <typename DevCtx>
DevCtx* ConstructDevCtx(const phi::Place& p,
/*unused*/ int stream_priority) {
return new DevCtx(p);
}
#endif
template <typename DevCtx>
inline std::unique_ptr<DeviceContext> CreateDeviceContext(
const phi::Place& p,
bool disable_setting_default_stream_for_allocator,
int stream_priority) {
using PtrType = std::unique_ptr<DeviceContext>;
DevCtx* dev_ctx = ConstructDevCtx<DevCtx>(p, stream_priority);
if (p.GetType() == phi::AllocationType::GPU) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto* cuda_ctx = dynamic_cast<phi::GPUContext*>(dev_ctx);
PADDLE_ENFORCE_NOT_NULL(
cuda_ctx,
phi::errors::InvalidArgument(
"Failed to dynamic_cast dev_ctx into phi::GPUContext."));
auto& instance = paddle::memory::allocation::AllocatorFacade::Instance();
if (!disable_setting_default_stream_for_allocator) {
instance.SetDefaultStream(GPUPlace(p.GetDeviceId()), cuda_ctx->stream());
}
dev_ctx->SetAllocator(instance.GetAllocator(p, cuda_ctx->stream()).get());
dev_ctx->SetPinnedAllocator(
instance.GetAllocator(phi::GPUPinnedPlace()).get());
cuda_ctx->PartialInitWithAllocator();
dev_ctx->SetGenerator(phi::DefaultCUDAGenerator(p.GetDeviceId()).get());
#endif
} else if (p.GetType() == phi::AllocationType::XPU) {
#if defined(PADDLE_WITH_XPU)
dev_ctx->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(p)
.get());
dev_ctx->SetGenerator(phi::DefaultXPUGenerator(p.GetDeviceId()).get());
#endif
} else {
dev_ctx->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(p)
.get());
dev_ctx->SetGenerator(phi::DefaultCPUGenerator().get());
}
dev_ctx->SetHostGenerator(phi::DefaultCPUGenerator().get());
dev_ctx->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(phi::CPUPlace())
.get());
dev_ctx->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(p)
.get());
dev_ctx->SetHostZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(phi::CPUPlace())
.get());
return PtrType(dev_ctx);
}
template <typename DevCtx>
inline void EmplaceDeviceContext(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const phi::Place& place,
bool disable_setting_default_stream_for_allocator,
int stream_priority) {
// lazy evaluation. i.e., only create device context at first `Get`
place_to_device_context->emplace(
place,
std::async(std::launch::deferred,
CreateDeviceContext<DevCtx>,
place,
disable_setting_default_stream_for_allocator,
stream_priority));
}
} // namespace phi
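The helper above is what both phi's native path and a framework-side callback can build on. Below is a hedged sketch of an external emplace function in the shape fluid's `EmplaceExternalContext` uses; `MyPinnedContext` is a hypothetical stand-in for a framework-only context such as `CUDAPinnedDeviceContext`, not a class from this commit.

```cpp
#include <future>
#include <map>
#include <memory>

#include "paddle/phi/backends/context_pool_utils.h"

// Hypothetical framework-only context; real examples on the fluid side of this
// diff are CUDAPinnedDeviceContext, MLUDeviceContext, IPUDeviceContext, etc.
class MyPinnedContext : public phi::DeviceContext {
 public:
  explicit MyPinnedContext(const phi::Place& place) : place_(place) {}
  const phi::Place& GetPlace() const override { return place_; }

 private:
  phi::Place place_;
};

void MyEmplaceExternalContext(
    std::map<phi::Place,
             std::shared_future<std::unique_ptr<phi::DeviceContext>>>* ctxs,
    const phi::Place& place,
    bool disable_setting_default_stream_for_allocator,
    int stream_priority) {
  if (place.GetType() == phi::AllocationType::GPUPINNED) {
    // Reuse phi's generic helper: it defers construction until the first
    // Get() and wires up allocators/generators in CreateDeviceContext above.
    phi::EmplaceDeviceContext<MyPinnedContext>(
        ctxs,
        place,
        disable_setting_default_stream_for_allocator,
        /*unused*/ stream_priority);
  }
}
```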
......@@ -66,8 +66,10 @@ void CustomContext::SetStream(std::shared_ptr<phi::stream::Stream> stream) {
void CustomContext::Wait() const { return impl_->Wait(); }
CustomContext::CustomContext(const CustomPlace& place)
: DeviceContext(), impl_(std::make_unique<Impl>(place)) {}
: DeviceContext(), impl_(std::make_unique<Impl>(place)) {
impl_->Init();
}
CustomContext::~CustomContext() {}
CustomContext::~CustomContext() { impl_->Init(); }
} // namespace phi
......@@ -20,6 +20,10 @@ limitations under the License. */
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/device_context.h"
namespace Eigen {
struct DefaultDevice;
} // namespace Eigen
namespace phi {
class CustomContext : public DeviceContext,
......@@ -42,6 +46,15 @@ class CustomContext : public DeviceContext,
// Wait for all operations completion in the stream.
void Wait() const override;
template <typename Callback>
void AddStreamCallback(Callback&& callback) const {
return GetStream()->AddCallback(callback);
}
void WaitStreamCallback() const { return GetStream()->WaitCallback(); }
Eigen::DefaultDevice* eigen_device() const { return nullptr; }
static const char* name() { return "CustomContext"; }
public:
......
......@@ -12,9 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/profiler/trace_event_collector.h"
#include "paddle/phi/backends/callback_manager.h"
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/custom/enforce_custom.h"
#include "paddle/phi/backends/device_base.h"
#include "paddle/phi/backends/device_guard.h"
......@@ -285,8 +285,7 @@ class CustomDevice : public DeviceInterface {
PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
pimpl_->async_memory_copy_h2d(device, c_stream, dst, src, size));
} else if (pimpl_->memory_copy_h2d) {
paddle::platform::DeviceContextPool& pool =
paddle::platform::DeviceContextPool::Instance();
phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
pool.Get(place)->Wait();
PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
pimpl_->memory_copy_h2d(device, dst, src, size));
......@@ -306,8 +305,7 @@ class CustomDevice : public DeviceInterface {
PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
pimpl_->async_memory_copy_d2h(device, c_stream, dst, src, size));
} else if (pimpl_->memory_copy_d2h) {
paddle::platform::DeviceContextPool& pool =
paddle::platform::DeviceContextPool::Instance();
phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
pool.Get(place)->Wait();
PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
pimpl_->memory_copy_d2h(device, dst, src, size));
......@@ -327,8 +325,7 @@ class CustomDevice : public DeviceInterface {
PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
pimpl_->async_memory_copy_d2d(device, c_stream, dst, src, size));
} else if (pimpl_->memory_copy_d2d) {
paddle::platform::DeviceContextPool& pool =
paddle::platform::DeviceContextPool::Instance();
phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
pool.Get(place)->Wait();
PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
pimpl_->memory_copy_d2d(device, dst, src, size));
......@@ -364,8 +361,7 @@ class CustomDevice : public DeviceInterface {
MemoryCopyH2D(dst_dev_id, dst, tmp.get(), size);
} else {
auto src_place = CustomPlace(Type(), src_dev_id);
paddle::platform::DeviceContextPool& pool =
paddle::platform::DeviceContextPool::Instance();
phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
pool.Get(src_place)->Wait();
PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
pimpl_->memory_copy_p2p(dst_device, src_device, dst, src, size));
......
......@@ -18,7 +18,7 @@
#include <utility>
#ifdef PADDLE_WITH_CUDA
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/gpu/cuda/cuda_graph.h"
#include "paddle/phi/kernels/funcs/dropout_impl_util.h"
#endif
......
......@@ -18,7 +18,7 @@
#include "paddle/phi/core/enforce.h"
#include "paddle/utils/flat_hash_map.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/core/expect.h"
namespace phi {
......@@ -42,8 +42,7 @@ OneDNNContextThreadLocals::Body::~Body() {
auto cpu_place = phi::CPUPlace();
// TODO(YuanRisheng): we need remove the dependency on fluid device context
// here
paddle::platform::DeviceContextPool& pool =
paddle::platform::DeviceContextPool::Instance();
phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
OneDNNContext* dev_ctx = static_cast<OneDNNContext*>(pool.Get(cpu_place));
dev_ctx->ResetBlobMap(exec_ptr_);
}
......
......@@ -44,14 +44,14 @@ ccl::CCLComm GetCCLComm(const Place& place, int global_gid) {
return nullptr;
}
#endif
if (paddle::platform::is_gpu_place(place)) {
if (place.GetType() == phi::AllocationType::GPU) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
return static_cast<paddle::distributed::ProcessGroupNCCL*>(pg)->NCCLComm(
place);
#else
return nullptr;
#endif
} else if (paddle::platform::is_custom_place(place)) {
} else if (place.GetType() == phi::AllocationType::CUSTOM) {
#if defined(PADDLE_WITH_CUSTOM_DEVICE)
return static_cast<paddle::distributed::ProcessGroupCustom*>(pg)
->CustomCCLComm(place);
......
......@@ -125,10 +125,14 @@ struct XPUContext::Impl {
xpu::BKCLContext_t bkcl_context_{nullptr};
};
XPUContext::XPUContext() : DeviceContext(), impl_(std::make_unique<Impl>()) {}
XPUContext::XPUContext() : DeviceContext(), impl_(std::make_unique<Impl>()) {
impl_->Init();
}
XPUContext::XPUContext(const XPUPlace& place)
: DeviceContext(), impl_(std::make_unique<Impl>(place)) {}
: DeviceContext(), impl_(std::make_unique<Impl>(place)) {
impl_->Init();
}
XPUContext::~XPUContext() = default;
......
......@@ -24,6 +24,10 @@ limitations under the License. */
#include "paddle/phi/core/device_context.h"
#include "xpu/runtime.h"
namespace Eigen {
struct DefaultDevice;
} // namespace Eigen
namespace xpu = baidu::xpu::api;
namespace phi {
......@@ -65,6 +69,8 @@ class XPUContext : public DeviceContext,
void SetL3Cache(int l3_size = 14155776);
Eigen::DefaultDevice* eigen_device() const { return nullptr; }
XPUStream stream() const;
static const char* name() { return "XPUContext"; }
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#include "paddle/phi/common/int_array.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/tensor_utils.h"
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/enforce.h"
......
......@@ -118,7 +118,7 @@ cc_library(
cc_library(
mixed_vector
SRCS mixed_vector.cc
DEPS device_context place memory)
DEPS phi_backends place memory)
cc_library(
generator
......@@ -135,24 +135,20 @@ if(WITH_GPU)
nv_library(
phi_tensor_utils
SRCS tensor_utils.cc
DEPS phi_backends dense_tensor selected_rows memcpy device_context
memory_utils)
DEPS phi_backends dense_tensor selected_rows memcpy memory_utils)
elseif(WITH_ROCM)
hip_library(
phi_tensor_utils
SRCS tensor_utils.cc
DEPS phi_backends dense_tensor selected_rows memcpy device_context
memory_utils)
DEPS phi_backends dense_tensor selected_rows memcpy memory_utils)
elseif(WITH_XPU_KP)
xpu_library(
phi_tensor_utils
SRCS tensor_utils.cc
DEPS phi_backends dense_tensor selected_rows memcpy device_context
memory_utils)
DEPS phi_backends dense_tensor selected_rows memcpy memory_utils)
else()
cc_library(
phi_tensor_utils
SRCS tensor_utils.cc
DEPS dense_tensor selected_rows memcpy device_context phi_backends
memory_utils)
DEPS dense_tensor selected_rows memcpy phi_backends memory_utils)
endif()
......@@ -468,6 +468,59 @@ struct EnforceNotMet : public std::exception {
/** EXTENDED TOOL FUNCTIONS WITH CHECKING **/
/*
 * Summary: This macro is used to get the Variable or internal-type
 * data (such as LoDTensor or SelectedRows) of an Input or
 * Output in an op. It is generally used when calling scope.FindVar(Input/
 * Output("Name")) or ctx.Input<LoDTensor>().
 * First, this macro checks whether the obtained pointer is null,
 * and then returns the data if it is not.
 *
 * Note: This macro is only suitable for specific scenarios and
 * is not intended to be widely used. If it cannot meet the
 * requirements, please use the other PADDLE_ENFORCE** check macros.
*
* Parameters:
 * __PTR: pointer
* __ROLE: (string), Input or Output
* __NAME: (string), Input or Output name
* __OP_TYPE: (string), the op type
*
* Return: The data pointed to by the pointer.
*
* Examples:
* GET_DATA_SAFELY(ctx.Input<LoDTensor>("X"), "Input", "X", "Mul");
*/
#define GET_DATA_SAFELY(__PTR, __ROLE, __NAME, __OP_TYPE) \
(([&]() -> std::add_lvalue_reference<decltype(*(__PTR))>::type { \
auto* __ptr = (__PTR); \
if (UNLIKELY(nullptr == __ptr)) { \
auto __summary__ = phi::errors::NotFound( \
"Unable to get %s data of %s %s in operator %s. " \
"Possible reasons are:\n" \
" 1. The %s is not the %s of operator %s;\n" \
" 2. The %s has no corresponding variable passed in;\n" \
" 3. The %s corresponding variable is not initialized.", \
phi::demangle( \
typeid(std::add_lvalue_reference<decltype(*__ptr)>::type) \
.name()), \
__ROLE, \
__NAME, \
__OP_TYPE, \
__NAME, \
__ROLE, \
__OP_TYPE, \
__NAME, \
__NAME); \
auto __message__ = ::paddle::string::Sprintf( \
"%s\n [Hint: pointer " #__PTR " should not be null.]", \
__summary__.error_message()); \
__THROW_ERROR_INTERNAL__( \
phi::ErrorSummary(__summary__.code(), __message__)); \
} \
return *__ptr; \
})())
/*
 * Summary: The PADDLE_GET(_**) series of macros is used to call paddle::get
 * safely. paddle::get is not a completely safe API, although it will not
......
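A hedged usage sketch of the GET_DATA_SAFELY macro added above; the wrapper function, variable names, and op name are placeholders, and any pointer-returning accessor (e.g. ctx.Input<...>()) can be used as the first argument:

#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/enforce.h"

// Dereference an op input only after the macro's null check; a null pointer
// raises the NotFound error built inside the macro.
const phi::DenseTensor& GetInputX(const phi::DenseTensor* maybe_x) {
  return GET_DATA_SAFELY(maybe_x, "Input", "X", "my_op");
}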
......@@ -22,7 +22,7 @@ limitations under the License. */
#include <vector>
#include "glog/logging.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/utils/none.h"
#include "paddle/utils/optional.h"
......
......@@ -14,14 +14,13 @@ limitations under the License. */
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/fluid/platform/device_context.h"
namespace phi {
template <typename Context>
......
......@@ -62,10 +62,10 @@ void IndexSelectInner(const Context& ctx,
auto index_size = index.dims()[0];
DenseTensor index_cpu_copy;
if (!paddle::platform::is_cpu_place(index.place())) {
if (index.place().GetType() != phi::AllocationType::CPU) {
phi::Copy(ctx, index, phi::CPUPlace(), true, &index_cpu_copy);
}
const IndexT* index_data = paddle::platform::is_cpu_place(index.place())
const IndexT* index_data = index.place().GetType() == phi::AllocationType::CPU
? index.data<IndexT>()
: index_cpu_copy.data<IndexT>();
ctx.template Alloc<T>(output);
......
......@@ -160,5 +160,5 @@ PD_REGISTER_KERNEL(overlap_add_grad,
int64_t,
float,
double,
paddle::platform::complex<float>,
paddle::platform::complex<double>) {}
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
......@@ -146,5 +146,5 @@ PD_REGISTER_KERNEL(overlap_add,
int64_t,
float,
double,
paddle::platform::complex<float>,
paddle::platform::complex<double>) {}
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
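The complex dtypes in these registrations now come from phi rather than paddle::platform. A minimal usage sketch; the header path is an assumption, not shown in this diff:

#include "paddle/phi/common/complex.h"

// phi::dtype::complex<T> is the drop-in replacement used in the
// PD_REGISTER_KERNEL lists above.
phi::dtype::complex<float> MakeUnitImaginary() {
  return phi::dtype::complex<float>(0.0f, 1.0f);
}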
......@@ -33,7 +33,7 @@ void PutAlongAxisGradKernel(const Context& dev_ctx,
DenseTensor* x_grad,
DenseTensor* value_grad) {
PADDLE_ENFORCE_EQ(
paddle::platform::is_cpu_place(dev_ctx.GetPlace()),
dev_ctx.GetPlace().GetType() == phi::AllocationType::CPU,
true,
errors::PreconditionNotMet("PutAlongAxisGradOpKernel only runs on CPU."));
......
......@@ -32,7 +32,7 @@ void PutAlongAxisKernel(const Context& dev_ctx,
const std::string& reduce,
DenseTensor* out) {
PADDLE_ENFORCE_EQ(
paddle::platform::is_cpu_place(dev_ctx.GetPlace()),
dev_ctx.GetPlace().GetType() == phi::AllocationType::CPU,
true,
errors::PreconditionNotMet("PutAlongAxisOpKernel only runs on CPU."));
......
......@@ -62,7 +62,7 @@ void RepeatInterleaveWithTensorIndexGradKernel(
paddle::framework::DataTypeToString(
paddle::framework::proto::VarType::INT64)));
paddle::platform::DeviceContextPool::Instance().Get(repeats_tensor.place());
phi::DeviceContextPool::Instance().Get(repeats_tensor.place());
if (index_type == paddle::framework::proto::VarType::INT32) {
phi::funcs::RepeatsTensor2IndexTensor<Context, int>(
ctx, repeats_tensor, &index);
......
......@@ -30,7 +30,7 @@ void TakeAlongAxisGradKernel(const Context& dev_ctx,
int axis,
DenseTensor* x_grad) {
PADDLE_ENFORCE_EQ(
paddle::platform::is_cpu_place(dev_ctx.GetPlace()),
dev_ctx.GetPlace().GetType() == phi::AllocationType::CPU,
true,
errors::PreconditionNotMet("This kernel only runs on CPU."));
......
......@@ -29,7 +29,7 @@ void TakeAlongAxisKernel(const Context& dev_ctx,
int axis,
DenseTensor* out) {
PADDLE_ENFORCE_EQ(
paddle::platform::is_cpu_place(dev_ctx.GetPlace()),
dev_ctx.GetPlace().GetType() == phi::AllocationType::CPU,
true,
errors::PreconditionNotMet("This kernel only runs on CPU."));
......
cc_library(
blas
SRCS blas.cc
DEPS cblas framework_proto device_context)
DEPS cblas framework_proto phi_backends)
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h"
......
......@@ -16,7 +16,7 @@
#include "glog/logging.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/onednn/onednn_context.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/layout.h"
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#pragma once
#include <type_traits>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/kernels/funcs/detail/activation_functions.h"
#include "paddle/phi/kernels/funcs/gru_compute.h"
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#pragma once
#include <type_traits>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/kernels/funcs/detail/activation_functions.h"
#include "paddle/phi/kernels/funcs/lstm_compute.h"
......@@ -218,7 +218,7 @@ __global__ void KeLstmBackward(Op op,
}
template <class T, class Op>
void gpu_lstm_forward(const paddle::platform::DeviceContext& context,
void gpu_lstm_forward(const phi::DeviceContext& context,
Op op,
phi::funcs::LstmMetaValue<T> value,
int frame_size,
......@@ -269,7 +269,7 @@ void gpu_lstm_forward(const paddle::platform::DeviceContext& context,
}
template <class T, class Op>
void gpu_lstm_backward(const paddle::platform::DeviceContext& context,
void gpu_lstm_backward(const phi::DeviceContext& context,
Op op,
phi::funcs::LstmMetaValue<T> value,
phi::funcs::LstmMetaGrad<T> grad,
......
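These LSTM helpers now accept the base phi::DeviceContext. A minimal sketch of that pattern for GPU builds; the function name is illustrative, and the downcast assumes the caller passes a GPU context, as the surrounding code does:

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/device_context.h"

// Helpers take the polymorphic base class and downcast only where a
// backend-specific handle (here the CUDA/HIP stream) is required.
void LaunchOnGpu(const phi::DeviceContext& context) {
  auto stream = static_cast<const phi::GPUContext&>(context).stream();
  (void)stream;  // real helpers launch their kernels on this stream
}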
......@@ -35,8 +35,8 @@ inline std::vector<size_t> GetLodFromRoisNum(const Context& dev_ctx,
std::vector<size_t> rois_lod;
auto* rois_num_data = rois_num->data<int>();
DenseTensor cpu_tensor;
if (paddle::platform::is_gpu_place(rois_num->place()) ||
paddle::platform::is_xpu_place(rois_num->place())) {
if (rois_num->place().GetType() == phi::AllocationType::GPU ||
rois_num->place().GetType() == phi::AllocationType::XPU) {
Copy<Context>(dev_ctx, *rois_num, phi::CPUPlace(), true, &cpu_tensor);
rois_num_data = cpu_tensor.data<int>();
}
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#pragma once
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/dense_tensor.h"
......@@ -986,7 +986,7 @@ static void ElemwiseGradBroadcast1CUDA(gpuStream_t stream,
dim3 grid_size = dim3((w + BLOCK_X - 1) / BLOCK_X);
auto gplace = phi::GPUPlace(phi::backends::gpu::GetCurrentDeviceId());
auto *ctx = static_cast<GPUContext *>(
paddle::platform::DeviceContextPool::Instance().Get(gplace));
phi::DeviceContextPool::Instance().Get(gplace));
phi::backends::gpu::LimitGridDim(*ctx, &grid_size);
FastElemwiseGradBroadcast1CUDAKernel<<<grid_size, block_size, 0, stream>>>(
x, y, out, dout, h, w, is_xsize_larger, dx_op, dy_op, dx, dy);
......@@ -1010,8 +1010,8 @@ static void ElemwiseGradBroadcast2CUDA(gpuStream_t stream,
int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, pre * post);
dim3 grid_size = dim3(n);
auto gplace = phi::GPUPlace(phi::backends::gpu::GetCurrentDeviceId());
auto *ctx = static_cast<GPUContext *>(
paddle::platform::DeviceContextPool::Instance().Get(gplace));
auto *ctx =
static_cast<GPUContext *>(phi::DeviceContextPool::Instance().Get(gplace));
phi::backends::gpu::LimitGridDim(*ctx, &grid_size);
ElemwiseGradBroadcast2CUDAKernel<<<grid_size, block_size, 0, stream>>>(
x, y, out, dout, pre, n, post, is_xsize_larger, dx_op, dy_op, dx, dy);
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/fc_functor.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/jit/kernels.h"
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#include <algorithm>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/kernels/funcs/aligned_vector.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/fc_functor.h"
......
......@@ -16,7 +16,7 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/all_context.h"
namespace phi {
namespace funcs {
......
......@@ -9,12 +9,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <paddle/fluid/platform/device_context.h>
#include "paddle/phi/kernels/funcs/gru_compute.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h"
#include "paddle/phi/kernels/funcs/detail/gru_kernel.h"
#include "paddle/phi/kernels/funcs/gru_compute.h"
namespace phi {
namespace funcs {
......
......@@ -11,7 +11,7 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/kernels/funcs/detail/activation_functions.h"
......
......@@ -14,6 +14,7 @@
#pragma once
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/common/layout.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
......@@ -122,20 +123,20 @@ inline std::vector<T> get_new_data_from_tensor(
DenseTensor cpu_starts_tensor;
auto& pool = phi::DeviceContextPool::Instance();
phi::DeviceContext* dev_ctx = pool.Get(new_data_tensor->place());
if (paddle::platform::is_gpu_place(new_data_tensor->place())) {
if (new_data_tensor->place().GetType() == phi::AllocationType::GPU) {
phi::Copy(
*dev_ctx, *new_data_tensor, phi::CPUPlace(), true, &cpu_starts_tensor);
new_data = cpu_starts_tensor.data<T>();
}
#ifdef PADDLE_WITH_ASCEND_CL
if (paddle::platform::is_npu_place(new_data_tensor->place())) {
if (new_data_tensor->place().GetType() == phi::AllocationType::NPU) {
phi::Copy(
*dev_ctx, *new_data_tensor, phi::CPUPlace(), true, &cpu_starts_tensor);
new_data = cpu_starts_tensor.data<T>();
}
#endif
#ifdef PADDLE_WITH_XPU
if (paddle::platform::is_xpu_place(new_data_tensor->place())) {
if (new_data_tensor->place().GetType() == phi::AllocationType::XPU) {
phi::Copy(
*dev_ctx, *new_data_tensor, phi::CPUPlace(), true, &cpu_starts_tensor);
new_data = cpu_starts_tensor.data<T>();
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/kernels/funcs/detail/activation_functions.h"
......
......@@ -26,6 +26,7 @@ limitations under the License. */
#include <utility>
#include <vector>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/data_type.h"
......@@ -52,22 +53,6 @@ template struct SetConstant<phi::CPUContext, phi::dtype::complex<float>>;
template struct SetConstant<phi::CPUContext, phi::dtype::complex<double>>;
#ifdef PADDLE_WITH_XPU
template struct SetConstant<paddle::platform::XPUDeviceContext,
phi::dtype::float16>;
template struct SetConstant<paddle::platform::XPUDeviceContext,
phi::dtype::bfloat16>;
template struct SetConstant<paddle::platform::XPUDeviceContext, float>;
template struct SetConstant<paddle::platform::XPUDeviceContext, double>;
template struct SetConstant<paddle::platform::XPUDeviceContext, uint8_t>;
template struct SetConstant<paddle::platform::XPUDeviceContext, int16_t>;
template struct SetConstant<paddle::platform::XPUDeviceContext, int>;
template struct SetConstant<paddle::platform::XPUDeviceContext, int64_t>;
template struct SetConstant<paddle::platform::XPUDeviceContext, bool>;
template struct SetConstant<paddle::platform::XPUDeviceContext,
phi::dtype::complex<float>>;
template struct SetConstant<paddle::platform::XPUDeviceContext,
phi::dtype::complex<double>>;
template struct SetConstant<phi::XPUContext, phi::dtype::float16>;
template struct SetConstant<phi::XPUContext, phi::dtype::bfloat16>;
template struct SetConstant<phi::XPUContext, float>;
......@@ -164,10 +149,9 @@ struct TensorSetConstantCPU {
};
template <>
void set_constant_with_place<paddle::platform::XPUPlace>(
const paddle::platform::DeviceContext& context,
phi::DenseTensor* tensor,
float value) {
void set_constant_with_place<phi::XPUPlace>(const phi::DeviceContext& context,
phi::DenseTensor* tensor,
float value) {
#ifdef PADDLE_WITH_XPU
phi::VisitDataType(
tensor->dtype(),
......@@ -178,64 +162,54 @@ void set_constant_with_place<paddle::platform::XPUPlace>(
}
template <>
void set_constant_with_place<paddle::platform::NPUPlace>(
const paddle::platform::DeviceContext& context,
phi::DenseTensor* tensor,
float value) {
void set_constant_with_place<phi::NPUPlace>(const phi::DeviceContext& context,
phi::DenseTensor* tensor,
float value) {
PADDLE_THROW(phi::errors::Unimplemented("NPUPlace is not supported"));
}
template <>
void set_constant_with_place<paddle::platform::NPUPinnedPlace>(
const paddle::platform::DeviceContext& context,
phi::DenseTensor* tensor,
float value) {
void set_constant_with_place<phi::NPUPinnedPlace>(
const phi::DeviceContext& context, phi::DenseTensor* tensor, float value) {
PADDLE_THROW(phi::errors::Unimplemented("NPUPinnedPlace is not supported"));
}
template <>
void set_constant_with_place<paddle::platform::IPUPlace>(
const paddle::platform::DeviceContext& context,
phi::DenseTensor* tensor,
float value) {
void set_constant_with_place<phi::IPUPlace>(const phi::DeviceContext& context,
phi::DenseTensor* tensor,
float value) {
PADDLE_THROW(phi::errors::Unimplemented("IPUPlace is not supported"));
}
template <>
void set_constant_with_place<paddle::platform::CustomPlace>(
const paddle::platform::DeviceContext& context,
phi::DenseTensor* tensor,
float value) {
void set_constant_with_place<phi::CustomPlace>(
const phi::DeviceContext& context, phi::DenseTensor* tensor, float value) {
PADDLE_THROW(phi::errors::Unimplemented("CustomPlace is not supported"));
}
template <>
void set_constant_with_place<phi::CPUPlace>(
const paddle::platform::DeviceContext& context,
phi::DenseTensor* tensor,
float value) {
void set_constant_with_place<phi::CPUPlace>(const phi::DeviceContext& context,
phi::DenseTensor* tensor,
float value) {
phi::VisitDataType(tensor->dtype(), TensorSetConstantCPU(tensor, value));
}
template <>
void set_constant_with_place<paddle::platform::MLUPlace>(
const paddle::platform::DeviceContext& context,
phi::DenseTensor* tensor,
float value) {
void set_constant_with_place<phi::MLUPlace>(const phi::DeviceContext& context,
phi::DenseTensor* tensor,
float value) {
PADDLE_THROW(phi::errors::Unimplemented("MLUPlace is not supported"));
}
template <>
void set_constant_with_place<paddle::platform::CUDAPinnedPlace>(
const paddle::platform::DeviceContext& context,
phi::DenseTensor* tensor,
float value) {
void set_constant_with_place<phi::GPUPinnedPlace>(
const phi::DeviceContext& context, phi::DenseTensor* tensor, float value) {
phi::VisitDataType(tensor->dtype(), TensorSetConstantCPU(tensor, value));
}
struct TensorSetConstantWithPlace
: public std::unary_function<paddle::platform::Place, void> {
TensorSetConstantWithPlace(const paddle::platform::DeviceContext& context,
: public std::unary_function<phi::Place, void> {
TensorSetConstantWithPlace(const phi::DeviceContext& context,
phi::DenseTensor* tensor,
float value)
: context_(context), tensor_(tensor), value_(value) {}
......@@ -245,17 +219,17 @@ struct TensorSetConstantWithPlace
set_constant_with_place<Place>(context_, tensor_, value_);
}
const paddle::platform::DeviceContext& context_;
const phi::DeviceContext& context_;
phi::DenseTensor* tensor_;
float value_;
};
void set_constant(const paddle::platform::DeviceContext& context,
void set_constant(const phi::DeviceContext& context,
phi::DenseTensor* tensor,
float value) {
TensorSetConstantWithPlace func(context, tensor, value);
#ifdef PADDLE_WITH_CUSTOM_DEVICE
if (paddle::platform::is_custom_place(context.GetPlace())) {
if (context.GetPlace().GetType() == phi::AllocationType::CUSTOM) {
func(phi::CPUPlace());
return;
}
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#include <algorithm>
#include <vector>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/data_type.h"
......@@ -187,8 +188,8 @@ void TransposeNormal<DeviceContext, T>::operator()(
auto* out_ptr = out->data<T>();
// copy in_stride, out_stride, axis to gpu device
const paddle::platform::CUDAPlace& cuda_place = context.GetPlace();
paddle::platform::CPUPlace cpu_place = paddle::platform::CPUPlace();
const phi::GPUPlace& cuda_place = context.GetPlace();
phi::CPUPlace cpu_place = phi::CPUPlace();
size_t size = 3 * rank * sizeof(int64_t);
auto cpu_buf_holder = phi::memory_utils::Alloc(cpu_place, size);
auto cuda_buf_holder = phi::memory_utils::Alloc(cuda_place, size);
......@@ -231,7 +232,7 @@ struct TransposeNormal<phi::GPUContext, T> {
// copy in_stride, out_stride, axis to gpu device
const phi::GPUPlace& cuda_place = context.GetPlace();
phi::CPUPlace cpu_place = paddle::platform::CPUPlace();
phi::CPUPlace cpu_place = phi::CPUPlace();
size_t size = 3 * rank * sizeof(int64_t);
auto cpu_buf_holder = phi::memory_utils::Alloc(cpu_place, size);
auto cuda_buf_holder = phi::memory_utils::Alloc(cuda_place, size);
......@@ -286,7 +287,7 @@ DEFINE_GPU_TRANS_NORMAL(phi::dtype::complex<float>);
DEFINE_GPU_TRANS_NORMAL(phi::dtype::complex<double>);
struct TensorSetConstantGPU {
TensorSetConstantGPU(const paddle::platform::DeviceContext& context,
TensorSetConstantGPU(const phi::DeviceContext& context,
phi::DenseTensor* tensor,
float value)
: context_(context), tensor_(tensor), value_(value) {}
......@@ -299,16 +300,15 @@ struct TensorSetConstantGPU {
static_cast<T>(value_));
}
const paddle::platform::DeviceContext& context_;
const phi::DeviceContext& context_;
phi::DenseTensor* tensor_;
float value_;
};
template <>
void set_constant_with_place<paddle::platform::CUDAPlace>(
const paddle::platform::DeviceContext& context,
phi::DenseTensor* tensor,
float value) {
void set_constant_with_place<phi::GPUPlace>(const phi::DeviceContext& context,
phi::DenseTensor* tensor,
float value) {
phi::VisitDataType(tensor->dtype(),
TensorSetConstantGPU(context, tensor, value));
}
......
......@@ -18,7 +18,7 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/utils/data_type.h"
......@@ -56,24 +56,19 @@ struct SetConstant {
#ifdef PADDLE_WITH_XPU
template <typename T>
struct SetConstant<XPUContext, T> {
void operator()(const XPUContext& context, phi::DenseTensor* tensor, T num);
};
template <typename T>
struct SetConstant<paddle::platform::XPUDeviceContext, T> {
void operator()(const paddle::platform::XPUDeviceContext& context,
struct SetConstant<phi::XPUContext, T> {
void operator()(const phi::XPUContext& context,
phi::DenseTensor* tensor,
T num);
};
#endif
template <typename Place>
void set_constant_with_place(const paddle::platform::DeviceContext& context,
void set_constant_with_place(const phi::DeviceContext& context,
phi::DenseTensor* tensor,
float value);
void set_constant(const paddle::platform::DeviceContext& context,
void set_constant(const phi::DeviceContext& context,
phi::DenseTensor* tensor,
float value);
......@@ -109,9 +104,7 @@ struct RowwiseMean {
#ifdef PADDLE_WITH_XPU
template <typename U>
struct TensorSetConstantXPU {
TensorSetConstantXPU(phi::DenseTensor* tensor,
U value,
paddle::platform::Place place)
TensorSetConstantXPU(phi::DenseTensor* tensor, U value, phi::Place place)
: tensor_(tensor), value_(value), place_(place) {}
template <typename T>
void apply() const {
......@@ -127,7 +120,7 @@ struct TensorSetConstantXPU {
}
phi::DenseTensor* tensor_;
U value_;
paddle::platform::Place place_;
phi::Place place_;
};
#endif
......
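The SetConstant functor declared here keeps its call pattern after the move; only the context and place types change. A minimal sketch, assuming the declarations live in paddle/phi/kernels/funcs/math_function.h:

#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/math_function.h"

// Zero-fill a tensor using only phi types.
void ZeroFill(const phi::CPUContext& dev_ctx, phi::DenseTensor* out) {
  dev_ctx.Alloc<float>(out);
  phi::funcs::SetConstant<phi::CPUContext, float> set_zero;
  set_zero(dev_ctx, out, 0.0f);
}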
......@@ -34,17 +34,9 @@ void SetConstant<DeviceContext, T>::operator()(const DeviceContext& context,
#ifdef PADDLE_WITH_XPU
template <typename T>
void SetConstant<XPUContext, T>::operator()(const XPUContext& context,
phi::DenseTensor* tensor,
T num) {
phi::VisitDataType(tensor->dtype(),
TensorSetConstantXPU<T>(tensor, num, context.GetPlace()));
}
template <typename T>
void SetConstant<paddle::platform::XPUDeviceContext, T>::operator()(
const paddle::platform::XPUDeviceContext& context,
phi::DenseTensor* tensor,
T num) {
void SetConstant<phi::XPUContext, T>::operator()(const phi::XPUContext& context,
phi::DenseTensor* tensor,
T num) {
phi::VisitDataType(tensor->dtype(),
TensorSetConstantXPU<T>(tensor, num, context.GetPlace()));
}
......@@ -65,7 +57,7 @@ void Transpose<DeviceContext, T, Rank>::operator()(
auto* dev = context.eigen_device();
// use 32bit index to speed up computation
bool use_32bit_index = eigen_out.size() < Eigen::NumTraits<int>::highest();
bool is_gpu_place = paddle::platform::is_gpu_place(context.GetPlace());
bool is_gpu_place = context.GetPlace().GetType() == phi::AllocationType::GPU;
if (use_32bit_index && is_gpu_place) {
To32BitIndex(eigen_out).device(*dev) =
To32BitIndex(eigen_in).shuffle(permute);
......
......@@ -415,7 +415,7 @@ struct ReduceConfig {
#ifdef PADDLE_WITH_XPU_KP
bool not_higher = x_dim[0] > 1;
#else
int device_id = paddle::platform::GetCurrentDeviceId();
int device_id = phi::backends::gpu::GetCurrentDeviceId();
int max_grid_z = phi::backends::gpu::GetGpuMaxGridDimSize(device_id)[2];
bool not_higher = x_dim[0] >= max_grid_z;
#endif // PADDLE_WITH_XPU_KP
......@@ -467,10 +467,10 @@ struct ReduceConfig {
grid_num = details::CeilingDiv(left_num, block_dim->x);
reduce_num_per_thread = details::CeilingDiv(reduce_num, block_dim->y);
}
int device_id = paddle::platform::GetCurrentDeviceId();
int max_mp = paddle::platform::GetGPUMultiProcessors(device_id);
int device_id = phi::backends::gpu::GetCurrentDeviceId();
int max_mp = phi::backends::gpu::GetGPUMultiProcessors(device_id);
int max_threads_per_mp =
paddle::platform::GetGPUMaxThreadsPerMultiProcessor(device_id);
phi::backends::gpu::GetGPUMaxThreadsPerMultiProcessor(device_id);
int max_threads = max_threads_per_mp * max_mp;
int num_threads = block_dim->x * block_dim->y;
int max_num_blocks = max_threads / num_threads;
......@@ -509,10 +509,10 @@ struct ReduceConfig {
int grid_z = left_num / last_dim_num;
left_num = last_dim_num;
grid_dim->z = grid_z;
int device_id = paddle::platform::GetCurrentDeviceId();
int max_mp = paddle::platform::GetGPUMultiProcessors(device_id);
int device_id = phi::backends::gpu::GetCurrentDeviceId();
int max_mp = phi::backends::gpu::GetGPUMultiProcessors(device_id);
int max_threads_per_mp =
paddle::platform::GetGPUMaxThreadsPerMultiProcessor(device_id);
phi::backends::gpu::GetGPUMaxThreadsPerMultiProcessor(device_id);
int max_threads = max_threads_per_mp * max_mp;
// init
int num_block = (max_threads / left_num);
......
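The occupancy math above relies on device-query helpers that moved under phi::backends::gpu. A minimal sketch of the same max-threads bound, for GPU builds only:

#include "paddle/phi/backends/gpu/gpu_info.h"

// Mirrors the grid/block sizing logic in ReduceConfig using only phi helpers.
int MaxResidentThreadsOnCurrentDevice() {
  int device_id = phi::backends::gpu::GetCurrentDeviceId();
  int max_mp = phi::backends::gpu::GetGPUMultiProcessors(device_id);
  int max_threads_per_mp =
      phi::backends::gpu::GetGPUMaxThreadsPerMultiProcessor(device_id);
  return max_threads_per_mp * max_mp;
}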
......@@ -22,13 +22,13 @@ void RepeatsTensor2IndexTensor(const Context& ctx,
const DenseTensor& repeats,
DenseTensor* index) {
DenseTensor repeats_cpu_copy;
if (!paddle::platform::is_cpu_place(repeats.place())) {
phi::Copy(
ctx, repeats, paddle::platform::CPUPlace(), true, &repeats_cpu_copy);
if (repeats.place().GetType() != phi::AllocationType::CPU) {
phi::Copy(ctx, repeats, phi::CPUPlace(), true, &repeats_cpu_copy);
}
const RepeatsT* repeats_data = paddle::platform::is_cpu_place(repeats.place())
? repeats.data<RepeatsT>()
: repeats_cpu_copy.data<RepeatsT>();
const RepeatsT* repeats_data =
repeats.place().GetType() == phi::AllocationType::CPU
? repeats.data<RepeatsT>()
: repeats_cpu_copy.data<RepeatsT>();
int64_t index_size = 0;
for (int i = 0; i < repeats.dims()[0]; i++) {
......
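The repeats handling above follows a pattern that recurs in this series: copy to CPU only when the tensor is not already CPU-resident. A minimal sketch with an int-typed tensor; the helper name and element type are illustrative:

#include "paddle/phi/common/place.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/tensor_utils.h"

// Return a CPU-readable pointer, copying synchronously only when needed.
template <typename Context>
const int* GetCpuIntData(const Context& ctx,
                         const phi::DenseTensor& t,
                         phi::DenseTensor* cpu_copy) {
  if (t.place().GetType() != phi::AllocationType::CPU) {
    phi::Copy(ctx, t, phi::CPUPlace(), /*blocking=*/true, cpu_copy);
    return cpu_copy->data<int>();
  }
  return t.data<int>();
}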
......@@ -389,8 +389,8 @@ void SelectKernel(const KPDevice &dev_ctx,
using CT = int64_t; // set Count_data Type
const int t_size = sizeof(CT);
const paddle::platform::CUDAPlace &cuda_place = dev_ctx.GetPlace();
paddle::platform::CPUPlace cpu_place = paddle::platform::CPUPlace();
const phi::GPUPlace &cuda_place = dev_ctx.GetPlace();
phi::CPUPlace cpu_place = phi::CPUPlace();
  // 1.1 get the number of stored elements per block
const int kVecSize = 4;
......
......@@ -76,17 +76,17 @@ struct SelectedRowsAdd<phi::GPUContext, T> {
auto* in1_data = in1_value.data<T>();
auto in1_place = input1.place();
PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(in1_place),
PADDLE_ENFORCE_EQ(in1_place.GetType() == phi::AllocationType::GPU,
true,
phi::errors::InvalidArgument(
"The running environment is not on the GPU place."));
auto in2_place = input2.place();
PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(in2_place),
PADDLE_ENFORCE_EQ(in2_place.GetType() == phi::AllocationType::GPU,
true,
phi::errors::InvalidArgument(
"The running environment is not on the GPU place."));
auto out_place = context.GetPlace();
PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(out_place),
PADDLE_ENFORCE_EQ(out_place.GetType() == phi::AllocationType::GPU,
true,
phi::errors::InvalidArgument(
"The running environment is not on the GPU place."));
......@@ -237,12 +237,12 @@ struct SelectedRowsAddTo<phi::GPUContext, T> {
}
auto in1_place = input1.place();
PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(in1_place),
PADDLE_ENFORCE_EQ(in1_place.GetType() == phi::AllocationType::GPU,
true,
phi::errors::InvalidArgument(
"The running environment is not on the GPU place."));
auto in2_place = input2->place();
PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(in1_place),
PADDLE_ENFORCE_EQ(in1_place.GetType() == phi::AllocationType::GPU,
true,
phi::errors::InvalidArgument(
"The running environment is not on the GPU place."));
......
......@@ -16,7 +16,7 @@ limitations under the License. */
#include <algorithm>
#include <vector>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/mixed_vector.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
......
......@@ -18,6 +18,7 @@ limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/backends/xpu/xpu_context.h"
#endif
namespace phi {
......
......@@ -17,7 +17,6 @@ limitations under the License. */
#include <algorithm>
#include <vector>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/lod_utils.h"
#include "paddle/phi/core/mixed_vector.h"
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/lod_utils.h"
......
......@@ -16,7 +16,7 @@
#include <sstream>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/dense_tensor.h"
......@@ -33,7 +33,7 @@ static const std::vector<T> &ToVector(const std::vector<T> &vec) {
template <typename T>
static std::vector<T> ToVector(const T *x, size_t n, const phi::Place &place) {
#ifdef __NVCC__
if (paddle::platform::is_gpu_place(place)) {
if (place.GetType() == phi::AllocationType::GPU) {
using CopyT = typename std::
conditional<std::is_same<T, bool>::value, uint8_t, T>::type;
std::vector<CopyT> cpu_x(n);
......
......@@ -64,7 +64,7 @@ void Conv2dFusionKernel(const Context& ctx,
pad_w0 = paddings[2];
pad_w1 = paddings[3];
} else {
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
PADDLE_THROW(phi::errors::InvalidArgument(
"Attr paddins in conv2d_fusion must have 2 or 4 elements, but now have "
"%u elements.",
paddings.size()));
......@@ -111,7 +111,7 @@ void Conv2dFusionKernel(const Context& ctx,
params.residual = reinterpret_cast<const half*>(residual->data<T>());
Conv2dBiasAddRelu(params);
} else {
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
PADDLE_THROW(phi::errors::InvalidArgument(
"Cutlass now only support relu activation in a residual block"));
}
} else if (activation == "relu") {
......
......@@ -319,7 +319,7 @@ void InitMoeRoutingKernelLauncher(
ec_route);
}
} else {
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
PADDLE_THROW(phi::errors::InvalidArgument(
"Currently only support `ec_route = True`. "));
}
}
......@@ -401,7 +401,7 @@ void GenericMoeGemmKernelLauncher(const T* A,
int occupancy = GemmGrouped::maximum_active_blocks();
const int threadblock_count = multi_processor_count * occupancy;
if (occupancy == 0) {
PADDLE_THROW(paddle::platform::errors::Fatal(
PADDLE_THROW(phi::errors::Fatal(
"[MoE Runner] GPU lacks the shared memory resources to run GroupedGEMM "
"kernel"));
}
......@@ -425,21 +425,21 @@ void GenericMoeGemmKernelLauncher(const T* A,
if (can_implement != cutlass::Status::kSuccess) {
std::string err_msg = "MoEFC kernel will fail for params. Error: " +
std::string(cutlassGetStatusString(can_implement));
PADDLE_THROW(paddle::platform::errors::Fatal("[MoE Runner] " + err_msg));
PADDLE_THROW(phi::errors::Fatal("[MoE Runner] " + err_msg));
}
auto init_status = gemm.initialize(args);
if (init_status != cutlass::Status::kSuccess) {
std::string err_msg =
"Failed to initialize cutlass variable batched gemm. Error: " +
std::string(cutlassGetStatusString(init_status));
PADDLE_THROW(paddle::platform::errors::Fatal("[MoE Runner] " + err_msg));
PADDLE_THROW(phi::errors::Fatal("[MoE Runner] " + err_msg));
}
auto run_status = gemm.run(stream);
if (run_status != cutlass::Status::kSuccess) {
std::string err_msg =
"Failed to run cutlass variable batched gemm. Error: " +
std::string(cutlassGetStatusString(run_status));
PADDLE_THROW(paddle::platform::errors::Fatal("[MoE Runner] " + err_msg));
PADDLE_THROW(phi::errors::Fatal("[MoE Runner] " + err_msg));
}
}
......
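Error construction in these kernels switches from paddle::platform::errors to phi::errors while PADDLE_THROW stays the same. A minimal sketch; the helper and message are illustrative:

#include "paddle/phi/core/enforce.h"

// Validate an attribute count with the phi error factories.
void CheckPaddingCount(size_t n) {
  if (n != 2 && n != 4) {
    PADDLE_THROW(phi::errors::InvalidArgument(
        "Expected 2 or 4 padding elements, but got %u.", n));
  }
}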
......@@ -22,6 +22,7 @@
#include <memory>
#include <unordered_map>
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/dynload/cudnn.h"
#include "paddle/phi/backends/gpu/cuda/cudnn_desc.h"
#include "paddle/phi/common/backend.h"
......@@ -313,12 +314,8 @@ class CudnnConvDescManager {
int groups,
cudnnDataType_t dtype) {
auto* desc = new phi::backends::gpu::ConvolutionDescriptor();
desc->set(dtype,
paddings,
strides,
dilations,
paddle::platform::AllowTF32Cudnn(),
groups);
desc->set(
dtype, paddings, strides, dilations, phi::AllowTF32Cudnn(), groups);
return desc;
}
......
......@@ -16,7 +16,6 @@
#include "paddle/phi/kernels/affine_grid_grad_kernel.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_device_function.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/kernels/affine_grid_kernel.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_device_function.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
......
......@@ -369,10 +369,9 @@ void ClassCenterSampleKernel(const Context& dev_ctx,
// use global calculate stream
const auto calcu_stream =
static_cast<GPUContext*>(
paddle::platform::DeviceContextPool::Instance().Get(
dev_ctx.GetPlace()))
phi::DeviceContextPool::Instance().Get(dev_ctx.GetPlace()))
->stream();
PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::ncclAllReduce(
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce(
num_classes_per_device_ptr,
num_classes_per_device_ptr,
num_classes_per_device.numel(),
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/hostdevice.h"
#ifdef __NVCC__
......@@ -1721,34 +1721,36 @@ class DepthwiseConvFilterGradFunctor<phi::GPUContext,
template class DepthwiseConvFunctor<phi::GPUContext, float, false>;
template class DepthwiseConvFunctor<phi::GPUContext, double, false>;
template class DepthwiseConvFunctor<phi::GPUContext, platform::float16, false>;
template class DepthwiseConvFunctor<phi::GPUContext,
phi::dtype::float16,
false>;
template class DepthwiseConvInputGradFunctor<phi::GPUContext, float, false>;
template class DepthwiseConvInputGradFunctor<phi::GPUContext, double, false>;
template class DepthwiseConvInputGradFunctor<phi::GPUContext,
platform::float16,
phi::dtype::float16,
false>;
template class DepthwiseConvFilterGradFunctor<phi::GPUContext, float, false>;
template class DepthwiseConvFilterGradFunctor<phi::GPUContext, double, false>;
template class DepthwiseConvFilterGradFunctor<phi::GPUContext,
platform::float16,
phi::dtype::float16,
false>;
template class DepthwiseConvFunctor<phi::GPUContext, float, true>;
template class DepthwiseConvFunctor<phi::GPUContext, double, true>;
template class DepthwiseConvFunctor<phi::GPUContext, platform::float16, true>;
template class DepthwiseConvFunctor<phi::GPUContext, phi::dtype::float16, true>;
template class DepthwiseConvInputGradFunctor<phi::GPUContext, float, true>;
template class DepthwiseConvInputGradFunctor<phi::GPUContext, double, true>;
template class DepthwiseConvInputGradFunctor<phi::GPUContext,
platform::float16,
phi::dtype::float16,
true>;
template class DepthwiseConvFilterGradFunctor<phi::GPUContext, float, true>;
template class DepthwiseConvFilterGradFunctor<phi::GPUContext, double, true>;
template class DepthwiseConvFilterGradFunctor<phi::GPUContext,
platform::float16,
phi::dtype::float16,
true>;
} // namespace math
......
......@@ -385,17 +385,17 @@ void InstanceNormGradKernel(const Context &dev_ctx,
miopenTensorDescriptor_t in_param_desc_;
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenCreateTensorDescriptor(&data_desc_));
phi::dynload::miopenCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenCreateTensorDescriptor(&in_param_desc_));
phi::dynload::miopenCreateTensorDescriptor(&in_param_desc_));
#else
cudnnTensorDescriptor_t data_desc_;
cudnnTensorDescriptor_t in_param_desc_;
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
phi::dynload::cudnnCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_));
phi::dynload::cudnnCreateTensorDescriptor(&in_param_desc_));
#endif
if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
......@@ -406,27 +406,23 @@ void InstanceNormGradKernel(const Context &dev_ctx,
epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenSetTensorDescriptor(
data_desc_,
CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4,
const_cast<int *>(dims.data()),
const_cast<int *>(strides.data())));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenDeriveBNTensorDescriptor(
in_param_desc_, data_desc_, miopenBNSpatial));
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor(
data_desc_,
CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4,
const_cast<int *>(dims.data()),
const_cast<int *>(strides.data())));
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDeriveBNTensorDescriptor(
in_param_desc_, data_desc_, miopenBNSpatial));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnSetTensorNdDescriptor(
data_desc_,
CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4,
dims.data(),
strides.data()));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDeriveBNTensorDescriptor(
in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL));
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor(
data_desc_,
CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4,
dims.data(),
strides.data()));
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDeriveBNTensorDescriptor(
in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL));
#endif
const auto *saved_mean_data =
......@@ -435,49 +431,47 @@ void InstanceNormGradKernel(const Context &dev_ctx,
saved_variance.template data<BatchNormParamType<T>>();
if (d_scale && d_bias) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenBatchNormalizationBackward(
dev_ctx.cudnn_handle(),
miopenBNSpatial,
CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(),
CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(),
data_desc_,
x_tmp.template data<T>(),
data_desc_,
d_y_tmp.template data<T>(),
data_desc_,
d_x->template data<T>(),
in_param_desc_,
scale_tmp.template data<BatchNormParamType<T>>(),
d_scale_tmp.template data<BatchNormParamType<T>>(),
d_bias_tmp.template data<BatchNormParamType<T>>(),
epsilon,
saved_mean_data,
saved_var_data));
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenBatchNormalizationBackward(
dev_ctx.cudnn_handle(),
miopenBNSpatial,
CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(),
CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(),
data_desc_,
x_tmp.template data<T>(),
data_desc_,
d_y_tmp.template data<T>(),
data_desc_,
d_x->template data<T>(),
in_param_desc_,
scale_tmp.template data<BatchNormParamType<T>>(),
d_scale_tmp.template data<BatchNormParamType<T>>(),
d_bias_tmp.template data<BatchNormParamType<T>>(),
epsilon,
saved_mean_data,
saved_var_data));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnBatchNormalizationBackward(
dev_ctx.cudnn_handle(),
CUDNN_BATCHNORM_SPATIAL,
CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(),
CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(),
data_desc_,
x_tmp.template data<T>(),
data_desc_,
d_y_tmp.template data<T>(),
data_desc_,
d_x->template data<T>(),
in_param_desc_,
scale_tmp.template data<BatchNormParamType<T>>(),
d_scale_tmp.template data<BatchNormParamType<T>>(),
d_bias_tmp.template data<BatchNormParamType<T>>(),
epsilon,
saved_mean_data,
saved_var_data));
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnBatchNormalizationBackward(
dev_ctx.cudnn_handle(),
CUDNN_BATCHNORM_SPATIAL,
CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(),
CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(),
data_desc_,
x_tmp.template data<T>(),
data_desc_,
d_y_tmp.template data<T>(),
data_desc_,
d_x->template data<T>(),
in_param_desc_,
scale_tmp.template data<BatchNormParamType<T>>(),
d_scale_tmp.template data<BatchNormParamType<T>>(),
d_bias_tmp.template data<BatchNormParamType<T>>(),
epsilon,
saved_mean_data,
saved_var_data));
#endif
} else {
if (d_x) {
......@@ -502,14 +496,14 @@ void InstanceNormGradKernel(const Context &dev_ctx,
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenDestroyTensorDescriptor(data_desc_));
phi::dynload::miopenDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenDestroyTensorDescriptor(in_param_desc_));
phi::dynload::miopenDestroyTensorDescriptor(in_param_desc_));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
phi::dynload::cudnnDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_));
phi::dynload::cudnnDestroyTensorDescriptor(in_param_desc_));
#endif
}
......
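The cuDNN/MIOpen calls in these kernels now go through phi::dynload, wrapped in the same PADDLE_ENFORCE_GPU_SUCCESS check. A minimal sketch for the CUDA path only; it assumes PADDLE_ENFORCE_GPU_SUCCESS is reachable via paddle/phi/core/enforce.h, as in the kernels above:

#include "paddle/phi/backends/dynload/cudnn.h"
#include "paddle/phi/core/enforce.h"

// Create and destroy a cuDNN tensor descriptor via the phi dynload wrappers.
void CreateAndDestroyTensorDesc() {
  cudnnTensorDescriptor_t desc;
  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnCreateTensorDescriptor(&desc));
  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDestroyTensorDescriptor(desc));
}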
......@@ -63,17 +63,17 @@ void InstanceNormKernel(const Context &dev_ctx,
miopenTensorDescriptor_t in_param_desc_;
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenCreateTensorDescriptor(&data_desc_));
phi::dynload::miopenCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenCreateTensorDescriptor(&in_param_desc_));
phi::dynload::miopenCreateTensorDescriptor(&in_param_desc_));
#else
cudnnTensorDescriptor_t data_desc_;
cudnnTensorDescriptor_t in_param_desc_;
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
phi::dynload::cudnnCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_));
phi::dynload::cudnnCreateTensorDescriptor(&in_param_desc_));
#endif
if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
LOG(ERROR) << "Provided epsilon is smaller than "
......@@ -89,27 +89,23 @@ void InstanceNormKernel(const Context &dev_ctx,
strides = {NxC * H * W * D, H * W * D, W * D, D, 1};
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenSetTensorDescriptor(
data_desc_,
CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4,
const_cast<int *>(dims.data()),
const_cast<int *>(strides.data())));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenDeriveBNTensorDescriptor(
in_param_desc_, data_desc_, miopenBNSpatial));
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor(
data_desc_,
CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4,
const_cast<int *>(dims.data()),
const_cast<int *>(strides.data())));
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDeriveBNTensorDescriptor(
in_param_desc_, data_desc_, miopenBNSpatial));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnSetTensorNdDescriptor(
data_desc_,
CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4,
dims.data(),
strides.data()));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDeriveBNTensorDescriptor(
in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL));
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor(
data_desc_,
CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4,
dims.data(),
strides.data()));
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDeriveBNTensorDescriptor(
in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL));
#endif
const auto scale_ptr = scale.get_ptr();
......@@ -170,7 +166,7 @@ void InstanceNormKernel(const Context &dev_ctx,
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenBatchNormalizationForwardTraining(
phi::dynload::miopenBatchNormalizationForwardTraining(
handle,
miopenBNSpatial,
const_cast<void *>(
......@@ -194,12 +190,12 @@ void InstanceNormKernel(const Context &dev_ctx,
static_cast<void *>(saved_variance_data)));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenDestroyTensorDescriptor(data_desc_));
phi::dynload::miopenDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenDestroyTensorDescriptor(in_param_desc_));
phi::dynload::miopenDestroyTensorDescriptor(in_param_desc_));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnBatchNormalizationForwardTraining(
phi::dynload::cudnnBatchNormalizationForwardTraining(
handle,
CUDNN_BATCHNORM_SPATIAL,
CudnnDataType<T>::kOne(),
......@@ -219,9 +215,9 @@ void InstanceNormKernel(const Context &dev_ctx,
saved_variance_data));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
phi::dynload::cudnnDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_));
phi::dynload::cudnnDestroyTensorDescriptor(in_param_desc_));
#endif
}
......
......@@ -89,11 +89,10 @@ void GetClassInterval(const gpuStream_t& stream,
paddle::platform::NCCLCommContext::Instance().Get(rid, place);
// use global calculate stream
const auto calcu_stream =
static_cast<GPUContext*>(
paddle::platform::DeviceContextPool::Instance().Get(place))
static_cast<GPUContext*>(phi::DeviceContextPool::Instance().Get(place))
->stream();
PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::ncclAllReduce(
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce(
num_classes_per_device_ptr,
num_classes_per_device_ptr,
num_classes_per_device.numel(),
......
......@@ -85,11 +85,10 @@ void GetClassInterval(const gpuStream_t& stream,
paddle::platform::NCCLCommContext::Instance().Get(rid, place);
// use global calculate stream
const auto calcu_stream =
static_cast<GPUContext*>(
paddle::platform::DeviceContextPool::Instance().Get(place))
static_cast<GPUContext*>(phi::DeviceContextPool::Instance().Get(place))
->stream();
PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::ncclAllReduce(
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce(
num_classes_per_device_ptr,
num_classes_per_device_ptr,
num_classes_per_device.numel(),
......@@ -247,7 +246,7 @@ void MarginCrossEntropyKernel(const Context& dev_ctx,
// use global calculate stream
stream = static_cast<GPUContext*>(
paddle::platform::DeviceContextPool::Instance().Get(place))
phi::DeviceContextPool::Instance().Get(place))
->stream();
}
}
......@@ -358,7 +357,7 @@ void MarginCrossEntropyKernel(const Context& dev_ctx,
auto task = pg->AllReduce(in_tensor, out_tensor, opts);
task->Wait();
} else {
PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::ncclAllReduce(
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce(
logits_max_buff,
logits_max_buff,
logits_max.numel(),
......@@ -400,7 +399,7 @@ void MarginCrossEntropyKernel(const Context& dev_ctx,
auto task = pg->AllReduce(in_tensor, out_tensor, opts);
task->Wait();
} else {
PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::ncclAllReduce(
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce(
sum_exp_logits_buff,
sum_exp_logits_buff,
sum_exp_logits.numel(),
......@@ -459,7 +458,7 @@ void MarginCrossEntropyKernel(const Context& dev_ctx,
auto task = pg->AllReduce(in_tensor, out_tensor, opts);
task->Wait();
} else {
PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::ncclAllReduce(
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce(
loss_ptr,
loss_ptr,
loss->numel(),
......
......@@ -161,5 +161,5 @@ PD_REGISTER_KERNEL(overlap_add_grad,
float,
double,
phi::dtype::float16,
paddle::platform::complex<float>,
paddle::platform::complex<double>) {}
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
......@@ -147,5 +147,5 @@ PD_REGISTER_KERNEL(overlap_add,
float,
double,
phi::dtype::float16,
paddle::platform::complex<float>,
paddle::platform::complex<double>) {}
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
......@@ -32,7 +32,7 @@ void PutAlongAxisGradKernel(const Context& dev_ctx,
const std::string& reduce,
DenseTensor* x_grad,
DenseTensor* value_grad) {
PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(dev_ctx.GetPlace()),
PADDLE_ENFORCE_EQ(dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU,
true,
errors::PreconditionNotMet(
"PutAlongAxisGradOpCUDAKernel only runs on GPU."));
......
......@@ -31,7 +31,7 @@ void PutAlongAxisKernel(const Context& dev_ctx,
int axis,
const std::string& reduce,
DenseTensor* out) {
PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(dev_ctx.GetPlace()),
PADDLE_ENFORCE_EQ(dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU,
true,
errors::PreconditionNotMet(
"PutAlongAxisCUDAKernel only runs on GPU device."));
......
......@@ -260,108 +260,104 @@ void RnnGradKernel(const Context &dev_ctx,
if (!has_seq_length) {
if (x_grad) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenRNNBackwardData(
handle,
rnn.rnn_desc(),
seq_length,
rnn.y_descs(),
out_data,
rnn.y_descs(),
out_grad_data,
rnn.last_h_desc(),
last_h_grad_data,
rnn.last_c_desc(),
last_c_grad_data,
rnn.weight_desc(),
weight_data,
rnn.init_h_desc(),
init_h_data,
rnn.init_c_desc(),
init_c_data,
rnn.x_descs(),
x_grad_data,
rnn.init_h_desc(),
init_h_grad_data,
rnn.init_c_desc(),
init_c_grad_data,
workspace_data_.data<uint8_t>(),
workspace_size,
const_cast<uint8_t *>(reserve_data),
reserve_size));
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenRNNBackwardData(
handle,
rnn.rnn_desc(),
seq_length,
rnn.y_descs(),
out_data,
rnn.y_descs(),
out_grad_data,
rnn.last_h_desc(),
last_h_grad_data,
rnn.last_c_desc(),
last_c_grad_data,
rnn.weight_desc(),
weight_data,
rnn.init_h_desc(),
init_h_data,
rnn.init_c_desc(),
init_c_data,
rnn.x_descs(),
x_grad_data,
rnn.init_h_desc(),
init_h_grad_data,
rnn.init_c_desc(),
init_c_grad_data,
workspace_data_.data<uint8_t>(),
workspace_size,
const_cast<uint8_t *>(reserve_data),
reserve_size));
#else
// This interface is used when the input/output is unpadded.
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnRNNBackwardData(
handle,
rnn.rnn_desc(),
seq_length,
rnn.y_descs(),
out_data,
rnn.y_descs(),
out_grad_data,
rnn.last_h_desc(),
last_h_grad_data,
rnn.last_c_desc(),
last_c_grad_data,
rnn.weight_desc(),
weight_data,
rnn.init_h_desc(),
init_h_data,
rnn.init_c_desc(),
init_c_data,
rnn.x_descs(),
x_grad_data,
rnn.init_h_desc(),
init_h_grad_data,
rnn.init_c_desc(),
init_c_grad_data,
workspace_data_.data<uint8_t>(),
workspace_size,
const_cast<uint8_t *>(reserve_data),
reserve_size));
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardData(
handle,
rnn.rnn_desc(),
seq_length,
rnn.y_descs(),
out_data,
rnn.y_descs(),
out_grad_data,
rnn.last_h_desc(),
last_h_grad_data,
rnn.last_c_desc(),
last_c_grad_data,
rnn.weight_desc(),
weight_data,
rnn.init_h_desc(),
init_h_data,
rnn.init_c_desc(),
init_c_data,
rnn.x_descs(),
x_grad_data,
rnn.init_h_desc(),
init_h_grad_data,
rnn.init_c_desc(),
init_c_grad_data,
workspace_data_.data<uint8_t>(),
workspace_size,
const_cast<uint8_t *>(reserve_data),
reserve_size));
#endif
}
if (!weight_grad_list.empty()) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenRNNBackwardWeights(
handle,
rnn.rnn_desc(),
seq_length,
rnn.x_descs(),
x.data<T>(),
rnn.init_h_desc(),
init_h_data,
rnn.y_descs(),
out.data<T>(),
rnn.weight_desc(),
weight_grad_data,
workspace_data_.data<uint8_t>(),
workspace_size,
const_cast<uint8_t *>(reserve_data),
reserve_size));
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenRNNBackwardWeights(
handle,
rnn.rnn_desc(),
seq_length,
rnn.x_descs(),
x.data<T>(),
rnn.init_h_desc(),
init_h_data,
rnn.y_descs(),
out.data<T>(),
rnn.weight_desc(),
weight_grad_data,
workspace_data_.data<uint8_t>(),
workspace_size,
const_cast<uint8_t *>(reserve_data),
reserve_size));
// permute weight grad list from weight grad tensor
TensorToPermutedWeight<T>(
place, stream, weight_grad, &weight_grad_list, rnn_mode, is_bidirec);
#else
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnRNNBackwardWeights(
handle,
rnn.rnn_desc(),
seq_length,
rnn.x_descs(),
x.data<T>(),
rnn.init_h_desc(),
init_h_data,
rnn.y_descs(),
out.data<T>(),
workspace_data_.data<uint8_t>(),
workspace_size,
rnn.weight_desc(),
weight_grad_data,
const_cast<uint8_t *>(reserve_data),
reserve_size));
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardWeights(
handle,
rnn.rnn_desc(),
seq_length,
rnn.x_descs(),
x.data<T>(),
rnn.init_h_desc(),
init_h_data,
rnn.y_descs(),
out.data<T>(),
workspace_data_.data<uint8_t>(),
workspace_size,
rnn.weight_desc(),
weight_grad_data,
const_cast<uint8_t *>(reserve_data),
reserve_size));
#endif
}
} else {
......@@ -369,57 +365,55 @@ void RnnGradKernel(const Context &dev_ctx,
// for train
// This interface is used when the input/output is padded.
if (x_grad) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnRNNBackwardDataEx(
handle,
rnn.rnn_desc(),
rnn.y_seq_desc(),
out_data,
rnn.y_seq_desc(),
out_grad_data,
nullptr,
nullptr,
rnn.last_h_desc(),
last_h_grad_data,
rnn.last_c_desc(),
last_c_grad_data,
rnn.weight_desc(),
weight_data,
rnn.init_h_desc(),
init_h_data,
rnn.init_c_desc(),
init_c_data,
rnn.x_seq_desc(),
x_grad_data,
rnn.init_h_desc(),
init_h_grad_data,
rnn.init_c_desc(),
init_c_grad_data,
nullptr,
nullptr,
workspace_data_.data<uint8_t>(),
workspace_size,
const_cast<uint8_t *>(reserve_data),
reserve_size));
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardDataEx(
handle,
rnn.rnn_desc(),
rnn.y_seq_desc(),
out_data,
rnn.y_seq_desc(),
out_grad_data,
nullptr,
nullptr,
rnn.last_h_desc(),
last_h_grad_data,
rnn.last_c_desc(),
last_c_grad_data,
rnn.weight_desc(),
weight_data,
rnn.init_h_desc(),
init_h_data,
rnn.init_c_desc(),
init_c_data,
rnn.x_seq_desc(),
x_grad_data,
rnn.init_h_desc(),
init_h_grad_data,
rnn.init_c_desc(),
init_c_grad_data,
nullptr,
nullptr,
workspace_data_.data<uint8_t>(),
workspace_size,
const_cast<uint8_t *>(reserve_data),
reserve_size));
}
if (!weight_grad_list.empty()) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnRNNBackwardWeightsEx(
handle,
rnn.rnn_desc(),
rnn.x_seq_desc(),
x.data<T>(),
rnn.init_h_desc(),
init_h_data,
rnn.y_seq_desc(),
out.data<T>(),
workspace_data_.data<uint8_t>(),
workspace_size,
rnn.weight_desc(),
weight_grad_data,
const_cast<uint8_t *>(reserve_data),
reserve_size));
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardWeightsEx(
handle,
rnn.rnn_desc(),
rnn.x_seq_desc(),
x.data<T>(),
rnn.init_h_desc(),
init_h_data,
rnn.y_seq_desc(),
out.data<T>(),
workspace_data_.data<uint8_t>(),
workspace_size,
rnn.weight_desc(),
weight_grad_data,
const_cast<uint8_t *>(reserve_data),
reserve_size));
}
#else
PADDLE_THROW(phi::errors::Unavailable(
......
......@@ -42,81 +42,78 @@ void RNNInferece(bool has_seq_length,
// This interface is used when the input/output is unpadded.
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenRNNForwardInference(
handle,
rnn->rnn_desc(),
seq_length,
rnn->x_descs(),
x_data,
rnn->init_h_desc(),
init_h_data,
rnn->init_c_desc(),
init_c_data,
rnn->weight_desc(),
w_data,
rnn->y_descs(),
out_data,
rnn->last_h_desc(),
last_h_data,
rnn->last_c_desc(),
last_c_data,
workspace_data->data<uint8_t>(),
workspace_size));
phi::dynload::miopenRNNForwardInference(handle,
rnn->rnn_desc(),
seq_length,
rnn->x_descs(),
x_data,
rnn->init_h_desc(),
init_h_data,
rnn->init_c_desc(),
init_c_data,
rnn->weight_desc(),
w_data,
rnn->y_descs(),
out_data,
rnn->last_h_desc(),
last_h_data,
rnn->last_c_desc(),
last_c_data,
workspace_data->data<uint8_t>(),
workspace_size));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnRNNForwardInference(
handle,
rnn->rnn_desc(),
seq_length,
rnn->x_descs(),
x_data,
rnn->init_h_desc(),
init_h_data,
rnn->init_c_desc(),
init_c_data,
rnn->weight_desc(),
w_data,
rnn->y_descs(),
out_data,
rnn->last_h_desc(),
last_h_data,
rnn->last_c_desc(),
last_c_data,
workspace_data->data<uint8_t>(),
workspace_size));
phi::dynload::cudnnRNNForwardInference(handle,
rnn->rnn_desc(),
seq_length,
rnn->x_descs(),
x_data,
rnn->init_h_desc(),
init_h_data,
rnn->init_c_desc(),
init_c_data,
rnn->weight_desc(),
w_data,
rnn->y_descs(),
out_data,
rnn->last_h_desc(),
last_h_data,
rnn->last_c_desc(),
last_c_data,
workspace_data->data<uint8_t>(),
workspace_size));
#endif
} else {
#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION >= 7201
// for inference
// This interface is used when the input/output is padded.
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnRNNForwardInferenceEx(
handle,
rnn->rnn_desc(),
rnn->x_seq_desc(),
x_data,
rnn->init_h_desc(),
init_h_data,
rnn->init_c_desc(),
init_c_data,
rnn->weight_desc(),
w_data,
rnn->y_seq_desc(),
out_data,
rnn->last_h_desc(),
last_h_data,
rnn->last_c_desc(),
last_c_data,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
workspace_data->data<uint8_t>(),
workspace_size));
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNForwardInferenceEx(
handle,
rnn->rnn_desc(),
rnn->x_seq_desc(),
x_data,
rnn->init_h_desc(),
init_h_data,
rnn->init_c_desc(),
init_c_data,
rnn->weight_desc(),
w_data,
rnn->y_seq_desc(),
out_data,
rnn->last_h_desc(),
last_h_data,
rnn->last_c_desc(),
last_c_data,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
workspace_data->data<uint8_t>(),
workspace_size));
#else
// CUDNN VERSION has to >=7.2.1
PADDLE_THROW(phi::errors::Unavailable(
......@@ -310,88 +307,85 @@ void RnnKernel(const Context &dev_ctx,
// for train
// This interface is used when the input/output is unpadded.
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenRNNForwardTraining(
handle,
rnn.rnn_desc(),
seq_length,
rnn.x_descs(),
x_data,
rnn.init_h_desc(),
init_h_data,
rnn.init_c_desc(),
init_c_data,
rnn.weight_desc(),
w_data,
rnn.y_descs(),
out_data,
rnn.last_h_desc(),
last_h_data,
rnn.last_c_desc(),
last_c_data,
workspace_data_.data<uint8_t>(),
workspace_size,
reserve_data,
reserve_size));
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenRNNForwardTraining(
handle,
rnn.rnn_desc(),
seq_length,
rnn.x_descs(),
x_data,
rnn.init_h_desc(),
init_h_data,
rnn.init_c_desc(),
init_c_data,
rnn.weight_desc(),
w_data,
rnn.y_descs(),
out_data,
rnn.last_h_desc(),
last_h_data,
rnn.last_c_desc(),
last_c_data,
workspace_data_.data<uint8_t>(),
workspace_size,
reserve_data,
reserve_size));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnRNNForwardTraining(
handle,
rnn.rnn_desc(),
seq_length,
rnn.x_descs(),
x_data,
rnn.init_h_desc(),
init_h_data,
rnn.init_c_desc(),
init_c_data,
rnn.weight_desc(),
w_data,
rnn.y_descs(),
out_data,
rnn.last_h_desc(),
last_h_data,
rnn.last_c_desc(),
last_c_data,
workspace_data_.data<uint8_t>(),
workspace_size,
reserve_data,
reserve_size));
phi::dynload::cudnnRNNForwardTraining(handle,
rnn.rnn_desc(),
seq_length,
rnn.x_descs(),
x_data,
rnn.init_h_desc(),
init_h_data,
rnn.init_c_desc(),
init_c_data,
rnn.weight_desc(),
w_data,
rnn.y_descs(),
out_data,
rnn.last_h_desc(),
last_h_data,
rnn.last_c_desc(),
last_c_data,
workspace_data_.data<uint8_t>(),
workspace_size,
reserve_data,
reserve_size));
#endif
} else {
#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION >= 7201
// for train
// This interface is used when the input/output is padded.
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnRNNForwardTrainingEx(
handle,
rnn.rnn_desc(),
rnn.x_seq_desc(),
x_data,
rnn.init_h_desc(),
init_h_data,
rnn.init_c_desc(),
init_c_data,
rnn.weight_desc(),
w_data,
rnn.y_seq_desc(),
out_data,
rnn.last_h_desc(),
last_h_data,
rnn.last_c_desc(),
last_c_data,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
workspace_data_.data<uint8_t>(),
workspace_size,
reserve_data,
reserve_size));
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNForwardTrainingEx(
handle,
rnn.rnn_desc(),
rnn.x_seq_desc(),
x_data,
rnn.init_h_desc(),
init_h_data,
rnn.init_c_desc(),
init_c_data,
rnn.weight_desc(),
w_data,
rnn.y_seq_desc(),
out_data,
rnn.last_h_desc(),
last_h_data,
rnn.last_c_desc(),
last_c_data,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
workspace_data_.data<uint8_t>(),
workspace_size,
reserve_data,
reserve_size));
#else
PADDLE_THROW(phi::errors::Unavailable(
"The padded input is supported by "
......
......@@ -83,7 +83,7 @@ void SyncBatchNormKernel(const Context &ctx,
var_data = variance.template data<BatchNormParamType<T>>();
} else {
// x, x^2, 1, here 1 is used to calc device num
// device num also can be got from platform::DeviceContextPool
// device num also can be got from phi::DeviceContextPool
const int bytes = (C * 2 + 1) * sizeof(BatchNormParamType<T>);
alloc_ptr = phi::memory_utils::Alloc(
ctx.GetPlace(),
......@@ -111,14 +111,14 @@ void SyncBatchNormKernel(const Context &ctx,
int dtype = paddle::platform::ToNCCLDataType(
paddle::framework::TransToProtoVarType(mean_out->dtype()));
// In-place operation
PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::ncclAllReduce(
stats,
stats,
2 * C + 1,
static_cast<ncclDataType_t>(dtype),
ncclSum,
comm,
stream));
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::ncclAllReduce(stats,
stats,
2 * C + 1,
static_cast<ncclDataType_t>(dtype),
ncclSum,
comm,
stream));
VLOG(3) << "Sync result using all reduce";
}
#endif
......
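Every relocated call above stays wrapped in PADDLE_ENFORCE_GPU_SUCCESS; only the namespace of the wrapped symbol changes. As a minimal, self-contained illustration of that control flow (the real macro also decodes the device error string; the macro and the collective below are stand-ins, not Paddle's or NCCL's API):

#include <cstdio>
#include <cstdlib>

// Abort with a readable message when a status-returning call fails; this only
// mirrors the shape of PADDLE_ENFORCE_GPU_SUCCESS, not its error formatting.
#define DEMO_ENFORCE_SUCCESS(expr)                                          \
  do {                                                                      \
    const int status__ = (expr);                                            \
    if (status__ != 0) {                                                    \
      std::fprintf(stderr, "%s failed with status %d\n", #expr, status__);  \
      std::abort();                                                         \
    }                                                                       \
  } while (0)

// Pretend collective that returns 0 on success, standing in for a call such
// as phi::dynload::ncclAllReduce in the hunk above.
inline int fake_all_reduce(float* /*buf*/, int /*count*/) { return 0; }

int main() {
  float stats[3] = {1.f, 2.f, 3.f};
  DEMO_ENFORCE_SUCCESS(fake_all_reduce(stats, 3));
  std::puts("all-reduce wrapper returned success");
  return 0;
}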
......@@ -31,7 +31,7 @@ void TakeAlongAxisGradKernel(const Context& dev_ctx,
int axis,
DenseTensor* x_grad) {
PADDLE_ENFORCE_EQ(
paddle::platform::is_gpu_place(dev_ctx.GetPlace()),
dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU,
true,
errors::PreconditionNotMet("This kernel only runs on GPU."));
......
......@@ -29,7 +29,7 @@ void TakeAlongAxisKernel(const Context& dev_ctx,
int axis,
DenseTensor* out) {
PADDLE_ENFORCE_EQ(
paddle::platform::is_gpu_place(dev_ctx.GetPlace()),
dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU,
true,
errors::PreconditionNotMet("This kernel only runs on GPU device."));
......
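The TakeAlongAxis hunks above replace the paddle::platform::is_gpu_place helper with a direct comparison on the place's allocation type. A self-contained stand-in (not Paddle's actual types) showing that the two forms check the same thing:

#include <cassert>

namespace demo {
// Stand-in for phi::AllocationType.
enum class AllocationType { CPU, GPU, XPU };

// Stand-in for phi::Place: only the GetType() accessor matters here.
struct Place {
  AllocationType type;
  AllocationType GetType() const { return type; }
};

// Old-style free predicate, as paddle::platform::is_gpu_place was used.
inline bool is_gpu_place(const Place& p) {
  return p.GetType() == AllocationType::GPU;
}
}  // namespace demo

int main() {
  demo::Place gpu_place{demo::AllocationType::GPU};
  // New form used in the diff: compare the allocation type directly.
  assert(gpu_place.GetType() == demo::AllocationType::GPU);
  // Old form, kept only to show the check is unchanged in meaning.
  assert(demo::is_gpu_place(gpu_place));
  return 0;
}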
......@@ -97,7 +97,7 @@ void TriangularSolveKernel(const Context& dev_ctx,
memory_utils::Copy(dev_ctx.GetPlace(),
tmp_gpu_ptrs_data->ptr(),
paddle::platform::CPUPlace(),
phi::CPUPlace(),
static_cast<void*>(cpu_ptrs.data()),
cpu_ptrs.size() * sizeof(T*),
dev_ctx.stream());
......
......@@ -15,7 +15,7 @@
#ifndef PADDLE_WITH_HIP
#include "paddle/phi/kernels/affine_grid_grad_kernel.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_device_function.h"
#include "paddle/phi/backends/gpu/gpu_dnn.h"
......@@ -35,7 +35,7 @@ void AffineGridGradCudnnKernel(const Context& dev_ctx,
bool align_corners,
DenseTensor* input_grad) {
PADDLE_ENFORCE_EQ(
paddle::platform::is_gpu_place(dev_ctx.GetPlace()),
dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU,
true,
phi::errors::InvalidArgument(
"Only support for CUDAPlace.Please switch your context from "
......@@ -58,9 +58,8 @@ void AffineGridGradCudnnKernel(const Context& dev_ctx,
const T* output_grad_data = output_grad.data<T>();
T* theta_grad_data = dev_ctx.template Alloc<T>(theta_grad);
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnSpatialTfGridGeneratorBackward(
handle, cudnn_st_desc, output_grad_data, theta_grad_data));
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSpatialTfGridGeneratorBackward(
handle, cudnn_st_desc, output_grad_data, theta_grad_data));
}
} // namespace phi
......
......@@ -15,7 +15,6 @@
#ifndef PADDLE_WITH_HIP
#include "paddle/phi/kernels/affine_grid_kernel.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_device_function.h"
#include "paddle/phi/backends/gpu/gpu_dnn.h"
......@@ -35,7 +34,7 @@ void AffineGridCudnnKernel(const Context& dev_ctx,
bool align_corners,
DenseTensor* output) {
PADDLE_ENFORCE_EQ(
paddle::platform::is_gpu_place(dev_ctx.GetPlace()),
dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU,
true,
phi::errors::InvalidArgument(
"Only support for CUDAPlace.Please switch your context from "
......@@ -56,12 +55,11 @@ void AffineGridCudnnKernel(const Context& dev_ctx,
cudnnSpatialTransformerDescriptor_t cudnn_st_desc =
st_desc.descriptor<T>(4, h_size_data);
PADDLE_ENFORCE_EQ(
paddle::platform::dynload::cudnnSpatialTfGridGeneratorForward(
handle, cudnn_st_desc, theta_data, output_data),
0,
phi::errors::Fatal("Some errors has occurred "
"during forward computation in cudnn."));
PADDLE_ENFORCE_EQ(phi::dynload::cudnnSpatialTfGridGeneratorForward(
handle, cudnn_st_desc, theta_data, output_data),
0,
phi::errors::Fatal("Some errors has occurred "
"during forward computation in cudnn."));
}
} // namespace phi
......
......@@ -20,6 +20,7 @@ limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/autotune/cache.h"
......
......@@ -14,6 +14,7 @@
#include "paddle/phi/kernels/conv_grad_kernel.h"
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
......@@ -161,7 +162,7 @@ void ConvCudnnGradKernelImplV7(
padding_common,
strides,
dilations,
paddle::platform::AllowTF32Cudnn(),
phi::AllowTF32Cudnn(),
c_groups);
#ifdef PADDLE_WITH_HIP
......@@ -188,7 +189,7 @@ void ConvCudnnGradKernelImplV7(
padding_common,
strides,
dilations,
paddle::platform::AllowTF32Cudnn(),
phi::AllowTF32Cudnn(),
c_groups);
#ifdef PADDLE_WITH_HIP
using search2 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
......@@ -227,39 +228,38 @@ void ConvCudnnGradKernelImplV7(
workspace_handle.RunFunc(
[&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionBackwardData(
handle,
&alpha,
args1.odesc.desc(),
output_grad_data,
args1.wdesc.desc(),
filter_data,
args1.cdesc.desc(),
bwd_result.algo,
&beta,
args1.idesc.desc(),
temp_tensor_data,
cudnn_workspace_ptr,
workspace_size));
phi::dynload::miopenConvolutionBackwardData(handle,
&alpha,
args1.odesc.desc(),
output_grad_data,
args1.wdesc.desc(),
filter_data,
args1.cdesc.desc(),
bwd_result.algo,
&beta,
args1.idesc.desc(),
temp_tensor_data,
cudnn_workspace_ptr,
workspace_size));
},
workspace_size);
PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::miopenOpTensor(
handle,
miopenTensorOpAdd,
&alpha,
args1.idesc.desc(),
transformed_input_grad_data,
&alpha,
args1.idesc.desc(),
temp_tensor_data,
&beta,
args1.idesc.desc(),
transformed_input_grad_data));
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::miopenOpTensor(handle,
miopenTensorOpAdd,
&alpha,
args1.idesc.desc(),
transformed_input_grad_data,
&alpha,
args1.idesc.desc(),
temp_tensor_data,
&beta,
args1.idesc.desc(),
transformed_input_grad_data));
} else {
workspace_handle.RunFunc(
[&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionBackwardData(
phi::dynload::miopenConvolutionBackwardData(
handle,
&alpha,
args1.odesc.desc(),
......@@ -300,7 +300,7 @@ void ConvCudnnGradKernelImplV7(
workspace_handle.RunFunc(
[&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionBackwardWeights(
phi::dynload::miopenConvolutionBackwardWeights(
handle,
&alpha,
args2.odesc.desc(),
......@@ -356,7 +356,7 @@ void ConvCudnnGradKernelImplV8(
PADDLE_ENFORCE_EQ(
groups,
1,
paddle::platform::errors::Unimplemented(
phi::errors::Unimplemented(
"Group concolution using CUDNNv8 API is unsupported for now"));
cudnnHandle_t handle = const_cast<cudnnHandle_t>(ctx.cudnn_handle());
......@@ -1082,7 +1082,7 @@ void ConvCudnnGradGradKernel(
padding_common,
strides,
dilations,
paddle::platform::AllowTF32Cudnn(),
phi::AllowTF32Cudnn(),
c_group);
#ifdef PADDLE_WITH_HIP
......@@ -1106,7 +1106,7 @@ void ConvCudnnGradGradKernel(
padding_common,
strides,
dilations,
paddle::platform::AllowTF32Cudnn(),
phi::AllowTF32Cudnn(),
c_group);
#ifdef PADDLE_WITH_HIP
......@@ -1133,7 +1133,7 @@ void ConvCudnnGradGradKernel(
padding_common,
strides,
dilations,
paddle::platform::AllowTF32Cudnn(),
phi::AllowTF32Cudnn(),
c_group);
#ifdef PADDLE_WITH_HIP
......@@ -1160,7 +1160,7 @@ void ConvCudnnGradGradKernel(
padding_common,
strides,
dilations,
paddle::platform::AllowTF32Cudnn(),
phi::AllowTF32Cudnn(),
c_group);
#ifdef PADDLE_WITH_HIP
......@@ -1210,20 +1210,19 @@ void ConvCudnnGradGradKernel(
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionForward(
handle,
&alpha,
args1.idesc.desc(),
ddx,
args1.wdesc.desc(),
w,
args1.cdesc.desc(),
fwd_result1.algo,
&beta,
args1.odesc.desc(),
transformed_ddy_channel,
workspace_ptr,
workspace_size));
phi::dynload::miopenConvolutionForward(handle,
&alpha,
args1.idesc.desc(),
ddx,
args1.wdesc.desc(),
w,
args1.cdesc.desc(),
fwd_result1.algo,
&beta,
args1.odesc.desc(),
transformed_ddy_channel,
workspace_ptr,
workspace_size));
},
workspace_size);
#else
......@@ -1248,20 +1247,19 @@ void ConvCudnnGradGradKernel(
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionForward(
handle,
&alpha,
args2.idesc.desc(),
x,
args2.wdesc.desc(),
ddw,
args2.cdesc.desc(),
fwd_result2.algo,
&beta,
args2.odesc.desc(),
transformed_ddy_channel,
workspace_ptr,
workspace_size));
phi::dynload::miopenConvolutionForward(handle,
&alpha,
args2.idesc.desc(),
x,
args2.wdesc.desc(),
ddw,
args2.cdesc.desc(),
fwd_result2.algo,
&beta,
args2.odesc.desc(),
transformed_ddy_channel,
workspace_ptr,
workspace_size));
},
workspace_size);
#else
......@@ -1291,7 +1289,7 @@ void ConvCudnnGradGradKernel(
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionBackwardWeights(
phi::dynload::miopenConvolutionBackwardWeights(
handle,
&alpha,
args3.odesc.desc(),
......@@ -1330,7 +1328,7 @@ void ConvCudnnGradGradKernel(
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionBackwardData(
phi::dynload::miopenConvolutionBackwardData(
handle,
&alpha,
args4.odesc.desc(),
......
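In the conv kernels above, the recurring change is that the TF32 switch is now read through phi::AllowTF32Cudnn() before being handed to the convolution-descriptor setter. A hypothetical, self-contained sketch of that flag threading (the descriptor and flag below are stand-ins, not cuDNN's or Paddle's types):

#include <cstdio>

namespace demo {
enum class MathType { kTensorOpAllowTF32, kFMAOnly };

// Stand-in for the cdesc.set(...) calls in the diff: the boolean decides
// which math mode the convolution descriptor advertises.
struct ConvDesc {
  MathType math = MathType::kFMAOnly;
  void set(bool allow_tf32) {
    math = allow_tf32 ? MathType::kTensorOpAllowTF32 : MathType::kFMAOnly;
  }
};

// Stand-in for phi::AllowTF32Cudnn(), which reads a process-wide switch.
inline bool AllowTF32Cudnn() { return true; }
}  // namespace demo

int main() {
  demo::ConvDesc cdesc;
  cdesc.set(demo::AllowTF32Cudnn());
  std::printf("tf32 math selected: %s\n",
              cdesc.math == demo::MathType::kTensorOpAllowTF32 ? "yes" : "no");
  return 0;
}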
......@@ -14,6 +14,7 @@
#include "paddle/phi/kernels/conv_kernel.h"
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
......@@ -79,18 +80,11 @@ void ConvCudnnKernelImplV7(const DenseTensor* transformed_input,
#ifdef PADDLE_WITH_HIP
// MIOPEN need to set groups in cdesc in miopen_desc.h
args.cdesc.set(dtype,
padding_common,
strides,
dilations,
paddle::platform::AllowTF32Cudnn(),
groups);
args.cdesc.set(
dtype, padding_common, strides, dilations, phi::AllowTF32Cudnn(), groups);
#else
args.cdesc.set(dtype,
padding_common,
strides,
dilations,
paddle::platform::AllowTF32Cudnn());
args.cdesc.set(
dtype, padding_common, strides, dilations, phi::AllowTF32Cudnn());
#endif
#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1)
......@@ -98,8 +92,7 @@ void ConvCudnnKernelImplV7(const DenseTensor* transformed_input,
// FIXME(typhoonzero): find a better way to disable groups
// rather than setting it to 1.
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnSetConvolutionGroupCount(
args.cdesc.desc(), groups));
phi::dynload::cudnnSetConvolutionGroupCount(args.cdesc.desc(), groups));
groups = 1;
#endif
#ifdef PADDLE_WITH_HIP
......@@ -185,20 +178,19 @@ void ConvCudnnKernelImplV7(const DenseTensor* transformed_input,
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionForward(
handle,
&alpha,
args.idesc.desc(),
input_data,
args.wdesc.desc(),
filter_data,
args.cdesc.desc(),
fwd_result.algo,
&beta,
args.odesc.desc(),
output_data,
workspace_ptr,
workspace_size));
phi::dynload::miopenConvolutionForward(handle,
&alpha,
args.idesc.desc(),
input_data,
args.wdesc.desc(),
filter_data,
args.cdesc.desc(),
fwd_result.algo,
&beta,
args.odesc.desc(),
output_data,
workspace_ptr,
workspace_size));
},
workspace_size);
#else
......@@ -237,7 +229,7 @@ void ConvCudnnKernelImplV8(const DenseTensor* input_tensor,
PADDLE_ENFORCE_EQ(
groups,
1,
paddle::platform::errors::Unimplemented(
phi::errors::Unimplemented(
"Group concolution using CUDNNv8 API unsupported for now"));
T* input_data = const_cast<T*>(input_tensor->data<T>());
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include <algorithm>
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/dynload/cudnn.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/ddim.h"
......@@ -219,7 +220,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
padding_common,
strides,
dilations_,
paddle::platform::AllowTF32Cudnn(),
phi::AllowTF32Cudnn(),
c_groups);
#ifdef PADDLE_WITH_HIP
using search1 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
......@@ -244,7 +245,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
padding_common,
strides,
dilations_,
paddle::platform::AllowTF32Cudnn(),
phi::AllowTF32Cudnn(),
c_groups);
#ifdef PADDLE_WITH_HIP
using search2 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
......@@ -691,7 +692,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
padding_common,
strides,
dilations_,
paddle::platform::AllowTF32Cudnn(),
phi::AllowTF32Cudnn(),
c_group);
#ifdef PADDLE_WITH_HIP
using search1 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
......@@ -713,7 +714,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
padding_common,
strides,
dilations_,
paddle::platform::AllowTF32Cudnn(),
phi::AllowTF32Cudnn(),
c_group);
#ifdef PADDLE_WITH_HIP
using search2 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
......@@ -738,7 +739,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
padding_common,
strides,
dilations_,
paddle::platform::AllowTF32Cudnn(),
phi::AllowTF32Cudnn(),
c_group);
#ifdef PADDLE_WITH_HIP
using search3 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
......@@ -764,7 +765,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
padding_common,
strides,
dilations_,
paddle::platform::AllowTF32Cudnn(),
phi::AllowTF32Cudnn(),
c_group);
#ifdef PADDLE_WITH_HIP
using search4 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include <algorithm>
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/dynload/cudnn.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/ddim.h"
......@@ -216,7 +217,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
padding_common,
strides,
dilations_,
paddle::platform::AllowTF32Cudnn(),
phi::AllowTF32Cudnn(),
c_groups);
#ifdef PADDLE_WITH_HIP
......
This diff has been collapsed.