Unverified commit a821c4a9, authored by W Wilber, committed by GitHub

[PTEN] Add Gpu context (#39305)

Parent dcff7fa8
@@ -33,7 +33,7 @@ namespace distributed {
template <typename T>
inline paddle::operators::math::BlasT<paddle::platform::CPUDeviceContext, T>
GetBlas() {
-  auto cpu_ctx = paddle::platform::CPUDeviceContext();
+  paddle::platform::CPUDeviceContext cpu_ctx;
  return paddle::operators::math::GetBlas<paddle::platform::CPUDeviceContext,
                                          T>(cpu_ctx);
}
...
@@ -1155,7 +1155,7 @@ void GeoCommunicator::SendDense(const CommContext &send_ctx) {
  auto &t_latest = var_latest->Get<framework::LoDTensor>();
  auto t_timestamp = var_timestamp->GetMutable<framework::LoDTensor>();
-  auto cpu_ctx = paddle::platform::CPUDeviceContext();
+  paddle::platform::CPUDeviceContext cpu_ctx;
  auto *var_delta = delta_scope_->Var(varname);
  auto *t_delta = var_delta->GetMutable<framework::LoDTensor>();
  t_delta->mutable_data<float>(t_latest.dims(), cpu_ctx.GetPlace());
@@ -1185,7 +1185,7 @@ void GeoCommunicator::RecvDense(const CommContext &send_ctx) {
  RpcRecvDense(varnames, table_id, pserver_scope_.get());
  // 2.1 pserver - old => delta; 2.2 latest + old => latest 2.3 old => pserver
-  auto cpu_ctx = paddle::platform::CPUDeviceContext();
+  paddle::platform::CPUDeviceContext cpu_ctx;
  for (auto &varname : varnames) {
    auto *var_latest = recv_scope_->FindVar(varname);
    auto t_latest = var_latest->GetMutable<framework::LoDTensor>();
@@ -1292,7 +1292,7 @@ void GeoCommunicator::SendSparse(const std::string &varname,
  auto *t_old = var_old->GetMutable<framework::LoDTensor>();
  auto dims1 = t_latest.dims()[1];
-  auto cpu_ctx = paddle::platform::CPUDeviceContext();
+  paddle::platform::CPUDeviceContext cpu_ctx;
  auto *var_delta = delta_scope_->Var(varname);
  auto *t_delta = var_delta->GetMutable<pten::SelectedRows>();
@@ -1370,7 +1370,7 @@ void GeoCommunicator::RecvSparse(const std::string &varname, int table_id,
  std::vector<float> v_delta;
  v_delta.resize(numel);
-  auto cpu_ctx = paddle::platform::CPUDeviceContext();
+  paddle::platform::CPUDeviceContext cpu_ctx;
  auto blas =
      paddle::operators::math::GetBlas<platform::CPUDeviceContext, float>(
          cpu_ctx);
...
@@ -179,7 +179,7 @@ inline void MergeVars(const std::string &var_name,
  }
  // set output tensor to 0.
-  auto cpu_ctx = paddle::platform::CPUDeviceContext();
+  paddle::platform::CPUDeviceContext cpu_ctx;
  paddle::operators::math::SetConstant<paddle::platform::CPUDeviceContext, T>
      constant_functor;
  constant_functor(cpu_ctx, out_t, static_cast<T>(0));
@@ -204,7 +204,7 @@ inline void MergeVars(const std::string &var_name,
    for (auto &var : vars) {
      inputs.push_back(&var->Get<pten::SelectedRows>());
    }
-    auto dev_ctx = paddle::platform::CPUDeviceContext();
+    paddle::platform::CPUDeviceContext dev_ctx;
    if (merge_add) {
      paddle::operators::math::scatter::MergeAdd<
          paddle::platform::CPUDeviceContext, T>
...
@@ -21,7 +21,10 @@ TEST(DataTypeTransform, GPUTransform) {
  auto cpu_place = paddle::platform::CPUPlace();
  auto gpu_place = paddle::platform::CUDAPlace(0);
  paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
  auto kernel_fp16 = paddle::framework::OpKernelType(
      paddle::framework::proto::VarType::FP16, gpu_place,
      paddle::framework::DataLayout::kAnyLayout,
...
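The hunk above, and the test and context hunks that follow, all add the same setup this commit now requires when a CUDADeviceContext is constructed directly (outside the DeviceContextPool): the context must be handed its allocators and then finish initialization through PartialInitWithAllocator(). A minimal sketch of that pattern, using only calls that appear in this diff; the helper name itself is hypothetical:

// Minimal sketch, assuming the allocator_facade.h and device_context.h headers
// included elsewhere in this diff; InitTestCudaContext is a hypothetical name.
void InitTestCudaContext(paddle::platform::CUDADeviceContext* ctx,
                         const paddle::platform::CUDAPlace& place) {
  auto& facade = paddle::memory::allocation::AllocatorFacade::Instance();
  // Device allocations are tied to the context's stream.
  ctx->SetAllocator(facade.GetAllocator(place, ctx->stream()).get());
  // Contexts that also allocate host-side or zero-size buffers additionally
  // wire up these two (see the GLOO, TensorRT and malloc-test hunks below).
  ctx->SetHostAllocator(
      facade.GetAllocator(paddle::platform::CPUPlace()).get());
  ctx->SetZeroAllocator(facade.GetZeroAllocator(place).get());
  // Complete the partial initialization once the allocators are set.
  ctx->PartialInitWithAllocator();
}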
@@ -1361,7 +1361,7 @@ void ParallelExecutor::PrepareNCCLCommunicator(Scope *global_scope) {
      auto *dev_ctx = static_cast<platform::XPUDeviceContext *>(
          pool.Get(member_->places_[dev_id]));
      auto &bkcl_ctx = bkcl_ctxs->at(member_->places_[dev_id]);
-      dev_ctx->set_bkcl_context(bkcl_ctx.comm());
+      dev_ctx->SetBkclContext(bkcl_ctx.comm());
    }
#else
    PADDLE_THROW(
...
@@ -77,6 +77,13 @@ struct ConvertToPtenContext<platform::CPUDeviceContext> {
  using TYPE = pten::CPUContext;
};
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <>
struct ConvertToPtenContext<platform::CUDADeviceContext> {
using TYPE = pten::GPUContext;
};
#endif
#ifdef PADDLE_WITH_XPU
template <>
struct ConvertToPtenContext<platform::XPUDeviceContext> {
...
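A brief sketch of how the new ConvertToPtenContext<platform::CUDADeviceContext> specialization above can be used at a call site. The wrapper below is hypothetical and assumes the trait is visible in the enclosing namespace; the static_cast mirrors the elementwise_mul change later in this commit, which relies on CUDADeviceContext being viewable as a pten::GPUContext.

// Hypothetical call-site sketch: map a framework device context to its pten
// counterpart via the trait added above, then hand it to a pten kernel.
template <typename DeviceContext>
void RunWithPtenContext(const DeviceContext& dev_ctx) {
  using PtenCtx = typename ConvertToPtenContext<DeviceContext>::TYPE;
  const auto& pten_ctx = static_cast<const PtenCtx&>(dev_ctx);
  (void)pten_ctx;  // pass pten_ctx as the first argument of a pten kernel
}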
@@ -1085,7 +1085,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
    is.seekg(seekg, is.cur);
    void* buf;
-    auto ctx = platform::CPUDeviceContext();
+    platform::CPUDeviceContext ctx;
    size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
    if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
        platform::is_xpu_place(dev_ctx.GetPlace()) ||
@@ -1155,7 +1155,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
  std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
  tensor->Resize(framework::make_ddim(dims));
  void* buf;
-  auto ctx = platform::CPUDeviceContext();
+  platform::CPUDeviceContext ctx;
  size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
  if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
      platform::is_xpu_place(dev_ctx.GetPlace()) ||
@@ -1432,4 +1432,4 @@ std::ostream& operator<<(std::ostream& os, const pten::DenseTensor& t) {
  VLOG(1) << "PrintVar: unrecognized data type:" << t.type();
  return os;
}
-}
+}  // namespace pten
@@ -73,6 +73,10 @@ TEST(TensorCopy, Tensor) {
    // CPU Tensor to GPU Tensor
    auto gpu_place = new platform::CUDAPlace(0);
    platform::CUDADeviceContext gpu_ctx(*gpu_place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*gpu_place, gpu_ctx.stream())
.get());
gpu_ctx.PartialInitWithAllocator();
    TensorCopy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
    // GPU Tensor to CPU Tensor
@@ -166,6 +170,10 @@ TEST(TensorFromVector, Tensor) {
    gpu_tensor.Resize(paddle::framework::make_ddim({3, 3}));
    auto gpu_place = new paddle::platform::CUDAPlace();
    paddle::platform::CUDADeviceContext gpu_ctx(*gpu_place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*gpu_place, gpu_ctx.stream())
.get());
gpu_ctx.PartialInitWithAllocator();
    paddle::framework::TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
    // Copy from GPU to CPU tensor for comparison
    paddle::framework::TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
@@ -230,6 +238,10 @@ TEST(TensorToVector, Tensor) {
    paddle::framework::Tensor gpu_tensor;
    paddle::platform::CUDAPlace place;
    paddle::platform::CUDADeviceContext gpu_ctx(place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, gpu_ctx.stream())
.get());
gpu_ctx.PartialInitWithAllocator();
    paddle::framework::TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
    std::vector<int> dst;
@@ -267,6 +279,10 @@ TEST(TensorToVector, Tensor_bool) {
    paddle::framework::Tensor gpu_tensor;
    paddle::platform::CUDAPlace place;
    paddle::platform::CUDADeviceContext gpu_ctx(place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, gpu_ctx.stream())
.get());
gpu_ctx.PartialInitWithAllocator();
    paddle::framework::TensorFromVector<bool>(src_vec, gpu_ctx, &gpu_tensor);
    std::vector<bool> dst;
@@ -493,6 +509,10 @@ TEST(Tensor, FromAndToStream) {
    auto gpu_place = new platform::CUDAPlace();
    platform::CUDADeviceContext gpu_ctx(*gpu_place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*gpu_place, gpu_ctx.stream())
.get());
gpu_ctx.PartialInitWithAllocator();
    TensorCopy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
...
@@ -46,6 +46,17 @@ void GLOOParallelContext::Init() {
  gloo_wrapper->Init();
  device_ = std::unique_ptr<platform::CPUDeviceContext>(
      new platform::CPUDeviceContext(platform::CPUPlace()));
device_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(platform::CPUPlace())
.get());
device_->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
device_->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(platform::CPUPlace())
.get());
}
void GLOOParallelContext::InitWithRingID(int ring_id) {
...
@@ -77,6 +77,10 @@ void make_fake_model(std::string* model, std::string* param) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  platform::CUDAPlace place;
  platform::CUDADeviceContext ctx(place);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
#else
  platform::CPUPlace place;
  platform::CPUDeviceContext ctx(place);
...
@@ -27,6 +27,18 @@ class TensorRTEngineTest : public ::testing::Test {
 protected:
  void SetUp() override {
    ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0));
ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(platform::CUDAPlace(0), ctx_->stream())
.get());
ctx_->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
ctx_->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(platform::CUDAPlace(0))
.get());
ctx_->PartialInitWithAllocator();
    engine_ = new TensorRTEngine(10, 1 << 10);
    engine_->InitNetwork();
...
@@ -18,6 +18,7 @@
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/allocation/best_fit_allocator.h" #include "paddle/fluid/memory/allocation/best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cuda_allocator.h" #include "paddle/fluid/memory/allocation/cuda_allocator.h"
#include "paddle/fluid/memory/allocation/locked_allocator.h" #include "paddle/fluid/memory/allocation/locked_allocator.h"
...@@ -44,6 +45,10 @@ TEST(BestFitAllocator, concurrent_cuda) { ...@@ -44,6 +45,10 @@ TEST(BestFitAllocator, concurrent_cuda) {
platform::CUDAPlace gpu(0); platform::CUDAPlace gpu(0);
platform::CUDADeviceContext dev_ctx(gpu); platform::CUDADeviceContext dev_ctx(gpu);
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu, dev_ctx.stream())
.get());
dev_ctx.PartialInitWithAllocator();
  auto th_main = [&](std::random_device::result_type seed) {
    std::default_random_engine engine(seed);
...
@@ -25,6 +25,7 @@
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
...@@ -105,8 +106,21 @@ TEST(Malloc, CUDADeviceContextMultiStream) { ...@@ -105,8 +106,21 @@ TEST(Malloc, CUDADeviceContextMultiStream) {
main_stream_alloc_ptr.reset(); main_stream_alloc_ptr.reset();
for (int i = 0; i < NUM_STREAMS; ++i) { for (int i = 0; i < NUM_STREAMS; ++i) {
dev_ctx.push_back(std::unique_ptr<platform::CUDADeviceContext>( auto ctx = std::unique_ptr<platform::CUDADeviceContext>(
new platform::CUDADeviceContext(place))); new platform::CUDADeviceContext(place));
ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx->stream())
.get());
ctx->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
ctx->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place)
.get());
ctx->PartialInitWithAllocator();
dev_ctx.emplace_back(std::move(ctx));
    MultiStreamCompute(&data[i], &second_data[i], *dev_ctx[i]);
  }
@@ -144,8 +158,21 @@ TEST(Malloc, CUDADeviceContextMultiThreadMultiStream) {
  main_stream_alloc_ptr.reset();
  for (int i = 0; i < NUM_STREAMS; ++i) {
-    dev_ctx.push_back(std::unique_ptr<platform::CUDADeviceContext>(
-        new platform::CUDADeviceContext(place)));
+    auto ctx = std::unique_ptr<platform::CUDADeviceContext>(
+        new platform::CUDADeviceContext(place));
ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx->stream())
.get());
ctx->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
ctx->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place)
.get());
ctx->PartialInitWithAllocator();
dev_ctx.emplace_back(std::move(ctx));
    threads.push_back(std::thread(MultiStreamCompute, &data[i], &second_data[i],
                                  std::cref(*dev_ctx[i])));
  }
...
@@ -110,7 +110,7 @@ void ComputeFullArg(const platform::CUDADeviceContext& ctx, const Tensor& input,
    return block_size;
  };
-  int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize().x;
+  int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0];
  int64_t height = pre * post;
  int64_t width = n;
  int64_t grid_size = height < max_grid_dimx ? height : max_grid_dimx;
...
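The argmax, argsort, one-hot, graph and kthvalue hunks in this commit all make the same call-site change: GetCUDAMaxGridDimSize() is no longer read through .x/.y/.z fields but indexed like an array. A tiny illustrative sketch of the updated grid clamping; the wrapper itself is hypothetical:

// Illustrative only: Context stands for the CUDA device context used above.
template <typename Context>
int64_t ClampGridX(const Context& ctx, int64_t requested) {
  auto max_grid_dim = ctx.GetCUDAMaxGridDimSize();  // indexed, not .x/.y/.z
  int64_t max_grid_dimx = max_grid_dim[0];
  return requested < max_grid_dimx ? requested : max_grid_dimx;
}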
@@ -131,7 +131,7 @@ void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input,
  int block_size = ComputeBlockSize(num_cols);
-  int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x;
+  int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0];
  // actually, int num_rows < max_grid_size
  int grid_size = num_rows < maxGridDimX ? num_rows : maxGridDimX;
  // Init a index array
@@ -212,7 +212,7 @@ void ArgFullAssign(const platform::CUDADeviceContext& ctx, const Tensor* dO,
  int block_size = ComputeBlockSize(num_cols);
-  int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x;
+  int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0];
  // actually, int num_rows < max_grid_size
  int grid_size = num_rows < maxGridDimX ? num_rows : maxGridDimX;
  FillGrad<<<grid_size, block_size, 0, cu_stream>>>(
...
@@ -90,8 +90,8 @@ class CUDABroadcastTensorsGradOpKernel : public framework::OpKernel<T> {
        // reduce_sum implementation on CUDA
        auto stream = context.cuda_device_context().stream();
        TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
-           *input_tensor, output_tensor, kps::IdentityFunctor<T>(),
-           reduce_dims_vec, stream);
+           context.cuda_device_context(), *input_tensor, output_tensor,
+           kps::IdentityFunctor<T>(), reduce_dims_vec, stream);
      }
    }
  }
...
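The reduce-related hunks before and after this point all switch TensorReduceFunctorImpl to take the device context as its first argument, ahead of the input and output tensors. A minimal sketch of the new calling convention, modeled on the ReduceWrapper change later in this diff; the wrapper name here is hypothetical:

// Sketch: sum-reduce src into dst over reduce_dims with the updated signature.
template <typename T>
void ReduceSumWrapper(const platform::CUDADeviceContext& dev_ctx,
                      const framework::Tensor& src, framework::Tensor* dst,
                      const std::vector<int>& reduce_dims) {
  TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
      dev_ctx, src, dst, kps::IdentityFunctor<T>(), reduce_dims,
      dev_ctx.stream());
}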
@@ -115,7 +115,8 @@ class MatrixReduceSumFunctor<platform::CUDADeviceContext, T> {
    }
    gpuStream_t stream = ctx.cuda_device_context().stream();
    TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
-       in, out, kps::IdentityFunctor<T>(), out_reduce_dims, stream);
+       ctx.cuda_device_context(), in, out, kps::IdentityFunctor<T>(),
+       out_reduce_dims, stream);
  }
};
...
@@ -77,7 +77,7 @@ class ClipByNormKernel<platform::CUDADeviceContext, platform::float16>
          {1}, dev_ctx);
      TensorReduceFunctorImpl<platform::float16, float, kps::AddFunctor,
                              kps::SquareFunctor<platform::float16, float>>(
-         *input, &tmp, kps::SquareFunctor<platform::float16, float>(),
+         dev_ctx, *input, &tmp, kps::SquareFunctor<platform::float16, float>(),
          reduce_dims, dev_ctx.stream());
      auto tmp_eigen = EigenVector<float>::Flatten(tmp);
      auto x_norm = tmp_eigen.sqrt();
...
@@ -65,7 +65,8 @@ class CompareReduceOpKernel
      auto stream = context.cuda_device_context().stream();
      TensorReduceFunctorImpl<bool, bool, BitwiseAdd,
                              kps::IdentityFunctor<bool>>(
-         tmp, z, kps::IdentityFunctor<bool>(), reduce_dims, stream);
+         context.cuda_device_context(), tmp, z, kps::IdentityFunctor<bool>(),
+         reduce_dims, stream);
    }
  }
};
...
@@ -131,12 +131,20 @@ void Compare2(f::Scope* scope, const p::DeviceContext& ctx,
TEST(copy_cross_scope, CUDA_fp32) {
  f::Scope scope;
  p::CUDADeviceContext ctx(p::CUDAPlace(0));
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(p::CUDAPlace(0), ctx.stream())
.get());
ctx.PartialInitWithAllocator();
  Compare1<float>(&scope, ctx, "copy_cross_scope");
}
TEST(copy_cross_scope_to_main_scope, CUDA_fp32) {
  f::Scope scope;
  p::CUDADeviceContext ctx(p::CUDAPlace(0));
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(p::CUDAPlace(0), ctx.stream())
.get());
ctx.PartialInitWithAllocator();
  Compare2<float>(&scope, ctx, "copy_cross_scope");
}
#elif PADDLE_WITH_ASCEND_CL
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_mul_op.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
@@ -51,8 +52,8 @@ class ElementwiseMulKernel<platform::CUDADeviceContext, T>
      auto pt_x = paddle::experimental::MakePtenDenseTensor(*x_lod);
      auto pt_y = paddle::experimental::MakePtenDenseTensor(*y_lod);
      auto pt_z = paddle::experimental::MakePtenDenseTensor(*z_lod);
-     pten::MultiplyRawKernel<T>(cuda_ctx, *pt_x.get(), *pt_y.get(), axis,
-                                pt_z.get());
+     pten::MultiplyRawKernel<T>(static_cast<const pten::GPUContext&>(cuda_ctx),
+                                *pt_x.get(), *pt_y.get(), axis, pt_z.get());
    } else {
      PADDLE_THROW(platform::errors::InvalidArgument(
          "X's type[%s] is not supported by elementwise_op. X's type should be "
...
@@ -1189,7 +1189,8 @@ void ReduceWrapper(const platform::CUDADeviceContext &dev_ctx, int axis,
                   framework::Tensor *src, framework::Tensor *dst) {
  std::vector<int> reduce_dims = GetReduceDim(dst->dims(), src->dims(), axis);
  TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
-     *src, dst, kps::IdentityFunctor<T>(), reduce_dims, dev_ctx.stream());
+     dev_ctx, *src, dst, kps::IdentityFunctor<T>(), reduce_dims,
+     dev_ctx.stream());
}
template <ElementwiseType ET, typename T, typename Functor>
...
@@ -275,6 +275,18 @@ class TestFeedForward {
    output_size_ = 3 * num_head_ * dim_head_;
    input_size_ = dim_embed_;
    ctx_ = new platform::CUDADeviceContext(place_);
ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place_, ctx_->stream())
.get());
ctx_->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
ctx_->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place_)
.get());
ctx_->PartialInitWithAllocator();
    size_src_ = bsz_seq_ * dim_embed_;         // src: [bs, seq_len, em_dim]
    size_weight_ = dim_embed_ * output_size_;  // weight: [output_size, em_dim]
...
@@ -166,7 +166,8 @@ class AttnMatMul {
    if (support_case_1 || support_case_2) {
      gpuStream_t stream = dev_ctx_.stream();
      TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
-         *d_output, d_bias, kps::IdentityFunctor<T>(), {0, 1}, stream);
+         dev_ctx_, *d_output, d_bias, kps::IdentityFunctor<T>(), {0, 1},
+         stream);
    } else {
      PADDLE_THROW(platform::errors::InvalidArgument(
          "Only support reduce when the input dims are [0,1,2,3,4] and "
...
@@ -108,7 +108,7 @@ static bool TryLaunchFP16FastGeluFwdVectorizeCUDAKernel(
        is_aligned(y, kAlignment)) {                                          \
      size_t thread = std::min<size_t>(512, dev_ctx.GetMaxThreadsPerBlock()); \
      size_t block = (n / __vec_size + thread - 1) / thread;                  \
-     block = std::min<size_t>(block, dev_ctx.GetCUDAMaxGridDimSize().x);     \
+     block = std::min<size_t>(block, dev_ctx.GetCUDAMaxGridDimSize()[0]);    \
      VLOG(10) << "Use FP16 fast gelu fwd kernel, block = " << block          \
               << " , thread = " << thread;                                   \
      FP16FastGeluFwdCUDAKernel<                                              \
@@ -144,7 +144,7 @@ static bool TryLaunchFP16FastGeluBwdVectorizeCUDAKernel(
        is_aligned(x_g, kAlignment)) {                                        \
      size_t thread = std::min<size_t>(512, dev_ctx.GetMaxThreadsPerBlock()); \
      size_t block = (n / __vec_size + thread - 1) / thread;                  \
-     block = std::min<size_t>(block, dev_ctx.GetCUDAMaxGridDimSize().x);     \
+     block = std::min<size_t>(block, dev_ctx.GetCUDAMaxGridDimSize()[0]);    \
      VLOG(10) << "Use FP16 fast gelu bwd kernel, block = " << block          \
               << " , thread = " << thread;                                   \
      FP16FastGeluBwdCUDAKernel<                                              \
...
@@ -260,7 +260,7 @@ void FillHashTable(const framework::ExecutionContext& ctx, const T* input,
  int block = 1024;
#endif
  const auto& dev_ctx = ctx.cuda_device_context();
-  int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x;
+  int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
  int grid_tmp = (num_input + block - 1) / block;
  int grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
  // 1. Insert data into keys and values.
@@ -334,7 +334,7 @@ void ReindexFunc(const framework::ExecutionContext& ctx,
  int block = 1024;
#endif
  const auto& dev_ctx = ctx.cuda_device_context();
-  int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x;
+  int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
  int64_t grid_tmp = (outputs->size() + block - 1) / block;
  int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
  ReindexSrcOutput<
...
@@ -197,7 +197,7 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(
#endif
  int64_t n = slice_size * index_size;
  const auto& dev_ctx = ctx.cuda_device_context();
-  int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x;
+  int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
  int64_t grid_tmp = (n + block - 1) / block;
  int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
  int64_t input_size = src_dims[0];
@@ -320,7 +320,7 @@ void GraphSendRecvGradOpCUDAKernelLaunchHelper(
#endif
  int64_t n = slice_size * index_size;
  const auto& dev_ctx = ctx.cuda_device_context();
-  int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x;
+  int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
  int64_t grid_tmp = (n + block - 1) / block;
  int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
  int64_t input_size = src_dims[0];
...
@@ -92,7 +92,7 @@ struct OneHotGenerator<platform::CUDADeviceContext, T> {
  const int size_from_axis = SizeFromAxis(axis, X.dims());
  const int size_out_axis = SizeOutAxis(axis, X.dims());
  constexpr int thread_size = 512;
-  int64_t max_grid_dimx = context.GetCUDAMaxGridDimSize().x;
+  int64_t max_grid_dimx = context.GetCUDAMaxGridDimSize()[0];
  int64_t height = size_to_axis * size_out_axis;
  int block_size = height < max_grid_dimx ? height : max_grid_dimx;
...
@@ -27,10 +27,10 @@ namespace operators {
namespace {
void LimitGridDim(const framework::ExecutionContext& ctx, dim3* grid_dim) {
-  dim3 max_grid_dim = ctx.template device_context<platform::CUDADeviceContext>()
+  auto max_grid_dim = ctx.template device_context<platform::CUDADeviceContext>()
                          .GetCUDAMaxGridDimSize();
-  grid_dim->x = grid_dim->x < max_grid_dim.x ? grid_dim->x : max_grid_dim.x;
-  grid_dim->y = grid_dim->y < max_grid_dim.y ? grid_dim->y : max_grid_dim.y;
+  grid_dim->x = grid_dim->x < max_grid_dim[0] ? grid_dim->x : max_grid_dim[0];
+  grid_dim->y = grid_dim->y < max_grid_dim[1] ? grid_dim->y : max_grid_dim[1];
}
}
...
@@ -45,11 +45,11 @@ inline platform::GpuLaunchConfig GetGpuLaunchConfig3D(
  int block_y = std::min(GetLastPow2(height), max_threads / block_x);
  int block_z = std::min(num_img, max_threads / block_x / block_y);
-  dim3 max_grid_dim = context.GetCUDAMaxGridDimSize();
-  int grid_x = std::min<int>(max_grid_dim.x, platform::DivUp(width, block_x));
-  int grid_y = std::min<int>(max_grid_dim.y, platform::DivUp(height, block_y));
+  auto max_grid_dim = context.GetCUDAMaxGridDimSize();
+  int grid_x = std::min<int>(max_grid_dim[0], platform::DivUp(width, block_x));
+  int grid_y = std::min<int>(max_grid_dim[1], platform::DivUp(height, block_y));
  int grid_z =
-      std::min<int>(max_grid_dim.z, platform::DivUp(num_img, block_z * 4));
+      std::min<int>(max_grid_dim[2], platform::DivUp(num_img, block_z * 4));
  const int capability = context.GetComputeCapability();
  platform::GpuLaunchConfig config;
...
@@ -306,11 +306,11 @@ struct KronGradOpFunctor {
    auto stream = dev_ctx.stream();  // it is a cuda device_context
    if (dx) {
      TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
-         dout_x, dx, kps::IdentityFunctor<T>(), {1}, stream);
+         dev_ctx, dout_x, dx, kps::IdentityFunctor<T>(), {1}, stream);
    }
    if (dy) {
      TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
-         dout_y, dy, kps::IdentityFunctor<T>(), {1}, stream);
+         dev_ctx, dout_y, dy, kps::IdentityFunctor<T>(), {1}, stream);
    }
#else
    auto* place = dev_ctx.eigen_device();
...
@@ -54,7 +54,7 @@ bool SortKthvalue(const platform::CUDADeviceContext& ctx,
  input_indices.mutable_data<int64_t>(ctx.GetPlace());
  size_t temp_storage_bytes = -1;
  int block_size = getBlockSize(num_cols);
-  unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x;
+  unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0];
  unsigned int grid_size = num_rows < maxGridDimX
                               ? static_cast<unsigned int>(num_rows)
                               : maxGridDimX;
...
@@ -72,6 +72,10 @@ TEST(LiteEngineOp, engine_op) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  platform::CUDAPlace place;
  platform::CUDADeviceContext ctx(place);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
#else
  platform::CPUPlace place;
  platform::CPUDeviceContext ctx(place);
...
@@ -299,7 +299,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
        ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
    T* logits_max_buff = logits_max.mutable_data<T>(place);
    TensorReduceFunctorImpl<T, T, kps::MaxFunctor, kps::IdentityFunctor<T>>(
-       softmax_2d, &logits_max, kps::IdentityFunctor<T>(), {1},
+       dev_ctx, softmax_2d, &logits_max, kps::IdentityFunctor<T>(), {1},
        dev_ctx.stream());
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
@@ -321,7 +321,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
        ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
    T* sum_exp_logits_buff = sum_exp_logits.mutable_data<T>(place);
    TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::ExpFunctor<T>>(
-       softmax_2d, &sum_exp_logits, kps::ExpFunctor<T>(), {1},
+       dev_ctx, softmax_2d, &sum_exp_logits, kps::ExpFunctor<T>(), {1},
        dev_ctx.stream());
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
...
@@ -15,6 +15,8 @@ limitations under the License. */
#include "paddle/fluid/operators/math/beam_search.h"
#include <gtest/gtest.h>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
void PrepareCPUTensors(paddle::framework::LoDTensor* ids,
                       paddle::framework::LoDTensor* scores,
@@ -129,6 +131,83 @@ void TestBeamSearch() {
  delete context;
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <>
void TestBeamSearch<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace>() {
paddle::framework::LoDTensor ids;
paddle::framework::LoDTensor scores;
paddle::framework::LoDTensor pre_ids;
paddle::framework::LoDTensor pre_scores;
auto* place = new paddle::platform::CUDAPlace();
auto* context = new paddle::platform::CUDADeviceContext(*place);
context->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*place, context->stream())
.get());
context->PartialInitWithAllocator();
if (paddle::platform::is_cpu_place(*place)) {
PrepareCPUTensors(&ids, &scores, &pre_ids, &pre_scores);
} else {
paddle::framework::LoDTensor cpu_ids;
paddle::framework::LoDTensor cpu_scores;
paddle::framework::LoDTensor cpu_pre_ids;
paddle::framework::LoDTensor cpu_pre_scores;
PrepareCPUTensors(&cpu_ids, &cpu_scores, &cpu_pre_ids, &cpu_pre_scores);
paddle::framework::TensorCopySync(cpu_ids, *place, &ids);
paddle::framework::TensorCopySync(cpu_scores, *place, &scores);
paddle::framework::TensorCopySync(cpu_pre_ids, *place, &pre_ids);
paddle::framework::TensorCopySync(cpu_pre_scores, *place, &pre_scores);
ids.set_lod(cpu_ids.lod());
scores.set_lod(cpu_scores.lod());
pre_ids.set_lod(cpu_pre_ids.lod());
pre_scores.set_lod(cpu_pre_scores.lod());
}
paddle::framework::LoDTensor selected_ids;
paddle::framework::LoDTensor selected_scores;
paddle::framework::LoDTensor parent_idx;
size_t level = 0;
size_t beam_size = 2;
int end_id = 0;
paddle::operators::math::BeamSearchFunctor<
paddle::platform::CUDADeviceContext, float>
beamsearch;
beamsearch(*context, &pre_ids, &pre_scores, &ids, &scores, &selected_ids,
&selected_scores, &parent_idx, level, beam_size, end_id, true);
ASSERT_EQ(selected_ids.lod(), selected_scores.lod());
paddle::framework::LoDTensor cpu_selected_ids;
paddle::framework::LoDTensor cpu_selected_scores;
if (paddle::platform::is_cpu_place(*place)) {
cpu_selected_ids = selected_ids;
cpu_selected_scores = selected_scores;
} else {
paddle::framework::TensorCopySync(
selected_ids, paddle::platform::CPUPlace(), &cpu_selected_ids);
paddle::framework::TensorCopySync(
selected_scores, paddle::platform::CPUPlace(), &cpu_selected_scores);
cpu_selected_ids.set_lod(selected_ids.lod());
cpu_selected_scores.set_lod(selected_scores.lod());
}
std::vector<int64_t> expected_ids({4, 5, 3, 8});
std::vector<float> expected_scores({0.6f, 0.5f, 0.9f, 0.7f});
for (int i = 0; i < 4; i++) {
ASSERT_EQ(expected_ids[i], cpu_selected_ids.data<int64_t>()[i]);
ASSERT_EQ(expected_scores[i], cpu_selected_scores.data<float>()[i]);
}
delete place;
delete context;
}
#endif
TEST(BeamSearch, CPU) {
  TestBeamSearch<paddle::platform::CPUDeviceContext,
                 paddle::platform::CPUPlace>();
...
@@ -18,6 +18,7 @@
#include "paddle/fluid/platform/dynload/cublas.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
DECLARE_bool(enable_cublas_tensor_op_math);
@@ -92,6 +93,32 @@ struct CUBlas<float> {
#endif
  }
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template <typename... ARGS>
static void GEMM_EX(pten::GPUContext *dev_ctx, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const float *alpha, const void *A, cudaDataType_t Atype,
int lda, const void *B, cudaDataType_t Btype, int ldb,
const float *beta, void *C, cudaDataType_t Ctype,
int ldc) {
// Because the gcc 4.8 doesn't expand template parameter pack that
// appears in a lambda-expression, I can not use template parameter pack
// here.
#if CUDA_VERSION >= 8000
VLOG(5) << "use_tensor_op_math: "
<< (dev_ctx->tensor_core_available() ? "True" : "False");
dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasSgemmEx(
handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb,
beta, C, Ctype, ldc));
});
#else
PADDLE_THROW(platform::errors::Unimplemented(
"cublasSgemmEx is not supported on cuda <= 7.5"));
#endif
}
  template <typename... ARGS>
  static void TRSM(ARGS... args) {
    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasStrsm(args...));
@@ -273,6 +300,37 @@ struct CUBlas<platform::float16> {
        "cublasGemmEx is not supported on cuda <= 7.5"));
#endif
  }
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template <typename... ARGS>
static void GEMM_EX(pten::GPUContext *dev_ctx, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const void *alpha, const void *A, cudaDataType_t Atype,
int lda, const void *B, cudaDataType_t Btype, int ldb,
const void *beta, void *C, cudaDataType_t Ctype, int ldc,
cudaDataType_t computeType) {
#if CUDA_VERSION >= 8000
cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
#if CUDA_VERSION >= 9000
bool use_tensor_op_math = dev_ctx->tensor_core_available();
if (use_tensor_op_math) {
algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
}
VLOG(5) << "use_tensor_op_math: "
<< (use_tensor_op_math ? "True" : "False");
#endif // CUDA_VERSION >= 9000
dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmEx(
handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb,
beta, C, Ctype, ldc, computeType, algo));
});
#else
PADDLE_THROW(platform::errors::Unimplemented(
"cublasGemmEx is not supported on cuda <= 7.5"));
#endif
}
};
template <>
@@ -388,6 +446,37 @@ struct CUBlas<platform::complex<float>> {
#endif
  }
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template <typename... ARGS>
static void GEMM_EX(pten::GPUContext *dev_ctx, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const void *alpha, const void *A, cudaDataType_t Atype,
int lda, const void *B, cudaDataType_t Btype, int ldb,
const void *beta, void *C, cudaDataType_t Ctype, int ldc,
cudaDataType_t computeType) {
#if CUDA_VERSION >= 8000
cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
#if CUDA_VERSION >= 9000
bool use_tensor_op_math = dev_ctx->tensor_core_available();
if (use_tensor_op_math) {
algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
}
VLOG(5) << "use_tensor_op_math: "
<< (use_tensor_op_math ? "True" : "False");
#endif // CUDA_VERSION >= 9000
dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmEx(
handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb,
beta, C, Ctype, ldc, computeType, algo));
});
#else
PADDLE_THROW(platform::errors::Unimplemented(
"cublasGemmEx is not supported on cuda <= 7.5"));
#endif
}
  static void TRSM_BATCH(cublasHandle_t handle, cublasSideMode_t side,
                         cublasFillMode_t uplo, cublasOperation_t transa,
                         cublasDiagType_t diag, int m, int n,
@@ -529,6 +618,37 @@ struct CUBlas<platform::complex<double>> {
        "cublasGemmEx is not supported on cuda <= 7.5"));
#endif
  }
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template <typename... ARGS>
static void GEMM_EX(pten::GPUContext *dev_ctx, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const void *alpha, const void *A, cudaDataType_t Atype,
int lda, const void *B, cudaDataType_t Btype, int ldb,
const void *beta, void *C, cudaDataType_t Ctype, int ldc,
cudaDataType_t computeType) {
#if CUDA_VERSION >= 8000
cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
#if CUDA_VERSION >= 9000
bool use_tensor_op_math = dev_ctx->tensor_core_available();
if (use_tensor_op_math) {
algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
}
VLOG(5) << "use_tensor_op_math: "
<< (use_tensor_op_math ? "True" : "False");
#endif // CUDA_VERSION >= 9000
dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmEx(
handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb,
beta, C, Ctype, ldc, computeType, algo));
});
#else
PADDLE_THROW(platform::errors::Unimplemented(
"cublasGemmEx is not supported on cuda <= 7.5"));
#endif
}
};
template <>
@@ -564,6 +684,39 @@ void Blas<platform::CUDADeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
#endif  // CUDA_VERSION >= 8000
}
template <>
template <typename T>
void Blas<pten::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M, int N, int K,
T alpha, const T *A, const T *B, T beta,
T *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
#if CUDA_VERSION >= 8000
if (FLAGS_enable_cublas_tensor_op_math && std::is_same<T, float>::value) {
auto &cuda_ctx = const_cast<pten::GPUContext &>(context_);
CUBlas<T>::GEMM_EX(&cuda_ctx, cuTransB, cuTransA, N, M, K, &alpha, B,
CUDA_R_32F, ldb, A, CUDA_R_32F, lda, &beta, C,
CUDA_R_32F, N);
} else {
#endif // CUDA_VERSION >= 8000
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A,
lda, &beta, C, N);
});
#if CUDA_VERSION >= 8000
}
#endif // CUDA_VERSION >= 8000
}
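With the Blas<pten::GPUContext> specializations added above, pten kernels can request a BLAS wrapper directly from a pten::GPUContext. A hedged usage sketch follows; it assumes the existing GetBlas helper is templated on the device-context type, mirroring the CPUDeviceContext calls earlier in this diff, and the wrapper name is hypothetical:

// Hypothetical sketch: row-major GEMM C = A * B on a pten::GPUContext.
template <typename T>
void GemmOnGpuContext(const pten::GPUContext& ctx, int M, int N, int K,
                      const T* A, const T* B, T* C) {
  auto blas = paddle::operators::math::GetBlas<pten::GPUContext, T>(ctx);
  blas.GEMM(CblasNoTrans, CblasNoTrans, M, N, K, static_cast<T>(1), A, B,
            static_cast<T>(0), C);
}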
template <>
template <>
inline void Blas<platform::CUDADeviceContext>::GEMM(
@@ -611,6 +764,55 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
#endif  // CUDA_VERSION >= 8000
}
template <>
template <>
inline void Blas<pten::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M, int N,
int K, platform::float16 alpha,
const platform::float16 *A,
const platform::float16 *B,
platform::float16 beta,
platform::float16 *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(
context_.GetComputeCapability(), 53,
platform::errors::InvalidArgument(
"cublas fp16 gemm requires GPU compute capability >= 53,"
"but received %d",
context_.GetComputeCapability()));
float h_alpha = static_cast<float>(alpha);
float h_beta = static_cast<float>(beta);
#if CUDA_VERSION >= 8000
// cublasHgemm does true FP16 computation which is slow for non-Volta
// GPUs. So use cublasGemmEx instead which does pesudo FP16 computation:
// input/output in fp16, computation in fp32, which can also be accelerated
// using tensor cores in volta GPUs.
auto &cuda_ctx = const_cast<pten::GPUContext &>(context_);
CUBlas<platform::float16>::GEMM_EX(
&cuda_ctx, cuTransB, cuTransA, N, M, K, &h_alpha, B, CUDA_R_16F, ldb, A,
CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N, CUDA_R_32F);
#else
// CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<platform::float16>::GEMM(handle, cuTransB, cuTransA, N, M, K,
&h_alpha, h_B, ldb, h_A, lda, &h_beta, h_C,
N);
});
#endif // CUDA_VERSION >= 8000
}
template <>
template <>
inline void Blas<platform::CUDADeviceContext>::GEMM(
@@ -659,6 +861,56 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
#endif  // CUDA_VERSION >= 8000
}
template <>
template <>
inline void Blas<pten::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M, int N,
int K, platform::complex<float> alpha,
const platform::complex<float> *A,
const platform::complex<float> *B,
platform::complex<float> beta,
platform::complex<float> *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(
context_.GetComputeCapability(), 53,
platform::errors::InvalidArgument(
"cublas complex64 gemm requires GPU compute capability >= 53,"
"but received %d",
context_.GetComputeCapability()));
thrust::complex<float> c_alpha =
thrust::complex<float>(alpha.real, alpha.imag);
thrust::complex<float> c_beta = thrust::complex<float>(beta.real, beta.imag);
#if CUDA_VERSION >= 8000
// cublasHgemm does true FP16 computation which is slow for non-Volta
// GPUs. So use cublasGemmEx instead which does pesudo FP16 computation:
// input/output in fp16, computation in fp32, which can also be accelerated
// using tensor cores in volta GPUs.
auto &cuda_ctx = const_cast<pten::GPUContext &>(context_);
CUBlas<platform::complex<float>>::GEMM_EX(
&cuda_ctx, cuTransB, cuTransA, N, M, K, &c_alpha, B, CUDA_C_32F, ldb, A,
CUDA_C_32F, lda, &c_beta, C, CUDA_C_32F, N, CUDA_C_32F);
#else
// CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<platform::complex<float>>::GEMM(handle, cuTransB, cuTransA, N, M, K,
&c_alpha, h_B, ldb, h_A, lda,
&c_beta, h_C, N);
});
#endif // CUDA_VERSION >= 8000
}
template <>
template <>
inline void Blas<platform::CUDADeviceContext>::GEMM(
@@ -708,6 +960,57 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
#endif  // CUDA_VERSION >= 8000
}
template <>
template <>
inline void Blas<pten::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M, int N,
int K, platform::complex<double> alpha,
const platform::complex<double> *A,
const platform::complex<double> *B,
platform::complex<double> beta,
platform::complex<double> *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(
context_.GetComputeCapability(), 53,
platform::errors::InvalidArgument(
"cublas complex128 gemm requires GPU compute capability >= 53,"
"but received %d",
context_.GetComputeCapability()));
thrust::complex<double> c_alpha =
thrust::complex<double>(alpha.real, alpha.imag);
thrust::complex<double> c_beta =
thrust::complex<double>(beta.real, beta.imag);
#if CUDA_VERSION >= 8000
  // Use cublasGemmEx (available since CUDA 8.0) for the complex128 GEMM so the
  // call is routed through the tensor-op-aware cuBLAS path when the hardware
  // supports it.
auto &cuda_ctx = const_cast<pten::GPUContext &>(context_);
CUBlas<platform::complex<double>>::GEMM_EX(
&cuda_ctx, cuTransB, cuTransA, N, M, K, &c_alpha, B, CUDA_C_64F, ldb, A,
CUDA_C_64F, lda, &c_beta, C, CUDA_C_64F, N, CUDA_C_64F);
#else
  // CUDA 7.5 does not support cublasGemmEx, hence we fall back to cublasZgemm.
  context_.CublasCall([&](cublasHandle_t handle) {
    CUBlas<platform::complex<double>>::GEMM(handle, cuTransB, cuTransA, N, M,
                                            K, &alpha, B, ldb, A, lda, &beta,
                                            C, N);
});
#endif // CUDA_VERSION >= 8000
}
template <> template <>
template <typename T> template <typename T>
void Blas<platform::CUDADeviceContext>::GEMM(bool transA, bool transB, int M, void Blas<platform::CUDADeviceContext>::GEMM(bool transA, bool transB, int M,
...@@ -738,6 +1041,35 @@ void Blas<platform::CUDADeviceContext>::GEMM(bool transA, bool transB, int M, ...@@ -738,6 +1041,35 @@ void Blas<platform::CUDADeviceContext>::GEMM(bool transA, bool transB, int M,
#endif // CUDA_VERSION >= 8000 #endif // CUDA_VERSION >= 8000
} }
template <>
template <typename T>
void Blas<pten::GPUContext>::GEMM(bool transA, bool transB, int M, int N, int K,
T alpha, const T *A, int lda, const T *B,
int ldb, T beta, T *C, int ldc) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
#if CUDA_VERSION >= 8000
if (FLAGS_enable_cublas_tensor_op_math && std::is_same<T, float>::value) {
auto &cuda_ctx = const_cast<pten::GPUContext &>(context_);
CUBlas<T>::GEMM_EX(&cuda_ctx, cuTransB, cuTransA, N, M, K, &alpha, B,
CUDA_R_32F, ldb, A, CUDA_R_32F, lda, &beta, C,
CUDA_R_32F, ldc);
} else {
#endif // CUDA_VERSION >= 8000
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A,
lda, &beta, C, ldc);
});
#if CUDA_VERSION >= 8000
}
#endif // CUDA_VERSION >= 8000
}
template <> template <>
template <> template <>
inline void Blas<platform::CUDADeviceContext>::GEMM( inline void Blas<platform::CUDADeviceContext>::GEMM(
...@@ -755,6 +1087,25 @@ inline void Blas<platform::CUDADeviceContext>::GEMM( ...@@ -755,6 +1087,25 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
}); });
} }
template <>
template <>
inline void Blas<pten::GPUContext>::GEMM(bool transA, bool transB, int M, int N,
int K, platform::float16 alpha,
const platform::float16 *A, int lda,
const platform::float16 *B, int ldb,
platform::float16 beta,
platform::float16 *C, int ldc) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<platform::float16>::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha,
B, ldb, A, lda, &beta, C, ldc);
});
}
template <> template <>
template <typename T> template <typename T>
void Blas<platform::CUDADeviceContext>::AXPY(int n, T alpha, const T *x, void Blas<platform::CUDADeviceContext>::AXPY(int n, T alpha, const T *x,
...@@ -764,6 +1115,14 @@ void Blas<platform::CUDADeviceContext>::AXPY(int n, T alpha, const T *x, ...@@ -764,6 +1115,14 @@ void Blas<platform::CUDADeviceContext>::AXPY(int n, T alpha, const T *x,
}); });
} }
template <>
template <typename T>
void Blas<pten::GPUContext>::AXPY(int n, T alpha, const T *x, T *y) const {
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::AXPY(handle, n, &alpha, x, 1, y, 1);
});
}
template <> template <>
template <typename T> template <typename T>
void Blas<platform::CUDADeviceContext>::SCAL(int n, const T alpha, T *x) const { void Blas<platform::CUDADeviceContext>::SCAL(int n, const T alpha, T *x) const {
...@@ -771,6 +1130,13 @@ void Blas<platform::CUDADeviceContext>::SCAL(int n, const T alpha, T *x) const { ...@@ -771,6 +1130,13 @@ void Blas<platform::CUDADeviceContext>::SCAL(int n, const T alpha, T *x) const {
[&](cublasHandle_t handle) { CUBlas<T>::SCAL(handle, n, &alpha, x, 1); }); [&](cublasHandle_t handle) { CUBlas<T>::SCAL(handle, n, &alpha, x, 1); });
} }
template <>
template <typename T>
void Blas<pten::GPUContext>::SCAL(int n, const T alpha, T *x) const {
context_.CublasCall(
[&](cublasHandle_t handle) { CUBlas<T>::SCAL(handle, n, &alpha, x, 1); });
}
template <> template <>
template <typename T> template <typename T>
void Blas<platform::CUDADeviceContext>::VCOPY(int n, const T *x, T *y) const { void Blas<platform::CUDADeviceContext>::VCOPY(int n, const T *x, T *y) const {
...@@ -778,6 +1144,13 @@ void Blas<platform::CUDADeviceContext>::VCOPY(int n, const T *x, T *y) const { ...@@ -778,6 +1144,13 @@ void Blas<platform::CUDADeviceContext>::VCOPY(int n, const T *x, T *y) const {
[&](cublasHandle_t handle) { CUBlas<T>::VCOPY(handle, n, x, 1, y, 1); }); [&](cublasHandle_t handle) { CUBlas<T>::VCOPY(handle, n, x, 1, y, 1); });
} }
template <>
template <typename T>
void Blas<pten::GPUContext>::VCOPY(int n, const T *x, T *y) const {
context_.CublasCall(
[&](cublasHandle_t handle) { CUBlas<T>::VCOPY(handle, n, x, 1, y, 1); });
}
template <> template <>
template <typename T> template <typename T>
void Blas<platform::CUDADeviceContext>::GEMV(bool trans_a, int M, int N, void Blas<platform::CUDADeviceContext>::GEMV(bool trans_a, int M, int N,
...@@ -790,6 +1163,17 @@ void Blas<platform::CUDADeviceContext>::GEMV(bool trans_a, int M, int N, ...@@ -790,6 +1163,17 @@ void Blas<platform::CUDADeviceContext>::GEMV(bool trans_a, int M, int N,
}); });
} }
template <>
template <typename T>
void Blas<pten::GPUContext>::GEMV(bool trans_a, int M, int N, T alpha,
const T *A, const T *B, T beta, T *C) const {
cublasOperation_t cuTransA = !trans_a ? CUBLAS_OP_T : CUBLAS_OP_N;
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1);
});
}
template <> template <>
template <> template <>
inline void Blas<platform::CUDADeviceContext>::GEMV( inline void Blas<platform::CUDADeviceContext>::GEMV(
...@@ -806,6 +1190,24 @@ inline void Blas<platform::CUDADeviceContext>::GEMV( ...@@ -806,6 +1190,24 @@ inline void Blas<platform::CUDADeviceContext>::GEMV(
} }
} }
template <>
template <>
inline void Blas<pten::GPUContext>::GEMV(bool trans_a, int M, int N,
platform::float16 alpha,
const platform::float16 *A,
const platform::float16 *B,
platform::float16 beta,
platform::float16 *C) const {
  // cuBLAS does not provide a half-precision GEMV, so we emulate it with a
  // GEMM (cublasHgemm) call.
if (trans_a) {
this->template GEMM<platform::float16>(CblasNoTrans, CblasNoTrans, 1, N, M,
alpha, B, A, beta, C);
} else {
this->template GEMM<platform::float16>(CblasNoTrans, CblasNoTrans, M, 1, N,
alpha, A, B, beta, C);
}
}
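The GEMM fallback above is shape-correct because a matrix-vector product y = A x with A of size M x N is the same computation as a matrix-matrix product in which x is read as an N x 1 matrix and y as an M x 1 matrix; the transposed case maps to a 1 x N times N x M product. A hedged usage sketch (illustrative only, not part of this diff; dev_ctx and the half-precision device buffers are assumed to exist):
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/float16.h"

// Sketch: out(M) = mat(M x N) * vec(N) in fp16; the wrapper lowers this to a
// GEMM of shape (M, 1, N) because cuBLAS offers no half-precision GEMV.
void ExampleHalfGemv(const pten::GPUContext &dev_ctx,
                     const paddle::platform::float16 *d_mat,
                     const paddle::platform::float16 *d_vec,
                     paddle::platform::float16 *d_out, int M, int N) {
  using float16 = paddle::platform::float16;
  auto blas =
      paddle::operators::math::GetBlas<pten::GPUContext, float16>(dev_ctx);
  blas.GEMV(false, M, N, static_cast<float16>(1.0f), d_mat, d_vec,
            static_cast<float16>(0.0f), d_out);
}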
template <> template <>
template <typename T> template <typename T>
void Blas<platform::CUDADeviceContext>::BatchedGEMM( void Blas<platform::CUDADeviceContext>::BatchedGEMM(
...@@ -854,6 +1256,56 @@ void Blas<platform::CUDADeviceContext>::BatchedGEMM( ...@@ -854,6 +1256,56 @@ void Blas<platform::CUDADeviceContext>::BatchedGEMM(
#endif // CUDA_VERSION >= 9010 #endif // CUDA_VERSION >= 9010
} }
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M, int N,
int K, T alpha, const T *A, const T *B,
T beta, T *C, int batchCount,
int64_t strideA,
int64_t strideB) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
int ldc = N;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
const int64_t strideC = M * N;
#if CUDA_VERSION >= 9010
if ((FLAGS_enable_cublas_tensor_op_math && (std::is_same<T, float>::value)) ||
std::is_same<T, paddle::platform::float16>::value) {
cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
bool use_tensor_op_math = context_.tensor_core_available();
if (use_tensor_op_math) {
algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
}
VLOG(5) << "use_tensor_op_math: "
<< (use_tensor_op_math ? "True" : "False");
auto fp = std::is_same<T, float>::value ? CUDA_R_32F : CUDA_R_16F;
context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmStridedBatchedEx(
handle, cuTransB, cuTransA, N, M, K, &alpha, B, fp, ldb, strideB, A,
fp, lda, strideA, &beta, C, fp, ldc, strideC, batchCount, fp, algo));
});
} else {
#endif // CUDA_VERSION >= 9010
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GEMM_STRIDED_BATCH(handle, cuTransB, cuTransA, N, M, K, &alpha,
B, ldb, strideB, A, lda, strideA, &beta, C,
ldc, strideC, batchCount);
});
#if CUDA_VERSION >= 9010
}
#endif // CUDA_VERSION >= 9010
}
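For callers, the only extra bookkeeping compared with a plain GEMM is the pair of input strides; the output stride M * N is computed inside the wrapper, as shown above. A short sketch of the common contiguous layout (illustrative only, not part of this diff; dev_ctx and the device buffers are assumed to be prepared elsewhere):
#include "paddle/fluid/operators/math/blas.h"

// Sketch: batched C[b] = A[b] * B[b] with A laid out as [batch, M, K] and
// B as [batch, K, N] contiguously in device memory.
void ExampleStridedBatchedGemm(const pten::GPUContext &dev_ctx,
                               const float *d_a, const float *d_b, float *d_c,
                               int batch, int M, int N, int K) {
  auto blas =
      paddle::operators::math::GetBlas<pten::GPUContext, float>(dev_ctx);
  const int64_t stride_a = static_cast<int64_t>(M) * K;  // one A matrix
  const int64_t stride_b = static_cast<int64_t>(K) * N;  // one B matrix
  blas.BatchedGEMM(CblasNoTrans, CblasNoTrans, M, N, K, 1.0f, d_a, d_b, 0.0f,
                   d_c, batch, stride_a, stride_b);
}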
template <> template <>
template <typename T> template <typename T>
void Blas<platform::CUDADeviceContext>::BatchedGEMM( void Blas<platform::CUDADeviceContext>::BatchedGEMM(
...@@ -865,6 +1317,19 @@ void Blas<platform::CUDADeviceContext>::BatchedGEMM( ...@@ -865,6 +1317,19 @@ void Blas<platform::CUDADeviceContext>::BatchedGEMM(
} }
} }
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M, int N,
int K, T alpha, const T **A,
const T **B, T beta, T **C,
int batchCount) const {
for (int k = 0; k < batchCount; ++k) {
this->template GEMM<T>(transA, transB, M, N, K, alpha, A[k], B[k], beta,
C[k]);
}
}
template <> template <>
template <> template <>
inline void Blas<platform::CUDADeviceContext>::BatchedGEMM( inline void Blas<platform::CUDADeviceContext>::BatchedGEMM(
...@@ -878,6 +1343,19 @@ inline void Blas<platform::CUDADeviceContext>::BatchedGEMM( ...@@ -878,6 +1343,19 @@ inline void Blas<platform::CUDADeviceContext>::BatchedGEMM(
} }
} }
template <>
template <>
inline void Blas<pten::GPUContext>::BatchedGEMM(
CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
platform::float16 alpha, const platform::float16 **A,
const platform::float16 **B, platform::float16 beta, platform::float16 **C,
int batchCount) const {
for (int k = 0; k < batchCount; ++k) {
this->template GEMM<platform::float16>(transA, transB, M, N, K, alpha, A[k],
B[k], beta, C[k]);
}
}
template <> template <>
template <typename T> template <typename T>
void Blas<platform::CUDADeviceContext>::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo, void Blas<platform::CUDADeviceContext>::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo,
...@@ -903,6 +1381,30 @@ void Blas<platform::CUDADeviceContext>::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo, ...@@ -903,6 +1381,30 @@ void Blas<platform::CUDADeviceContext>::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo,
}); });
} }
template <>
template <typename T>
void Blas<pten::GPUContext>::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo,
CBLAS_TRANSPOSE transA, CBLAS_DIAG diag,
int M, int N, T alpha, const T *A, int lda,
T *B, int ldb) const {
// solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'`
// where ' stands for transpose
cublasSideMode_t cuSide =
(side == CblasLeft) ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT;
cublasFillMode_t cuUplo =
(uplo == CblasLower) ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER;
// use CUBLAS_OP_C (conjugate transpose) for complex
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasDiagType_t cuDiag =
(diag == CblasUnit) ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT;
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::TRSM(handle, cuSide, cuUplo, cuTransA, cuDiag, N, M, &alpha, A,
lda, B, ldb);
});
}
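The side/uplo flipping above comes from the same transpose identity as in GEMM: the row-major system op(A) X = alpha B is the column-major system X^T op(A)^T = alpha B^T, which turns a left-sided solve with a lower-triangular A into a right-sided solve with what cuBLAS sees as an upper-triangular matrix. A hedged usage sketch (illustrative only, not part of this diff; dev_ctx and the device buffers are assumed to exist):
#include "paddle/fluid/operators/math/blas.h"

// Sketch: solve A * X = B for X, with A an M x M lower-triangular row-major
// matrix and B an M x N row-major right-hand side (overwritten with X).
void ExampleTrsm(const pten::GPUContext &dev_ctx, const float *d_a, float *d_b,
                 int M, int N) {
  auto blas =
      paddle::operators::math::GetBlas<pten::GPUContext, float>(dev_ctx);
  blas.TRSM(CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, M, N, 1.0f, d_a,
            /*lda=*/M, d_b, /*ldb=*/N);
}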
template <> template <>
template <typename T> template <typename T>
void Blas<platform::CUDADeviceContext>::BatchedGETRF(int n, T **a, int *ipiv, void Blas<platform::CUDADeviceContext>::BatchedGETRF(int n, T **a, int *ipiv,
...@@ -913,6 +1415,15 @@ void Blas<platform::CUDADeviceContext>::BatchedGETRF(int n, T **a, int *ipiv, ...@@ -913,6 +1415,15 @@ void Blas<platform::CUDADeviceContext>::BatchedGETRF(int n, T **a, int *ipiv,
}); });
} }
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedGETRF(int n, T **a, int *ipiv, int *info,
int batch_size) const {
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size);
});
}
template <> template <>
template <typename T> template <typename T>
void Blas<platform::CUDADeviceContext>::BatchedGETRI(int n, const T **a, void Blas<platform::CUDADeviceContext>::BatchedGETRI(int n, const T **a,
...@@ -931,6 +1442,23 @@ void Blas<platform::CUDADeviceContext>::BatchedGETRI(int n, const T **a, ...@@ -931,6 +1442,23 @@ void Blas<platform::CUDADeviceContext>::BatchedGETRI(int n, const T **a,
}); });
} }
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedGETRI(int n, const T **a, const int *ipiv,
T **a_inv, int *info,
int batch_size) const {
PADDLE_ENFORCE_NE(
a_inv, a,
platform::errors::InvalidArgument(
"cuBLAS fuction 'cublas<S/D>getrfBatched' cannot be executed "
"in-place. The memory space of output matrix (address: %p) cannot "
"overlap memory space of input matrix (address: %p).",
a_inv, a));
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GETRI_BATCH(handle, n, a, n, ipiv, a_inv, n, info, batch_size);
});
}
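A typical way to satisfy the in-place restriction above is to run the batched LU factorization first and then write the inverses into a separate pointer array. A hedged sketch (illustrative only, not part of this diff; the device pointer arrays and the ipiv/info buffers are assumed to be allocated elsewhere):
#include "paddle/fluid/operators/math/blas.h"

// Sketch: invert `batch` n x n matrices via GETRF followed by GETRI into a
// distinct output buffer, as the check above requires.
void ExampleBatchedInverse(const pten::GPUContext &dev_ctx, float **d_a,
                           float **d_a_inv, int *d_ipiv, int *d_info, int n,
                           int batch) {
  auto blas =
      paddle::operators::math::GetBlas<pten::GPUContext, float>(dev_ctx);
  blas.BatchedGETRF(n, d_a, d_ipiv, d_info, batch);
  blas.BatchedGETRI(n, const_cast<const float **>(d_a), d_ipiv, d_a_inv,
                    d_info, batch);
}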
template <> template <>
template <typename T> template <typename T>
void Blas<platform::CUDADeviceContext>::BatchedMatInv(int n, const T **a, void Blas<platform::CUDADeviceContext>::BatchedMatInv(int n, const T **a,
...@@ -941,6 +1469,15 @@ void Blas<platform::CUDADeviceContext>::BatchedMatInv(int n, const T **a, ...@@ -941,6 +1469,15 @@ void Blas<platform::CUDADeviceContext>::BatchedMatInv(int n, const T **a,
}); });
} }
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedMatInv(int n, const T **a, T **a_inv,
int *info, int batch_size) const {
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size);
});
}
template <> template <>
template <typename T> template <typename T>
void Blas<platform::CUDADeviceContext>::BatchedGETRS( void Blas<platform::CUDADeviceContext>::BatchedGETRS(
...@@ -955,6 +1492,21 @@ void Blas<platform::CUDADeviceContext>::BatchedGETRS( ...@@ -955,6 +1492,21 @@ void Blas<platform::CUDADeviceContext>::BatchedGETRS(
}); });
} }
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedGETRS(CBLAS_TRANSPOSE trans, int n,
int nrhs, const T **a, int lda,
int *ipiv, T **b, int ldb, int *info,
int batch_size) const {
// use CUBLAS_OP_C (conjugate transpose) for complex
cublasOperation_t cuTrans =
(trans == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GETRS_BATCH(handle, cuTrans, n, nrhs, a, lda, ipiv, b, ldb, info,
batch_size);
});
}
template <> template <>
template <typename T> template <typename T>
void Blas<platform::CUDADeviceContext>::BatchedTRSM( void Blas<platform::CUDADeviceContext>::BatchedTRSM(
...@@ -979,6 +1531,31 @@ void Blas<platform::CUDADeviceContext>::BatchedTRSM( ...@@ -979,6 +1531,31 @@ void Blas<platform::CUDADeviceContext>::BatchedTRSM(
}); });
} }
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedTRSM(CBLAS_SIDE side, CBLAS_UPLO uplo,
CBLAS_TRANSPOSE transA,
CBLAS_DIAG diag, int M, int N, T alpha,
const T **A, int lda, T **B, int ldb,
int batch_size) const {
// solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'`
// where ' stands for transpose
cublasSideMode_t cuSide =
(side == CblasLeft) ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT;
cublasFillMode_t cuUplo =
(uplo == CblasLower) ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER;
// use CUBLAS_OP_C (conjugate transpose) for complex
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasDiagType_t cuDiag =
(diag == CblasUnit) ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT;
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::TRSM_BATCH(handle, cuSide, cuUplo, cuTransA, cuDiag, N, M,
&alpha, A, lda, B, ldb, batch_size);
});
}
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/dynload/rocblas.h" #include "paddle/fluid/platform/dynload/rocblas.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
DECLARE_bool(enable_cublas_tensor_op_math); DECLARE_bool(enable_cublas_tensor_op_math);
...@@ -221,6 +222,20 @@ struct CUBlas<platform::float16> { ...@@ -221,6 +222,20 @@ struct CUBlas<platform::float16> {
beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0)); beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0));
}); });
} }
template <typename... ARGS>
static void GEMM_EX(pten::GPUContext *dev_ctx, rocblas_operation transa,
rocblas_operation transb, int m, int n, int k,
const void *alpha, const void *A, rocblas_datatype Atype,
int lda, const void *B, rocblas_datatype Btype, int ldb,
const void *beta, void *C, rocblas_datatype Ctype,
int ldc, rocblas_datatype computeType) {
rocblas_gemm_algo algo = rocblas_gemm_algo_standard;
dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_gemm_ex(
handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb,
beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0));
});
}
}; };
template <> template <>
...@@ -305,6 +320,20 @@ struct CUBlas<platform::complex<float>> { ...@@ -305,6 +320,20 @@ struct CUBlas<platform::complex<float>> {
beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0)); beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0));
}); });
} }
template <typename... ARGS>
static void GEMM_EX(pten::GPUContext *dev_ctx, rocblas_operation transa,
rocblas_operation transb, int m, int n, int k,
const void *alpha, const void *A, rocblas_datatype Atype,
int lda, const void *B, rocblas_datatype Btype, int ldb,
const void *beta, void *C, rocblas_datatype Ctype,
int ldc, rocblas_datatype computeType) {
rocblas_gemm_algo algo = rocblas_gemm_algo_standard;
dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_gemm_ex(
handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb,
beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0));
});
}
}; };
template <> template <>
...@@ -389,6 +418,20 @@ struct CUBlas<platform::complex<double>> { ...@@ -389,6 +418,20 @@ struct CUBlas<platform::complex<double>> {
beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0)); beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0));
}); });
} }
template <typename... ARGS>
static void GEMM_EX(pten::GPUContext *dev_ctx, rocblas_operation transa,
rocblas_operation transb, int m, int n, int k,
const void *alpha, const void *A, rocblas_datatype Atype,
int lda, const void *B, rocblas_datatype Btype, int ldb,
const void *beta, void *C, rocblas_datatype Ctype,
int ldc, rocblas_datatype computeType) {
rocblas_gemm_algo algo = rocblas_gemm_algo_standard;
dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_gemm_ex(
handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb,
beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0));
});
}
}; };
template <> template <>
...@@ -412,6 +455,27 @@ void Blas<platform::CUDADeviceContext>::GEMM(CBLAS_TRANSPOSE transA, ...@@ -412,6 +455,27 @@ void Blas<platform::CUDADeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
&beta, C, N); &beta, C, N);
}); });
} }
template <>
template <typename T>
void Blas<pten::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M, int N, int K,
T alpha, const T *A, const T *B, T beta,
T *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
rocblas_operation cuTransA = (transA == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
rocblas_operation cuTransB = (transB == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda,
&beta, C, N);
});
}
template <> template <>
template <> template <>
...@@ -448,6 +512,43 @@ inline void Blas<platform::CUDADeviceContext>::GEMM( ...@@ -448,6 +512,43 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
rocblas_datatype_f16_r, ldb, A, rocblas_datatype_f16_r, lda, &h_beta, C, rocblas_datatype_f16_r, ldb, A, rocblas_datatype_f16_r, lda, &h_beta, C,
rocblas_datatype_f16_r, N, rocblas_datatype_f32_r); rocblas_datatype_f16_r, N, rocblas_datatype_f32_r);
} }
template <>
template <>
inline void Blas<pten::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M, int N,
int K, platform::float16 alpha,
const platform::float16 *A,
const platform::float16 *B,
platform::float16 beta,
platform::float16 *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
rocblas_operation cuTransA = (transA == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
rocblas_operation cuTransB = (transB == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(
context_.GetComputeCapability(), 53,
platform::errors::InvalidArgument(
"cublas fp16 gemm requires GPU compute capability >= 53,"
"but received %d",
context_.GetComputeCapability()));
float h_alpha = static_cast<float>(alpha);
float h_beta = static_cast<float>(beta);
auto &cuda_ctx = const_cast<pten::GPUContext &>(context_);
CUBlas<platform::float16>::GEMM_EX(
&cuda_ctx, cuTransB, cuTransA, N, M, K, &h_alpha, B,
rocblas_datatype_f16_r, ldb, A, rocblas_datatype_f16_r, lda, &h_beta, C,
rocblas_datatype_f16_r, N, rocblas_datatype_f32_r);
}
template <> template <>
template <> template <>
...@@ -485,6 +586,44 @@ inline void Blas<platform::CUDADeviceContext>::GEMM( ...@@ -485,6 +586,44 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
rocblas_datatype_f32_c, ldb, A, rocblas_datatype_f32_c, lda, &c_beta, C, rocblas_datatype_f32_c, ldb, A, rocblas_datatype_f32_c, lda, &c_beta, C,
rocblas_datatype_f32_c, N, rocblas_datatype_f32_c); rocblas_datatype_f32_c, N, rocblas_datatype_f32_c);
} }
template <>
template <>
inline void Blas<pten::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M, int N,
int K, platform::complex<float> alpha,
const platform::complex<float> *A,
const platform::complex<float> *B,
platform::complex<float> beta,
platform::complex<float> *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
rocblas_operation cuTransA = (transA == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
rocblas_operation cuTransB = (transB == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(
context_.GetComputeCapability(), 53,
platform::errors::InvalidArgument(
"cublas complex64 gemm requires GPU compute capability >= 53,"
"but received %d",
context_.GetComputeCapability()));
thrust::complex<float> c_alpha =
thrust::complex<float>(alpha.real, alpha.imag);
thrust::complex<float> c_beta = thrust::complex<float>(beta.real, beta.imag);
auto &cuda_ctx = const_cast<pten::GPUContext &>(context_);
CUBlas<platform::complex<float>>::GEMM_EX(
&cuda_ctx, cuTransB, cuTransA, N, M, K, &c_alpha, B,
rocblas_datatype_f32_c, ldb, A, rocblas_datatype_f32_c, lda, &c_beta, C,
rocblas_datatype_f32_c, N, rocblas_datatype_f32_c);
}
template <> template <>
template <> template <>
...@@ -523,6 +662,45 @@ inline void Blas<platform::CUDADeviceContext>::GEMM( ...@@ -523,6 +662,45 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
rocblas_datatype_f64_c, ldb, A, rocblas_datatype_f64_c, lda, &c_beta, C, rocblas_datatype_f64_c, ldb, A, rocblas_datatype_f64_c, lda, &c_beta, C,
rocblas_datatype_f64_c, N, rocblas_datatype_f64_c); rocblas_datatype_f64_c, N, rocblas_datatype_f64_c);
} }
template <>
template <>
inline void Blas<pten::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M, int N,
int K, platform::complex<double> alpha,
const platform::complex<double> *A,
const platform::complex<double> *B,
platform::complex<double> beta,
platform::complex<double> *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
rocblas_operation cuTransA = (transA == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
rocblas_operation cuTransB = (transB == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(
context_.GetComputeCapability(), 53,
platform::errors::InvalidArgument(
"cublas complex128 gemm requires GPU compute capability >= 53,"
"but received %d",
context_.GetComputeCapability()));
thrust::complex<double> c_alpha =
thrust::complex<double>(alpha.real, alpha.imag);
thrust::complex<double> c_beta =
thrust::complex<double>(beta.real, beta.imag);
auto &cuda_ctx = const_cast<pten::GPUContext &>(context_);
CUBlas<platform::complex<double>>::GEMM_EX(
&cuda_ctx, cuTransB, cuTransA, N, M, K, &c_alpha, B,
rocblas_datatype_f64_c, ldb, A, rocblas_datatype_f64_c, lda, &c_beta, C,
rocblas_datatype_f64_c, N, rocblas_datatype_f64_c);
}
template <> template <>
template <typename T> template <typename T>
...@@ -541,6 +719,22 @@ void Blas<platform::CUDADeviceContext>::GEMM(bool transA, bool transB, int M, ...@@ -541,6 +719,22 @@ void Blas<platform::CUDADeviceContext>::GEMM(bool transA, bool transB, int M,
&beta, C, ldc); &beta, C, ldc);
}); });
} }
template <>
template <typename T>
void Blas<pten::GPUContext>::GEMM(bool transA, bool transB, int M, int N, int K,
T alpha, const T *A, int lda, const T *B,
int ldb, T beta, T *C, int ldc) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
rocblas_operation cuTransA =
transA ? rocblas_operation_transpose : rocblas_operation_none;
rocblas_operation cuTransB =
transB ? rocblas_operation_transpose : rocblas_operation_none;
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda,
&beta, C, ldc);
});
}
template <> template <>
template <> template <>
...@@ -560,6 +754,26 @@ inline void Blas<platform::CUDADeviceContext>::GEMM( ...@@ -560,6 +754,26 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
B, ldb, A, lda, &beta, C, ldc); B, ldb, A, lda, &beta, C, ldc);
}); });
} }
template <>
template <>
inline void Blas<pten::GPUContext>::GEMM(bool transA, bool transB, int M, int N,
int K, platform::float16 alpha,
const platform::float16 *A, int lda,
const platform::float16 *B, int ldb,
platform::float16 beta,
platform::float16 *C, int ldc) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
rocblas_operation cuTransA =
transA ? rocblas_operation_transpose : rocblas_operation_none;
rocblas_operation cuTransB =
transB ? rocblas_operation_transpose : rocblas_operation_none;
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<platform::float16>::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha,
B, ldb, A, lda, &beta, C, ldc);
});
}
template <> template <>
template <typename T> template <typename T>
...@@ -569,6 +783,13 @@ void Blas<platform::CUDADeviceContext>::AXPY(int n, T alpha, const T *x, ...@@ -569,6 +783,13 @@ void Blas<platform::CUDADeviceContext>::AXPY(int n, T alpha, const T *x,
CUBlas<T>::AXPY(handle, n, &alpha, x, 1, y, 1); CUBlas<T>::AXPY(handle, n, &alpha, x, 1, y, 1);
}); });
} }
template <>
template <typename T>
void Blas<pten::GPUContext>::AXPY(int n, T alpha, const T *x, T *y) const {
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::AXPY(handle, n, &alpha, x, 1, y, 1);
});
}
template <> template <>
template <typename T> template <typename T>
...@@ -576,6 +797,12 @@ void Blas<platform::CUDADeviceContext>::SCAL(int n, const T alpha, T *x) const { ...@@ -576,6 +797,12 @@ void Blas<platform::CUDADeviceContext>::SCAL(int n, const T alpha, T *x) const {
context_.CublasCall( context_.CublasCall(
[&](rocblas_handle handle) { CUBlas<T>::SCAL(handle, n, &alpha, x, 1); }); [&](rocblas_handle handle) { CUBlas<T>::SCAL(handle, n, &alpha, x, 1); });
} }
template <>
template <typename T>
void Blas<pten::GPUContext>::SCAL(int n, const T alpha, T *x) const {
context_.CublasCall(
[&](rocblas_handle handle) { CUBlas<T>::SCAL(handle, n, &alpha, x, 1); });
}
template <> template <>
template <typename T> template <typename T>
...@@ -583,6 +810,12 @@ void Blas<platform::CUDADeviceContext>::VCOPY(int n, const T *x, T *y) const { ...@@ -583,6 +810,12 @@ void Blas<platform::CUDADeviceContext>::VCOPY(int n, const T *x, T *y) const {
context_.CublasCall( context_.CublasCall(
[&](rocblas_handle handle) { CUBlas<T>::VCOPY(handle, n, x, 1, y, 1); }); [&](rocblas_handle handle) { CUBlas<T>::VCOPY(handle, n, x, 1, y, 1); });
} }
template <>
template <typename T>
void Blas<pten::GPUContext>::VCOPY(int n, const T *x, T *y) const {
context_.CublasCall(
[&](rocblas_handle handle) { CUBlas<T>::VCOPY(handle, n, x, 1, y, 1); });
}
template <> template <>
template <typename T> template <typename T>
...@@ -596,6 +829,17 @@ void Blas<platform::CUDADeviceContext>::GEMV(bool trans_a, int M, int N, ...@@ -596,6 +829,17 @@ void Blas<platform::CUDADeviceContext>::GEMV(bool trans_a, int M, int N,
CUBlas<T>::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1); CUBlas<T>::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1);
}); });
} }
template <>
template <typename T>
void Blas<pten::GPUContext>::GEMV(bool trans_a, int M, int N, T alpha,
const T *A, const T *B, T beta, T *C) const {
rocblas_operation cuTransA =
!trans_a ? rocblas_operation_transpose : rocblas_operation_none;
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1);
});
}
template <> template <>
template <> template <>
...@@ -612,6 +856,23 @@ inline void Blas<platform::CUDADeviceContext>::GEMV( ...@@ -612,6 +856,23 @@ inline void Blas<platform::CUDADeviceContext>::GEMV(
alpha, A, B, beta, C); alpha, A, B, beta, C);
} }
} }
template <>
template <>
inline void Blas<pten::GPUContext>::GEMV(bool trans_a, int M, int N,
platform::float16 alpha,
const platform::float16 *A,
const platform::float16 *B,
platform::float16 beta,
platform::float16 *C) const {
  // A half-precision GEMV is not available here, so we emulate it with a
  // GEMM call.
if (trans_a) {
this->template GEMM<platform::float16>(CblasNoTrans, CblasNoTrans, 1, N, M,
alpha, B, A, beta, C);
} else {
this->template GEMM<platform::float16>(CblasNoTrans, CblasNoTrans, M, 1, N,
alpha, A, B, beta, C);
}
}
template <> template <>
template <typename T> template <typename T>
...@@ -637,6 +898,32 @@ void Blas<platform::CUDADeviceContext>::BatchedGEMM( ...@@ -637,6 +898,32 @@ void Blas<platform::CUDADeviceContext>::BatchedGEMM(
ldc, strideC, batchCount); ldc, strideC, batchCount);
}); });
} }
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M, int N,
int K, T alpha, const T *A, const T *B,
T beta, T *C, int batchCount,
int64_t strideA,
int64_t strideB) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
int ldc = N;
rocblas_operation cuTransA = (transA == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
rocblas_operation cuTransB = (transB == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
const int64_t strideC = M * N;
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::GEMM_STRIDED_BATCH(handle, cuTransB, cuTransA, N, M, K, &alpha,
B, ldb, strideB, A, lda, strideA, &beta, C,
ldc, strideC, batchCount);
});
}
template <> template <>
template <typename T> template <typename T>
...@@ -648,6 +935,18 @@ void Blas<platform::CUDADeviceContext>::BatchedGEMM( ...@@ -648,6 +935,18 @@ void Blas<platform::CUDADeviceContext>::BatchedGEMM(
C[k]); C[k]);
} }
} }
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M, int N,
int K, T alpha, const T **A,
const T **B, T beta, T **C,
int batchCount) const {
for (int k = 0; k < batchCount; ++k) {
this->template GEMM<T>(transA, transB, M, N, K, alpha, A[k], B[k], beta,
C[k]);
}
}
template <> template <>
template <> template <>
...@@ -661,6 +960,18 @@ inline void Blas<platform::CUDADeviceContext>::BatchedGEMM( ...@@ -661,6 +960,18 @@ inline void Blas<platform::CUDADeviceContext>::BatchedGEMM(
B[k], beta, C[k]); B[k], beta, C[k]);
} }
} }
template <>
template <>
inline void Blas<pten::GPUContext>::BatchedGEMM(
CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
platform::float16 alpha, const platform::float16 **A,
const platform::float16 **B, platform::float16 beta, platform::float16 **C,
int batchCount) const {
for (int k = 0; k < batchCount; ++k) {
this->template GEMM<platform::float16>(transA, transB, M, N, K, alpha, A[k],
B[k], beta, C[k]);
}
}
template <> template <>
template <typename T> template <typename T>
...@@ -687,6 +998,30 @@ void Blas<platform::CUDADeviceContext>::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo, ...@@ -687,6 +998,30 @@ void Blas<platform::CUDADeviceContext>::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo,
lda, B, ldb); lda, B, ldb);
}); });
} }
template <>
template <typename T>
void Blas<pten::GPUContext>::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo,
CBLAS_TRANSPOSE transA, CBLAS_DIAG diag,
int M, int N, T alpha, const T *A, int lda,
T *B, int ldb) const {
// solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'`
// where ' stands for transpose
rocblas_side cuSide =
(side == CblasLeft) ? rocblas_side_right : rocblas_side_left;
rocblas_fill cuUplo =
(uplo == CblasLower) ? rocblas_fill_upper : rocblas_fill_lower;
// use CUBLAS_OP_C (conjugate transpose) for complex
rocblas_operation cuTransA = (transA == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
rocblas_diagonal cuDiag =
(diag == CblasUnit) ? rocblas_diagonal_unit : rocblas_diagonal_non_unit;
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::TRSM(handle, cuSide, cuUplo, cuTransA, cuDiag, N, M, &alpha, A,
lda, B, ldb);
});
}
template <> template <>
template <typename T> template <typename T>
...@@ -697,6 +1032,14 @@ void Blas<platform::CUDADeviceContext>::BatchedGETRF(int n, T **a, int *ipiv, ...@@ -697,6 +1032,14 @@ void Blas<platform::CUDADeviceContext>::BatchedGETRF(int n, T **a, int *ipiv,
CUBlas<T>::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size); CUBlas<T>::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size);
}); });
} }
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedGETRF(int n, T **a, int *ipiv, int *info,
int batch_size) const {
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size);
});
}
template <> template <>
template <typename T> template <typename T>
...@@ -715,6 +1058,22 @@ void Blas<platform::CUDADeviceContext>::BatchedGETRI(int n, const T **a, ...@@ -715,6 +1058,22 @@ void Blas<platform::CUDADeviceContext>::BatchedGETRI(int n, const T **a,
CUBlas<T>::GETRI_BATCH(handle, n, a, n, ipiv, a_inv, n, info, batch_size); CUBlas<T>::GETRI_BATCH(handle, n, a, n, ipiv, a_inv, n, info, batch_size);
}); });
} }
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedGETRI(int n, const T **a, const int *ipiv,
T **a_inv, int *info,
int batch_size) const {
PADDLE_ENFORCE_NE(
a_inv, a,
platform::errors::InvalidArgument(
"cuBLAS fuction 'cublas<S/D>getrfBatched' cannot be executed "
"in-place. The memory space of output matrix (address: %p) cannot "
"overlap memory space of input matrix (address: %p).",
a_inv, a));
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::GETRI_BATCH(handle, n, a, n, ipiv, a_inv, n, info, batch_size);
});
}
template <> template <>
template <typename T> template <typename T>
...@@ -725,6 +1084,14 @@ void Blas<platform::CUDADeviceContext>::BatchedMatInv(int n, const T **a, ...@@ -725,6 +1084,14 @@ void Blas<platform::CUDADeviceContext>::BatchedMatInv(int n, const T **a,
CUBlas<T>::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size); CUBlas<T>::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size);
}); });
} }
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedMatInv(int n, const T **a, T **a_inv,
int *info, int batch_size) const {
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size);
});
}
template <> template <>
template <typename T> template <typename T>
...@@ -739,6 +1106,20 @@ void Blas<platform::CUDADeviceContext>::BatchedGETRS( ...@@ -739,6 +1106,20 @@ void Blas<platform::CUDADeviceContext>::BatchedGETRS(
batch_size); batch_size);
}); });
} }
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedGETRS(CBLAS_TRANSPOSE trans, int n,
int nrhs, const T **a, int lda,
int *ipiv, T **b, int ldb, int *info,
int batch_size) const {
rocblas_operation cuTrans = (trans == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::GETRS_BATCH(handle, cuTrans, n, nrhs, a, lda, ipiv, b, ldb, info,
batch_size);
});
}
template <> template <>
template <typename T> template <typename T>
...@@ -764,6 +1145,31 @@ void Blas<platform::CUDADeviceContext>::BatchedTRSM( ...@@ -764,6 +1145,31 @@ void Blas<platform::CUDADeviceContext>::BatchedTRSM(
&alpha, A, lda, B, ldb, batch_size); &alpha, A, lda, B, ldb, batch_size);
}); });
} }
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedTRSM(CBLAS_SIDE side, CBLAS_UPLO uplo,
CBLAS_TRANSPOSE transA,
CBLAS_DIAG diag, int M, int N, T alpha,
const T **A, int lda, T **B, int ldb,
int batch_size) const {
// solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'`
// where ' stands for transpose
rocblas_side cuSide =
(side == CblasLeft) ? rocblas_side_right : rocblas_side_left;
rocblas_fill cuUplo =
(uplo == CblasLower) ? rocblas_fill_upper : rocblas_fill_lower;
// use CUBLAS_OP_C (conjugate transpose) for complex
rocblas_operation cuTransA = (transA == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
rocblas_diagonal cuDiag =
(diag == CblasUnit) ? rocblas_diagonal_unit : rocblas_diagonal_non_unit;
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::TRSM_BATCH(handle, cuSide, cuUplo, cuTransA, cuDiag, N, M,
&alpha, A, lda, B, ldb, batch_size);
});
}
} // namespace math } // namespace math
} // namespace operators } // namespace operators
......
...@@ -16,6 +16,8 @@ limitations under the License. */ ...@@ -16,6 +16,8 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
/** /**
* case 1: * case 1:
...@@ -441,6 +443,31 @@ void TestConcatMain() { ...@@ -441,6 +443,31 @@ void TestConcatMain() {
delete context; delete context;
} }
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <>
void TestConcatMain<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace>() {
auto* context =
new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace());
context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CUDAPlace(), context->stream())
.get());
context->PartialInitWithAllocator();
ConcatCase1<paddle::platform::CUDADeviceContext, paddle::platform::CUDAPlace>(
context);
ConcatCase2<paddle::platform::CUDADeviceContext, paddle::platform::CUDAPlace>(
context);
ConcatCase3<paddle::platform::CUDADeviceContext, paddle::platform::CUDAPlace>(
context);
ConcatCase4<paddle::platform::CUDADeviceContext, paddle::platform::CUDAPlace>(
context);
delete context;
}
#endif
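The allocator wiring repeated in the GPU test specializations of this patch can be factored into one helper; a hedged sketch is given below (not part of this diff, and the allocator_facade.h include path is an assumption based on the AllocatorFacade usage above):
#include "paddle/fluid/memory/allocation/allocator_facade.h"  // assumed path
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"

// Sketch: construct a CUDADeviceContext that is ready for use after the GPU
// context refactor, i.e. with an allocator bound to its stream.
static paddle::platform::CUDADeviceContext *MakeInitializedCudaContext(
    const paddle::platform::CUDAPlace &place) {
  auto *ctx = new paddle::platform::CUDADeviceContext(place);
  ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                        .GetAllocator(place, ctx->stream())
                        .get());
  ctx->PartialInitWithAllocator();
  return ctx;
}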
TEST(math, concat) { TEST(math, concat) {
TestConcatMain<paddle::platform::CPUDeviceContext, TestConcatMain<paddle::platform::CPUDeviceContext,
paddle::platform::CPUPlace>(); paddle::platform::CPUPlace>();
......
...@@ -24,6 +24,11 @@ void TestNNZ(const std::vector<T>& dense_data, const int correct_nnz, ...@@ -24,6 +24,11 @@ void TestNNZ(const std::vector<T>& dense_data, const int correct_nnz,
const int rows, const int cols) { const int rows, const int cols) {
paddle::platform::CUDADeviceContext* context = paddle::platform::CUDADeviceContext* context =
new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace()); new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace());
context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CUDAPlace(), context->stream())
.get());
context->PartialInitWithAllocator();
auto sparse = auto sparse =
paddle::operators::math::GetSparse<paddle::platform::CUDADeviceContext, paddle::operators::math::GetSparse<paddle::platform::CUDADeviceContext,
T>(*context); T>(*context);
...@@ -61,6 +66,11 @@ void TestDenseToSparse(const std::vector<T>& correct_dense_data, ...@@ -61,6 +66,11 @@ void TestDenseToSparse(const std::vector<T>& correct_dense_data,
const std::string& mode) { const std::string& mode) {
paddle::platform::CUDADeviceContext* context = paddle::platform::CUDADeviceContext* context =
new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace()); new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace());
context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CUDAPlace(), context->stream())
.get());
context->PartialInitWithAllocator();
// get sparse // get sparse
auto sparse = auto sparse =
paddle::operators::math::GetSparse<paddle::platform::CUDADeviceContext, paddle::operators::math::GetSparse<paddle::platform::CUDADeviceContext,
......
...@@ -15,6 +15,8 @@ limitations under the License. */ ...@@ -15,6 +15,8 @@ limitations under the License. */
#include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/im2col.h"
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "paddle/fluid/operators/math/im2col_cfo_cpu.h" #include "paddle/fluid/operators/math/im2col_cfo_cpu.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
template <typename DeviceContext, typename Place> template <typename DeviceContext, typename Place>
void testIm2col() { void testIm2col() {
...@@ -60,6 +62,7 @@ void testIm2col() { ...@@ -60,6 +62,7 @@ void testIm2col() {
auto* place = new Place(); auto* place = new Place();
DeviceContext* context = new DeviceContext(*place); DeviceContext* context = new DeviceContext(*place);
if (paddle::platform::is_cpu_place(*place)) { if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp; input = input_tmp;
} else { } else {
...@@ -164,6 +167,165 @@ void testIm2col() { ...@@ -164,6 +167,165 @@ void testIm2col() {
delete context; delete context;
} }
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <>
void testIm2col<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace>() {
paddle::framework::Tensor input_tmp;
paddle::framework::Tensor input;
paddle::framework::Tensor output_cfo;
paddle::framework::Tensor output_ocf;
paddle::framework::Tensor output_tmp;
/**
* input = [0, 1, 2,
* 3, 4, 5]
*
* output_cfo = [0, 1
* 1, 2
* 3, 4
* 4, 5]
*
* output_ocf = [0, 1, 3, 4
* 1, 2, 4, 5]
*
* col2im_cfo = [0, 2, 2
   *               3, 8, 5]
   *
   * col2im_ocf = [0, 2, 2
   *               3, 8, 5]
*/
int input_height = 2;
int input_width = 3;
int filter_size = 2;
std::vector<int> stride({1, 1}); // stride_y, stride_x
std::vector<int> padding(
{0, 0, 0, 0}); // up_pad, left_pad, down_pad, right_pad
std::vector<int> dilation({1, 1}); // dilation_y, dilation_x
int output_height =
(input_height - filter_size + padding[0] + padding[1]) / stride[0] + 1;
int output_width =
(input_width - filter_size + padding[2] + padding[3]) / stride[1] + 1;
float* input_ptr = input_tmp.mutable_data<float>(
{1, input_height, input_width}, paddle::platform::CPUPlace());
float arr[6] = {0, 1, 2, 3, 4, 5};
memcpy(input_ptr, arr, 6 * sizeof(float));
auto* place = new paddle::platform::CUDAPlace();
auto* context = new paddle::platform::CUDADeviceContext(*place);
context->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*place, context->stream())
.get());
context->PartialInitWithAllocator();
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
paddle::framework::TensorCopySync(input_tmp, *place, &input);
}
output_cfo.mutable_data<float>(
{1, filter_size, filter_size, output_height, output_width}, *place);
output_ocf.mutable_data<float>(
{output_height, output_width, 1, filter_size, filter_size}, *place);
// Im2Col
paddle::operators::math::Im2ColFunctor<
paddle::operators::math::ColFormat::kCFO,
paddle::platform::CUDADeviceContext, float>
im2col;
paddle::operators::math::Im2ColFunctor<
paddle::operators::math::ColFormat::kOCF,
paddle::platform::CUDADeviceContext, float>
im2col_ocf;
im2col(*context, input, dilation, stride, padding, &output_cfo);
im2col_ocf(*context, input, dilation, stride, padding, &output_ocf);
float out_cfo_data[] = {0, 1, 1, 2, 3, 4, 4, 5};
float out_ocf_data[] = {0, 1, 3, 4, 1, 2, 4, 5};
float* out_cfo_ptr;
if (paddle::platform::is_cpu_place(*place)) {
out_cfo_ptr = output_cfo.data<float>();
} else {
paddle::framework::TensorCopySync(output_cfo, paddle::platform::CPUPlace(),
&output_tmp);
out_cfo_ptr = output_tmp.data<float>();
}
for (int i = 0; i < 6; ++i) {
EXPECT_EQ(out_cfo_ptr[i], out_cfo_data[i]);
}
float* out_ocf_ptr;
if (paddle::platform::is_cpu_place(*place)) {
out_ocf_ptr = output_ocf.data<float>();
} else {
paddle::framework::TensorCopySync(output_ocf, paddle::platform::CPUPlace(),
&output_tmp);
out_ocf_ptr = output_tmp.data<float>();
}
for (int i = 0; i < 6; ++i) {
EXPECT_EQ(out_ocf_ptr[i], out_ocf_data[i]);
}
// Col2Im: kCFO
paddle::operators::math::Col2ImFunctor<
paddle::operators::math::ColFormat::kCFO,
paddle::platform::CUDADeviceContext, float>
col2im;
paddle::operators::math::Col2ImFunctor<
paddle::operators::math::ColFormat::kOCF,
paddle::platform::CUDADeviceContext, float>
col2im_ocf;
float col2im_data[] = {0, 2, 2, 3, 8, 5};
memset(input_ptr, 0, 6 * sizeof(float));
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
paddle::framework::TensorCopySync(input_tmp, *place, &input);
}
col2im(*context, output_cfo, dilation, stride, padding, &input);
float* in_ptr;
if (paddle::platform::is_cpu_place(*place)) {
in_ptr = input.data<float>();
} else {
paddle::framework::TensorCopySync(input, paddle::platform::CPUPlace(),
&input_tmp);
in_ptr = input_tmp.data<float>();
}
for (int i = 0; i < 6; ++i) {
EXPECT_EQ(in_ptr[i], col2im_data[i]);
}
// Col2Im: kOCF
memset(input_ptr, 0, 6 * sizeof(float));
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
paddle::framework::TensorCopySync(input_tmp, *place, &input);
}
col2im_ocf(*context, output_ocf, dilation, stride, padding, &input);
if (paddle::platform::is_cpu_place(*place)) {
in_ptr = input.data<float>();
} else {
paddle::framework::TensorCopySync(input, paddle::platform::CPUPlace(),
&input_tmp);
in_ptr = input_tmp.data<float>();
}
for (int i = 0; i < 6; ++i) {
EXPECT_EQ(in_ptr[i], col2im_data[i]);
}
delete place;
delete context;
}
#endif
TEST(math, im2col) { TEST(math, im2col) {
testIm2col<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>(); testIm2col<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
......
...@@ -194,7 +194,7 @@ static void InclusiveScanInnerDim(const T *x, T *y, size_t outer_dim, ...@@ -194,7 +194,7 @@ static void InclusiveScanInnerDim(const T *x, T *y, size_t outer_dim,
constexpr size_t kThreadNumY = 32; constexpr size_t kThreadNumY = 32;
size_t grid_dim = (outer_dim + kThreadNumY - 1) / kThreadNumY; size_t grid_dim = (outer_dim + kThreadNumY - 1) / kThreadNumY;
grid_dim = std::min<size_t>(grid_dim, dev_ctx.GetCUDAMaxGridDimSize().x); grid_dim = std::min<size_t>(grid_dim, dev_ctx.GetCUDAMaxGridDimSize()[0]);
dim3 thread_dims(kThreadNumX, kThreadNumY); dim3 thread_dims(kThreadNumX, kThreadNumY);
if (reverse) { if (reverse) {
InclusiveScanInnerDimCUDAKernel< InclusiveScanInnerDimCUDAKernel<
......
...@@ -21,6 +21,7 @@ limitations under the License. */ ...@@ -21,6 +21,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/math_function_impl.h" #include "paddle/fluid/operators/math/math_function_impl.h"
#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#include "paddle/pten/kernels/funcs/eigen/common.h" #include "paddle/pten/kernels/funcs/eigen/common.h"
namespace paddle { namespace paddle {
...@@ -44,6 +45,18 @@ template struct SetConstant<platform::CUDADeviceContext, ...@@ -44,6 +45,18 @@ template struct SetConstant<platform::CUDADeviceContext,
template struct SetConstant<platform::CUDADeviceContext, template struct SetConstant<platform::CUDADeviceContext,
platform::complex<double>>; platform::complex<double>>;
template struct SetConstant<pten::GPUContext, platform::float16>;
template struct SetConstant<pten::GPUContext, platform::bfloat16>;
template struct SetConstant<pten::GPUContext, float>;
template struct SetConstant<pten::GPUContext, double>;
template struct SetConstant<pten::GPUContext, uint8_t>;
template struct SetConstant<pten::GPUContext, int>;
template struct SetConstant<pten::GPUContext, int16_t>;
template struct SetConstant<pten::GPUContext, int64_t>;
template struct SetConstant<pten::GPUContext, bool>;
template struct SetConstant<pten::GPUContext, platform::complex<float>>;
template struct SetConstant<pten::GPUContext, platform::complex<double>>;
template struct SetConstant<platform::CUDAPinnedDeviceContext, template struct SetConstant<platform::CUDAPinnedDeviceContext,
platform::float16>; platform::float16>;
template struct SetConstant<platform::CUDAPinnedDeviceContext, template struct SetConstant<platform::CUDAPinnedDeviceContext,
......
...@@ -223,6 +223,7 @@ TEST(math_funciton, set_constant) { ...@@ -223,6 +223,7 @@ TEST(math_funciton, set_constant) {
t.Resize({10, 10}); t.Resize({10, 10});
t.mutable_data<int>(paddle::platform::CPUPlace()); t.mutable_data<int>(paddle::platform::CPUPlace());
auto* ctx = new paddle::platform::CPUDeviceContext(); auto* ctx = new paddle::platform::CPUDeviceContext();
ctx->Init();
paddle::operators::math::set_constant(*ctx, &t, 10); paddle::operators::math::set_constant(*ctx, &t, 10);
for (int64_t i = 0; i < t.numel(); ++i) { for (int64_t i = 0; i < t.numel(); ++i) {
PADDLE_ENFORCE_EQ(10, t.data<int>()[i], PADDLE_ENFORCE_EQ(10, t.data<int>()[i],
......
...@@ -46,6 +46,10 @@ TEST(math_function, notrans_mul_trans_fp32) { ...@@ -46,6 +46,10 @@ TEST(math_function, notrans_mul_trans_fp32) {
paddle::platform::CPUPlace cpu_place; paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place); paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place); float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
float arr[6] = {0, 1, 2, 3, 4, 5}; float arr[6] = {0, 1, 2, 3, 4, 5};
...@@ -78,6 +82,10 @@ TEST(math_function, notrans_mul_trans_fp16) { ...@@ -78,6 +82,10 @@ TEST(math_function, notrans_mul_trans_fp16) {
paddle::platform::CPUPlace cpu_place; paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place); paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
// fp16 GEMM in cublas requires GPU compute capability >= 53 // fp16 GEMM in cublas requires GPU compute capability >= 53
if (context.GetComputeCapability() < 53) { if (context.GetComputeCapability() < 53) {
...@@ -117,6 +125,10 @@ TEST(math_function, trans_mul_notrans_fp32) { ...@@ -117,6 +125,10 @@ TEST(math_function, trans_mul_notrans_fp32) {
paddle::platform::CPUPlace cpu_place; paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place); paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place); float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
float arr[6] = {0, 1, 2, 3, 4, 5}; float arr[6] = {0, 1, 2, 3, 4, 5};
...@@ -155,6 +167,10 @@ TEST(math_function, trans_mul_notrans_fp16) { ...@@ -155,6 +167,10 @@ TEST(math_function, trans_mul_notrans_fp16) {
paddle::platform::CPUPlace cpu_place; paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place); paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
// fp16 GEMM in cublas requires GPU compute capability >= 53 // fp16 GEMM in cublas requires GPU compute capability >= 53
if (context.GetComputeCapability() < 53) { if (context.GetComputeCapability() < 53) {
...@@ -200,6 +216,10 @@ TEST(math_function, gemm_notrans_cublas_fp32) { ...@@ -200,6 +216,10 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
paddle::platform::CPUPlace cpu_place; paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place); paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
int m = 2; int m = 2;
int n = 3; int n = 3;
...@@ -254,6 +274,10 @@ TEST(math_function, gemm_notrans_cublas_fp16) { ...@@ -254,6 +274,10 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
paddle::platform::CPUPlace cpu_place; paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place); paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
// fp16 GEMM in cublas requires GPU compute capability >= 53 // fp16 GEMM in cublas requires GPU compute capability >= 53
if (context.GetComputeCapability() < 53) { if (context.GetComputeCapability() < 53) {
...@@ -316,6 +340,10 @@ TEST(math_function, gemm_trans_cublas_fp32) { ...@@ -316,6 +340,10 @@ TEST(math_function, gemm_trans_cublas_fp32) {
paddle::platform::CPUPlace cpu_place; paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place); paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
int m = 2; int m = 2;
int n = 3; int n = 3;
...@@ -364,6 +392,10 @@ TEST(math_function, gemm_trans_cublas_fp16) { ...@@ -364,6 +392,10 @@ TEST(math_function, gemm_trans_cublas_fp16) {
paddle::platform::CPUPlace cpu_place; paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place); paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
// fp16 GEMM in cublas requires GPU compute capability >= 53 // fp16 GEMM in cublas requires GPU compute capability >= 53
if (context.GetComputeCapability() < 53) { if (context.GetComputeCapability() < 53) {
...@@ -418,6 +450,10 @@ void GemvTest(int m, int n, bool trans) { ...@@ -418,6 +450,10 @@ void GemvTest(int m, int n, bool trans) {
paddle::platform::CPUPlace cpu_place; paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place); paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
T* data_a = mat_a.mutable_data<T>({m, n}, cpu_place); T* data_a = mat_a.mutable_data<T>({m, n}, cpu_place);
T* data_b = vec_b.mutable_data<T>({trans ? m : n}, cpu_place); T* data_b = vec_b.mutable_data<T>({trans ? m : n}, cpu_place);
......
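Every test hunk above repeats the same three-step setup: construct the CUDADeviceContext, hand it a stream-bound allocator from AllocatorFacade, then call PartialInitWithAllocator(). A minimal sketch of a helper that bundles those steps (the helper itself is hypothetical and not part of this PR; it only reuses calls shown in the diff):

void InitTestCudaContext(paddle::platform::CUDADeviceContext* ctx,
                         const paddle::platform::CUDAPlace& place) {
  // Bind the device allocator to the context's own stream ...
  ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                        .GetAllocator(place, ctx->stream())
                        .get());
  // ... then finish the initialization that the constructor now defers.
  ctx->PartialInitWithAllocator();
}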
...@@ -15,6 +15,8 @@ limitations under the License. */ ...@@ -15,6 +15,8 @@ limitations under the License. */
#include "paddle/fluid/operators/math/vol2col.h" #include "paddle/fluid/operators/math/vol2col.h"
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
template <typename DeviceContext, typename Place> template <typename DeviceContext, typename Place>
void testVol2col() { void testVol2col() {
...@@ -25,7 +27,6 @@ void testVol2col() { ...@@ -25,7 +27,6 @@ void testVol2col() {
auto* place = new Place(); auto* place = new Place();
DeviceContext* context = new DeviceContext(*place); DeviceContext* context = new DeviceContext(*place);
/** /**
* input = [[0, 1, 2, * input = [[0, 1, 2,
* 3, 4, 5] * 3, 4, 5]
...@@ -123,6 +124,124 @@ void testVol2col() { ...@@ -123,6 +124,124 @@ void testVol2col() {
delete context; delete context;
} }
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <>
void testVol2col<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace>() {
paddle::framework::Tensor input;
paddle::framework::Tensor input_tmp;
paddle::framework::Tensor output;
paddle::framework::Tensor output_tmp;
auto* place = new paddle::platform::CUDAPlace();
auto* context = new paddle::platform::CUDADeviceContext(*place);
context->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*place, context->stream())
.get());
context->PartialInitWithAllocator();
/**
* input = [[0, 1, 2,
* 3, 4, 5]
* [6, 7, 8,
* 9, 10, 11]]
*
* output = [0, 1
* 1, 2
* 3, 4
* 4, 5
* 6, 7
* 7, 8
* 9, 10
* 10, 11]
*
* col2vol = [[0, 2, 2,
* 3, 8, 5]
* [6, 14, 8,
* 9, 20, 11]]
*
*/
int input_depth = 2;
int input_height = 2;
int input_width = 3;
int filter_size = 2;
std::vector<int> strides({1, 1, 1});
std::vector<int> paddings({0, 0, 0});
std::vector<int> dilations({1, 1, 1});
int output_depth =
(input_depth - filter_size + 2 * paddings[0]) / strides[0] + 1;
int output_height =
(input_height - filter_size + 2 * paddings[1]) / strides[1] + 1;
int output_width =
(input_width - filter_size + 2 * paddings[2]) / strides[2] + 1;
// Vol2Col test
float* input_ptr =
input_tmp.mutable_data<float>({1, input_depth, input_height, input_width},
paddle::platform::CPUPlace());
float arr[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
memcpy(input_ptr, arr, 12 * sizeof(float));
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
paddle::framework::TensorCopySync(input_tmp, *place, &input);
}
output.mutable_data<float>({1, filter_size, filter_size, filter_size,
output_depth, output_height, output_width},
*place);
paddle::operators::math::Vol2ColFunctor<paddle::platform::CUDADeviceContext,
float>
vol2col;
vol2col(*context, input, dilations, strides, paddings, &output);
float vol_2_col[] = {0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11};
float* out_cfo_ptr;
if (paddle::platform::is_cpu_place(*place)) {
out_cfo_ptr = output.data<float>();
} else {
paddle::framework::TensorCopySync(output, paddle::platform::CPUPlace(),
&output_tmp);
out_cfo_ptr = output_tmp.data<float>();
}
for (int i = 0; i < 16; ++i) {
EXPECT_EQ(out_cfo_ptr[i], vol_2_col[i]);
}
// Col2Vol test
float col_2_vol[] = {0, 2, 2, 3, 8, 5, 6, 14, 8, 9, 20, 11};
memset(input_ptr, 0, 12 * sizeof(float));
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
paddle::framework::TensorCopySync(input_tmp, *place, &input);
}
paddle::operators::math::Col2VolFunctor<paddle::platform::CUDADeviceContext,
float>
col2vol;
col2vol(*context, output, dilations, strides, paddings, &input);
float* in_ptr;
if (paddle::platform::is_cpu_place(*place)) {
in_ptr = input.data<float>();
} else {
paddle::framework::TensorCopySync(input, paddle::platform::CPUPlace(),
&input_tmp);
in_ptr = input_tmp.data<float>();
}
for (int i = 0; i < 12; ++i) {
EXPECT_EQ(in_ptr[i], col_2_vol[i]);
}
delete place;
delete context;
}
#endif
TEST(math, vol2col) { TEST(math, vol2col) {
testVol2col<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>(); testVol2col<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
......
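The dedicated CUDA specialization of testVol2col added above is needed because the generic template builds its context with `new DeviceContext(*place)` and uses it right away, while after this PR a CUDADeviceContext constructed that way is not yet usable. The GPU path therefore has to repeat the setup seen throughout this diff right after construction (sketch of just those lines, using the same calls as the specialization):

auto* context = new paddle::platform::CUDADeviceContext(*place);
context->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                          .GetAllocator(*place, context->stream())
                          .get());
context->PartialInitWithAllocator();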
...@@ -66,7 +66,8 @@ class MeanCUDAKernel : public framework::OpKernel<T> { ...@@ -66,7 +66,8 @@ class MeanCUDAKernel : public framework::OpKernel<T> {
reduce_dims.push_back(i); reduce_dims.push_back(i);
} }
TensorReduceFunctorImpl<T, T, kernel_primitives::AddFunctor, Div>( TensorReduceFunctorImpl<T, T, kernel_primitives::AddFunctor, Div>(
*input, output, Div(numel), reduce_dims, stream); context.cuda_device_context(), *input, output, Div(numel), reduce_dims,
stream);
} }
}; };
......
...@@ -57,7 +57,12 @@ class NCCLTester : public ::testing::Test { ...@@ -57,7 +57,12 @@ class NCCLTester : public ::testing::Test {
paddle::platform::CPUPlace cpu_place; paddle::platform::CPUPlace cpu_place;
for (size_t i = 0; i < gpu_list_.size(); ++i) { for (size_t i = 0; i < gpu_list_.size(); ++i) {
p::CUDAPlace place(i); p::CUDAPlace place(i);
dev_ctxs_.emplace_back(new p::CUDADeviceContext(place)); auto *ctx = new p::CUDADeviceContext(place);
ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx->stream())
.get());
ctx->PartialInitWithAllocator();
dev_ctxs_.emplace_back(ctx);
} }
NCCLInitOp(); NCCLInitOp();
......
...@@ -106,16 +106,20 @@ class PnormCUDAKernel : public framework::OpKernel<T> { ...@@ -106,16 +106,20 @@ class PnormCUDAKernel : public framework::OpKernel<T> {
using MT = typename details::MPTypeTrait<T>::Type; using MT = typename details::MPTypeTrait<T>::Type;
if (porder == 0) { if (porder == 0) {
TensorReduceFunctorImpl<T, T, kps::AddFunctor, NonzeroFunctor<T>>( TensorReduceFunctorImpl<T, T, kps::AddFunctor, NonzeroFunctor<T>>(
*in_x, out_norm, NonzeroFunctor<T>(), reduce_axis, stream); ctx.cuda_device_context(), *in_x, out_norm, NonzeroFunctor<T>(),
reduce_axis, stream);
} else if (porder == INFINITY) { } else if (porder == INFINITY) {
TensorReduceFunctorImpl<T, T, kps::MaxFunctor, AbsFunctor<T>>( TensorReduceFunctorImpl<T, T, kps::MaxFunctor, AbsFunctor<T>>(
*in_x, out_norm, AbsFunctor<T>(), reduce_axis, stream); ctx.cuda_device_context(), *in_x, out_norm, AbsFunctor<T>(),
reduce_axis, stream);
} else if (porder == -INFINITY) { } else if (porder == -INFINITY) {
TensorReduceFunctorImpl<T, T, kps::MinFunctor, AbsFunctor<T>>( TensorReduceFunctorImpl<T, T, kps::MinFunctor, AbsFunctor<T>>(
*in_x, out_norm, AbsFunctor<T>(), reduce_axis, stream); ctx.cuda_device_context(), *in_x, out_norm, AbsFunctor<T>(),
reduce_axis, stream);
} else { } else {
TensorReduceFunctorImpl<T, T, kps::AddFunctor, UnsignedPowFunctor<T>>( TensorReduceFunctorImpl<T, T, kps::AddFunctor, UnsignedPowFunctor<T>>(
*in_x, out_norm, UnsignedPowFunctor<T>(porder), reduce_axis, stream); ctx.cuda_device_context(), *in_x, out_norm,
UnsignedPowFunctor<T>(porder), reduce_axis, stream);
const framework::Tensor* tmp_norm = out_norm; const framework::Tensor* tmp_norm = out_norm;
std::vector<const framework::Tensor*> ins = {tmp_norm}; std::vector<const framework::Tensor*> ins = {tmp_norm};
......
...@@ -208,8 +208,8 @@ class PoolKernel : public framework::OpKernel<T> { ...@@ -208,8 +208,8 @@ class PoolKernel : public framework::OpKernel<T> {
auto stream = dev_ctx.stream(); auto stream = dev_ctx.stream();
TensorReduceFunctorImpl<T, T, kps::AddFunctor, TensorReduceFunctorImpl<T, T, kps::AddFunctor,
kps::DivideFunctor<T>>( kps::DivideFunctor<T>>(
*in_x, out, kps::DivideFunctor<T>(reduce_num), reduce_dim, dev_ctx, *in_x, out, kps::DivideFunctor<T>(reduce_num),
stream); reduce_dim, stream);
#else // for cpu #else // for cpu
paddle::operators::math::Pool2dFunctor< paddle::operators::math::Pool2dFunctor<
DeviceContext, paddle::operators::math::AvgPool<T>, T> DeviceContext, paddle::operators::math::AvgPool<T>, T>
......
...@@ -186,7 +186,8 @@ class CUDAPReluGradKernel : public framework::OpKernel<T> { ...@@ -186,7 +186,8 @@ class CUDAPReluGradKernel : public framework::OpKernel<T> {
} }
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>( TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
dalpha_tmp, dalpha, kps::IdentityFunctor<T>(), reduce_dims, stream); context.cuda_device_context(), dalpha_tmp, dalpha,
kps::IdentityFunctor<T>(), reduce_dims, stream);
} }
}; };
......
...@@ -222,6 +222,10 @@ TEST(SENDANDRECV, GPU) { ...@@ -222,6 +222,10 @@ TEST(SENDANDRECV, GPU) {
framework::Scope* scope = (*micro_scope)[0]; framework::Scope* scope = (*micro_scope)[0];
platform::CUDAPlace place; platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place); platform::CUDADeviceContext ctx(place);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
framework::Executor exe(place); framework::Executor exe(place);
// create var on local scope // create var on local scope
......
...@@ -39,14 +39,16 @@ namespace operators { ...@@ -39,14 +39,16 @@ namespace operators {
template <typename Tx, typename Ty, template <typename> class ReduceOp, template <typename Tx, typename Ty, template <typename> class ReduceOp,
typename TransformOp> typename TransformOp>
void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y, void TensorReduceFunctorImpl(const platform::CUDADeviceContext& dev_ctx,
const framework::Tensor& x, framework::Tensor* y,
const TransformOp& transform, const TransformOp& transform,
const std::vector<int>& origin_reduce_dims, const std::vector<int>& origin_reduce_dims,
gpuStream_t stream) { gpuStream_t stream) {
y->mutable_data<Ty>(x.place()); y->mutable_data<Ty>(x.place());
pten::kernels::TensorReduceFunctorImpl<Tx, Ty, ReduceOp, TransformOp>( pten::kernels::TensorReduceFunctorImpl<Tx, Ty, ReduceOp, TransformOp>(
x, y, transform, origin_reduce_dims, stream); static_cast<const pten::GPUContext&>(dev_ctx), x, y, transform,
origin_reduce_dims, stream);
} }
} // namespace operators } // namespace operators
......
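The header change above is the signature every caller in this diff adapts to: TensorReduceFunctorImpl now takes the CUDADeviceContext as its first argument (in addition to the stream) so the reduction can reach the pten::GPUContext underneath. A hedged sketch of an updated call site, with `ctx`, `input`, `output` and `reduce_dims` as placeholder names:

const auto& dev_ctx = ctx.cuda_device_context();
TensorReduceFunctorImpl<float, float, kps::AddFunctor,
                        kps::IdentityFunctor<float>>(
    dev_ctx, input, &output, kps::IdentityFunctor<float>(), reduce_dims,
    dev_ctx.stream());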
...@@ -156,7 +156,8 @@ class CUDARenormKernel : public framework::OpKernel<T> { ...@@ -156,7 +156,8 @@ class CUDARenormKernel : public framework::OpKernel<T> {
cuda_ctx, ins, &outs, func); cuda_ctx, ins, &outs, func);
std::vector<int> reduce_axis = {0, 2}; std::vector<int> reduce_axis = {0, 2};
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>( TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
pow_value, &dim_value, kps::IdentityFunctor<T>(), reduce_axis, stream); cuda_ctx, pow_value, &dim_value, kps::IdentityFunctor<T>(), reduce_axis,
stream);
RenormKernelFunc3<T><<<grid2, block2, 0, stream>>>( RenormKernelFunc3<T><<<grid2, block2, 0, stream>>>(
numel, dim_value.mutable_data<T>(context.GetPlace()), p, max_norm); numel, dim_value.mutable_data<T>(context.GetPlace()), p, max_norm);
RenormKernelFunc4<T><<<grid, block, 0, stream>>>( RenormKernelFunc4<T><<<grid, block, 0, stream>>>(
...@@ -213,10 +214,11 @@ class CUDAGradRenormKernel : public framework::OpKernel<T> { ...@@ -213,10 +214,11 @@ class CUDAGradRenormKernel : public framework::OpKernel<T> {
dim_divisor); dim_divisor);
std::vector<int> reduce_axis = {0, 2}; std::vector<int> reduce_axis = {0, 2};
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>( TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
pow_value, &dim_value, kps::IdentityFunctor<T>(), reduce_axis, stream); ctx.cuda_device_context(), pow_value, &dim_value,
kps::IdentityFunctor<T>(), reduce_axis, stream);
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>( TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
mul_value, &weight_derivative, kps::IdentityFunctor<T>(), reduce_axis, ctx.cuda_device_context(), mul_value, &weight_derivative,
stream); kps::IdentityFunctor<T>(), reduce_axis, stream);
RenormGradKernelFunc2<T><<<grid, block, 0, stream>>>( RenormGradKernelFunc2<T><<<grid, block, 0, stream>>>(
x_data, dout_data, dx_data, numel, x_data, dout_data, dx_data, numel,
dim_value.mutable_data<T>(ctx.GetPlace()), dim_value.mutable_data<T>(ctx.GetPlace()),
......
...@@ -389,7 +389,8 @@ class ReshapeKernel { ...@@ -389,7 +389,8 @@ class ReshapeKernel {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(ctx.GetPlace())) { if (platform::is_gpu_place(ctx.GetPlace())) {
auto &dev_ctx = ctx.device_context<platform::CUDADeviceContext>(); auto &dev_ctx = ctx.device_context<platform::CUDADeviceContext>();
pten::ReshapeKernel(dev_ctx, *in, pt_scalar_shape, out); pten::ReshapeKernel(static_cast<const pten::GPUContext &>(dev_ctx), *in,
pt_scalar_shape, out);
} }
#endif #endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
...@@ -417,7 +418,8 @@ class ReshapeGradKernel { ...@@ -417,7 +418,8 @@ class ReshapeGradKernel {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(ctx.GetPlace())) { if (platform::is_gpu_place(ctx.GetPlace())) {
auto &dev_ctx = ctx.device_context<platform::CUDADeviceContext>(); auto &dev_ctx = ctx.device_context<platform::CUDADeviceContext>();
pten::ReshapeGradKernel(dev_ctx, *d_out, d_x); pten::ReshapeGradKernel(static_cast<const pten::GPUContext &>(dev_ctx),
*d_out, d_x);
} }
#endif #endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
...@@ -445,7 +447,8 @@ class ReshapeDoubleGradKernel { ...@@ -445,7 +447,8 @@ class ReshapeDoubleGradKernel {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(ctx.GetPlace())) { if (platform::is_gpu_place(ctx.GetPlace())) {
auto &dev_ctx = ctx.device_context<platform::CUDADeviceContext>(); auto &dev_ctx = ctx.device_context<platform::CUDADeviceContext>();
pten::ReshapeDoubleGradKernel(dev_ctx, *dd_x, dd_out); pten::ReshapeDoubleGradKernel(
static_cast<const pten::GPUContext &>(dev_ctx), *dd_x, dd_out);
} }
#endif #endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
......
...@@ -183,8 +183,7 @@ void GPUScatterGradForX(const platform::DeviceContext& ctx, const Tensor& index, ...@@ -183,8 +183,7 @@ void GPUScatterGradForX(const platform::DeviceContext& ctx, const Tensor& index,
int64_t max_grid_dimx = int64_t max_grid_dimx =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx) reinterpret_cast<const platform::CUDADeviceContext&>(ctx)
.GetCUDAMaxGridDimSize() .GetCUDAMaxGridDimSize()[0];
.x;
int64_t grid = height < max_grid_dimx ? height : max_grid_dimx; int64_t grid = height < max_grid_dimx ? height : max_grid_dimx;
ScatterInitCUDAKernel<T, IndexT><<< ScatterInitCUDAKernel<T, IndexT><<<
......
...@@ -46,7 +46,8 @@ void ReduceSumForSolve(const Tensor* input, Tensor* output, ...@@ -46,7 +46,8 @@ void ReduceSumForSolve(const Tensor* input, Tensor* output,
#if defined(__NVCC__) || defined(__HIPCC__) #if defined(__NVCC__) || defined(__HIPCC__)
auto stream = ctx.cuda_device_context().stream(); auto stream = ctx.cuda_device_context().stream();
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>( TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
*input, output, kps::IdentityFunctor<T>(), reduce_dims, stream); ctx.cuda_device_context(), *input, output, kps::IdentityFunctor<T>(),
reduce_dims, stream);
#else #else
ReduceKernelFunctor<DeviceContext, T, ops::SumFunctor>( ReduceKernelFunctor<DeviceContext, T, ops::SumFunctor>(
input, output, reduce_dims, keep_dim, false, ctx) input, output, reduce_dims, keep_dim, false, ctx)
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/fluid/operators/strided_memcpy.h"
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -86,6 +87,10 @@ TEST(StridedMemcpy, GPUCrop) { ...@@ -86,6 +87,10 @@ TEST(StridedMemcpy, GPUCrop) {
platform::CPUPlace cpu; platform::CPUPlace cpu;
platform::CUDADeviceContext ctx(gpu0); platform::CUDADeviceContext ctx(gpu0);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu0, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
auto src_allocation = memory::Alloc(gpu0, sizeof(src)); auto src_allocation = memory::Alloc(gpu0, sizeof(src));
...@@ -124,6 +129,10 @@ TEST(StridedMemcpy, GPUConcat) { ...@@ -124,6 +129,10 @@ TEST(StridedMemcpy, GPUConcat) {
platform::CUDAPlace gpu0(0); platform::CUDAPlace gpu0(0);
platform::CPUPlace cpu; platform::CPUPlace cpu;
platform::CUDADeviceContext ctx(gpu0); platform::CUDADeviceContext ctx(gpu0);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu0, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
auto gpu_src_allocation = memory::Alloc(gpu0, sizeof(src)); auto gpu_src_allocation = memory::Alloc(gpu0, sizeof(src));
int* gpu_src = reinterpret_cast<int*>(gpu_src_allocation->ptr()); int* gpu_src = reinterpret_cast<int*>(gpu_src_allocation->ptr());
memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream()); memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
......
...@@ -37,6 +37,10 @@ void CreateCUDATensor(framework::Scope* scope, const std::string& name, ...@@ -37,6 +37,10 @@ void CreateCUDATensor(framework::Scope* scope, const std::string& name,
tensor->Resize(dims); tensor->Resize(dims);
platform::CUDAPlace place; platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place); platform::CUDADeviceContext ctx(place);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
inference::tensorrt::RandomizeTensor(tensor, place, ctx); inference::tensorrt::RandomizeTensor(tensor, place, ctx);
} }
...@@ -133,6 +137,10 @@ void DynamicShapeTest(bool allow_build_at_runtime) { ...@@ -133,6 +137,10 @@ void DynamicShapeTest(bool allow_build_at_runtime) {
framework::Scope scope; framework::Scope scope;
platform::CUDAPlace place; platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place); platform::CUDADeviceContext ctx(place);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
// Prepare variables. // Prepare variables.
if (allow_build_at_runtime) if (allow_build_at_runtime)
CreateCUDATensor(&scope, "x", std::vector<int64_t>({3, 4, 1, 1})); CreateCUDATensor(&scope, "x", std::vector<int64_t>({3, 4, 1, 1}));
...@@ -159,6 +167,10 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { ...@@ -159,6 +167,10 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
framework::Scope scope; framework::Scope scope;
platform::CUDAPlace place; platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place); platform::CUDADeviceContext ctx(place);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
auto* block_ = program.Proto()->add_blocks(); auto* block_ = program.Proto()->add_blocks();
block_->set_idx(0); block_->set_idx(0);
......
...@@ -411,7 +411,7 @@ bool SortTopk(const platform::CUDADeviceContext& ctx, ...@@ -411,7 +411,7 @@ bool SortTopk(const platform::CUDADeviceContext& ctx,
}; };
int block_size = ComputeBlockSize(num_cols); int block_size = ComputeBlockSize(num_cols);
unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x; unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0];
// actually, int num_rows < max_grid_size // actually, int num_rows < max_grid_size
unsigned int grid_size = num_rows < maxGridDimX unsigned int grid_size = num_rows < maxGridDimX
? static_cast<unsigned int>(num_rows) ? static_cast<unsigned int>(num_rows)
......
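SortTopk picks up the other recurring API change in this PR: GetCUDAMaxGridDimSize() no longer returns a dim3, so callers index the result instead of reading .x (the same indexing change appears in the scatter and Argmax hunks). Illustrative snippet, with `dev_ctx` and `height` as placeholders:

auto max_grid = dev_ctx.GetCUDAMaxGridDimSize();   // indexable, not a dim3
int64_t grid = height < max_grid[0] ? height : max_grid[0];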
...@@ -40,7 +40,8 @@ class TraceCUDAKernel : public framework::OpKernel<T> { ...@@ -40,7 +40,8 @@ class TraceCUDAKernel : public framework::OpKernel<T> {
std::vector<int> reduce_dims; std::vector<int> reduce_dims;
reduce_dims.push_back(out->dims().size()); reduce_dims.push_back(out->dims().size());
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>( TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
diag, out, kps::IdentityFunctor<T>(), reduce_dims, stream); context.cuda_device_context(), diag, out, kps::IdentityFunctor<T>(),
reduce_dims, stream);
} else { } else {
math::SetConstant<DeviceContext, T> functor; math::SetConstant<DeviceContext, T> functor;
functor(context.device_context<DeviceContext>(), out, static_cast<T>(0)); functor(context.device_context<DeviceContext>(), out, static_cast<T>(0));
......
...@@ -45,7 +45,8 @@ class MatrixReduceSumFunctor<platform::CUDADeviceContext, T> { ...@@ -45,7 +45,8 @@ class MatrixReduceSumFunctor<platform::CUDADeviceContext, T> {
} }
gpuStream_t stream = ctx.cuda_device_context().stream(); gpuStream_t stream = ctx.cuda_device_context().stream();
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>( TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
in, out, kps::IdentityFunctor<T>(), out_reduce_dims, stream); ctx.cuda_device_context(), in, out, kps::IdentityFunctor<T>(),
out_reduce_dims, stream);
} }
}; };
......
...@@ -148,7 +148,7 @@ struct Argmax<platform::CUDADeviceContext, T, IndType> { ...@@ -148,7 +148,7 @@ struct Argmax<platform::CUDADeviceContext, T, IndType> {
} }
const auto& dev_ctx = ctx.cuda_device_context(); const auto& dev_ctx = ctx.cuda_device_context();
auto cu_stream = dev_ctx.stream(); auto cu_stream = dev_ctx.stream();
int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x; int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
int64_t height = pre * post; int64_t height = pre * post;
int64_t width = n; int64_t width = n;
int64_t grid_size = height < max_grid_dimx ? height : max_grid_dimx; int64_t grid_size = height < max_grid_dimx ? height : max_grid_dimx;
......
...@@ -138,6 +138,7 @@ if(WITH_CNCL) ...@@ -138,6 +138,7 @@ if(WITH_CNCL)
endif() endif()
if(WITH_GPU OR WITH_ROCM) if(WITH_GPU OR WITH_ROCM)
target_link_libraries(device_context gpu_info gpu_context pten_gpu_info)
target_link_libraries(device_context gpu_resource_pool) target_link_libraries(device_context gpu_resource_pool)
endif() endif()
......
...@@ -66,6 +66,10 @@ TEST(bfloat16, lod_tensor_on_gpu) { ...@@ -66,6 +66,10 @@ TEST(bfloat16, lod_tensor_on_gpu) {
// CPU LoDTensor to GPU LoDTensor // CPU LoDTensor to GPU LoDTensor
CUDAPlace gpu_place(0); CUDAPlace gpu_place(0);
CUDADeviceContext gpu_ctx(gpu_place); CUDADeviceContext gpu_ctx(gpu_place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, gpu_ctx.stream())
.get());
gpu_ctx.PartialInitWithAllocator();
framework::TensorCopy(src_tensor, gpu_place, gpu_ctx, &gpu_tensor); framework::TensorCopy(src_tensor, gpu_place, gpu_ctx, &gpu_tensor);
// GPU LoDTensor to CPU LoDTensor // GPU LoDTensor to CPU LoDTensor
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/collective_helper.h"
#include <utility> #include <utility>
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h" #include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h"
...@@ -187,6 +188,18 @@ NCCLComm* NCCLCommContext::AssignNCCLComm(ncclComm_t comm, int nranks, int rank, ...@@ -187,6 +188,18 @@ NCCLComm* NCCLCommContext::AssignNCCLComm(ncclComm_t comm, int nranks, int rank,
int dev_id, int ring_id) { int dev_id, int ring_id) {
std::unique_ptr<CUDADeviceContext> dev_ctx( std::unique_ptr<CUDADeviceContext> dev_ctx(
new CUDADeviceContext(CUDAPlace(dev_id))); new CUDADeviceContext(CUDAPlace(dev_id)));
dev_ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(CUDAPlace(dev_id), dev_ctx->stream())
.get());
dev_ctx->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(CUDAPlace(dev_id))
.get());
dev_ctx->PartialInitWithAllocator();
std::shared_ptr<platform::CudaEventObject> compute_event( std::shared_ptr<platform::CudaEventObject> compute_event(
platform::CudaEventResourcePool::Instance().New(dev_id)); platform::CudaEventResourcePool::Instance().New(dev_id));
...@@ -329,7 +342,7 @@ BKCLComm* BKCLCommContext::AssignBKCLComm(BKCLContext_t comm, int nranks, ...@@ -329,7 +342,7 @@ BKCLComm* BKCLCommContext::AssignBKCLComm(BKCLContext_t comm, int nranks,
auto* dev_ctx = static_cast<platform::XPUDeviceContext*>( auto* dev_ctx = static_cast<platform::XPUDeviceContext*>(
platform::DeviceContextPool::Instance().Get( platform::DeviceContextPool::Instance().Get(
platform::XPUPlace(dev_id))); platform::XPUPlace(dev_id)));
dev_ctx->set_bkcl_context(comm); dev_ctx->SetBkclContext(comm);
} }
return comm_map_[ring_id][dev_id].get(); return comm_map_[ring_id][dev_id].get();
......
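AssignNCCLComm shows the fullest version of the new context wiring: a context the framework creates for itself gets a stream-bound device allocator, a host (CPU) allocator and a zero-size allocator before PartialInitWithAllocator(). Sketch of that sequence in isolation (assumes allocator_facade.h is included; `dev_id` is a placeholder):

auto place = paddle::platform::CUDAPlace(dev_id);
auto dev_ctx = std::make_unique<paddle::platform::CUDADeviceContext>(place);
auto& facade = paddle::memory::allocation::AllocatorFacade::Instance();
dev_ctx->SetAllocator(facade.GetAllocator(place, dev_ctx->stream()).get());
dev_ctx->SetHostAllocator(
    facade.GetAllocator(paddle::platform::CPUPlace()).get());
dev_ctx->SetZeroAllocator(facade.GetZeroAllocator(place).get());
dev_ctx->PartialInitWithAllocator();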
IF(WITH_GPU) IF(WITH_GPU)
add_subdirectory(cuda) add_subdirectory(cuda)
nv_library(gpu_info SRCS gpu_info.cc DEPS cuda_info gflags glog enforce monitor dynload_cuda) nv_library(gpu_info SRCS gpu_info.cc DEPS pten_gpu_info gflags glog enforce monitor dynload_cuda)
nv_test(cuda_helper_test SRCS cuda_helper_test.cu) nv_test(cuda_helper_test SRCS cuda_helper_test.cu)
nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda) nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda)
ELSEIF(WITH_ROCM) ELSEIF(WITH_ROCM)
add_subdirectory(rocm) add_subdirectory(rocm)
hip_library(gpu_info SRCS gpu_info.cc DEPS rocm_info gflags glog enforce monitor dynload_cuda) hip_library(gpu_info SRCS gpu_info.cc DEPS pten_gpu_info gflags glog enforce monitor dynload_cuda)
hip_test(cuda_helper_test SRCS cuda_helper_test.cu) hip_test(cuda_helper_test SRCS cuda_helper_test.cu)
hip_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda) hip_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda)
......
nv_library(cuda_info SRCS cuda_info.cc DEPS gflags glog enforce monitor dynload_cuda)
nv_library(cuda_graph SRCS cuda_graph.cc DEPS enforce allocator_facade) nv_library(cuda_graph SRCS cuda_graph.cc DEPS enforce allocator_facade)
nv_library(cuda_profiler SRCS cuda_profiler.cc DEPS enforce) nv_library(cuda_profiler SRCS cuda_profiler.cc DEPS enforce)
......
...@@ -14,8 +14,10 @@ ...@@ -14,8 +14,10 @@
#pragma once #pragma once
#include <functional>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include "paddle/fluid/platform/device/gpu/gpu_types.h"
#include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/dynload/cublas.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/macros.h"
...@@ -96,8 +98,7 @@ class CublasHandleHolder { ...@@ -96,8 +98,7 @@ class CublasHandleHolder {
PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasDestroy(handle_)); PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasDestroy(handle_));
} }
template <typename Callback> inline void Call(const std::function<void(blasHandle_t)>& callback) const {
inline void Call(Callback&& callback) const {
std::lock_guard<std::mutex> guard(mtx_); std::lock_guard<std::mutex> guard(mtx_);
callback(handle_); callback(handle_);
} }
......
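CublasHandleHolder::Call (and CusparseHandleHolder::Call below) drop the Callback template in favour of a fixed std::function signature, which lets the holders be used through the pten handle typedefs. Illustrative call, assuming a holder instance named `cublas_handle_` as in CUDAContext:

cublas_handle_->Call([&](blasHandle_t handle) {
  // any cuBLAS routine that needs the raw handle goes here
});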
...@@ -14,11 +14,13 @@ limitations under the License. */ ...@@ -14,11 +14,13 @@ limitations under the License. */
#pragma once #pragma once
#include <functional>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/cusparse.h" #include "paddle/fluid/platform/dynload/cusparse.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/macros.h"
#include "paddle/pten/backends/gpu/gpu_decls.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -45,8 +47,8 @@ class CusparseHandleHolder { ...@@ -45,8 +47,8 @@ class CusparseHandleHolder {
#endif #endif
} }
template <typename Callback> inline void Call(
inline void Call(Callback&& callback) const { const std::function<void(pten::sparseHandle_t)>& callback) const {
std::lock_guard<std::mutex> guard(mtx_); std::lock_guard<std::mutex> guard(mtx_);
callback(handle_); callback(handle_);
} }
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include <array>
#include <cstdlib> #include <cstdlib>
#include <mutex> #include <mutex>
#include <set> #include <set>
...@@ -39,11 +40,12 @@ limitations under the License. */ ...@@ -39,11 +40,12 @@ limitations under the License. */
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/split.h" #include "paddle/fluid/string/split.h"
#include "paddle/pten/backends/gpu/gpu_info.h"
DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb); DECLARE_uint64(initial_gpu_memory_in_mb);
DECLARE_uint64(reallocate_gpu_memory_in_mb); DECLARE_uint64(reallocate_gpu_memory_in_mb);
DECLARE_bool(enable_cublas_tensor_op_math); DECLARE_bool(enable_cublas_tensor_op_math);
DECLARE_string(selected_gpus);
DECLARE_uint64(gpu_memory_limit_mb); DECLARE_uint64(gpu_memory_limit_mb);
constexpr static float fraction_reserve_gpu_memory = 0.05f; constexpr static float fraction_reserve_gpu_memory = 0.05f;
...@@ -51,23 +53,6 @@ constexpr static float fraction_reserve_gpu_memory = 0.05f; ...@@ -51,23 +53,6 @@ constexpr static float fraction_reserve_gpu_memory = 0.05f;
USE_GPU_MEM_STAT; USE_GPU_MEM_STAT;
namespace paddle { namespace paddle {
namespace platform { namespace platform {
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetSelectedDevices() {
// use user specified GPUs in single-node multi-process mode.
std::vector<int> devices;
if (!FLAGS_selected_gpus.empty()) {
auto devices_str = paddle::string::Split(FLAGS_selected_gpus, ',');
for (auto id : devices_str) {
devices.push_back(atoi(id.c_str()));
}
} else {
int count = GetGPUDeviceCount();
for (int i = 0; i < count; ++i) {
devices.push_back(i);
}
}
return devices;
}
void GpuMemoryUsage(size_t *available, size_t *total) { void GpuMemoryUsage(size_t *available, size_t *total) {
size_t actual_available, actual_total; size_t actual_available, actual_total;
...@@ -382,5 +367,91 @@ void *GetGpuBasePtr(void *ptr, int dev_id) { ...@@ -382,5 +367,91 @@ void *GetGpuBasePtr(void *ptr, int dev_id) {
return RecordedGpuMallocHelper::Instance(dev_id)->GetBasePtr(ptr); return RecordedGpuMallocHelper::Instance(dev_id)->GetBasePtr(ptr);
} }
int DnnVersion() { return pten::backends::gpu::DnnVersion(); }
int GetGPUDeviceCount() { return pten::backends::gpu::GetGPUDeviceCount(); }
int GetGPUComputeCapability(int id) {
return pten::backends::gpu::GetGPUComputeCapability(id);
}
int GetGPURuntimeVersion(int id) {
return pten::backends::gpu::GetGPURuntimeVersion(id);
}
int GetGPUDriverVersion(int id) {
return pten::backends::gpu::GetGPUDriverVersion(id);
}
bool TensorCoreAvailable() {
return pten::backends::gpu::TensorCoreAvailable();
}
int GetGPUMultiProcessors(int id) {
return pten::backends::gpu::GetGPUMultiProcessors(id);
}
int GetGPUMaxThreadsPerMultiProcessor(int id) {
return pten::backends::gpu::GetGPUMaxThreadsPerMultiProcessor(id);
}
int GetGPUMaxThreadsPerBlock(int id) {
return pten::backends::gpu::GetGPUMaxThreadsPerBlock(id);
}
int GetCurrentDeviceId() { return pten::backends::gpu::GetCurrentDeviceId(); }
std::array<int, 3> GetGpuMaxGridDimSize(int id) {
return pten::backends::gpu::GetGpuMaxGridDimSize(id);
}
std::vector<int> GetSelectedDevices() {
return pten::backends::gpu::GetSelectedDevices();
}
const gpuDeviceProp &GetDeviceProperties(int id) {
return pten::backends::gpu::GetDeviceProperties(id);
}
void SetDeviceId(int device_id) { pten::backends::gpu::SetDeviceId(device_id); }
gpuError_t GpuGetLastError() { return pten::backends::gpu::GpuGetLastError(); }
void GpuStreamSync(gpuStream_t stream) {
pten::backends::gpu::GpuStreamSync(stream);
}
void GpuDestroyStream(gpuStream_t stream) {
pten::backends::gpu::GpuDestroyStream(stream);
}
void GpuDeviceSync() { pten::backends::gpu::GpuDeviceSync(); }
void GpuMemcpyAsync(void *dst, const void *src, size_t count,
gpuMemcpyKind kind, gpuStream_t stream) {
pten::backends::gpu::GpuMemcpyAsync(dst, src, count, kind, stream);
}
void GpuMemcpySync(void *dst, const void *src, size_t count,
gpuMemcpyKind kind) {
pten::backends::gpu::GpuMemcpySync(dst, src, count, kind);
}
void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
int src_device, size_t count, gpuStream_t stream) {
pten::backends::gpu::GpuMemcpyPeerAsync(dst, dst_device, src, src_device,
count, stream);
}
void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
int src_device, size_t count) {
pten::backends::gpu::GpuMemcpyPeerSync(dst, dst_device, src, src_device,
count);
}
void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) {
pten::backends::gpu::GpuMemsetAsync(dst, value, count, stream);
}
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include <stddef.h> #include <stddef.h>
#include <array>
#include <string> #include <string>
#include <vector> #include <vector>
...@@ -52,7 +53,7 @@ int GetGPUMaxThreadsPerBlock(int id); ...@@ -52,7 +53,7 @@ int GetGPUMaxThreadsPerBlock(int id);
int GetCurrentDeviceId(); int GetCurrentDeviceId();
//! Get the maximum GridDim size for GPU buddy allocator. //! Get the maximum GridDim size for GPU buddy allocator.
dim3 GetGpuMaxGridDimSize(int); std::array<int, 3> GetGpuMaxGridDimSize(int);
//! Get a list of device ids from environment variable or use all. //! Get a list of device ids from environment variable or use all.
std::vector<int> GetSelectedDevices(); std::vector<int> GetSelectedDevices();
...@@ -110,7 +111,7 @@ void GpuStreamSync(gpuStream_t stream); ...@@ -110,7 +111,7 @@ void GpuStreamSync(gpuStream_t stream);
void GpuDestroyStream(gpuStream_t stream); void GpuDestroyStream(gpuStream_t stream);
// ! Blocks until device has completed all operations. // ! Blocks until device has completed all operations.
void GpuDeviceync(); void GpuDeviceSync();
//! CudaMalloc with recorded info //! CudaMalloc with recorded info
gpuError_t RecordedGpuMalloc(void **ptr, size_t size, int dev_id); gpuError_t RecordedGpuMalloc(void **ptr, size_t size, int dev_id);
......
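For callers of the platform API, the gpu_info changes above mean two things: the grid-dimension query now returns std::array<int, 3>, and the device-sync helper is consistently spelled GpuDeviceSync (the old declaration read GpuDeviceync). A small usage sketch:

auto grid = paddle::platform::GetGpuMaxGridDimSize(0);  // std::array<int, 3>
int max_grid_x = grid[0];
paddle::platform::GpuDeviceSync();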
...@@ -83,8 +83,21 @@ struct NCCLContext { ...@@ -83,8 +83,21 @@ struct NCCLContext {
std::unique_ptr<CUDADeviceContext> ctx_; std::unique_ptr<CUDADeviceContext> ctx_;
ncclComm_t comm_; ncclComm_t comm_;
explicit NCCLContext(int dev_id) explicit NCCLContext(int dev_id) : comm_{nullptr} {
: ctx_(new CUDADeviceContext(CUDAPlace(dev_id))), comm_{nullptr} {} ctx_.reset(new CUDADeviceContext(CUDAPlace(dev_id)));
ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(CUDAPlace(dev_id), ctx_->stream())
.get());
ctx_->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
ctx_->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(CUDAPlace(dev_id))
.get());
ctx_->PartialInitWithAllocator();
}
gpuStream_t stream() const { return ctx_->stream(); } gpuStream_t stream() const { return ctx_->stream(); }
ncclComm_t comm() const { return comm_; } ncclComm_t comm() const { return comm_; }
......
hip_library(rocm_info SRCS rocm_info.cc DEPS gflags glog enforce monitor dynload_cuda)
hip_test(miopen_helper_test SRCS miopen_helper_test.cc DEPS dynload_cuda) hip_test(miopen_helper_test SRCS miopen_helper_test.cc DEPS dynload_cuda)
...@@ -10,8 +10,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -10,8 +10,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include <functional>
#include <memory> #include <memory>
#include <set> #include <set>
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/stream/cuda_stream.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h" #include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
...@@ -149,16 +153,17 @@ inline void EmplaceDeviceContext( ...@@ -149,16 +153,17 @@ inline void EmplaceDeviceContext(
cuda_ctx, cuda_ctx,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"Failed to dynamic_cast dev_ctx into CUDADeviceContext.")); "Failed to dynamic_cast dev_ctx into CUDADeviceContext."));
dev_ctx->SetDeviceAllocator( // Note: A trick method to init context, why GetAllocator interface
memory::allocation::AllocatorFacade::Instance() // needs a stream parameter?
.GetAllocator(p, cuda_ctx->context()->RawStream()) dev_ctx->SetAllocator(memory::allocation::AllocatorFacade::Instance()
.get()); .GetAllocator(p, cuda_ctx->stream())
.get());
cuda_ctx->PartialInitWithAllocator();
#endif #endif
} else { } else {
dev_ctx->SetDeviceAllocator( dev_ctx->SetAllocator(memory::allocation::AllocatorFacade::Instance()
memory::allocation::AllocatorFacade::Instance() .GetAllocator(p)
.GetAllocator(p) .get());
.get());
} }
dev_ctx->SetHostAllocator( dev_ctx->SetHostAllocator(
memory::allocation::AllocatorFacade::Instance() memory::allocation::AllocatorFacade::Instance()
...@@ -251,14 +256,18 @@ DeviceContextPool::DeviceContextPool( ...@@ -251,14 +256,18 @@ DeviceContextPool::DeviceContextPool(
} }
} }
CPUDeviceContext::CPUDeviceContext() : pten::CPUContext() {} CPUDeviceContext::CPUDeviceContext() : pten::CPUContext() {
pten::CPUContext::Init();
}
CPUDeviceContext::CPUDeviceContext(CPUPlace place) : pten::CPUContext() {} CPUDeviceContext::CPUDeviceContext(CPUPlace place) : pten::CPUContext(place) {
pten::CPUContext::Init();
}
#ifdef PADDLE_WITH_IPU #ifdef PADDLE_WITH_IPU
IPUDeviceContext::IPUDeviceContext(IPUPlace place) : place_(place) {} IPUDeviceContext::IPUDeviceContext(IPUPlace place) : place_(place) {}
Place IPUDeviceContext::GetPlace() const { return place_; } const Place& IPUDeviceContext::GetPlace() const { return place_; }
void IPUDeviceContext::Wait() const { void IPUDeviceContext::Wait() const {
/*! \brief Wait for all operations completion in the stream. */ /*! \brief Wait for all operations completion in the stream. */
...@@ -268,11 +277,14 @@ IPUDeviceContext::~IPUDeviceContext() {} ...@@ -268,11 +277,14 @@ IPUDeviceContext::~IPUDeviceContext() {}
#endif #endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
XPUDeviceContext::XPUDeviceContext() : pten::XPUContext() {} XPUDeviceContext::XPUDeviceContext() : pten::XPUContext() {
pten::XPUContext::Init();
}
XPUDeviceContext::~XPUDeviceContext() {} XPUDeviceContext::~XPUDeviceContext() {}
XPUDeviceContext::XPUDeviceContext(XPUPlace place) : pten::XPUContext(place) { XPUDeviceContext::XPUDeviceContext(XPUPlace place) : pten::XPUContext(place) {
pten::XPUContext::Init();
LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: " LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: "
<< static_cast<int>(place.device); << static_cast<int>(place.device);
} }
...@@ -302,7 +314,7 @@ void NPUDeviceContext::Wait() const { ...@@ -302,7 +314,7 @@ void NPUDeviceContext::Wait() const {
aclrtStream NPUDeviceContext::stream() const { return stream_->raw_stream(); } aclrtStream NPUDeviceContext::stream() const { return stream_->raw_stream(); }
Place NPUDeviceContext::GetPlace() const { return place_; } const Place& NPUDeviceContext::GetPlace() const { return place_; }
aclrtContext NPUDeviceContext::context() const { return context_; } aclrtContext NPUDeviceContext::context() const { return context_; }
...@@ -319,7 +331,7 @@ Eigen::DefaultDevice* NPUPinnedDeviceContext::eigen_device() const { ...@@ -319,7 +331,7 @@ Eigen::DefaultDevice* NPUPinnedDeviceContext::eigen_device() const {
return eigen_device_.get(); return eigen_device_.get();
} }
Place NPUPinnedDeviceContext::GetPlace() const { return place_; } const Place& NPUPinnedDeviceContext::GetPlace() const { return place_; }
#endif #endif
...@@ -470,102 +482,28 @@ CUDAContext::~CUDAContext() { ...@@ -470,102 +482,28 @@ CUDAContext::~CUDAContext() {
#endif #endif
} }
CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) { CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
CUDADeviceGuard guard(place_.device); : pten::GPUContext(place) {
compute_capability_ = GetGPUComputeCapability(place_.device); pten::GPUContext::PartialInitWithoutAllocator();
multi_process_ = GetGPUMultiProcessors(place_.device); cuda_stream_.reset(
max_threads_per_mp_ = GetGPUMaxThreadsPerMultiProcessor(place_.device); new stream::CUDAStream(pten::GPUContext::stream(), this->GetPlace()));
max_grid_dim_size_ = GetGpuMaxGridDimSize(place_.device);
max_threads_per_block_ = GetGPUMaxThreadsPerBlock(place_.device);
driver_version_ = GetGPUDriverVersion(place_.device);
runtime_version_ = GetGPURuntimeVersion(place_.device);
LOG_FIRST_N(WARNING, 1) << "Please NOTE: device: "
<< static_cast<int>(place_.device)
<< ", GPU Compute Capability: "
<< compute_capability_ / 10 << "."
<< compute_capability_ % 10
<< ", Driver API Version: " << driver_version_ / 1000
<< "." << (driver_version_ % 100) / 10
<< ", Runtime API Version: "
<< runtime_version_ / 1000 << "."
<< (runtime_version_ % 100) / 10;
#ifdef PADDLE_WITH_HIP
size_t version_major, version_minor, version_patch;
PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenGetVersion(
&version_major, &version_minor, &version_patch));
LOG_FIRST_N(WARNING, 1) << "device: " << static_cast<int>(place_.device)
<< ", MIOpen Version: " << version_major << "."
<< version_minor << "." << version_patch;
#else
size_t cudnn_dso_ver = dynload::cudnnGetVersion();
LOG_FIRST_N(WARNING, 1) << "device: " << static_cast<int>(place_.device)
<< ", cuDNN Version: " << cudnn_dso_ver / 1000 << "."
<< (cudnn_dso_ver % 1000) / 100 << ".";
#endif
{
// Check CUDA/CUDNN version compatiblity
auto local_cuda_version =
(driver_version_ / 1000) * 10 + (driver_version_ % 100) / 10;
#ifdef PADDLE_WITH_HIP
auto compile_cuda_version = (HIP_VERSION / 100) * 10 + (HIP_VERSION % 10);
#else
auto compile_cuda_version =
(CUDA_VERSION / 1000) * 10 + (CUDA_VERSION % 100) / 10;
#endif
if (local_cuda_version < compile_cuda_version) {
LOG_FIRST_N(WARNING, 1)
<< "WARNING: device: " << static_cast<int>(place_.device)
<< ". The installed Paddle is compiled with CUDA "
<< compile_cuda_version / 10 << "." << compile_cuda_version % 10
<< ", but CUDA runtime version in your machine is "
<< local_cuda_version / 10 << "." << local_cuda_version % 10
<< ", which may cause serious incompatible bug. "
<< "Please recompile or reinstall Paddle with compatible CUDA "
"version.";
}
}
default_ctx_.reset(new CUDAContext(place_));
}
CUDADeviceContext::~CUDADeviceContext() {
SetDeviceId(place_.device);
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
if (nccl_comm_) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclCommDestroy(nccl_comm_));
}
#endif
}
Place CUDADeviceContext::GetPlace() const { return place_; }
void CUDADeviceContext::Wait() const { context()->Stream()->Wait(); }
int CUDADeviceContext::GetComputeCapability() const {
return compute_capability_;
}
int CUDADeviceContext::GetMaxPhysicalThreadCount() const {
return multi_process_ * max_threads_per_mp_;
} }
int CUDADeviceContext::GetSMCount() const { return multi_process_; } CUDADeviceContext::~CUDADeviceContext() = default;
int CUDADeviceContext::GetMaxThreadsPerBlock() const {
return max_threads_per_block_;
}
Eigen::GpuDevice* CUDADeviceContext::eigen_device() const { Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
return context()->EigenDevice().get(); if (thread_ctx_.count(this)) {
} return context()->EigenDevice().get();
}
bool CUDADeviceContext::tensor_core_available() const { return pten::GPUContext::eigen_device();
return context()->CublasTensorCoreHandle() != nullptr;
} }
dim3 CUDADeviceContext::GetCUDAMaxGridDimSize() const { void CUDADeviceContext::Wait() const {
return max_grid_dim_size_; if (thread_ctx_.count(this)) {
context()->Stream()->Wait();
return;
}
pten::GPUContext::Wait();
} }
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
...@@ -573,33 +511,96 @@ miopenHandle_t CUDADeviceContext::cudnn_handle() const { ...@@ -573,33 +511,96 @@ miopenHandle_t CUDADeviceContext::cudnn_handle() const {
#else #else
cudnnHandle_t CUDADeviceContext::cudnn_handle() const { cudnnHandle_t CUDADeviceContext::cudnn_handle() const {
#endif #endif
return context()->CudnnHandle(); if (thread_ctx_.count(this)) {
return context()->CudnnHandle();
}
return pten::GPUContext::cudnn_handle();
} }
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
rocblas_handle CUDADeviceContext::cublas_handle() const { rocblas_handle CUDADeviceContext::cublas_handle() const {
return context()->CublasHandle()->GetCublasHandle(); if (thread_ctx_.count(this)) {
return context()->CublasHandle()->GetCublasHandle();
}
return pten::GPUContext::cublas_handle();
} }
#else #else
cublasHandle_t CUDADeviceContext::cublas_handle() const { cublasHandle_t CUDADeviceContext::cublas_handle() const {
return context()->CublasHandle()->GetCublasHandle(); if (thread_ctx_.count(this)) {
return context()->CublasHandle()->GetCublasHandle();
}
return pten::GPUContext::cublas_handle();
} }
cusparseHandle_t CUDADeviceContext::cusparse_handle() const { cusparseHandle_t CUDADeviceContext::cusparse_handle() const {
return context()->CusparseHandle()->GetCusparseHandle(); if (thread_ctx_.count(this)) {
return context()->CusparseHandle()->GetCusparseHandle();
}
return pten::GPUContext::cusparse_handle();
}
cusolverDnHandle_t CUDADeviceContext::cusolver_dn_handle() const {
if (thread_ctx_.count(this)) {
return context()->CusolverDnHandle();
}
return pten::GPUContext::cusolver_dn_handle();
} }
#endif #endif
void CUDADeviceContext::RecordEvent(
gpuEvent_t ev, const std::function<void()>& callback) const {
if (thread_ctx_.count(this)) {
context()->Stream()->RecordEvent(ev, callback);
return;
}
pten::GPUContext::RecordEvent(ev, callback);
}
void CUDADeviceContext::AddStreamCallback(
const std::function<void()>& callback) const {
if (thread_ctx_.count(this)) {
context()->Stream()->AddCallback(callback);
return;
}
pten::GPUContext::AddStreamCallback(callback);
}
void CUDADeviceContext::WaitStreamCallback() const {
if (thread_ctx_.count(this)) {
context()->Stream()->WaitCallback();
return;
}
pten::GPUContext::WaitStreamCallback();
}
CudnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const { CudnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const {
return CudnnWorkspaceHandle(*this, &cudnn_handle_mtx_); return CudnnWorkspaceHandle(*this, &cudnn_handle_mtx_);
} }
#ifndef PADDLE_WITH_HIP gpuStream_t CUDADeviceContext::stream() const {
cusolverDnHandle_t CUDADeviceContext::cusolver_dn_handle() const { if (thread_ctx_.count(this)) {
return context()->CusolverDnHandle(); return context()->RawStream();
}
return pten::GPUContext::stream();
} }
#endif
gpuStream_t CUDADeviceContext::stream() const { return context()->RawStream(); } std::shared_ptr<CUDAContext> CUDADeviceContext::context() const {
if (!thread_ctx_.count(this)) {
PADDLE_THROW(platform::errors::PermissionDenied(
"CUDADeviceContext call context() failed, make sure in the "
"thread_local semantic."));
}
return thread_ctx_.at(this);
}
stream::CUDAStream* CUDADeviceContext::GetCudaStream() const {
return cuda_stream_.get();
}
stream::CUDAStream* CUDADeviceContext::SetCudaStream(
stream::CUDAStream* new_stream_ptr) {
auto* old_stream_ptr = cuda_stream_.release();
cuda_stream_.reset(new_stream_ptr);
return old_stream_ptr;
}
CUDAPinnedDeviceContext::CUDAPinnedDeviceContext() { CUDAPinnedDeviceContext::CUDAPinnedDeviceContext() {
eigen_device_.reset(new Eigen::DefaultDevice()); eigen_device_.reset(new Eigen::DefaultDevice());
...@@ -614,7 +615,7 @@ Eigen::DefaultDevice* CUDAPinnedDeviceContext::eigen_device() const { ...@@ -614,7 +615,7 @@ Eigen::DefaultDevice* CUDAPinnedDeviceContext::eigen_device() const {
return eigen_device_.get(); return eigen_device_.get();
} }
Place CUDAPinnedDeviceContext::GetPlace() const { return place_; } const Place& CUDAPinnedDeviceContext::GetPlace() const { return place_; }
#endif #endif
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
......
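All of the CUDADeviceContext methods rewritten above follow one dispatch rule: if a thread-local CUDAContext has been registered for this context (thread_ctx_.count(this)), the call is served by that CUDAContext; otherwise it falls through to the pten::GPUContext base the class now inherits from. Shape of the pattern, with deliberately hypothetical handle names:

// some_handle_t / some_handle() stand in for cudnn/cublas/cusparse etc.
some_handle_t CUDADeviceContext::some_handle() const {
  if (thread_ctx_.count(this)) {
    return context()->SomeHandle();        // per-thread CUDAContext path
  }
  return pten::GPUContext::some_handle();  // shared pten-backed path
}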
...@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and ...@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <functional>
#include <future> // NOLINT #include <future> // NOLINT
#include <memory> #include <memory>
#include <mutex> // NOLINT #include <mutex> // NOLINT
...@@ -18,7 +19,9 @@ limitations under the License. */ ...@@ -18,7 +19,9 @@ limitations under the License. */
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/platform/device/gpu/gpu_types.h"
#include "paddle/pten/backends/cpu/cpu_context.h" #include "paddle/pten/backends/cpu/cpu_context.h"
#include "paddle/pten/backends/gpu/gpu_decls.h"
#include "paddle/pten/core/device_context.h" #include "paddle/pten/core/device_context.h"
#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/memory/malloc.h"
...@@ -28,6 +31,7 @@ limitations under the License. */ ...@@ -28,6 +31,7 @@ limitations under the License. */
#include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/dynload/cudnn.h"
#include "paddle/fluid/platform/dynload/cusolver.h" #include "paddle/fluid/platform/dynload/cusolver.h"
#include "paddle/fluid/platform/dynload/cusparse.h" #include "paddle/fluid/platform/dynload/cusparse.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/dynload/nccl.h" #include "paddle/fluid/platform/dynload/nccl.h"
#endif #endif
...@@ -38,6 +42,7 @@ limitations under the License. */ ...@@ -38,6 +42,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_helper.h" // NOLINT #include "paddle/fluid/platform/device/gpu/gpu_helper.h" // NOLINT
#include "paddle/fluid/platform/dynload/miopen.h" #include "paddle/fluid/platform/dynload/miopen.h"
#include "paddle/fluid/platform/dynload/rocblas.h" #include "paddle/fluid/platform/dynload/rocblas.h"
#include "paddle/pten/backends/gpu/gpu_context.h" // NOLINT
#if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) #if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/platform/dynload/rccl.h" #include "paddle/fluid/platform/dynload/rccl.h"
#endif #endif
...@@ -145,7 +150,7 @@ class IPUDeviceContext : public DeviceContext { ...@@ -145,7 +150,7 @@ class IPUDeviceContext : public DeviceContext {
explicit IPUDeviceContext(IPUPlace place); explicit IPUDeviceContext(IPUPlace place);
virtual ~IPUDeviceContext(); virtual ~IPUDeviceContext();
Eigen::DefaultDevice* eigen_device() const { return nullptr; } Eigen::DefaultDevice* eigen_device() const { return nullptr; }
Place GetPlace() const override; const Place& GetPlace() const override;
/*! \brief Wait for all operations completion in the stream. */ /*! \brief Wait for all operations completion in the stream. */
void Wait() const override; void Wait() const override;
...@@ -187,7 +192,7 @@ class NPUDeviceContext : public DeviceContext { ...@@ -187,7 +192,7 @@ class NPUDeviceContext : public DeviceContext {
explicit NPUDeviceContext(NPUPlace place); explicit NPUDeviceContext(NPUPlace place);
virtual ~NPUDeviceContext(); virtual ~NPUDeviceContext();
Eigen::DefaultDevice* eigen_device() const { return nullptr; } Eigen::DefaultDevice* eigen_device() const { return nullptr; }
Place GetPlace() const override; const Place& GetPlace() const override;
aclrtContext context() const; aclrtContext context() const;
/*! \brief Wait for all operations completion in the stream. */ /*! \brief Wait for all operations completion in the stream. */
...@@ -247,7 +252,7 @@ class NPUPinnedDeviceContext : public DeviceContext { ...@@ -247,7 +252,7 @@ class NPUPinnedDeviceContext : public DeviceContext {
NPUPinnedDeviceContext(); NPUPinnedDeviceContext();
explicit NPUPinnedDeviceContext(NPUPinnedPlace place); explicit NPUPinnedDeviceContext(NPUPinnedPlace place);
Place GetPlace() const override; const Place& GetPlace() const override;
Eigen::DefaultDevice* eigen_device() const; Eigen::DefaultDevice* eigen_device() const;
...@@ -326,20 +331,20 @@ class CUDAContext { ...@@ -326,20 +331,20 @@ class CUDAContext {
#endif #endif
/*! \brief Call cublas function safely. */ /*! \brief Call cublas function safely. */
template <typename Callback> inline void CublasCall(
inline void CublasCall(Callback&& callback) const { const std::function<void(blasHandle_t)>& callback) const {
if (cublas_tf32_tensor_core_handle_) { if (cublas_tf32_tensor_core_handle_) {
cublas_tf32_tensor_core_handle_->Call(std::forward<Callback>(callback)); cublas_tf32_tensor_core_handle_->Call(callback);
} else { } else {
cublas_handle_->Call(std::forward<Callback>(callback)); cublas_handle_->Call(callback);
} }
} }
#ifndef PADDLE_WITH_HIP #ifndef PADDLE_WITH_HIP
/*! \brief Call cusparse function safely. */ /*! \brief Call cusparse function safely. */
template <typename Callback> inline void CusparseCall(
inline void CusparseCall(Callback&& callback) const { const std::function<void(pten::sparseHandle_t)>& callback) const {
cusparse_handle_->Call(std::forward<Callback>(callback)); cusparse_handle_->Call(callback);
} }
#endif #endif
...@@ -348,12 +353,12 @@ class CUDAContext { ...@@ -348,12 +353,12 @@ class CUDAContext {
/*! \brief Call cublas function with Tensor Core safely. If /*! \brief Call cublas function with Tensor Core safely. If
Tensor Core is not available, use DEFAULT_MATH instead. */ Tensor Core is not available, use DEFAULT_MATH instead. */
template <typename Callback> inline void TensorCoreCublasCallIfAvailable(
inline void TensorCoreCublasCallIfAvailable(Callback&& callback) const { const std::function<void(blasHandle_t)>& callback) const {
if (cublas_tensor_core_handle_) { if (cublas_tensor_core_handle_) {
cublas_tensor_core_handle_->Call(std::forward<Callback>(callback)); cublas_tensor_core_handle_->Call(callback);
} else { } else {
cublas_handle_->Call(std::forward<Callback>(callback)); cublas_handle_->Call(callback);
} }
} }
...@@ -491,7 +496,7 @@ class CUDAContext { ...@@ -491,7 +496,7 @@ class CUDAContext {
DISABLE_COPY_AND_ASSIGN(CUDAContext); DISABLE_COPY_AND_ASSIGN(CUDAContext);
}; };
class CUDADeviceContext : public DeviceContext { class CUDADeviceContext : public pten::GPUContext {
public: public:
explicit CUDADeviceContext(CUDAPlace place); explicit CUDADeviceContext(CUDAPlace place);
virtual ~CUDADeviceContext(); virtual ~CUDADeviceContext();
...@@ -499,49 +504,40 @@ class CUDADeviceContext : public DeviceContext { ...@@ -499,49 +504,40 @@ class CUDADeviceContext : public DeviceContext {
/*! \brief Wait for all operations completion in the stream. */ /*! \brief Wait for all operations completion in the stream. */
void Wait() const override; void Wait() const override;
/*! \brief Return place in the device context. */
Place GetPlace() const override;
/*! \brief Return compute capability in the device context. */
int GetComputeCapability() const;
/*! \brief Return the max physical thread count in the device context */
int GetMaxPhysicalThreadCount() const;
/*! \brief Return the SM count in the device context */
int GetSMCount() const;
/*! \brief Return the Max thread num of block in the device context */
int GetMaxThreadsPerBlock() const;
/*! \brief Return the max grid dim size in the device context */
dim3 GetCUDAMaxGridDimSize() const;
/*! \brief Return eigen device in the device context. */ /*! \brief Return eigen device in the device context. */
Eigen::GpuDevice* eigen_device() const; Eigen::GpuDevice* eigen_device() const;
/*! \brief Call cublas function safely. */ /*! \brief Call cublas function safely. */
template <typename Callback> inline void CublasCall(
inline void CublasCall(Callback&& callback) const { const std::function<void(blasHandle_t)>& callback) const {
if (!thread_ctx_.count(this)) {
pten::GPUContext::CublasCall(callback);
return;
}
return context()->CublasCall(callback); return context()->CublasCall(callback);
} }
#ifndef PADDLE_WITH_HIP #ifndef PADDLE_WITH_HIP
/*! \brief Call cusparse function safely. */ /*! \brief Call cusparse function safely. */
template <typename Callback> inline void CusparseCall(
inline void CusparseCall(Callback&& callback) const { const std::function<void(pten::sparseHandle_t)>& callback) const {
return context()->CusparseCall(callback); if (!thread_ctx_.count(this)) {
pten::GPUContext::CusparseCall(callback);
return;
}
context()->CusparseCall(callback);
} }
#endif #endif
/*! \brief Check whether tensor core is supported */
bool tensor_core_available() const;
/*! \brief Call cublas function with Tensor Core safely. If /*! \brief Call cublas function with Tensor Core safely. If
Tensor Core is not available, use DEFAULT_MATH instead. */ Tensor Core is not available, use DEFAULT_MATH instead. */
template <typename Callback> inline void TensorCoreCublasCallIfAvailable(
inline void TensorCoreCublasCallIfAvailable(Callback&& callback) const { const std::function<void(blasHandle_t)>& callback) const {
return context()->TensorCoreCublasCallIfAvailable(callback); if (!thread_ctx_.count(this)) {
pten::GPUContext::TensorCoreCublasCallIfAvailable(callback);
return;
}
context()->TensorCoreCublasCallIfAvailable(callback);
} }
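  // Illustrative usage sketch (hypothetical caller, not part of this diff):
  // the callback receives the managed cuBLAS handle, so callers never create
  // or destroy handles themselves.
  //
  //   dev_ctx.CublasCall([](blasHandle_t handle) {
  //     // issue any cuBLAS routine with `handle` here
  //   });
  //
  // With the change above, a CUDADeviceContext without a thread-local
  // CUDAContext falls back to the handles inherited from pten::GPUContext.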
/*! \brief Return cudnn handle in the device context. */ /*! \brief Return cudnn handle in the device context. */
...@@ -559,6 +555,10 @@ class CUDADeviceContext : public DeviceContext { ...@@ -559,6 +555,10 @@ class CUDADeviceContext : public DeviceContext {
cusparseHandle_t cusparse_handle() const; cusparseHandle_t cusparse_handle() const;
#endif #endif
#ifndef PADDLE_WITH_HIP
cusolverDnHandle_t cusolver_dn_handle() const;
#endif
/*! \brief Return a cudnn workspace handle to call multiple cudnn /*! \brief Return a cudnn workspace handle to call multiple cudnn
* functions without interrupting by other threads. * functions without interrupting by other threads.
* Once the first cudnn function is called by the handle, a lock * Once the first cudnn function is called by the handle, a lock
...@@ -568,60 +568,33 @@ class CUDADeviceContext : public DeviceContext { ...@@ -568,60 +568,33 @@ class CUDADeviceContext : public DeviceContext {
* sequential cudnn function calls. */ * sequential cudnn function calls. */
CudnnWorkspaceHandle cudnn_workspace_handle() const; CudnnWorkspaceHandle cudnn_workspace_handle() const;
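  // Illustrative usage sketch (hypothetical caller; RunFunc is assumed to be
  // the existing helper on CudnnWorkspaceHandle): the workspace lock is taken
  // on the first cuDNN call and released when `workspace` goes out of scope.
  //
  //   auto workspace = dev_ctx.cudnn_workspace_handle();
  //   workspace.RunFunc(
  //       [&](void* ws_ptr) { /* several cuDNN calls sharing ws_ptr */ },
  //       required_workspace_bytes);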
#ifndef PADDLE_WITH_HIP
cusolverDnHandle_t cusolver_dn_handle() const;
#endif
/*! \brief Return cuda stream in the device context. */ /*! \brief Return cuda stream in the device context. */
gpuStream_t stream() const; gpuStream_t stream() const;
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) void RecordEvent(gpuEvent_t ev, const std::function<void()>& callback) const;
/*! \brief Return nccl communicators. */
ncclComm_t nccl_comm() const { return nccl_comm_; }
/*! \brief Set nccl communicators. */
void set_nccl_comm(ncclComm_t comm) { nccl_comm_ = comm; }
#endif
template <typename Callback>
void RecordEvent(gpuEvent_t ev, Callback callback) const {
return context()->Stream()->RecordEvent(ev, callback);
}
template <typename Callback>
void AddStreamCallback(Callback&& callback) const {
return context()->Stream()->AddCallback(callback);
}
void WaitStreamCallback() const { void AddStreamCallback(const std::function<void()>& callback) const;
return context()->Stream()->WaitCallback();
}
void ResetDefaultContext(const stream::Priority& priority) { void WaitStreamCallback() const;
default_ctx_.reset(new CUDAContext(place_, priority));
}
void ResetThreadContext(const stream::Priority& priority) { void ResetThreadContext(const stream::Priority& priority) {
std::lock_guard<std::mutex> guard(ctx_mtx_); std::lock_guard<std::mutex> guard(ctx_mtx_);
thread_ctx_[this].reset(new CUDAContext(place_, priority)); thread_ctx_[this].reset(new CUDAContext(this->GetPlace(), priority));
} }
std::shared_ptr<CUDAContext> context() const { std::shared_ptr<CUDAContext> context() const;
if (!thread_ctx_.count(this)) {
return default_ctx_;
}
return thread_ctx_.at(this);
}
// Note: Can only be used under thread_local semantics. // Note: Can only be used under thread_local semantics.
void SetThreadLocalStream(const gpuStream_t stream) { void SetThreadLocalStream(const gpuStream_t stream) {
thread_ctx_.at(this)->SetStream(stream); thread_ctx_.at(this)->SetStream(stream);
} }
 private: // NOTE: Kept only for backward compatibility; please remove once there is a
  CUDAPlace place_; // cleaner way.

std::shared_ptr<CUDAContext> default_ctx_; stream::CUDAStream* GetCudaStream() const;
stream::CUDAStream* SetCudaStream(stream::CUDAStream*);
private:
// The thread_local static variable will be released before the // The thread_local static variable will be released before the
// global static variable, so avoid using it in dtor. // global static variable, so avoid using it in dtor.
static thread_local std::unordered_map<const CUDADeviceContext*, static thread_local std::unordered_map<const CUDADeviceContext*,
...@@ -631,22 +604,9 @@ class CUDADeviceContext : public DeviceContext { ...@@ -631,22 +604,9 @@ class CUDADeviceContext : public DeviceContext {
mutable std::mutex cudnn_handle_mtx_; mutable std::mutex cudnn_handle_mtx_;
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) // NOTE: Kept only for backward compatibility; please remove once there is a
  // NCCL communicator (single process version) for NCCL collective operations. // cleaner way.
// NCCL collective operations provides fast collectives over multiple GPUs std::unique_ptr<stream::CUDAStream> cuda_stream_;
// both within and across nodes.
// But, this collectives is used for collectives over multiple GPUs within
// nodes.
ncclComm_t nccl_comm_{nullptr};
#endif
int compute_capability_;
int runtime_version_;
int driver_version_;
int multi_process_;
int max_threads_per_mp_;
int max_threads_per_block_;
dim3 max_grid_dim_size_;
DISABLE_COPY_AND_ASSIGN(CUDADeviceContext); DISABLE_COPY_AND_ASSIGN(CUDADeviceContext);
}; };
...@@ -711,7 +671,7 @@ class CUDAPinnedDeviceContext : public DeviceContext { ...@@ -711,7 +671,7 @@ class CUDAPinnedDeviceContext : public DeviceContext {
CUDAPinnedDeviceContext(); CUDAPinnedDeviceContext();
explicit CUDAPinnedDeviceContext(CUDAPinnedPlace place); explicit CUDAPinnedDeviceContext(CUDAPinnedPlace place);
Place GetPlace() const override; const Place& GetPlace() const override;
Eigen::DefaultDevice* eigen_device() const; Eigen::DefaultDevice* eigen_device() const;
......
...@@ -17,6 +17,7 @@ limitations under the License. */ ...@@ -17,6 +17,7 @@ limitations under the License. */
#include "glog/logging.h" #include "glog/logging.h"
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
TEST(Device, Init) { TEST(Device, Init) {
using paddle::platform::DeviceContext; using paddle::platform::DeviceContext;
...@@ -26,6 +27,20 @@ TEST(Device, Init) { ...@@ -26,6 +27,20 @@ TEST(Device, Init) {
int count = paddle::platform::GetGPUDeviceCount(); int count = paddle::platform::GetGPUDeviceCount();
for (int i = 0; i < count; i++) { for (int i = 0; i < count; i++) {
CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i)); CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i));
device_context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(CUDAPlace(i), device_context->stream())
.get());
device_context->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
device_context->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(CUDAPlace(i))
.get());
device_context->PartialInitWithAllocator();
Eigen::GpuDevice* gpu_device = device_context->eigen_device(); Eigen::GpuDevice* gpu_device = device_context->eigen_device();
ASSERT_NE(nullptr, gpu_device); ASSERT_NE(nullptr, gpu_device);
delete device_context; delete device_context;
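    // The SetAllocator/SetHostAllocator/SetZeroAllocator/
    // PartialInitWithAllocator sequence above is repeated for every context
    // these tests create; a small local helper (hypothetical, not part of
    // this patch) could wire it up in one place:
    //
    //   void InitCudaContext(paddle::platform::CUDADeviceContext* ctx,
    //                        const paddle::platform::CUDAPlace& place) {
    //     auto& facade =
    //         paddle::memory::allocation::AllocatorFacade::Instance();
    //     ctx->SetAllocator(facade.GetAllocator(place, ctx->stream()).get());
    //     ctx->SetHostAllocator(
    //         facade.GetAllocator(paddle::platform::CPUPlace()).get());
    //     ctx->SetZeroAllocator(facade.GetZeroAllocator(place).get());
    //     ctx->PartialInitWithAllocator();
    //   }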
...@@ -39,6 +54,19 @@ TEST(Device, CUDADeviceContext) { ...@@ -39,6 +54,19 @@ TEST(Device, CUDADeviceContext) {
int count = paddle::platform::GetGPUDeviceCount(); int count = paddle::platform::GetGPUDeviceCount();
for (int i = 0; i < count; i++) { for (int i = 0; i < count; i++) {
CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i)); CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i));
device_context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(CUDAPlace(i), device_context->stream())
.get());
device_context->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
device_context->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(CUDAPlace(i))
.get());
device_context->PartialInitWithAllocator();
Eigen::GpuDevice* gpu_device = device_context->eigen_device(); Eigen::GpuDevice* gpu_device = device_context->eigen_device();
ASSERT_NE(nullptr, gpu_device); ASSERT_NE(nullptr, gpu_device);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
......
...@@ -53,7 +53,7 @@ void DeviceEventRecordCUDA(DeviceEvent* event, const DeviceContext* context) { ...@@ -53,7 +53,7 @@ void DeviceEventRecordCUDA(DeviceEvent* event, const DeviceContext* context) {
platform::errors::PreconditionNotMet( platform::errors::PreconditionNotMet(
"Failed to dynamic_cast context into CUDADeviceContext.")); "Failed to dynamic_cast context into CUDADeviceContext."));
wrapper->inner_event_.Record(*cuda_dev_ctx->context()->Stream()); wrapper->inner_event_.Record(cuda_dev_ctx->stream());
} }
bool DeviceEventQueryCUDA(const DeviceEvent* event) { bool DeviceEventQueryCUDA(const DeviceEvent* event) {
...@@ -82,8 +82,7 @@ void DeviceEventCUDAWaitCUDA(const DeviceEvent* event, ...@@ -82,8 +82,7 @@ void DeviceEventCUDAWaitCUDA(const DeviceEvent* event,
platform::errors::PreconditionNotMet( platform::errors::PreconditionNotMet(
"Failed to dynamic_cast context into CUDADeviceContext.")); "Failed to dynamic_cast context into CUDADeviceContext."));
// calling cudaStreamWaitEvent(stream, event, 0) // calling cudaStreamWaitEvent(stream, event, 0)
cuda_dev_ctx->context()->Stream()->WaitEvent( cuda_dev_ctx->WaitEvent(wrapper->inner_event_.GetRawCudaEvent());
wrapper->inner_event_.GetRawCudaEvent());
} }
void DeviceEventCPUWaitCUDA(const DeviceEvent* event, void DeviceEventCPUWaitCUDA(const DeviceEvent* event,
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include "paddle/fluid/platform/device_event.h" #include "paddle/fluid/platform/device_event.h"
#include "glog/logging.h" #include "glog/logging.h"
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/platform/place.h"
using ::paddle::platform::kCUDA; using ::paddle::platform::kCUDA;
using ::paddle::platform::kCPU; using ::paddle::platform::kCPU;
...@@ -38,9 +39,11 @@ TEST(DeviceEvent, CUDA) { ...@@ -38,9 +39,11 @@ TEST(DeviceEvent, CUDA) {
// case 1. test for event_creator // case 1. test for event_creator
DeviceEvent event(place); DeviceEvent event(place);
ASSERT_NE(event.GetEvent().get(), nullptr); ASSERT_NE(event.GetEvent().get(), nullptr);
bool status = event.Query();
ASSERT_EQ(status, true);
// case 2. test for event_recorder // case 2. test for event_recorder
event.Record(context); event.Record(context);
bool status = event.Query(); status = event.Query();
ASSERT_EQ(status, false); ASSERT_EQ(status, false);
// case 3. test for event_finisher // case 3. test for event_finisher
event.Finish(); event.Finish();
......
...@@ -539,7 +539,7 @@ inline void retry_sleep(unsigned milliseconds) { ...@@ -539,7 +539,7 @@ inline void retry_sleep(unsigned milliseconds) {
::paddle::platform::details::ExternalApiType< \ ::paddle::platform::details::ExternalApiType< \
__CUDA_STATUS_TYPE__>::kSuccess; \ __CUDA_STATUS_TYPE__>::kSuccess; \
while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \ while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \
retry_sleep(FLAGS_gpu_allocator_retry_time); \ paddle::platform::retry_sleep(FLAGS_gpu_allocator_retry_time); \
__cond__ = (COND); \ __cond__ = (COND); \
++retry_count; \ ++retry_count; \
} \ } \
...@@ -727,7 +727,7 @@ inline void retry_sleep(unsigned millisecond) { ...@@ -727,7 +727,7 @@ inline void retry_sleep(unsigned millisecond) {
::paddle::platform::details::ExternalApiType< \ ::paddle::platform::details::ExternalApiType< \
__CUDA_STATUS_TYPE__>::kSuccess; \ __CUDA_STATUS_TYPE__>::kSuccess; \
while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \ while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \
retry_sleep(FLAGS_gpu_allocator_retry_time); \ ::paddle::platform::retry_sleep(FLAGS_gpu_allocator_retry_time); \
__cond__ = (COND); \ __cond__ = (COND); \
++retry_count; \ ++retry_count; \
} \ } \
......
...@@ -152,11 +152,11 @@ class CudaEvent { ...@@ -152,11 +152,11 @@ class CudaEvent {
#endif #endif
} }
void Record(const paddle::platform::stream::CUDAStream &stream) { void Record(gpuStream_t stream) {
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, stream.raw_stream())); PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, stream));
#else #else
PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, stream.raw_stream())); PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, stream));
#endif #endif
} }
......
...@@ -328,6 +328,10 @@ TEST(float16, lod_tensor_on_gpu) { ...@@ -328,6 +328,10 @@ TEST(float16, lod_tensor_on_gpu) {
// CPU LoDTensor to GPU LoDTensor // CPU LoDTensor to GPU LoDTensor
CUDAPlace gpu_place(0); CUDAPlace gpu_place(0);
CUDADeviceContext gpu_ctx(gpu_place); CUDADeviceContext gpu_ctx(gpu_place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, gpu_ctx.stream())
.get());
gpu_ctx.PartialInitWithAllocator();
framework::TensorCopy(src_tensor, gpu_place, gpu_ctx, &gpu_tensor); framework::TensorCopy(src_tensor, gpu_place, gpu_ctx, &gpu_tensor);
// GPU LoDTensor to CPU LoDTensor // GPU LoDTensor to CPU LoDTensor
......
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/pten/backends/cpu/cpu_context.h" #include "paddle/pten/backends/cpu/cpu_context.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -72,6 +73,7 @@ __global__ static void ForRangeElemwiseOp(Function func, size_t limit) { ...@@ -72,6 +73,7 @@ __global__ static void ForRangeElemwiseOp(Function func, size_t limit) {
} }
} }
// NOTE: Once the pten kernels are migrated, this specialization should be deleted.
template <> template <>
struct ForRange<CUDADeviceContext> { struct ForRange<CUDADeviceContext> {
ForRange(const CUDADeviceContext& dev_ctx, size_t limit) ForRange(const CUDADeviceContext& dev_ctx, size_t limit)
...@@ -106,6 +108,40 @@ struct ForRange<CUDADeviceContext> { ...@@ -106,6 +108,40 @@ struct ForRange<CUDADeviceContext> {
size_t limit_; size_t limit_;
}; };
template <>
struct ForRange<pten::GPUContext> {
ForRange(const pten::GPUContext& dev_ctx, size_t limit)
: dev_ctx_(dev_ctx), limit_(static_cast<size_t>(limit)) {}
template <typename Function>
inline void operator()(Function func) const {
#ifdef __HIPCC__
// HIP will throw core dump when threads > 256
constexpr int num_threads = 256;
#elif WITH_NV_JETSON
// JETSON_NANO will throw core dump when threads > 128
int num_thread = 256;
platform::ChangeThreadNum(dev_ctx_, &num_thread, 128);
const int num_threads = num_thread;
#else
constexpr int num_threads = 1024;
#endif
size_t block_size = limit_ <= num_threads ? limit_ : num_threads;
size_t grid_size = (limit_ + num_threads - 1) / num_threads;
if (grid_size == 1) {
ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>(
func);
} else {
ForRangeElemwiseOp<<<grid_size, block_size, 0, dev_ctx_.stream()>>>(
func, limit_);
}
}
const pten::GPUContext& dev_ctx_;
size_t limit_;
};
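// Illustrative usage sketch (hypothetical functor, not part of this diff):
// ForRange invokes func(i) for every i in [0, limit) on the context's stream,
// using the block/grid sizes computed above. The functor must be callable on
// the device.
//
//   struct AddOne {
//     float* data;
//     __device__ void operator()(size_t i) const { data[i] += 1.0f; }
//   };
//
//   paddle::platform::ForRange<pten::GPUContext> for_range(dev_ctx, n);
//   for_range(AddOne{gpu_ptr});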
#endif #endif
} // namespace platform } // namespace platform
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/fluid/platform/stream/cuda_stream.h" #include "paddle/fluid/platform/stream/cuda_stream.h"
#include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/device/gpu/gpu_types.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
...@@ -116,11 +117,8 @@ CUDAStream* get_current_stream(int deviceId) { ...@@ -116,11 +117,8 @@ CUDAStream* get_current_stream(int deviceId) {
platform::Place device = CUDAPlace(deviceId); platform::Place device = CUDAPlace(deviceId);
auto stream = static_cast<platform::CUDADeviceContext*>(pool.Get(device)) return static_cast<platform::CUDADeviceContext*>(pool.Get(device))
->context() ->GetCudaStream();
->Stream()
.get();
return stream;
#else #else
PADDLE_THROW(platform::errors::Unavailable( PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with CUDA. Cannot visit cuda current stream.")); "Paddle is not compiled with CUDA. Cannot visit cuda current stream."));
...@@ -133,12 +131,12 @@ CUDAStream* set_current_stream(CUDAStream* stream) { ...@@ -133,12 +131,12 @@ CUDAStream* set_current_stream(CUDAStream* stream) {
auto& device = stream->GetPlace(); auto& device = stream->GetPlace();
auto& pool = platform::DeviceContextPool::Instance(); auto& pool = platform::DeviceContextPool::Instance();
return static_cast<platform::CUDADeviceContext*>(pool.Get(device)) return static_cast<platform::CUDADeviceContext*>(pool.Get(device))
->context() ->SetCudaStream(stream);
->SetStream(stream);
#else #else
PADDLE_THROW(platform::errors::Unavailable( PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with CUDA. Cannot visit cuda current stream.")); "Paddle is not compiled with CUDA. Cannot visit cuda current"
return nullptr; "stream."));
return CUDAStream(nullptr);
#endif #endif
} }
} // namespace stream } // namespace stream
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once #pragma once
#include <cstdint> #include <cstdint>
#include <functional>
#include <memory> #include <memory>
#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h"
...@@ -51,24 +52,28 @@ class CUDAStream final { ...@@ -51,24 +52,28 @@ class CUDAStream final {
const StreamFlag& flag = StreamFlag::kDefaultFlag) { const StreamFlag& flag = StreamFlag::kDefaultFlag) {
Init(place, priority, flag); Init(place, priority, flag);
} }
explicit CUDAStream(gpuStream_t stream, const Place& place)
: place_(place), stream_(stream) {
owned_stream_ = false;
callback_manager_.reset(new StreamCallbackManager<gpuStream_t>(stream_));
}
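  // Illustrative usage sketch (hypothetical): wrap a stream created
  // elsewhere. The wrapper is non-owning, so the caller remains responsible
  // for destroying raw_stream.
  //
  //   gpuStream_t raw_stream = /* obtained from an external library */;
  //   CUDAStream wrapper(raw_stream, CUDAPlace(0));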
virtual ~CUDAStream() { Destroy(); } virtual ~CUDAStream() { Destroy(); }
bool Init(const Place& place, const Priority& priority = Priority::kNormal, bool Init(const Place& place, const Priority& priority = Priority::kNormal,
const StreamFlag& flag = StreamFlag::kDefaultFlag); const StreamFlag& flag = StreamFlag::kDefaultFlag);
template <typename Callback> void AddCallback(std::function<void()> callback) const {
void AddCallback(Callback&& callback) const {
callback_manager_->AddCallback(callback); callback_manager_->AddCallback(callback);
} }
template <typename Callback>
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
void RecordEvent(hipEvent_t ev, Callback callback) const { void RecordEvent(hipEvent_t ev, const std::function<void()>& callback) const {
callback(); callback();
PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(ev, stream_)); PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(ev, stream_));
} }
#else #else
void RecordEvent(cudaEvent_t ev, Callback callback) const { void RecordEvent(cudaEvent_t ev,
const std::function<void()>& callback) const {
callback(); callback();
PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(ev, stream_)); PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(ev, stream_));
} }
...@@ -149,6 +154,7 @@ class CUDAStream final { ...@@ -149,6 +154,7 @@ class CUDAStream final {
}; };
CUDAStream* get_current_stream(int deviceId); CUDAStream* get_current_stream(int deviceId);
// NOTE: There is a problem with this interface and it needs to be fixed.
CUDAStream* set_current_stream(CUDAStream* stream); CUDAStream* set_current_stream(CUDAStream* stream);
} // namespace stream } // namespace stream
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/platform/transform.h" #include "paddle/fluid/platform/transform.h"
...@@ -57,6 +58,10 @@ TEST(Transform, CPUUnary) { ...@@ -57,6 +58,10 @@ TEST(Transform, CPUUnary) {
TEST(Transform, GPUUnary) { TEST(Transform, GPUUnary) {
CUDAPlace gpu0(0); CUDAPlace gpu0(0);
CUDADeviceContext ctx(gpu0); CUDADeviceContext ctx(gpu0);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu0, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4}; float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4};
auto gpu_allocation = Alloc(gpu0, sizeof(float) * 4); auto gpu_allocation = Alloc(gpu0, sizeof(float) * 4);
float* gpu_buf = static_cast<float*>(gpu_allocation->ptr()); float* gpu_buf = static_cast<float*>(gpu_allocation->ptr());
...@@ -84,6 +89,10 @@ TEST(Transform, GPUBinary) { ...@@ -84,6 +89,10 @@ TEST(Transform, GPUBinary) {
int buf[4] = {1, 2, 3, 4}; int buf[4] = {1, 2, 3, 4};
CUDAPlace gpu0(0); CUDAPlace gpu0(0);
CUDADeviceContext ctx(gpu0); CUDADeviceContext ctx(gpu0);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu0, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
auto gpu_allocation = Alloc(gpu0, sizeof(buf)); auto gpu_allocation = Alloc(gpu0, sizeof(buf));
int* gpu_buf = static_cast<int*>(gpu_allocation->ptr()); int* gpu_buf = static_cast<int*>(gpu_allocation->ptr());
Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream()); Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream());
......
...@@ -34,7 +34,7 @@ void BindCudaStream(py::module *m_ptr) { ...@@ -34,7 +34,7 @@ void BindCudaStream(py::module *m_ptr) {
return paddle::platform::stream::get_current_stream(deviceId); return paddle::platform::stream::get_current_stream(deviceId);
#else #else
PADDLE_THROW(platform::errors::Unavailable( PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with CUDA. Cannot visit cuda current " "Paddle is not compiled with CUDA. Cannot visit cuda current"
"stream.")); "stream."));
#endif #endif
}, },
...@@ -119,7 +119,7 @@ void BindCudaStream(py::module *m_ptr) { ...@@ -119,7 +119,7 @@ void BindCudaStream(py::module *m_ptr) {
[](paddle::platform::stream::CUDAStream &self, [](paddle::platform::stream::CUDAStream &self,
paddle::platform::stream::CUDAStream &stream) { paddle::platform::stream::CUDAStream &stream) {
paddle::platform::CudaEvent event; paddle::platform::CudaEvent event;
event.Record(stream); event.Record(stream.raw_stream());
self.WaitEvent(event.GetRawCudaEvent()); self.WaitEvent(event.GetRawCudaEvent());
}, },
...@@ -179,7 +179,7 @@ void BindCudaStream(py::module *m_ptr) { ...@@ -179,7 +179,7 @@ void BindCudaStream(py::module *m_ptr) {
if (event == nullptr) { if (event == nullptr) {
event = new paddle::platform::CudaEvent(); event = new paddle::platform::CudaEvent();
} }
event->Record(self); event->Record(self.raw_stream());
return event; return event;
}, },
R"DOC( R"DOC(
...@@ -321,7 +321,7 @@ void BindCudaStream(py::module *m_ptr) { ...@@ -321,7 +321,7 @@ void BindCudaStream(py::module *m_ptr) {
if (stream == nullptr) { if (stream == nullptr) {
stream = paddle::platform::stream::get_current_stream(-1); stream = paddle::platform::stream::get_current_stream(-1);
} }
self.Record(*stream); self.Record(stream->raw_stream());
}, },
R"DOC( R"DOC(
Records the event in the given stream. Records the event in the given stream.
......
...@@ -1596,7 +1596,20 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1596,7 +1596,20 @@ All parameter, weight, gradient are variables in Paddle.
.def_static("create", .def_static("create",
[](paddle::platform::CPUPlace& place) [](paddle::platform::CPUPlace& place)
-> paddle::platform::DeviceContext* { -> paddle::platform::DeviceContext* {
return new paddle::platform::CPUDeviceContext(); auto* context = new paddle::platform::CPUDeviceContext();
context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place)
.get());
context->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
context->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place)
.get());
return context;
}) })
.def_static("create", .def_static("create",
[](paddle::platform::XPUPlace& place) [](paddle::platform::XPUPlace& place)
...@@ -1607,7 +1620,20 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1607,7 +1620,20 @@ All parameter, weight, gradient are variables in Paddle.
"Cannot use XPUPlace in CPU/GPU version, " "Cannot use XPUPlace in CPU/GPU version, "
"Please recompile or reinstall Paddle with XPU support.")); "Please recompile or reinstall Paddle with XPU support."));
#else #else
return new paddle::platform::XPUDeviceContext(place); auto* context = new paddle::platform::XPUDeviceContext(place);
context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place)
.get());
context->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
context->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place)
.get());
return context;
#endif #endif
}) })
.def_static("create", .def_static("create",
...@@ -1643,7 +1669,21 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1643,7 +1669,21 @@ All parameter, weight, gradient are variables in Paddle.
"Cannot use CUDAPlace in CPU only version, " "Cannot use CUDAPlace in CPU only version, "
"Please recompile or reinstall Paddle with CUDA support.")); "Please recompile or reinstall Paddle with CUDA support."));
#else #else
return new paddle::platform::CUDADeviceContext(place); auto* context = new paddle::platform::CUDADeviceContext(place);
context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, context->stream())
.get());
context->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
context->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place)
.get());
context->PartialInitWithAllocator();
return context;
#endif #endif
}) })
.def_static("create", .def_static("create",
......
...@@ -2,6 +2,10 @@ add_subdirectory(dynload) ...@@ -2,6 +2,10 @@ add_subdirectory(dynload)
add_subdirectory(cpu) add_subdirectory(cpu)
if(WITH_GPU OR WITH_ROCM)
add_subdirectory(gpu)
endif()
if(WITH_XPU) if(WITH_XPU)
add_subdirectory(xpu) add_subdirectory(xpu)
endif() endif()
...@@ -11,3 +15,7 @@ cc_library(pten_context SRCS all_context.cc DEPS device_context cpu_context) ...@@ -11,3 +15,7 @@ cc_library(pten_context SRCS all_context.cc DEPS device_context cpu_context)
if(WITH_XPU) if(WITH_XPU)
add_dependencies(pten_context xpu_context) add_dependencies(pten_context xpu_context)
endif() endif()
if(WITH_GPU)
add_dependencies(pten_context gpu_context)
endif()
...@@ -15,75 +15,59 @@ ...@@ -15,75 +15,59 @@
#include "paddle/pten/backends/cpu/cpu_context.h" #include "paddle/pten/backends/cpu/cpu_context.h"
#include "paddle/pten/api/ext/exception.h" #include "paddle/pten/api/ext/exception.h"
#include "paddle/pten/common/place.h"
// NOTE: The paddle framework should add WITH_EIGEN option to support compile // NOTE: The paddle framework should add WITH_EIGEN option to support compile
// without eigen. // without eigen.
#include "paddle/pten/core/device_context.h"
#include "unsupported/Eigen/CXX11/Tensor" #include "unsupported/Eigen/CXX11/Tensor"
namespace pten { namespace pten {
struct CPUContext::CPUImpl { struct CPUContext::Impl {
CPUImpl() { device_ = new Eigen::DefaultDevice(); } Impl() : place_(CPUPlace()) {}
// Users need to manage external resources. explicit Impl(const Place& place) : place_(place) {}
explicit CPUImpl(const CPUContextResource& ctx_res) : res_(ctx_res) {
device_ = res_.device;
}
~CPUImpl() { ~Impl() {
if (res_.device == nullptr && device_ != nullptr) { if (owned_) {
delete device_; delete eigen_device_;
device_ = nullptr;
} }
} }
Eigen::DefaultDevice* GetEigenDevice() const { void Init() {
PD_CHECK(device_ != nullptr, "the eigen_device is nullptr."); owned_ = true;
return device_; eigen_device_ = new Eigen::DefaultDevice();
} }
void SetEigenDevice(Eigen::DefaultDevice* device) { Eigen::DefaultDevice* GetEigenDevice() const {
if (device == nullptr) { PD_CHECK(eigen_device_ != nullptr, "the cpu eigen_device is nullptr.");
return; return eigen_device_;
}
res_.device = device;
device_ = device;
} }
Place GetPlace() const { return place_; } bool owned_{false};
Eigen::DefaultDevice* eigen_device_{nullptr};
Eigen::DefaultDevice* device_{nullptr}; Place place_;
CPUContextResource res_;
CPUPlace place_;
}; };
CPUContext::CPUContext() : DeviceContext() { CPUContext::CPUContext()
cpu_impl_ = std::make_unique<CPUImpl>(); : DeviceContext(), impl_(std::make_unique<CPUContext::Impl>()) {}
}
CPUContext::CPUContext(const CPUContext& other) : DeviceContext() {
cpu_impl_ = std::make_unique<CPUImpl>();
cpu_impl_->SetEigenDevice(other.eigen_device());
}
CPUContext::CPUContext(CPUContext&& other) : DeviceContext() { CPUContext::CPUContext(const Place& place)
cpu_impl_ = std::move(other.cpu_impl_); : DeviceContext(), impl_(std::make_unique<CPUContext::Impl>(place)) {}
}
CPUContext::~CPUContext() = default; CPUContext::~CPUContext() = default;
CPUContext::CPUContext(const CPUContextResource& ctx_res) : DeviceContext() { void CPUContext::Init() { impl_->Init(); }
cpu_impl_ = std::make_unique<CPUImpl>(ctx_res);
}
Eigen::DefaultDevice* CPUContext::eigen_device() const { Eigen::DefaultDevice* CPUContext::eigen_device() const {
return cpu_impl_->GetEigenDevice(); return impl_->GetEigenDevice();
} }
const Place& CPUContext::GetPlace() const { return impl_->place_; }
void CPUContext::SetEigenDevice(Eigen::DefaultDevice* device) { void CPUContext::SetEigenDevice(Eigen::DefaultDevice* device) {
cpu_impl_->SetEigenDevice(device); impl_->eigen_device_ = device;
} }
Place CPUContext::GetPlace() const { return cpu_impl_->GetPlace(); }
} // namespace pten } // namespace pten
...@@ -24,37 +24,29 @@ limitations under the License. */ ...@@ -24,37 +24,29 @@ limitations under the License. */
namespace pten { namespace pten {
struct CPUContextResource {
Eigen::DefaultDevice* device{nullptr};
};
class CPUContext : public DeviceContext { class CPUContext : public DeviceContext {
public: public:
// NOTE: DeviceContext hold resources. Used in training scenarios.
CPUContext(); CPUContext();
explicit CPUContext(const Place&);
// NOTE: Share the same underlying resources, please ensure that resources are virtual ~CPUContext();
// not released.
CPUContext(const CPUContext&);
CPUContext(CPUContext&&);
~CPUContext();
Eigen::DefaultDevice* eigen_device() const; Eigen::DefaultDevice* eigen_device() const;
const Place& GetPlace() const override;
// TODO(wilber): Whether the interface should be preserved.
Place GetPlace() const override;
public: public:
  // NOTE: External users manage resources. Used in inference scenarios.   // NOTE: DeviceContext holds resources. Used in training scenarios.
  explicit CPUContext(const CPUContextResource& ctx_res);   // The Init() interface is used in training scenarios: DeviceContext will
  // initialize all resources and delete them on destruction.
void Init();
protected:
// NOTE: External users manage resources. Used in inference scenarios.
  // The Set interface is for inference only: DeviceContext will mark the
  // resource as external and will not delete any resource on destruction.
void SetEigenDevice(Eigen::DefaultDevice* device); void SetEigenDevice(Eigen::DefaultDevice* device);
private: private:
struct CPUImpl; struct Impl;
std::unique_ptr<CPUImpl> cpu_impl_; std::unique_ptr<Impl> impl_;
}; };
} // namespace pten } // namespace pten
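// Rough sketch of the intended usage split (illustrative, with a hypothetical
// subclass): training code lets the context own its resources via Init(),
// while inference code injects externally managed resources through the
// protected Set* methods.
//
//   // Training: the context allocates and later frees its Eigen device.
//   pten::CPUContext ctx;
//   ctx.Init();
//   auto* dev = ctx.eigen_device();
//
//   // Inference: a subclass hands in an external device that the context
//   // must not delete.
//   class ExternalCpuContext : public pten::CPUContext {
//    public:
//     explicit ExternalCpuContext(Eigen::DefaultDevice* d) {
//       SetEigenDevice(d);
//     }
//   };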
if(WITH_GPU)
add_subdirectory(cuda)
nv_library(pten_gpu_info SRCS gpu_info.cc DEPS pten_cuda_info gflags glog enforce pten_dynload_cuda)
elseif(WITH_ROCM)
add_subdirectory(rocm)
hip_library(pten_gpu_info SRCS gpu_info.cc DEPS pten_rocm_info gflags glog enforce pten_dynload_cuda)
endif()
cc_library(gpu_context SRCS gpu_context.cc DEPS pten_device_context pten_gpu_info eigen3)
nv_library(pten_cuda_info SRCS cuda_info.cc DEPS gflags glog enforce pten_dynload_cuda)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
namespace pten {
namespace backends {
namespace gpu {
/*
* Summary: Grid stride looping macro in CUDA kernel
*
* [ Why need this macro? ]
*
* The original looping in CUDA kernel is:
*
* `for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
* i += blockDim.x * gridDim.x)`
*
 * This for condition is risky. The value of `blockIdx.x * blockDim.x`
 * may already be large (e.g. greater than 2^30). The first iteration is
 * fine, but once `i += blockDim.x * gridDim.x` is executed, the value of i
 * can exceed INT_MAX and overflow to a negative value. At that point the
 * loop condition `i < (n)` is still satisfied, so the kernel performs an
 * illegal access to CUDA memory.
*
 * Here is a real example from ERNIE that triggers the error above.
* The related data are:
* - blockIdx.x = 2172938
* - blockDim.x = 512
* - blockIdx.x * blockDim.x = 1112543864
* - INT_MAX = 2147483647
*
 * So we polish the for condition as follows: the int64_t __index__
 * prevents overflow in the loop increment.
*
* Parameters:
* - i: loop index
* - num: total element numbers
*
* Examples:
* template <typename T>
* __global__ void Scale(T* logit_grad, const T* loss_grad, const int num,
* const int d, const int remain) {
* CUDA_KERNEL_LOOP(index, num) {
* int idx_n = index / d;
* int idx_remain = index % remain;
* logit_grad[index] *= loss_grad[idx_n * remain + idx_remain];
* }
* }
*
*/
#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \
int64_t __index__ = blockIdx.x * blockDim.x + threadIdx.x; \
for (index_type i = __index__; __index__ < (num); \
__index__ += blockDim.x * gridDim.x, i = __index__)
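// Example (illustrative kernel, not part of this header): using an int64_t
// loop index so that neither the index nor the increment overflows for large
// element counts.
//
//   template <typename T>
//   __global__ void Scale(T* data, T factor, int64_t num) {
//     CUDA_KERNEL_LOOP_TYPE(i, num, int64_t) { data[i] *= factor; }
//   }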
} // namespace gpu
} // namespace backends
} // namespace pten
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -12,20 +12,19 @@ ...@@ -12,20 +12,19 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/pten/backends/gpu/gpu_info.h"
// TODO(pten): remove fluid headers.
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/lock_guard_ptr.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/monitor.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/split.h"
static std::once_flag g_device_props_size_init_flag; static std::once_flag g_device_props_size_init_flag;
static std::vector<std::unique_ptr<std::once_flag>> g_device_props_init_flags; static std::vector<std::unique_ptr<std::once_flag>> g_device_props_init_flags;
static std::vector<paddle::gpuDeviceProp> g_device_props; static std::vector<pten::gpuDeviceProp> g_device_props;
namespace pten {
namespace backends {
namespace gpu {
namespace paddle {
namespace platform {
int DnnVersion() { int DnnVersion() {
if (!dynload::HasCUDNN()) return -1; if (!dynload::HasCUDNN()) return -1;
return dynload::cudnnGetVersion(); return dynload::cudnnGetVersion();
...@@ -75,11 +74,13 @@ int GetGPUDeviceCount() { ...@@ -75,11 +74,13 @@ int GetGPUDeviceCount() {
} }
int GetGPUComputeCapability(int id) { int GetGPUComputeCapability(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), PADDLE_ENFORCE_LT(id,
platform::errors::InvalidArgument( GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, " "Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.", "but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount())); id,
GetGPUDeviceCount()));
int major, minor; int major, minor;
auto major_error_code = auto major_error_code =
cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, id); cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, id);
...@@ -92,22 +93,26 @@ int GetGPUComputeCapability(int id) { ...@@ -92,22 +93,26 @@ int GetGPUComputeCapability(int id) {
} }
int GetGPURuntimeVersion(int id) { int GetGPURuntimeVersion(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), PADDLE_ENFORCE_LT(id,
platform::errors::InvalidArgument( GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, " "Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.", "but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount())); id,
GetGPUDeviceCount()));
int runtime_version = 0; int runtime_version = 0;
PADDLE_ENFORCE_GPU_SUCCESS(cudaRuntimeGetVersion(&runtime_version)); PADDLE_ENFORCE_GPU_SUCCESS(cudaRuntimeGetVersion(&runtime_version));
return runtime_version; return runtime_version;
} }
int GetGPUDriverVersion(int id) { int GetGPUDriverVersion(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), PADDLE_ENFORCE_LT(id,
platform::errors::InvalidArgument( GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, " "Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.", "but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount())); id,
GetGPUDeviceCount()));
int driver_version = 0; int driver_version = 0;
PADDLE_ENFORCE_GPU_SUCCESS(cudaDriverGetVersion(&driver_version)); PADDLE_ENFORCE_GPU_SUCCESS(cudaDriverGetVersion(&driver_version));
return driver_version; return driver_version;
...@@ -120,11 +125,13 @@ bool TensorCoreAvailable() { ...@@ -120,11 +125,13 @@ bool TensorCoreAvailable() {
} }
int GetGPUMultiProcessors(int id) { int GetGPUMultiProcessors(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), PADDLE_ENFORCE_LT(id,
platform::errors::InvalidArgument( GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, " "Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.", "but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount())); id,
GetGPUDeviceCount()));
int count; int count;
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id)); cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id));
...@@ -132,11 +139,13 @@ int GetGPUMultiProcessors(int id) { ...@@ -132,11 +139,13 @@ int GetGPUMultiProcessors(int id) {
} }
int GetGPUMaxThreadsPerMultiProcessor(int id) { int GetGPUMaxThreadsPerMultiProcessor(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), PADDLE_ENFORCE_LT(id,
platform::errors::InvalidArgument( GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, " "Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.", "but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount())); id,
GetGPUDeviceCount()));
int count; int count;
PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceGetAttribute( PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceGetAttribute(
&count, cudaDevAttrMaxThreadsPerMultiProcessor, id)); &count, cudaDevAttrMaxThreadsPerMultiProcessor, id));
...@@ -145,11 +154,13 @@ int GetGPUMaxThreadsPerMultiProcessor(int id) { ...@@ -145,11 +154,13 @@ int GetGPUMaxThreadsPerMultiProcessor(int id) {
} }
int GetGPUMaxThreadsPerBlock(int id) { int GetGPUMaxThreadsPerBlock(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), PADDLE_ENFORCE_LT(id,
platform::errors::InvalidArgument( GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, " "Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.", "but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount())); id,
GetGPUDeviceCount()));
int count; int count;
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
cudaDeviceGetAttribute(&count, cudaDevAttrMaxThreadsPerBlock, id)); cudaDeviceGetAttribute(&count, cudaDevAttrMaxThreadsPerBlock, id));
...@@ -162,32 +173,34 @@ int GetCurrentDeviceId() { ...@@ -162,32 +173,34 @@ int GetCurrentDeviceId() {
return device_id; return device_id;
} }
dim3 GetGpuMaxGridDimSize(int id) { std::array<int, 3> GetGpuMaxGridDimSize(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), PADDLE_ENFORCE_LT(id,
platform::errors::InvalidArgument( GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, " "Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.", "but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount())); id,
dim3 ret; GetGPUDeviceCount()));
std::array<int, 3> ret;
int size; int size;
auto error_code_x = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimX, id); auto error_code_x = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimX, id);
PADDLE_ENFORCE_GPU_SUCCESS(error_code_x); PADDLE_ENFORCE_GPU_SUCCESS(error_code_x);
ret.x = size; ret[0] = size;
auto error_code_y = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimY, id); auto error_code_y = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimY, id);
PADDLE_ENFORCE_GPU_SUCCESS(error_code_y); PADDLE_ENFORCE_GPU_SUCCESS(error_code_y);
ret.y = size; ret[1] = size;
auto error_code_z = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimZ, id); auto error_code_z = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimZ, id);
PADDLE_ENFORCE_GPU_SUCCESS(error_code_z); PADDLE_ENFORCE_GPU_SUCCESS(error_code_z);
ret.z = size; ret[2] = size;
return ret; return ret;
} }
const gpuDeviceProp &GetDeviceProperties(int id) { const gpuDeviceProp &GetDeviceProperties(int id) {
std::call_once(g_device_props_size_init_flag, [&] { std::call_once(g_device_props_size_init_flag, [&] {
int gpu_num = 0; int gpu_num = 0;
gpu_num = platform::GetGPUDeviceCount(); gpu_num = GetGPUDeviceCount();
g_device_props_init_flags.resize(gpu_num); g_device_props_init_flags.resize(gpu_num);
g_device_props.resize(gpu_num); g_device_props.resize(gpu_num);
for (int i = 0; i < gpu_num; ++i) { for (int i = 0; i < gpu_num; ++i) {
...@@ -196,16 +209,17 @@ const gpuDeviceProp &GetDeviceProperties(int id) { ...@@ -196,16 +209,17 @@ const gpuDeviceProp &GetDeviceProperties(int id) {
}); });
if (id == -1) { if (id == -1) {
id = platform::GetCurrentDeviceId(); id = GetCurrentDeviceId();
} }
if (id < 0 || id >= static_cast<int>(g_device_props.size())) { if (id < 0 || id >= static_cast<int>(g_device_props.size())) {
PADDLE_THROW(platform::errors::OutOfRange( PADDLE_THROW(paddle::platform::errors::OutOfRange(
"The device id %d is out of range [0, %d), where %d is the number of " "The device id %d is out of range [0, %d), where %d is the number of "
"devices on this machine. Because the device id should be greater than " "devices on this machine. Because the device id should be greater than "
"or equal to zero and smaller than the number of gpus. Please input " "or equal to zero and smaller than the number of gpus. Please input "
"appropriate device again!", "appropriate device again!",
id, static_cast<int>(g_device_props.size()), id,
static_cast<int>(g_device_props.size()),
static_cast<int>(g_device_props.size()))); static_cast<int>(g_device_props.size())));
} }
...@@ -219,32 +233,43 @@ const gpuDeviceProp &GetDeviceProperties(int id) { ...@@ -219,32 +233,43 @@ const gpuDeviceProp &GetDeviceProperties(int id) {
void SetDeviceId(int id) { void SetDeviceId(int id) {
// TODO(qijun): find a better way to cache the cuda device count // TODO(qijun): find a better way to cache the cuda device count
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), PADDLE_ENFORCE_LT(id,
platform::errors::InvalidArgument( GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, " "Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.", "but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount())); id,
GetGPUDeviceCount()));
PADDLE_RETRY_CUDA_SUCCESS(cudaSetDevice(id)); PADDLE_RETRY_CUDA_SUCCESS(cudaSetDevice(id));
} }
void GpuMemcpyAsync(void *dst, const void *src, size_t count, void GpuMemcpyAsync(void *dst,
gpuMemcpyKind kind, gpuStream_t stream) { const void *src,
size_t count,
gpuMemcpyKind kind,
gpuStream_t stream) {
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(dst, src, count, kind, stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(dst, src, count, kind, stream));
} }
void GpuMemcpySync(void *dst, const void *src, size_t count, void GpuMemcpySync(void *dst,
const void *src,
size_t count,
gpuMemcpyKind kind) { gpuMemcpyKind kind) {
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(dst, src, count, kind)); PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(dst, src, count, kind));
} }
void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, void GpuMemcpyPeerAsync(void *dst,
int src_device, size_t count, gpuStream_t stream) { int dst_device,
const void *src,
int src_device,
size_t count,
gpuStream_t stream) {
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream)); cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream));
} }
void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src, void GpuMemcpyPeerSync(
int src_device, size_t count) { void *dst, int dst_device, const void *src, int src_device, size_t count) {
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
cudaMemcpyPeer(dst, dst_device, src, src_device, count)); cudaMemcpyPeer(dst, dst_device, src, src_device, count));
} }
...@@ -264,5 +289,7 @@ void GpuDestroyStream(gpuStream_t stream) { ...@@ -264,5 +289,7 @@ void GpuDestroyStream(gpuStream_t stream) {
void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); } void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); }
gpuError_t GpuGetLastError() { return cudaGetLastError(); } gpuError_t GpuGetLastError() { return cudaGetLastError(); }
} // namespace platform
} // namespace paddle } // namespace gpu
} // namespace backends
} // namespace pten
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// Forward-declares CUDA API types used in platform-agnostic wrapper headers.
#pragma once
/// Forward declaration of Eigen types.
namespace Eigen {
struct GpuDevice;
} // namespace Eigen
/// Forward declaration of CUDA types.
// Forward declaration of CUDA runtime types.
using cudaStream_t = struct CUstream_st *;
using cudaEvent_t = struct CUevent_st *;
// Forward declaration of cuDNN types.
using cudnnHandle_t = struct cudnnContext *;
using cudnnTensorDescriptor_t = struct cudnnTensorStruct *;
using cudnnConvolutionDescriptor_t = struct cudnnConvolutionStruct *;
using cudnnPoolingDescriptor_t = struct cudnnPoolingStruct *;
using cudnnFilterDescriptor_t = struct cudnnFilterStruct *;
using cudnnLRNDescriptor_t = struct cudnnLRNStruct *;
using cudnnActivationDescriptor_t = struct cudnnActivationStruct *;
using cudnnSpatialTransformerDescriptor_t =
struct cudnnSpatialTransformerStruct *;
using cudnnOpTensorDescriptor_t = struct cudnnOpTensorStruct *;
using cudnnReduceTensorDescriptor_t = struct cudnnReduceTensorStruct *;
using cudnnCTCLossDescriptor_t = struct cudnnCTCLossStruct *;
using cudnnTensorTransformDescriptor_t = struct cudnnTensorTransformStruct *;
using cudnnDropoutDescriptor_t = struct cudnnDropoutStruct *;
using cudnnRNNDescriptor_t = struct cudnnRNNStruct *;
using cudnnPersistentRNNPlan_t = struct cudnnPersistentRNNPlan *;
using cudnnRNNDataDescriptor_t = struct cudnnRNNDataStruct *;
using cudnnAlgorithmDescriptor_t = struct cudnnAlgorithmStruct *;
using cudnnAlgorithmPerformance_t = struct cudnnAlgorithmPerformanceStruct *;
using cudnnSeqDataDescriptor_t = struct cudnnSeqDataStruct *;
using cudnnAttnDescriptor_t = struct cudnnAttnStruct *;
using cudnnFusedOpsConstParamPack_t = struct cudnnFusedOpsConstParamStruct *;
using cudnnFusedOpsVariantParamPack_t =
struct cudnnFusedOpsVariantParamStruct *;
using cudnnFusedOpsPlan_t = struct cudnnFusedOpsPlanStruct *;
// Forward declaration of cuBLAS types.
using cublasHandle_t = struct cublasContext *;
// Forward declaration of cuSOLVER types.
using cusolverDnHandle_t = struct cusolverDnContext *;
// Forward declaration of cuSparse types.
using cusparseHandle_t = struct cusparseContext *;
// Forward declaration of cuFFT types.
using cufftHandle = int;
// Forward declaration of NCCL types.
using ncclComm_t = struct ncclComm *;
/// Forward declaration of ROCM types.
#include <cstddef>
using hipDevice_t = int;
using hipCtx_t = struct ihipCtx_t *;
using hipModule_t = struct ihipModule_t *;
using hipStream_t = struct ihipStream_t *;
using hipEvent_t = struct ihipEvent_t *;
using hipFunction_t = struct ihipModuleSymbol_t *;
// Forward declaration of MIOpen types.
using miopenHandle_t = struct miopenHandle *;
using miopenAcceleratorQueue_t = hipStream_t;
using miopenFusionOpDescriptor_t = struct miopenFusionOpDescriptor *;
using miopenTensorDescriptor_t = struct miopenTensorDescriptor *;
using miopenConvolutionDescriptor_t = struct miopenConvolutionDescriptor *;
using miopenPoolingDescriptor_t = struct miopenPoolingDescriptor *;
using miopenLRNDescriptor_t = struct miopenLRNDescriptor *;
using miopenActivationDescriptor_t = struct miopenActivationDescriptor *;
using miopenRNNDescriptor_t = struct miopenRNNDescriptor *;
using miopenCTCLossDescriptor_t = struct miopenCTCLossDescriptor *;
using miopenDropoutDescriptor_t = struct miopenDropoutDescriptor *;
using miopenFusionPlanDescriptor_t = struct miopenFusionPlanDescriptor *;
using miopenOperatorDescriptor_t = struct miopenOperatorDescriptor *;
using miopenOperatorArgs_t = struct miopenOperatorArgs *;
using miopenAllocatorFunction = void *(*)(void *context, size_t sizeBytes);
// using miopenDeallocatorFunction = void *(*)(void *context, void *memory);
// struct miopenConvAlgoPerf_t;
// struct miopenConvSolution_t;
// Forward declaration of rocBLAS types.
using rocblas_handle = struct _rocblas_handle *;
// Forward declaration of hipfft types.
using hipfftHandle = struct hipfftHandle_t *;
// Forward declaration of rocSOLVER types.
using rocsolver_handle = rocblas_handle;
// Forward declaration of rocSparse types.
using rocsparse_handle = struct _rocsparse_handle *;
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/gpu/gpu_context.h"
#include <array>
#include <functional>
#include <future>
#include <memory>
#include <mutex>
#include "paddle/pten/api/ext/exception.h"
#include "paddle/pten/backends/gpu/gpu_decls.h"
#include "paddle/pten/backends/gpu/gpu_info.h"
#include "paddle/pten/common/float16.h"
#include "paddle/pten/common/place.h"
#include "paddle/pten/core/allocator.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/pten/backends/dynload/cublas.h"
#include "paddle/pten/backends/dynload/cudnn.h"
#include "paddle/pten/backends/dynload/cusolver.h"
#include "paddle/pten/backends/dynload/cusparse.h"
#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
#include "paddle/pten/backends/dynload/nccl.h"
#endif // !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
#endif // PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_HIP
#include "paddle/pten/backends/dynload/miopen.h"
#include "paddle/pten/backends/dynload/rocblas.h"
#if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL)
#include "paddle/pten/backends/dynload/rccl.h"
#endif // !defined(__APPLE__) && defined(PADDLE_WITH_RCCL)
#endif // PADDLE_WITH_HIP
// NOTE: The paddle framework should add a WITH_EIGEN option to support
// compiling without Eigen.
#include "unsupported/Eigen/CXX11/Tensor"
// TODO(pten): remove fluid header.
#include "paddle/fluid/platform/enforce.h"
namespace pten {
namespace internal {
class EigenGpuStreamDevice : public Eigen::StreamInterface {
public:
EigenGpuStreamDevice() : scratch_(nullptr), semaphore_(nullptr) {
Eigen::initializeDeviceProp();
}
~EigenGpuStreamDevice() override {}
void Reinitialize(gpuStream_t cuda_stream,
Allocator* allocator,
GPUPlace place) {
stream_ = cuda_stream;
place_ = place;
allocator_ = allocator;
device_prop_ = &Eigen::m_deviceProperties[place.device];
}
const gpuStream_t& stream() const override { return stream_; }
const gpuDeviceProp& deviceProperties() const override {
return *device_prop_;
}
void* allocate(size_t num_bytes) const override {
if (UNLIKELY(num_bytes == 0)) {
return nullptr;
}
auto buf = allocator_->Allocate(num_bytes);
VLOG(4) << "Eigen allocated at " << buf->ptr() << " requested "
<< num_bytes;
void* retv = buf->ptr();
{
std::lock_guard<std::mutex> lock(mtx_);
allocations_.emplace(retv, std::move(buf));
}
return retv;
}
void deallocate(void* buffer) const override {
if (LIKELY(buffer)) {
std::lock_guard<std::mutex> lock(mtx_);
allocations_.erase(buffer);
}
}
void* scratchpad() const override {
if (scratch_ == NULL) {
scratch_ = allocate(Eigen::kGpuScratchSize + sizeof(unsigned int));
}
return scratch_;
}
unsigned int* semaphore() const override {
if (semaphore_ == NULL) {
char* scratch = static_cast<char*>(scratchpad()) + Eigen::kGpuScratchSize;
semaphore_ = reinterpret_cast<unsigned int*>(scratch);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream_));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream_));
#endif
}
return semaphore_;
}
private:
GPUPlace place_;
gpuStream_t stream_; // not owned;
Allocator* allocator_; // not owned;
const gpuDeviceProp* device_prop_; // not owned;
mutable void* scratch_;
mutable unsigned int* semaphore_;
mutable std::mutex mtx_; // to protect allocations_
mutable std::unordered_map<void*, Allocator::AllocationPtr> allocations_;
};
#ifdef PADDLE_WITH_HIP
static void StreamCallbackFunc(gpuStream_t stream,
gpuError_t status,
void* user_data)
#endif
#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 10000
static void CUDART_CB StreamCallbackFunc(void* user_data)
#else
static void CUDART_CB
StreamCallbackFunc(cudaStream_t stream, cudaError_t status, void* user_data)
#endif
#endif
{
std::unique_ptr<std::function<void()>> func(
reinterpret_cast<std::function<void()>*>(user_data));
(*func)();
}
} // namespace internal
class DnnWorkspaceHandle {
public:
explicit inline DnnWorkspaceHandle(Allocator* allocator)
: allocator_(allocator) {}
inline void RunFunc(const std::function<void(void*)>& cudnn_func,
size_t required_workspace_bytes) {
if (required_workspace_bytes > WorkspaceSize()) {
ReallocWorkspace(required_workspace_bytes);
}
VLOG(2) << "Cudnn workspace size at RunFunc: "
<< static_cast<double>(WorkspaceSize()) / (1 << 20) << " MB";
{
std::lock_guard<std::mutex> guard(mtx_);
cudnn_func(allocation_ ? allocation_->ptr() : nullptr);
}
}
/*! \brief The thread that calls RunFuncSync() releases the GPU memory after
 * running the function. Currently this function is only used for cuDNN
 * exhaustive search, and callers have to guarantee that the input function
 * is host blocking. */
inline void RunFuncSync(const std::function<void(void*)>& cudnn_func,
size_t required_workspace_bytes) {
RunFunc(cudnn_func, required_workspace_bytes);
ResetWorkspace();
}
inline size_t WorkspaceSize() {
if (allocation_ == nullptr) {
return 0;
}
return allocation_->size();
}
void ResetWorkspace() { allocation_ = nullptr; }
void ReallocWorkspace(size_t required_workspace_bytes) {
if (required_workspace_bytes <= WorkspaceSize()) return;
// reset allocation first before re-allocate to save memory
allocation_.reset();
allocation_ = allocator_->Allocate(required_workspace_bytes);
}
private:
Allocator::AllocationPtr allocation_{nullptr};
Allocator* allocator_{nullptr};
std::mutex mtx_;
};
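// Illustrative sketch: ExampleUseDnnWorkspace is a hypothetical helper that
// shows how a caller is expected to drive DnnWorkspaceHandle::RunFunc. The
// workspace pointer handed to the callback is only valid for the duration of
// the call, and the lambda body stands in for a real cudnn/miopen API call.
static void ExampleUseDnnWorkspace(DnnWorkspaceHandle* workspace,
                                   size_t required_workspace_bytes) {
  workspace->RunFunc(
      [&](void* workspace_ptr) {
        // A real kernel would forward workspace_ptr and
        // required_workspace_bytes to cudnnConvolutionForward or a similar
        // routine here.
        (void)workspace_ptr;
        (void)required_workspace_bytes;
      },
      required_workspace_bytes);
}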
struct GPUContext::Impl {
void Init() {
owned_ = true;
backends::gpu::GPUDeviceGuard guard(place_.device);
InitGpuProperties();
InitStream();
InitEigenDevice();
InitBlasHandle();
InitDNNHandle();
InitSolverHandle();
InitSparseHandle();
InitDnnWorkspace();
}
void PartialInitWithoutAllocator() {
owned_ = true;
backends::gpu::GPUDeviceGuard guard(place_.device);
InitGpuProperties();
InitStream();
InitBlasHandle();
InitDNNHandle();
InitSolverHandle();
InitSparseHandle();
}
void PartialInitWithAllocator() {
owned_ = true;
backends::gpu::GPUDeviceGuard guard(place_.device);
InitEigenDevice();
InitDnnWorkspace();
}
Impl() : place_(GPUPlace()) {}
explicit Impl(const GPUPlace& place) : place_(place) {}
~Impl() {
backends::gpu::GPUDeviceGuard guard(place_.device);
DestroyInternalWorkspace();
DestroyInternalEigenDevice();
DestroyInternalSparseHandle();
DestroyInternalSolverHandle();
DestroyInternalDnnHandle();
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
if (nccl_comm_) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclCommDestroy(nccl_comm_));
}
#endif
DestroyInternalBlasHandle();
DestroyInternalStream();
}
const Place& GetPlace() const { return place_; }
bool IsTensorCoreAvailable() const {
return blas_tensor_core_handle_ != nullptr;
}
void InitGpuProperties() {
backends::gpu::GPUDeviceGuard guard(place_.GetDeviceId());
compute_capability_ =
backends::gpu::GetGPUComputeCapability(place_.GetDeviceId());
multi_process_ = backends::gpu::GetGPUMultiProcessors(place_.GetDeviceId());
max_threads_per_mp_ =
backends::gpu::GetGPUMaxThreadsPerMultiProcessor(place_.GetDeviceId());
max_grid_dim_size_ =
backends::gpu::GetGpuMaxGridDimSize(place_.GetDeviceId());
max_threads_per_block_ =
backends::gpu::GetGPUMaxThreadsPerBlock(place_.GetDeviceId());
driver_version_ = backends::gpu::GetGPUDriverVersion(place_.GetDeviceId());
runtime_version_ =
backends::gpu::GetGPURuntimeVersion(place_.GetDeviceId());
// TODO(wilber): glog may be replaced in the future?
LOG_FIRST_N(WARNING, 1)
<< "Please NOTE: device: " << static_cast<int>(place_.device)
<< ", GPU Compute Capability: " << compute_capability_ / 10 << "."
<< compute_capability_ % 10
<< ", Driver API Version: " << driver_version_ / 1000 << "."
<< (driver_version_ % 100) / 10
<< ", Runtime API Version: " << runtime_version_ / 1000 << "."
<< (runtime_version_ % 100) / 10;
#ifdef PADDLE_WITH_HIP
size_t miopen_major, miopen_minor, miopen_patch;
PADDLE_ENFORCE_GPU_SUCCESS(
dynload::miopenGetVersion(&miopen_major, &miopen_minor, &miopen_patch));
auto cudnn_dso_ver =
(miopen_major * 1000 + miopen_minor * 10 + miopen_patch) / 10;
auto compile_miopen_version = MIOPEN_VERSION / 10;
if (cudnn_dso_ver < static_cast<size_t>(compile_miopen_version)) {
LOG_FIRST_N(WARNING, 1)
<< "WARNING: device: " << static_cast<int>(place_.device)
<< ". The installed Paddle is compiled with MIOPEN "
<< compile_miopen_version / 100 << "." << compile_miopen_version % 100
<< ", but MIOPEN version in your machine is " << cudnn_dso_ver / 100
<< "." << cudnn_dso_ver % 100
<< ", which may cause serious incompatible bug. "
<< "Please recompile or reinstall Paddle with compatible MIOPEN "
"version.";
}
#else
size_t cudnn_dso_ver = dynload::cudnnGetVersion();
LOG_FIRST_N(WARNING, 1) << "device: " << static_cast<int>(place_.device)
<< ", cuDNN Version: " << cudnn_dso_ver / 1000
<< "." << (cudnn_dso_ver % 1000) / 100 << ".";
// Check CUDA/CUDNN version compatibility
auto local_cuda_version =
(driver_version_ / 1000) * 10 + (driver_version_ % 100) / 10;
auto compile_cuda_version =
(CUDA_VERSION / 1000) * 10 + (CUDA_VERSION % 100) / 10;
if (local_cuda_version < compile_cuda_version) {
LOG_FIRST_N(WARNING, 1)
<< "WARNING: device: " << static_cast<int>(place_.device)
<< ". The installed Paddle is compiled with CUDA "
<< compile_cuda_version / 10 << "." << compile_cuda_version % 10
<< ", but CUDA runtime version in your machine is "
<< local_cuda_version / 10 << "." << local_cuda_version % 10
<< ", which may cause serious incompatible bug. "
<< "Please recompile or reinstall Paddle with compatible CUDA "
"version.";
}
#endif
}
void InitDnnWorkspace() {
PD_CHECK(allocator_ != nullptr,
"the device allocator for gpu context is nullptr.");
workspace_ = new DnnWorkspaceHandle(allocator_);
}
void DestroyInternalWorkspace() {
if (owned_ && workspace_ != nullptr) {
delete workspace_;
workspace_ = nullptr;
}
}
DnnWorkspaceHandle* GetDnnWorkspace() {
PD_CHECK(workspace_ != nullptr, "the gpu cudnn workspace is nullptr.");
return workspace_;
}
void InitStream() {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
hipStreamCreateWithPriority(&stream_, hipStreamDefault, 0));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
cudaStreamCreateWithPriority(&stream_, cudaStreamDefault, 0));
#endif
}
void DestroyInternalStream() {
if (owned_ && stream_ != nullptr) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream_));
#else
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream_));
#endif
}
stream_ = nullptr;
}
void SetStream(gpuStream_t stream) { stream_ = stream; }
gpuStream_t GetStream() const {
PD_CHECK(stream_ != nullptr, "the gpu stream is nullptr.");
return stream_;
}
void InitEigenDevice() {
PD_CHECK(allocator_ != nullptr,
"the allocator for eigen device is nullptr.");
eigen_stream_.reset(new internal::EigenGpuStreamDevice());
eigen_stream_->Reinitialize(stream_, allocator_, place_);
eigen_device_ = new Eigen::GpuDevice(eigen_stream_.get());
}
void DestroyInternalEigenDevice() {
if (owned_ && eigen_device_ != nullptr) {
delete eigen_device_;
eigen_device_ = nullptr;
}
}
void SetEigenDevice(Eigen::GpuDevice* device) { eigen_device_ = device; }
Eigen::GpuDevice* eigen_device() const {
PD_CHECK(eigen_device_ != nullptr, "the gpu eigen_device is nullptr.");
return eigen_device_;
}
void InitBlasHandle() {
#ifdef PADDLE_WITH_HIP
pten::dynload::rocblas_create_handle(&blas_handle_);
pten::dynload::rocblas_set_stream(blas_handle_, stream_);
#else // PADDLE_WITH_CUDA
PADDLE_RETRY_CUDA_SUCCESS(pten::dynload::cublasCreate(&blas_handle_));
PADDLE_RETRY_CUDA_SUCCESS(
pten::dynload::cublasSetStream(blas_handle_, stream_));
#if CUDA_VERSION >= 9000
PADDLE_RETRY_CUDA_SUCCESS(
pten::dynload::cublasCreate(&blas_tensor_core_handle_));
PADDLE_RETRY_CUDA_SUCCESS(
pten::dynload::cublasSetStream(blas_tensor_core_handle_, stream_));
PADDLE_RETRY_CUDA_SUCCESS(pten::dynload::cublasSetMathMode(
blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH));
#if CUDA_VERSION >= 11000
PADDLE_RETRY_CUDA_SUCCESS(
pten::dynload::cublasCreate(&blas_tf32_tensor_core_handle_));
PADDLE_RETRY_CUDA_SUCCESS(
pten::dynload::cublasSetStream(blas_tf32_tensor_core_handle_, stream_));
PADDLE_RETRY_CUDA_SUCCESS(pten::dynload::cublasSetMathMode(
blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH));
#endif // CUDA_VERSION >= 11000
#endif // CUDA_VERSION >= 9000
#endif // PADDLE_WITH_HIP
}
void DestroyInternalBlasHandle() {
#ifdef PADDLE_WITH_HIP
if (owned_ && blas_handle_ != nullptr) {
pten::dynload::rocblas_destroy_handle(blas_handle_);
blas_handle_ = nullptr;
}
#else
if (owned_ && blas_handle_ != nullptr) {
pten::dynload::cublasDestroy(blas_handle_);
blas_handle_ = nullptr;
}
if (owned_ && blas_tensor_core_handle_ != nullptr) {
pten::dynload::cublasDestroy(blas_tensor_core_handle_);
blas_tensor_core_handle_ = nullptr;
}
if (owned_ && blas_tf32_tensor_core_handle_ != nullptr) {
pten::dynload::cublasDestroy(blas_tf32_tensor_core_handle_);
blas_tf32_tensor_core_handle_ = nullptr;
}
#endif // PADDLE_WITH_HIP
}
blasHandle_t GetBlasHandle() const {
PD_CHECK(blas_handle_ != nullptr, "the gpu blas handle is nullptr.");
return blas_handle_;
}
void SetBlasHandle(blasHandle_t blas) { blas_handle_ = blas; }
void InitDNNHandle() {
if (pten::dynload::HasCUDNN()) {
#ifdef PADDLE_WITH_HIP
size_t miopen_major, miopen_minor, miopen_patch;
PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenGetVersion(
&miopen_major, &miopen_minor, &miopen_patch));
auto local_miopen_version =
(miopen_major * 1000 + miopen_minor * 10 + miopen_patch) / 10;
auto compile_miopen_version = MIOPEN_VERSION / 10;
if (local_miopen_version < static_cast<size_t>(compile_miopen_version)) {
LOG_FIRST_N(WARNING, 1)
<< "WARNING: device: " << place_.device
<< ". The installed Paddle is compiled with MIOPEN "
<< compile_miopen_version / 100 << "."
<< compile_miopen_version % 100
<< ", but MIOPEN version in your machine is "
<< local_miopen_version / 100 << "." << local_miopen_version % 100
<< ", which may cause serious incompatible bug. "
<< "Please recompile or reinstall Paddle with compatible MIOPEN "
"version.";
}
PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreate(&dnn_handle_));
PADDLE_ENFORCE_GPU_SUCCESS(
dynload::miopenSetStream(dnn_handle_, stream_));
#else
auto local_cudnn_version = pten::dynload::cudnnGetVersion() / 100;
auto compile_cudnn_version = CUDNN_VERSION / 100;
if (local_cudnn_version < static_cast<size_t>(compile_cudnn_version)) {
LOG_FIRST_N(WARNING, 1)
<< "WARNING: device: " << place_.device
<< ". The installed Paddle is compiled with CUDNN "
<< compile_cudnn_version / 10 << "." << compile_cudnn_version % 10
<< ", but CUDNN version in your machine is "
<< local_cudnn_version / 10 << "." << local_cudnn_version % 10
<< ", which may cause serious incompatible bug. "
<< "Please recompile or reinstall Paddle with compatible CUDNN "
"version.";
}
PADDLE_RETRY_CUDA_SUCCESS(pten::dynload::cudnnCreate(&dnn_handle_));
PADDLE_RETRY_CUDA_SUCCESS(
pten::dynload::cudnnSetStream(dnn_handle_, stream_));
#endif
} else {
dnn_handle_ = nullptr;
}
}
dnnHandle_t GetDnnHandle() {
PD_CHECK(dnn_handle_ != nullptr, "the gpu dnn handle is nullptr.");
return dnn_handle_;
}
void DestroyInternalDnnHandle() {
#ifdef PADDLE_WITH_HIP
if (owned_ && dnn_handle_ != nullptr) {
PADDLE_ENFORCE_GPU_SUCCESS(pten::dynload::miopenDestroy(dnn_handle_));
dnn_handle_ = nullptr;
}
#else
if (owned_ && dnn_handle_ != nullptr) {
PADDLE_ENFORCE_GPU_SUCCESS(pten::dynload::cudnnDestroy(dnn_handle_));
dnn_handle_ = nullptr;
}
#endif // PADDLE_WITH_HIP
}
void SetDnnHandle(dnnHandle_t handle) { dnn_handle_ = handle; }
void InitSolverHandle() {
#ifndef PADDLE_WITH_HIP
PADDLE_RETRY_CUDA_SUCCESS(pten::dynload::cusolverDnCreate(&solver_handle_));
PADDLE_RETRY_CUDA_SUCCESS(
pten::dynload::cusolverDnSetStream(solver_handle_, stream_));
#endif
}
void DestroyInternalSolverHandle() {
#ifndef PADDLE_WITH_HIP
if (owned_ && solver_handle_ != nullptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
pten::dynload::cusolverDnDestroy(solver_handle_));
solver_handle_ = nullptr;
}
#endif
}
solverHandle_t GetSolverHandle() const {
PD_CHECK(solver_handle_ != nullptr, "the gpu solver handle is nullptr.");
return solver_handle_;
}
void SetSolverHandle(solverHandle_t handle) { solver_handle_ = handle; }
void InitSparseHandle() {
// ROCM is not yet supported
#if defined(PADDLE_WITH_CUDA)
// The generic APIs are supported from CUDA 10.1
#if CUDA_VERSION >= 10010
PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseCreate(&sparse_handle_));
PADDLE_RETRY_CUDA_SUCCESS(
dynload::cusparseSetStream(sparse_handle_, stream_));
#endif
#endif
}
void DestroyInternalSparseHandle() {
#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 10010
if (owned_ && sparse_handle_ != nullptr) {
PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseDestroy(sparse_handle_));
sparse_handle_ = nullptr;
}
#endif
#endif
}
sparseHandle_t GetSparseHandle() const {
PD_CHECK(sparse_handle_ != nullptr, "the gpu sparse handle is nullptr.");
return sparse_handle_;
}
void SetSparseHandle(sparseHandle_t handle) { sparse_handle_ = handle; }
void Wait() const {
#ifdef PADDLE_WITH_HIP
hipError_t e_sync = hipSuccess;
#if !defined(_WIN32)
e_sync = hipStreamSynchronize(stream_);
#else
while (e_sync = hipStreamQuery(stream_)) {
if (e_sync == hipErrorNotReady) continue;
break;
}
#endif // !defined(_WIN32)
#else // PADDLE_WITH_HIP
cudaError_t e_sync = cudaSuccess;
#if !defined(_WIN32)
e_sync = cudaStreamSynchronize(stream_);
#else
while (e_sync = cudaStreamQuery(stream_)) {
if (e_sync == cudaErrorNotReady) continue;
break;
}
#endif // !defined(_WIN32)
#endif // PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(e_sync);
}
void WaitEvent(gpuEvent_t ev) const {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(stream_, ev, 0));
#else
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(stream_, ev, 0));
#endif
}
ncclComm_t GetNcclComm() const {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
// PD_CHECK(nccl_comm_ != nullptr, "the gpu nccl_comm is nullptr.");
return nccl_comm_;
#endif
return nullptr;
}
void SetNcclComm(ncclComm_t comm) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
nccl_comm_ = comm;
#endif
}
inline void CublasCall(
const std::function<void(blasHandle_t)>& callback) const {
if (blas_tf32_tensor_core_handle_ != nullptr) {
std::lock_guard<std::mutex> guard(blas_tf32_mtx_);
callback(blas_tf32_tensor_core_handle_);
} else {
std::lock_guard<std::mutex> guard(blas_mtx_);
callback(blas_handle_);
}
}
inline void TensorCoreCublasCallIfAvailable(
const std::function<void(blasHandle_t)>& callback) const {
if (blas_tensor_core_handle_ != nullptr) {
std::lock_guard<std::mutex> guard(blas_tensor_core_mtx_);
callback(blas_tensor_core_handle_);
} else {
std::lock_guard<std::mutex> guard(blas_mtx_);
callback(blas_handle_);
}
}
inline void CusparseCall(
const std::function<void(sparseHandle_t)>& callback) const {
std::lock_guard<std::mutex> guard(sparse_mtx_);
callback(sparse_handle_);
}
void RecordEvent(gpuEvent_t ev, const std::function<void()>& callback) const {
callback();
RecordEvent(ev);
}
void RecordEvent(gpuEvent_t ev) const {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(ev, stream_));
#else
PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(ev, stream_));
#endif
}
void AddStreamCallback(const std::function<void()>& callback) const {
// TODO(wilber): Do we need ThreadPool?
auto* func = new std::function<void()>([this, callback] {
std::lock_guard<std::mutex> lock(stream_call_back_mtx_);
last_future_ = std::async(std::launch::deferred, [&]() { callback(); });
});
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
hipStreamAddCallback(stream_, internal::StreamCallbackFunc, func, 0));
#endif
#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 10000
PADDLE_ENFORCE_GPU_SUCCESS(
cudaLaunchHostFunc(stream_, internal::StreamCallbackFunc, func));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
cudaStreamAddCallback(stream_, internal::StreamCallbackFunc, func, 0));
#endif
#endif
}
void WaitStreamCallback() const {
#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_CUDA)
pten::backends::gpu::GpuStreamSync(stream_);
#endif
{
std::lock_guard<std::mutex> lock(stream_call_back_mtx_);
if (last_future_.valid()) {
last_future_.wait();
}
}
}
bool owned_{false};
Place place_;
int compute_capability_;
int runtime_version_;
int driver_version_;
int multi_process_;
int max_threads_per_mp_;
int max_threads_per_block_;
std::array<int, 3> max_grid_dim_size_;
gpuStream_t stream_{nullptr};
Eigen::GpuDevice* eigen_device_{nullptr};
blasHandle_t blas_handle_{nullptr};
blasHandle_t blas_tensor_core_handle_{nullptr};
blasHandle_t blas_tf32_tensor_core_handle_{nullptr};
dnnHandle_t dnn_handle_{nullptr};
solverHandle_t solver_handle_{nullptr};
sparseHandle_t sparse_handle_{nullptr};
DnnWorkspaceHandle* workspace_{nullptr};
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
// NCCL communicator (single process version) for NCCL collective operations.
// NCCL collective operations provide fast collectives over multiple GPUs
// both within and across nodes, but this communicator is only used for
// collectives over multiple GPUs within a single node.
// NOTE: For the distributed communicator, the distributed framework manages
// its own resources.
ncclComm_t nccl_comm_{nullptr};
#endif
mutable std::mutex blas_mtx_;
mutable std::mutex blas_tensor_core_mtx_;
mutable std::mutex blas_tf32_mtx_;
mutable std::mutex sparse_mtx_;
mutable std::mutex stream_call_back_mtx_;
mutable std::future<void> last_future_;
Allocator* allocator_{nullptr}; // external resource.
// An internal resource to initialize eigen_device.
std::unique_ptr<internal::EigenGpuStreamDevice> eigen_stream_{nullptr};
};
GPUContext::GPUContext() : DeviceContext(), impl_(std::make_unique<Impl>()) {}
GPUContext::GPUContext(const GPUPlace& place)
: DeviceContext(), impl_(std::make_unique<Impl>(place)) {}
GPUContext::~GPUContext() = default;
const Place& GPUContext::GetPlace() const { return impl_->GetPlace(); }
gpuStream_t GPUContext::stream() const { return impl_->GetStream(); }
dnnHandle_t GPUContext::cudnn_handle() const { return impl_->GetDnnHandle(); }
blasHandle_t GPUContext::cublas_handle() const {
return impl_->GetBlasHandle();
}
solverHandle_t GPUContext::cusolver_dn_handle() const {
return impl_->GetSolverHandle();
}
sparseHandle_t GPUContext::cusparse_handle() const {
return impl_->GetSparseHandle();
}
void GPUContext::Wait() const { impl_->Wait(); }
void GPUContext::WaitEvent(gpuEvent_t ev) const { impl_->WaitEvent(ev); }
bool GPUContext::tensor_core_available() const {
return impl_->IsTensorCoreAvailable();
}
int GPUContext::GetComputeCapability() const {
return impl_->compute_capability_;
}
int GPUContext::GetMaxPhysicalThreadCount() const {
return impl_->multi_process_ * impl_->max_threads_per_mp_;
}
int GPUContext::GetSMCount() const { return impl_->multi_process_; }
int GPUContext::GetMaxThreadsPerBlock() const {
return impl_->max_threads_per_block_;
}
std::array<int, 3> GPUContext::GetCUDAMaxGridDimSize() const {
return impl_->max_grid_dim_size_;
}
Eigen::GpuDevice* GPUContext::eigen_device() const {
return impl_->eigen_device();
}
DnnWorkspaceHandle* GPUContext::cudnn_workspace_handle() {
return impl_->GetDnnWorkspace();
}
void GPUContext::CublasCall(
const std::function<void(blasHandle_t)>& callback) const {
impl_->CublasCall(callback);
}
void GPUContext::TensorCoreCublasCallIfAvailable(
const std::function<void(blasHandle_t)>& callback) const {
impl_->TensorCoreCublasCallIfAvailable(callback);
}
void GPUContext::CusparseCall(
const std::function<void(sparseHandle_t)>& callback) const {
impl_->CusparseCall(callback);
}
void GPUContext::RecordEvent(gpuEvent_t ev,
const std::function<void()>& callback) const {
impl_->RecordEvent(ev, callback);
}
void GPUContext::RecordEvent(gpuEvent_t ev) const { impl_->RecordEvent(ev); }
void GPUContext::AddStreamCallback(
const std::function<void()>& callback) const {
impl_->AddStreamCallback(callback);
}
void GPUContext::WaitStreamCallback() const { impl_->WaitStreamCallback(); }
ncclComm_t GPUContext::nccl_comm() const { return impl_->GetNcclComm(); }
void GPUContext::set_nccl_comm(ncclComm_t comm) { impl_->SetNcclComm(comm); }
void GPUContext::Init() {
impl_->allocator_ = const_cast<Allocator*>(&this->GetAllocator());
impl_->Init();
}
void GPUContext::SetStream(gpuStream_t stream) { impl_->SetStream(stream); }
void GPUContext::SetEigenDevice(Eigen::GpuDevice* device) {
impl_->SetEigenDevice(device);
}
void GPUContext::SetBlasHandle(blasHandle_t blas) {
impl_->SetBlasHandle(blas);
}
void GPUContext::SetDnnHandle(dnnHandle_t handle) {
impl_->SetDnnHandle(handle);
}
void GPUContext::SetSolverHandle(solverHandle_t handle) {
impl_->SetSolverHandle(handle);
}
void GPUContext::SetSparseHandle(sparseHandle_t handle) {
impl_->SetSparseHandle(handle);
}
void GPUContext::SetDnnWorkspaceHandle(DnnWorkspaceHandle* handle) {
impl_->workspace_ = handle;
}
void GPUContext::PartialInitWithoutAllocator() {
impl_->PartialInitWithoutAllocator();
}
void GPUContext::PartialInitWithAllocator() {
impl_->allocator_ = const_cast<Allocator*>(&this->GetAllocator());
impl_->PartialInitWithAllocator();
}
void GPUContext::SetComputeCapability(int val) {
impl_->compute_capability_ = val;
}
void GPUContext::SetMaxThreadsPerMultiProcessor(int val) {
impl_->max_threads_per_mp_ = val;
}
void GPUContext::SetMultiProcessors(int val) { impl_->multi_process_ = val; }
void GPUContext::SetMaxThreadsPerBlock(int val) {
impl_->max_threads_per_block_ = val;
}
void GPUContext::SetMaxGridDimSize(const std::array<int, 3>& val) {
impl_->max_grid_dim_size_ = val;
}
void GPUContext::SetDriverVersion(int val) { impl_->driver_version_ = val; }
void GPUContext::SetRuntimeVersion(int val) { impl_->runtime_version_ = val; }
} // namespace pten
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
...@@ -14,13 +14,162 @@ limitations under the License. */ ...@@ -14,13 +14,162 @@ limitations under the License. */
#pragma once #pragma once
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include <array>
#include <functional>
// See Note [ Why still include the fluid headers? ] #include "paddle/pten/backends/gpu/forwards.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/pten/backends/gpu/gpu_decls.h"
#include "paddle/pten/backends/gpu/gpu_helper.h"
#include "paddle/pten/common/place.h"
#include "paddle/pten/core/device_context.h"
namespace pten { namespace pten {
using GPUContext = paddle::platform::CUDADeviceContext;
} // namespace pten
#endif class DnnWorkspaceHandle;
class GPUContext : public DeviceContext {
public:
GPUContext();
explicit GPUContext(const GPUPlace& place);
virtual ~GPUContext();
/*! \brief Return place in the device context. */
const Place& GetPlace() const override;
/*! \brief Return gpu stream in the device context. */
gpuStream_t stream() const;
/*! \brief Return cudnn handle in the device context. */
dnnHandle_t cudnn_handle() const;
/*! \brief Return cublas handle in the device context. */
blasHandle_t cublas_handle() const;
/*! \brief Return cusolver handle in the device context. */
solverHandle_t cusolver_dn_handle() const;
/*! \brief Return cusparse handle in the device context. */
sparseHandle_t cusparse_handle() const;
/*! \brief Wait for all operations completion in the stream. */
void Wait() const override;
/*! \brief Wait for event in the stream. */
void WaitEvent(gpuEvent_t ev) const;
/*! \brief Check whether tensor core is supported */
bool tensor_core_available() const;
/*! \brief Return compute capability in the device context. */
int GetComputeCapability() const;
/*! \brief Return the max physical thread count in the device context */
int GetMaxPhysicalThreadCount() const;
/*! \brief Return the SM count in the device context */
int GetSMCount() const;
/*! \brief Return the max number of threads per block in the device context */
int GetMaxThreadsPerBlock() const;
/*! \brief Return the max grid dim size in the device context */
std::array<int, 3> GetCUDAMaxGridDimSize() const;
/*! \brief Return eigen device in the device context. */
Eigen::GpuDevice* eigen_device() const;
/*! \brief Return a cudnn workspace handle so that multiple cudnn
* functions can be called without being interrupted by other threads.
* Once the first cudnn function is called through the handle, a lock
* is acquired to prevent other threads from accessing the
* workspace. Once the handle is destructed, the lock is released.
*/
DnnWorkspaceHandle* cudnn_workspace_handle();
public:
/*! \brief Call cublas function safely. */
void CublasCall(const std::function<void(blasHandle_t)>&) const;
/*! \brief Call cublas function with Tensor Core safely. If
Tensor Core is not available, use DEFAULT_MATH instead. */
void TensorCoreCublasCallIfAvailable(
const std::function<void(blasHandle_t)>&) const;
/*! \brief Call cusparse function safely. */
void CusparseCall(const std::function<void(sparseHandle_t)>&) const;
void RecordEvent(gpuEvent_t ev, const std::function<void()>& callback) const;
void RecordEvent(gpuEvent_t ev) const;
void AddStreamCallback(const std::function<void()>& callback) const;
void WaitStreamCallback() const;
public:
/*! \brief Return nccl communicators. */
ncclComm_t nccl_comm() const;
/*! \brief Set nccl communicators. */
void set_nccl_comm(ncclComm_t comm);
public:
// NOTE: DeviceContext holds the resources. Used in training scenarios.
// With this interface, intended for the training scene, DeviceContext
// initializes all resources and deletes them when destructing.
// Note that you must set the Allocator before calling the Init function.
void Init();
// TODO(wilber): Why does the GetAllocator interface require a stream
// parameter?
// The temporary trick method bypasses this problem, and the following
// interfaces need to be deleted later.
// Note that this is a trick implementation, which can be used to partially
// initialize when the SetAllocator interface is not called.
void PartialInitWithoutAllocator();
// Note that this is a trick implementation that can be used to initialize
// resources that require an Allocator when the SetAllocator interface is
// called.
void PartialInitWithAllocator();
protected:
// NOTE: External users manage the resources. Used in inference scenarios.
// The Set interfaces are for inference only; DeviceContext marks the
// resources as external and will not delete any of them when destructing.
void SetStream(gpuStream_t);
void SetEigenDevice(Eigen::GpuDevice*);
void SetBlasHandle(blasHandle_t);
void SetDnnHandle(dnnHandle_t);
void SetSolverHandle(solverHandle_t);
void SetSparseHandle(sparseHandle_t);
void SetDnnWorkspaceHandle(DnnWorkspaceHandle*);
void SetComputeCapability(int val);
void SetMaxThreadsPerMultiProcessor(int val);
void SetMultiProcessors(int val);
void SetMaxThreadsPerBlock(int val);
void SetMaxGridDimSize(const std::array<int, 3>& val);
void SetDriverVersion(int val);
void SetRuntimeVersion(int val);
private:
struct Impl;
std::unique_ptr<Impl> impl_;
};
} // namespace pten
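// Illustrative sketch of how the two initialization paths above are intended
// to be used; anything not declared in this header (such as the allocator
// instance and its setter) is assumed here, not guaranteed by this file.
//
//   // Training: the context owns its resources. The allocator must be set on
//   // the DeviceContext base before Init() so the Eigen device and the cuDNN
//   // workspace can be created.
//   pten::GPUContext ctx(pten::GPUPlace(0));
//   /* set the device allocator on the DeviceContext base here */
//   ctx.Init();
//
//   // Inference: stream and handles are created externally and injected via
//   // the protected Set* interfaces (e.g. SetStream, SetBlasHandle) by a
//   // derived class, so the context marks them as external and never
//   // destroys them.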
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/pten/backends/gpu/forwards.h"
namespace pten {
#ifdef PADDLE_WITH_HIP
#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \
using GPU_TYPE = ROCM_TYPE;
#else // PADDLE_WITH_CUDA
#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \
using GPU_TYPE = CUDA_TYPE;
#endif
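// For example, DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t,
// rocblas_handle) expands to `using blasHandle_t = cublasHandle_t;` in a CUDA
// build and to `using blasHandle_t = rocblas_handle;` in a ROCm build, so the
// rest of pten can use a single set of gpu/dnn/blas aliases for both backends.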
DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t);
DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t);
DECLARE_TYPE_FOR_GPU(dnnActivationDescriptor,
cudnnActivationStruct,
miopenActivationDescriptor);
DECLARE_TYPE_FOR_GPU(dnnTensorDescriptor,
cudnnTensorStruct,
miopenTensorDescriptor);
DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor,
cudnnFilterStruct,
miopenTensorDescriptor);
DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor_t,
cudnnFilterDescriptor_t,
miopenTensorDescriptor_t);
DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor,
cudnnConvolutionStruct,
miopenConvolutionDescriptor);
DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor_t,
cudnnConvolutionDescriptor_t,
miopenConvolutionDescriptor_t);
DECLARE_TYPE_FOR_GPU(dnnPoolingDescriptor_t,
cudnnPoolingDescriptor_t,
miopenPoolingDescriptor_t);
DECLARE_TYPE_FOR_GPU(dnnDropoutDescriptor_t,
cudnnDropoutDescriptor_t,
miopenDropoutDescriptor_t);
DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t);
DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle);
DECLARE_TYPE_FOR_GPU(solverHandle_t, cusolverDnHandle_t, rocsolver_handle);
DECLARE_TYPE_FOR_GPU(sparseHandle_t, cusparseHandle_t, rocsparse_handle);
#undef DECLARE_TYPE_FOR_GPU
using CUDAGraphID = unsigned long long; // NOLINT
} // namespace pten
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#ifdef PADDLE_WITH_HIP
#include "paddle/pten/backends/gpu/rocm/rocm_helper.h"
#else
#include "paddle/pten/backends/gpu/cuda/cuda_helper.h"
#endif
#define CUDA_KERNEL_LOOP(i, num) CUDA_KERNEL_LOOP_TYPE(i, num, int)
#endif
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/gpu/gpu_info.h"
#include <vector>
#include "gflags/gflags.h"
DECLARE_string(selected_gpus);
namespace pten {
namespace backends {
namespace gpu {
static inline std::vector<std::string> Split(std::string const& original,
char separator) {
std::vector<std::string> results;
std::string token;
std::istringstream is(original);
while (std::getline(is, token, separator)) {
if (!token.empty()) {
results.push_back(token);
}
}
return results;
}
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetSelectedDevices() {
// use user specified GPUs in single-node multi-process mode.
std::vector<int> devices;
if (!FLAGS_selected_gpus.empty()) {
auto devices_str = Split(FLAGS_selected_gpus, ',');
for (auto id : devices_str) {
devices.push_back(atoi(id.c_str()));
}
} else {
int count = GetGPUDeviceCount();
for (int i = 0; i < count; ++i) {
devices.push_back(i);
}
}
return devices;
}
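// For example, with FLAGS_selected_gpus set to "0,2,3" the function above
// returns {0, 2, 3}; with the flag left empty it returns every id in
// [0, GetGPUDeviceCount()).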
} // namespace gpu
} // namespace backends
} // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include <stddef.h>
#include <array>
#include <string>
#include <vector>
#include "paddle/pten/backends/gpu/gpu_types.h"
namespace pten {
namespace backends {
namespace gpu {
//! Get the version of dnn
int DnnVersion();
//! Get the total number of GPU devices in system.
int GetGPUDeviceCount();
//! Get the compute capability of the ith GPU (format: major * 10 + minor)
int GetGPUComputeCapability(int id);
//! Get the runtime version of the ith GPU
int GetGPURuntimeVersion(int id);
//! Get the driver version of the ith GPU
int GetGPUDriverVersion(int id);
//! Whether the current device supports TensorCore
bool TensorCoreAvailable();
//! Get the MultiProcessors of the ith GPU.
int GetGPUMultiProcessors(int id);
//! Get the MaxThreads of each MultiProcessor of the ith GPU.
int GetGPUMaxThreadsPerMultiProcessor(int id);
//! Get the MaxThreads of each block of the ith GPU.
int GetGPUMaxThreadsPerBlock(int id);
//! Get the current GPU device id in system.
int GetCurrentDeviceId();
//! Get the maximum GridDim size for GPU buddy allocator.
std::array<int, 3> GetGpuMaxGridDimSize(int);
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetSelectedDevices();
//! Get the properties of the ith GPU device.
const gpuDeviceProp &GetDeviceProperties(int id);
//! Set the GPU device id for next execution.
void SetDeviceId(int device_id);
//! Copy memory from address src to dst asynchronously.
void GpuMemcpyAsync(void *dst,
const void *src,
size_t count,
gpuMemcpyKind kind,
gpuStream_t stream);
//! Copy memory from address src to dst synchronously.
void GpuMemcpySync(void *dst,
const void *src,
size_t count,
gpuMemcpyKind kind);
//! Copy memory from one device to another device asynchronously.
void GpuMemcpyPeerAsync(void *dst,
int dst_device,
const void *src,
int src_device,
size_t count,
gpuStream_t stream);
//! Copy memory from one device to another device synchronously.
void GpuMemcpyPeerSync(
void *dst, int dst_device, const void *src, int src_device, size_t count);
//! Set count bytes of memory at dst to value asynchronously
void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream);
//! Blocks until stream has completed all operations.
void GpuStreamSync(gpuStream_t stream);
void GpuDestroyStream(gpuStream_t stream);
//! Blocks until the device has completed all operations.
void GpuDeviceSync();
gpuError_t GpuGetLastError();
class GPUDeviceGuard {
public:
explicit inline GPUDeviceGuard(int dev_id) {
int prev_id = GetCurrentDeviceId();
if (prev_id != dev_id) {
prev_id_ = prev_id;
SetDeviceId(dev_id);
}
}
inline ~GPUDeviceGuard() {
if (prev_id_ != -1) {
SetDeviceId(prev_id_);
}
}
GPUDeviceGuard(const GPUDeviceGuard &o) = delete;
GPUDeviceGuard &operator=(const GPUDeviceGuard &o) = delete;
private:
int prev_id_{-1};
};
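// Illustrative sketch: ExampleMemsetOnDevice is a hypothetical helper showing
// the intended GPUDeviceGuard pattern. The guard switches to dev_id for the
// current scope and restores the previous device on destruction; the stream
// is assumed to have been created on dev_id.
inline void ExampleMemsetOnDevice(int dev_id,
                                  void *dst,
                                  size_t count,
                                  gpuStream_t stream) {
  GPUDeviceGuard guard(dev_id);
  GpuMemsetAsync(dst, 0, count, stream);
  GpuStreamSync(stream);
}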
} // namespace gpu
} // namespace backends
} // namespace pten
#endif
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Used for compute gpu launch parameter config
#pragma once
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h>
#else
#include <hip/hip_runtime.h>
#endif
#include <stddef.h>
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/pten/backends/gpu/gpu_context.h"
#ifdef __HIPCC__
// HIP results in an error or NaN if the block size is > 256
#define PREDEFINED_BLOCK_SIZE 256
#else
/* CUDA performs better when the number of threads
per block is in the range [64, 512] */
#define PREDEFINED_BLOCK_SIZE 512
#endif
namespace pten {
namespace backends {
namespace gpu {
inline int DivUp(int a, int b) { return (a + b - 1) / b; }
/* https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
for round integer value into next highest power of 2. */
static inline int RoundToPowerOfTwo(int n) {
n--;
n |= (n >> 1);
n |= (n >> 2);
n |= (n >> 4);
n |= (n >> 8);
n |= (n >> 16);
#ifdef __HIPCC__
return std::min(256, std::max(32, (n + 1)));
#else
return std::min(1024, std::max(32, (n + 1)));
#endif
}
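// For example, RoundToPowerOfTwo(300) returns 512 on CUDA and 256 on HIP
// (capped at 1024 and 256 respectively), while RoundToPowerOfTwo(7) returns
// 32 on both backends because of the lower bound of 32.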
#ifdef WITH_NV_JETSON
// In some cases the number of threads cannot be set to 1024 when the device
// is a Jetson Nano or TX2.
inline void ChangeThreadNum(const pten::GPUContext& context,
int* num_thread,
int alternative_num_thread = 512) {
if (context.GetComputeCapability() == 53 ||
context.GetComputeCapability() == 62) {
*num_thread = alternative_num_thread;
}
}
#endif
struct GpuLaunchConfig {
public:
GpuLaunchConfig() {}
size_t GetThreadNum() const { return GetBlockSize() * GetGridSize(); }
size_t GetGridSize() const {
return block_per_grid.x * block_per_grid.y * block_per_grid.z;
}
size_t GetBlockSize() const {
return thread_per_block.x * thread_per_block.y * thread_per_block.z;
}
int compute_capability = 0;
dim3 thread_per_block = dim3(1, 1, 1);
dim3 block_per_grid = dim3(1, 1, 1);
};
/* According to NVIDIA, if the number of threads per block is 64/128/256/512,
 * cuda performs better. And the number of blocks should be greater (at least
 * 2x~4x) than the number of SMs. Hence, the SM count is taken into account
 * within this function to determine the right number of threads per block. */
inline GpuLaunchConfig GetGpuLaunchConfig1D(const pten::GPUContext& context,
int64_t numel,
int vec_size = 1) {
PADDLE_ENFORCE_GT(numel,
0,
paddle::platform::errors::InvalidArgument(
"element quantity should be greater than 0,"
" but received value is: %d.",
numel));
// Get compute_capability
const int capability = context.GetComputeCapability();
/* If thread number per block is 64/128/256/512, cuda performs better.*/
int limit_threads =
std::min(PREDEFINED_BLOCK_SIZE, context.GetMaxThreadsPerBlock());
#ifdef WITH_NV_JETSON
if (capability == 53 || capability == 62) {
limit_threads = 512;
}
#endif
int threads = limit_threads;
int sm_count = context.GetSMCount();
int active_threads_num = numel / vec_size;
if (active_threads_num / (sm_count << 1) < limit_threads) {
// Round the thread number up to a power of 2, keeping the number of
// active blocks at about twice the SM count, to acquire better performance.
threads = RoundToPowerOfTwo(active_threads_num / (sm_count << 1));
} else if (active_threads_num / (sm_count << 2) < limit_threads) {
// Round the thread number up to a power of 2, keeping the number of
// active blocks at about 4 times the SM count, to acquire better performance.
threads = RoundToPowerOfTwo(active_threads_num / (sm_count << 2));
}
// The number of threads per block shall be at least 64.
threads = std::max(64, threads);
int blocks = DivUp(DivUp(numel, vec_size), threads);
GpuLaunchConfig config;
config.thread_per_block.x = threads;
config.block_per_grid.x = blocks;
config.compute_capability = capability;
return config;
}
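// Illustrative sketch: a caller is expected to turn the returned config into
// a kernel launch roughly as follows, where ExampleKernel stands for some
// hypothetical __global__ function:
//
//   auto config = GetGpuLaunchConfig1D(dev_ctx, numel);
//   ExampleKernel<<<config.block_per_grid, config.thread_per_block, 0,
//                   dev_ctx.stream()>>>(/* kernel args */ numel);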
inline GpuLaunchConfig GetGpuLaunchConfig2D(const pten::GPUContext& context,
int x_dim,
int y_dim) {
PADDLE_ENFORCE_GT(x_dim,
0,
paddle::platform::errors::InvalidArgument(
"x dim number should greater than 0,"
" but received value is: %d",
x_dim));
PADDLE_ENFORCE_GT(y_dim,
0,
paddle::platform::errors::InvalidArgument(
"y dim number should greater than 0,"
" but received value is: %d",
y_dim));
const int kThreadsPerBlock = 256;
int block_cols = (std::min)(x_dim, kThreadsPerBlock);
int block_rows = (std::max)(kThreadsPerBlock / block_cols, 1);
int max_physical_threads = context.GetMaxPhysicalThreadCount();
const int max_blocks = (std::max)(max_physical_threads / kThreadsPerBlock, 1);
GpuLaunchConfig config;
// Note: the block size is not aligned to 32; if needed, align it yourself.
config.thread_per_block = dim3(block_cols, block_rows, 1);
int grid_x = (std::min)(DivUp(x_dim, block_cols), max_blocks);
int grid_y =
(std::min)(max_blocks / grid_x, (std::max)(y_dim / block_rows, 1));
config.block_per_grid = dim3(grid_x, grid_y, 1);
return config;
}
} // namespace gpu
} // namespace backends
} // namespace pten
#endif
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/pten/backends/gpu/forwards.h"
#include "paddle/pten/backends/gpu/gpu_decls.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#ifdef PADDLE_WITH_HIP
#include "paddle/pten/backends/dynload/miopen.h"
#include "paddle/pten/backends/dynload/rocblas.h"
#else // PADDLE_WITH_CUDA
#include "paddle/pten/backends/dynload/cublas.h"
#include "paddle/pten/backends/dynload/cudnn.h"
#endif
namespace pten {
#ifdef PADDLE_WITH_HIP
#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \
using GPU_TYPE = ROCM_TYPE;
#else // PADDLE_WITH_CUDA
#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \
using GPU_TYPE = CUDA_TYPE;
#endif
DECLARE_TYPE_FOR_GPU(gpuError_t, cudaError_t, hipError_t);
DECLARE_TYPE_FOR_GPU(gpuMemcpyKind, cudaMemcpyKind, hipMemcpyKind);
DECLARE_TYPE_FOR_GPU(gpuDeviceProp, cudaDeviceProp, hipDeviceProp_t);
DECLARE_TYPE_FOR_GPU(dnnDataType_t, cudnnDataType_t, miopenDataType_t);
DECLARE_TYPE_FOR_GPU(dnnPoolingMode_t, cudnnPoolingMode_t, miopenPoolingMode_t);
DECLARE_TYPE_FOR_GPU(dnnTensorFormat_t,
cudnnTensorFormat_t,
miopenTensorFormat_t);
DECLARE_TYPE_FOR_GPU(dnnActivationMode_t,
cudnnActivationMode_t,
miopenActivationMode_t);
#undef DECLARE_TYPE_FOR_GPU
#ifdef PADDLE_WITH_HIP
#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \
constexpr auto GPU_CV = ROCM_CV;
#else // PADDLE_WITH_CUDA
#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \
constexpr auto GPU_CV = CUDA_CV;
#endif
DECLARE_CONSTANT_FOR_GPU(gpuErrorOutOfMemory,
cudaErrorMemoryAllocation,
hipErrorOutOfMemory);
DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, cudaErrorNotReady, hipErrorNotReady);
DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess);
#undef DECLARE_CONSTANT_FOR_GPU
} // namespace pten
#endif // defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
hip_library(pten_rocm_info SRCS rocm_info.cc DEPS gflags glog enforce pten_dynload_cuda)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
namespace pten {
namespace backends {
namespace gpu {
/*
* Summary: Grid stride looping macro in CUDA kernel
*
* [ Why need this macro? ]
*
* The original looping in CUDA kernel is:
*
* `for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
* i += blockDim.x * gridDim.x)`
*
* This for condition is risky. The value of `blockIdx.x * blockDim.x`
* may be large, e.g. over 1GB; the first iteration is fine, but when
* `i += blockDim.x * gridDim.x` is executed, the value of i can become
* greater than INT_MAX and overflow to a negative value. At that point
* the loop condition `i < (n)` is still satisfied, so the loop causes an
* illegal access to cuda memory.
*
* Here is a real example from ERNIE that triggers the above error.
* The related data are:
* - blockIdx.x = 2172938
* - blockDim.x = 512
* - blockIdx.x * blockDim.x = 1112543864
* - INT_MAX = 2147483647
*
* So we polish the for condition as follows; the int64_t __index__
* prevents overflow in the loop increment.
*
* Parameters:
* - i: loop index
* - num: total element numbers
*
* Examples:
* template <typename T>
* __global__ void Scale(T* logit_grad, const T* loss_grad, const int num,
* const int d, const int remain) {
* CUDA_KERNEL_LOOP(index, num) {
* int idx_n = index / d;
* int idx_remain = index % remain;
* logit_grad[index] *= loss_grad[idx_n * remain + idx_remain];
* }
* }
*
*/
#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \
int64_t __index__ = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; \
for (index_type i = __index__; __index__ < (num); \
__index__ += hipBlockDim_x * hipGridDim_x, i = __index__)
} // namespace gpu
} // namespace backends
} // namespace pten
...@@ -12,20 +12,20 @@ ...@@ -12,20 +12,20 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include <array>
#include "paddle/pten/backends/gpu/gpu_info.h"
// TODO(pten): remove fluid headers.
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/lock_guard_ptr.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/monitor.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/split.h"
static std::once_flag g_device_props_size_init_flag; static std::once_flag g_device_props_size_init_flag;
static std::vector<std::unique_ptr<std::once_flag>> g_device_props_init_flags; static std::vector<std::unique_ptr<std::once_flag>> g_device_props_init_flags;
static std::vector<paddle::gpuDeviceProp> g_device_props; static std::vector<paddle::gpuDeviceProp> g_device_props;
namespace paddle { namespace pten {
namespace platform { namespace backends {
namespace gpu {
int DnnVersion() { int DnnVersion() {
if (!dynload::HasCUDNN()) return -1; if (!dynload::HasCUDNN()) return -1;
size_t version_major, version_minor, version_patch; size_t version_major, version_minor, version_patch;
...@@ -78,11 +78,13 @@ int GetGPUDeviceCount() { ...@@ -78,11 +78,13 @@ int GetGPUDeviceCount() {
} }
int GetGPUComputeCapability(int id) { int GetGPUComputeCapability(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), PADDLE_ENFORCE_LT(id,
platform::errors::InvalidArgument( GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, " "Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.", "but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount())); id,
GetGPUDeviceCount()));
int major, minor; int major, minor;
auto major_error_code = hipDeviceGetAttribute( auto major_error_code = hipDeviceGetAttribute(
&major, hipDeviceAttributeComputeCapabilityMajor, id); &major, hipDeviceAttributeComputeCapabilityMajor, id);
...@@ -95,22 +97,26 @@ int GetGPUComputeCapability(int id) { ...@@ -95,22 +97,26 @@ int GetGPUComputeCapability(int id) {
} }
int GetGPURuntimeVersion(int id) { int GetGPURuntimeVersion(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), PADDLE_ENFORCE_LT(id,
platform::errors::InvalidArgument( GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, " "Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.", "but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount())); id,
GetGPUDeviceCount()));
int runtime_version = 0; int runtime_version = 0;
PADDLE_ENFORCE_GPU_SUCCESS(hipRuntimeGetVersion(&runtime_version)); PADDLE_ENFORCE_GPU_SUCCESS(hipRuntimeGetVersion(&runtime_version));
return runtime_version; return runtime_version;
} }
int GetGPUDriverVersion(int id) { int GetGPUDriverVersion(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), PADDLE_ENFORCE_LT(id,
platform::errors::InvalidArgument( GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, " "Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.", "but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount())); id,
GetGPUDeviceCount()));
int driver_version = 0; int driver_version = 0;
PADDLE_ENFORCE_GPU_SUCCESS(hipDriverGetVersion(&driver_version)); PADDLE_ENFORCE_GPU_SUCCESS(hipDriverGetVersion(&driver_version));
return driver_version; return driver_version;
...@@ -119,11 +125,13 @@ int GetGPUDriverVersion(int id) { ...@@ -119,11 +125,13 @@ int GetGPUDriverVersion(int id) {
bool TensorCoreAvailable() { return false; } bool TensorCoreAvailable() { return false; }
int GetGPUMultiProcessors(int id) { int GetGPUMultiProcessors(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), PADDLE_ENFORCE_LT(id,
platform::errors::InvalidArgument( GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, " "Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.", "but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount())); id,
GetGPUDeviceCount()));
int count; int count;
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
hipDeviceGetAttribute(&count, hipDeviceAttributeMultiprocessorCount, id)); hipDeviceGetAttribute(&count, hipDeviceAttributeMultiprocessorCount, id));
...@@ -131,11 +139,13 @@ int GetGPUMultiProcessors(int id) { ...@@ -131,11 +139,13 @@ int GetGPUMultiProcessors(int id) {
} }
int GetGPUMaxThreadsPerMultiProcessor(int id) { int GetGPUMaxThreadsPerMultiProcessor(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), PADDLE_ENFORCE_LT(id,
platform::errors::InvalidArgument( GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, " "Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.", "but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount())); id,
GetGPUDeviceCount()));
int count; int count;
PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceGetAttribute( PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceGetAttribute(
&count, hipDeviceAttributeMaxThreadsPerMultiProcessor, id)); &count, hipDeviceAttributeMaxThreadsPerMultiProcessor, id));
...@@ -144,11 +154,13 @@ int GetGPUMaxThreadsPerMultiProcessor(int id) { ...@@ -144,11 +154,13 @@ int GetGPUMaxThreadsPerMultiProcessor(int id) {
} }
int GetGPUMaxThreadsPerBlock(int id) { int GetGPUMaxThreadsPerBlock(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), PADDLE_ENFORCE_LT(id,
platform::errors::InvalidArgument( GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, " "Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.", "but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount())); id,
GetGPUDeviceCount()));
int count; int count;
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
hipDeviceGetAttribute(&count, hipDeviceAttributeMaxThreadsPerBlock, id)); hipDeviceGetAttribute(&count, hipDeviceAttributeMaxThreadsPerBlock, id));
...@@ -161,35 +173,37 @@ int GetCurrentDeviceId() { ...@@ -161,35 +173,37 @@ int GetCurrentDeviceId() {
return device_id; return device_id;
} }
dim3 GetGpuMaxGridDimSize(int id) { std::array<int, 3> GetGpuMaxGridDimSize(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), PADDLE_ENFORCE_LT(id,
platform::errors::InvalidArgument( GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, " "Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.", "but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount())); id,
dim3 ret; GetGPUDeviceCount()));
std::array<int, 3> ret;
int size; int size;
auto error_code_x = auto error_code_x =
hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimX, id); hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimX, id);
PADDLE_ENFORCE_GPU_SUCCESS(error_code_x); PADDLE_ENFORCE_GPU_SUCCESS(error_code_x);
ret.x = size; ret[0] = size;
auto error_code_y = auto error_code_y =
hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimY, id); hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimY, id);
PADDLE_ENFORCE_GPU_SUCCESS(error_code_y); PADDLE_ENFORCE_GPU_SUCCESS(error_code_y);
ret.y = size; ret[1] = size;
auto error_code_z = auto error_code_z =
hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimZ, id); hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimZ, id);
PADDLE_ENFORCE_GPU_SUCCESS(error_code_z); PADDLE_ENFORCE_GPU_SUCCESS(error_code_z);
ret.z = size; ret[2] = size;
return ret; return ret;
} }
const gpuDeviceProp &GetDeviceProperties(int id) { const gpuDeviceProp &GetDeviceProperties(int id) {
std::call_once(g_device_props_size_init_flag, [&] { std::call_once(g_device_props_size_init_flag, [&] {
int gpu_num = 0; int gpu_num = 0;
gpu_num = platform::GetGPUDeviceCount(); gpu_num = GetGPUDeviceCount();
g_device_props_init_flags.resize(gpu_num); g_device_props_init_flags.resize(gpu_num);
g_device_props.resize(gpu_num); g_device_props.resize(gpu_num);
for (int i = 0; i < gpu_num; ++i) { for (int i = 0; i < gpu_num; ++i) {
...@@ -198,16 +212,17 @@ const gpuDeviceProp &GetDeviceProperties(int id) { ...@@ -198,16 +212,17 @@ const gpuDeviceProp &GetDeviceProperties(int id) {
}); });
if (id == -1) { if (id == -1) {
id = platform::GetCurrentDeviceId(); id = GetCurrentDeviceId();
} }
if (id < 0 || id >= static_cast<int>(g_device_props.size())) { if (id < 0 || id >= static_cast<int>(g_device_props.size())) {
PADDLE_THROW(platform::errors::OutOfRange( PADDLE_THROW(paddle::platform::errors::OutOfRange(
"The device id %d is out of range [0, %d), where %d is the number of " "The device id %d is out of range [0, %d), where %d is the number of "
"devices on this machine. Because the device id should be greater than " "devices on this machine. Because the device id should be greater than "
"or equal to zero and smaller than the number of gpus. Please input " "or equal to zero and smaller than the number of gpus. Please input "
"appropriate device again!", "appropriate device again!",
id, static_cast<int>(g_device_props.size()), id,
static_cast<int>(g_device_props.size()),
static_cast<int>(g_device_props.size()))); static_cast<int>(g_device_props.size())));
} }
...@@ -220,32 +235,43 @@ const gpuDeviceProp &GetDeviceProperties(int id) { ...@@ -220,32 +235,43 @@ const gpuDeviceProp &GetDeviceProperties(int id) {
void SetDeviceId(int id) { void SetDeviceId(int id) {
// TODO(qijun): find a better way to cache the cuda device count // TODO(qijun): find a better way to cache the cuda device count
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), PADDLE_ENFORCE_LT(id,
platform::errors::InvalidArgument( GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, " "Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.", "but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount())); id,
GetGPUDeviceCount()));
PADDLE_RETRY_CUDA_SUCCESS(hipSetDevice(id)); PADDLE_RETRY_CUDA_SUCCESS(hipSetDevice(id));
} }
void GpuMemcpyAsync(void *dst, const void *src, size_t count, void GpuMemcpyAsync(void *dst,
gpuMemcpyKind kind, gpuStream_t stream) { const void *src,
size_t count,
gpuMemcpyKind kind,
gpuStream_t stream) {
PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(dst, src, count, kind, stream)); PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(dst, src, count, kind, stream));
} }
void GpuMemcpySync(void *dst, const void *src, size_t count, void GpuMemcpySync(void *dst,
const void *src,
size_t count,
gpuMemcpyKind kind) { gpuMemcpyKind kind) {
PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpy(dst, src, count, kind)); PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpy(dst, src, count, kind));
} }
void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, void GpuMemcpyPeerAsync(void *dst,
int src_device, size_t count, gpuStream_t stream) { int dst_device,
const void *src,
int src_device,
size_t count,
gpuStream_t stream) {
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
hipMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream)); hipMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream));
} }
void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src, void GpuMemcpyPeerSync(
int src_device, size_t count) { void *dst, int dst_device, const void *src, int src_device, size_t count) {
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
hipMemcpyPeer(dst, dst_device, src, src_device, count)); hipMemcpyPeer(dst, dst_device, src, src_device, count));
} }
...@@ -265,5 +291,7 @@ void GpuDestroyStream(gpuStream_t stream) { ...@@ -265,5 +291,7 @@ void GpuDestroyStream(gpuStream_t stream) {
void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); } void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); }
gpuError_t GpuGetLastError() { return hipGetLastError(); } gpuError_t GpuGetLastError() { return hipGetLastError(); }
} // namespace platform
} // namespace paddle } // namespace gpu
} // namespace backends
} // namespace pten
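For reference, the ROCm half of this patch moves these helpers from paddle::platform into pten::backends::gpu and changes GetGpuMaxGridDimSize to return std::array<int, 3> instead of dim3. A minimal caller-side sketch; the header path and the PrintGridLimits wrapper are assumptions for illustration, not part of the patch:

#include <array>
#include <cstdio>
#include "paddle/pten/backends/gpu/gpu_info.h"  // assumed header for these helpers

void PrintGridLimits(int device_id) {
  // {max_grid_dim_x, max_grid_dim_y, max_grid_dim_z} for the given device.
  std::array<int, 3> limits =
      pten::backends::gpu::GetGpuMaxGridDimSize(device_id);
  std::printf("grid limits: x=%d y=%d z=%d\n", limits[0], limits[1], limits[2]);
}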
...@@ -13,8 +13,11 @@ ...@@ -13,8 +13,11 @@
// limitations under the License. // limitations under the License.
#include "paddle/pten/backends/xpu/xpu_context.h" #include "paddle/pten/backends/xpu/xpu_context.h"
#include <memory> #include <memory>
#include "paddle/pten/api/ext/exception.h" #include "paddle/pten/api/ext/exception.h"
#include "paddle/pten/common/place.h"
#include "xpu/runtime.h" #include "xpu/runtime.h"
#include "xpu/runtime_ex.h" #include "xpu/runtime_ex.h"
...@@ -24,12 +27,11 @@ namespace xpu = baidu::xpu::api; ...@@ -24,12 +27,11 @@ namespace xpu = baidu::xpu::api;
namespace pten { namespace pten {
struct XPUContext::XPUImpl { struct XPUContext::Impl {
void SetL3Cache() { void SetL3Cache(int l3_size = 14155776) {
const int MAX_XPU_NUM = 16; const int MAX_XPU_NUM = 16;
static void* l3ptrs[MAX_XPU_NUM] = {nullptr}; static void* l3ptrs[MAX_XPU_NUM] = {nullptr};
int l3_size = 13.5 * 1024 * 1024;
if (std::getenv("XPU_PADDLE_L3_SIZE") != nullptr) { if (std::getenv("XPU_PADDLE_L3_SIZE") != nullptr) {
l3_size = atoi(std::getenv("XPU_PADDLE_L3_SIZE")); l3_size = atoi(std::getenv("XPU_PADDLE_L3_SIZE"));
} }
...@@ -52,48 +54,28 @@ struct XPUContext::XPUImpl { ...@@ -52,48 +54,28 @@ struct XPUContext::XPUImpl {
} }
} }
XPUImpl() { Impl() : place_(XPUPlace()) {}
context_ = xpu::create_context();
xpu_version_ = backends::xpu::get_xpu_version(place_.device);
}
explicit XPUImpl(XPUPlace place) : place_(place) {
backends::xpu::XPUDeviceGuard guard(place_.GetDeviceId());
LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: "
<< static_cast<int>(place_.device);
context_ = xpu::create_context();
xpu_version_ = backends::xpu::get_xpu_version(place_.device);
SetL3Cache();
}
// Users need to manage external resources. explicit Impl(const Place& place) : place_(place) {}
explicit XPUImpl(const XPUContextResource& ctx_res,
const XPUPlace& place = XPUPlace(0))
: res_(ctx_res), place_(place) {
context_ = res_.context;
xpu_version_ = backends::xpu::get_xpu_version(place_.device);
SetL3Cache();
}
~XPUImpl() { ~Impl() {
if (res_.context == nullptr && context_ != nullptr) { if (owned_ && context_ != nullptr) {
xpu::destroy_context(context_); xpu::destroy_context(context_);
context_ = nullptr; context_ = nullptr;
} }
} }
Place GetPlace() const { return place_; } const Place& GetPlace() const { return place_; }
backends::xpu::XPUVersion GetXpuVersion() const { return xpu_version_; }
xpu::Context* GetXContext() const { xpu::Context* GetXContext() const {
PD_CHECK(context_ != nullptr, "the xpu context is nullptr."); PD_CHECK(context_ != nullptr, "the xpu context is nullptr.");
return context_; return context_;
} }
xpu::BKCLContext_t GetBkclContext() const { return bkcl_context_; } xpu::BKCLContext_t GetBkclContext() const {
PD_CHECK(bkcl_context_ != nullptr, "the xpu bkcl_context is nullptr.");
return bkcl_context_;
}
void Wait() const { void Wait() const {
backends::xpu::SetXPUDeviceId(place_.GetDeviceId()); backends::xpu::SetXPUDeviceId(place_.GetDeviceId());
...@@ -101,53 +83,41 @@ struct XPUContext::XPUImpl { ...@@ -101,53 +83,41 @@ struct XPUContext::XPUImpl {
xpu_wait(context_->xpu_stream); xpu_wait(context_->xpu_stream);
} }
void SetXContext(xpu::Context* context) { void Init() {
if (context == nullptr) { owned_ = true;
return; backends::xpu::XPUDeviceGuard guard(place_.GetDeviceId());
} LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: "
res_.context = context; << static_cast<int>(place_.device);
context_ = context; context_ = xpu::create_context();
xpu_version_ = backends::xpu::get_xpu_version(place_.device);
SetL3Cache();
} }
void SetXContext(xpu::Context* context) { context_ = context; }
void SetBkclContext(xpu::BKCLContext_t context) { bkcl_context_ = context; } void SetBkclContext(xpu::BKCLContext_t context) { bkcl_context_ = context; }
XPUContextResource res_; bool owned_{false};
XPUPlace place_; Place place_;
backends::xpu::XPUVersion xpu_version_; backends::xpu::XPUVersion xpu_version_;
xpu::Context* context_{nullptr}; xpu::Context* context_{nullptr};
// NOTE: Distributed communicator, distributed framework manages its // NOTE: Distributed communicator, distributed framework manages its
// resources, XPUContext only holds references. // resources, XPUContext only holds references.
xpu::BKCLContext_t bkcl_context_{nullptr}; xpu::BKCLContext_t bkcl_context_{nullptr};
}; };
XPUContext::XPUContext() : DeviceContext() { XPUContext::XPUContext() : DeviceContext(), impl_(std::make_unique<Impl>()) {}
impl_ = std::make_unique<XPUImpl>();
}
XPUContext::XPUContext(const XPUPlace& place) { XPUContext::XPUContext(const XPUPlace& place)
impl_ = std::make_unique<XPUImpl>(place); : DeviceContext(), impl_(std::make_unique<Impl>(place)) {}
}
XPUContext::XPUContext(const XPUContext& other) : DeviceContext() {
impl_ = std::make_unique<XPUImpl>();
impl_->SetXContext(other.x_context());
impl_->SetBkclContext(other.bkcl_context());
}
XPUContext::XPUContext(XPUContext&& other) : DeviceContext() {
impl_ = std::move(other.impl_);
}
XPUContext::~XPUContext() = default; XPUContext::~XPUContext() = default;
XPUContext::XPUContext(const XPUContextResource& ctx_res) : DeviceContext() { const Place& XPUContext::GetPlace() const { return impl_->GetPlace(); }
impl_ = std::make_unique<XPUImpl>(ctx_res);
}
Place XPUContext::GetPlace() const { return impl_->GetPlace(); }
backends::xpu::XPUVersion XPUContext::xpu_version() const { backends::xpu::XPUVersion XPUContext::xpu_version() const {
return impl_->GetXpuVersion(); return impl_->xpu_version_;
} }
xpu::Context* XPUContext::x_context() const { return impl_->GetXContext(); } xpu::Context* XPUContext::x_context() const { return impl_->GetXContext(); }
...@@ -158,12 +128,16 @@ xpu::BKCLContext_t XPUContext::bkcl_context() const { ...@@ -158,12 +128,16 @@ xpu::BKCLContext_t XPUContext::bkcl_context() const {
void XPUContext::Wait() const { impl_->Wait(); } void XPUContext::Wait() const { impl_->Wait(); }
void XPUContext::set_x_context(xpu::Context* context) { void XPUContext::SetXContext(xpu::Context* context) {
impl_->SetXContext(context); impl_->SetXContext(context);
} }
void XPUContext::set_bkcl_context(xpu::BKCLContext_t context) { void XPUContext::SetL3Cache(int l3_size) { impl_->SetL3Cache(l3_size); }
void XPUContext::SetBkclContext(xpu::BKCLContext_t context) {
impl_->SetBkclContext(context); impl_->SetBkclContext(context);
} }
void XPUContext::Init() { impl_->Init(); }
} // namespace pten } // namespace pten
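One behavioural detail from the hunk above: the hard-coded `int l3_size = 13.5 * 1024 * 1024;` becomes a default argument of 14155776 bytes (the same 13.5 MB), and XPU_PADDLE_L3_SIZE still overrides it at runtime. A hedged sketch of both knobs, assuming SetL3Cache may be called again after Init():

#include <cstdlib>
#include "paddle/pten/backends/xpu/xpu_context.h"

void ConfigureXpuL3Cache() {
  pten::XPUContext ctx(pten::XPUPlace(0));
  // Init() creates the owned xpu::Context and applies the default L3 size.
  ctx.Init();
  // Shrink the L3 workspace to 8 MB; exporting XPU_PADDLE_L3_SIZE=8388608
  // before Init() would have a similar effect.
  ctx.SetL3Cache(8 * 1024 * 1024);
}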
...@@ -26,26 +26,15 @@ namespace xpu = baidu::xpu::api; ...@@ -26,26 +26,15 @@ namespace xpu = baidu::xpu::api;
namespace pten { namespace pten {
struct XPUContextResource {
xpu::Context* context{nullptr};
};
class XPUContext : public DeviceContext { class XPUContext : public DeviceContext {
public: public:
// NOTE: DeviceContext hold resources. Used in training scenarios.
XPUContext(); XPUContext();
explicit XPUContext(const XPUPlace&); explicit XPUContext(const XPUPlace&);
// NOTE: Share the same underlying resources, please ensure that resources are
// not released.
XPUContext(const XPUContext&);
XPUContext(XPUContext&&);
virtual ~XPUContext(); virtual ~XPUContext();
Place GetPlace() const override; const Place& GetPlace() const override;
backends::xpu::XPUVersion xpu_version() const; backends::xpu::XPUVersion xpu_version() const;
...@@ -53,21 +42,28 @@ class XPUContext : public DeviceContext { ...@@ -53,21 +42,28 @@ class XPUContext : public DeviceContext {
// Return bkcl context. // Return bkcl context.
xpu::BKCLContext_t bkcl_context() const; xpu::BKCLContext_t bkcl_context() const;
void SetBkclContext(xpu::BKCLContext_t context);
// Wait for all operations completion in the stream. // Wait for all operations completion in the stream.
void Wait() const override; void Wait() const override;
public: public:
// NOTE: External users manage resources. Used in inference scenarios. // NOTE: DeviceContext hold resources. Used in training scenarios.
explicit XPUContext(const XPUContextResource&); // The interface used by the training scene, DeviceContext will initialize
// all resources and delete them when destructing.
void Init();
void set_x_context(xpu::Context*); public:
// NOTE: External users manage resources. Used in inference scenarios.
// The Set interface is for inference only, DeviceContext will mark the
// resource as external, and will not delete any resource when destructing.
void SetXContext(xpu::Context*);
void set_bkcl_context(xpu::BKCLContext_t context); void SetL3Cache(int l3_size = 14155776);
private: private:
struct XPUImpl; struct Impl;
std::unique_ptr<XPUImpl> impl_; std::unique_ptr<Impl> impl_;
}; };
} // namespace pten } // namespace pten
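The reworked header separates the two ownership modes spelled out in the comments above. A rough usage sketch; the external xpu::Context and BKCL handle are stand-ins supplied by the caller:

#include "paddle/pten/backends/xpu/xpu_context.h"

// Training-style: the context creates and later destroys its own resources.
void TrainingStyle() {
  pten::XPUContext ctx(pten::XPUPlace(0));
  ctx.Init();
  ctx.Wait();
}

// Inference-style: resources stay owned by the caller and are only referenced.
void InferenceStyle(xpu::Context* external_ctx, xpu::BKCLContext_t bkcl) {
  pten::XPUContext ctx;
  ctx.SetXContext(external_ctx);
  ctx.SetBkclContext(bkcl);
}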
...@@ -23,7 +23,7 @@ struct DeviceContext::Impl { ...@@ -23,7 +23,7 @@ struct DeviceContext::Impl {
Impl() = default; Impl() = default;
~Impl() = default; ~Impl() = default;
void SetDeviceAllocator(const Allocator* allocator) { void SetAllocator(const Allocator* allocator) {
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
allocator, allocator,
pten::errors::InvalidArgument( pten::errors::InvalidArgument(
...@@ -47,7 +47,7 @@ struct DeviceContext::Impl { ...@@ -47,7 +47,7 @@ struct DeviceContext::Impl {
zero_allocator_ = allocator; zero_allocator_ = allocator;
} }
const Allocator& GetDeviceAllocator() const { const Allocator& GetAllocator() const {
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
device_allocator_, device_allocator_,
pten::errors::InvalidArgument("Required device_allocator_ shall not be " pten::errors::InvalidArgument("Required device_allocator_ shall not be "
...@@ -124,7 +124,7 @@ DeviceContext::DeviceContext() { impl_ = std::make_unique<Impl>(); } ...@@ -124,7 +124,7 @@ DeviceContext::DeviceContext() { impl_ = std::make_unique<Impl>(); }
DeviceContext::DeviceContext(const DeviceContext& other) { DeviceContext::DeviceContext(const DeviceContext& other) {
impl_->SetHostAllocator(&other.GetHostAllocator()); impl_->SetHostAllocator(&other.GetHostAllocator());
impl_->SetDeviceAllocator(&other.GetDeviceAllocator()); impl_->SetAllocator(&other.GetAllocator());
impl_->SetZeroAllocator(&other.GetZeroAllocator()); impl_->SetZeroAllocator(&other.GetZeroAllocator());
} }
...@@ -134,12 +134,12 @@ DeviceContext::DeviceContext(DeviceContext&& other) { ...@@ -134,12 +134,12 @@ DeviceContext::DeviceContext(DeviceContext&& other) {
DeviceContext::~DeviceContext() = default; DeviceContext::~DeviceContext() = default;
void DeviceContext::SetDeviceAllocator(const Allocator* allocator) { void DeviceContext::SetAllocator(const Allocator* allocator) {
impl_->SetDeviceAllocator(allocator); impl_->SetAllocator(allocator);
} }
const Allocator& DeviceContext::GetDeviceAllocator() const { const Allocator& DeviceContext::GetAllocator() const {
return impl_->GetDeviceAllocator(); return impl_->GetAllocator();
} }
void DeviceContext::SetHostAllocator(const Allocator* allocator) { void DeviceContext::SetHostAllocator(const Allocator* allocator) {
......
...@@ -60,7 +60,7 @@ class DeviceContext { ...@@ -60,7 +60,7 @@ class DeviceContext {
* *
* @param allocator * @param allocator
*/ */
void SetDeviceAllocator(const Allocator*); void SetAllocator(const Allocator*);
/** /**
* @brief Set the host Allocator object. * @brief Set the host Allocator object.
...@@ -81,7 +81,7 @@ class DeviceContext { ...@@ -81,7 +81,7 @@ class DeviceContext {
* *
* @return Allocator * @return Allocator
*/ */
const Allocator& GetDeviceAllocator() const; const Allocator& GetAllocator() const;
/** /**
* @brief Get the const device-related Allocator object. * @brief Get the const device-related Allocator object.
...@@ -114,7 +114,7 @@ class DeviceContext { ...@@ -114,7 +114,7 @@ class DeviceContext {
// TODO(wilber): Just for the convenience of migrating the code, it will be // TODO(wilber): Just for the convenience of migrating the code, it will be
// modified or removed later. // modified or removed later.
virtual Place GetPlace() const = 0; virtual const Place& GetPlace() const = 0;
// TODO(wilber): The fluid framework uses wait() in many places, how to delete // TODO(wilber): The fluid framework uses wait() in many places, how to delete
// this API interface. // this API interface.
virtual void Wait() const {} virtual void Wait() const {}
......
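The net effect on callers of DeviceContext is a rename: SetDeviceAllocator/GetDeviceAllocator become SetAllocator/GetAllocator, while the host and zero-size accessors keep their names (as the copy constructor above suggests). A small sketch, assuming the header lives at paddle/pten/core/device_context.h and that `alloc` outlives the context:

#include "paddle/pten/core/device_context.h"  // assumed path

void WireAllocators(pten::DeviceContext* ctx, const pten::Allocator* alloc) {
  ctx->SetAllocator(alloc);       // was SetDeviceAllocator before this patch
  ctx->SetHostAllocator(alloc);
  ctx->SetZeroAllocator(alloc);
  const pten::Allocator& device_alloc = ctx->GetAllocator();  // was GetDeviceAllocator
  (void)device_alloc;
}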
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#ifdef __HIPCC__ #ifdef __HIPCC__
#define ELEMENTWISE_BLOCK_SIZE 256 #define ELEMENTWISE_BLOCK_SIZE 256
...@@ -31,7 +32,7 @@ namespace funcs { ...@@ -31,7 +32,7 @@ namespace funcs {
* 2x~4x) than number of SMs. Hence, SM count is took into account within * 2x~4x) than number of SMs. Hence, SM count is took into account within
* this function to determine the right number of threads per block. * this function to determine the right number of threads per block.
*/ */
inline int GetThreadsConfig(const paddle::platform::CUDADeviceContext &ctx, inline int GetThreadsConfig(const pten::GPUContext &ctx,
int64_t numel, int64_t numel,
int vec_size) { int vec_size) {
int threads = ELEMENTWISE_BLOCK_SIZE; int threads = ELEMENTWISE_BLOCK_SIZE;
......
...@@ -23,8 +23,8 @@ limitations under the License. */ ...@@ -23,8 +23,8 @@ limitations under the License. */
#if defined(__NVCC__) || defined(__HIPCC__) #if defined(__NVCC__) || defined(__HIPCC__)
#include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/aligned_vector.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/function_traits.h" #include "paddle/fluid/platform/function_traits.h"
#include "paddle/pten/backends/gpu/gpu_launch_config.h"
#include "paddle/pten/kernels/primitive/kernel_primitives.h" #include "paddle/pten/kernels/primitive/kernel_primitives.h"
namespace kps = pten::kps; namespace kps = pten::kps;
...@@ -646,7 +646,8 @@ void ElementwiseCudaKernel(const KPDevice &ctx, ...@@ -646,7 +646,8 @@ void ElementwiseCudaKernel(const KPDevice &ctx,
VecSize><<<grid_size, block_size, 0, stream>>>( VecSize><<<grid_size, block_size, 0, stream>>>(
ins_data, outs_data, numel, main_offset, func); ins_data, outs_data, numel, main_offset, func);
#else #else
auto gpu_config = GetGpuLaunchConfig1D(ctx, numel, VecSize); auto gpu_config =
pten::backends::gpu::GetGpuLaunchConfig1D(ctx, numel, VecSize);
int main_offset = (numel / (VecSize * gpu_config.GetBlockSize())) * VecSize * int main_offset = (numel / (VecSize * gpu_config.GetBlockSize())) * VecSize *
gpu_config.GetBlockSize(); gpu_config.GetBlockSize();
auto stream = ctx.stream(); auto stream = ctx.stream();
......
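With the include switched from fluid's gpu_launch_config.h to paddle/pten/backends/gpu/gpu_launch_config.h, the 1-D config helper is now reached through the pten::backends::gpu namespace. A call-shape sketch mirroring the hunk above; kVecSize and the surrounding function are illustrative:

#include <cstdint>
#include "paddle/pten/backends/gpu/gpu_launch_config.h"

template <typename Context>
void LaunchConfigExample(const Context& ctx, int64_t numel) {
  constexpr int kVecSize = 4;  // illustrative vector width
  auto config =
      pten::backends::gpu::GetGpuLaunchConfig1D(ctx, numel, kVecSize);
  // Offset consumed by the vectorized main loop, as in ElementwiseCudaKernel.
  int64_t main_offset = (numel / (kVecSize * config.GetBlockSize())) *
                        kVecSize * config.GetBlockSize();
  (void)main_offset;
}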
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "paddle/pten/backends/gpu/gpu_context.h" #include "paddle/pten/backends/gpu/gpu_context.h"
#include "paddle/pten/core/ddim.h" #include "paddle/pten/core/ddim.h"
......
...@@ -237,12 +237,11 @@ __global__ void SplitKernel(const T* input_data, ...@@ -237,12 +237,11 @@ __global__ void SplitKernel(const T* input_data,
SplitKernelDetail<T>(input_data, in_row, in_col, fixed_out_col, outputs_data); SplitKernelDetail<T>(input_data, in_row, in_col, fixed_out_col, outputs_data);
} }
static inline void GetBlockDims( static inline void GetBlockDims(const pten::GPUContext& context,
const paddle::platform::CUDADeviceContext& context, int64_t num_rows,
int64_t num_rows, int64_t num_cols,
int64_t num_cols, dim3* block_dims,
dim3* block_dims, dim3* grid_dims) {
dim3* grid_dims) {
// Set the thread block and grid according to CurrentDeviceId // Set the thread block and grid according to CurrentDeviceId
const int kThreadsPerBlock = 1024; const int kThreadsPerBlock = 1024;
int block_cols = kThreadsPerBlock; int block_cols = kThreadsPerBlock;
......
...@@ -87,9 +87,7 @@ void Copy(const Context& dev_ctx, ...@@ -87,9 +87,7 @@ void Copy(const Context& dev_ctx,
ctx_gpu_place)); ctx_gpu_place));
auto stream = auto stream =
blocking ? nullptr blocking ? nullptr
: reinterpret_cast<const paddle::platform::CUDADeviceContext&>( : reinterpret_cast<const pten::GPUContext&>(dev_ctx).stream();
dev_ctx)
.stream();
paddle::memory::Copy( paddle::memory::Copy(
dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
} else if (paddle::platform::is_cpu_place(src_place) && // NOLINT } else if (paddle::platform::is_cpu_place(src_place) && // NOLINT
...@@ -113,9 +111,7 @@ void Copy(const Context& dev_ctx, ...@@ -113,9 +111,7 @@ void Copy(const Context& dev_ctx,
ctx_gpu_place)); ctx_gpu_place));
auto stream = auto stream =
blocking ? nullptr blocking ? nullptr
: reinterpret_cast<const paddle::platform::CUDADeviceContext&>( : reinterpret_cast<const pten::GPUContext&>(dev_ctx).stream();
dev_ctx)
.stream();
paddle::memory::Copy( paddle::memory::Copy(
dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream); dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
} else if (paddle::platform::is_gpu_place(src_place) && // NOLINT } else if (paddle::platform::is_gpu_place(src_place) && // NOLINT
...@@ -140,9 +136,7 @@ void Copy(const Context& dev_ctx, ...@@ -140,9 +136,7 @@ void Copy(const Context& dev_ctx,
ctx_gpu_place.device)); ctx_gpu_place.device));
auto stream = auto stream =
blocking ? nullptr blocking ? nullptr
: reinterpret_cast<const paddle::platform::CUDADeviceContext&>( : reinterpret_cast<const pten::GPUContext&>(dev_ctx).stream();
dev_ctx)
.stream();
paddle::memory::Copy( paddle::memory::Copy(
dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size, stream); dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
} else if (paddle::platform::is_cuda_pinned_place(src_place) && // NOLINT } else if (paddle::platform::is_cuda_pinned_place(src_place) && // NOLINT
...@@ -167,9 +161,7 @@ void Copy(const Context& dev_ctx, ...@@ -167,9 +161,7 @@ void Copy(const Context& dev_ctx,
ctx_gpu_place.device)); ctx_gpu_place.device));
auto stream = auto stream =
blocking ? nullptr blocking ? nullptr
: reinterpret_cast<const paddle::platform::CUDADeviceContext&>( : reinterpret_cast<const pten::GPUContext&>(dev_ctx).stream();
dev_ctx)
.stream();
paddle::memory::Copy( paddle::memory::Copy(
dst_gpu_place, dst_ptr, src_cuda_pinned_place, src_ptr, size, stream); dst_gpu_place, dst_ptr, src_cuda_pinned_place, src_ptr, size, stream);
} else if (paddle::platform::is_gpu_place(src_place) && // NOLINT } else if (paddle::platform::is_gpu_place(src_place) && // NOLINT
...@@ -185,9 +177,7 @@ void Copy(const Context& dev_ctx, ...@@ -185,9 +177,7 @@ void Copy(const Context& dev_ctx,
ctx_place)); ctx_place));
auto stream = auto stream =
blocking ? nullptr blocking ? nullptr
: reinterpret_cast<const paddle::platform::CUDADeviceContext&>( : reinterpret_cast<const pten::GPUContext&>(dev_ctx).stream();
dev_ctx)
.stream();
if (paddle::platform::is_same_place(src_place, dst_place)) { if (paddle::platform::is_same_place(src_place, dst_place)) {
paddle::memory::Copy( paddle::memory::Copy(
dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
......
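Every branch of the Copy kernel now obtains its stream by casting the generic context to pten::GPUContext rather than to paddle::platform::CUDADeviceContext. The repeated expression, pulled out as a sketch helper (the name PickCopyStream and the header paths are invented here):

#include "paddle/pten/backends/gpu/gpu_context.h"
#include "paddle/pten/core/device_context.h"  // assumed path

gpuStream_t PickCopyStream(const pten::DeviceContext& dev_ctx, bool blocking) {
  // Blocking copies fall back to the null stream, matching the hunks above.
  return blocking
             ? nullptr
             : reinterpret_cast<const pten::GPUContext&>(dev_ctx).stream();
}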
...@@ -2020,7 +2020,7 @@ void default_elementwise_add_grad(const GPUContext &ctx, ...@@ -2020,7 +2020,7 @@ void default_elementwise_add_grad(const GPUContext &ctx,
T, T,
kps::AddFunctor, kps::AddFunctor,
kps::IdentityFunctor<T>>( kps::IdentityFunctor<T>>(
dout, dx, kps::IdentityFunctor<T>(), reduce_dims, stream); ctx, dout, dx, kps::IdentityFunctor<T>(), reduce_dims, stream);
} }
} }
// dy // dy
...@@ -2038,7 +2038,7 @@ void default_elementwise_add_grad(const GPUContext &ctx, ...@@ -2038,7 +2038,7 @@ void default_elementwise_add_grad(const GPUContext &ctx,
T, T,
kps::AddFunctor, kps::AddFunctor,
kps::IdentityFunctor<T>>( kps::IdentityFunctor<T>>(
dout, dy, kps::IdentityFunctor<T>(), reduce_dims, stream); ctx, dout, dy, kps::IdentityFunctor<T>(), reduce_dims, stream);
} }
} }
} }
...@@ -2137,7 +2137,7 @@ void default_elementwise_sub_grad(const GPUContext &ctx, ...@@ -2137,7 +2137,7 @@ void default_elementwise_sub_grad(const GPUContext &ctx,
T, T,
kps::AddFunctor, kps::AddFunctor,
kps::IdentityFunctor<T>>( kps::IdentityFunctor<T>>(
dout, dx, kps::IdentityFunctor<T>(), reduce_dims, stream); ctx, dout, dx, kps::IdentityFunctor<T>(), reduce_dims, stream);
} }
} }
// dy // dy
...@@ -2161,7 +2161,7 @@ void default_elementwise_sub_grad(const GPUContext &ctx, ...@@ -2161,7 +2161,7 @@ void default_elementwise_sub_grad(const GPUContext &ctx,
T, T,
kps::AddFunctor, kps::AddFunctor,
kps::InverseFunctor<T>>( kps::InverseFunctor<T>>(
dout, dy, kps::InverseFunctor<T>(), reduce_dims, stream); ctx, dout, dy, kps::InverseFunctor<T>(), reduce_dims, stream);
} }
} }
} }
......
...@@ -1064,7 +1064,8 @@ template <typename Tx, ...@@ -1064,7 +1064,8 @@ template <typename Tx,
typename Ty, typename Ty,
template <typename> class ReduceOp, template <typename> class ReduceOp,
typename TransformOp> typename TransformOp>
void TensorReduceFunctorImpl(const pten::DenseTensor& x, void TensorReduceFunctorImpl(const pten::GPUContext& dev_ctx,
const pten::DenseTensor& x,
pten::DenseTensor* y, pten::DenseTensor* y,
const TransformOp& transform, const TransformOp& transform,
const std::vector<int>& origin_reduce_dims, const std::vector<int>& origin_reduce_dims,
...@@ -1088,13 +1089,11 @@ void TensorReduceFunctorImpl(const pten::DenseTensor& x, ...@@ -1088,13 +1089,11 @@ void TensorReduceFunctorImpl(const pten::DenseTensor& x,
auto x_data = x.data<Tx>(); auto x_data = x.data<Tx>();
auto y_data = y->data<Ty>(); auto y_data = y->data<Ty>();
auto* dev_ctx = static_cast<paddle::platform::CUDADeviceContext*>(
paddle::platform::DeviceContextPool::Instance().Get(x.place()));
if (config.reduce_num == 1) { if (config.reduce_num == 1) {
std::vector<const DenseTensor*> inputs = {&x}; std::vector<const DenseTensor*> inputs = {&x};
std::vector<DenseTensor*> outputs = {y}; std::vector<DenseTensor*> outputs = {y};
funcs::LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary, Tx, Ty>( funcs::LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary, Tx, Ty>(
*dev_ctx, inputs, &outputs, transform); dev_ctx, inputs, &outputs, transform);
return; return;
} }
...@@ -1244,13 +1243,23 @@ void Reduce(const GPUContext& dev_ctx, ...@@ -1244,13 +1243,23 @@ void Reduce(const GPUContext& dev_ctx,
data_t, data_t,
ReduceOp, ReduceOp,
TransformOp<T, MPType>>( TransformOp<T, MPType>>(
x, out, TransformOp<T, MPType>(reduce_num), reduce_dims, stream); dev_ctx,
x,
out,
TransformOp<T, MPType>(reduce_num),
reduce_dims,
stream);
})); }));
} else { } else {
using MPType = typename kps::details::MPTypeTrait<T>::Type; using MPType = typename kps::details::MPTypeTrait<T>::Type;
pten::kernels:: pten::kernels::
TensorReduceFunctorImpl<T, T, ReduceOp, TransformOp<T, MPType>>( TensorReduceFunctorImpl<T, T, ReduceOp, TransformOp<T, MPType>>(
x, out, TransformOp<T, MPType>(reduce_num), reduce_dims, stream); dev_ctx,
x,
out,
TransformOp<T, MPType>(reduce_num),
reduce_dims,
stream);
} }
} }
} // namespace pten } // namespace pten
......
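TensorReduceFunctorImpl now receives the GPU context explicitly instead of fetching a CUDADeviceContext from the global DeviceContextPool. A call-shape sketch only; includes are omitted and the float instantiation and SumOverDims wrapper are illustrative:

void SumOverDims(const pten::GPUContext& dev_ctx,
                 const pten::DenseTensor& x,
                 pten::DenseTensor* out,
                 const std::vector<int>& reduce_dims) {
  pten::kernels::TensorReduceFunctorImpl<float,
                                         float,
                                         kps::AddFunctor,
                                         kps::IdentityFunctor<float>>(
      dev_ctx, x, out, kps::IdentityFunctor<float>(), reduce_dims,
      dev_ctx.stream());
}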
...@@ -60,9 +60,11 @@ struct ReduceSumForMatmulGrad<GPUContext, T> { ...@@ -60,9 +60,11 @@ struct ReduceSumForMatmulGrad<GPUContext, T> {
DenseTensor* output, DenseTensor* output,
const std::vector<int>& reduce_dims) { const std::vector<int>& reduce_dims) {
auto stream = dev_ctx.stream(); auto stream = dev_ctx.stream();
kernels:: kernels::TensorReduceFunctorImpl<T,
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>( T,
input, output, kps::IdentityFunctor<T>(), reduce_dims, stream); kps::AddFunctor,
kps::IdentityFunctor<T>>(
dev_ctx, input, output, kps::IdentityFunctor<T>(), reduce_dims, stream);
} }
}; };
#endif #endif
......
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
#include "paddle/pten/api/include/api.h" #include "paddle/pten/api/include/api.h"
#include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/api/lib/utils/allocator.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/core/kernel_registry.h" #include "paddle/pten/core/kernel_registry.h"
#include "paddle/pten/kernels/copy_kernel.h" #include "paddle/pten/kernels/copy_kernel.h"
...@@ -122,7 +123,7 @@ TEST(API, matmul_cuda) { ...@@ -122,7 +123,7 @@ TEST(API, matmul_cuda) {
auto& pool = paddle::platform::DeviceContextPool::Instance(); auto& pool = paddle::platform::DeviceContextPool::Instance();
auto place = paddle::platform::CUDAPlace(); auto place = paddle::platform::CUDAPlace();
auto* dev_ctx = pool.GetByPlace(place); auto* dev_ctx = static_cast<const pten::GPUContext*>(pool.GetByPlace(place));
pten::Copy(*dev_ctx, *ref_x.get(), false, dense_x.get()); pten::Copy(*dev_ctx, *ref_x.get(), false, dense_x.get());
pten::Copy(*dev_ctx, *ref_y.get(), false, dense_y.get()); pten::Copy(*dev_ctx, *ref_y.get(), false, dense_y.get());
......
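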
...@@ -47,6 +47,7 @@ TEST(API, to_sparse_coo) { ...@@ -47,6 +47,7 @@ TEST(API, to_sparse_coo) {
std::copy(&dense_data[0][0], &dense_data[0][0] + 9, dense_x_data); std::copy(&dense_data[0][0], &dense_data[0][0] + 9, dense_x_data);
pten::CPUContext dev_ctx_cpu; pten::CPUContext dev_ctx_cpu;
dev_ctx_cpu.Init();
// 1. test dense_to_sparse_coo // 1. test dense_to_sparse_coo
paddle::experimental::Tensor x(dense_x); paddle::experimental::Tensor x(dense_x);
......
...@@ -25,43 +25,29 @@ limitations under the License. */ ...@@ -25,43 +25,29 @@ limitations under the License. */
namespace pten { namespace pten {
namespace tests { namespace tests {
class InferenceCPUContext : public CPUContext {
public:
void SetEigenDevice(Eigen::DefaultDevice* eigen_device) {
CPUContext::SetEigenDevice(eigen_device);
}
};
TEST(DeviceContext, cpu_context) { TEST(DeviceContext, cpu_context) {
std::cout << "test training scenarios" << std::endl; std::cout << "test training scenarios" << std::endl;
{ {
pten::CPUContext ctx; pten::CPUContext ctx;
ctx.Init();
EXPECT_TRUE(ctx.eigen_device() != nullptr); EXPECT_TRUE(ctx.eigen_device() != nullptr);
} }
std::cout << "test inference scenarios" << std::endl; std::cout << "test inference scenarios" << std::endl;
Eigen::DefaultDevice* device = new Eigen::DefaultDevice(); Eigen::DefaultDevice* device = new Eigen::DefaultDevice();
{ {
pten::CPUContextResource ctx_res{device}; InferenceCPUContext ctx;
pten::CPUContext ctx(ctx_res);
EXPECT_TRUE(ctx.eigen_device() != nullptr);
}
{
pten::CPUContextResource ctx_res{nullptr};
pten::CPUContext ctx(ctx_res);
ctx.SetEigenDevice(device); ctx.SetEigenDevice(device);
EXPECT_TRUE(ctx.eigen_device() != nullptr); EXPECT_TRUE(ctx.eigen_device() != nullptr);
} }
delete device; delete device;
std::cout << "test copy constructor" << std::endl;
{
pten::CPUContext ctx1;
pten::CPUContext ctx2(ctx1);
EXPECT_EQ(ctx1.eigen_device(), ctx2.eigen_device());
}
std::cout << "test move constructor" << std::endl;
{
pten::CPUContext ctx1 = pten::CPUContext();
auto* eigen_device1 = ctx1.eigen_device();
pten::CPUContext ctx2(std::move(ctx1));
auto* eigen_device2 = ctx2.eigen_device();
EXPECT_EQ(eigen_device1, eigen_device2);
}
} }
} // namespace tests } // namespace tests
......
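The rewritten test implies that CPUContext::SetEigenDevice is no longer part of the public surface and is reached through a thin subclass in inference-style code. If the setter is protected, which the forwarding wrapper above suggests, an equivalent one-liner subclass would be:

#include "paddle/pten/backends/cpu/cpu_context.h"  // assumed header

class MyInferenceCPUContext : public pten::CPUContext {
 public:
  using pten::CPUContext::SetEigenDevice;  // re-expose the protected setter
};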
...@@ -50,10 +50,10 @@ TEST(DEV_API, cast) { ...@@ -50,10 +50,10 @@ TEST(DEV_API, cast) {
} }
pten::CPUContext dev_ctx; pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator( dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace())
.GetAllocator(paddle::platform::CPUPlace()) .get());
.get()); dev_ctx.Init();
pten::DataType out_dtype = pten::DataType::FLOAT64; pten::DataType out_dtype = pten::DataType::FLOAT64;
// 2. test API // 2. test API
......
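The same three-line setup (default-construct, attach the facade allocator, then Init) repeats across the device API tests below; as a standalone helper sketch with assumed header paths:

#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/pten/backends/cpu/cpu_context.h"  // assumed header

void InitTestCpuContext(pten::CPUContext* dev_ctx) {
  dev_ctx->SetAllocator(
      paddle::memory::allocation::AllocatorFacade::Instance()
          .GetAllocator(paddle::platform::CPUPlace())
          .get());
  dev_ctx->Init();  // finish initialization after the allocator is attached
}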
...@@ -59,10 +59,10 @@ TEST(DEV_API, concat) { ...@@ -59,10 +59,10 @@ TEST(DEV_API, concat) {
// 2. test API // 2. test API
pten::CPUContext dev_ctx; pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator( dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace())
.GetAllocator(paddle::platform::CPUPlace()) .get());
.get()); dev_ctx.Init();
auto out = pten::Concat<float>(dev_ctx, inputs, 0); auto out = pten::Concat<float>(dev_ctx, inputs, 0);
// 3. check result // 3. check result
......
...@@ -46,10 +46,10 @@ TEST(DEV_API, conj) { ...@@ -46,10 +46,10 @@ TEST(DEV_API, conj) {
} }
pten::CPUContext dev_ctx; pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator( dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace())
.GetAllocator(paddle::platform::CPUPlace()) .get());
.get()); dev_ctx.Init();
// 2. test API // 2. test API
auto out = pten::Conj<paddle::complex64>(dev_ctx, dense_x); auto out = pten::Conj<paddle::complex64>(dev_ctx, dense_x);
......
...@@ -58,10 +58,10 @@ TEST(DEV_API, copy) { ...@@ -58,10 +58,10 @@ TEST(DEV_API, copy) {
std::cout << typeid(a).name() << std::endl; std::cout << typeid(a).name() << std::endl;
// 2. test API // 2. test API
pten::CPUContext dev_ctx; pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator( dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace())
.GetAllocator(paddle::platform::CPUPlace()) .get());
.get()); dev_ctx.Init();
pten::Copy(dev_ctx, *(dense_src.get()), false, dense_dst.get()); pten::Copy(dev_ctx, *(dense_src.get()), false, dense_dst.get());
// 3. check result // 3. check result
......
...@@ -33,10 +33,10 @@ using DDim = pten::framework::DDim; ...@@ -33,10 +33,10 @@ using DDim = pten::framework::DDim;
TEST(DEV_API, empty) { TEST(DEV_API, empty) {
// 1. create input // 1. create input
pten::CPUContext dev_ctx; pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator( dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace())
.GetAllocator(paddle::platform::CPUPlace()) .get());
.get()); dev_ctx.Init();
// 2. test API // 2. test API
auto out = pten::Empty<float>(dev_ctx, {3, 2}, pten::DataType::INT32); auto out = pten::Empty<float>(dev_ctx, {3, 2}, pten::DataType::INT32);
...@@ -64,10 +64,10 @@ TEST(DEV_API, empty_like) { ...@@ -64,10 +64,10 @@ TEST(DEV_API, empty_like) {
// 2. test API // 2. test API
pten::CPUContext dev_ctx; pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator( dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace())
.GetAllocator(paddle::platform::CPUPlace()) .get());
.get()); dev_ctx.Init();
auto out = pten::EmptyLike<float>(dev_ctx, dense_x); auto out = pten::EmptyLike<float>(dev_ctx, dense_x);
// 3. check result // 3. check result
...@@ -84,10 +84,10 @@ TEST(DEV_API, full) { ...@@ -84,10 +84,10 @@ TEST(DEV_API, full) {
// 2. test API // 2. test API
pten::CPUContext dev_ctx; pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator( dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace())
.GetAllocator(paddle::platform::CPUPlace()) .get());
.get()); dev_ctx.Init();
auto out = pten::Full<float>(dev_ctx, {3, 2}, val, pten::DataType::FLOAT32); auto out = pten::Full<float>(dev_ctx, {3, 2}, val, pten::DataType::FLOAT32);
// 3. check result // 3. check result
...@@ -118,10 +118,10 @@ TEST(DEV_API, full_like) { ...@@ -118,10 +118,10 @@ TEST(DEV_API, full_like) {
float val = 1.0; float val = 1.0;
pten::CPUContext dev_ctx; pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator( dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace())
.GetAllocator(paddle::platform::CPUPlace()) .get());
.get()); dev_ctx.Init();
// 2. test API // 2. test API
auto out = pten::FullLike<float>(dev_ctx, dense_x, val); auto out = pten::FullLike<float>(dev_ctx, dense_x, val);
......
...@@ -60,10 +60,10 @@ TEST(DEV_API, dot) { ...@@ -60,10 +60,10 @@ TEST(DEV_API, dot) {
// 2. test API // 2. test API
pten::CPUContext dev_ctx; pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator( dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace())
.GetAllocator(paddle::platform::CPUPlace()) .get());
.get()); dev_ctx.Init();
auto out = pten::Dot<float>(dev_ctx, dense_x, dense_y); auto out = pten::Dot<float>(dev_ctx, dense_x, dense_y);
// 3. check result // 3. check result
......
...@@ -62,10 +62,10 @@ TEST(DEV_API, add) { ...@@ -62,10 +62,10 @@ TEST(DEV_API, add) {
// 2. test API // 2. test API
pten::CPUContext dev_ctx; pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator( dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace())
.GetAllocator(paddle::platform::CPUPlace()) .get());
.get()); dev_ctx.Init();
auto dense_out = pten::Add<float>(dev_ctx, dense_x, dense_y); auto dense_out = pten::Add<float>(dev_ctx, dense_x, dense_y);
// 3. check result // 3. check result
...@@ -116,10 +116,10 @@ TEST(DEV_API, subtract) { ...@@ -116,10 +116,10 @@ TEST(DEV_API, subtract) {
// 2. test API // 2. test API
pten::CPUContext dev_ctx; pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator( dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace())
.GetAllocator(paddle::platform::CPUPlace()) .get());
.get()); dev_ctx.Init();
auto dense_out = pten::Subtract<float>(dev_ctx, dense_x, dense_y); auto dense_out = pten::Subtract<float>(dev_ctx, dense_x, dense_y);
// 3. check result // 3. check result
...@@ -170,10 +170,10 @@ TEST(DEV_API, divide) { ...@@ -170,10 +170,10 @@ TEST(DEV_API, divide) {
// 2. test API // 2. test API
pten::CPUContext dev_ctx; pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator( dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace())
.GetAllocator(paddle::platform::CPUPlace()) .get());
.get()); dev_ctx.Init();
auto dense_out = pten::Divide<float>(dev_ctx, dense_x, dense_y); auto dense_out = pten::Divide<float>(dev_ctx, dense_x, dense_y);
// 3. check result // 3. check result
...@@ -224,10 +224,10 @@ TEST(DEV_API, multiply) { ...@@ -224,10 +224,10 @@ TEST(DEV_API, multiply) {
// 2. test API // 2. test API
pten::CPUContext dev_ctx; pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator( dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace())
.GetAllocator(paddle::platform::CPUPlace()) .get());
.get()); dev_ctx.Init();
auto dense_out = pten::Multiply<float>(dev_ctx, dense_x, dense_y); auto dense_out = pten::Multiply<float>(dev_ctx, dense_x, dense_y);
// 3. check result // 3. check result
......
...@@ -56,10 +56,10 @@ TEST(DEV_API, flatten) { ...@@ -56,10 +56,10 @@ TEST(DEV_API, flatten) {
} }
int start_axis = 1, stop_axis = 2; int start_axis = 1, stop_axis = 2;
pten::CPUContext dev_ctx; pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator( dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace())
.GetAllocator(paddle::platform::CPUPlace()) .get());
.get()); dev_ctx.Init();
// 2. test API // 2. test API
auto out = pten::Flatten<float>(dev_ctx, dense_x, start_axis, stop_axis); auto out = pten::Flatten<float>(dev_ctx, dense_x, start_axis, stop_axis);
......
...@@ -55,10 +55,10 @@ TEST(DEV_API, dot) { ...@@ -55,10 +55,10 @@ TEST(DEV_API, dot) {
// 2. test API // 2. test API
pten::CPUContext dev_ctx; pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator( dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace())
.GetAllocator(paddle::platform::CPUPlace()) .get());
.get()); dev_ctx.Init();
auto out = Matmul<float, CPUContext>(dev_ctx, dense_x, dense_y, false, false); auto out = Matmul<float, CPUContext>(dev_ctx, dense_x, dense_y, false, false);
// 3. check result // 3. check result
......
...@@ -49,10 +49,10 @@ TEST(DEV_API, mean) { ...@@ -49,10 +49,10 @@ TEST(DEV_API, mean) {
// 2. test API // 2. test API
pten::CPUContext dev_ctx; pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator( dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace())
.GetAllocator(paddle::platform::CPUPlace()) .get());
.get()); dev_ctx.Init();
auto out = pten::Mean<float>(dev_ctx, dense_x, dims, false); auto out = pten::Mean<float>(dev_ctx, dense_x, dims, false);
// 3. check result // 3. check result
......
...@@ -48,10 +48,10 @@ TEST(DEV_API, reshape) { ...@@ -48,10 +48,10 @@ TEST(DEV_API, reshape) {
// 2. test API // 2. test API
pten::CPUContext dev_ctx; pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator( dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace())
.GetAllocator(paddle::platform::CPUPlace()) .get());
.get()); dev_ctx.Init();
auto out = pten::Reshape<float>(dev_ctx, dense_x, shape); auto out = pten::Reshape<float>(dev_ctx, dense_x, shape);
// 3. check result // 3. check result
std::vector<int64_t> expect_shape = {12, 3}; std::vector<int64_t> expect_shape = {12, 3};
......
...@@ -49,10 +49,11 @@ TEST(DEV_API, scale) { ...@@ -49,10 +49,11 @@ TEST(DEV_API, scale) {
// 2. test API // 2. test API
pten::CPUContext dev_ctx; pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator( dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace())
.GetAllocator(paddle::platform::CPUPlace()) .get());
.get()); dev_ctx.Init();
auto out = auto out =
pten::Scale<float>(dev_ctx, dense_x, scale, bias, bias_after_scale); pten::Scale<float>(dev_ctx, dense_x, scale, bias, bias_after_scale);
...@@ -92,10 +93,11 @@ TEST(DEV_API, scale_host) { ...@@ -92,10 +93,11 @@ TEST(DEV_API, scale_host) {
// 2. test API // 2. test API
pten::CPUContext dev_ctx; pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator( dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace())
.GetAllocator(paddle::platform::CPUPlace()) .get());
.get()); dev_ctx.Init();
auto out = auto out =
pten::Scale<float>(dev_ctx, dense_x, scale, bias, bias_after_scale); pten::Scale<float>(dev_ctx, dense_x, scale, bias, bias_after_scale);
......
...@@ -15,6 +15,8 @@ limitations under the License. */ ...@@ -15,6 +15,8 @@ limitations under the License. */
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <memory> #include <memory>
#include "paddle/pten/backends/gpu/gpu_context.h"
#include "paddle/pten/common/place.h"
#include "paddle/pten/kernels/copy_kernel.h" #include "paddle/pten/kernels/copy_kernel.h"
#include "paddle/pten/kernels/sparse/sparse_utils_kernel.h" #include "paddle/pten/kernels/sparse/sparse_utils_kernel.h"
...@@ -22,6 +24,8 @@ limitations under the License. */ ...@@ -22,6 +24,8 @@ limitations under the License. */
#include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/core/kernel_registry.h" #include "paddle/pten/core/kernel_registry.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
namespace pten { namespace pten {
namespace tests { namespace tests {
...@@ -38,9 +42,8 @@ inline void CheckResult( ...@@ -38,9 +42,8 @@ inline void CheckResult(
ASSERT_EQ(coo.nnz(), non_zero_num); ASSERT_EQ(coo.nnz(), non_zero_num);
#if defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_CUDA)
if (coo.place() == paddle::platform::CUDAPlace()) { if (coo.place() == pten::GPUPlace()) {
const auto* dev_ctx_cuda = const auto* dev_ctx_cuda = static_cast<const pten::GPUContext*>(dev_ctx);
static_cast<const paddle::platform::CUDADeviceContext*>(dev_ctx);
DenseTensor indices( DenseTensor indices(
alloc.get(), alloc.get(),
DenseTensorMeta( DenseTensorMeta(
...@@ -86,6 +89,8 @@ void TestDenseToSparseCoo(const DenseTensor& dense_x, ...@@ -86,6 +89,8 @@ void TestDenseToSparseCoo(const DenseTensor& dense_x,
paddle::platform::CPUPlace()); paddle::platform::CPUPlace());
pten::CPUContext dev_ctx_cpu; pten::CPUContext dev_ctx_cpu;
dev_ctx_cpu.Init();
// 1. test cpu // 1. test cpu
auto cpu_sparse_out = auto cpu_sparse_out =
sparse::DenseToSparseCoo<T>(dev_ctx_cpu, dense_x, sparse_dim); sparse::DenseToSparseCoo<T>(dev_ctx_cpu, dense_x, sparse_dim);
...@@ -98,9 +103,21 @@ void TestDenseToSparseCoo(const DenseTensor& dense_x, ...@@ -98,9 +103,21 @@ void TestDenseToSparseCoo(const DenseTensor& dense_x,
// 2. test cuda // 2. test cuda
#if defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_CUDA)
paddle::platform::DeviceContextPool& pool = // paddle::platform::DeviceContextPool& pool =
paddle::platform::DeviceContextPool::Instance(); // paddle::platform::DeviceContextPool::Instance();
auto* dev_ctx_cuda = pool.GetByPlace(paddle::platform::CUDAPlace()); // auto* dev_ctx_cuda = pool.GetByPlace(paddle::platform::CUDAPlace());
pten::GPUContext dev_ctx_gpu;
dev_ctx_gpu.PartialInitWithoutAllocator();
dev_ctx_gpu.SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(dev_ctx_gpu.GetPlace(), dev_ctx_gpu.stream())
.get());
dev_ctx_gpu.SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(pten::CPUPlace())
.get());
dev_ctx_gpu.PartialInitWithAllocator();
const auto cuda_alloc = const auto cuda_alloc =
std::make_shared<paddle::experimental::DefaultAllocator>( std::make_shared<paddle::experimental::DefaultAllocator>(
paddle::platform::CUDAPlace()); paddle::platform::CUDAPlace());
...@@ -108,10 +125,10 @@ void TestDenseToSparseCoo(const DenseTensor& dense_x, ...@@ -108,10 +125,10 @@ void TestDenseToSparseCoo(const DenseTensor& dense_x,
cuda_alloc.get(), cuda_alloc.get(),
DenseTensorMeta(dense_x.dtype(), dense_x.dims(), dense_x.layout())); DenseTensorMeta(dense_x.dtype(), dense_x.dims(), dense_x.layout()));
pten::Copy(*dev_ctx_cuda, dense_x, true, &d_dense_x); pten::Copy(dev_ctx_gpu, dense_x, true, &d_dense_x);
auto sparse_out = auto sparse_out =
sparse::DenseToSparseCoo<T>(*dev_ctx_cuda, d_dense_x, sparse_dim); sparse::DenseToSparseCoo<T>(dev_ctx_gpu, d_dense_x, sparse_dim);
CheckResult<T, int64_t>(dev_ctx_cuda, CheckResult<T, int64_t>(&dev_ctx_gpu,
sparse_out, sparse_out,
non_zero_data, non_zero_data,
indices_data, indices_data,
...@@ -295,20 +312,32 @@ void TestSparseCsrToCoo(const DDim& dense_dims, ...@@ -295,20 +312,32 @@ void TestSparseCsrToCoo(const DDim& dense_dims,
alloc); alloc);
// 2. test cuda // 2. test cuda
#if defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_CUDA)
pten::GPUContext dev_ctx_gpu;
dev_ctx_gpu.PartialInitWithoutAllocator();
dev_ctx_gpu.SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(dev_ctx_gpu.GetPlace(), dev_ctx_gpu.stream())
.get());
dev_ctx_gpu.SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(pten::CPUPlace())
.get());
dev_ctx_gpu.PartialInitWithAllocator();
const auto cuda_alloc = const auto cuda_alloc =
std::make_shared<paddle::experimental::DefaultAllocator>( std::make_shared<paddle::experimental::DefaultAllocator>(
paddle::platform::CUDAPlace()); paddle::platform::CUDAPlace());
auto& pool = paddle::platform::DeviceContextPool::Instance(); // auto& pool = paddle::platform::DeviceContextPool::Instance();
auto* dev_ctx_cuda = pool.GetByPlace(paddle::platform::CUDAPlace()); // auto* dev_ctx_cuda = pool.GetByPlace(paddle::platform::CUDAPlace());
pten::DenseTensor d_crows(cuda_alloc.get(), crows_meta); pten::DenseTensor d_crows(cuda_alloc.get(), crows_meta);
pten::DenseTensor d_cols(cuda_alloc.get(), cols_meta); pten::DenseTensor d_cols(cuda_alloc.get(), cols_meta);
pten::DenseTensor d_values(cuda_alloc.get(), values_meta); pten::DenseTensor d_values(cuda_alloc.get(), values_meta);
pten::Copy(*dev_ctx_cuda, crows, true, &d_crows); pten::Copy(dev_ctx_gpu, crows, true, &d_crows);
pten::Copy(*dev_ctx_cuda, cols, true, &d_cols); pten::Copy(dev_ctx_gpu, cols, true, &d_cols);
pten::Copy(*dev_ctx_cuda, values, true, &d_values); pten::Copy(dev_ctx_gpu, values, true, &d_values);
pten::SparseCsrTensor d_csr(d_crows, d_cols, d_values, dense_dims); pten::SparseCsrTensor d_csr(d_crows, d_cols, d_values, dense_dims);
auto cuda_sparse_out = sparse::SparseCsrToCoo<T>(*dev_ctx_cuda, d_csr); auto cuda_sparse_out = sparse::SparseCsrToCoo<T>(dev_ctx_gpu, d_csr);
CheckResult<T, int64_t>(dev_ctx_cuda, CheckResult<T, int64_t>(&dev_ctx_gpu,
cuda_sparse_out, cuda_sparse_out,
non_zero_data, non_zero_data,
indices_data, indices_data,
......
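The GPU-side tests replace the pooled CUDADeviceContext with a locally constructed pten::GPUContext initialized in two phases. The recurring block, as a helper sketch (names are illustrative):

#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/pten/backends/gpu/gpu_context.h"

void InitTestGpuContext(pten::GPUContext* dev_ctx) {
  // Phase 1: set up place/stream so GetPlace() and stream() are usable.
  dev_ctx->PartialInitWithoutAllocator();
  dev_ctx->SetAllocator(
      paddle::memory::allocation::AllocatorFacade::Instance()
          .GetAllocator(dev_ctx->GetPlace(), dev_ctx->stream())
          .get());
  dev_ctx->SetHostAllocator(
      paddle::memory::allocation::AllocatorFacade::Instance()
          .GetAllocator(pten::CPUPlace())
          .get());
  // Phase 2: finish the parts that need the allocators in place.
  dev_ctx->PartialInitWithAllocator();
}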
...@@ -47,10 +47,11 @@ TEST(DEV_API, sum) { ...@@ -47,10 +47,11 @@ TEST(DEV_API, sum) {
std::vector<int64_t> axis = {0, 1}; std::vector<int64_t> axis = {0, 1};
pten::CPUContext dev_ctx; pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator( dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace())
.GetAllocator(paddle::platform::CPUPlace()) .get());
.get()); dev_ctx.Init();
// 2. test API // 2. test API
auto out = auto out =
pten::Sum<float>(dev_ctx, dense_x, axis, pten::DataType::FLOAT32, false); pten::Sum<float>(dev_ctx, dense_x, axis, pten::DataType::FLOAT32, false);
......