Unverified commit a821c4a9, authored by Wilber, committed by GitHub

[PTEN] Add Gpu context (#39305)

Parent dcff7fa8
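
The test-side changes in this diff apply the same context-initialization boilerplate over and over. As a reading aid, here is a minimal sketch of that pattern collected into one hypothetical helper; every API name is taken from the hunks below, but the helper itself is not part of the commit.

// Hypothetical helper, for illustration only: the CUDADeviceContext setup
// this commit adds to tests before the context is first used.
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/device_context.h"

void InitTestGpuContext(paddle::platform::CUDADeviceContext* ctx,
                        const paddle::platform::CUDAPlace& place) {
  // Device allocator, bound to the context's CUDA stream.
  ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                        .GetAllocator(place, ctx->stream())
                        .get());
  // Host-side allocator (CPU place).
  ctx->SetHostAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                            .GetAllocator(paddle::platform::CPUPlace())
                            .get());
  // Allocator used for zero-sized allocations.
  ctx->SetZeroAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                            .GetZeroAllocator(place)
                            .get());
  // Complete the allocator-dependent part of initialization.
  ctx->PartialInitWithAllocator();
}
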
@@ -33,7 +33,7 @@ namespace distributed {
template <typename T>
inline paddle::operators::math::BlasT<paddle::platform::CPUDeviceContext, T>
GetBlas() {
-  auto cpu_ctx = paddle::platform::CPUDeviceContext();
+  paddle::platform::CPUDeviceContext cpu_ctx;
  return paddle::operators::math::GetBlas<paddle::platform::CPUDeviceContext,
                                          T>(cpu_ctx);
}
...
@@ -1155,7 +1155,7 @@ void GeoCommunicator::SendDense(const CommContext &send_ctx) {
  auto &t_latest = var_latest->Get<framework::LoDTensor>();
  auto t_timestamp = var_timestamp->GetMutable<framework::LoDTensor>();
-  auto cpu_ctx = paddle::platform::CPUDeviceContext();
+  paddle::platform::CPUDeviceContext cpu_ctx;
  auto *var_delta = delta_scope_->Var(varname);
  auto *t_delta = var_delta->GetMutable<framework::LoDTensor>();
  t_delta->mutable_data<float>(t_latest.dims(), cpu_ctx.GetPlace());
@@ -1185,7 +1185,7 @@ void GeoCommunicator::RecvDense(const CommContext &send_ctx) {
  RpcRecvDense(varnames, table_id, pserver_scope_.get());
  // 2.1 pserver - old => delta; 2.2 latest + old => latest 2.3 old => pserver
-  auto cpu_ctx = paddle::platform::CPUDeviceContext();
+  paddle::platform::CPUDeviceContext cpu_ctx;
  for (auto &varname : varnames) {
    auto *var_latest = recv_scope_->FindVar(varname);
    auto t_latest = var_latest->GetMutable<framework::LoDTensor>();
@@ -1292,7 +1292,7 @@ void GeoCommunicator::SendSparse(const std::string &varname,
  auto *t_old = var_old->GetMutable<framework::LoDTensor>();
  auto dims1 = t_latest.dims()[1];
-  auto cpu_ctx = paddle::platform::CPUDeviceContext();
+  paddle::platform::CPUDeviceContext cpu_ctx;
  auto *var_delta = delta_scope_->Var(varname);
  auto *t_delta = var_delta->GetMutable<pten::SelectedRows>();
@@ -1370,7 +1370,7 @@ void GeoCommunicator::RecvSparse(const std::string &varname, int table_id,
  std::vector<float> v_delta;
  v_delta.resize(numel);
-  auto cpu_ctx = paddle::platform::CPUDeviceContext();
+  paddle::platform::CPUDeviceContext cpu_ctx;
  auto blas =
      paddle::operators::math::GetBlas<platform::CPUDeviceContext, float>(
          cpu_ctx);
...
@@ -179,7 +179,7 @@ inline void MergeVars(const std::string &var_name,
  }
  // set output tensor to 0.
-  auto cpu_ctx = paddle::platform::CPUDeviceContext();
+  paddle::platform::CPUDeviceContext cpu_ctx;
  paddle::operators::math::SetConstant<paddle::platform::CPUDeviceContext, T>
      constant_functor;
  constant_functor(cpu_ctx, out_t, static_cast<T>(0));
@@ -204,7 +204,7 @@ inline void MergeVars(const std::string &var_name,
  for (auto &var : vars) {
    inputs.push_back(&var->Get<pten::SelectedRows>());
  }
-  auto dev_ctx = paddle::platform::CPUDeviceContext();
+  paddle::platform::CPUDeviceContext dev_ctx;
  if (merge_add) {
    paddle::operators::math::scatter::MergeAdd<
        paddle::platform::CPUDeviceContext, T>
...
@@ -21,7 +21,10 @@ TEST(DataTypeTransform, GPUTransform) {
  auto cpu_place = paddle::platform::CPUPlace();
  auto gpu_place = paddle::platform::CUDAPlace(0);
  paddle::platform::CUDADeviceContext context(gpu_place);
  context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                           .GetAllocator(gpu_place, context.stream())
                           .get());
  context.PartialInitWithAllocator();
  auto kernel_fp16 = paddle::framework::OpKernelType(
      paddle::framework::proto::VarType::FP16, gpu_place,
      paddle::framework::DataLayout::kAnyLayout,
...
@@ -1361,7 +1361,7 @@ void ParallelExecutor::PrepareNCCLCommunicator(Scope *global_scope) {
      auto *dev_ctx = static_cast<platform::XPUDeviceContext *>(
          pool.Get(member_->places_[dev_id]));
      auto &bkcl_ctx = bkcl_ctxs->at(member_->places_[dev_id]);
-      dev_ctx->set_bkcl_context(bkcl_ctx.comm());
+      dev_ctx->SetBkclContext(bkcl_ctx.comm());
    }
#else
    PADDLE_THROW(
...
@@ -77,6 +77,13 @@ struct ConvertToPtenContext<platform::CPUDeviceContext> {
  using TYPE = pten::CPUContext;
};
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <>
struct ConvertToPtenContext<platform::CUDADeviceContext> {
  using TYPE = pten::GPUContext;
};
#endif
#ifdef PADDLE_WITH_XPU
template <>
struct ConvertToPtenContext<platform::XPUDeviceContext> {
...
@@ -1085,7 +1085,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
      is.seekg(seekg, is.cur);
      void* buf;
-      auto ctx = platform::CPUDeviceContext();
+      platform::CPUDeviceContext ctx;
      size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
      if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
          platform::is_xpu_place(dev_ctx.GetPlace()) ||
@@ -1155,7 +1155,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
    std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
    tensor->Resize(framework::make_ddim(dims));
    void* buf;
-    auto ctx = platform::CPUDeviceContext();
+    platform::CPUDeviceContext ctx;
    size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
    if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
        platform::is_xpu_place(dev_ctx.GetPlace()) ||
@@ -1432,4 +1432,4 @@ std::ostream& operator<<(std::ostream& os, const pten::DenseTensor& t) {
  VLOG(1) << "PrintVar: unrecognized data type:" << t.type();
  return os;
}
-}
+}  // namespace pten
@@ -73,6 +73,10 @@ TEST(TensorCopy, Tensor) {
    // CPU Tensor to GPU Tensor
    auto gpu_place = new platform::CUDAPlace(0);
    platform::CUDADeviceContext gpu_ctx(*gpu_place);
    gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                             .GetAllocator(*gpu_place, gpu_ctx.stream())
                             .get());
    gpu_ctx.PartialInitWithAllocator();
    TensorCopy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
    // GPU Tensor to CPU Tensor
@@ -166,6 +170,10 @@ TEST(TensorFromVector, Tensor) {
    gpu_tensor.Resize(paddle::framework::make_ddim({3, 3}));
    auto gpu_place = new paddle::platform::CUDAPlace();
    paddle::platform::CUDADeviceContext gpu_ctx(*gpu_place);
    gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                             .GetAllocator(*gpu_place, gpu_ctx.stream())
                             .get());
    gpu_ctx.PartialInitWithAllocator();
    paddle::framework::TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
    // Copy from GPU to CPU tensor for comparison
    paddle::framework::TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
@@ -230,6 +238,10 @@ TEST(TensorToVector, Tensor) {
    paddle::framework::Tensor gpu_tensor;
    paddle::platform::CUDAPlace place;
    paddle::platform::CUDADeviceContext gpu_ctx(place);
    gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                             .GetAllocator(place, gpu_ctx.stream())
                             .get());
    gpu_ctx.PartialInitWithAllocator();
    paddle::framework::TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
    std::vector<int> dst;
@@ -267,6 +279,10 @@ TEST(TensorToVector, Tensor_bool) {
    paddle::framework::Tensor gpu_tensor;
    paddle::platform::CUDAPlace place;
    paddle::platform::CUDADeviceContext gpu_ctx(place);
    gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                             .GetAllocator(place, gpu_ctx.stream())
                             .get());
    gpu_ctx.PartialInitWithAllocator();
    paddle::framework::TensorFromVector<bool>(src_vec, gpu_ctx, &gpu_tensor);
    std::vector<bool> dst;
@@ -493,6 +509,10 @@ TEST(Tensor, FromAndToStream) {
    auto gpu_place = new platform::CUDAPlace();
    platform::CUDADeviceContext gpu_ctx(*gpu_place);
    gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                             .GetAllocator(*gpu_place, gpu_ctx.stream())
                             .get());
    gpu_ctx.PartialInitWithAllocator();
    TensorCopy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
...
@@ -46,6 +46,17 @@ void GLOOParallelContext::Init() {
  gloo_wrapper->Init();
  device_ = std::unique_ptr<platform::CPUDeviceContext>(
      new platform::CPUDeviceContext(platform::CPUPlace()));
  device_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                            .GetAllocator(platform::CPUPlace())
                            .get());
  device_->SetHostAllocator(
      paddle::memory::allocation::AllocatorFacade::Instance()
          .GetAllocator(paddle::platform::CPUPlace())
          .get());
  device_->SetZeroAllocator(
      paddle::memory::allocation::AllocatorFacade::Instance()
          .GetZeroAllocator(platform::CPUPlace())
          .get());
}
void GLOOParallelContext::InitWithRingID(int ring_id) {
...
@@ -77,6 +77,10 @@ void make_fake_model(std::string* model, std::string* param) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  platform::CUDAPlace place;
  platform::CUDADeviceContext ctx(place);
  ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                       .GetAllocator(place, ctx.stream())
                       .get());
  ctx.PartialInitWithAllocator();
#else
  platform::CPUPlace place;
  platform::CPUDeviceContext ctx(place);
...
@@ -27,6 +27,18 @@ class TensorRTEngineTest : public ::testing::Test {
 protected:
  void SetUp() override {
    ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0));
    ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                           .GetAllocator(platform::CUDAPlace(0), ctx_->stream())
                           .get());
    ctx_->SetHostAllocator(
        paddle::memory::allocation::AllocatorFacade::Instance()
            .GetAllocator(paddle::platform::CPUPlace())
            .get());
    ctx_->SetZeroAllocator(
        paddle::memory::allocation::AllocatorFacade::Instance()
            .GetZeroAllocator(platform::CUDAPlace(0))
            .get());
    ctx_->PartialInitWithAllocator();
    engine_ = new TensorRTEngine(10, 1 << 10);
    engine_->InitNetwork();
...
@@ -18,6 +18,7 @@
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cuda_allocator.h"
#include "paddle/fluid/memory/allocation/locked_allocator.h"
@@ -44,6 +45,10 @@ TEST(BestFitAllocator, concurrent_cuda) {
  platform::CUDAPlace gpu(0);
  platform::CUDADeviceContext dev_ctx(gpu);
  dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                           .GetAllocator(gpu, dev_ctx.stream())
                           .get());
  dev_ctx.PartialInitWithAllocator();
  auto th_main = [&](std::random_device::result_type seed) {
    std::default_random_engine engine(seed);
...
@@ -25,6 +25,7 @@
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/device_context.h"
@@ -105,8 +106,21 @@ TEST(Malloc, CUDADeviceContextMultiStream) {
  main_stream_alloc_ptr.reset();
  for (int i = 0; i < NUM_STREAMS; ++i) {
-    dev_ctx.push_back(std::unique_ptr<platform::CUDADeviceContext>(
-        new platform::CUDADeviceContext(place)));
+    auto ctx = std::unique_ptr<platform::CUDADeviceContext>(
+        new platform::CUDADeviceContext(place));
    ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                          .GetAllocator(place, ctx->stream())
                          .get());
    ctx->SetHostAllocator(
        paddle::memory::allocation::AllocatorFacade::Instance()
            .GetAllocator(paddle::platform::CPUPlace())
            .get());
    ctx->SetZeroAllocator(
        paddle::memory::allocation::AllocatorFacade::Instance()
            .GetZeroAllocator(place)
            .get());
    ctx->PartialInitWithAllocator();
    dev_ctx.emplace_back(std::move(ctx));
    MultiStreamCompute(&data[i], &second_data[i], *dev_ctx[i]);
  }
@@ -144,8 +158,21 @@ TEST(Malloc, CUDADeviceContextMultiThreadMultiStream) {
  main_stream_alloc_ptr.reset();
  for (int i = 0; i < NUM_STREAMS; ++i) {
-    dev_ctx.push_back(std::unique_ptr<platform::CUDADeviceContext>(
-        new platform::CUDADeviceContext(place)));
+    auto ctx = std::unique_ptr<platform::CUDADeviceContext>(
+        new platform::CUDADeviceContext(place));
    ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                          .GetAllocator(place, ctx->stream())
                          .get());
    ctx->SetHostAllocator(
        paddle::memory::allocation::AllocatorFacade::Instance()
            .GetAllocator(paddle::platform::CPUPlace())
            .get());
    ctx->SetZeroAllocator(
        paddle::memory::allocation::AllocatorFacade::Instance()
            .GetZeroAllocator(place)
            .get());
    ctx->PartialInitWithAllocator();
    dev_ctx.emplace_back(std::move(ctx));
    threads.push_back(std::thread(MultiStreamCompute, &data[i], &second_data[i],
                                  std::cref(*dev_ctx[i])));
  }
...
@@ -110,7 +110,7 @@ void ComputeFullArg(const platform::CUDADeviceContext& ctx, const Tensor& input,
    return block_size;
  };
-  int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize().x;
+  int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0];
  int64_t height = pre * post;
  int64_t width = n;
  int64_t grid_size = height < max_grid_dimx ? height : max_grid_dimx;
...
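
The `.x` -> `[0]` edit above recurs in many hunks below: `GetCUDAMaxGridDimSize()` no longer yields a `dim3` with `.x/.y/.z` members but an indexable object (indices 0/1/2 for the x/y/z limits). A hedged sketch of the clamping idiom these call sites use, written against the new accessor; the helper name is made up for illustration.

// Illustration only: clamp a requested grid size to the device limit in x,
// using the indexed accessor introduced by this commit.
int64_t ClampGridDimX(const paddle::platform::CUDADeviceContext& ctx,
                      int64_t wanted_grid_x) {
  auto max_grid_dim = ctx.GetCUDAMaxGridDimSize();  // was dim3, now indexable
  int64_t max_grid_dimx = max_grid_dim[0];          // previously .x
  return wanted_grid_x < max_grid_dimx ? wanted_grid_x : max_grid_dimx;
}
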
@@ -131,7 +131,7 @@ void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input,
  int block_size = ComputeBlockSize(num_cols);
-  int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x;
+  int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0];
  // actually, int num_rows < max_grid_size
  int grid_size = num_rows < maxGridDimX ? num_rows : maxGridDimX;
  // Init a index array
@@ -212,7 +212,7 @@ void ArgFullAssign(const platform::CUDADeviceContext& ctx, const Tensor* dO,
  int block_size = ComputeBlockSize(num_cols);
-  int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x;
+  int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0];
  // actually, int num_rows < max_grid_size
  int grid_size = num_rows < maxGridDimX ? num_rows : maxGridDimX;
  FillGrad<<<grid_size, block_size, 0, cu_stream>>>(
...
@@ -90,8 +90,8 @@ class CUDABroadcastTensorsGradOpKernel : public framework::OpKernel<T> {
        // reduce_sum implementation on CUDA
        auto stream = context.cuda_device_context().stream();
        TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
-            *input_tensor, output_tensor, kps::IdentityFunctor<T>(),
-            reduce_dims_vec, stream);
+            context.cuda_device_context(), *input_tensor, output_tensor,
+            kps::IdentityFunctor<T>(), reduce_dims_vec, stream);
      }
    }
  }
...
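
The reduction call sites in this hunk and the ones below all make the same mechanical change: `TensorReduceFunctorImpl` now takes the CUDA device context as its first argument, ahead of the input and output tensors. Shown once as a sketch that mirrors the `ReduceWrapper` helper further down in this diff; the wrapper itself is illustrative only.

// Illustration only: the new TensorReduceFunctorImpl call shape.
template <typename T>
void SumReduceSketch(const paddle::platform::CUDADeviceContext& dev_ctx,
                     const paddle::framework::Tensor& src,
                     paddle::framework::Tensor* dst,
                     const std::vector<int>& reduce_dims) {
  // Old call shape (removed by this commit):
  //   TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
  //       src, dst, kps::IdentityFunctor<T>(), reduce_dims, dev_ctx.stream());
  // New call shape: the device context leads the argument list.
  TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
      dev_ctx, src, dst, kps::IdentityFunctor<T>(), reduce_dims,
      dev_ctx.stream());
}
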
@@ -115,7 +115,8 @@ class MatrixReduceSumFunctor<platform::CUDADeviceContext, T> {
    }
    gpuStream_t stream = ctx.cuda_device_context().stream();
    TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
-        in, out, kps::IdentityFunctor<T>(), out_reduce_dims, stream);
+        ctx.cuda_device_context(), in, out, kps::IdentityFunctor<T>(),
+        out_reduce_dims, stream);
  }
};
...
@@ -77,7 +77,7 @@ class ClipByNormKernel<platform::CUDADeviceContext, platform::float16>
            {1}, dev_ctx);
        TensorReduceFunctorImpl<platform::float16, float, kps::AddFunctor,
                                kps::SquareFunctor<platform::float16, float>>(
-            *input, &tmp, kps::SquareFunctor<platform::float16, float>(),
+            dev_ctx, *input, &tmp, kps::SquareFunctor<platform::float16, float>(),
            reduce_dims, dev_ctx.stream());
        auto tmp_eigen = EigenVector<float>::Flatten(tmp);
        auto x_norm = tmp_eigen.sqrt();
...
@@ -65,7 +65,8 @@ class CompareReduceOpKernel
      auto stream = context.cuda_device_context().stream();
      TensorReduceFunctorImpl<bool, bool, BitwiseAdd,
                              kps::IdentityFunctor<bool>>(
-          tmp, z, kps::IdentityFunctor<bool>(), reduce_dims, stream);
+          context.cuda_device_context(), tmp, z, kps::IdentityFunctor<bool>(),
+          reduce_dims, stream);
    }
  }
};
...
@@ -131,12 +131,20 @@ void Compare2(f::Scope* scope, const p::DeviceContext& ctx,
TEST(copy_cross_scope, CUDA_fp32) {
  f::Scope scope;
  p::CUDADeviceContext ctx(p::CUDAPlace(0));
  ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                       .GetAllocator(p::CUDAPlace(0), ctx.stream())
                       .get());
  ctx.PartialInitWithAllocator();
  Compare1<float>(&scope, ctx, "copy_cross_scope");
}
TEST(copy_cross_scope_to_main_scope, CUDA_fp32) {
  f::Scope scope;
  p::CUDADeviceContext ctx(p::CUDAPlace(0));
  ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                       .GetAllocator(p::CUDAPlace(0), ctx.stream())
                       .get());
  ctx.PartialInitWithAllocator();
  Compare2<float>(&scope, ctx, "copy_cross_scope");
}
#elif PADDLE_WITH_ASCEND_CL
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_mul_op.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
@@ -51,8 +52,8 @@ class ElementwiseMulKernel<platform::CUDADeviceContext, T>
      auto pt_x = paddle::experimental::MakePtenDenseTensor(*x_lod);
      auto pt_y = paddle::experimental::MakePtenDenseTensor(*y_lod);
      auto pt_z = paddle::experimental::MakePtenDenseTensor(*z_lod);
-      pten::MultiplyRawKernel<T>(cuda_ctx, *pt_x.get(), *pt_y.get(), axis,
-                                 pt_z.get());
+      pten::MultiplyRawKernel<T>(static_cast<const pten::GPUContext&>(cuda_ctx),
+                                 *pt_x.get(), *pt_y.get(), axis, pt_z.get());
    } else {
      PADDLE_THROW(platform::errors::InvalidArgument(
          "X's type[%s] is not supported by elementwise_op. X's type should be "
...
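
With the new `ConvertToPtenContext<platform::CUDADeviceContext>` specialization earlier in this diff mapping to `pten::GPUContext`, fluid call sites hand their CUDA context to pten kernels through an explicit cast, as the `MultiplyRawKernel` call in the hunk above does. A minimal sketch of that call shape; the wrapper function is hypothetical.

#include "paddle/pten/backends/gpu/gpu_context.h"

// Illustration only: forwarding a fluid CUDADeviceContext to a pten kernel
// that expects a pten::GPUContext.
template <typename T>
void MultiplyWithPten(const paddle::platform::CUDADeviceContext& cuda_ctx,
                      const pten::DenseTensor& x, const pten::DenseTensor& y,
                      int axis, pten::DenseTensor* z) {
  pten::MultiplyRawKernel<T>(static_cast<const pten::GPUContext&>(cuda_ctx),
                             x, y, axis, z);
}
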
@@ -1189,7 +1189,8 @@ void ReduceWrapper(const platform::CUDADeviceContext &dev_ctx, int axis,
                   framework::Tensor *src, framework::Tensor *dst) {
  std::vector<int> reduce_dims = GetReduceDim(dst->dims(), src->dims(), axis);
  TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
-      *src, dst, kps::IdentityFunctor<T>(), reduce_dims, dev_ctx.stream());
+      dev_ctx, *src, dst, kps::IdentityFunctor<T>(), reduce_dims,
+      dev_ctx.stream());
}
template <ElementwiseType ET, typename T, typename Functor>
...
@@ -275,6 +275,18 @@ class TestFeedForward {
    output_size_ = 3 * num_head_ * dim_head_;
    input_size_ = dim_embed_;
    ctx_ = new platform::CUDADeviceContext(place_);
    ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                           .GetAllocator(place_, ctx_->stream())
                           .get());
    ctx_->SetHostAllocator(
        paddle::memory::allocation::AllocatorFacade::Instance()
            .GetAllocator(paddle::platform::CPUPlace())
            .get());
    ctx_->SetZeroAllocator(
        paddle::memory::allocation::AllocatorFacade::Instance()
            .GetZeroAllocator(place_)
            .get());
    ctx_->PartialInitWithAllocator();
    size_src_ = bsz_seq_ * dim_embed_;         // src: [bs, seq_len, em_dim]
    size_weight_ = dim_embed_ * output_size_;  // weight: [output_size, em_dim]
...
@@ -166,7 +166,8 @@ class AttnMatMul {
    if (support_case_1 || support_case_2) {
      gpuStream_t stream = dev_ctx_.stream();
      TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
-          *d_output, d_bias, kps::IdentityFunctor<T>(), {0, 1}, stream);
+          dev_ctx_, *d_output, d_bias, kps::IdentityFunctor<T>(), {0, 1},
+          stream);
    } else {
      PADDLE_THROW(platform::errors::InvalidArgument(
          "Only support reduce when the input dims are [0,1,2,3,4] and "
...
@@ -108,7 +108,7 @@ static bool TryLaunchFP16FastGeluFwdVectorizeCUDAKernel(
        is_aligned(y, kAlignment)) {                                          \
      size_t thread = std::min<size_t>(512, dev_ctx.GetMaxThreadsPerBlock()); \
      size_t block = (n / __vec_size + thread - 1) / thread;                  \
-      block = std::min<size_t>(block, dev_ctx.GetCUDAMaxGridDimSize().x);     \
+      block = std::min<size_t>(block, dev_ctx.GetCUDAMaxGridDimSize()[0]);    \
      VLOG(10) << "Use FP16 fast gelu fwd kernel, block = " << block          \
               << " , thread = " << thread;                                   \
      FP16FastGeluFwdCUDAKernel<                                              \
@@ -144,7 +144,7 @@ static bool TryLaunchFP16FastGeluBwdVectorizeCUDAKernel(
        is_aligned(x_g, kAlignment)) {                                        \
      size_t thread = std::min<size_t>(512, dev_ctx.GetMaxThreadsPerBlock()); \
      size_t block = (n / __vec_size + thread - 1) / thread;                  \
-      block = std::min<size_t>(block, dev_ctx.GetCUDAMaxGridDimSize().x);     \
+      block = std::min<size_t>(block, dev_ctx.GetCUDAMaxGridDimSize()[0]);    \
      VLOG(10) << "Use FP16 fast gelu bwd kernel, block = " << block          \
               << " , thread = " << thread;                                   \
      FP16FastGeluBwdCUDAKernel<                                              \
...
@@ -260,7 +260,7 @@ void FillHashTable(const framework::ExecutionContext& ctx, const T* input,
  int block = 1024;
#endif
  const auto& dev_ctx = ctx.cuda_device_context();
-  int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x;
+  int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
  int grid_tmp = (num_input + block - 1) / block;
  int grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
  // 1. Insert data into keys and values.
@@ -334,7 +334,7 @@ void ReindexFunc(const framework::ExecutionContext& ctx,
  int block = 1024;
#endif
  const auto& dev_ctx = ctx.cuda_device_context();
-  int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x;
+  int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
  int64_t grid_tmp = (outputs->size() + block - 1) / block;
  int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
  ReindexSrcOutput<
...
@@ -197,7 +197,7 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(
#endif
  int64_t n = slice_size * index_size;
  const auto& dev_ctx = ctx.cuda_device_context();
-  int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x;
+  int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
  int64_t grid_tmp = (n + block - 1) / block;
  int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
  int64_t input_size = src_dims[0];
@@ -320,7 +320,7 @@ void GraphSendRecvGradOpCUDAKernelLaunchHelper(
#endif
  int64_t n = slice_size * index_size;
  const auto& dev_ctx = ctx.cuda_device_context();
-  int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x;
+  int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
  int64_t grid_tmp = (n + block - 1) / block;
  int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
  int64_t input_size = src_dims[0];
...
@@ -92,7 +92,7 @@ struct OneHotGenerator<platform::CUDADeviceContext, T> {
  const int size_from_axis = SizeFromAxis(axis, X.dims());
  const int size_out_axis = SizeOutAxis(axis, X.dims());
  constexpr int thread_size = 512;
-  int64_t max_grid_dimx = context.GetCUDAMaxGridDimSize().x;
+  int64_t max_grid_dimx = context.GetCUDAMaxGridDimSize()[0];
  int64_t height = size_to_axis * size_out_axis;
  int block_size = height < max_grid_dimx ? height : max_grid_dimx;
...
@@ -27,10 +27,10 @@ namespace operators {
namespace {
void LimitGridDim(const framework::ExecutionContext& ctx, dim3* grid_dim) {
-  dim3 max_grid_dim = ctx.template device_context<platform::CUDADeviceContext>()
+  auto max_grid_dim = ctx.template device_context<platform::CUDADeviceContext>()
                          .GetCUDAMaxGridDimSize();
-  grid_dim->x = grid_dim->x < max_grid_dim.x ? grid_dim->x : max_grid_dim.x;
-  grid_dim->y = grid_dim->y < max_grid_dim.y ? grid_dim->y : max_grid_dim.y;
+  grid_dim->x = grid_dim->x < max_grid_dim[0] ? grid_dim->x : max_grid_dim[0];
+  grid_dim->y = grid_dim->y < max_grid_dim[1] ? grid_dim->y : max_grid_dim[1];
}
}
...
@@ -45,11 +45,11 @@ inline platform::GpuLaunchConfig GetGpuLaunchConfig3D(
  int block_y = std::min(GetLastPow2(height), max_threads / block_x);
  int block_z = std::min(num_img, max_threads / block_x / block_y);
-  dim3 max_grid_dim = context.GetCUDAMaxGridDimSize();
-  int grid_x = std::min<int>(max_grid_dim.x, platform::DivUp(width, block_x));
-  int grid_y = std::min<int>(max_grid_dim.y, platform::DivUp(height, block_y));
+  auto max_grid_dim = context.GetCUDAMaxGridDimSize();
+  int grid_x = std::min<int>(max_grid_dim[0], platform::DivUp(width, block_x));
+  int grid_y = std::min<int>(max_grid_dim[1], platform::DivUp(height, block_y));
  int grid_z =
-      std::min<int>(max_grid_dim.z, platform::DivUp(num_img, block_z * 4));
+      std::min<int>(max_grid_dim[2], platform::DivUp(num_img, block_z * 4));
  const int capability = context.GetComputeCapability();
  platform::GpuLaunchConfig config;
...
@@ -306,11 +306,11 @@ struct KronGradOpFunctor {
    auto stream = dev_ctx.stream();  // it is a cuda device_context
    if (dx) {
      TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
-          dout_x, dx, kps::IdentityFunctor<T>(), {1}, stream);
+          dev_ctx, dout_x, dx, kps::IdentityFunctor<T>(), {1}, stream);
    }
    if (dy) {
      TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
-          dout_y, dy, kps::IdentityFunctor<T>(), {1}, stream);
+          dev_ctx, dout_y, dy, kps::IdentityFunctor<T>(), {1}, stream);
    }
#else
    auto* place = dev_ctx.eigen_device();
...
@@ -54,7 +54,7 @@ bool SortKthvalue(const platform::CUDADeviceContext& ctx,
  input_indices.mutable_data<int64_t>(ctx.GetPlace());
  size_t temp_storage_bytes = -1;
  int block_size = getBlockSize(num_cols);
-  unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x;
+  unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0];
  unsigned int grid_size = num_rows < maxGridDimX
                               ? static_cast<unsigned int>(num_rows)
                               : maxGridDimX;
...
@@ -72,6 +72,10 @@ TEST(LiteEngineOp, engine_op) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  platform::CUDAPlace place;
  platform::CUDADeviceContext ctx(place);
  ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                       .GetAllocator(place, ctx.stream())
                       .get());
  ctx.PartialInitWithAllocator();
#else
  platform::CPUPlace place;
  platform::CPUDeviceContext ctx(place);
...
@@ -299,7 +299,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
        ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
    T* logits_max_buff = logits_max.mutable_data<T>(place);
    TensorReduceFunctorImpl<T, T, kps::MaxFunctor, kps::IdentityFunctor<T>>(
-        softmax_2d, &logits_max, kps::IdentityFunctor<T>(), {1},
+        dev_ctx, softmax_2d, &logits_max, kps::IdentityFunctor<T>(), {1},
        dev_ctx.stream());
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
@@ -321,7 +321,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
        ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
    T* sum_exp_logits_buff = sum_exp_logits.mutable_data<T>(place);
    TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::ExpFunctor<T>>(
-        softmax_2d, &sum_exp_logits, kps::ExpFunctor<T>(), {1},
+        dev_ctx, softmax_2d, &sum_exp_logits, kps::ExpFunctor<T>(), {1},
        dev_ctx.stream());
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
...
@@ -15,6 +15,8 @@ limitations under the License. */
#include "paddle/fluid/operators/math/beam_search.h"
#include <gtest/gtest.h>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
void PrepareCPUTensors(paddle::framework::LoDTensor* ids,
                       paddle::framework::LoDTensor* scores,
@@ -129,6 +131,83 @@ void TestBeamSearch() {
  delete context;
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <>
void TestBeamSearch<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace>() {
paddle::framework::LoDTensor ids;
paddle::framework::LoDTensor scores;
paddle::framework::LoDTensor pre_ids;
paddle::framework::LoDTensor pre_scores;
auto* place = new paddle::platform::CUDAPlace();
auto* context = new paddle::platform::CUDADeviceContext(*place);
context->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*place, context->stream())
.get());
context->PartialInitWithAllocator();
if (paddle::platform::is_cpu_place(*place)) {
PrepareCPUTensors(&ids, &scores, &pre_ids, &pre_scores);
} else {
paddle::framework::LoDTensor cpu_ids;
paddle::framework::LoDTensor cpu_scores;
paddle::framework::LoDTensor cpu_pre_ids;
paddle::framework::LoDTensor cpu_pre_scores;
PrepareCPUTensors(&cpu_ids, &cpu_scores, &cpu_pre_ids, &cpu_pre_scores);
paddle::framework::TensorCopySync(cpu_ids, *place, &ids);
paddle::framework::TensorCopySync(cpu_scores, *place, &scores);
paddle::framework::TensorCopySync(cpu_pre_ids, *place, &pre_ids);
paddle::framework::TensorCopySync(cpu_pre_scores, *place, &pre_scores);
ids.set_lod(cpu_ids.lod());
scores.set_lod(cpu_scores.lod());
pre_ids.set_lod(cpu_pre_ids.lod());
pre_scores.set_lod(cpu_pre_scores.lod());
}
paddle::framework::LoDTensor selected_ids;
paddle::framework::LoDTensor selected_scores;
paddle::framework::LoDTensor parent_idx;
size_t level = 0;
size_t beam_size = 2;
int end_id = 0;
paddle::operators::math::BeamSearchFunctor<
paddle::platform::CUDADeviceContext, float>
beamsearch;
beamsearch(*context, &pre_ids, &pre_scores, &ids, &scores, &selected_ids,
&selected_scores, &parent_idx, level, beam_size, end_id, true);
ASSERT_EQ(selected_ids.lod(), selected_scores.lod());
paddle::framework::LoDTensor cpu_selected_ids;
paddle::framework::LoDTensor cpu_selected_scores;
if (paddle::platform::is_cpu_place(*place)) {
cpu_selected_ids = selected_ids;
cpu_selected_scores = selected_scores;
} else {
paddle::framework::TensorCopySync(
selected_ids, paddle::platform::CPUPlace(), &cpu_selected_ids);
paddle::framework::TensorCopySync(
selected_scores, paddle::platform::CPUPlace(), &cpu_selected_scores);
cpu_selected_ids.set_lod(selected_ids.lod());
cpu_selected_scores.set_lod(selected_scores.lod());
}
std::vector<int64_t> expected_ids({4, 5, 3, 8});
std::vector<float> expected_scores({0.6f, 0.5f, 0.9f, 0.7f});
for (int i = 0; i < 4; i++) {
ASSERT_EQ(expected_ids[i], cpu_selected_ids.data<int64_t>()[i]);
ASSERT_EQ(expected_scores[i], cpu_selected_scores.data<float>()[i]);
}
delete place;
delete context;
}
#endif
TEST(BeamSearch, CPU) {
  TestBeamSearch<paddle::platform::CPUDeviceContext,
                 paddle::platform::CPUPlace>();
...
@@ -16,6 +16,8 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
/**
 * case 1:
@@ -441,6 +443,31 @@ void TestConcatMain() {
  delete context;
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <>
void TestConcatMain<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace>() {
auto* context =
new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace());
context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CUDAPlace(), context->stream())
.get());
context->PartialInitWithAllocator();
ConcatCase1<paddle::platform::CUDADeviceContext, paddle::platform::CUDAPlace>(
context);
ConcatCase2<paddle::platform::CUDADeviceContext, paddle::platform::CUDAPlace>(
context);
ConcatCase3<paddle::platform::CUDADeviceContext, paddle::platform::CUDAPlace>(
context);
ConcatCase4<paddle::platform::CUDADeviceContext, paddle::platform::CUDAPlace>(
context);
delete context;
}
#endif
TEST(math, concat) {
  TestConcatMain<paddle::platform::CPUDeviceContext,
                 paddle::platform::CPUPlace>();
...
@@ -24,6 +24,11 @@ void TestNNZ(const std::vector<T>& dense_data, const int correct_nnz,
             const int rows, const int cols) {
  paddle::platform::CUDADeviceContext* context =
      new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace());
  context->SetAllocator(
      paddle::memory::allocation::AllocatorFacade::Instance()
          .GetAllocator(paddle::platform::CUDAPlace(), context->stream())
          .get());
  context->PartialInitWithAllocator();
  auto sparse =
      paddle::operators::math::GetSparse<paddle::platform::CUDADeviceContext,
                                         T>(*context);
@@ -61,6 +66,11 @@ void TestDenseToSparse(const std::vector<T>& correct_dense_data,
                       const std::string& mode) {
  paddle::platform::CUDADeviceContext* context =
      new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace());
  context->SetAllocator(
      paddle::memory::allocation::AllocatorFacade::Instance()
          .GetAllocator(paddle::platform::CUDAPlace(), context->stream())
          .get());
  context->PartialInitWithAllocator();
  // get sparse
  auto sparse =
      paddle::operators::math::GetSparse<paddle::platform::CUDADeviceContext,
...
@@ -15,6 +15,8 @@ limitations under the License. */
#include "paddle/fluid/operators/math/im2col.h"
#include <gtest/gtest.h>
#include "paddle/fluid/operators/math/im2col_cfo_cpu.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
template <typename DeviceContext, typename Place>
void testIm2col() {
@@ -60,6 +62,7 @@ void testIm2col() {
  auto* place = new Place();
  DeviceContext* context = new DeviceContext(*place);
  if (paddle::platform::is_cpu_place(*place)) {
    input = input_tmp;
  } else {
@@ -164,6 +167,165 @@ void testIm2col() {
  delete context;
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <>
void testIm2col<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace>() {
paddle::framework::Tensor input_tmp;
paddle::framework::Tensor input;
paddle::framework::Tensor output_cfo;
paddle::framework::Tensor output_ocf;
paddle::framework::Tensor output_tmp;
/**
* input = [0, 1, 2,
* 3, 4, 5]
*
* output_cfo = [0, 1
* 1, 2
* 3, 4
* 4, 5]
*
* output_ocf = [0, 1, 3, 4
* 1, 2, 4, 5]
*
* col2im_cfo = [0, 2, 2
* 3, 4, 5]
*
* col2im_ocf = [0, 2, 2
* 3, 4, 5]
*/
int input_height = 2;
int input_width = 3;
int filter_size = 2;
std::vector<int> stride({1, 1}); // stride_y, stride_x
std::vector<int> padding(
{0, 0, 0, 0}); // up_pad, left_pad, down_pad, right_pad
std::vector<int> dilation({1, 1}); // dilation_y, dilation_x
int output_height =
(input_height - filter_size + padding[0] + padding[1]) / stride[0] + 1;
int output_width =
(input_width - filter_size + padding[2] + padding[3]) / stride[1] + 1;
float* input_ptr = input_tmp.mutable_data<float>(
{1, input_height, input_width}, paddle::platform::CPUPlace());
float arr[6] = {0, 1, 2, 3, 4, 5};
memcpy(input_ptr, arr, 6 * sizeof(float));
auto* place = new paddle::platform::CUDAPlace();
auto* context = new paddle::platform::CUDADeviceContext(*place);
context->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*place, context->stream())
.get());
context->PartialInitWithAllocator();
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
paddle::framework::TensorCopySync(input_tmp, *place, &input);
}
output_cfo.mutable_data<float>(
{1, filter_size, filter_size, output_height, output_width}, *place);
output_ocf.mutable_data<float>(
{output_height, output_width, 1, filter_size, filter_size}, *place);
// Im2Col
paddle::operators::math::Im2ColFunctor<
paddle::operators::math::ColFormat::kCFO,
paddle::platform::CUDADeviceContext, float>
im2col;
paddle::operators::math::Im2ColFunctor<
paddle::operators::math::ColFormat::kOCF,
paddle::platform::CUDADeviceContext, float>
im2col_ocf;
im2col(*context, input, dilation, stride, padding, &output_cfo);
im2col_ocf(*context, input, dilation, stride, padding, &output_ocf);
float out_cfo_data[] = {0, 1, 1, 2, 3, 4, 4, 5};
float out_ocf_data[] = {0, 1, 3, 4, 1, 2, 4, 5};
float* out_cfo_ptr;
if (paddle::platform::is_cpu_place(*place)) {
out_cfo_ptr = output_cfo.data<float>();
} else {
paddle::framework::TensorCopySync(output_cfo, paddle::platform::CPUPlace(),
&output_tmp);
out_cfo_ptr = output_tmp.data<float>();
}
for (int i = 0; i < 6; ++i) {
EXPECT_EQ(out_cfo_ptr[i], out_cfo_data[i]);
}
float* out_ocf_ptr;
if (paddle::platform::is_cpu_place(*place)) {
out_ocf_ptr = output_ocf.data<float>();
} else {
paddle::framework::TensorCopySync(output_ocf, paddle::platform::CPUPlace(),
&output_tmp);
out_ocf_ptr = output_tmp.data<float>();
}
for (int i = 0; i < 6; ++i) {
EXPECT_EQ(out_ocf_ptr[i], out_ocf_data[i]);
}
// Col2Im: kCFO
paddle::operators::math::Col2ImFunctor<
paddle::operators::math::ColFormat::kCFO,
paddle::platform::CUDADeviceContext, float>
col2im;
paddle::operators::math::Col2ImFunctor<
paddle::operators::math::ColFormat::kOCF,
paddle::platform::CUDADeviceContext, float>
col2im_ocf;
float col2im_data[] = {0, 2, 2, 3, 8, 5};
memset(input_ptr, 0, 6 * sizeof(float));
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
paddle::framework::TensorCopySync(input_tmp, *place, &input);
}
col2im(*context, output_cfo, dilation, stride, padding, &input);
float* in_ptr;
if (paddle::platform::is_cpu_place(*place)) {
in_ptr = input.data<float>();
} else {
paddle::framework::TensorCopySync(input, paddle::platform::CPUPlace(),
&input_tmp);
in_ptr = input_tmp.data<float>();
}
for (int i = 0; i < 6; ++i) {
EXPECT_EQ(in_ptr[i], col2im_data[i]);
}
// Col2Im: kOCF
memset(input_ptr, 0, 6 * sizeof(float));
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
paddle::framework::TensorCopySync(input_tmp, *place, &input);
}
col2im_ocf(*context, output_ocf, dilation, stride, padding, &input);
if (paddle::platform::is_cpu_place(*place)) {
in_ptr = input.data<float>();
} else {
paddle::framework::TensorCopySync(input, paddle::platform::CPUPlace(),
&input_tmp);
in_ptr = input_tmp.data<float>();
}
for (int i = 0; i < 6; ++i) {
EXPECT_EQ(in_ptr[i], col2im_data[i]);
}
delete place;
delete context;
}
#endif
TEST(math, im2col) {
  testIm2col<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
...
@@ -194,7 +194,7 @@ static void InclusiveScanInnerDim(const T *x, T *y, size_t outer_dim,
  constexpr size_t kThreadNumY = 32;
  size_t grid_dim = (outer_dim + kThreadNumY - 1) / kThreadNumY;
-  grid_dim = std::min<size_t>(grid_dim, dev_ctx.GetCUDAMaxGridDimSize().x);
+  grid_dim = std::min<size_t>(grid_dim, dev_ctx.GetCUDAMaxGridDimSize()[0]);
  dim3 thread_dims(kThreadNumX, kThreadNumY);
  if (reverse) {
    InclusiveScanInnerDimCUDAKernel<
...
@@ -21,6 +21,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/math_function_impl.h"
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#include "paddle/pten/kernels/funcs/eigen/common.h"
namespace paddle {
@@ -44,6 +45,18 @@ template struct SetConstant<platform::CUDADeviceContext,
template struct SetConstant<platform::CUDADeviceContext,
                            platform::complex<double>>;
template struct SetConstant<pten::GPUContext, platform::float16>;
template struct SetConstant<pten::GPUContext, platform::bfloat16>;
template struct SetConstant<pten::GPUContext, float>;
template struct SetConstant<pten::GPUContext, double>;
template struct SetConstant<pten::GPUContext, uint8_t>;
template struct SetConstant<pten::GPUContext, int>;
template struct SetConstant<pten::GPUContext, int16_t>;
template struct SetConstant<pten::GPUContext, int64_t>;
template struct SetConstant<pten::GPUContext, bool>;
template struct SetConstant<pten::GPUContext, platform::complex<float>>;
template struct SetConstant<pten::GPUContext, platform::complex<double>>;
template struct SetConstant<platform::CUDAPinnedDeviceContext,
                            platform::float16>;
template struct SetConstant<platform::CUDAPinnedDeviceContext,
...
@@ -223,6 +223,7 @@ TEST(math_funciton, set_constant) {
  t.Resize({10, 10});
  t.mutable_data<int>(paddle::platform::CPUPlace());
  auto* ctx = new paddle::platform::CPUDeviceContext();
  ctx->Init();
  paddle::operators::math::set_constant(*ctx, &t, 10);
  for (int64_t i = 0; i < t.numel(); ++i) {
    PADDLE_ENFORCE_EQ(10, t.data<int>()[i],
...
@@ -46,6 +46,10 @@ TEST(math_function, notrans_mul_trans_fp32) {
  paddle::platform::CPUPlace cpu_place;
  paddle::platform::CUDAPlace gpu_place(0);
  paddle::platform::CUDADeviceContext context(gpu_place);
  context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                           .GetAllocator(gpu_place, context.stream())
                           .get());
  context.PartialInitWithAllocator();
  float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
  float arr[6] = {0, 1, 2, 3, 4, 5};
@@ -78,6 +82,10 @@ TEST(math_function, notrans_mul_trans_fp16) {
  paddle::platform::CPUPlace cpu_place;
  paddle::platform::CUDAPlace gpu_place(0);
  paddle::platform::CUDADeviceContext context(gpu_place);
  context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                           .GetAllocator(gpu_place, context.stream())
                           .get());
  context.PartialInitWithAllocator();
  // fp16 GEMM in cublas requires GPU compute capability >= 53
  if (context.GetComputeCapability() < 53) {
@@ -117,6 +125,10 @@ TEST(math_function, trans_mul_notrans_fp32) {
  paddle::platform::CPUPlace cpu_place;
  paddle::platform::CUDAPlace gpu_place(0);
  paddle::platform::CUDADeviceContext context(gpu_place);
  context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                           .GetAllocator(gpu_place, context.stream())
                           .get());
  context.PartialInitWithAllocator();
  float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
  float arr[6] = {0, 1, 2, 3, 4, 5};
@@ -155,6 +167,10 @@ TEST(math_function, trans_mul_notrans_fp16) {
  paddle::platform::CPUPlace cpu_place;
  paddle::platform::CUDAPlace gpu_place(0);
  paddle::platform::CUDADeviceContext context(gpu_place);
  context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                           .GetAllocator(gpu_place, context.stream())
                           .get());
  context.PartialInitWithAllocator();
  // fp16 GEMM in cublas requires GPU compute capability >= 53
  if (context.GetComputeCapability() < 53) {
@@ -200,6 +216,10 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
  paddle::platform::CPUPlace cpu_place;
  paddle::platform::CUDAPlace gpu_place(0);
  paddle::platform::CUDADeviceContext context(gpu_place);
  context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                           .GetAllocator(gpu_place, context.stream())
                           .get());
  context.PartialInitWithAllocator();
  int m = 2;
  int n = 3;
@@ -254,6 +274,10 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
  paddle::platform::CPUPlace cpu_place;
  paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place); paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
// fp16 GEMM in cublas requires GPU compute capability >= 53 // fp16 GEMM in cublas requires GPU compute capability >= 53
if (context.GetComputeCapability() < 53) { if (context.GetComputeCapability() < 53) {
...@@ -316,6 +340,10 @@ TEST(math_function, gemm_trans_cublas_fp32) { ...@@ -316,6 +340,10 @@ TEST(math_function, gemm_trans_cublas_fp32) {
paddle::platform::CPUPlace cpu_place; paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place); paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
int m = 2; int m = 2;
int n = 3; int n = 3;
...@@ -364,6 +392,10 @@ TEST(math_function, gemm_trans_cublas_fp16) { ...@@ -364,6 +392,10 @@ TEST(math_function, gemm_trans_cublas_fp16) {
paddle::platform::CPUPlace cpu_place; paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place); paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
// fp16 GEMM in cublas requires GPU compute capability >= 53 // fp16 GEMM in cublas requires GPU compute capability >= 53
if (context.GetComputeCapability() < 53) { if (context.GetComputeCapability() < 53) {
...@@ -418,6 +450,10 @@ void GemvTest(int m, int n, bool trans) { ...@@ -418,6 +450,10 @@ void GemvTest(int m, int n, bool trans) {
paddle::platform::CPUPlace cpu_place; paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place); paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
T* data_a = mat_a.mutable_data<T>({m, n}, cpu_place); T* data_a = mat_a.mutable_data<T>({m, n}, cpu_place);
T* data_b = vec_b.mutable_data<T>({trans ? m : n}, cpu_place); T* data_b = vec_b.mutable_data<T>({trans ? m : n}, cpu_place);
......
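Every GPU test above repeats the same setup before the context can allocate memory. A condensed sketch of that pattern, wrapped in a hypothetical helper (InitTestCudaContext is not part of the patch; the calls inside are taken verbatim from the hunks above):
void InitTestCudaContext(paddle::platform::CUDADeviceContext* context,
                         const paddle::platform::CUDAPlace& gpu_place) {
  // The context no longer owns an allocator; borrow the global one bound to
  // this context's stream, then finish the deferred initialization.
  context->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                            .GetAllocator(gpu_place, context->stream())
                            .get());
  context->PartialInitWithAllocator();
}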
...@@ -15,6 +15,8 @@ limitations under the License. */ ...@@ -15,6 +15,8 @@ limitations under the License. */
#include "paddle/fluid/operators/math/vol2col.h" #include "paddle/fluid/operators/math/vol2col.h"
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
template <typename DeviceContext, typename Place> template <typename DeviceContext, typename Place>
void testVol2col() { void testVol2col() {
...@@ -25,7 +27,6 @@ void testVol2col() { ...@@ -25,7 +27,6 @@ void testVol2col() {
auto* place = new Place(); auto* place = new Place();
DeviceContext* context = new DeviceContext(*place); DeviceContext* context = new DeviceContext(*place);
/** /**
* input = [[0, 1, 2, * input = [[0, 1, 2,
* 3, 4, 5] * 3, 4, 5]
...@@ -123,6 +124,124 @@ void testVol2col() { ...@@ -123,6 +124,124 @@ void testVol2col() {
delete context; delete context;
} }
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <>
void testVol2col<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace>() {
paddle::framework::Tensor input;
paddle::framework::Tensor input_tmp;
paddle::framework::Tensor output;
paddle::framework::Tensor output_tmp;
auto* place = new paddle::platform::CUDAPlace();
auto* context = new paddle::platform::CUDADeviceContext(*place);
context->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*place, context->stream())
.get());
context->PartialInitWithAllocator();
/**
* input = [[0, 1, 2,
* 3, 4, 5]
* [6, 7, 8,
* 9, 10, 11]]
*
* output = [0, 1
* 1, 2
* 3, 4
* 4, 5
* 6, 7
* 7, 8
* 9, 10
* 10, 11]
*
* col2vol = [[0, 2, 2,
* 3, 8, 5]
* [6, 14, 8,
* 9, 20, 11]]
*
*/
int input_depth = 2;
int input_height = 2;
int input_width = 3;
int filter_size = 2;
std::vector<int> strides({1, 1, 1});
std::vector<int> paddings({0, 0, 0});
std::vector<int> dilations({1, 1, 1});
int output_depth =
(input_depth - filter_size + 2 * paddings[0]) / strides[0] + 1;
int output_height =
(input_height - filter_size + 2 * paddings[1]) / strides[1] + 1;
int output_width =
(input_width - filter_size + 2 * paddings[2]) / strides[2] + 1;
// Vol2Col test
float* input_ptr =
input_tmp.mutable_data<float>({1, input_depth, input_height, input_width},
paddle::platform::CPUPlace());
float arr[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
memcpy(input_ptr, arr, 12 * sizeof(float));
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
paddle::framework::TensorCopySync(input_tmp, *place, &input);
}
output.mutable_data<float>({1, filter_size, filter_size, filter_size,
output_depth, output_height, output_width},
*place);
paddle::operators::math::Vol2ColFunctor<paddle::platform::CUDADeviceContext,
float>
vol2col;
vol2col(*context, input, dilations, strides, paddings, &output);
float vol_2_col[] = {0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11};
float* out_cfo_ptr;
if (paddle::platform::is_cpu_place(*place)) {
out_cfo_ptr = output.data<float>();
} else {
paddle::framework::TensorCopySync(output, paddle::platform::CPUPlace(),
&output_tmp);
out_cfo_ptr = output_tmp.data<float>();
}
for (int i = 0; i < 16; ++i) {
EXPECT_EQ(out_cfo_ptr[i], vol_2_col[i]);
}
// Col2Vol test
float col_2_vol[] = {0, 2, 2, 3, 8, 5, 6, 14, 8, 9, 20, 11};
memset(input_ptr, 0, 12 * sizeof(float));
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
paddle::framework::TensorCopySync(input_tmp, *place, &input);
}
paddle::operators::math::Col2VolFunctor<paddle::platform::CUDADeviceContext,
float>
col2vol;
col2vol(*context, output, dilations, strides, paddings, &input);
float* in_ptr;
if (paddle::platform::is_cpu_place(*place)) {
in_ptr = input.data<float>();
} else {
paddle::framework::TensorCopySync(input, paddle::platform::CPUPlace(),
&input_tmp);
in_ptr = input_tmp.data<float>();
}
for (int i = 0; i < 12; ++i) {
EXPECT_EQ(in_ptr[i], col_2_vol[i]);
}
delete place;
delete context;
}
#endif
TEST(math, vol2col) { TEST(math, vol2col) {
testVol2col<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>(); testVol2col<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
......
...@@ -66,7 +66,8 @@ class MeanCUDAKernel : public framework::OpKernel<T> { ...@@ -66,7 +66,8 @@ class MeanCUDAKernel : public framework::OpKernel<T> {
reduce_dims.push_back(i); reduce_dims.push_back(i);
} }
TensorReduceFunctorImpl<T, T, kernel_primitives::AddFunctor, Div>( TensorReduceFunctorImpl<T, T, kernel_primitives::AddFunctor, Div>(
*input, output, Div(numel), reduce_dims, stream); context.cuda_device_context(), *input, output, Div(numel), reduce_dims,
stream);
} }
}; };
......
...@@ -57,7 +57,12 @@ class NCCLTester : public ::testing::Test { ...@@ -57,7 +57,12 @@ class NCCLTester : public ::testing::Test {
paddle::platform::CPUPlace cpu_place; paddle::platform::CPUPlace cpu_place;
for (size_t i = 0; i < gpu_list_.size(); ++i) { for (size_t i = 0; i < gpu_list_.size(); ++i) {
p::CUDAPlace place(i); p::CUDAPlace place(i);
dev_ctxs_.emplace_back(new p::CUDADeviceContext(place)); auto *ctx = new p::CUDADeviceContext(place);
ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx->stream())
.get());
ctx->PartialInitWithAllocator();
dev_ctxs_.emplace_back(ctx);
} }
NCCLInitOp(); NCCLInitOp();
......
...@@ -106,16 +106,20 @@ class PnormCUDAKernel : public framework::OpKernel<T> { ...@@ -106,16 +106,20 @@ class PnormCUDAKernel : public framework::OpKernel<T> {
using MT = typename details::MPTypeTrait<T>::Type; using MT = typename details::MPTypeTrait<T>::Type;
if (porder == 0) { if (porder == 0) {
TensorReduceFunctorImpl<T, T, kps::AddFunctor, NonzeroFunctor<T>>( TensorReduceFunctorImpl<T, T, kps::AddFunctor, NonzeroFunctor<T>>(
*in_x, out_norm, NonzeroFunctor<T>(), reduce_axis, stream); ctx.cuda_device_context(), *in_x, out_norm, NonzeroFunctor<T>(),
reduce_axis, stream);
} else if (porder == INFINITY) { } else if (porder == INFINITY) {
TensorReduceFunctorImpl<T, T, kps::MaxFunctor, AbsFunctor<T>>( TensorReduceFunctorImpl<T, T, kps::MaxFunctor, AbsFunctor<T>>(
*in_x, out_norm, AbsFunctor<T>(), reduce_axis, stream); ctx.cuda_device_context(), *in_x, out_norm, AbsFunctor<T>(),
reduce_axis, stream);
} else if (porder == -INFINITY) { } else if (porder == -INFINITY) {
TensorReduceFunctorImpl<T, T, kps::MinFunctor, AbsFunctor<T>>( TensorReduceFunctorImpl<T, T, kps::MinFunctor, AbsFunctor<T>>(
*in_x, out_norm, AbsFunctor<T>(), reduce_axis, stream); ctx.cuda_device_context(), *in_x, out_norm, AbsFunctor<T>(),
reduce_axis, stream);
} else { } else {
TensorReduceFunctorImpl<T, T, kps::AddFunctor, UnsignedPowFunctor<T>>( TensorReduceFunctorImpl<T, T, kps::AddFunctor, UnsignedPowFunctor<T>>(
*in_x, out_norm, UnsignedPowFunctor<T>(porder), reduce_axis, stream); ctx.cuda_device_context(), *in_x, out_norm,
UnsignedPowFunctor<T>(porder), reduce_axis, stream);
const framework::Tensor* tmp_norm = out_norm; const framework::Tensor* tmp_norm = out_norm;
std::vector<const framework::Tensor*> ins = {tmp_norm}; std::vector<const framework::Tensor*> ins = {tmp_norm};
......
...@@ -208,8 +208,8 @@ class PoolKernel : public framework::OpKernel<T> { ...@@ -208,8 +208,8 @@ class PoolKernel : public framework::OpKernel<T> {
auto stream = dev_ctx.stream(); auto stream = dev_ctx.stream();
TensorReduceFunctorImpl<T, T, kps::AddFunctor, TensorReduceFunctorImpl<T, T, kps::AddFunctor,
kps::DivideFunctor<T>>( kps::DivideFunctor<T>>(
*in_x, out, kps::DivideFunctor<T>(reduce_num), reduce_dim, dev_ctx, *in_x, out, kps::DivideFunctor<T>(reduce_num),
stream); reduce_dim, stream);
#else // for cpu #else // for cpu
paddle::operators::math::Pool2dFunctor< paddle::operators::math::Pool2dFunctor<
DeviceContext, paddle::operators::math::AvgPool<T>, T> DeviceContext, paddle::operators::math::AvgPool<T>, T>
......
...@@ -186,7 +186,8 @@ class CUDAPReluGradKernel : public framework::OpKernel<T> { ...@@ -186,7 +186,8 @@ class CUDAPReluGradKernel : public framework::OpKernel<T> {
} }
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>( TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
dalpha_tmp, dalpha, kps::IdentityFunctor<T>(), reduce_dims, stream); context.cuda_device_context(), dalpha_tmp, dalpha,
kps::IdentityFunctor<T>(), reduce_dims, stream);
} }
}; };
......
...@@ -222,6 +222,10 @@ TEST(SENDANDRECV, GPU) { ...@@ -222,6 +222,10 @@ TEST(SENDANDRECV, GPU) {
framework::Scope* scope = (*micro_scope)[0]; framework::Scope* scope = (*micro_scope)[0];
platform::CUDAPlace place; platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place); platform::CUDADeviceContext ctx(place);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
framework::Executor exe(place); framework::Executor exe(place);
// create var on local scope // create var on local scope
......
...@@ -39,14 +39,16 @@ namespace operators { ...@@ -39,14 +39,16 @@ namespace operators {
template <typename Tx, typename Ty, template <typename> class ReduceOp, template <typename Tx, typename Ty, template <typename> class ReduceOp,
typename TransformOp> typename TransformOp>
void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y, void TensorReduceFunctorImpl(const platform::CUDADeviceContext& dev_ctx,
const framework::Tensor& x, framework::Tensor* y,
const TransformOp& transform, const TransformOp& transform,
const std::vector<int>& origin_reduce_dims, const std::vector<int>& origin_reduce_dims,
gpuStream_t stream) { gpuStream_t stream) {
y->mutable_data<Ty>(x.place()); y->mutable_data<Ty>(x.place());
pten::kernels::TensorReduceFunctorImpl<Tx, Ty, ReduceOp, TransformOp>( pten::kernels::TensorReduceFunctorImpl<Tx, Ty, ReduceOp, TransformOp>(
x, y, transform, origin_reduce_dims, stream); static_cast<const pten::GPUContext&>(dev_ctx), x, y, transform,
origin_reduce_dims, stream);
} }
} // namespace operators } // namespace operators
......
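With the signature change above, every caller now passes the CUDA device context as the first argument while still threading the stream through explicitly. An illustrative call site, matching the pnorm, trace, and renorm hunks in this patch (ReduceSumExample and its tensor arguments are placeholder names):
template <typename T>
void ReduceSumExample(const paddle::framework::ExecutionContext& ctx,
                      const paddle::framework::Tensor& in_tensor,
                      paddle::framework::Tensor* out_tensor) {
  std::vector<int> reduce_dims = {0, 2};
  gpuStream_t stream = ctx.cuda_device_context().stream();
  // Device context first, then input/output, transform, reduce dims, stream.
  TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
      ctx.cuda_device_context(), in_tensor, out_tensor,
      kps::IdentityFunctor<T>(), reduce_dims, stream);
}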
...@@ -156,7 +156,8 @@ class CUDARenormKernel : public framework::OpKernel<T> { ...@@ -156,7 +156,8 @@ class CUDARenormKernel : public framework::OpKernel<T> {
cuda_ctx, ins, &outs, func); cuda_ctx, ins, &outs, func);
std::vector<int> reduce_axis = {0, 2}; std::vector<int> reduce_axis = {0, 2};
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>( TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
pow_value, &dim_value, kps::IdentityFunctor<T>(), reduce_axis, stream); cuda_ctx, pow_value, &dim_value, kps::IdentityFunctor<T>(), reduce_axis,
stream);
RenormKernelFunc3<T><<<grid2, block2, 0, stream>>>( RenormKernelFunc3<T><<<grid2, block2, 0, stream>>>(
numel, dim_value.mutable_data<T>(context.GetPlace()), p, max_norm); numel, dim_value.mutable_data<T>(context.GetPlace()), p, max_norm);
RenormKernelFunc4<T><<<grid, block, 0, stream>>>( RenormKernelFunc4<T><<<grid, block, 0, stream>>>(
...@@ -213,10 +214,11 @@ class CUDAGradRenormKernel : public framework::OpKernel<T> { ...@@ -213,10 +214,11 @@ class CUDAGradRenormKernel : public framework::OpKernel<T> {
dim_divisor); dim_divisor);
std::vector<int> reduce_axis = {0, 2}; std::vector<int> reduce_axis = {0, 2};
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>( TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
pow_value, &dim_value, kps::IdentityFunctor<T>(), reduce_axis, stream); ctx.cuda_device_context(), pow_value, &dim_value,
kps::IdentityFunctor<T>(), reduce_axis, stream);
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>( TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
mul_value, &weight_derivative, kps::IdentityFunctor<T>(), reduce_axis, ctx.cuda_device_context(), mul_value, &weight_derivative,
stream); kps::IdentityFunctor<T>(), reduce_axis, stream);
RenormGradKernelFunc2<T><<<grid, block, 0, stream>>>( RenormGradKernelFunc2<T><<<grid, block, 0, stream>>>(
x_data, dout_data, dx_data, numel, x_data, dout_data, dx_data, numel,
dim_value.mutable_data<T>(ctx.GetPlace()), dim_value.mutable_data<T>(ctx.GetPlace()),
......
...@@ -389,7 +389,8 @@ class ReshapeKernel { ...@@ -389,7 +389,8 @@ class ReshapeKernel {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(ctx.GetPlace())) { if (platform::is_gpu_place(ctx.GetPlace())) {
auto &dev_ctx = ctx.device_context<platform::CUDADeviceContext>(); auto &dev_ctx = ctx.device_context<platform::CUDADeviceContext>();
pten::ReshapeKernel(dev_ctx, *in, pt_scalar_shape, out); pten::ReshapeKernel(static_cast<const pten::GPUContext &>(dev_ctx), *in,
pt_scalar_shape, out);
} }
#endif #endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
...@@ -417,7 +418,8 @@ class ReshapeGradKernel { ...@@ -417,7 +418,8 @@ class ReshapeGradKernel {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(ctx.GetPlace())) { if (platform::is_gpu_place(ctx.GetPlace())) {
auto &dev_ctx = ctx.device_context<platform::CUDADeviceContext>(); auto &dev_ctx = ctx.device_context<platform::CUDADeviceContext>();
pten::ReshapeGradKernel(dev_ctx, *d_out, d_x); pten::ReshapeGradKernel(static_cast<const pten::GPUContext &>(dev_ctx),
*d_out, d_x);
} }
#endif #endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
...@@ -445,7 +447,8 @@ class ReshapeDoubleGradKernel { ...@@ -445,7 +447,8 @@ class ReshapeDoubleGradKernel {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(ctx.GetPlace())) { if (platform::is_gpu_place(ctx.GetPlace())) {
auto &dev_ctx = ctx.device_context<platform::CUDADeviceContext>(); auto &dev_ctx = ctx.device_context<platform::CUDADeviceContext>();
pten::ReshapeDoubleGradKernel(dev_ctx, *dd_x, dd_out); pten::ReshapeDoubleGradKernel(
static_cast<const pten::GPUContext &>(dev_ctx), *dd_x, dd_out);
} }
#endif #endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
......
...@@ -183,8 +183,7 @@ void GPUScatterGradForX(const platform::DeviceContext& ctx, const Tensor& index, ...@@ -183,8 +183,7 @@ void GPUScatterGradForX(const platform::DeviceContext& ctx, const Tensor& index,
int64_t max_grid_dimx = int64_t max_grid_dimx =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx) reinterpret_cast<const platform::CUDADeviceContext&>(ctx)
.GetCUDAMaxGridDimSize() .GetCUDAMaxGridDimSize()[0];
.x;
int64_t grid = height < max_grid_dimx ? height : max_grid_dimx; int64_t grid = height < max_grid_dimx ? height : max_grid_dimx;
ScatterInitCUDAKernel<T, IndexT><<< ScatterInitCUDAKernel<T, IndexT><<<
......
...@@ -46,7 +46,8 @@ void ReduceSumForSolve(const Tensor* input, Tensor* output, ...@@ -46,7 +46,8 @@ void ReduceSumForSolve(const Tensor* input, Tensor* output,
#if defined(__NVCC__) || defined(__HIPCC__) #if defined(__NVCC__) || defined(__HIPCC__)
auto stream = ctx.cuda_device_context().stream(); auto stream = ctx.cuda_device_context().stream();
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>( TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
*input, output, kps::IdentityFunctor<T>(), reduce_dims, stream); ctx.cuda_device_context(), *input, output, kps::IdentityFunctor<T>(),
reduce_dims, stream);
#else #else
ReduceKernelFunctor<DeviceContext, T, ops::SumFunctor>( ReduceKernelFunctor<DeviceContext, T, ops::SumFunctor>(
input, output, reduce_dims, keep_dim, false, ctx) input, output, reduce_dims, keep_dim, false, ctx)
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/fluid/operators/strided_memcpy.h"
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -86,6 +87,10 @@ TEST(StridedMemcpy, GPUCrop) { ...@@ -86,6 +87,10 @@ TEST(StridedMemcpy, GPUCrop) {
platform::CPUPlace cpu; platform::CPUPlace cpu;
platform::CUDADeviceContext ctx(gpu0); platform::CUDADeviceContext ctx(gpu0);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu0, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
auto src_allocation = memory::Alloc(gpu0, sizeof(src)); auto src_allocation = memory::Alloc(gpu0, sizeof(src));
...@@ -124,6 +129,10 @@ TEST(StridedMemcpy, GPUConcat) { ...@@ -124,6 +129,10 @@ TEST(StridedMemcpy, GPUConcat) {
platform::CUDAPlace gpu0(0); platform::CUDAPlace gpu0(0);
platform::CPUPlace cpu; platform::CPUPlace cpu;
platform::CUDADeviceContext ctx(gpu0); platform::CUDADeviceContext ctx(gpu0);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu0, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
auto gpu_src_allocation = memory::Alloc(gpu0, sizeof(src)); auto gpu_src_allocation = memory::Alloc(gpu0, sizeof(src));
int* gpu_src = reinterpret_cast<int*>(gpu_src_allocation->ptr()); int* gpu_src = reinterpret_cast<int*>(gpu_src_allocation->ptr());
memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream()); memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
......
...@@ -37,6 +37,10 @@ void CreateCUDATensor(framework::Scope* scope, const std::string& name, ...@@ -37,6 +37,10 @@ void CreateCUDATensor(framework::Scope* scope, const std::string& name,
tensor->Resize(dims); tensor->Resize(dims);
platform::CUDAPlace place; platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place); platform::CUDADeviceContext ctx(place);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
inference::tensorrt::RandomizeTensor(tensor, place, ctx); inference::tensorrt::RandomizeTensor(tensor, place, ctx);
} }
...@@ -133,6 +137,10 @@ void DynamicShapeTest(bool allow_build_at_runtime) { ...@@ -133,6 +137,10 @@ void DynamicShapeTest(bool allow_build_at_runtime) {
framework::Scope scope; framework::Scope scope;
platform::CUDAPlace place; platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place); platform::CUDADeviceContext ctx(place);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
// Prepare variables. // Prepare variables.
if (allow_build_at_runtime) if (allow_build_at_runtime)
CreateCUDATensor(&scope, "x", std::vector<int64_t>({3, 4, 1, 1})); CreateCUDATensor(&scope, "x", std::vector<int64_t>({3, 4, 1, 1}));
...@@ -159,6 +167,10 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { ...@@ -159,6 +167,10 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
framework::Scope scope; framework::Scope scope;
platform::CUDAPlace place; platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place); platform::CUDADeviceContext ctx(place);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
auto* block_ = program.Proto()->add_blocks(); auto* block_ = program.Proto()->add_blocks();
block_->set_idx(0); block_->set_idx(0);
......
...@@ -411,7 +411,7 @@ bool SortTopk(const platform::CUDADeviceContext& ctx, ...@@ -411,7 +411,7 @@ bool SortTopk(const platform::CUDADeviceContext& ctx,
}; };
int block_size = ComputeBlockSize(num_cols); int block_size = ComputeBlockSize(num_cols);
unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x; unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0];
// actually, int num_rows < max_grid_size // actually, int num_rows < max_grid_size
unsigned int grid_size = num_rows < maxGridDimX unsigned int grid_size = num_rows < maxGridDimX
? static_cast<unsigned int>(num_rows) ? static_cast<unsigned int>(num_rows)
......
...@@ -40,7 +40,8 @@ class TraceCUDAKernel : public framework::OpKernel<T> { ...@@ -40,7 +40,8 @@ class TraceCUDAKernel : public framework::OpKernel<T> {
std::vector<int> reduce_dims; std::vector<int> reduce_dims;
reduce_dims.push_back(out->dims().size()); reduce_dims.push_back(out->dims().size());
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>( TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
diag, out, kps::IdentityFunctor<T>(), reduce_dims, stream); context.cuda_device_context(), diag, out, kps::IdentityFunctor<T>(),
reduce_dims, stream);
} else { } else {
math::SetConstant<DeviceContext, T> functor; math::SetConstant<DeviceContext, T> functor;
functor(context.device_context<DeviceContext>(), out, static_cast<T>(0)); functor(context.device_context<DeviceContext>(), out, static_cast<T>(0));
......
...@@ -45,7 +45,8 @@ class MatrixReduceSumFunctor<platform::CUDADeviceContext, T> { ...@@ -45,7 +45,8 @@ class MatrixReduceSumFunctor<platform::CUDADeviceContext, T> {
} }
gpuStream_t stream = ctx.cuda_device_context().stream(); gpuStream_t stream = ctx.cuda_device_context().stream();
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>( TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
in, out, kps::IdentityFunctor<T>(), out_reduce_dims, stream); ctx.cuda_device_context(), in, out, kps::IdentityFunctor<T>(),
out_reduce_dims, stream);
} }
}; };
......
...@@ -148,7 +148,7 @@ struct Argmax<platform::CUDADeviceContext, T, IndType> { ...@@ -148,7 +148,7 @@ struct Argmax<platform::CUDADeviceContext, T, IndType> {
} }
const auto& dev_ctx = ctx.cuda_device_context(); const auto& dev_ctx = ctx.cuda_device_context();
auto cu_stream = dev_ctx.stream(); auto cu_stream = dev_ctx.stream();
int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x; int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
int64_t height = pre * post; int64_t height = pre * post;
int64_t width = n; int64_t width = n;
int64_t grid_size = height < max_grid_dimx ? height : max_grid_dimx; int64_t grid_size = height < max_grid_dimx ? height : max_grid_dimx;
......
...@@ -138,6 +138,7 @@ if(WITH_CNCL) ...@@ -138,6 +138,7 @@ if(WITH_CNCL)
endif() endif()
if(WITH_GPU OR WITH_ROCM) if(WITH_GPU OR WITH_ROCM)
target_link_libraries(device_context gpu_info gpu_context pten_gpu_info)
target_link_libraries(device_context gpu_resource_pool) target_link_libraries(device_context gpu_resource_pool)
endif() endif()
......
...@@ -66,6 +66,10 @@ TEST(bfloat16, lod_tensor_on_gpu) { ...@@ -66,6 +66,10 @@ TEST(bfloat16, lod_tensor_on_gpu) {
// CPU LoDTensor to GPU LoDTensor // CPU LoDTensor to GPU LoDTensor
CUDAPlace gpu_place(0); CUDAPlace gpu_place(0);
CUDADeviceContext gpu_ctx(gpu_place); CUDADeviceContext gpu_ctx(gpu_place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, gpu_ctx.stream())
.get());
gpu_ctx.PartialInitWithAllocator();
framework::TensorCopy(src_tensor, gpu_place, gpu_ctx, &gpu_tensor); framework::TensorCopy(src_tensor, gpu_place, gpu_ctx, &gpu_tensor);
// GPU LoDTensor to CPU LoDTensor // GPU LoDTensor to CPU LoDTensor
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/collective_helper.h"
#include <utility> #include <utility>
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h" #include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h"
...@@ -187,6 +188,18 @@ NCCLComm* NCCLCommContext::AssignNCCLComm(ncclComm_t comm, int nranks, int rank, ...@@ -187,6 +188,18 @@ NCCLComm* NCCLCommContext::AssignNCCLComm(ncclComm_t comm, int nranks, int rank,
int dev_id, int ring_id) { int dev_id, int ring_id) {
std::unique_ptr<CUDADeviceContext> dev_ctx( std::unique_ptr<CUDADeviceContext> dev_ctx(
new CUDADeviceContext(CUDAPlace(dev_id))); new CUDADeviceContext(CUDAPlace(dev_id)));
dev_ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(CUDAPlace(dev_id), dev_ctx->stream())
.get());
dev_ctx->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(CUDAPlace(dev_id))
.get());
dev_ctx->PartialInitWithAllocator();
std::shared_ptr<platform::CudaEventObject> compute_event( std::shared_ptr<platform::CudaEventObject> compute_event(
platform::CudaEventResourcePool::Instance().New(dev_id)); platform::CudaEventResourcePool::Instance().New(dev_id));
...@@ -329,7 +342,7 @@ BKCLComm* BKCLCommContext::AssignBKCLComm(BKCLContext_t comm, int nranks, ...@@ -329,7 +342,7 @@ BKCLComm* BKCLCommContext::AssignBKCLComm(BKCLContext_t comm, int nranks,
auto* dev_ctx = static_cast<platform::XPUDeviceContext*>( auto* dev_ctx = static_cast<platform::XPUDeviceContext*>(
platform::DeviceContextPool::Instance().Get( platform::DeviceContextPool::Instance().Get(
platform::XPUPlace(dev_id))); platform::XPUPlace(dev_id)));
dev_ctx->set_bkcl_context(comm); dev_ctx->SetBkclContext(comm);
} }
return comm_map_[ring_id][dev_id].get(); return comm_map_[ring_id][dev_id].get();
......
IF(WITH_GPU) IF(WITH_GPU)
add_subdirectory(cuda) add_subdirectory(cuda)
nv_library(gpu_info SRCS gpu_info.cc DEPS cuda_info gflags glog enforce monitor dynload_cuda) nv_library(gpu_info SRCS gpu_info.cc DEPS pten_gpu_info gflags glog enforce monitor dynload_cuda)
nv_test(cuda_helper_test SRCS cuda_helper_test.cu) nv_test(cuda_helper_test SRCS cuda_helper_test.cu)
nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda) nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda)
ELSEIF(WITH_ROCM) ELSEIF(WITH_ROCM)
add_subdirectory(rocm) add_subdirectory(rocm)
hip_library(gpu_info SRCS gpu_info.cc DEPS rocm_info gflags glog enforce monitor dynload_cuda) hip_library(gpu_info SRCS gpu_info.cc DEPS pten_gpu_info gflags glog enforce monitor dynload_cuda)
hip_test(cuda_helper_test SRCS cuda_helper_test.cu) hip_test(cuda_helper_test SRCS cuda_helper_test.cu)
hip_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda) hip_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda)
......
nv_library(cuda_info SRCS cuda_info.cc DEPS gflags glog enforce monitor dynload_cuda)
nv_library(cuda_graph SRCS cuda_graph.cc DEPS enforce allocator_facade) nv_library(cuda_graph SRCS cuda_graph.cc DEPS enforce allocator_facade)
nv_library(cuda_profiler SRCS cuda_profiler.cc DEPS enforce) nv_library(cuda_profiler SRCS cuda_profiler.cc DEPS enforce)
......
...@@ -14,8 +14,10 @@ ...@@ -14,8 +14,10 @@
#pragma once #pragma once
#include <functional>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include "paddle/fluid/platform/device/gpu/gpu_types.h"
#include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/dynload/cublas.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/macros.h"
...@@ -96,8 +98,7 @@ class CublasHandleHolder { ...@@ -96,8 +98,7 @@ class CublasHandleHolder {
PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasDestroy(handle_)); PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasDestroy(handle_));
} }
template <typename Callback> inline void Call(const std::function<void(blasHandle_t)>& callback) const {
inline void Call(Callback&& callback) const {
std::lock_guard<std::mutex> guard(mtx_); std::lock_guard<std::mutex> guard(mtx_);
callback(handle_); callback(handle_);
} }
......
...@@ -14,11 +14,13 @@ limitations under the License. */ ...@@ -14,11 +14,13 @@ limitations under the License. */
#pragma once #pragma once
#include <functional>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/cusparse.h" #include "paddle/fluid/platform/dynload/cusparse.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/macros.h"
#include "paddle/pten/backends/gpu/gpu_decls.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -45,8 +47,8 @@ class CusparseHandleHolder { ...@@ -45,8 +47,8 @@ class CusparseHandleHolder {
#endif #endif
} }
template <typename Callback> inline void Call(
inline void Call(Callback&& callback) const { const std::function<void(pten::sparseHandle_t)>& callback) const {
std::lock_guard<std::mutex> guard(mtx_); std::lock_guard<std::mutex> guard(mtx_);
callback(handle_); callback(handle_);
} }
......
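Both handle holders above replace the forwarding-template Call with one taking a std::function, so the handle type is fixed at the interface rather than deduced per call site. A hedged sketch of a caller, where blas_holder is an assumed CublasHandleHolder instance:
blas_holder.Call([&](blasHandle_t handle) {
  // Issue cuBLAS/rocBLAS work against `handle`; the lambda converts to the
  // std::function<void(blasHandle_t)> parameter required by the new signature.
});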
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include <array>
#include <cstdlib> #include <cstdlib>
#include <mutex> #include <mutex>
#include <set> #include <set>
...@@ -39,11 +40,12 @@ limitations under the License. */ ...@@ -39,11 +40,12 @@ limitations under the License. */
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/split.h" #include "paddle/fluid/string/split.h"
#include "paddle/pten/backends/gpu/gpu_info.h"
DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb); DECLARE_uint64(initial_gpu_memory_in_mb);
DECLARE_uint64(reallocate_gpu_memory_in_mb); DECLARE_uint64(reallocate_gpu_memory_in_mb);
DECLARE_bool(enable_cublas_tensor_op_math); DECLARE_bool(enable_cublas_tensor_op_math);
DECLARE_string(selected_gpus);
DECLARE_uint64(gpu_memory_limit_mb); DECLARE_uint64(gpu_memory_limit_mb);
constexpr static float fraction_reserve_gpu_memory = 0.05f; constexpr static float fraction_reserve_gpu_memory = 0.05f;
...@@ -51,23 +53,6 @@ constexpr static float fraction_reserve_gpu_memory = 0.05f; ...@@ -51,23 +53,6 @@ constexpr static float fraction_reserve_gpu_memory = 0.05f;
USE_GPU_MEM_STAT; USE_GPU_MEM_STAT;
namespace paddle { namespace paddle {
namespace platform { namespace platform {
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetSelectedDevices() {
// use user specified GPUs in single-node multi-process mode.
std::vector<int> devices;
if (!FLAGS_selected_gpus.empty()) {
auto devices_str = paddle::string::Split(FLAGS_selected_gpus, ',');
for (auto id : devices_str) {
devices.push_back(atoi(id.c_str()));
}
} else {
int count = GetGPUDeviceCount();
for (int i = 0; i < count; ++i) {
devices.push_back(i);
}
}
return devices;
}
void GpuMemoryUsage(size_t *available, size_t *total) { void GpuMemoryUsage(size_t *available, size_t *total) {
size_t actual_available, actual_total; size_t actual_available, actual_total;
...@@ -382,5 +367,91 @@ void *GetGpuBasePtr(void *ptr, int dev_id) { ...@@ -382,5 +367,91 @@ void *GetGpuBasePtr(void *ptr, int dev_id) {
return RecordedGpuMallocHelper::Instance(dev_id)->GetBasePtr(ptr); return RecordedGpuMallocHelper::Instance(dev_id)->GetBasePtr(ptr);
} }
int DnnVersion() { return pten::backends::gpu::DnnVersion(); }
int GetGPUDeviceCount() { return pten::backends::gpu::GetGPUDeviceCount(); }
int GetGPUComputeCapability(int id) {
return pten::backends::gpu::GetGPUComputeCapability(id);
}
int GetGPURuntimeVersion(int id) {
return pten::backends::gpu::GetGPURuntimeVersion(id);
}
int GetGPUDriverVersion(int id) {
return pten::backends::gpu::GetGPUDriverVersion(id);
}
bool TensorCoreAvailable() {
return pten::backends::gpu::TensorCoreAvailable();
}
int GetGPUMultiProcessors(int id) {
return pten::backends::gpu::GetGPUMultiProcessors(id);
}
int GetGPUMaxThreadsPerMultiProcessor(int id) {
return pten::backends::gpu::GetGPUMaxThreadsPerMultiProcessor(id);
}
int GetGPUMaxThreadsPerBlock(int id) {
return pten::backends::gpu::GetGPUMaxThreadsPerBlock(id);
}
int GetCurrentDeviceId() { return pten::backends::gpu::GetCurrentDeviceId(); }
std::array<int, 3> GetGpuMaxGridDimSize(int id) {
return pten::backends::gpu::GetGpuMaxGridDimSize(id);
}
std::vector<int> GetSelectedDevices() {
return pten::backends::gpu::GetSelectedDevices();
}
const gpuDeviceProp &GetDeviceProperties(int id) {
return pten::backends::gpu::GetDeviceProperties(id);
}
void SetDeviceId(int device_id) { pten::backends::gpu::SetDeviceId(device_id); }
gpuError_t GpuGetLastError() { return pten::backends::gpu::GpuGetLastError(); }
void GpuStreamSync(gpuStream_t stream) {
pten::backends::gpu::GpuStreamSync(stream);
}
void GpuDestroyStream(gpuStream_t stream) {
pten::backends::gpu::GpuDestroyStream(stream);
}
void GpuDeviceSync() { pten::backends::gpu::GpuDeviceSync(); }
void GpuMemcpyAsync(void *dst, const void *src, size_t count,
gpuMemcpyKind kind, gpuStream_t stream) {
pten::backends::gpu::GpuMemcpyAsync(dst, src, count, kind, stream);
}
void GpuMemcpySync(void *dst, const void *src, size_t count,
gpuMemcpyKind kind) {
pten::backends::gpu::GpuMemcpySync(dst, src, count, kind);
}
void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
int src_device, size_t count, gpuStream_t stream) {
pten::backends::gpu::GpuMemcpyPeerAsync(dst, dst_device, src, src_device,
count, stream);
}
void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
int src_device, size_t count) {
pten::backends::gpu::GpuMemcpyPeerSync(dst, dst_device, src, src_device,
count);
}
void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) {
pten::backends::gpu::GpuMemsetAsync(dst, value, count, stream);
}
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
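The wrappers above keep their platform:: names but now forward to pten::backends::gpu. Note that GetGpuMaxGridDimSize returns std::array<int, 3> rather than dim3, which is why kernels earlier in this patch switch from `.x` to `[0]`. A small sketch (MaxGridDimX is an invented helper name):
int MaxGridDimX(int dev_id) {
  // std::array<int, 3> {grid_x, grid_y, grid_z} replaces the old dim3 result,
  // hence the ".x" -> "[0]" changes in the kernels above.
  return paddle::platform::GetGpuMaxGridDimSize(dev_id)[0];
}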
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include <stddef.h> #include <stddef.h>
#include <array>
#include <string> #include <string>
#include <vector> #include <vector>
...@@ -52,7 +53,7 @@ int GetGPUMaxThreadsPerBlock(int id); ...@@ -52,7 +53,7 @@ int GetGPUMaxThreadsPerBlock(int id);
int GetCurrentDeviceId(); int GetCurrentDeviceId();
//! Get the maximum GridDim size for GPU buddy allocator. //! Get the maximum GridDim size for GPU buddy allocator.
dim3 GetGpuMaxGridDimSize(int); std::array<int, 3> GetGpuMaxGridDimSize(int);
//! Get a list of device ids from environment variable or use all. //! Get a list of device ids from environment variable or use all.
std::vector<int> GetSelectedDevices(); std::vector<int> GetSelectedDevices();
...@@ -110,7 +111,7 @@ void GpuStreamSync(gpuStream_t stream); ...@@ -110,7 +111,7 @@ void GpuStreamSync(gpuStream_t stream);
void GpuDestroyStream(gpuStream_t stream); void GpuDestroyStream(gpuStream_t stream);
// ! Blocks until device has completed all operations. // ! Blocks until device has completed all operations.
void GpuDeviceync(); void GpuDeviceSync();
//! CudaMalloc with recorded info //! CudaMalloc with recorded info
gpuError_t RecordedGpuMalloc(void **ptr, size_t size, int dev_id); gpuError_t RecordedGpuMalloc(void **ptr, size_t size, int dev_id);
......
...@@ -83,8 +83,21 @@ struct NCCLContext { ...@@ -83,8 +83,21 @@ struct NCCLContext {
std::unique_ptr<CUDADeviceContext> ctx_; std::unique_ptr<CUDADeviceContext> ctx_;
ncclComm_t comm_; ncclComm_t comm_;
explicit NCCLContext(int dev_id) explicit NCCLContext(int dev_id) : comm_{nullptr} {
: ctx_(new CUDADeviceContext(CUDAPlace(dev_id))), comm_{nullptr} {} ctx_.reset(new CUDADeviceContext(CUDAPlace(dev_id)));
ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(CUDAPlace(dev_id), ctx_->stream())
.get());
ctx_->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
ctx_->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(CUDAPlace(dev_id))
.get());
ctx_->PartialInitWithAllocator();
}
gpuStream_t stream() const { return ctx_->stream(); } gpuStream_t stream() const { return ctx_->stream(); }
ncclComm_t comm() const { return comm_; } ncclComm_t comm() const { return comm_; }
......
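NCCLContext and AssignNCCLComm above now wire all three allocators before PartialInitWithAllocator. The sequence, condensed into a hypothetical free function (the individual calls are copied from the hunks above):
void WireAllocators(paddle::platform::CUDADeviceContext* ctx,
                    const paddle::platform::CUDAPlace& place) {
  // Device allocator is stream-bound; host and zero allocators are not.
  ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                        .GetAllocator(place, ctx->stream())
                        .get());
  ctx->SetHostAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                            .GetAllocator(paddle::platform::CPUPlace())
                            .get());
  ctx->SetZeroAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                            .GetZeroAllocator(place)
                            .get());
  ctx->PartialInitWithAllocator();
}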
hip_library(rocm_info SRCS rocm_info.cc DEPS gflags glog enforce monitor dynload_cuda)
hip_test(miopen_helper_test SRCS miopen_helper_test.cc DEPS dynload_cuda) hip_test(miopen_helper_test SRCS miopen_helper_test.cc DEPS dynload_cuda)
...@@ -10,8 +10,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -10,8 +10,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include <functional>
#include <memory> #include <memory>
#include <set> #include <set>
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/stream/cuda_stream.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h" #include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
...@@ -149,16 +153,17 @@ inline void EmplaceDeviceContext( ...@@ -149,16 +153,17 @@ inline void EmplaceDeviceContext(
cuda_ctx, cuda_ctx,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"Failed to dynamic_cast dev_ctx into CUDADeviceContext.")); "Failed to dynamic_cast dev_ctx into CUDADeviceContext."));
dev_ctx->SetDeviceAllocator( // Note: A trick method to init context, why GetAllocator interface
memory::allocation::AllocatorFacade::Instance() // needs a stream parameter?
.GetAllocator(p, cuda_ctx->context()->RawStream()) dev_ctx->SetAllocator(memory::allocation::AllocatorFacade::Instance()
.get()); .GetAllocator(p, cuda_ctx->stream())
.get());
cuda_ctx->PartialInitWithAllocator();
#endif #endif
} else { } else {
dev_ctx->SetDeviceAllocator( dev_ctx->SetAllocator(memory::allocation::AllocatorFacade::Instance()
memory::allocation::AllocatorFacade::Instance() .GetAllocator(p)
.GetAllocator(p) .get());
.get());
} }
dev_ctx->SetHostAllocator( dev_ctx->SetHostAllocator(
memory::allocation::AllocatorFacade::Instance() memory::allocation::AllocatorFacade::Instance()
...@@ -251,14 +256,18 @@ DeviceContextPool::DeviceContextPool( ...@@ -251,14 +256,18 @@ DeviceContextPool::DeviceContextPool(
} }
} }
CPUDeviceContext::CPUDeviceContext() : pten::CPUContext() {} CPUDeviceContext::CPUDeviceContext() : pten::CPUContext() {
pten::CPUContext::Init();
}
CPUDeviceContext::CPUDeviceContext(CPUPlace place) : pten::CPUContext() {} CPUDeviceContext::CPUDeviceContext(CPUPlace place) : pten::CPUContext(place) {
pten::CPUContext::Init();
}
#ifdef PADDLE_WITH_IPU #ifdef PADDLE_WITH_IPU
IPUDeviceContext::IPUDeviceContext(IPUPlace place) : place_(place) {} IPUDeviceContext::IPUDeviceContext(IPUPlace place) : place_(place) {}
Place IPUDeviceContext::GetPlace() const { return place_; } const Place& IPUDeviceContext::GetPlace() const { return place_; }
void IPUDeviceContext::Wait() const { void IPUDeviceContext::Wait() const {
/*! \brief Wait for all operations completion in the stream. */ /*! \brief Wait for all operations completion in the stream. */
...@@ -268,11 +277,14 @@ IPUDeviceContext::~IPUDeviceContext() {} ...@@ -268,11 +277,14 @@ IPUDeviceContext::~IPUDeviceContext() {}
#endif #endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
XPUDeviceContext::XPUDeviceContext() : pten::XPUContext() {} XPUDeviceContext::XPUDeviceContext() : pten::XPUContext() {
pten::XPUContext::Init();
}
XPUDeviceContext::~XPUDeviceContext() {} XPUDeviceContext::~XPUDeviceContext() {}
XPUDeviceContext::XPUDeviceContext(XPUPlace place) : pten::XPUContext(place) { XPUDeviceContext::XPUDeviceContext(XPUPlace place) : pten::XPUContext(place) {
pten::XPUContext::Init();
LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: " LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: "
<< static_cast<int>(place.device); << static_cast<int>(place.device);
} }
...@@ -302,7 +314,7 @@ void NPUDeviceContext::Wait() const { ...@@ -302,7 +314,7 @@ void NPUDeviceContext::Wait() const {
aclrtStream NPUDeviceContext::stream() const { return stream_->raw_stream(); } aclrtStream NPUDeviceContext::stream() const { return stream_->raw_stream(); }
Place NPUDeviceContext::GetPlace() const { return place_; } const Place& NPUDeviceContext::GetPlace() const { return place_; }
aclrtContext NPUDeviceContext::context() const { return context_; } aclrtContext NPUDeviceContext::context() const { return context_; }
...@@ -319,7 +331,7 @@ Eigen::DefaultDevice* NPUPinnedDeviceContext::eigen_device() const { ...@@ -319,7 +331,7 @@ Eigen::DefaultDevice* NPUPinnedDeviceContext::eigen_device() const {
return eigen_device_.get(); return eigen_device_.get();
} }
Place NPUPinnedDeviceContext::GetPlace() const { return place_; } const Place& NPUPinnedDeviceContext::GetPlace() const { return place_; }
#endif #endif
...@@ -470,102 +482,28 @@ CUDAContext::~CUDAContext() { ...@@ -470,102 +482,28 @@ CUDAContext::~CUDAContext() {
#endif #endif
} }
CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) { CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
CUDADeviceGuard guard(place_.device); : pten::GPUContext(place) {
compute_capability_ = GetGPUComputeCapability(place_.device); pten::GPUContext::PartialInitWithoutAllocator();
multi_process_ = GetGPUMultiProcessors(place_.device); cuda_stream_.reset(
max_threads_per_mp_ = GetGPUMaxThreadsPerMultiProcessor(place_.device); new stream::CUDAStream(pten::GPUContext::stream(), this->GetPlace()));
max_grid_dim_size_ = GetGpuMaxGridDimSize(place_.device);
max_threads_per_block_ = GetGPUMaxThreadsPerBlock(place_.device);
driver_version_ = GetGPUDriverVersion(place_.device);
runtime_version_ = GetGPURuntimeVersion(place_.device);
LOG_FIRST_N(WARNING, 1) << "Please NOTE: device: "
<< static_cast<int>(place_.device)
<< ", GPU Compute Capability: "
<< compute_capability_ / 10 << "."
<< compute_capability_ % 10
<< ", Driver API Version: " << driver_version_ / 1000
<< "." << (driver_version_ % 100) / 10
<< ", Runtime API Version: "
<< runtime_version_ / 1000 << "."
<< (runtime_version_ % 100) / 10;
#ifdef PADDLE_WITH_HIP
size_t version_major, version_minor, version_patch;
PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenGetVersion(
&version_major, &version_minor, &version_patch));
LOG_FIRST_N(WARNING, 1) << "device: " << static_cast<int>(place_.device)
<< ", MIOpen Version: " << version_major << "."
<< version_minor << "." << version_patch;
#else
size_t cudnn_dso_ver = dynload::cudnnGetVersion();
LOG_FIRST_N(WARNING, 1) << "device: " << static_cast<int>(place_.device)
<< ", cuDNN Version: " << cudnn_dso_ver / 1000 << "."
<< (cudnn_dso_ver % 1000) / 100 << ".";
#endif
{
// Check CUDA/CUDNN version compatiblity
auto local_cuda_version =
(driver_version_ / 1000) * 10 + (driver_version_ % 100) / 10;
#ifdef PADDLE_WITH_HIP
auto compile_cuda_version = (HIP_VERSION / 100) * 10 + (HIP_VERSION % 10);
#else
auto compile_cuda_version =
(CUDA_VERSION / 1000) * 10 + (CUDA_VERSION % 100) / 10;
#endif
if (local_cuda_version < compile_cuda_version) {
LOG_FIRST_N(WARNING, 1)
<< "WARNING: device: " << static_cast<int>(place_.device)
<< ". The installed Paddle is compiled with CUDA "
<< compile_cuda_version / 10 << "." << compile_cuda_version % 10
<< ", but CUDA runtime version in your machine is "
<< local_cuda_version / 10 << "." << local_cuda_version % 10
<< ", which may cause serious incompatible bug. "
<< "Please recompile or reinstall Paddle with compatible CUDA "
"version.";
}
}
default_ctx_.reset(new CUDAContext(place_));
}
CUDADeviceContext::~CUDADeviceContext() {
SetDeviceId(place_.device);
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
if (nccl_comm_) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclCommDestroy(nccl_comm_));
}
#endif
}
Place CUDADeviceContext::GetPlace() const { return place_; }
void CUDADeviceContext::Wait() const { context()->Stream()->Wait(); }
int CUDADeviceContext::GetComputeCapability() const {
return compute_capability_;
}
int CUDADeviceContext::GetMaxPhysicalThreadCount() const {
return multi_process_ * max_threads_per_mp_;
} }
int CUDADeviceContext::GetSMCount() const { return multi_process_; } CUDADeviceContext::~CUDADeviceContext() = default;
int CUDADeviceContext::GetMaxThreadsPerBlock() const {
return max_threads_per_block_;
}
Eigen::GpuDevice* CUDADeviceContext::eigen_device() const { Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
return context()->EigenDevice().get(); if (thread_ctx_.count(this)) {
} return context()->EigenDevice().get();
}
bool CUDADeviceContext::tensor_core_available() const { return pten::GPUContext::eigen_device();
return context()->CublasTensorCoreHandle() != nullptr;
} }
dim3 CUDADeviceContext::GetCUDAMaxGridDimSize() const { void CUDADeviceContext::Wait() const {
return max_grid_dim_size_; if (thread_ctx_.count(this)) {
context()->Stream()->Wait();
return;
}
pten::GPUContext::Wait();
} }
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
...@@ -573,33 +511,96 @@ miopenHandle_t CUDADeviceContext::cudnn_handle() const { ...@@ -573,33 +511,96 @@ miopenHandle_t CUDADeviceContext::cudnn_handle() const {
#else #else
cudnnHandle_t CUDADeviceContext::cudnn_handle() const { cudnnHandle_t CUDADeviceContext::cudnn_handle() const {
#endif #endif
return context()->CudnnHandle(); if (thread_ctx_.count(this)) {
return context()->CudnnHandle();
}
return pten::GPUContext::cudnn_handle();
} }
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
rocblas_handle CUDADeviceContext::cublas_handle() const { rocblas_handle CUDADeviceContext::cublas_handle() const {
return context()->CublasHandle()->GetCublasHandle(); if (thread_ctx_.count(this)) {
return context()->CublasHandle()->GetCublasHandle();
}
return pten::GPUContext::cublas_handle();
} }
#else #else
cublasHandle_t CUDADeviceContext::cublas_handle() const { cublasHandle_t CUDADeviceContext::cublas_handle() const {
return context()->CublasHandle()->GetCublasHandle(); if (thread_ctx_.count(this)) {
return context()->CublasHandle()->GetCublasHandle();
}
return pten::GPUContext::cublas_handle();
} }
cusparseHandle_t CUDADeviceContext::cusparse_handle() const { cusparseHandle_t CUDADeviceContext::cusparse_handle() const {
return context()->CusparseHandle()->GetCusparseHandle(); if (thread_ctx_.count(this)) {
return context()->CusparseHandle()->GetCusparseHandle();
}
return pten::GPUContext::cusparse_handle();
}
cusolverDnHandle_t CUDADeviceContext::cusolver_dn_handle() const {
if (thread_ctx_.count(this)) {
return context()->CusolverDnHandle();
}
return pten::GPUContext::cusolver_dn_handle();
} }
#endif #endif
void CUDADeviceContext::RecordEvent(
gpuEvent_t ev, const std::function<void()>& callback) const {
if (thread_ctx_.count(this)) {
context()->Stream()->RecordEvent(ev, callback);
return;
}
pten::GPUContext::RecordEvent(ev, callback);
}
void CUDADeviceContext::AddStreamCallback(
const std::function<void()>& callback) const {
if (thread_ctx_.count(this)) {
context()->Stream()->AddCallback(callback);
return;
}
pten::GPUContext::AddStreamCallback(callback);
}
void CUDADeviceContext::WaitStreamCallback() const {
if (thread_ctx_.count(this)) {
context()->Stream()->WaitCallback();
return;
}
pten::GPUContext::WaitStreamCallback();
}
CudnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const { CudnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const {
return CudnnWorkspaceHandle(*this, &cudnn_handle_mtx_); return CudnnWorkspaceHandle(*this, &cudnn_handle_mtx_);
} }
#ifndef PADDLE_WITH_HIP gpuStream_t CUDADeviceContext::stream() const {
cusolverDnHandle_t CUDADeviceContext::cusolver_dn_handle() const { if (thread_ctx_.count(this)) {
return context()->CusolverDnHandle(); return context()->RawStream();
}
return pten::GPUContext::stream();
} }
#endif
gpuStream_t CUDADeviceContext::stream() const { return context()->RawStream(); } std::shared_ptr<CUDAContext> CUDADeviceContext::context() const {
if (!thread_ctx_.count(this)) {
PADDLE_THROW(platform::errors::PermissionDenied(
"CUDADeviceContext call context() failed, make sure in the "
"thread_local semantic."));
}
return thread_ctx_.at(this);
}
stream::CUDAStream* CUDADeviceContext::GetCudaStream() const {
return cuda_stream_.get();
}
stream::CUDAStream* CUDADeviceContext::SetCudaStream(
stream::CUDAStream* new_stream_ptr) {
auto* old_stream_ptr = cuda_stream_.release();
cuda_stream_.reset(new_stream_ptr);
return old_stream_ptr;
}
CUDAPinnedDeviceContext::CUDAPinnedDeviceContext() { CUDAPinnedDeviceContext::CUDAPinnedDeviceContext() {
eigen_device_.reset(new Eigen::DefaultDevice()); eigen_device_.reset(new Eigen::DefaultDevice());
...@@ -614,7 +615,7 @@ Eigen::DefaultDevice* CUDAPinnedDeviceContext::eigen_device() const { ...@@ -614,7 +615,7 @@ Eigen::DefaultDevice* CUDAPinnedDeviceContext::eigen_device() const {
return eigen_device_.get(); return eigen_device_.get();
} }
Place CUDAPinnedDeviceContext::GetPlace() const { return place_; } const Place& CUDAPinnedDeviceContext::GetPlace() const { return place_; }
#endif #endif
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
......
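Taken together with the constructor change above, CUDADeviceContext is now a pten::GPUContext that falls back to the base implementation whenever no thread-local CUDAContext has been installed. A hedged sketch of what this buys fluid code, mirroring the reshape_op hunks earlier in this patch (CallPtenKernel is an illustrative name):
void CallPtenKernel(const paddle::framework::ExecutionContext& ctx) {
  auto& dev_ctx = ctx.device_context<paddle::platform::CUDADeviceContext>();
  // Valid because CUDADeviceContext now derives from pten::GPUContext; this is
  // the same cast the reshape_op changes rely on.
  const pten::GPUContext& gpu_ctx = static_cast<const pten::GPUContext&>(dev_ctx);
  (void)gpu_ctx;  // hand gpu_ctx to any pten kernel expecting a GPUContext
}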
...@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and ...@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <functional>
#include <future> // NOLINT #include <future> // NOLINT
#include <memory> #include <memory>
#include <mutex> // NOLINT #include <mutex> // NOLINT
...@@ -18,7 +19,9 @@ limitations under the License. */ ...@@ -18,7 +19,9 @@ limitations under the License. */
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/platform/device/gpu/gpu_types.h"
#include "paddle/pten/backends/cpu/cpu_context.h" #include "paddle/pten/backends/cpu/cpu_context.h"
#include "paddle/pten/backends/gpu/gpu_decls.h"
#include "paddle/pten/core/device_context.h" #include "paddle/pten/core/device_context.h"
#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/memory/malloc.h"
...@@ -28,6 +31,7 @@ limitations under the License. */ ...@@ -28,6 +31,7 @@ limitations under the License. */
#include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/dynload/cudnn.h"
#include "paddle/fluid/platform/dynload/cusolver.h" #include "paddle/fluid/platform/dynload/cusolver.h"
#include "paddle/fluid/platform/dynload/cusparse.h" #include "paddle/fluid/platform/dynload/cusparse.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/dynload/nccl.h" #include "paddle/fluid/platform/dynload/nccl.h"
#endif #endif
...@@ -38,6 +42,7 @@ limitations under the License. */ ...@@ -38,6 +42,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_helper.h" // NOLINT #include "paddle/fluid/platform/device/gpu/gpu_helper.h" // NOLINT
#include "paddle/fluid/platform/dynload/miopen.h" #include "paddle/fluid/platform/dynload/miopen.h"
#include "paddle/fluid/platform/dynload/rocblas.h" #include "paddle/fluid/platform/dynload/rocblas.h"
#include "paddle/pten/backends/gpu/gpu_context.h" // NOLINT
#if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) #if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/platform/dynload/rccl.h" #include "paddle/fluid/platform/dynload/rccl.h"
#endif #endif
...@@ -145,7 +150,7 @@ class IPUDeviceContext : public DeviceContext { ...@@ -145,7 +150,7 @@ class IPUDeviceContext : public DeviceContext {
explicit IPUDeviceContext(IPUPlace place); explicit IPUDeviceContext(IPUPlace place);
virtual ~IPUDeviceContext(); virtual ~IPUDeviceContext();
Eigen::DefaultDevice* eigen_device() const { return nullptr; } Eigen::DefaultDevice* eigen_device() const { return nullptr; }
Place GetPlace() const override; const Place& GetPlace() const override;
/*! \brief Wait for all operations completion in the stream. */ /*! \brief Wait for all operations completion in the stream. */
void Wait() const override; void Wait() const override;
...@@ -187,7 +192,7 @@ class NPUDeviceContext : public DeviceContext { ...@@ -187,7 +192,7 @@ class NPUDeviceContext : public DeviceContext {
explicit NPUDeviceContext(NPUPlace place); explicit NPUDeviceContext(NPUPlace place);
virtual ~NPUDeviceContext(); virtual ~NPUDeviceContext();
Eigen::DefaultDevice* eigen_device() const { return nullptr; } Eigen::DefaultDevice* eigen_device() const { return nullptr; }
Place GetPlace() const override; const Place& GetPlace() const override;
aclrtContext context() const; aclrtContext context() const;
/*! \brief Wait for all operations completion in the stream. */ /*! \brief Wait for all operations completion in the stream. */
...@@ -247,7 +252,7 @@ class NPUPinnedDeviceContext : public DeviceContext { ...@@ -247,7 +252,7 @@ class NPUPinnedDeviceContext : public DeviceContext {
NPUPinnedDeviceContext(); NPUPinnedDeviceContext();
explicit NPUPinnedDeviceContext(NPUPinnedPlace place); explicit NPUPinnedDeviceContext(NPUPinnedPlace place);
Place GetPlace() const override; const Place& GetPlace() const override;
Eigen::DefaultDevice* eigen_device() const; Eigen::DefaultDevice* eigen_device() const;
...@@ -326,20 +331,20 @@ class CUDAContext { ...@@ -326,20 +331,20 @@ class CUDAContext {
#endif #endif
/*! \brief Call cublas function safely. */ /*! \brief Call cublas function safely. */
template <typename Callback> inline void CublasCall(
inline void CublasCall(Callback&& callback) const { const std::function<void(blasHandle_t)>& callback) const {
if (cublas_tf32_tensor_core_handle_) { if (cublas_tf32_tensor_core_handle_) {
cublas_tf32_tensor_core_handle_->Call(std::forward<Callback>(callback)); cublas_tf32_tensor_core_handle_->Call(callback);
} else { } else {
cublas_handle_->Call(std::forward<Callback>(callback)); cublas_handle_->Call(callback);
} }
} }
#ifndef PADDLE_WITH_HIP #ifndef PADDLE_WITH_HIP
/*! \brief Call cusparse function safely. */ /*! \brief Call cusparse function safely. */
template <typename Callback> inline void CusparseCall(
inline void CusparseCall(Callback&& callback) const { const std::function<void(pten::sparseHandle_t)>& callback) const {
cusparse_handle_->Call(std::forward<Callback>(callback)); cusparse_handle_->Call(callback);
} }
#endif #endif
...@@ -348,12 +353,12 @@ class CUDAContext { ...@@ -348,12 +353,12 @@ class CUDAContext {
/*! \brief Call cublas function with Tensor Core safely. If /*! \brief Call cublas function with Tensor Core safely. If
Tensor Core is not available, use DEFAULT_MATH instead. */ Tensor Core is not available, use DEFAULT_MATH instead. */
template <typename Callback> inline void TensorCoreCublasCallIfAvailable(
inline void TensorCoreCublasCallIfAvailable(Callback&& callback) const { const std::function<void(blasHandle_t)>& callback) const {
if (cublas_tensor_core_handle_) { if (cublas_tensor_core_handle_) {
cublas_tensor_core_handle_->Call(std::forward<Callback>(callback)); cublas_tensor_core_handle_->Call(callback);
} else { } else {
cublas_handle_->Call(std::forward<Callback>(callback)); cublas_handle_->Call(callback);
} }
} }
...@@ -491,7 +496,7 @@ class CUDAContext { ...@@ -491,7 +496,7 @@ class CUDAContext {
DISABLE_COPY_AND_ASSIGN(CUDAContext); DISABLE_COPY_AND_ASSIGN(CUDAContext);
}; };
class CUDADeviceContext : public DeviceContext { class CUDADeviceContext : public pten::GPUContext {
public: public:
explicit CUDADeviceContext(CUDAPlace place); explicit CUDADeviceContext(CUDAPlace place);
virtual ~CUDADeviceContext(); virtual ~CUDADeviceContext();
...@@ -499,49 +504,40 @@ class CUDADeviceContext : public DeviceContext { ...@@ -499,49 +504,40 @@ class CUDADeviceContext : public DeviceContext {
/*! \brief Wait for all operations completion in the stream. */ /*! \brief Wait for all operations completion in the stream. */
void Wait() const override; void Wait() const override;
/*! \brief Return place in the device context. */
Place GetPlace() const override;
/*! \brief Return compute capability in the device context. */
int GetComputeCapability() const;
/*! \brief Return the max physical thread count in the device context */
int GetMaxPhysicalThreadCount() const;
/*! \brief Return the SM count in the device context */
int GetSMCount() const;
/*! \brief Return the Max thread num of block in the device context */
int GetMaxThreadsPerBlock() const;
/*! \brief Return the max grid dim size in the device context */
dim3 GetCUDAMaxGridDimSize() const;
/*! \brief Return eigen device in the device context. */ /*! \brief Return eigen device in the device context. */
Eigen::GpuDevice* eigen_device() const; Eigen::GpuDevice* eigen_device() const;
/*! \brief Call cublas function safely. */ /*! \brief Call cublas function safely. */
template <typename Callback> inline void CublasCall(
inline void CublasCall(Callback&& callback) const { const std::function<void(blasHandle_t)>& callback) const {
if (!thread_ctx_.count(this)) {
pten::GPUContext::CublasCall(callback);
return;
}
return context()->CublasCall(callback); return context()->CublasCall(callback);
} }
#ifndef PADDLE_WITH_HIP #ifndef PADDLE_WITH_HIP
/*! \brief Call cusparse function safely. */ /*! \brief Call cusparse function safely. */
template <typename Callback> inline void CusparseCall(
inline void CusparseCall(Callback&& callback) const { const std::function<void(pten::sparseHandle_t)>& callback) const {
return context()->CusparseCall(callback); if (!thread_ctx_.count(this)) {
pten::GPUContext::CusparseCall(callback);
return;
}
context()->CusparseCall(callback);
} }
#endif #endif
/*! \brief Check whether tensor core is supported */
bool tensor_core_available() const;
/*! \brief Call cublas function with Tensor Core safely. If /*! \brief Call cublas function with Tensor Core safely. If
Tensor Core is not available, use DEFAULT_MATH instead. */ Tensor Core is not available, use DEFAULT_MATH instead. */
template <typename Callback> inline void TensorCoreCublasCallIfAvailable(
inline void TensorCoreCublasCallIfAvailable(Callback&& callback) const { const std::function<void(blasHandle_t)>& callback) const {
return context()->TensorCoreCublasCallIfAvailable(callback); if (!thread_ctx_.count(this)) {
pten::GPUContext::TensorCoreCublasCallIfAvailable(callback);
return;
}
context()->TensorCoreCublasCallIfAvailable(callback);
} }
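
With the signatures above, callers now pass a std::function taking the raw handle instead of an arbitrary template callback. A hedged sketch of a call site for the CUDA build (assuming blasHandle_t aliases cublasHandle_t there; the SSCAL call is only an illustration of "one cuBLAS call made under the handle lock"):

#include <cublas_v2.h>
#include "paddle/fluid/platform/device_context.h"

// Hypothetical call site, not taken from the patch: scale a device buffer
// through the std::function-based CublasCall wrapper shown above.
void ScaleOnDevice(const paddle::platform::CUDADeviceContext& dev_ctx,
                   float* x, int n, float alpha) {
  dev_ctx.CublasCall([&](blasHandle_t handle) {
    cublasSscal(handle, n, &alpha, x, 1);  // standard cuBLAS entry point
  });
}

Because the parameter is a const std::function reference, capturing lambdas still convert implicitly, and the non-template signature is what lets the wrapper forward to pten::GPUContext::CublasCall in the fallback branch above.
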
/*! \brief Return cudnn handle in the device context. */ /*! \brief Return cudnn handle in the device context. */
...@@ -559,6 +555,10 @@ class CUDADeviceContext : public DeviceContext { ...@@ -559,6 +555,10 @@ class CUDADeviceContext : public DeviceContext {
cusparseHandle_t cusparse_handle() const; cusparseHandle_t cusparse_handle() const;
#endif #endif
#ifndef PADDLE_WITH_HIP
cusolverDnHandle_t cusolver_dn_handle() const;
#endif
/*! \brief Return a cudnn workspace handle to call multiple cudnn /*! \brief Return a cudnn workspace handle to call multiple cudnn
* functions without interrupting by other threads. * functions without interrupting by other threads.
* Once the first cudnn function is called by the handle, a lock * Once the first cudnn function is called by the handle, a lock
...@@ -568,60 +568,33 @@ class CUDADeviceContext : public DeviceContext { ...@@ -568,60 +568,33 @@ class CUDADeviceContext : public DeviceContext {
* sequential cudnn function calls. */ * sequential cudnn function calls. */
CudnnWorkspaceHandle cudnn_workspace_handle() const; CudnnWorkspaceHandle cudnn_workspace_handle() const;
#ifndef PADDLE_WITH_HIP
cusolverDnHandle_t cusolver_dn_handle() const;
#endif
/*! \brief Return cuda stream in the device context. */ /*! \brief Return cuda stream in the device context. */
gpuStream_t stream() const; gpuStream_t stream() const;
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) void RecordEvent(gpuEvent_t ev, const std::function<void()>& callback) const;
/*! \brief Return nccl communicators. */
ncclComm_t nccl_comm() const { return nccl_comm_; }
/*! \brief Set nccl communicators. */
void set_nccl_comm(ncclComm_t comm) { nccl_comm_ = comm; }
#endif
template <typename Callback>
void RecordEvent(gpuEvent_t ev, Callback callback) const {
return context()->Stream()->RecordEvent(ev, callback);
}
template <typename Callback>
void AddStreamCallback(Callback&& callback) const {
return context()->Stream()->AddCallback(callback);
}
void WaitStreamCallback() const { void AddStreamCallback(const std::function<void()>& callback) const;
return context()->Stream()->WaitCallback();
}
void ResetDefaultContext(const stream::Priority& priority) { void WaitStreamCallback() const;
default_ctx_.reset(new CUDAContext(place_, priority));
}
void ResetThreadContext(const stream::Priority& priority) { void ResetThreadContext(const stream::Priority& priority) {
std::lock_guard<std::mutex> guard(ctx_mtx_); std::lock_guard<std::mutex> guard(ctx_mtx_);
thread_ctx_[this].reset(new CUDAContext(place_, priority)); thread_ctx_[this].reset(new CUDAContext(this->GetPlace(), priority));
} }
std::shared_ptr<CUDAContext> context() const { std::shared_ptr<CUDAContext> context() const;
if (!thread_ctx_.count(this)) {
return default_ctx_;
}
return thread_ctx_.at(this);
}
// Note: Can only be used under thread_local semantics. // Note: Can only be used under thread_local semantics.
void SetThreadLocalStream(const gpuStream_t stream) { void SetThreadLocalStream(const gpuStream_t stream) {
thread_ctx_.at(this)->SetStream(stream); thread_ctx_.at(this)->SetStream(stream);
} }
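
The note above implies an ordering: a thread-local CUDAContext must be registered for the current thread before SetThreadLocalStream (or context()) may be used. A hedged sketch of that ordering, using only interfaces declared in this header (not taken from Paddle's tests):

#include "paddle/fluid/platform/device_context.h"

// Hypothetical usage order for the thread-local APIs above.
void UseThreadLocalStream(paddle::platform::CUDADeviceContext* dev_ctx,
                          gpuStream_t my_stream) {
  // 1. Register a CUDAContext for (this context, this thread); after this,
  //    context() no longer throws on this thread.
  dev_ctx->ResetThreadContext(paddle::platform::stream::Priority::kNormal);
  // 2. Point the thread-local CUDAContext at an externally created stream.
  dev_ctx->SetThreadLocalStream(my_stream);
  // 3. Subsequent stream()/cublas/cudnn accesses on this thread are served by
  //    the thread-local context instead of the pten::GPUContext members.
  gpuStream_t current = dev_ctx->stream();
  (void)current;
}
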
private: // NOTE: Just for compatibility with the past, please delete if there is an
CUDAPlace place_; // elegant way.
std::shared_ptr<CUDAContext> default_ctx_; stream::CUDAStream* GetCudaStream() const;
stream::CUDAStream* SetCudaStream(stream::CUDAStream*);
private:
// The thread_local static variable will be released before the // The thread_local static variable will be released before the
// global static variable, so avoid using it in dtor. // global static variable, so avoid using it in dtor.
static thread_local std::unordered_map<const CUDADeviceContext*, static thread_local std::unordered_map<const CUDADeviceContext*,
...@@ -631,22 +604,9 @@ class CUDADeviceContext : public DeviceContext { ...@@ -631,22 +604,9 @@ class CUDADeviceContext : public DeviceContext {
mutable std::mutex cudnn_handle_mtx_; mutable std::mutex cudnn_handle_mtx_;
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) // NOTE: Just for compatibility with the past, please delete if there is an
// NCCL communicator (single process version) for NCCL collective operations. // elegant way.
// NCCL collective operations provides fast collectives over multiple GPUs std::unique_ptr<stream::CUDAStream> cuda_stream_;
// both within and across nodes.
// But, this collectives is used for collectives over multiple GPUs within
// nodes.
ncclComm_t nccl_comm_{nullptr};
#endif
int compute_capability_;
int runtime_version_;
int driver_version_;
int multi_process_;
int max_threads_per_mp_;
int max_threads_per_block_;
dim3 max_grid_dim_size_;
DISABLE_COPY_AND_ASSIGN(CUDADeviceContext); DISABLE_COPY_AND_ASSIGN(CUDADeviceContext);
}; };
...@@ -711,7 +671,7 @@ class CUDAPinnedDeviceContext : public DeviceContext { ...@@ -711,7 +671,7 @@ class CUDAPinnedDeviceContext : public DeviceContext {
CUDAPinnedDeviceContext(); CUDAPinnedDeviceContext();
explicit CUDAPinnedDeviceContext(CUDAPinnedPlace place); explicit CUDAPinnedDeviceContext(CUDAPinnedPlace place);
Place GetPlace() const override; const Place& GetPlace() const override;
Eigen::DefaultDevice* eigen_device() const; Eigen::DefaultDevice* eigen_device() const;
......
...@@ -17,6 +17,7 @@ limitations under the License. */ ...@@ -17,6 +17,7 @@ limitations under the License. */
#include "glog/logging.h" #include "glog/logging.h"
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
TEST(Device, Init) { TEST(Device, Init) {
using paddle::platform::DeviceContext; using paddle::platform::DeviceContext;
...@@ -26,6 +27,20 @@ TEST(Device, Init) { ...@@ -26,6 +27,20 @@ TEST(Device, Init) {
int count = paddle::platform::GetGPUDeviceCount(); int count = paddle::platform::GetGPUDeviceCount();
for (int i = 0; i < count; i++) { for (int i = 0; i < count; i++) {
CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i)); CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i));
device_context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(CUDAPlace(i), device_context->stream())
.get());
device_context->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
device_context->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(CUDAPlace(i))
.get());
device_context->PartialInitWithAllocator();
Eigen::GpuDevice* gpu_device = device_context->eigen_device(); Eigen::GpuDevice* gpu_device = device_context->eigen_device();
ASSERT_NE(nullptr, gpu_device); ASSERT_NE(nullptr, gpu_device);
delete device_context; delete device_context;
...@@ -39,6 +54,19 @@ TEST(Device, CUDADeviceContext) { ...@@ -39,6 +54,19 @@ TEST(Device, CUDADeviceContext) {
int count = paddle::platform::GetGPUDeviceCount(); int count = paddle::platform::GetGPUDeviceCount();
for (int i = 0; i < count; i++) { for (int i = 0; i < count; i++) {
CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i)); CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i));
device_context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(CUDAPlace(i), device_context->stream())
.get());
device_context->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
device_context->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(CUDAPlace(i))
.get());
device_context->PartialInitWithAllocator();
Eigen::GpuDevice* gpu_device = device_context->eigen_device(); Eigen::GpuDevice* gpu_device = device_context->eigen_device();
ASSERT_NE(nullptr, gpu_device); ASSERT_NE(nullptr, gpu_device);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
......
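
Both tests above now repeat the same four-step wiring before a hand-constructed CUDADeviceContext is usable: a device allocator bound to the context's stream, a host allocator, a zero-size allocator, and PartialInitWithAllocator(). A small helper could bundle these calls; the function name below is hypothetical, but everything it calls appears verbatim in the tests:

#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/device_context.h"

// Hypothetical helper, not part of this patch.
void InitCudaCtxForTest(paddle::platform::CUDADeviceContext* ctx,
                        const paddle::platform::CUDAPlace& place) {
  auto& facade = paddle::memory::allocation::AllocatorFacade::Instance();
  ctx->SetAllocator(facade.GetAllocator(place, ctx->stream()).get());
  ctx->SetHostAllocator(
      facade.GetAllocator(paddle::platform::CPUPlace()).get());
  ctx->SetZeroAllocator(facade.GetZeroAllocator(place).get());
  // Finish the part of initialization that depends on the allocators.
  ctx->PartialInitWithAllocator();
}
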
...@@ -53,7 +53,7 @@ void DeviceEventRecordCUDA(DeviceEvent* event, const DeviceContext* context) { ...@@ -53,7 +53,7 @@ void DeviceEventRecordCUDA(DeviceEvent* event, const DeviceContext* context) {
platform::errors::PreconditionNotMet( platform::errors::PreconditionNotMet(
"Failed to dynamic_cast context into CUDADeviceContext.")); "Failed to dynamic_cast context into CUDADeviceContext."));
wrapper->inner_event_.Record(*cuda_dev_ctx->context()->Stream()); wrapper->inner_event_.Record(cuda_dev_ctx->stream());
} }
bool DeviceEventQueryCUDA(const DeviceEvent* event) { bool DeviceEventQueryCUDA(const DeviceEvent* event) {
...@@ -82,8 +82,7 @@ void DeviceEventCUDAWaitCUDA(const DeviceEvent* event, ...@@ -82,8 +82,7 @@ void DeviceEventCUDAWaitCUDA(const DeviceEvent* event,
platform::errors::PreconditionNotMet( platform::errors::PreconditionNotMet(
"Failed to dynamic_cast context into CUDADeviceContext.")); "Failed to dynamic_cast context into CUDADeviceContext."));
// calling cudaStreamWaitEvent(stream, event, 0) // calling cudaStreamWaitEvent(stream, event, 0)
cuda_dev_ctx->context()->Stream()->WaitEvent( cuda_dev_ctx->WaitEvent(wrapper->inner_event_.GetRawCudaEvent());
wrapper->inner_event_.GetRawCudaEvent());
} }
void DeviceEventCPUWaitCUDA(const DeviceEvent* event, void DeviceEventCPUWaitCUDA(const DeviceEvent* event,
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include "paddle/fluid/platform/device_event.h" #include "paddle/fluid/platform/device_event.h"
#include "glog/logging.h" #include "glog/logging.h"
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/platform/place.h"
using ::paddle::platform::kCUDA; using ::paddle::platform::kCUDA;
using ::paddle::platform::kCPU; using ::paddle::platform::kCPU;
...@@ -38,9 +39,11 @@ TEST(DeviceEvent, CUDA) { ...@@ -38,9 +39,11 @@ TEST(DeviceEvent, CUDA) {
// case 1. test for event_creator // case 1. test for event_creator
DeviceEvent event(place); DeviceEvent event(place);
ASSERT_NE(event.GetEvent().get(), nullptr); ASSERT_NE(event.GetEvent().get(), nullptr);
bool status = event.Query();
ASSERT_EQ(status, true);
// case 2. test for event_recorder // case 2. test for event_recorder
event.Record(context); event.Record(context);
bool status = event.Query(); status = event.Query();
ASSERT_EQ(status, false); ASSERT_EQ(status, false);
// case 3. test for event_finisher // case 3. test for event_finisher
event.Finish(); event.Finish();
......
...@@ -539,7 +539,7 @@ inline void retry_sleep(unsigned milliseconds) { ...@@ -539,7 +539,7 @@ inline void retry_sleep(unsigned milliseconds) {
::paddle::platform::details::ExternalApiType< \ ::paddle::platform::details::ExternalApiType< \
__CUDA_STATUS_TYPE__>::kSuccess; \ __CUDA_STATUS_TYPE__>::kSuccess; \
while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \ while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \
retry_sleep(FLAGS_gpu_allocator_retry_time); \ paddle::platform::retry_sleep(FLAGS_gpu_allocator_retry_time); \
__cond__ = (COND); \ __cond__ = (COND); \
++retry_count; \ ++retry_count; \
} \ } \
...@@ -727,7 +727,7 @@ inline void retry_sleep(unsigned millisecond) { ...@@ -727,7 +727,7 @@ inline void retry_sleep(unsigned millisecond) {
::paddle::platform::details::ExternalApiType< \ ::paddle::platform::details::ExternalApiType< \
__CUDA_STATUS_TYPE__>::kSuccess; \ __CUDA_STATUS_TYPE__>::kSuccess; \
while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \ while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \
retry_sleep(FLAGS_gpu_allocator_retry_time); \ ::paddle::platform::retry_sleep(FLAGS_gpu_allocator_retry_time); \
__cond__ = (COND); \ __cond__ = (COND); \
++retry_count; \ ++retry_count; \
} \ } \
......
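
Both hunks above qualify retry_sleep explicitly because the macro body is expanded at the call site: unqualified name lookup then happens in the caller's namespace, which may not be paddle::platform. A minimal standalone illustration of the failure mode, with hypothetical names:

// Macros expand where they are used, so an unqualified call inside the macro
// is looked up in the caller's scope, not where the macro was defined.
namespace util { inline void retry_sleep(unsigned /*ms*/) {} }

#define RETRY_BADLY() retry_sleep(10)           // breaks outside namespace util
#define RETRY_SAFELY() ::util::retry_sleep(10)  // works from any namespace

namespace other {
inline void f() {
  // RETRY_BADLY();  // error: 'retry_sleep' was not declared in this scope
  RETRY_SAFELY();    // OK
}
}  // namespace other
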
...@@ -152,11 +152,11 @@ class CudaEvent { ...@@ -152,11 +152,11 @@ class CudaEvent {
#endif #endif
} }
void Record(const paddle::platform::stream::CUDAStream &stream) { void Record(gpuStream_t stream) {
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, stream.raw_stream())); PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, stream));
#else #else
PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, stream.raw_stream())); PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, stream));
#endif #endif
} }
......
...@@ -328,6 +328,10 @@ TEST(float16, lod_tensor_on_gpu) { ...@@ -328,6 +328,10 @@ TEST(float16, lod_tensor_on_gpu) {
// CPU LoDTensor to GPU LoDTensor // CPU LoDTensor to GPU LoDTensor
CUDAPlace gpu_place(0); CUDAPlace gpu_place(0);
CUDADeviceContext gpu_ctx(gpu_place); CUDADeviceContext gpu_ctx(gpu_place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, gpu_ctx.stream())
.get());
gpu_ctx.PartialInitWithAllocator();
framework::TensorCopy(src_tensor, gpu_place, gpu_ctx, &gpu_tensor); framework::TensorCopy(src_tensor, gpu_place, gpu_ctx, &gpu_tensor);
// GPU LoDTensor to CPU LoDTensor // GPU LoDTensor to CPU LoDTensor
......
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/pten/backends/cpu/cpu_context.h" #include "paddle/pten/backends/cpu/cpu_context.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -72,6 +73,7 @@ __global__ static void ForRangeElemwiseOp(Function func, size_t limit) { ...@@ -72,6 +73,7 @@ __global__ static void ForRangeElemwiseOp(Function func, size_t limit) {
} }
} }
// NOTE: After the pten kernel is migrated, it needs to be deleted.
template <> template <>
struct ForRange<CUDADeviceContext> { struct ForRange<CUDADeviceContext> {
ForRange(const CUDADeviceContext& dev_ctx, size_t limit) ForRange(const CUDADeviceContext& dev_ctx, size_t limit)
...@@ -106,6 +108,40 @@ struct ForRange<CUDADeviceContext> { ...@@ -106,6 +108,40 @@ struct ForRange<CUDADeviceContext> {
size_t limit_; size_t limit_;
}; };
template <>
struct ForRange<pten::GPUContext> {
ForRange(const pten::GPUContext& dev_ctx, size_t limit)
: dev_ctx_(dev_ctx), limit_(static_cast<size_t>(limit)) {}
template <typename Function>
inline void operator()(Function func) const {
#ifdef __HIPCC__
// HIP will throw core dump when threads > 256
constexpr int num_threads = 256;
#elif WITH_NV_JETSON
// JETSON_NANO will throw core dump when threads > 128
int num_thread = 256;
platform::ChangeThreadNum(dev_ctx_, &num_thread, 128);
const int num_threads = num_thread;
#else
constexpr int num_threads = 1024;
#endif
size_t block_size = limit_ <= num_threads ? limit_ : num_threads;
size_t grid_size = (limit_ + num_threads - 1) / num_threads;
if (grid_size == 1) {
ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>(
func);
} else {
ForRangeElemwiseOp<<<grid_size, block_size, 0, dev_ctx_.stream()>>>(
func, limit_);
}
}
const pten::GPUContext& dev_ctx_;
size_t limit_;
};
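
A hedged usage sketch for the new ForRange<pten::GPUContext> specialization; the functor and raw pointer are illustrative, the include path for ForRange is assumed, and the file must be compiled as CUDA/HIP code so the functor's operator() can run on the device:

#include "paddle/fluid/platform/for_range.h"  // assumed location of ForRange

// Trivially copyable functor; it is passed by value into the kernel launch.
struct AddOne {
  float* data;
  __device__ void operator()(size_t i) const { data[i] += 1.0f; }
};

void AddOneToAll(const pten::GPUContext& dev_ctx, float* gpu_data, size_t n) {
  paddle::platform::ForRange<pten::GPUContext> for_range(dev_ctx, n);
  for_range(AddOne{gpu_data});  // picks grid/block sizes and launches on dev_ctx.stream()
}
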
#endif #endif
} // namespace platform } // namespace platform
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/fluid/platform/stream/cuda_stream.h" #include "paddle/fluid/platform/stream/cuda_stream.h"
#include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/device/gpu/gpu_types.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
...@@ -116,11 +117,8 @@ CUDAStream* get_current_stream(int deviceId) { ...@@ -116,11 +117,8 @@ CUDAStream* get_current_stream(int deviceId) {
platform::Place device = CUDAPlace(deviceId); platform::Place device = CUDAPlace(deviceId);
auto stream = static_cast<platform::CUDADeviceContext*>(pool.Get(device)) return static_cast<platform::CUDADeviceContext*>(pool.Get(device))
->context() ->GetCudaStream();
->Stream()
.get();
return stream;
#else #else
PADDLE_THROW(platform::errors::Unavailable( PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with CUDA. Cannot visit cuda current stream.")); "Paddle is not compiled with CUDA. Cannot visit cuda current stream."));
...@@ -133,12 +131,12 @@ CUDAStream* set_current_stream(CUDAStream* stream) { ...@@ -133,12 +131,12 @@ CUDAStream* set_current_stream(CUDAStream* stream) {
auto& device = stream->GetPlace(); auto& device = stream->GetPlace();
auto& pool = platform::DeviceContextPool::Instance(); auto& pool = platform::DeviceContextPool::Instance();
return static_cast<platform::CUDADeviceContext*>(pool.Get(device)) return static_cast<platform::CUDADeviceContext*>(pool.Get(device))
->context() ->SetCudaStream(stream);
->SetStream(stream);
#else #else
PADDLE_THROW(platform::errors::Unavailable( PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with CUDA. Cannot visit cuda current stream.")); "Paddle is not compiled with CUDA. Cannot visit cuda current"
return nullptr; "stream."));
return CUDAStream(nullptr);
#endif #endif
} }
} // namespace stream } // namespace stream
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once #pragma once
#include <cstdint> #include <cstdint>
#include <functional>
#include <memory> #include <memory>
#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h"
...@@ -51,24 +52,28 @@ class CUDAStream final { ...@@ -51,24 +52,28 @@ class CUDAStream final {
const StreamFlag& flag = StreamFlag::kDefaultFlag) { const StreamFlag& flag = StreamFlag::kDefaultFlag) {
Init(place, priority, flag); Init(place, priority, flag);
} }
explicit CUDAStream(gpuStream_t stream, const Place& place)
: place_(place), stream_(stream) {
owned_stream_ = false;
callback_manager_.reset(new StreamCallbackManager<gpuStream_t>(stream_));
}
virtual ~CUDAStream() { Destroy(); } virtual ~CUDAStream() { Destroy(); }
bool Init(const Place& place, const Priority& priority = Priority::kNormal, bool Init(const Place& place, const Priority& priority = Priority::kNormal,
const StreamFlag& flag = StreamFlag::kDefaultFlag); const StreamFlag& flag = StreamFlag::kDefaultFlag);
template <typename Callback> void AddCallback(std::function<void()> callback) const {
void AddCallback(Callback&& callback) const {
callback_manager_->AddCallback(callback); callback_manager_->AddCallback(callback);
} }
template <typename Callback>
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
void RecordEvent(hipEvent_t ev, Callback callback) const { void RecordEvent(hipEvent_t ev, const std::function<void()>& callback) const {
callback(); callback();
PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(ev, stream_)); PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(ev, stream_));
} }
#else #else
void RecordEvent(cudaEvent_t ev, Callback callback) const { void RecordEvent(cudaEvent_t ev,
const std::function<void()>& callback) const {
callback(); callback();
PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(ev, stream_)); PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(ev, stream_));
} }
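
The constructor added above wraps an already-created raw stream without taking ownership (owned_stream_ is set to false), and the callback interfaces now accept plain std::function<void()>. A hedged sketch that exercises both; the raw stream creation and the surrounding function are illustrative only:

#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/stream/cuda_stream.h"

// Hypothetical example, not part of the patch: wrap an external stream and
// attach a host callback, then block until the callback has run.
void WrapExternalStream(const paddle::platform::Place& place) {
  gpuStream_t raw_stream;
#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&raw_stream));
#else
  PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&raw_stream));
#endif
  paddle::platform::stream::CUDAStream stream(raw_stream, place);
  bool callback_ran = false;
  stream.AddCallback([&callback_ran] { callback_ran = true; });
  stream.WaitCallback();  // callback_ran is expected to be true here
  // Since owned_stream_ is false, ~CUDAStream() is expected to leave
  // raw_stream alive; the caller remains responsible for releasing it.
}
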
...@@ -149,6 +154,7 @@ class CUDAStream final { ...@@ -149,6 +154,7 @@ class CUDAStream final {
}; };
CUDAStream* get_current_stream(int deviceId); CUDAStream* get_current_stream(int deviceId);
// NOTE: There is a problem with the interface and needs to be fixed
CUDAStream* set_current_stream(CUDAStream* stream); CUDAStream* set_current_stream(CUDAStream* stream);
} // namespace stream } // namespace stream
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/platform/transform.h" #include "paddle/fluid/platform/transform.h"
...@@ -57,6 +58,10 @@ TEST(Transform, CPUUnary) { ...@@ -57,6 +58,10 @@ TEST(Transform, CPUUnary) {
TEST(Transform, GPUUnary) { TEST(Transform, GPUUnary) {
CUDAPlace gpu0(0); CUDAPlace gpu0(0);
CUDADeviceContext ctx(gpu0); CUDADeviceContext ctx(gpu0);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu0, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4}; float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4};
auto gpu_allocation = Alloc(gpu0, sizeof(float) * 4); auto gpu_allocation = Alloc(gpu0, sizeof(float) * 4);
float* gpu_buf = static_cast<float*>(gpu_allocation->ptr()); float* gpu_buf = static_cast<float*>(gpu_allocation->ptr());
...@@ -84,6 +89,10 @@ TEST(Transform, GPUBinary) { ...@@ -84,6 +89,10 @@ TEST(Transform, GPUBinary) {
int buf[4] = {1, 2, 3, 4}; int buf[4] = {1, 2, 3, 4};
CUDAPlace gpu0(0); CUDAPlace gpu0(0);
CUDADeviceContext ctx(gpu0); CUDADeviceContext ctx(gpu0);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu0, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
auto gpu_allocation = Alloc(gpu0, sizeof(buf)); auto gpu_allocation = Alloc(gpu0, sizeof(buf));
int* gpu_buf = static_cast<int*>(gpu_allocation->ptr()); int* gpu_buf = static_cast<int*>(gpu_allocation->ptr());
Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream()); Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream());
......
...@@ -34,7 +34,7 @@ void BindCudaStream(py::module *m_ptr) { ...@@ -34,7 +34,7 @@ void BindCudaStream(py::module *m_ptr) {
return paddle::platform::stream::get_current_stream(deviceId); return paddle::platform::stream::get_current_stream(deviceId);
#else #else
PADDLE_THROW(platform::errors::Unavailable( PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with CUDA. Cannot visit cuda current " "Paddle is not compiled with CUDA. Cannot visit cuda current"
"stream.")); "stream."));
#endif #endif
}, },
...@@ -119,7 +119,7 @@ void BindCudaStream(py::module *m_ptr) { ...@@ -119,7 +119,7 @@ void BindCudaStream(py::module *m_ptr) {
[](paddle::platform::stream::CUDAStream &self, [](paddle::platform::stream::CUDAStream &self,
paddle::platform::stream::CUDAStream &stream) { paddle::platform::stream::CUDAStream &stream) {
paddle::platform::CudaEvent event; paddle::platform::CudaEvent event;
event.Record(stream); event.Record(stream.raw_stream());
self.WaitEvent(event.GetRawCudaEvent()); self.WaitEvent(event.GetRawCudaEvent());
}, },
...@@ -179,7 +179,7 @@ void BindCudaStream(py::module *m_ptr) { ...@@ -179,7 +179,7 @@ void BindCudaStream(py::module *m_ptr) {
if (event == nullptr) { if (event == nullptr) {
event = new paddle::platform::CudaEvent(); event = new paddle::platform::CudaEvent();
} }
event->Record(self); event->Record(self.raw_stream());
return event; return event;
}, },
R"DOC( R"DOC(
...@@ -321,7 +321,7 @@ void BindCudaStream(py::module *m_ptr) { ...@@ -321,7 +321,7 @@ void BindCudaStream(py::module *m_ptr) {
if (stream == nullptr) { if (stream == nullptr) {
stream = paddle::platform::stream::get_current_stream(-1); stream = paddle::platform::stream::get_current_stream(-1);
} }
self.Record(*stream); self.Record(stream->raw_stream());
}, },
R"DOC( R"DOC(
Records the event in the given stream. Records the event in the given stream.
......
...@@ -1596,7 +1596,20 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1596,7 +1596,20 @@ All parameter, weight, gradient are variables in Paddle.
.def_static("create", .def_static("create",
[](paddle::platform::CPUPlace& place) [](paddle::platform::CPUPlace& place)
-> paddle::platform::DeviceContext* { -> paddle::platform::DeviceContext* {
return new paddle::platform::CPUDeviceContext(); auto* context = new paddle::platform::CPUDeviceContext();
context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place)
.get());
context->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
context->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place)
.get());
return context;
}) })
.def_static("create", .def_static("create",
[](paddle::platform::XPUPlace& place) [](paddle::platform::XPUPlace& place)
...@@ -1607,7 +1620,20 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1607,7 +1620,20 @@ All parameter, weight, gradient are variables in Paddle.
"Cannot use XPUPlace in CPU/GPU version, " "Cannot use XPUPlace in CPU/GPU version, "
"Please recompile or reinstall Paddle with XPU support.")); "Please recompile or reinstall Paddle with XPU support."));
#else #else
return new paddle::platform::XPUDeviceContext(place); auto* context = new paddle::platform::XPUDeviceContext(place);
context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place)
.get());
context->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
context->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place)
.get());
return context;
#endif #endif
}) })
.def_static("create", .def_static("create",
...@@ -1643,7 +1669,21 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1643,7 +1669,21 @@ All parameter, weight, gradient are variables in Paddle.
"Cannot use CUDAPlace in CPU only version, " "Cannot use CUDAPlace in CPU only version, "
"Please recompile or reinstall Paddle with CUDA support.")); "Please recompile or reinstall Paddle with CUDA support."));
#else #else
return new paddle::platform::CUDADeviceContext(place); auto* context = new paddle::platform::CUDADeviceContext(place);
context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, context->stream())
.get());
context->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
context->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place)
.get());
context->PartialInitWithAllocator();
return context;
#endif #endif
}) })
.def_static("create", .def_static("create",
......
...@@ -2,6 +2,10 @@ add_subdirectory(dynload) ...@@ -2,6 +2,10 @@ add_subdirectory(dynload)
add_subdirectory(cpu) add_subdirectory(cpu)
if(WITH_GPU OR WITH_ROCM)
add_subdirectory(gpu)
endif()
if(WITH_XPU) if(WITH_XPU)
add_subdirectory(xpu) add_subdirectory(xpu)
endif() endif()
...@@ -11,3 +15,7 @@ cc_library(pten_context SRCS all_context.cc DEPS device_context cpu_context) ...@@ -11,3 +15,7 @@ cc_library(pten_context SRCS all_context.cc DEPS device_context cpu_context)
if(WITH_XPU) if(WITH_XPU)
add_dependencies(pten_context xpu_context) add_dependencies(pten_context xpu_context)
endif() endif()
if(WITH_GPU)
add_dependencies(pten_context gpu_context)
endif()
...@@ -15,75 +15,59 @@ ...@@ -15,75 +15,59 @@
#include "paddle/pten/backends/cpu/cpu_context.h" #include "paddle/pten/backends/cpu/cpu_context.h"
#include "paddle/pten/api/ext/exception.h" #include "paddle/pten/api/ext/exception.h"
#include "paddle/pten/common/place.h"
// NOTE: The paddle framework should add WITH_EIGEN option to support compile // NOTE: The paddle framework should add WITH_EIGEN option to support compile
// without eigen. // without eigen.
#include "paddle/pten/core/device_context.h"
#include "unsupported/Eigen/CXX11/Tensor" #include "unsupported/Eigen/CXX11/Tensor"
namespace pten { namespace pten {
struct CPUContext::CPUImpl { struct CPUContext::Impl {
CPUImpl() { device_ = new Eigen::DefaultDevice(); } Impl() : place_(CPUPlace()) {}
// Users need to manage external resources. explicit Impl(const Place& place) : place_(place) {}
explicit CPUImpl(const CPUContextResource& ctx_res) : res_(ctx_res) {
device_ = res_.device;
}
~CPUImpl() { ~Impl() {
if (res_.device == nullptr && device_ != nullptr) { if (owned_) {
delete device_; delete eigen_device_;
device_ = nullptr;
} }
} }
Eigen::DefaultDevice* GetEigenDevice() const { void Init() {
PD_CHECK(device_ != nullptr, "the eigen_device is nullptr."); owned_ = true;
return device_; eigen_device_ = new Eigen::DefaultDevice();
} }
void SetEigenDevice(Eigen::DefaultDevice* device) { Eigen::DefaultDevice* GetEigenDevice() const {
if (device == nullptr) { PD_CHECK(eigen_device_ != nullptr, "the cpu eigen_device is nullptr.");
return; return eigen_device_;
}
res_.device = device;
device_ = device;
} }
Place GetPlace() const { return place_; } bool owned_{false};
Eigen::DefaultDevice* eigen_device_{nullptr};
Eigen::DefaultDevice* device_{nullptr}; Place place_;
CPUContextResource res_;
CPUPlace place_;
}; };
CPUContext::CPUContext() : DeviceContext() { CPUContext::CPUContext()
cpu_impl_ = std::make_unique<CPUImpl>(); : DeviceContext(), impl_(std::make_unique<CPUContext::Impl>()) {}
}
CPUContext::CPUContext(const CPUContext& other) : DeviceContext() {
cpu_impl_ = std::make_unique<CPUImpl>();
cpu_impl_->SetEigenDevice(other.eigen_device());
}
CPUContext::CPUContext(CPUContext&& other) : DeviceContext() { CPUContext::CPUContext(const Place& place)
cpu_impl_ = std::move(other.cpu_impl_); : DeviceContext(), impl_(std::make_unique<CPUContext::Impl>(place)) {}
}
CPUContext::~CPUContext() = default; CPUContext::~CPUContext() = default;
CPUContext::CPUContext(const CPUContextResource& ctx_res) : DeviceContext() { void CPUContext::Init() { impl_->Init(); }
cpu_impl_ = std::make_unique<CPUImpl>(ctx_res);
}
Eigen::DefaultDevice* CPUContext::eigen_device() const { Eigen::DefaultDevice* CPUContext::eigen_device() const {
return cpu_impl_->GetEigenDevice(); return impl_->GetEigenDevice();
} }
const Place& CPUContext::GetPlace() const { return impl_->place_; }
void CPUContext::SetEigenDevice(Eigen::DefaultDevice* device) { void CPUContext::SetEigenDevice(Eigen::DefaultDevice* device) {
cpu_impl_->SetEigenDevice(device); impl_->eigen_device_ = device;
} }
Place CPUContext::GetPlace() const { return cpu_impl_->GetPlace(); }
} // namespace pten } // namespace pten
...@@ -24,37 +24,29 @@ limitations under the License. */ ...@@ -24,37 +24,29 @@ limitations under the License. */
namespace pten { namespace pten {
struct CPUContextResource {
Eigen::DefaultDevice* device{nullptr};
};
class CPUContext : public DeviceContext { class CPUContext : public DeviceContext {
public: public:
// NOTE: DeviceContext hold resources. Used in training scenarios.
CPUContext(); CPUContext();
explicit CPUContext(const Place&);
// NOTE: Share the same underlying resources, please ensure that resources are virtual ~CPUContext();
// not released.
CPUContext(const CPUContext&);
CPUContext(CPUContext&&);
~CPUContext();
Eigen::DefaultDevice* eigen_device() const; Eigen::DefaultDevice* eigen_device() const;
const Place& GetPlace() const override;
// TODO(wilber): Whether the interface should be preserved.
Place GetPlace() const override;
public: public:
// NOTE: External users manage resources. Used in inference scenarios. // NOTE: DeviceContext hold resources. Used in training scenarios.
explicit CPUContext(const CPUContextResource& ctx_res); // The interface used by the training scene, DeviceContext will initialize
// all resources and delete them when destructing.
void Init();
protected:
// NOTE: External users manage resources. Used in inference scenarios.
// The Set interface is for inference only, DeviceContext will mark the
// resource as external, and will not delete any resource when destructing.
void SetEigenDevice(Eigen::DefaultDevice* device); void SetEigenDevice(Eigen::DefaultDevice* device);
private: private:
struct CPUImpl; struct Impl;
std::unique_ptr<CPUImpl> cpu_impl_; std::unique_ptr<Impl> impl_;
}; };
} // namespace pten } // namespace pten
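
The notes above describe two ownership modes for CPUContext: Init() for the training path, where the context allocates and later deletes its own Eigen device, and the protected SetEigenDevice() for the inference path, where resources are injected and never freed by the context. A hedged sketch of both modes, using only the interfaces shown in this diff (the derived class is hypothetical, introduced only because SetEigenDevice is protected):

#include "paddle/pten/backends/cpu/cpu_context.h"
#include "unsupported/Eigen/CXX11/Tensor"

// Training-style: the context owns its resources.
inline void OwnedCpuContextExample() {
  pten::CPUContext ctx;
  ctx.Init();                      // allocates the Eigen::DefaultDevice, owned_ = true
  auto* dev = ctx.eigen_device();  // valid until ctx is destroyed
  (void)dev;
}                                  // ~CPUContext() deletes the owned device

// Inference-style: resources are managed externally and only injected.
class ExternallyManagedCpuContext : public pten::CPUContext {
 public:
  explicit ExternallyManagedCpuContext(Eigen::DefaultDevice* dev) {
    SetEigenDevice(dev);  // marked external; the context will not delete it
  }
};
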
if(WITH_GPU)
add_subdirectory(cuda)
nv_library(pten_gpu_info SRCS gpu_info.cc DEPS pten_cuda_info gflags glog enforce pten_dynload_cuda)
elseif(WITH_ROCM)
add_subdirectory(rocm)
hip_library(pten_gpu_info SRCS gpu_info.cc DEPS pten_rocm_info gflags glog enforce pten_dynload_cuda)
endif()
cc_library(gpu_context SRCS gpu_context.cc DEPS pten_device_context pten_gpu_info eigen3)
nv_library(pten_cuda_info SRCS cuda_info.cc DEPS gflags glog enforce pten_dynload_cuda)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
namespace pten {
namespace backends {
namespace gpu {
/*
* Summary: Grid stride looping macro in CUDA kernel
*
* [ Why need this macro? ]
*
* The original looping in CUDA kernel is:
*
* `for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
* i += blockDim.x * gridDim.x)`
*
* This for condition is risky. The value of `blockIdx.x * blockDim.x`
 * may be large, such as over 1GB; the first iteration is fine, but once
 * `i += blockDim.x * gridDim.x` is executed, the value of i can exceed
 * INT_MAX and overflow to a negative value. At that point the loop
 * condition `i < (n)` is still satisfied, so the kernel ends up making
 * an illegal access to CUDA memory.
*
 * Here is a real example in ERNIE that triggers the above error.
* The related data are:
* - blockIdx.x = 2172938
* - blockDim.x = 512
* - blockIdx.x * blockDim.x = 1112543864
* - INT_MAX = 2147483647
*
 * So we polish the for condition as follows: the int64_t __index__
 * prevents overflow in the loop increment.
*
* Parameters:
* - i: loop index
* - num: total element numbers
*
* Examples:
* template <typename T>
* __global__ void Scale(T* logit_grad, const T* loss_grad, const int num,
* const int d, const int remain) {
* CUDA_KERNEL_LOOP(index, num) {
* int idx_n = index / d;
* int idx_remain = index % remain;
* logit_grad[index] *= loss_grad[idx_n * remain + idx_remain];
* }
* }
*
*/
#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \
int64_t __index__ = blockIdx.x * blockDim.x + threadIdx.x; \
for (index_type i = __index__; __index__ < (num); \
__index__ += blockDim.x * gridDim.x, i = __index__)
} // namespace gpu
} // namespace backends
} // namespace pten
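
With the numbers from the comment above, blockIdx.x * blockDim.x already starts at 1112543864; after one int-typed increment of a similarly sized grid-wide stride the index would exceed INT_MAX (2147483647) and wrap negative, which is exactly what the int64_t __index__ prevents. A hedged sketch of a kernel using the macro (the kernel itself is illustrative, not part of the patch):

// Illustrative kernel: the loop index is kept in 64 bits, so both the
// overflow scenario above and element counts beyond INT_MAX are handled.
template <typename T>
__global__ void FillValue(T* out, int64_t num, T value) {
  CUDA_KERNEL_LOOP_TYPE(i, num, int64_t) { out[i] = value; }
}
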
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -12,20 +12,19 @@ ...@@ -12,20 +12,19 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/pten/backends/gpu/gpu_info.h"
// TODO(pten): remove fluid headers.
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/lock_guard_ptr.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/monitor.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/split.h"
static std::once_flag g_device_props_size_init_flag; static std::once_flag g_device_props_size_init_flag;
static std::vector<std::unique_ptr<std::once_flag>> g_device_props_init_flags; static std::vector<std::unique_ptr<std::once_flag>> g_device_props_init_flags;
static std::vector<paddle::gpuDeviceProp> g_device_props; static std::vector<pten::gpuDeviceProp> g_device_props;
namespace pten {
namespace backends {
namespace gpu {
namespace paddle {
namespace platform {
int DnnVersion() { int DnnVersion() {
if (!dynload::HasCUDNN()) return -1; if (!dynload::HasCUDNN()) return -1;
return dynload::cudnnGetVersion(); return dynload::cudnnGetVersion();
...@@ -75,11 +74,13 @@ int GetGPUDeviceCount() { ...@@ -75,11 +74,13 @@ int GetGPUDeviceCount() {
} }
int GetGPUComputeCapability(int id) { int GetGPUComputeCapability(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), PADDLE_ENFORCE_LT(id,
platform::errors::InvalidArgument( GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, " "Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.", "but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount())); id,
GetGPUDeviceCount()));
int major, minor; int major, minor;
auto major_error_code = auto major_error_code =
cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, id); cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, id);
...@@ -92,22 +93,26 @@ int GetGPUComputeCapability(int id) { ...@@ -92,22 +93,26 @@ int GetGPUComputeCapability(int id) {
} }
int GetGPURuntimeVersion(int id) { int GetGPURuntimeVersion(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), PADDLE_ENFORCE_LT(id,
platform::errors::InvalidArgument( GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, " "Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.", "but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount())); id,
GetGPUDeviceCount()));
int runtime_version = 0; int runtime_version = 0;
PADDLE_ENFORCE_GPU_SUCCESS(cudaRuntimeGetVersion(&runtime_version)); PADDLE_ENFORCE_GPU_SUCCESS(cudaRuntimeGetVersion(&runtime_version));
return runtime_version; return runtime_version;
} }
int GetGPUDriverVersion(int id) { int GetGPUDriverVersion(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), PADDLE_ENFORCE_LT(id,
platform::errors::InvalidArgument( GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, " "Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.", "but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount())); id,
GetGPUDeviceCount()));
int driver_version = 0; int driver_version = 0;
PADDLE_ENFORCE_GPU_SUCCESS(cudaDriverGetVersion(&driver_version)); PADDLE_ENFORCE_GPU_SUCCESS(cudaDriverGetVersion(&driver_version));
return driver_version; return driver_version;
...@@ -120,11 +125,13 @@ bool TensorCoreAvailable() { ...@@ -120,11 +125,13 @@ bool TensorCoreAvailable() {
} }
int GetGPUMultiProcessors(int id) { int GetGPUMultiProcessors(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), PADDLE_ENFORCE_LT(id,
platform::errors::InvalidArgument( GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, " "Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.", "but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount())); id,
GetGPUDeviceCount()));
int count; int count;
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id)); cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id));
...@@ -132,11 +139,13 @@ int GetGPUMultiProcessors(int id) { ...@@ -132,11 +139,13 @@ int GetGPUMultiProcessors(int id) {
} }
int GetGPUMaxThreadsPerMultiProcessor(int id) { int GetGPUMaxThreadsPerMultiProcessor(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), PADDLE_ENFORCE_LT(id,
platform::errors::InvalidArgument( GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, " "Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.", "but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount())); id,
GetGPUDeviceCount()));
int count; int count;
PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceGetAttribute( PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceGetAttribute(
&count, cudaDevAttrMaxThreadsPerMultiProcessor, id)); &count, cudaDevAttrMaxThreadsPerMultiProcessor, id));
...@@ -145,11 +154,13 @@ int GetGPUMaxThreadsPerMultiProcessor(int id) { ...@@ -145,11 +154,13 @@ int GetGPUMaxThreadsPerMultiProcessor(int id) {
} }
int GetGPUMaxThreadsPerBlock(int id) { int GetGPUMaxThreadsPerBlock(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), PADDLE_ENFORCE_LT(id,
platform::errors::InvalidArgument( GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, " "Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.", "but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount())); id,
GetGPUDeviceCount()));
int count; int count;
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
cudaDeviceGetAttribute(&count, cudaDevAttrMaxThreadsPerBlock, id)); cudaDeviceGetAttribute(&count, cudaDevAttrMaxThreadsPerBlock, id));
...@@ -162,32 +173,34 @@ int GetCurrentDeviceId() { ...@@ -162,32 +173,34 @@ int GetCurrentDeviceId() {
return device_id; return device_id;
} }
dim3 GetGpuMaxGridDimSize(int id) { std::array<int, 3> GetGpuMaxGridDimSize(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), PADDLE_ENFORCE_LT(id,
platform::errors::InvalidArgument( GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, " "Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.", "but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount())); id,
dim3 ret; GetGPUDeviceCount()));
std::array<int, 3> ret;
int size; int size;
auto error_code_x = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimX, id); auto error_code_x = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimX, id);
PADDLE_ENFORCE_GPU_SUCCESS(error_code_x); PADDLE_ENFORCE_GPU_SUCCESS(error_code_x);
ret.x = size; ret[0] = size;
auto error_code_y = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimY, id); auto error_code_y = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimY, id);
PADDLE_ENFORCE_GPU_SUCCESS(error_code_y); PADDLE_ENFORCE_GPU_SUCCESS(error_code_y);
ret.y = size; ret[1] = size;
auto error_code_z = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimZ, id); auto error_code_z = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimZ, id);
PADDLE_ENFORCE_GPU_SUCCESS(error_code_z); PADDLE_ENFORCE_GPU_SUCCESS(error_code_z);
ret.z = size; ret[2] = size;
return ret; return ret;
} }
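
GetGpuMaxGridDimSize now returns std::array<int, 3> instead of dim3, which keeps the pten interface free of CUDA/HIP vector types. Call sites that still feed the result into launch-configuration code expecting dim3 can convert locally; a hedged sketch of such an adapter (hypothetical helper, to be compiled as CUDA/HIP code where dim3 is available):

#include <array>

// Hypothetical adapter, not part of the patch.
inline dim3 ToDim3(const std::array<int, 3>& max_grid_dim) {
  return dim3(static_cast<unsigned int>(max_grid_dim[0]),
              static_cast<unsigned int>(max_grid_dim[1]),
              static_cast<unsigned int>(max_grid_dim[2]));
}
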
const gpuDeviceProp &GetDeviceProperties(int id) { const gpuDeviceProp &GetDeviceProperties(int id) {
std::call_once(g_device_props_size_init_flag, [&] { std::call_once(g_device_props_size_init_flag, [&] {
int gpu_num = 0; int gpu_num = 0;
gpu_num = platform::GetGPUDeviceCount(); gpu_num = GetGPUDeviceCount();
g_device_props_init_flags.resize(gpu_num); g_device_props_init_flags.resize(gpu_num);
g_device_props.resize(gpu_num); g_device_props.resize(gpu_num);
for (int i = 0; i < gpu_num; ++i) { for (int i = 0; i < gpu_num; ++i) {
...@@ -196,16 +209,17 @@ const gpuDeviceProp &GetDeviceProperties(int id) { ...@@ -196,16 +209,17 @@ const gpuDeviceProp &GetDeviceProperties(int id) {
}); });
if (id == -1) { if (id == -1) {
id = platform::GetCurrentDeviceId(); id = GetCurrentDeviceId();
} }
if (id < 0 || id >= static_cast<int>(g_device_props.size())) { if (id < 0 || id >= static_cast<int>(g_device_props.size())) {
PADDLE_THROW(platform::errors::OutOfRange( PADDLE_THROW(paddle::platform::errors::OutOfRange(
"The device id %d is out of range [0, %d), where %d is the number of " "The device id %d is out of range [0, %d), where %d is the number of "
"devices on this machine. Because the device id should be greater than " "devices on this machine. Because the device id should be greater than "
"or equal to zero and smaller than the number of gpus. Please input " "or equal to zero and smaller than the number of gpus. Please input "
"appropriate device again!", "appropriate device again!",
id, static_cast<int>(g_device_props.size()), id,
static_cast<int>(g_device_props.size()),
static_cast<int>(g_device_props.size()))); static_cast<int>(g_device_props.size())));
} }
...@@ -219,32 +233,43 @@ const gpuDeviceProp &GetDeviceProperties(int id) { ...@@ -219,32 +233,43 @@ const gpuDeviceProp &GetDeviceProperties(int id) {
void SetDeviceId(int id) { void SetDeviceId(int id) {
// TODO(qijun): find a better way to cache the cuda device count // TODO(qijun): find a better way to cache the cuda device count
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), PADDLE_ENFORCE_LT(id,
platform::errors::InvalidArgument( GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, " "Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.", "but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount())); id,
GetGPUDeviceCount()));
PADDLE_RETRY_CUDA_SUCCESS(cudaSetDevice(id)); PADDLE_RETRY_CUDA_SUCCESS(cudaSetDevice(id));
} }
void GpuMemcpyAsync(void *dst, const void *src, size_t count, void GpuMemcpyAsync(void *dst,
gpuMemcpyKind kind, gpuStream_t stream) { const void *src,
size_t count,
gpuMemcpyKind kind,
gpuStream_t stream) {
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(dst, src, count, kind, stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(dst, src, count, kind, stream));
} }
void GpuMemcpySync(void *dst, const void *src, size_t count, void GpuMemcpySync(void *dst,
const void *src,
size_t count,
gpuMemcpyKind kind) { gpuMemcpyKind kind) {
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(dst, src, count, kind)); PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(dst, src, count, kind));
} }
void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, void GpuMemcpyPeerAsync(void *dst,
int src_device, size_t count, gpuStream_t stream) { int dst_device,
const void *src,
int src_device,
size_t count,
gpuStream_t stream) {
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream)); cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream));
} }
void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src, void GpuMemcpyPeerSync(
int src_device, size_t count) { void *dst, int dst_device, const void *src, int src_device, size_t count) {
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
cudaMemcpyPeer(dst, dst_device, src, src_device, count)); cudaMemcpyPeer(dst, dst_device, src, src_device, count));
} }
...@@ -264,5 +289,7 @@ void GpuDestroyStream(gpuStream_t stream) { ...@@ -264,5 +289,7 @@ void GpuDestroyStream(gpuStream_t stream) {
void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); } void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); }
gpuError_t GpuGetLastError() { return cudaGetLastError(); } gpuError_t GpuGetLastError() { return cudaGetLastError(); }
} // namespace platform
} // namespace paddle } // namespace gpu
} // namespace backends
} // namespace pten
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// Forward-declares CUDA API types used in platform-agnostic wrapper headers.
#pragma once
/// Forward declaration of Eigen types.
namespace Eigen {
struct GpuDevice;
} // namespace Eigen
/// Forward declaration of CUDA types.
// Forward declaration of CUDA runtime types.
using cudaStream_t = struct CUstream_st *;
using cudaEvent_t = struct CUevent_st *;
// Forward declaration of cuDNN types.
using cudnnHandle_t = struct cudnnContext *;
using cudnnTensorDescriptor_t = struct cudnnTensorStruct *;
using cudnnConvolutionDescriptor_t = struct cudnnConvolutionStruct *;
using cudnnPoolingDescriptor_t = struct cudnnPoolingStruct *;
using cudnnFilterDescriptor_t = struct cudnnFilterStruct *;
using cudnnLRNDescriptor_t = struct cudnnLRNStruct *;
using cudnnActivationDescriptor_t = struct cudnnActivationStruct *;
using cudnnSpatialTransformerDescriptor_t =
struct cudnnSpatialTransformerStruct *;
using cudnnOpTensorDescriptor_t = struct cudnnOpTensorStruct *;
using cudnnReduceTensorDescriptor_t = struct cudnnReduceTensorStruct *;
using cudnnCTCLossDescriptor_t = struct cudnnCTCLossStruct *;
using cudnnTensorTransformDescriptor_t = struct cudnnTensorTransformStruct *;
using cudnnDropoutDescriptor_t = struct cudnnDropoutStruct *;
using cudnnRNNDescriptor_t = struct cudnnRNNStruct *;
using cudnnPersistentRNNPlan_t = struct cudnnPersistentRNNPlan *;
using cudnnRNNDataDescriptor_t = struct cudnnRNNDataStruct *;
using cudnnAlgorithmDescriptor_t = struct cudnnAlgorithmStruct *;
using cudnnAlgorithmPerformance_t = struct cudnnAlgorithmPerformanceStruct *;
using cudnnSeqDataDescriptor_t = struct cudnnSeqDataStruct *;
using cudnnAttnDescriptor_t = struct cudnnAttnStruct *;
using cudnnFusedOpsConstParamPack_t = struct cudnnFusedOpsConstParamStruct *;
using cudnnFusedOpsVariantParamPack_t =
struct cudnnFusedOpsVariantParamStruct *;
using cudnnFusedOpsPlan_t = struct cudnnFusedOpsPlanStruct *;
// Forward declaration of cuBLAS types.
using cublasHandle_t = struct cublasContext *;
// Forward declaration of cuSOLVER types.
using cusolverDnHandle_t = struct cusolverDnContext *;
// Forward declaration of cuSparse types.
using cusparseHandle_t = struct cusparseContext *;
// Forward declaration of cuFFT types.
using cufftHandle = int;
// Forward declaration of NCCL types.
using ncclComm_t = struct ncclComm *;
/// Forward declaration of ROCM types.
#include <cstddef>
using hipDevice_t = int;
using hipCtx_t = struct ihipCtx_t *;
using hipModule_t = struct ihipModule_t *;
using hipStream_t = struct ihipStream_t *;
using hipEvent_t = struct ihipEvent_t *;
using hipFunction_t = struct ihipModuleSymbol_t *;
// Forward declaration of MIOpen types.
using miopenHandle_t = struct miopenHandle *;
using miopenAcceleratorQueue_t = hipStream_t;
using miopenFusionOpDescriptor_t = struct miopenFusionOpDescriptor *;
using miopenTensorDescriptor_t = struct miopenTensorDescriptor *;
using miopenConvolutionDescriptor_t = struct miopenConvolutionDescriptor *;
using miopenPoolingDescriptor_t = struct miopenPoolingDescriptor *;
using miopenLRNDescriptor_t = struct miopenLRNDescriptor *;
using miopenActivationDescriptor_t = struct miopenActivationDescriptor *;
using miopenRNNDescriptor_t = struct miopenRNNDescriptor *;
using miopenCTCLossDescriptor_t = struct miopenCTCLossDescriptor *;
using miopenDropoutDescriptor_t = struct miopenDropoutDescriptor *;
using miopenFusionPlanDescriptor_t = struct miopenFusionPlanDescriptor *;
using miopenOperatorDescriptor_t = struct miopenOperatorDescriptor *;
using miopenOperatorArgs_t = struct miopenOperatorArgs *;
using miopenAllocatorFunction = void *(*)(void *context, size_t sizeBytes);
// using miopenDeallocatorFunction = void *(*)(void *context, void *memory);
// struct miopenConvAlgoPerf_t;
// struct miopenConvSolution_t;
// Forward declaration of rocBLAS types.
using rocblas_handle = struct _rocblas_handle *;
// Forward declaration of hipfft types.
using hipfftHandle = struct hipfftHandle_t *;
// Forward declaration of rocSOLVER types.
using rocsolver_handle = rocblas_handle;
// Forward declaration of rocSparse types.
using rocsparse_handle = struct _rocsparse_handle *;
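The point of these opaque declarations is that platform-agnostic headers can mention vendor handle types without pulling in the heavyweight SDK headers. A minimal hypothetical consumer (the namespace and function are placeholders, not part of the diff) might look like:

#include "paddle/pten/backends/gpu/forwards.h"

namespace example {  // placeholder namespace
// Declarations only need the opaque pointer aliases above; the .cc file that
// defines LaunchOnStream is the only place that must include <cuda_runtime.h>
// and <cudnn.h>.
void LaunchOnStream(cudaStream_t stream, cudnnHandle_t dnn);
}  // namespace example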
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
...@@ -14,13 +14,162 @@ limitations under the License. */

#pragma once

#include <array>
#include <functional>

#include "paddle/pten/backends/gpu/forwards.h"
#include "paddle/pten/backends/gpu/gpu_decls.h"
#include "paddle/pten/backends/gpu/gpu_helper.h"
#include "paddle/pten/common/place.h"
#include "paddle/pten/core/device_context.h"

namespace pten {

class DnnWorkspaceHandle;
class GPUContext : public DeviceContext {
public:
GPUContext();
explicit GPUContext(const GPUPlace& place);
virtual ~GPUContext();
/*! \brief Return place in the device context. */
const Place& GetPlace() const override;
/*! \brief Return gpu stream in the device context. */
gpuStream_t stream() const;
/*! \brief Return cudnn handle in the device context. */
dnnHandle_t cudnn_handle() const;
/*! \brief Return cublas handle in the device context. */
blasHandle_t cublas_handle() const;
/*! \brief Return cusolver handle in the device context. */
solverHandle_t cusolver_dn_handle() const;
/*! \brief Return cusparse handle in the device context. */
sparseHandle_t cusparse_handle() const;
/*! \brief Wait for all operations completion in the stream. */
void Wait() const override;
/*! \brief Wait for event in the stream. */
void WaitEvent(gpuEvent_t ev) const;
/*! \brief Check whether tensor core is supported */
bool tensor_core_available() const;
/*! \brief Return compute capability in the device context. */
int GetComputeCapability() const;
/*! \brief Return the max physical thread count in the device context */
int GetMaxPhysicalThreadCount() const;
/*! \brief Return the SM count in the device context */
int GetSMCount() const;
/*! \brief Return the Max thread num of block in the device context */
int GetMaxThreadsPerBlock() const;
/*! \brief Return the max grid dim size in the device context */
std::array<int, 3> GetCUDAMaxGridDimSize() const;
/*! \brief Return eigen device in the device context. */
Eigen::GpuDevice* eigen_device() const;
/*! \brief Return a cudnn workspace handle with which multiple cudnn
 * functions can be called without being interrupted by other threads.
 * Once the first cudnn function is called through the handle, a lock
 * is acquired to prevent other threads from accessing the workspace;
 * the lock is released when the handle is destructed.
 */
DnnWorkspaceHandle* cudnn_workspace_handle();
public:
/*! \brief Call cublas function safely. */
void CublasCall(const std::function<void(blasHandle_t)>&) const;
/*! \brief Call cublas function with Tensor Core safely. If
Tensor Core is not available, use DEFAULT_MATH instead. */
void TensorCoreCublasCallIfAvailable(
const std::function<void(blasHandle_t)>&) const;
/*! \brief Call cusparse function safely. */
void CusparseCall(const std::function<void(sparseHandle_t)>&) const;
void RecordEvent(gpuEvent_t ev, const std::function<void()>& callback) const;
void RecordEvent(gpuEvent_t ev) const;
void AddStreamCallback(const std::function<void()>& callback) const;
void WaitStreamCallback() const;
public:
/*! \brief Return nccl communicators. */
ncclComm_t nccl_comm() const;
/*! \brief Set nccl communicators. */
void set_nccl_comm(ncclComm_t comm);
public:
// NOTE: DeviceContext owns resources. Used in training scenarios.
// In the training scenario, DeviceContext initializes all resources and
// deletes them when it is destructed.
// Note that you must set the Allocator before calling the Init function.
void Init();
// TODO(wilber): Why does the GetAllocator interface require a stream
// parameter? The temporary trick method bypasses this problem, and the
// following interfaces need to be deleted later.
// Note that this is a trick implementation, which can be used to partially
// initialize the context when the SetAllocator interface has not been called.
void PartialInitWithoutAllocator();
// Note that this is a trick implementation that can be used to initialize
// resources that require an Allocator once the SetAllocator interface has
// been called.
void PartialInitWithAllocator();
protected:
// NOTE: External users manage the resources. Used in inference scenarios.
// The Set interfaces are for inference only; DeviceContext marks the
// resources as external and does not delete them when it is destructed.
void SetStream(gpuStream_t);
void SetEigenDevice(Eigen::GpuDevice*);
void SetBlasHandle(blasHandle_t);
void SetDnnHandle(dnnHandle_t);
void SetSolverHandle(solverHandle_t);
void SetSparseHandle(sparseHandle_t);
void SetDnnWorkspaceHandle(DnnWorkspaceHandle*);
void SetComputeCapability(int val);
void SetMaxThreadsPerMultiProcessor(int val);
void SetMultiProcessors(int val);
void SetMaxThreadsPerBlock(int val);
void SetMaxGridDimSize(const std::array<int, 3>& val);
void SetDriverVersion(int val);
void SetRuntimeVersion(int val);
private:
struct Impl;
std::unique_ptr<Impl> impl_;
};
} // namespace pten
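A minimal lifecycle sketch for the class above, based only on the comments in this header (illustration, not part of the diff). A fully initialized context needs an allocator plus Init(); since allocator wiring is outside this header, the sketch uses the public partial-initialization path, and the GPUPlace constructor and device id 0 are assumptions.

#include "paddle/pten/backends/gpu/gpu_context.h"

void GpuContextSketch() {
  // Bind the context to GPU 0 (device id is a placeholder).
  pten::GPUContext ctx(pten::GPUPlace(0));
  // Without an allocator, only the partial initialization described above is
  // available; a full setup would set an allocator and then call Init().
  ctx.PartialInitWithoutAllocator();
  // Query device properties and the raw stream owned by the context.
  int sm_count = ctx.GetSMCount();
  pten::gpuStream_t stream = ctx.stream();
  (void)sm_count;
  (void)stream;
  // Block until all work queued on the context's stream has finished.
  ctx.Wait();
}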
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/pten/backends/gpu/forwards.h"
namespace pten {
#ifdef PADDLE_WITH_HIP
#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \
using GPU_TYPE = ROCM_TYPE;
#else  // PADDLE_WITH_CUDA
#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \
using GPU_TYPE = CUDA_TYPE;
#endif
DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t);
DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t);
DECLARE_TYPE_FOR_GPU(dnnActivationDescriptor,
cudnnActivationStruct,
miopenActivationDescriptor);
DECLARE_TYPE_FOR_GPU(dnnTensorDescriptor,
cudnnTensorStruct,
miopenTensorDescriptor);
DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor,
cudnnFilterStruct,
miopenTensorDescriptor);
DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor_t,
cudnnFilterDescriptor_t,
miopenTensorDescriptor_t);
DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor,
cudnnConvolutionStruct,
miopenConvolutionDescriptor);
DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor_t,
cudnnConvolutionDescriptor_t,
miopenConvolutionDescriptor_t);
DECLARE_TYPE_FOR_GPU(dnnPoolingDescriptor_t,
cudnnPoolingDescriptor_t,
miopenPoolingDescriptor_t);
DECLARE_TYPE_FOR_GPU(dnnDropoutDescriptor_t,
cudnnDropoutDescriptor_t,
miopenDropoutDescriptor_t);
DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t);
DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle);
DECLARE_TYPE_FOR_GPU(solverHandle_t, cusolverDnHandle_t, rocsolver_handle);
DECLARE_TYPE_FOR_GPU(sparseHandle_t, cusparseHandle_t, rocsparse_handle);
#undef DECLARE_TYPE_FOR_GPU
using CUDAGraphID = unsigned long long; // NOLINT
} // namespace pten
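For reference, a sketch of what one expansion of DECLARE_TYPE_FOR_GPU produces, plus a hypothetical declaration written against the vendor-neutral aliases (neither is part of the diff):

#include "paddle/pten/backends/gpu/gpu_decls.h"

// DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t) boils down to
//   using gpuStream_t = hipStream_t;   // in a ROCm (PADDLE_WITH_HIP) build
//   using gpuStream_t = cudaStream_t;  // in a CUDA build
// so a declaration written once compiles unchanged on both stacks:
void RecordMarker(pten::gpuEvent_t event, pten::gpuStream_t stream);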
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#ifdef PADDLE_WITH_HIP
#include "paddle/pten/backends/gpu/rocm/rocm_helper.h"
#else
#include "paddle/pten/backends/gpu/cuda/cuda_helper.h"
#endif
#define CUDA_KERNEL_LOOP(i, num) CUDA_KERNEL_LOOP_TYPE(i, num, int)
#endif
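As a small illustration of the loop macro defined above (hypothetical kernel, not part of the diff; it assumes CUDA_KERNEL_LOOP expands to the usual grid-stride iteration over [0, n)):

#include "paddle/pten/backends/gpu/gpu_helper.h"

// Hypothetical elementwise kernel scaling n floats in place.
__global__ void ScaleKernel(float *data, int n, float alpha) {
  CUDA_KERNEL_LOOP(i, n) { data[i] *= alpha; }
}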
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/gpu/gpu_info.h"
#include <cstdlib>
#include <sstream>
#include <string>
#include <vector>
#include "gflags/gflags.h"
DECLARE_string(selected_gpus);
namespace pten {
namespace backends {
namespace gpu {
static inline std::vector<std::string> Split(std::string const& original,
char separator) {
std::vector<std::string> results;
std::string token;
std::istringstream is(original);
while (std::getline(is, token, separator)) {
if (!token.empty()) {
results.push_back(token);
}
}
return results;
}
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetSelectedDevices() {
// use user specified GPUs in single-node multi-process mode.
std::vector<int> devices;
if (!FLAGS_selected_gpus.empty()) {
auto devices_str = Split(FLAGS_selected_gpus, ',');
for (auto id : devices_str) {
devices.push_back(atoi(id.c_str()));
}
} else {
int count = GetGPUDeviceCount();
for (int i = 0; i < count; ++i) {
devices.push_back(i);
}
}
return devices;
}
} // namespace gpu
} // namespace backends
} // namespace pten
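A small usage sketch for the flag handling above (illustration only; the flag value, the assertion, and the assumption that gpu_info.h declares GetSelectedDevices are all hypothetical): with FLAGS_selected_gpus set to "0,2,3" the function returns exactly those ids, and with the flag left empty it enumerates every visible device.

#include <cassert>
#include <string>
#include <vector>

#include "gflags/gflags.h"
#include "paddle/pten/backends/gpu/gpu_info.h"  // assumed to declare GetSelectedDevices

DECLARE_string(selected_gpus);

// Hypothetical check exercising the selected-gpus parsing path.
void SelectedDevicesExample() {
  FLAGS_selected_gpus = "0,2,3";
  std::vector<int> devices = pten::backends::gpu::GetSelectedDevices();
  assert(devices.size() == 3 && devices[0] == 0 && devices[1] == 2 &&
         devices[2] == 3);
}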