Unverified commit a821c4a9, authored by Wilber, committed by GitHub

[PTEN] Add Gpu context (#39305)

Parent dcff7fa8
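Most of the hunks below follow one recurring pattern: a platform::CUDADeviceContext constructed directly (rather than obtained from the DeviceContextPool) must now be wired to its allocators before first use. The following is a minimal sketch of that pattern assembled from the hunks in this diff; the helper function name is illustrative and not part of the commit.

#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/device_context.h"

// Illustrative helper (name not from this commit): wire a manually constructed
// CUDADeviceContext to its device, host and zero-size allocators, then finish
// its deferred initialization.
void InitCudaDeviceContext(paddle::platform::CUDADeviceContext* ctx,
                           const paddle::platform::CUDAPlace& place) {
  ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                        .GetAllocator(place, ctx->stream())
                        .get());
  ctx->SetHostAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                            .GetAllocator(paddle::platform::CPUPlace())
                            .get());
  ctx->SetZeroAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                            .GetZeroAllocator(place)
                            .get());
  ctx->PartialInitWithAllocator();
}

Many of the simpler call sites below only attach the device allocator before calling PartialInitWithAllocator(); a few (for example the TensorRT engine test, the malloc multi-stream tests, and NCCLCommContext) also set the host and zero-size allocators.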
......@@ -33,7 +33,7 @@ namespace distributed {
template <typename T>
inline paddle::operators::math::BlasT<paddle::platform::CPUDeviceContext, T>
GetBlas() {
auto cpu_ctx = paddle::platform::CPUDeviceContext();
paddle::platform::CPUDeviceContext cpu_ctx;
return paddle::operators::math::GetBlas<paddle::platform::CPUDeviceContext,
T>(cpu_ctx);
}
......
......@@ -1155,7 +1155,7 @@ void GeoCommunicator::SendDense(const CommContext &send_ctx) {
auto &t_latest = var_latest->Get<framework::LoDTensor>();
auto t_timestamp = var_timestamp->GetMutable<framework::LoDTensor>();
auto cpu_ctx = paddle::platform::CPUDeviceContext();
paddle::platform::CPUDeviceContext cpu_ctx;
auto *var_delta = delta_scope_->Var(varname);
auto *t_delta = var_delta->GetMutable<framework::LoDTensor>();
t_delta->mutable_data<float>(t_latest.dims(), cpu_ctx.GetPlace());
......@@ -1185,7 +1185,7 @@ void GeoCommunicator::RecvDense(const CommContext &send_ctx) {
RpcRecvDense(varnames, table_id, pserver_scope_.get());
// 2.1 pserver - old => delta; 2.2 latest + old => latest 2.3 old => pserver
auto cpu_ctx = paddle::platform::CPUDeviceContext();
paddle::platform::CPUDeviceContext cpu_ctx;
for (auto &varname : varnames) {
auto *var_latest = recv_scope_->FindVar(varname);
auto t_latest = var_latest->GetMutable<framework::LoDTensor>();
......@@ -1292,7 +1292,7 @@ void GeoCommunicator::SendSparse(const std::string &varname,
auto *t_old = var_old->GetMutable<framework::LoDTensor>();
auto dims1 = t_latest.dims()[1];
auto cpu_ctx = paddle::platform::CPUDeviceContext();
paddle::platform::CPUDeviceContext cpu_ctx;
auto *var_delta = delta_scope_->Var(varname);
auto *t_delta = var_delta->GetMutable<pten::SelectedRows>();
......@@ -1370,7 +1370,7 @@ void GeoCommunicator::RecvSparse(const std::string &varname, int table_id,
std::vector<float> v_delta;
v_delta.resize(numel);
auto cpu_ctx = paddle::platform::CPUDeviceContext();
paddle::platform::CPUDeviceContext cpu_ctx;
auto blas =
paddle::operators::math::GetBlas<platform::CPUDeviceContext, float>(
cpu_ctx);
......
......@@ -179,7 +179,7 @@ inline void MergeVars(const std::string &var_name,
}
// set output tensor to 0.
auto cpu_ctx = paddle::platform::CPUDeviceContext();
paddle::platform::CPUDeviceContext cpu_ctx;
paddle::operators::math::SetConstant<paddle::platform::CPUDeviceContext, T>
constant_functor;
constant_functor(cpu_ctx, out_t, static_cast<T>(0));
......@@ -204,7 +204,7 @@ inline void MergeVars(const std::string &var_name,
for (auto &var : vars) {
inputs.push_back(&var->Get<pten::SelectedRows>());
}
auto dev_ctx = paddle::platform::CPUDeviceContext();
paddle::platform::CPUDeviceContext dev_ctx;
if (merge_add) {
paddle::operators::math::scatter::MergeAdd<
paddle::platform::CPUDeviceContext, T>
......
......@@ -21,7 +21,10 @@ TEST(DataTypeTransform, GPUTransform) {
auto cpu_place = paddle::platform::CPUPlace();
auto gpu_place = paddle::platform::CUDAPlace(0);
paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
auto kernel_fp16 = paddle::framework::OpKernelType(
paddle::framework::proto::VarType::FP16, gpu_place,
paddle::framework::DataLayout::kAnyLayout,
......
......@@ -1361,7 +1361,7 @@ void ParallelExecutor::PrepareNCCLCommunicator(Scope *global_scope) {
auto *dev_ctx = static_cast<platform::XPUDeviceContext *>(
pool.Get(member_->places_[dev_id]));
auto &bkcl_ctx = bkcl_ctxs->at(member_->places_[dev_id]);
dev_ctx->set_bkcl_context(bkcl_ctx.comm());
dev_ctx->SetBkclContext(bkcl_ctx.comm());
}
#else
PADDLE_THROW(
......
......@@ -77,6 +77,13 @@ struct ConvertToPtenContext<platform::CPUDeviceContext> {
using TYPE = pten::CPUContext;
};
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <>
struct ConvertToPtenContext<platform::CUDADeviceContext> {
using TYPE = pten::GPUContext;
};
#endif
#ifdef PADDLE_WITH_XPU
template <>
struct ConvertToPtenContext<platform::XPUDeviceContext> {
......
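The new specialization above declares that a platform::CUDADeviceContext converts to pten::GPUContext. Elsewhere in this commit the same relationship appears as explicit casts when fluid operators call pten kernels directly, for example in the reshape-op hunks further down (*in, pt_scalar_shape and out are the operator's existing arguments):

// Representative call-site pattern from this diff: the CUDA context is passed
// to a pten kernel as its pten::GPUContext base.
auto& dev_ctx = ctx.device_context<platform::CUDADeviceContext>();
pten::ReshapeKernel(static_cast<const pten::GPUContext&>(dev_ctx), *in,
                    pt_scalar_shape, out);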
......@@ -1085,7 +1085,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
is.seekg(seekg, is.cur);
void* buf;
auto ctx = platform::CPUDeviceContext();
platform::CPUDeviceContext ctx;
size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
platform::is_xpu_place(dev_ctx.GetPlace()) ||
......@@ -1155,7 +1155,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
tensor->Resize(framework::make_ddim(dims));
void* buf;
auto ctx = platform::CPUDeviceContext();
platform::CPUDeviceContext ctx;
size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
platform::is_xpu_place(dev_ctx.GetPlace()) ||
......@@ -1432,4 +1432,4 @@ std::ostream& operator<<(std::ostream& os, const pten::DenseTensor& t) {
VLOG(1) << "PrintVar: unrecognized data type:" << t.type();
return os;
}
}
} // namespace pten
......@@ -73,6 +73,10 @@ TEST(TensorCopy, Tensor) {
// CPU Tensor to GPU Tensor
auto gpu_place = new platform::CUDAPlace(0);
platform::CUDADeviceContext gpu_ctx(*gpu_place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*gpu_place, gpu_ctx.stream())
.get());
gpu_ctx.PartialInitWithAllocator();
TensorCopy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
// GPU Tensor to CPU Tensor
......@@ -166,6 +170,10 @@ TEST(TensorFromVector, Tensor) {
gpu_tensor.Resize(paddle::framework::make_ddim({3, 3}));
auto gpu_place = new paddle::platform::CUDAPlace();
paddle::platform::CUDADeviceContext gpu_ctx(*gpu_place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*gpu_place, gpu_ctx.stream())
.get());
gpu_ctx.PartialInitWithAllocator();
paddle::framework::TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
// Copy from GPU to CPU tensor for comparison
paddle::framework::TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
......@@ -230,6 +238,10 @@ TEST(TensorToVector, Tensor) {
paddle::framework::Tensor gpu_tensor;
paddle::platform::CUDAPlace place;
paddle::platform::CUDADeviceContext gpu_ctx(place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, gpu_ctx.stream())
.get());
gpu_ctx.PartialInitWithAllocator();
paddle::framework::TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
std::vector<int> dst;
......@@ -267,6 +279,10 @@ TEST(TensorToVector, Tensor_bool) {
paddle::framework::Tensor gpu_tensor;
paddle::platform::CUDAPlace place;
paddle::platform::CUDADeviceContext gpu_ctx(place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, gpu_ctx.stream())
.get());
gpu_ctx.PartialInitWithAllocator();
paddle::framework::TensorFromVector<bool>(src_vec, gpu_ctx, &gpu_tensor);
std::vector<bool> dst;
......@@ -493,6 +509,10 @@ TEST(Tensor, FromAndToStream) {
auto gpu_place = new platform::CUDAPlace();
platform::CUDADeviceContext gpu_ctx(*gpu_place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*gpu_place, gpu_ctx.stream())
.get());
gpu_ctx.PartialInitWithAllocator();
TensorCopy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
......
......@@ -46,6 +46,17 @@ void GLOOParallelContext::Init() {
gloo_wrapper->Init();
device_ = std::unique_ptr<platform::CPUDeviceContext>(
new platform::CPUDeviceContext(platform::CPUPlace()));
device_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(platform::CPUPlace())
.get());
device_->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
device_->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(platform::CPUPlace())
.get());
}
void GLOOParallelContext::InitWithRingID(int ring_id) {
......
......@@ -77,6 +77,10 @@ void make_fake_model(std::string* model, std::string* param) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
#else
platform::CPUPlace place;
platform::CPUDeviceContext ctx(place);
......
......@@ -27,6 +27,18 @@ class TensorRTEngineTest : public ::testing::Test {
protected:
void SetUp() override {
ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0));
ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(platform::CUDAPlace(0), ctx_->stream())
.get());
ctx_->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
ctx_->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(platform::CUDAPlace(0))
.get());
ctx_->PartialInitWithAllocator();
engine_ = new TensorRTEngine(10, 1 << 10);
engine_->InitNetwork();
......
......@@ -18,6 +18,7 @@
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cuda_allocator.h"
#include "paddle/fluid/memory/allocation/locked_allocator.h"
......@@ -44,6 +45,10 @@ TEST(BestFitAllocator, concurrent_cuda) {
platform::CUDAPlace gpu(0);
platform::CUDADeviceContext dev_ctx(gpu);
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu, dev_ctx.stream())
.get());
dev_ctx.PartialInitWithAllocator();
auto th_main = [&](std::random_device::result_type seed) {
std::default_random_engine engine(seed);
......
......@@ -25,6 +25,7 @@
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/device_context.h"
......@@ -105,8 +106,21 @@ TEST(Malloc, CUDADeviceContextMultiStream) {
main_stream_alloc_ptr.reset();
for (int i = 0; i < NUM_STREAMS; ++i) {
dev_ctx.push_back(std::unique_ptr<platform::CUDADeviceContext>(
new platform::CUDADeviceContext(place)));
auto ctx = std::unique_ptr<platform::CUDADeviceContext>(
new platform::CUDADeviceContext(place));
ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx->stream())
.get());
ctx->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
ctx->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place)
.get());
ctx->PartialInitWithAllocator();
dev_ctx.emplace_back(std::move(ctx));
MultiStreamCompute(&data[i], &second_data[i], *dev_ctx[i]);
}
......@@ -144,8 +158,21 @@ TEST(Malloc, CUDADeviceContextMultiThreadMultiStream) {
main_stream_alloc_ptr.reset();
for (int i = 0; i < NUM_STREAMS; ++i) {
dev_ctx.push_back(std::unique_ptr<platform::CUDADeviceContext>(
new platform::CUDADeviceContext(place)));
auto ctx = std::unique_ptr<platform::CUDADeviceContext>(
new platform::CUDADeviceContext(place));
ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx->stream())
.get());
ctx->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
ctx->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place)
.get());
ctx->PartialInitWithAllocator();
dev_ctx.emplace_back(std::move(ctx));
threads.push_back(std::thread(MultiStreamCompute, &data[i], &second_data[i],
std::cref(*dev_ctx[i])));
}
......
......@@ -110,7 +110,7 @@ void ComputeFullArg(const platform::CUDADeviceContext& ctx, const Tensor& input,
return block_size;
};
int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize().x;
int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0];
int64_t height = pre * post;
int64_t width = n;
int64_t grid_size = height < max_grid_dimx ? height : max_grid_dimx;
......
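This call-site change, repeated across the CUDA kernels in this commit, reflects that GetCUDAMaxGridDimSize() no longer returns a dim3: the maximum grid extents are now read by index, presumably from an array-like type. A minimal sketch under that assumption; the helper is illustrative and not part of the commit.

#include "paddle/fluid/platform/device_context.h"

// Clamp a requested grid.x against the device limit using the new indexed
// accessor (max_grid_dim[0] rather than max_grid_dim.x).
inline int64_t ClampGridDimX(const paddle::platform::CUDADeviceContext& ctx,
                             int64_t wanted) {
  int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0];
  return wanted < max_grid_dimx ? wanted : max_grid_dimx;
}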
......@@ -131,7 +131,7 @@ void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input,
int block_size = ComputeBlockSize(num_cols);
int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x;
int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0];
// actually, int num_rows < max_grid_size
int grid_size = num_rows < maxGridDimX ? num_rows : maxGridDimX;
// Init a index array
......@@ -212,7 +212,7 @@ void ArgFullAssign(const platform::CUDADeviceContext& ctx, const Tensor* dO,
int block_size = ComputeBlockSize(num_cols);
int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x;
int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0];
// actually, int num_rows < max_grid_size
int grid_size = num_rows < maxGridDimX ? num_rows : maxGridDimX;
FillGrad<<<grid_size, block_size, 0, cu_stream>>>(
......
......@@ -90,8 +90,8 @@ class CUDABroadcastTensorsGradOpKernel : public framework::OpKernel<T> {
// reduce_sum implementation on CUDA
auto stream = context.cuda_device_context().stream();
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
*input_tensor, output_tensor, kps::IdentityFunctor<T>(),
reduce_dims_vec, stream);
context.cuda_device_context(), *input_tensor, output_tensor,
kps::IdentityFunctor<T>(), reduce_dims_vec, stream);
}
}
}
......
......@@ -115,7 +115,8 @@ class MatrixReduceSumFunctor<platform::CUDADeviceContext, T> {
}
gpuStream_t stream = ctx.cuda_device_context().stream();
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
in, out, kps::IdentityFunctor<T>(), out_reduce_dims, stream);
ctx.cuda_device_context(), in, out, kps::IdentityFunctor<T>(),
out_reduce_dims, stream);
}
};
......
......@@ -77,7 +77,7 @@ class ClipByNormKernel<platform::CUDADeviceContext, platform::float16>
{1}, dev_ctx);
TensorReduceFunctorImpl<platform::float16, float, kps::AddFunctor,
kps::SquareFunctor<platform::float16, float>>(
*input, &tmp, kps::SquareFunctor<platform::float16, float>(),
dev_ctx, *input, &tmp, kps::SquareFunctor<platform::float16, float>(),
reduce_dims, dev_ctx.stream());
auto tmp_eigen = EigenVector<float>::Flatten(tmp);
auto x_norm = tmp_eigen.sqrt();
......
......@@ -65,7 +65,8 @@ class CompareReduceOpKernel
auto stream = context.cuda_device_context().stream();
TensorReduceFunctorImpl<bool, bool, BitwiseAdd,
kps::IdentityFunctor<bool>>(
tmp, z, kps::IdentityFunctor<bool>(), reduce_dims, stream);
context.cuda_device_context(), tmp, z, kps::IdentityFunctor<bool>(),
reduce_dims, stream);
}
}
};
......
......@@ -131,12 +131,20 @@ void Compare2(f::Scope* scope, const p::DeviceContext& ctx,
TEST(copy_cross_scope, CUDA_fp32) {
f::Scope scope;
p::CUDADeviceContext ctx(p::CUDAPlace(0));
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(p::CUDAPlace(0), ctx.stream())
.get());
ctx.PartialInitWithAllocator();
Compare1<float>(&scope, ctx, "copy_cross_scope");
}
TEST(copy_cross_scope_to_main_scope, CUDA_fp32) {
f::Scope scope;
p::CUDADeviceContext ctx(p::CUDAPlace(0));
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(p::CUDAPlace(0), ctx.stream())
.get());
ctx.PartialInitWithAllocator();
Compare2<float>(&scope, ctx, "copy_cross_scope");
}
#elif PADDLE_WITH_ASCEND_CL
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_mul_op.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
......@@ -51,8 +52,8 @@ class ElementwiseMulKernel<platform::CUDADeviceContext, T>
auto pt_x = paddle::experimental::MakePtenDenseTensor(*x_lod);
auto pt_y = paddle::experimental::MakePtenDenseTensor(*y_lod);
auto pt_z = paddle::experimental::MakePtenDenseTensor(*z_lod);
pten::MultiplyRawKernel<T>(cuda_ctx, *pt_x.get(), *pt_y.get(), axis,
pt_z.get());
pten::MultiplyRawKernel<T>(static_cast<const pten::GPUContext&>(cuda_ctx),
*pt_x.get(), *pt_y.get(), axis, pt_z.get());
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"X's type[%s] is not supported by elementwise_op. X's type should be "
......
......@@ -1189,7 +1189,8 @@ void ReduceWrapper(const platform::CUDADeviceContext &dev_ctx, int axis,
framework::Tensor *src, framework::Tensor *dst) {
std::vector<int> reduce_dims = GetReduceDim(dst->dims(), src->dims(), axis);
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
*src, dst, kps::IdentityFunctor<T>(), reduce_dims, dev_ctx.stream());
dev_ctx, *src, dst, kps::IdentityFunctor<T>(), reduce_dims,
dev_ctx.stream());
}
template <ElementwiseType ET, typename T, typename Functor>
......
......@@ -275,6 +275,18 @@ class TestFeedForward {
output_size_ = 3 * num_head_ * dim_head_;
input_size_ = dim_embed_;
ctx_ = new platform::CUDADeviceContext(place_);
ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place_, ctx_->stream())
.get());
ctx_->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
ctx_->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place_)
.get());
ctx_->PartialInitWithAllocator();
size_src_ = bsz_seq_ * dim_embed_; // src: [bs, seq_len, em_dim]
size_weight_ = dim_embed_ * output_size_; // weight: [output_size, em_dim]
......
......@@ -166,7 +166,8 @@ class AttnMatMul {
if (support_case_1 || support_case_2) {
gpuStream_t stream = dev_ctx_.stream();
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
*d_output, d_bias, kps::IdentityFunctor<T>(), {0, 1}, stream);
dev_ctx_, *d_output, d_bias, kps::IdentityFunctor<T>(), {0, 1},
stream);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Only support reduce when the input dims are [0,1,2,3,4] and "
......
......@@ -108,7 +108,7 @@ static bool TryLaunchFP16FastGeluFwdVectorizeCUDAKernel(
is_aligned(y, kAlignment)) { \
size_t thread = std::min<size_t>(512, dev_ctx.GetMaxThreadsPerBlock()); \
size_t block = (n / __vec_size + thread - 1) / thread; \
block = std::min<size_t>(block, dev_ctx.GetCUDAMaxGridDimSize().x); \
block = std::min<size_t>(block, dev_ctx.GetCUDAMaxGridDimSize()[0]); \
VLOG(10) << "Use FP16 fast gelu fwd kernel, block = " << block \
<< " , thread = " << thread; \
FP16FastGeluFwdCUDAKernel< \
......@@ -144,7 +144,7 @@ static bool TryLaunchFP16FastGeluBwdVectorizeCUDAKernel(
is_aligned(x_g, kAlignment)) { \
size_t thread = std::min<size_t>(512, dev_ctx.GetMaxThreadsPerBlock()); \
size_t block = (n / __vec_size + thread - 1) / thread; \
block = std::min<size_t>(block, dev_ctx.GetCUDAMaxGridDimSize().x); \
block = std::min<size_t>(block, dev_ctx.GetCUDAMaxGridDimSize()[0]); \
VLOG(10) << "Use FP16 fast gelu bwd kernel, block = " << block \
<< " , thread = " << thread; \
FP16FastGeluBwdCUDAKernel< \
......
......@@ -260,7 +260,7 @@ void FillHashTable(const framework::ExecutionContext& ctx, const T* input,
int block = 1024;
#endif
const auto& dev_ctx = ctx.cuda_device_context();
int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x;
int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
int grid_tmp = (num_input + block - 1) / block;
int grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
// 1. Insert data into keys and values.
......@@ -334,7 +334,7 @@ void ReindexFunc(const framework::ExecutionContext& ctx,
int block = 1024;
#endif
const auto& dev_ctx = ctx.cuda_device_context();
int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x;
int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
int64_t grid_tmp = (outputs->size() + block - 1) / block;
int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
ReindexSrcOutput<
......
......@@ -197,7 +197,7 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(
#endif
int64_t n = slice_size * index_size;
const auto& dev_ctx = ctx.cuda_device_context();
int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x;
int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
int64_t grid_tmp = (n + block - 1) / block;
int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
int64_t input_size = src_dims[0];
......@@ -320,7 +320,7 @@ void GraphSendRecvGradOpCUDAKernelLaunchHelper(
#endif
int64_t n = slice_size * index_size;
const auto& dev_ctx = ctx.cuda_device_context();
int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x;
int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
int64_t grid_tmp = (n + block - 1) / block;
int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
int64_t input_size = src_dims[0];
......
......@@ -92,7 +92,7 @@ struct OneHotGenerator<platform::CUDADeviceContext, T> {
const int size_from_axis = SizeFromAxis(axis, X.dims());
const int size_out_axis = SizeOutAxis(axis, X.dims());
constexpr int thread_size = 512;
int64_t max_grid_dimx = context.GetCUDAMaxGridDimSize().x;
int64_t max_grid_dimx = context.GetCUDAMaxGridDimSize()[0];
int64_t height = size_to_axis * size_out_axis;
int block_size = height < max_grid_dimx ? height : max_grid_dimx;
......
......@@ -27,10 +27,10 @@ namespace operators {
namespace {
void LimitGridDim(const framework::ExecutionContext& ctx, dim3* grid_dim) {
dim3 max_grid_dim = ctx.template device_context<platform::CUDADeviceContext>()
auto max_grid_dim = ctx.template device_context<platform::CUDADeviceContext>()
.GetCUDAMaxGridDimSize();
grid_dim->x = grid_dim->x < max_grid_dim.x ? grid_dim->x : max_grid_dim.x;
grid_dim->y = grid_dim->y < max_grid_dim.y ? grid_dim->y : max_grid_dim.y;
grid_dim->x = grid_dim->x < max_grid_dim[0] ? grid_dim->x : max_grid_dim[0];
grid_dim->y = grid_dim->y < max_grid_dim[1] ? grid_dim->y : max_grid_dim[1];
}
}
......
......@@ -45,11 +45,11 @@ inline platform::GpuLaunchConfig GetGpuLaunchConfig3D(
int block_y = std::min(GetLastPow2(height), max_threads / block_x);
int block_z = std::min(num_img, max_threads / block_x / block_y);
dim3 max_grid_dim = context.GetCUDAMaxGridDimSize();
int grid_x = std::min<int>(max_grid_dim.x, platform::DivUp(width, block_x));
int grid_y = std::min<int>(max_grid_dim.y, platform::DivUp(height, block_y));
auto max_grid_dim = context.GetCUDAMaxGridDimSize();
int grid_x = std::min<int>(max_grid_dim[0], platform::DivUp(width, block_x));
int grid_y = std::min<int>(max_grid_dim[1], platform::DivUp(height, block_y));
int grid_z =
std::min<int>(max_grid_dim.z, platform::DivUp(num_img, block_z * 4));
std::min<int>(max_grid_dim[2], platform::DivUp(num_img, block_z * 4));
const int capability = context.GetComputeCapability();
platform::GpuLaunchConfig config;
......
......@@ -306,11 +306,11 @@ struct KronGradOpFunctor {
auto stream = dev_ctx.stream(); // it is a cuda device_context
if (dx) {
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
dout_x, dx, kps::IdentityFunctor<T>(), {1}, stream);
dev_ctx, dout_x, dx, kps::IdentityFunctor<T>(), {1}, stream);
}
if (dy) {
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
dout_y, dy, kps::IdentityFunctor<T>(), {1}, stream);
dev_ctx, dout_y, dy, kps::IdentityFunctor<T>(), {1}, stream);
}
#else
auto* place = dev_ctx.eigen_device();
......
......@@ -54,7 +54,7 @@ bool SortKthvalue(const platform::CUDADeviceContext& ctx,
input_indices.mutable_data<int64_t>(ctx.GetPlace());
size_t temp_storage_bytes = -1;
int block_size = getBlockSize(num_cols);
unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x;
unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0];
unsigned int grid_size = num_rows < maxGridDimX
? static_cast<unsigned int>(num_rows)
: maxGridDimX;
......
......@@ -72,6 +72,10 @@ TEST(LiteEngineOp, engine_op) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
#else
platform::CPUPlace place;
platform::CPUDeviceContext ctx(place);
......
......@@ -299,7 +299,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
T* logits_max_buff = logits_max.mutable_data<T>(place);
TensorReduceFunctorImpl<T, T, kps::MaxFunctor, kps::IdentityFunctor<T>>(
softmax_2d, &logits_max, kps::IdentityFunctor<T>(), {1},
dev_ctx, softmax_2d, &logits_max, kps::IdentityFunctor<T>(), {1},
dev_ctx.stream());
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
......@@ -321,7 +321,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
T* sum_exp_logits_buff = sum_exp_logits.mutable_data<T>(place);
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::ExpFunctor<T>>(
softmax_2d, &sum_exp_logits, kps::ExpFunctor<T>(), {1},
dev_ctx, softmax_2d, &sum_exp_logits, kps::ExpFunctor<T>(), {1},
dev_ctx.stream());
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
......
......@@ -15,6 +15,8 @@ limitations under the License. */
#include "paddle/fluid/operators/math/beam_search.h"
#include <gtest/gtest.h>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
void PrepareCPUTensors(paddle::framework::LoDTensor* ids,
paddle::framework::LoDTensor* scores,
......@@ -129,6 +131,83 @@ void TestBeamSearch() {
delete context;
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <>
void TestBeamSearch<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace>() {
paddle::framework::LoDTensor ids;
paddle::framework::LoDTensor scores;
paddle::framework::LoDTensor pre_ids;
paddle::framework::LoDTensor pre_scores;
auto* place = new paddle::platform::CUDAPlace();
auto* context = new paddle::platform::CUDADeviceContext(*place);
context->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*place, context->stream())
.get());
context->PartialInitWithAllocator();
if (paddle::platform::is_cpu_place(*place)) {
PrepareCPUTensors(&ids, &scores, &pre_ids, &pre_scores);
} else {
paddle::framework::LoDTensor cpu_ids;
paddle::framework::LoDTensor cpu_scores;
paddle::framework::LoDTensor cpu_pre_ids;
paddle::framework::LoDTensor cpu_pre_scores;
PrepareCPUTensors(&cpu_ids, &cpu_scores, &cpu_pre_ids, &cpu_pre_scores);
paddle::framework::TensorCopySync(cpu_ids, *place, &ids);
paddle::framework::TensorCopySync(cpu_scores, *place, &scores);
paddle::framework::TensorCopySync(cpu_pre_ids, *place, &pre_ids);
paddle::framework::TensorCopySync(cpu_pre_scores, *place, &pre_scores);
ids.set_lod(cpu_ids.lod());
scores.set_lod(cpu_scores.lod());
pre_ids.set_lod(cpu_pre_ids.lod());
pre_scores.set_lod(cpu_pre_scores.lod());
}
paddle::framework::LoDTensor selected_ids;
paddle::framework::LoDTensor selected_scores;
paddle::framework::LoDTensor parent_idx;
size_t level = 0;
size_t beam_size = 2;
int end_id = 0;
paddle::operators::math::BeamSearchFunctor<
paddle::platform::CUDADeviceContext, float>
beamsearch;
beamsearch(*context, &pre_ids, &pre_scores, &ids, &scores, &selected_ids,
&selected_scores, &parent_idx, level, beam_size, end_id, true);
ASSERT_EQ(selected_ids.lod(), selected_scores.lod());
paddle::framework::LoDTensor cpu_selected_ids;
paddle::framework::LoDTensor cpu_selected_scores;
if (paddle::platform::is_cpu_place(*place)) {
cpu_selected_ids = selected_ids;
cpu_selected_scores = selected_scores;
} else {
paddle::framework::TensorCopySync(
selected_ids, paddle::platform::CPUPlace(), &cpu_selected_ids);
paddle::framework::TensorCopySync(
selected_scores, paddle::platform::CPUPlace(), &cpu_selected_scores);
cpu_selected_ids.set_lod(selected_ids.lod());
cpu_selected_scores.set_lod(selected_scores.lod());
}
std::vector<int64_t> expected_ids({4, 5, 3, 8});
std::vector<float> expected_scores({0.6f, 0.5f, 0.9f, 0.7f});
for (int i = 0; i < 4; i++) {
ASSERT_EQ(expected_ids[i], cpu_selected_ids.data<int64_t>()[i]);
ASSERT_EQ(expected_scores[i], cpu_selected_scores.data<float>()[i]);
}
delete place;
delete context;
}
#endif
TEST(BeamSearch, CPU) {
TestBeamSearch<paddle::platform::CPUDeviceContext,
paddle::platform::CPUPlace>();
......
......@@ -16,6 +16,8 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
/**
* case 1:
......@@ -441,6 +443,31 @@ void TestConcatMain() {
delete context;
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <>
void TestConcatMain<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace>() {
auto* context =
new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace());
context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CUDAPlace(), context->stream())
.get());
context->PartialInitWithAllocator();
ConcatCase1<paddle::platform::CUDADeviceContext, paddle::platform::CUDAPlace>(
context);
ConcatCase2<paddle::platform::CUDADeviceContext, paddle::platform::CUDAPlace>(
context);
ConcatCase3<paddle::platform::CUDADeviceContext, paddle::platform::CUDAPlace>(
context);
ConcatCase4<paddle::platform::CUDADeviceContext, paddle::platform::CUDAPlace>(
context);
delete context;
}
#endif
TEST(math, concat) {
TestConcatMain<paddle::platform::CPUDeviceContext,
paddle::platform::CPUPlace>();
......
......@@ -24,6 +24,11 @@ void TestNNZ(const std::vector<T>& dense_data, const int correct_nnz,
const int rows, const int cols) {
paddle::platform::CUDADeviceContext* context =
new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace());
context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CUDAPlace(), context->stream())
.get());
context->PartialInitWithAllocator();
auto sparse =
paddle::operators::math::GetSparse<paddle::platform::CUDADeviceContext,
T>(*context);
......@@ -61,6 +66,11 @@ void TestDenseToSparse(const std::vector<T>& correct_dense_data,
const std::string& mode) {
paddle::platform::CUDADeviceContext* context =
new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace());
context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CUDAPlace(), context->stream())
.get());
context->PartialInitWithAllocator();
// get sparse
auto sparse =
paddle::operators::math::GetSparse<paddle::platform::CUDADeviceContext,
......
......@@ -15,6 +15,8 @@ limitations under the License. */
#include "paddle/fluid/operators/math/im2col.h"
#include <gtest/gtest.h>
#include "paddle/fluid/operators/math/im2col_cfo_cpu.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
template <typename DeviceContext, typename Place>
void testIm2col() {
......@@ -60,6 +62,7 @@ void testIm2col() {
auto* place = new Place();
DeviceContext* context = new DeviceContext(*place);
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
......@@ -164,6 +167,165 @@ void testIm2col() {
delete context;
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <>
void testIm2col<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace>() {
paddle::framework::Tensor input_tmp;
paddle::framework::Tensor input;
paddle::framework::Tensor output_cfo;
paddle::framework::Tensor output_ocf;
paddle::framework::Tensor output_tmp;
/**
* input = [0, 1, 2,
* 3, 4, 5]
*
* output_cfo = [0, 1
* 1, 2
* 3, 4
* 4, 5]
*
* output_ocf = [0, 1, 3, 4
* 1, 2, 4, 5]
*
* col2im_cfo = [0, 2, 2
* 3, 4, 5]
*
* col2im_ocf = [0, 2, 2
* 3, 4, 5]
*/
int input_height = 2;
int input_width = 3;
int filter_size = 2;
std::vector<int> stride({1, 1}); // stride_y, stride_x
std::vector<int> padding(
{0, 0, 0, 0}); // up_pad, left_pad, down_pad, right_pad
std::vector<int> dilation({1, 1}); // dilation_y, dilation_x
int output_height =
(input_height - filter_size + padding[0] + padding[1]) / stride[0] + 1;
int output_width =
(input_width - filter_size + padding[2] + padding[3]) / stride[1] + 1;
float* input_ptr = input_tmp.mutable_data<float>(
{1, input_height, input_width}, paddle::platform::CPUPlace());
float arr[6] = {0, 1, 2, 3, 4, 5};
memcpy(input_ptr, arr, 6 * sizeof(float));
auto* place = new paddle::platform::CUDAPlace();
auto* context = new paddle::platform::CUDADeviceContext(*place);
context->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*place, context->stream())
.get());
context->PartialInitWithAllocator();
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
paddle::framework::TensorCopySync(input_tmp, *place, &input);
}
output_cfo.mutable_data<float>(
{1, filter_size, filter_size, output_height, output_width}, *place);
output_ocf.mutable_data<float>(
{output_height, output_width, 1, filter_size, filter_size}, *place);
// Im2Col
paddle::operators::math::Im2ColFunctor<
paddle::operators::math::ColFormat::kCFO,
paddle::platform::CUDADeviceContext, float>
im2col;
paddle::operators::math::Im2ColFunctor<
paddle::operators::math::ColFormat::kOCF,
paddle::platform::CUDADeviceContext, float>
im2col_ocf;
im2col(*context, input, dilation, stride, padding, &output_cfo);
im2col_ocf(*context, input, dilation, stride, padding, &output_ocf);
float out_cfo_data[] = {0, 1, 1, 2, 3, 4, 4, 5};
float out_ocf_data[] = {0, 1, 3, 4, 1, 2, 4, 5};
float* out_cfo_ptr;
if (paddle::platform::is_cpu_place(*place)) {
out_cfo_ptr = output_cfo.data<float>();
} else {
paddle::framework::TensorCopySync(output_cfo, paddle::platform::CPUPlace(),
&output_tmp);
out_cfo_ptr = output_tmp.data<float>();
}
for (int i = 0; i < 6; ++i) {
EXPECT_EQ(out_cfo_ptr[i], out_cfo_data[i]);
}
float* out_ocf_ptr;
if (paddle::platform::is_cpu_place(*place)) {
out_ocf_ptr = output_ocf.data<float>();
} else {
paddle::framework::TensorCopySync(output_ocf, paddle::platform::CPUPlace(),
&output_tmp);
out_ocf_ptr = output_tmp.data<float>();
}
for (int i = 0; i < 6; ++i) {
EXPECT_EQ(out_ocf_ptr[i], out_ocf_data[i]);
}
// Col2Im: kCFO
paddle::operators::math::Col2ImFunctor<
paddle::operators::math::ColFormat::kCFO,
paddle::platform::CUDADeviceContext, float>
col2im;
paddle::operators::math::Col2ImFunctor<
paddle::operators::math::ColFormat::kOCF,
paddle::platform::CUDADeviceContext, float>
col2im_ocf;
float col2im_data[] = {0, 2, 2, 3, 8, 5};
memset(input_ptr, 0, 6 * sizeof(float));
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
paddle::framework::TensorCopySync(input_tmp, *place, &input);
}
col2im(*context, output_cfo, dilation, stride, padding, &input);
float* in_ptr;
if (paddle::platform::is_cpu_place(*place)) {
in_ptr = input.data<float>();
} else {
paddle::framework::TensorCopySync(input, paddle::platform::CPUPlace(),
&input_tmp);
in_ptr = input_tmp.data<float>();
}
for (int i = 0; i < 6; ++i) {
EXPECT_EQ(in_ptr[i], col2im_data[i]);
}
// Col2Im: kOCF
memset(input_ptr, 0, 6 * sizeof(float));
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
paddle::framework::TensorCopySync(input_tmp, *place, &input);
}
col2im_ocf(*context, output_ocf, dilation, stride, padding, &input);
if (paddle::platform::is_cpu_place(*place)) {
in_ptr = input.data<float>();
} else {
paddle::framework::TensorCopySync(input, paddle::platform::CPUPlace(),
&input_tmp);
in_ptr = input_tmp.data<float>();
}
for (int i = 0; i < 6; ++i) {
EXPECT_EQ(in_ptr[i], col2im_data[i]);
}
delete place;
delete context;
}
#endif
TEST(math, im2col) {
testIm2col<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
......
......@@ -194,7 +194,7 @@ static void InclusiveScanInnerDim(const T *x, T *y, size_t outer_dim,
constexpr size_t kThreadNumY = 32;
size_t grid_dim = (outer_dim + kThreadNumY - 1) / kThreadNumY;
grid_dim = std::min<size_t>(grid_dim, dev_ctx.GetCUDAMaxGridDimSize().x);
grid_dim = std::min<size_t>(grid_dim, dev_ctx.GetCUDAMaxGridDimSize()[0]);
dim3 thread_dims(kThreadNumX, kThreadNumY);
if (reverse) {
InclusiveScanInnerDimCUDAKernel<
......
......@@ -21,6 +21,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/math_function_impl.h"
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#include "paddle/pten/kernels/funcs/eigen/common.h"
namespace paddle {
......@@ -44,6 +45,18 @@ template struct SetConstant<platform::CUDADeviceContext,
template struct SetConstant<platform::CUDADeviceContext,
platform::complex<double>>;
template struct SetConstant<pten::GPUContext, platform::float16>;
template struct SetConstant<pten::GPUContext, platform::bfloat16>;
template struct SetConstant<pten::GPUContext, float>;
template struct SetConstant<pten::GPUContext, double>;
template struct SetConstant<pten::GPUContext, uint8_t>;
template struct SetConstant<pten::GPUContext, int>;
template struct SetConstant<pten::GPUContext, int16_t>;
template struct SetConstant<pten::GPUContext, int64_t>;
template struct SetConstant<pten::GPUContext, bool>;
template struct SetConstant<pten::GPUContext, platform::complex<float>>;
template struct SetConstant<pten::GPUContext, platform::complex<double>>;
template struct SetConstant<platform::CUDAPinnedDeviceContext,
platform::float16>;
template struct SetConstant<platform::CUDAPinnedDeviceContext,
......
......@@ -223,6 +223,7 @@ TEST(math_funciton, set_constant) {
t.Resize({10, 10});
t.mutable_data<int>(paddle::platform::CPUPlace());
auto* ctx = new paddle::platform::CPUDeviceContext();
ctx->Init();
paddle::operators::math::set_constant(*ctx, &t, 10);
for (int64_t i = 0; i < t.numel(); ++i) {
PADDLE_ENFORCE_EQ(10, t.data<int>()[i],
......
......@@ -46,6 +46,10 @@ TEST(math_function, notrans_mul_trans_fp32) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
float arr[6] = {0, 1, 2, 3, 4, 5};
......@@ -78,6 +82,10 @@ TEST(math_function, notrans_mul_trans_fp16) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
// fp16 GEMM in cublas requires GPU compute capability >= 53
if (context.GetComputeCapability() < 53) {
......@@ -117,6 +125,10 @@ TEST(math_function, trans_mul_notrans_fp32) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
float arr[6] = {0, 1, 2, 3, 4, 5};
......@@ -155,6 +167,10 @@ TEST(math_function, trans_mul_notrans_fp16) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
// fp16 GEMM in cublas requires GPU compute capability >= 53
if (context.GetComputeCapability() < 53) {
......@@ -200,6 +216,10 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
int m = 2;
int n = 3;
......@@ -254,6 +274,10 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
// fp16 GEMM in cublas requires GPU compute capability >= 53
if (context.GetComputeCapability() < 53) {
......@@ -316,6 +340,10 @@ TEST(math_function, gemm_trans_cublas_fp32) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
int m = 2;
int n = 3;
......@@ -364,6 +392,10 @@ TEST(math_function, gemm_trans_cublas_fp16) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
// fp16 GEMM in cublas requires GPU compute capability >= 53
if (context.GetComputeCapability() < 53) {
......@@ -418,6 +450,10 @@ void GemvTest(int m, int n, bool trans) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
T* data_a = mat_a.mutable_data<T>({m, n}, cpu_place);
T* data_b = vec_b.mutable_data<T>({trans ? m : n}, cpu_place);
......
......@@ -15,6 +15,8 @@ limitations under the License. */
#include "paddle/fluid/operators/math/vol2col.h"
#include <gtest/gtest.h>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
template <typename DeviceContext, typename Place>
void testVol2col() {
......@@ -25,7 +27,6 @@ void testVol2col() {
auto* place = new Place();
DeviceContext* context = new DeviceContext(*place);
/**
* input = [[0, 1, 2,
* 3, 4, 5]
......@@ -123,6 +124,124 @@ void testVol2col() {
delete context;
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <>
void testVol2col<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace>() {
paddle::framework::Tensor input;
paddle::framework::Tensor input_tmp;
paddle::framework::Tensor output;
paddle::framework::Tensor output_tmp;
auto* place = new paddle::platform::CUDAPlace();
auto* context = new paddle::platform::CUDADeviceContext(*place);
context->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*place, context->stream())
.get());
context->PartialInitWithAllocator();
/**
* input = [[0, 1, 2,
* 3, 4, 5]
* [6, 7, 8,
* 9, 10, 11]]
*
* output = [0, 1
* 1, 2
* 3, 4
* 4, 5
* 6, 7
* 7, 8
* 9, 10
* 10, 11]
*
* col2vol = [[0, 2, 2,
* 3, 8, 5]
* [6, 14, 8,
* 9, 20, 11]]
*
*/
int input_depth = 2;
int input_height = 2;
int input_width = 3;
int filter_size = 2;
std::vector<int> strides({1, 1, 1});
std::vector<int> paddings({0, 0, 0});
std::vector<int> dilations({1, 1, 1});
int output_depth =
(input_depth - filter_size + 2 * paddings[0]) / strides[0] + 1;
int output_height =
(input_height - filter_size + 2 * paddings[1]) / strides[1] + 1;
int output_width =
(input_width - filter_size + 2 * paddings[2]) / strides[2] + 1;
// Vol2Col test
float* input_ptr =
input_tmp.mutable_data<float>({1, input_depth, input_height, input_width},
paddle::platform::CPUPlace());
float arr[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
memcpy(input_ptr, arr, 12 * sizeof(float));
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
paddle::framework::TensorCopySync(input_tmp, *place, &input);
}
output.mutable_data<float>({1, filter_size, filter_size, filter_size,
output_depth, output_height, output_width},
*place);
paddle::operators::math::Vol2ColFunctor<paddle::platform::CUDADeviceContext,
float>
vol2col;
vol2col(*context, input, dilations, strides, paddings, &output);
float vol_2_col[] = {0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11};
float* out_cfo_ptr;
if (paddle::platform::is_cpu_place(*place)) {
out_cfo_ptr = output.data<float>();
} else {
paddle::framework::TensorCopySync(output, paddle::platform::CPUPlace(),
&output_tmp);
out_cfo_ptr = output_tmp.data<float>();
}
for (int i = 0; i < 16; ++i) {
EXPECT_EQ(out_cfo_ptr[i], vol_2_col[i]);
}
// Col2Vol test
float col_2_vol[] = {0, 2, 2, 3, 8, 5, 6, 14, 8, 9, 20, 11};
memset(input_ptr, 0, 12 * sizeof(float));
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
paddle::framework::TensorCopySync(input_tmp, *place, &input);
}
paddle::operators::math::Col2VolFunctor<paddle::platform::CUDADeviceContext,
float>
col2vol;
col2vol(*context, output, dilations, strides, paddings, &input);
float* in_ptr;
if (paddle::platform::is_cpu_place(*place)) {
in_ptr = input.data<float>();
} else {
paddle::framework::TensorCopySync(input, paddle::platform::CPUPlace(),
&input_tmp);
in_ptr = input_tmp.data<float>();
}
for (int i = 0; i < 12; ++i) {
EXPECT_EQ(in_ptr[i], col_2_vol[i]);
}
delete place;
delete context;
}
#endif
TEST(math, vol2col) {
testVol2col<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
......
......@@ -66,7 +66,8 @@ class MeanCUDAKernel : public framework::OpKernel<T> {
reduce_dims.push_back(i);
}
TensorReduceFunctorImpl<T, T, kernel_primitives::AddFunctor, Div>(
*input, output, Div(numel), reduce_dims, stream);
context.cuda_device_context(), *input, output, Div(numel), reduce_dims,
stream);
}
};
......
......@@ -57,7 +57,12 @@ class NCCLTester : public ::testing::Test {
paddle::platform::CPUPlace cpu_place;
for (size_t i = 0; i < gpu_list_.size(); ++i) {
p::CUDAPlace place(i);
dev_ctxs_.emplace_back(new p::CUDADeviceContext(place));
auto *ctx = new p::CUDADeviceContext(place);
ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx->stream())
.get());
ctx->PartialInitWithAllocator();
dev_ctxs_.emplace_back(ctx);
}
NCCLInitOp();
......
......@@ -106,16 +106,20 @@ class PnormCUDAKernel : public framework::OpKernel<T> {
using MT = typename details::MPTypeTrait<T>::Type;
if (porder == 0) {
TensorReduceFunctorImpl<T, T, kps::AddFunctor, NonzeroFunctor<T>>(
*in_x, out_norm, NonzeroFunctor<T>(), reduce_axis, stream);
ctx.cuda_device_context(), *in_x, out_norm, NonzeroFunctor<T>(),
reduce_axis, stream);
} else if (porder == INFINITY) {
TensorReduceFunctorImpl<T, T, kps::MaxFunctor, AbsFunctor<T>>(
*in_x, out_norm, AbsFunctor<T>(), reduce_axis, stream);
ctx.cuda_device_context(), *in_x, out_norm, AbsFunctor<T>(),
reduce_axis, stream);
} else if (porder == -INFINITY) {
TensorReduceFunctorImpl<T, T, kps::MinFunctor, AbsFunctor<T>>(
*in_x, out_norm, AbsFunctor<T>(), reduce_axis, stream);
ctx.cuda_device_context(), *in_x, out_norm, AbsFunctor<T>(),
reduce_axis, stream);
} else {
TensorReduceFunctorImpl<T, T, kps::AddFunctor, UnsignedPowFunctor<T>>(
*in_x, out_norm, UnsignedPowFunctor<T>(porder), reduce_axis, stream);
ctx.cuda_device_context(), *in_x, out_norm,
UnsignedPowFunctor<T>(porder), reduce_axis, stream);
const framework::Tensor* tmp_norm = out_norm;
std::vector<const framework::Tensor*> ins = {tmp_norm};
......
......@@ -208,8 +208,8 @@ class PoolKernel : public framework::OpKernel<T> {
auto stream = dev_ctx.stream();
TensorReduceFunctorImpl<T, T, kps::AddFunctor,
kps::DivideFunctor<T>>(
*in_x, out, kps::DivideFunctor<T>(reduce_num), reduce_dim,
stream);
dev_ctx, *in_x, out, kps::DivideFunctor<T>(reduce_num),
reduce_dim, stream);
#else // for cpu
paddle::operators::math::Pool2dFunctor<
DeviceContext, paddle::operators::math::AvgPool<T>, T>
......
......@@ -186,7 +186,8 @@ class CUDAPReluGradKernel : public framework::OpKernel<T> {
}
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
dalpha_tmp, dalpha, kps::IdentityFunctor<T>(), reduce_dims, stream);
context.cuda_device_context(), dalpha_tmp, dalpha,
kps::IdentityFunctor<T>(), reduce_dims, stream);
}
};
......
......@@ -222,6 +222,10 @@ TEST(SENDANDRECV, GPU) {
framework::Scope* scope = (*micro_scope)[0];
platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
framework::Executor exe(place);
// create var on local scope
......
......@@ -39,14 +39,16 @@ namespace operators {
template <typename Tx, typename Ty, template <typename> class ReduceOp,
typename TransformOp>
void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y,
void TensorReduceFunctorImpl(const platform::CUDADeviceContext& dev_ctx,
const framework::Tensor& x, framework::Tensor* y,
const TransformOp& transform,
const std::vector<int>& origin_reduce_dims,
gpuStream_t stream) {
y->mutable_data<Ty>(x.place());
pten::kernels::TensorReduceFunctorImpl<Tx, Ty, ReduceOp, TransformOp>(
x, y, transform, origin_reduce_dims, stream);
static_cast<const pten::GPUContext&>(dev_ctx), x, y, transform,
origin_reduce_dims, stream);
}
} // namespace operators
......
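The wrapper above gains a leading device-context parameter and forwards it to the pten implementation; every CUDA call site in this commit is updated to match. A representative call with the new signature (the tensors, functor and reduce dims stand for whatever the surrounding kernel already has in scope):

// Illustrative call site: the CUDA device context now comes first.
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
    dev_ctx, *src, dst, kps::IdentityFunctor<T>(), reduce_dims,
    dev_ctx.stream());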
......@@ -156,7 +156,8 @@ class CUDARenormKernel : public framework::OpKernel<T> {
cuda_ctx, ins, &outs, func);
std::vector<int> reduce_axis = {0, 2};
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
pow_value, &dim_value, kps::IdentityFunctor<T>(), reduce_axis, stream);
cuda_ctx, pow_value, &dim_value, kps::IdentityFunctor<T>(), reduce_axis,
stream);
RenormKernelFunc3<T><<<grid2, block2, 0, stream>>>(
numel, dim_value.mutable_data<T>(context.GetPlace()), p, max_norm);
RenormKernelFunc4<T><<<grid, block, 0, stream>>>(
......@@ -213,10 +214,11 @@ class CUDAGradRenormKernel : public framework::OpKernel<T> {
dim_divisor);
std::vector<int> reduce_axis = {0, 2};
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
pow_value, &dim_value, kps::IdentityFunctor<T>(), reduce_axis, stream);
ctx.cuda_device_context(), pow_value, &dim_value,
kps::IdentityFunctor<T>(), reduce_axis, stream);
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
mul_value, &weight_derivative, kps::IdentityFunctor<T>(), reduce_axis,
stream);
ctx.cuda_device_context(), mul_value, &weight_derivative,
kps::IdentityFunctor<T>(), reduce_axis, stream);
RenormGradKernelFunc2<T><<<grid, block, 0, stream>>>(
x_data, dout_data, dx_data, numel,
dim_value.mutable_data<T>(ctx.GetPlace()),
......
......@@ -389,7 +389,8 @@ class ReshapeKernel {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(ctx.GetPlace())) {
auto &dev_ctx = ctx.device_context<platform::CUDADeviceContext>();
pten::ReshapeKernel(dev_ctx, *in, pt_scalar_shape, out);
pten::ReshapeKernel(static_cast<const pten::GPUContext &>(dev_ctx), *in,
pt_scalar_shape, out);
}
#endif
#ifdef PADDLE_WITH_XPU
......@@ -417,7 +418,8 @@ class ReshapeGradKernel {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(ctx.GetPlace())) {
auto &dev_ctx = ctx.device_context<platform::CUDADeviceContext>();
pten::ReshapeGradKernel(dev_ctx, *d_out, d_x);
pten::ReshapeGradKernel(static_cast<const pten::GPUContext &>(dev_ctx),
*d_out, d_x);
}
#endif
#ifdef PADDLE_WITH_XPU
......@@ -445,7 +447,8 @@ class ReshapeDoubleGradKernel {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(ctx.GetPlace())) {
auto &dev_ctx = ctx.device_context<platform::CUDADeviceContext>();
pten::ReshapeDoubleGradKernel(dev_ctx, *dd_x, dd_out);
pten::ReshapeDoubleGradKernel(
static_cast<const pten::GPUContext &>(dev_ctx), *dd_x, dd_out);
}
#endif
#ifdef PADDLE_WITH_XPU
......
......@@ -183,8 +183,7 @@ void GPUScatterGradForX(const platform::DeviceContext& ctx, const Tensor& index,
int64_t max_grid_dimx =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx)
.GetCUDAMaxGridDimSize()
.x;
.GetCUDAMaxGridDimSize()[0];
int64_t grid = height < max_grid_dimx ? height : max_grid_dimx;
ScatterInitCUDAKernel<T, IndexT><<<
......
......@@ -46,7 +46,8 @@ void ReduceSumForSolve(const Tensor* input, Tensor* output,
#if defined(__NVCC__) || defined(__HIPCC__)
auto stream = ctx.cuda_device_context().stream();
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
*input, output, kps::IdentityFunctor<T>(), reduce_dims, stream);
ctx.cuda_device_context(), *input, output, kps::IdentityFunctor<T>(),
reduce_dims, stream);
#else
ReduceKernelFunctor<DeviceContext, T, ops::SumFunctor>(
input, output, reduce_dims, keep_dim, false, ctx)
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/fluid/operators/strided_memcpy.h"
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
namespace paddle {
namespace operators {
......@@ -86,6 +87,10 @@ TEST(StridedMemcpy, GPUCrop) {
platform::CPUPlace cpu;
platform::CUDADeviceContext ctx(gpu0);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu0, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
auto src_allocation = memory::Alloc(gpu0, sizeof(src));
......@@ -124,6 +129,10 @@ TEST(StridedMemcpy, GPUConcat) {
platform::CUDAPlace gpu0(0);
platform::CPUPlace cpu;
platform::CUDADeviceContext ctx(gpu0);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu0, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
auto gpu_src_allocation = memory::Alloc(gpu0, sizeof(src));
int* gpu_src = reinterpret_cast<int*>(gpu_src_allocation->ptr());
memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
......
......@@ -37,6 +37,10 @@ void CreateCUDATensor(framework::Scope* scope, const std::string& name,
tensor->Resize(dims);
platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
inference::tensorrt::RandomizeTensor(tensor, place, ctx);
}
......@@ -133,6 +137,10 @@ void DynamicShapeTest(bool allow_build_at_runtime) {
framework::Scope scope;
platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
// Prepare variables.
if (allow_build_at_runtime)
CreateCUDATensor(&scope, "x", std::vector<int64_t>({3, 4, 1, 1}));
......@@ -159,6 +167,10 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
framework::Scope scope;
platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
auto* block_ = program.Proto()->add_blocks();
block_->set_idx(0);
......
......@@ -411,7 +411,7 @@ bool SortTopk(const platform::CUDADeviceContext& ctx,
};
int block_size = ComputeBlockSize(num_cols);
unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x;
unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0];
// actually, int num_rows < max_grid_size
unsigned int grid_size = num_rows < maxGridDimX
? static_cast<unsigned int>(num_rows)
......
......@@ -40,7 +40,8 @@ class TraceCUDAKernel : public framework::OpKernel<T> {
std::vector<int> reduce_dims;
reduce_dims.push_back(out->dims().size());
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
diag, out, kps::IdentityFunctor<T>(), reduce_dims, stream);
context.cuda_device_context(), diag, out, kps::IdentityFunctor<T>(),
reduce_dims, stream);
} else {
math::SetConstant<DeviceContext, T> functor;
functor(context.device_context<DeviceContext>(), out, static_cast<T>(0));
......
......@@ -45,7 +45,8 @@ class MatrixReduceSumFunctor<platform::CUDADeviceContext, T> {
}
gpuStream_t stream = ctx.cuda_device_context().stream();
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
in, out, kps::IdentityFunctor<T>(), out_reduce_dims, stream);
ctx.cuda_device_context(), in, out, kps::IdentityFunctor<T>(),
out_reduce_dims, stream);
}
};
......
......@@ -148,7 +148,7 @@ struct Argmax<platform::CUDADeviceContext, T, IndType> {
}
const auto& dev_ctx = ctx.cuda_device_context();
auto cu_stream = dev_ctx.stream();
int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x;
int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
int64_t height = pre * post;
int64_t width = n;
int64_t grid_size = height < max_grid_dimx ? height : max_grid_dimx;
......
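Both GPU-kernel hunks above follow from the same interface change: GetCUDAMaxGridDimSize() now returns std::array<int, 3> instead of dim3, so callers index [0] rather than reading .x. A small sketch of a caller after the change; the helper below is illustrative and not part of the patch:

#include <algorithm>
#include <array>
#include <cstdint>

// Clamp a requested 1-D grid size to the device limit along x, using the new
// array-based return type (works with any context exposing the method).
template <typename Context>
int64_t ClampGridDimX(const Context& dev_ctx, int64_t requested) {
  const std::array<int, 3> max_grid = dev_ctx.GetCUDAMaxGridDimSize();
  return std::min<int64_t>(requested, static_cast<int64_t>(max_grid[0]));
}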
......@@ -138,6 +138,7 @@ if(WITH_CNCL)
endif()
if(WITH_GPU OR WITH_ROCM)
target_link_libraries(device_context gpu_info gpu_context pten_gpu_info)
target_link_libraries(device_context gpu_resource_pool)
endif()
......
......@@ -66,6 +66,10 @@ TEST(bfloat16, lod_tensor_on_gpu) {
// CPU LoDTensor to GPU LoDTensor
CUDAPlace gpu_place(0);
CUDADeviceContext gpu_ctx(gpu_place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, gpu_ctx.stream())
.get());
gpu_ctx.PartialInitWithAllocator();
framework::TensorCopy(src_tensor, gpu_place, gpu_ctx, &gpu_tensor);
// GPU LoDTensor to CPU LoDTensor
......
......@@ -15,6 +15,7 @@
#include "paddle/fluid/platform/collective_helper.h"
#include <utility>
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h"
......@@ -187,6 +188,18 @@ NCCLComm* NCCLCommContext::AssignNCCLComm(ncclComm_t comm, int nranks, int rank,
int dev_id, int ring_id) {
std::unique_ptr<CUDADeviceContext> dev_ctx(
new CUDADeviceContext(CUDAPlace(dev_id)));
dev_ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(CUDAPlace(dev_id), dev_ctx->stream())
.get());
dev_ctx->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(CUDAPlace(dev_id))
.get());
dev_ctx->PartialInitWithAllocator();
std::shared_ptr<platform::CudaEventObject> compute_event(
platform::CudaEventResourcePool::Instance().New(dev_id));
......@@ -329,7 +342,7 @@ BKCLComm* BKCLCommContext::AssignBKCLComm(BKCLContext_t comm, int nranks,
auto* dev_ctx = static_cast<platform::XPUDeviceContext*>(
platform::DeviceContextPool::Instance().Get(
platform::XPUPlace(dev_id)));
dev_ctx->set_bkcl_context(comm);
dev_ctx->SetBkclContext(comm);
}
return comm_map_[ring_id][dev_id].get();
......
IF(WITH_GPU)
add_subdirectory(cuda)
nv_library(gpu_info SRCS gpu_info.cc DEPS cuda_info gflags glog enforce monitor dynload_cuda)
nv_library(gpu_info SRCS gpu_info.cc DEPS pten_gpu_info gflags glog enforce monitor dynload_cuda)
nv_test(cuda_helper_test SRCS cuda_helper_test.cu)
nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda)
ELSEIF(WITH_ROCM)
add_subdirectory(rocm)
hip_library(gpu_info SRCS gpu_info.cc DEPS rocm_info gflags glog enforce monitor dynload_cuda)
hip_library(gpu_info SRCS gpu_info.cc DEPS pten_gpu_info gflags glog enforce monitor dynload_cuda)
hip_test(cuda_helper_test SRCS cuda_helper_test.cu)
hip_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda)
......
nv_library(cuda_info SRCS cuda_info.cc DEPS gflags glog enforce monitor dynload_cuda)
nv_library(cuda_graph SRCS cuda_graph.cc DEPS enforce allocator_facade)
nv_library(cuda_profiler SRCS cuda_profiler.cc DEPS enforce)
......
......@@ -14,8 +14,10 @@
#pragma once
#include <functional>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/device/gpu/gpu_types.h"
#include "paddle/fluid/platform/dynload/cublas.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h"
......@@ -96,8 +98,7 @@ class CublasHandleHolder {
PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasDestroy(handle_));
}
template <typename Callback>
inline void Call(Callback&& callback) const {
inline void Call(const std::function<void(blasHandle_t)>& callback) const {
std::lock_guard<std::mutex> guard(mtx_);
callback(handle_);
}
......
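CublasHandleHolder::Call (and the CublasCall wrappers further below) change from a forwarding template to a const std::function<void(blasHandle_t)>& parameter. Existing call sites that pass lambdas keep compiling, since lambdas convert to std::function. A hedged sketch of such a caller on a CUDA build; it assumes dynload::cublasSscal, which Paddle's BLAS wrappers already use, and is not part of the patch:

#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/dynload/cublas.h"
#include "paddle/fluid/platform/enforce.h"

// Illustrative caller: scale a device array in place under the handle lock.
void ScaleInPlace(const paddle::platform::CUDADeviceContext& dev_ctx,
                  float alpha, float* x, int n) {
  dev_ctx.CublasCall([&](cublasHandle_t handle) {
    // The lambda converts to std::function<void(blasHandle_t)>.
    PADDLE_ENFORCE_GPU_SUCCESS(
        paddle::platform::dynload::cublasSscal(handle, n, &alpha, x, 1));
  });
}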
......@@ -14,11 +14,13 @@ limitations under the License. */
#pragma once
#include <functional>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/cusparse.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/pten/backends/gpu/gpu_decls.h"
namespace paddle {
namespace platform {
......@@ -45,8 +47,8 @@ class CusparseHandleHolder {
#endif
}
template <typename Callback>
inline void Call(Callback&& callback) const {
inline void Call(
const std::function<void(pten::sparseHandle_t)>& callback) const {
std::lock_guard<std::mutex> guard(mtx_);
callback(handle_);
}
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include <array>
#include <cstdlib>
#include <mutex>
#include <set>
......@@ -39,11 +40,12 @@ limitations under the License. */
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/split.h"
#include "paddle/pten/backends/gpu/gpu_info.h"
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb);
DECLARE_uint64(reallocate_gpu_memory_in_mb);
DECLARE_bool(enable_cublas_tensor_op_math);
DECLARE_string(selected_gpus);
DECLARE_uint64(gpu_memory_limit_mb);
constexpr static float fraction_reserve_gpu_memory = 0.05f;
......@@ -51,23 +53,6 @@ constexpr static float fraction_reserve_gpu_memory = 0.05f;
USE_GPU_MEM_STAT;
namespace paddle {
namespace platform {
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetSelectedDevices() {
// use user specified GPUs in single-node multi-process mode.
std::vector<int> devices;
if (!FLAGS_selected_gpus.empty()) {
auto devices_str = paddle::string::Split(FLAGS_selected_gpus, ',');
for (auto id : devices_str) {
devices.push_back(atoi(id.c_str()));
}
} else {
int count = GetGPUDeviceCount();
for (int i = 0; i < count; ++i) {
devices.push_back(i);
}
}
return devices;
}
void GpuMemoryUsage(size_t *available, size_t *total) {
size_t actual_available, actual_total;
......@@ -382,5 +367,91 @@ void *GetGpuBasePtr(void *ptr, int dev_id) {
return RecordedGpuMallocHelper::Instance(dev_id)->GetBasePtr(ptr);
}
int DnnVersion() { return pten::backends::gpu::DnnVersion(); }
int GetGPUDeviceCount() { return pten::backends::gpu::GetGPUDeviceCount(); }
int GetGPUComputeCapability(int id) {
return pten::backends::gpu::GetGPUComputeCapability(id);
}
int GetGPURuntimeVersion(int id) {
return pten::backends::gpu::GetGPURuntimeVersion(id);
}
int GetGPUDriverVersion(int id) {
return pten::backends::gpu::GetGPUDriverVersion(id);
}
bool TensorCoreAvailable() {
return pten::backends::gpu::TensorCoreAvailable();
}
int GetGPUMultiProcessors(int id) {
return pten::backends::gpu::GetGPUMultiProcessors(id);
}
int GetGPUMaxThreadsPerMultiProcessor(int id) {
return pten::backends::gpu::GetGPUMaxThreadsPerMultiProcessor(id);
}
int GetGPUMaxThreadsPerBlock(int id) {
return pten::backends::gpu::GetGPUMaxThreadsPerBlock(id);
}
int GetCurrentDeviceId() { return pten::backends::gpu::GetCurrentDeviceId(); }
std::array<int, 3> GetGpuMaxGridDimSize(int id) {
return pten::backends::gpu::GetGpuMaxGridDimSize(id);
}
std::vector<int> GetSelectedDevices() {
return pten::backends::gpu::GetSelectedDevices();
}
const gpuDeviceProp &GetDeviceProperties(int id) {
return pten::backends::gpu::GetDeviceProperties(id);
}
void SetDeviceId(int device_id) { pten::backends::gpu::SetDeviceId(device_id); }
gpuError_t GpuGetLastError() { return pten::backends::gpu::GpuGetLastError(); }
void GpuStreamSync(gpuStream_t stream) {
pten::backends::gpu::GpuStreamSync(stream);
}
void GpuDestroyStream(gpuStream_t stream) {
pten::backends::gpu::GpuDestroyStream(stream);
}
void GpuDeviceSync() { pten::backends::gpu::GpuDeviceSync(); }
void GpuMemcpyAsync(void *dst, const void *src, size_t count,
gpuMemcpyKind kind, gpuStream_t stream) {
pten::backends::gpu::GpuMemcpyAsync(dst, src, count, kind, stream);
}
void GpuMemcpySync(void *dst, const void *src, size_t count,
gpuMemcpyKind kind) {
pten::backends::gpu::GpuMemcpySync(dst, src, count, kind);
}
void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
int src_device, size_t count, gpuStream_t stream) {
pten::backends::gpu::GpuMemcpyPeerAsync(dst, dst_device, src, src_device,
count, stream);
}
void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
int src_device, size_t count) {
pten::backends::gpu::GpuMemcpyPeerSync(dst, dst_device, src, src_device,
count);
}
void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) {
pten::backends::gpu::GpuMemsetAsync(dst, value, count, stream);
}
} // namespace platform
} // namespace paddle
......@@ -14,6 +14,7 @@ limitations under the License. */
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include <stddef.h>
#include <array>
#include <string>
#include <vector>
......@@ -52,7 +53,7 @@ int GetGPUMaxThreadsPerBlock(int id);
int GetCurrentDeviceId();
//! Get the maximum GridDim size for GPU buddy allocator.
dim3 GetGpuMaxGridDimSize(int);
std::array<int, 3> GetGpuMaxGridDimSize(int);
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetSelectedDevices();
......@@ -110,7 +111,7 @@ void GpuStreamSync(gpuStream_t stream);
void GpuDestroyStream(gpuStream_t stream);
// ! Blocks until device has completed all operations.
void GpuDeviceync();
void GpuDeviceSync();
//! CudaMalloc with recorded info
gpuError_t RecordedGpuMalloc(void **ptr, size_t size, int dev_id);
......
......@@ -83,8 +83,21 @@ struct NCCLContext {
std::unique_ptr<CUDADeviceContext> ctx_;
ncclComm_t comm_;
explicit NCCLContext(int dev_id)
: ctx_(new CUDADeviceContext(CUDAPlace(dev_id))), comm_{nullptr} {}
explicit NCCLContext(int dev_id) : comm_{nullptr} {
ctx_.reset(new CUDADeviceContext(CUDAPlace(dev_id)));
ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(CUDAPlace(dev_id), ctx_->stream())
.get());
ctx_->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
ctx_->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(CUDAPlace(dev_id))
.get());
ctx_->PartialInitWithAllocator();
}
gpuStream_t stream() const { return ctx_->stream(); }
ncclComm_t comm() const { return comm_; }
......
hip_library(rocm_info SRCS rocm_info.cc DEPS gflags glog enforce monitor dynload_cuda)
hip_test(miopen_helper_test SRCS miopen_helper_test.cc DEPS dynload_cuda)
......@@ -10,8 +10,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/device_context.h"
#include <functional>
#include <memory>
#include <set>
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/stream/cuda_stream.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
......@@ -149,16 +153,17 @@ inline void EmplaceDeviceContext(
cuda_ctx,
platform::errors::InvalidArgument(
"Failed to dynamic_cast dev_ctx into CUDADeviceContext."));
dev_ctx->SetDeviceAllocator(
memory::allocation::AllocatorFacade::Instance()
.GetAllocator(p, cuda_ctx->context()->RawStream())
.get());
// Note: A trick to initialize the context; why does the GetAllocator
// interface need a stream parameter?
dev_ctx->SetAllocator(memory::allocation::AllocatorFacade::Instance()
.GetAllocator(p, cuda_ctx->stream())
.get());
cuda_ctx->PartialInitWithAllocator();
#endif
} else {
dev_ctx->SetDeviceAllocator(
memory::allocation::AllocatorFacade::Instance()
.GetAllocator(p)
.get());
dev_ctx->SetAllocator(memory::allocation::AllocatorFacade::Instance()
.GetAllocator(p)
.get());
}
dev_ctx->SetHostAllocator(
memory::allocation::AllocatorFacade::Instance()
......@@ -251,14 +256,18 @@ DeviceContextPool::DeviceContextPool(
}
}
CPUDeviceContext::CPUDeviceContext() : pten::CPUContext() {}
CPUDeviceContext::CPUDeviceContext() : pten::CPUContext() {
pten::CPUContext::Init();
}
CPUDeviceContext::CPUDeviceContext(CPUPlace place) : pten::CPUContext() {}
CPUDeviceContext::CPUDeviceContext(CPUPlace place) : pten::CPUContext(place) {
pten::CPUContext::Init();
}
#ifdef PADDLE_WITH_IPU
IPUDeviceContext::IPUDeviceContext(IPUPlace place) : place_(place) {}
Place IPUDeviceContext::GetPlace() const { return place_; }
const Place& IPUDeviceContext::GetPlace() const { return place_; }
void IPUDeviceContext::Wait() const {
/*! \brief Wait for all operations completion in the stream. */
......@@ -268,11 +277,14 @@ IPUDeviceContext::~IPUDeviceContext() {}
#endif
#ifdef PADDLE_WITH_XPU
XPUDeviceContext::XPUDeviceContext() : pten::XPUContext() {}
XPUDeviceContext::XPUDeviceContext() : pten::XPUContext() {
pten::XPUContext::Init();
}
XPUDeviceContext::~XPUDeviceContext() {}
XPUDeviceContext::XPUDeviceContext(XPUPlace place) : pten::XPUContext(place) {
pten::XPUContext::Init();
LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: "
<< static_cast<int>(place.device);
}
......@@ -302,7 +314,7 @@ void NPUDeviceContext::Wait() const {
aclrtStream NPUDeviceContext::stream() const { return stream_->raw_stream(); }
Place NPUDeviceContext::GetPlace() const { return place_; }
const Place& NPUDeviceContext::GetPlace() const { return place_; }
aclrtContext NPUDeviceContext::context() const { return context_; }
......@@ -319,7 +331,7 @@ Eigen::DefaultDevice* NPUPinnedDeviceContext::eigen_device() const {
return eigen_device_.get();
}
Place NPUPinnedDeviceContext::GetPlace() const { return place_; }
const Place& NPUPinnedDeviceContext::GetPlace() const { return place_; }
#endif
......@@ -470,102 +482,28 @@ CUDAContext::~CUDAContext() {
#endif
}
CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
CUDADeviceGuard guard(place_.device);
compute_capability_ = GetGPUComputeCapability(place_.device);
multi_process_ = GetGPUMultiProcessors(place_.device);
max_threads_per_mp_ = GetGPUMaxThreadsPerMultiProcessor(place_.device);
max_grid_dim_size_ = GetGpuMaxGridDimSize(place_.device);
max_threads_per_block_ = GetGPUMaxThreadsPerBlock(place_.device);
driver_version_ = GetGPUDriverVersion(place_.device);
runtime_version_ = GetGPURuntimeVersion(place_.device);
LOG_FIRST_N(WARNING, 1) << "Please NOTE: device: "
<< static_cast<int>(place_.device)
<< ", GPU Compute Capability: "
<< compute_capability_ / 10 << "."
<< compute_capability_ % 10
<< ", Driver API Version: " << driver_version_ / 1000
<< "." << (driver_version_ % 100) / 10
<< ", Runtime API Version: "
<< runtime_version_ / 1000 << "."
<< (runtime_version_ % 100) / 10;
#ifdef PADDLE_WITH_HIP
size_t version_major, version_minor, version_patch;
PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenGetVersion(
&version_major, &version_minor, &version_patch));
LOG_FIRST_N(WARNING, 1) << "device: " << static_cast<int>(place_.device)
<< ", MIOpen Version: " << version_major << "."
<< version_minor << "." << version_patch;
#else
size_t cudnn_dso_ver = dynload::cudnnGetVersion();
LOG_FIRST_N(WARNING, 1) << "device: " << static_cast<int>(place_.device)
<< ", cuDNN Version: " << cudnn_dso_ver / 1000 << "."
<< (cudnn_dso_ver % 1000) / 100 << ".";
#endif
{
// Check CUDA/CUDNN version compatibility
auto local_cuda_version =
(driver_version_ / 1000) * 10 + (driver_version_ % 100) / 10;
#ifdef PADDLE_WITH_HIP
auto compile_cuda_version = (HIP_VERSION / 100) * 10 + (HIP_VERSION % 10);
#else
auto compile_cuda_version =
(CUDA_VERSION / 1000) * 10 + (CUDA_VERSION % 100) / 10;
#endif
if (local_cuda_version < compile_cuda_version) {
LOG_FIRST_N(WARNING, 1)
<< "WARNING: device: " << static_cast<int>(place_.device)
<< ". The installed Paddle is compiled with CUDA "
<< compile_cuda_version / 10 << "." << compile_cuda_version % 10
<< ", but CUDA runtime version in your machine is "
<< local_cuda_version / 10 << "." << local_cuda_version % 10
<< ", which may cause serious incompatible bug. "
<< "Please recompile or reinstall Paddle with compatible CUDA "
"version.";
}
}
default_ctx_.reset(new CUDAContext(place_));
}
CUDADeviceContext::~CUDADeviceContext() {
SetDeviceId(place_.device);
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
if (nccl_comm_) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclCommDestroy(nccl_comm_));
}
#endif
}
Place CUDADeviceContext::GetPlace() const { return place_; }
void CUDADeviceContext::Wait() const { context()->Stream()->Wait(); }
int CUDADeviceContext::GetComputeCapability() const {
return compute_capability_;
}
int CUDADeviceContext::GetMaxPhysicalThreadCount() const {
return multi_process_ * max_threads_per_mp_;
CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
: pten::GPUContext(place) {
pten::GPUContext::PartialInitWithoutAllocator();
cuda_stream_.reset(
new stream::CUDAStream(pten::GPUContext::stream(), this->GetPlace()));
}
int CUDADeviceContext::GetSMCount() const { return multi_process_; }
int CUDADeviceContext::GetMaxThreadsPerBlock() const {
return max_threads_per_block_;
}
CUDADeviceContext::~CUDADeviceContext() = default;
Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
return context()->EigenDevice().get();
}
bool CUDADeviceContext::tensor_core_available() const {
return context()->CublasTensorCoreHandle() != nullptr;
if (thread_ctx_.count(this)) {
return context()->EigenDevice().get();
}
return pten::GPUContext::eigen_device();
}
dim3 CUDADeviceContext::GetCUDAMaxGridDimSize() const {
return max_grid_dim_size_;
void CUDADeviceContext::Wait() const {
if (thread_ctx_.count(this)) {
context()->Stream()->Wait();
return;
}
pten::GPUContext::Wait();
}
#ifdef PADDLE_WITH_HIP
......@@ -573,33 +511,96 @@ miopenHandle_t CUDADeviceContext::cudnn_handle() const {
#else
cudnnHandle_t CUDADeviceContext::cudnn_handle() const {
#endif
return context()->CudnnHandle();
if (thread_ctx_.count(this)) {
return context()->CudnnHandle();
}
return pten::GPUContext::cudnn_handle();
}
#ifdef PADDLE_WITH_HIP
rocblas_handle CUDADeviceContext::cublas_handle() const {
return context()->CublasHandle()->GetCublasHandle();
if (thread_ctx_.count(this)) {
return context()->CublasHandle()->GetCublasHandle();
}
return pten::GPUContext::cublas_handle();
}
#else
cublasHandle_t CUDADeviceContext::cublas_handle() const {
return context()->CublasHandle()->GetCublasHandle();
if (thread_ctx_.count(this)) {
return context()->CublasHandle()->GetCublasHandle();
}
return pten::GPUContext::cublas_handle();
}
cusparseHandle_t CUDADeviceContext::cusparse_handle() const {
return context()->CusparseHandle()->GetCusparseHandle();
if (thread_ctx_.count(this)) {
return context()->CusparseHandle()->GetCusparseHandle();
}
return pten::GPUContext::cusparse_handle();
}
cusolverDnHandle_t CUDADeviceContext::cusolver_dn_handle() const {
if (thread_ctx_.count(this)) {
return context()->CusolverDnHandle();
}
return pten::GPUContext::cusolver_dn_handle();
}
#endif
void CUDADeviceContext::RecordEvent(
gpuEvent_t ev, const std::function<void()>& callback) const {
if (thread_ctx_.count(this)) {
context()->Stream()->RecordEvent(ev, callback);
return;
}
pten::GPUContext::RecordEvent(ev, callback);
}
void CUDADeviceContext::AddStreamCallback(
const std::function<void()>& callback) const {
if (thread_ctx_.count(this)) {
context()->Stream()->AddCallback(callback);
return;
}
pten::GPUContext::AddStreamCallback(callback);
}
void CUDADeviceContext::WaitStreamCallback() const {
if (thread_ctx_.count(this)) {
context()->Stream()->WaitCallback();
return;
}
pten::GPUContext::WaitStreamCallback();
}
CudnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const {
return CudnnWorkspaceHandle(*this, &cudnn_handle_mtx_);
}
#ifndef PADDLE_WITH_HIP
cusolverDnHandle_t CUDADeviceContext::cusolver_dn_handle() const {
return context()->CusolverDnHandle();
gpuStream_t CUDADeviceContext::stream() const {
if (thread_ctx_.count(this)) {
return context()->RawStream();
}
return pten::GPUContext::stream();
}
#endif
gpuStream_t CUDADeviceContext::stream() const { return context()->RawStream(); }
std::shared_ptr<CUDAContext> CUDADeviceContext::context() const {
if (!thread_ctx_.count(this)) {
PADDLE_THROW(platform::errors::PermissionDenied(
"CUDADeviceContext call context() failed, make sure in the "
"thread_local semantic."));
}
return thread_ctx_.at(this);
}
stream::CUDAStream* CUDADeviceContext::GetCudaStream() const {
return cuda_stream_.get();
}
stream::CUDAStream* CUDADeviceContext::SetCudaStream(
stream::CUDAStream* new_stream_ptr) {
auto* old_stream_ptr = cuda_stream_.release();
cuda_stream_.reset(new_stream_ptr);
return old_stream_ptr;
}
CUDAPinnedDeviceContext::CUDAPinnedDeviceContext() {
eigen_device_.reset(new Eigen::DefaultDevice());
......@@ -614,7 +615,7 @@ Eigen::DefaultDevice* CUDAPinnedDeviceContext::eigen_device() const {
return eigen_device_.get();
}
Place CUDAPinnedDeviceContext::GetPlace() const { return place_; }
const Place& CUDAPinnedDeviceContext::GetPlace() const { return place_; }
#endif
#ifdef PADDLE_WITH_MKLDNN
......
......@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <functional>
#include <future> // NOLINT
#include <memory>
#include <mutex> // NOLINT
......@@ -18,7 +19,9 @@ limitations under the License. */
#include <utility>
#include <vector>
#include "paddle/fluid/platform/device/gpu/gpu_types.h"
#include "paddle/pten/backends/cpu/cpu_context.h"
#include "paddle/pten/backends/gpu/gpu_decls.h"
#include "paddle/pten/core/device_context.h"
#include "paddle/fluid/memory/malloc.h"
......@@ -28,6 +31,7 @@ limitations under the License. */
#include "paddle/fluid/platform/dynload/cudnn.h"
#include "paddle/fluid/platform/dynload/cusolver.h"
#include "paddle/fluid/platform/dynload/cusparse.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/dynload/nccl.h"
#endif
......@@ -38,6 +42,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_helper.h" // NOLINT
#include "paddle/fluid/platform/dynload/miopen.h"
#include "paddle/fluid/platform/dynload/rocblas.h"
#include "paddle/pten/backends/gpu/gpu_context.h" // NOLINT
#if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/platform/dynload/rccl.h"
#endif
......@@ -145,7 +150,7 @@ class IPUDeviceContext : public DeviceContext {
explicit IPUDeviceContext(IPUPlace place);
virtual ~IPUDeviceContext();
Eigen::DefaultDevice* eigen_device() const { return nullptr; }
Place GetPlace() const override;
const Place& GetPlace() const override;
/*! \brief Wait for all operations completion in the stream. */
void Wait() const override;
......@@ -187,7 +192,7 @@ class NPUDeviceContext : public DeviceContext {
explicit NPUDeviceContext(NPUPlace place);
virtual ~NPUDeviceContext();
Eigen::DefaultDevice* eigen_device() const { return nullptr; }
Place GetPlace() const override;
const Place& GetPlace() const override;
aclrtContext context() const;
/*! \brief Wait for all operations completion in the stream. */
......@@ -247,7 +252,7 @@ class NPUPinnedDeviceContext : public DeviceContext {
NPUPinnedDeviceContext();
explicit NPUPinnedDeviceContext(NPUPinnedPlace place);
Place GetPlace() const override;
const Place& GetPlace() const override;
Eigen::DefaultDevice* eigen_device() const;
......@@ -326,20 +331,20 @@ class CUDAContext {
#endif
/*! \brief Call cublas function safely. */
template <typename Callback>
inline void CublasCall(Callback&& callback) const {
inline void CublasCall(
const std::function<void(blasHandle_t)>& callback) const {
if (cublas_tf32_tensor_core_handle_) {
cublas_tf32_tensor_core_handle_->Call(std::forward<Callback>(callback));
cublas_tf32_tensor_core_handle_->Call(callback);
} else {
cublas_handle_->Call(std::forward<Callback>(callback));
cublas_handle_->Call(callback);
}
}
#ifndef PADDLE_WITH_HIP
/*! \brief Call cusparse function safely. */
template <typename Callback>
inline void CusparseCall(Callback&& callback) const {
cusparse_handle_->Call(std::forward<Callback>(callback));
inline void CusparseCall(
const std::function<void(pten::sparseHandle_t)>& callback) const {
cusparse_handle_->Call(callback);
}
#endif
......@@ -348,12 +353,12 @@ class CUDAContext {
/*! \brief Call cublas function with Tensor Core safely. If
Tensor Core is not available, use DEFAULT_MATH instead. */
template <typename Callback>
inline void TensorCoreCublasCallIfAvailable(Callback&& callback) const {
inline void TensorCoreCublasCallIfAvailable(
const std::function<void(blasHandle_t)>& callback) const {
if (cublas_tensor_core_handle_) {
cublas_tensor_core_handle_->Call(std::forward<Callback>(callback));
cublas_tensor_core_handle_->Call(callback);
} else {
cublas_handle_->Call(std::forward<Callback>(callback));
cublas_handle_->Call(callback);
}
}
......@@ -491,7 +496,7 @@ class CUDAContext {
DISABLE_COPY_AND_ASSIGN(CUDAContext);
};
class CUDADeviceContext : public DeviceContext {
class CUDADeviceContext : public pten::GPUContext {
public:
explicit CUDADeviceContext(CUDAPlace place);
virtual ~CUDADeviceContext();
......@@ -499,49 +504,40 @@ class CUDADeviceContext : public DeviceContext {
/*! \brief Wait for all operations completion in the stream. */
void Wait() const override;
/*! \brief Return place in the device context. */
Place GetPlace() const override;
/*! \brief Return compute capability in the device context. */
int GetComputeCapability() const;
/*! \brief Return the max physical thread count in the device context */
int GetMaxPhysicalThreadCount() const;
/*! \brief Return the SM count in the device context */
int GetSMCount() const;
/*! \brief Return the Max thread num of block in the device context */
int GetMaxThreadsPerBlock() const;
/*! \brief Return the max grid dim size in the device context */
dim3 GetCUDAMaxGridDimSize() const;
/*! \brief Return eigen device in the device context. */
Eigen::GpuDevice* eigen_device() const;
/*! \brief Call cublas function safely. */
template <typename Callback>
inline void CublasCall(Callback&& callback) const {
inline void CublasCall(
const std::function<void(blasHandle_t)>& callback) const {
if (!thread_ctx_.count(this)) {
pten::GPUContext::CublasCall(callback);
return;
}
return context()->CublasCall(callback);
}
#ifndef PADDLE_WITH_HIP
/*! \brief Call cusparse function safely. */
template <typename Callback>
inline void CusparseCall(Callback&& callback) const {
return context()->CusparseCall(callback);
inline void CusparseCall(
const std::function<void(pten::sparseHandle_t)>& callback) const {
if (!thread_ctx_.count(this)) {
pten::GPUContext::CusparseCall(callback);
return;
}
context()->CusparseCall(callback);
}
#endif
/*! \brief Check whether tensor core is supported */
bool tensor_core_available() const;
/*! \brief Call cublas function with Tensor Core safely. If
Tensor Core is not available, use DEFAULT_MATH instead. */
template <typename Callback>
inline void TensorCoreCublasCallIfAvailable(Callback&& callback) const {
return context()->TensorCoreCublasCallIfAvailable(callback);
inline void TensorCoreCublasCallIfAvailable(
const std::function<void(blasHandle_t)>& callback) const {
if (!thread_ctx_.count(this)) {
pten::GPUContext::TensorCoreCublasCallIfAvailable(callback);
return;
}
context()->TensorCoreCublasCallIfAvailable(callback);
}
/*! \brief Return cudnn handle in the device context. */
......@@ -559,6 +555,10 @@ class CUDADeviceContext : public DeviceContext {
cusparseHandle_t cusparse_handle() const;
#endif
#ifndef PADDLE_WITH_HIP
cusolverDnHandle_t cusolver_dn_handle() const;
#endif
/*! \brief Return a cudnn workspace handle to call multiple cudnn
* functions without interrupting by other threads.
* Once the first cudnn function is called by the handle, a lock
......@@ -568,60 +568,33 @@ class CUDADeviceContext : public DeviceContext {
* sequential cudnn function calls. */
CudnnWorkspaceHandle cudnn_workspace_handle() const;
#ifndef PADDLE_WITH_HIP
cusolverDnHandle_t cusolver_dn_handle() const;
#endif
/*! \brief Return cuda stream in the device context. */
gpuStream_t stream() const;
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
/*! \brief Return nccl communicators. */
ncclComm_t nccl_comm() const { return nccl_comm_; }
/*! \brief Set nccl communicators. */
void set_nccl_comm(ncclComm_t comm) { nccl_comm_ = comm; }
#endif
template <typename Callback>
void RecordEvent(gpuEvent_t ev, Callback callback) const {
return context()->Stream()->RecordEvent(ev, callback);
}
template <typename Callback>
void AddStreamCallback(Callback&& callback) const {
return context()->Stream()->AddCallback(callback);
}
void RecordEvent(gpuEvent_t ev, const std::function<void()>& callback) const;
void WaitStreamCallback() const {
return context()->Stream()->WaitCallback();
}
void AddStreamCallback(const std::function<void()>& callback) const;
void ResetDefaultContext(const stream::Priority& priority) {
default_ctx_.reset(new CUDAContext(place_, priority));
}
void WaitStreamCallback() const;
void ResetThreadContext(const stream::Priority& priority) {
std::lock_guard<std::mutex> guard(ctx_mtx_);
thread_ctx_[this].reset(new CUDAContext(place_, priority));
thread_ctx_[this].reset(new CUDAContext(this->GetPlace(), priority));
}
std::shared_ptr<CUDAContext> context() const {
if (!thread_ctx_.count(this)) {
return default_ctx_;
}
return thread_ctx_.at(this);
}
std::shared_ptr<CUDAContext> context() const;
// Note: Can only be used under thread_local semantics.
void SetThreadLocalStream(const gpuStream_t stream) {
thread_ctx_.at(this)->SetStream(stream);
}
private:
CUDAPlace place_;
std::shared_ptr<CUDAContext> default_ctx_;
// NOTE: Kept only for compatibility with the past; please delete it once
// there is a more elegant way.
stream::CUDAStream* GetCudaStream() const;
stream::CUDAStream* SetCudaStream(stream::CUDAStream*);
private:
// The thread_local static variable will be released before the
// global static variable, so avoid using it in dtor.
static thread_local std::unordered_map<const CUDADeviceContext*,
......@@ -631,22 +604,9 @@ class CUDADeviceContext : public DeviceContext {
mutable std::mutex cudnn_handle_mtx_;
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
// NCCL communicator (single process version) for NCCL collective operations.
// NCCL collective operations provide fast collectives over multiple GPUs
// both within and across nodes.
// However, this communicator is only used for collectives over the
// multiple GPUs within a single node.
ncclComm_t nccl_comm_{nullptr};
#endif
int compute_capability_;
int runtime_version_;
int driver_version_;
int multi_process_;
int max_threads_per_mp_;
int max_threads_per_block_;
dim3 max_grid_dim_size_;
// NOTE: Kept only for compatibility with the past; please delete it once
// there is a more elegant way.
std::unique_ptr<stream::CUDAStream> cuda_stream_;
DISABLE_COPY_AND_ASSIGN(CUDADeviceContext);
};
......@@ -711,7 +671,7 @@ class CUDAPinnedDeviceContext : public DeviceContext {
CUDAPinnedDeviceContext();
explicit CUDAPinnedDeviceContext(CUDAPinnedPlace place);
Place GetPlace() const override;
const Place& GetPlace() const override;
Eigen::DefaultDevice* eigen_device() const;
......
......@@ -17,6 +17,7 @@ limitations under the License. */
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
TEST(Device, Init) {
using paddle::platform::DeviceContext;
......@@ -26,6 +27,20 @@ TEST(Device, Init) {
int count = paddle::platform::GetGPUDeviceCount();
for (int i = 0; i < count; i++) {
CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i));
device_context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(CUDAPlace(i), device_context->stream())
.get());
device_context->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
device_context->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(CUDAPlace(i))
.get());
device_context->PartialInitWithAllocator();
Eigen::GpuDevice* gpu_device = device_context->eigen_device();
ASSERT_NE(nullptr, gpu_device);
delete device_context;
......@@ -39,6 +54,19 @@ TEST(Device, CUDADeviceContext) {
int count = paddle::platform::GetGPUDeviceCount();
for (int i = 0; i < count; i++) {
CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i));
device_context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(CUDAPlace(i), device_context->stream())
.get());
device_context->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
device_context->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(CUDAPlace(i))
.get());
device_context->PartialInitWithAllocator();
Eigen::GpuDevice* gpu_device = device_context->eigen_device();
ASSERT_NE(nullptr, gpu_device);
#ifdef PADDLE_WITH_HIP
......
......@@ -53,7 +53,7 @@ void DeviceEventRecordCUDA(DeviceEvent* event, const DeviceContext* context) {
platform::errors::PreconditionNotMet(
"Failed to dynamic_cast context into CUDADeviceContext."));
wrapper->inner_event_.Record(*cuda_dev_ctx->context()->Stream());
wrapper->inner_event_.Record(cuda_dev_ctx->stream());
}
bool DeviceEventQueryCUDA(const DeviceEvent* event) {
......@@ -82,8 +82,7 @@ void DeviceEventCUDAWaitCUDA(const DeviceEvent* event,
platform::errors::PreconditionNotMet(
"Failed to dynamic_cast context into CUDADeviceContext."));
// calling cudaStreamWaitEvent(stream, event, 0)
cuda_dev_ctx->context()->Stream()->WaitEvent(
wrapper->inner_event_.GetRawCudaEvent());
cuda_dev_ctx->WaitEvent(wrapper->inner_event_.GetRawCudaEvent());
}
void DeviceEventCPUWaitCUDA(const DeviceEvent* event,
......
......@@ -15,6 +15,7 @@
#include "paddle/fluid/platform/device_event.h"
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/fluid/platform/place.h"
using ::paddle::platform::kCUDA;
using ::paddle::platform::kCPU;
......@@ -38,9 +39,11 @@ TEST(DeviceEvent, CUDA) {
// case 1. test for event_creator
DeviceEvent event(place);
ASSERT_NE(event.GetEvent().get(), nullptr);
bool status = event.Query();
ASSERT_EQ(status, true);
// case 2. test for event_recorder
event.Record(context);
bool status = event.Query();
status = event.Query();
ASSERT_EQ(status, false);
// case 3. test for event_finisher
event.Finish();
......
......@@ -539,7 +539,7 @@ inline void retry_sleep(unsigned milliseconds) {
::paddle::platform::details::ExternalApiType< \
__CUDA_STATUS_TYPE__>::kSuccess; \
while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \
retry_sleep(FLAGS_gpu_allocator_retry_time); \
paddle::platform::retry_sleep(FLAGS_gpu_allocator_retry_time); \
__cond__ = (COND); \
++retry_count; \
} \
......@@ -727,7 +727,7 @@ inline void retry_sleep(unsigned millisecond) {
::paddle::platform::details::ExternalApiType< \
__CUDA_STATUS_TYPE__>::kSuccess; \
while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \
retry_sleep(FLAGS_gpu_allocator_retry_time); \
::paddle::platform::retry_sleep(FLAGS_gpu_allocator_retry_time); \
__cond__ = (COND); \
++retry_count; \
} \
......
......@@ -152,11 +152,11 @@ class CudaEvent {
#endif
}
void Record(const paddle::platform::stream::CUDAStream &stream) {
void Record(gpuStream_t stream) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, stream.raw_stream()));
PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, stream));
#else
PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, stream.raw_stream()));
PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, stream));
#endif
}
......
......@@ -328,6 +328,10 @@ TEST(float16, lod_tensor_on_gpu) {
// CPU LoDTensor to GPU LoDTensor
CUDAPlace gpu_place(0);
CUDADeviceContext gpu_ctx(gpu_place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, gpu_ctx.stream())
.get());
gpu_ctx.PartialInitWithAllocator();
framework::TensorCopy(src_tensor, gpu_place, gpu_ctx, &gpu_tensor);
// GPU LoDTensor to CPU LoDTensor
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/pten/backends/cpu/cpu_context.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
namespace paddle {
namespace platform {
......@@ -72,6 +73,7 @@ __global__ static void ForRangeElemwiseOp(Function func, size_t limit) {
}
}
// NOTE: Delete this ForRange<CUDADeviceContext> specialization once the
// corresponding pten kernels are migrated.
template <>
struct ForRange<CUDADeviceContext> {
ForRange(const CUDADeviceContext& dev_ctx, size_t limit)
......@@ -106,6 +108,40 @@ struct ForRange<CUDADeviceContext> {
size_t limit_;
};
template <>
struct ForRange<pten::GPUContext> {
ForRange(const pten::GPUContext& dev_ctx, size_t limit)
: dev_ctx_(dev_ctx), limit_(static_cast<size_t>(limit)) {}
template <typename Function>
inline void operator()(Function func) const {
#ifdef __HIPCC__
// HIP will core dump when threads > 256
constexpr int num_threads = 256;
#elif WITH_NV_JETSON
// JETSON_NANO will core dump when threads > 128
int num_thread = 256;
platform::ChangeThreadNum(dev_ctx_, &num_thread, 128);
const int num_threads = num_thread;
#else
constexpr int num_threads = 1024;
#endif
size_t block_size = limit_ <= num_threads ? limit_ : num_threads;
size_t grid_size = (limit_ + num_threads - 1) / num_threads;
if (grid_size == 1) {
ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>(
func);
} else {
ForRangeElemwiseOp<<<grid_size, block_size, 0, dev_ctx_.stream()>>>(
func, limit_);
}
}
const pten::GPUContext& dev_ctx_;
size_t limit_;
};
#endif
} // namespace platform
......
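The added ForRange<pten::GPUContext> specialization mirrors the existing CUDADeviceContext one, so elementwise functors can be launched the same way from pten kernels. A short usage sketch; the functor and wrapper are illustrative, not part of the patch, assume the surrounding header is paddle/fluid/platform/for_range.h, and must be compiled as CUDA (a .cu file):

#include "paddle/fluid/platform/for_range.h"
#include "paddle/fluid/platform/hostdevice.h"
#include "paddle/pten/backends/gpu/gpu_context.h"

// Illustrative elementwise functor: out[i] = 2 * in[i].
template <typename T>
struct DoubleFunctor {
  const T* in;
  T* out;
  HOSTDEVICE void operator()(size_t i) const { out[i] = in[i] + in[i]; }
};

template <typename T>
void DoubleOnGpu(const pten::GPUContext& dev_ctx, const T* in, T* out,
                 size_t limit) {
  paddle::platform::ForRange<pten::GPUContext> for_range(dev_ctx, limit);
  for_range(DoubleFunctor<T>{in, out});  // launches the elementwise kernel
}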
......@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/fluid/platform/stream/cuda_stream.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/device/gpu/gpu_types.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
......@@ -116,11 +117,8 @@ CUDAStream* get_current_stream(int deviceId) {
platform::Place device = CUDAPlace(deviceId);
auto stream = static_cast<platform::CUDADeviceContext*>(pool.Get(device))
->context()
->Stream()
.get();
return stream;
return static_cast<platform::CUDADeviceContext*>(pool.Get(device))
->GetCudaStream();
#else
PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with CUDA. Cannot visit cuda current stream."));
......@@ -133,12 +131,12 @@ CUDAStream* set_current_stream(CUDAStream* stream) {
auto& device = stream->GetPlace();
auto& pool = platform::DeviceContextPool::Instance();
return static_cast<platform::CUDADeviceContext*>(pool.Get(device))
->context()
->SetStream(stream);
->SetCudaStream(stream);
#else
PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with CUDA. Cannot visit cuda current stream."));
return nullptr;
"Paddle is not compiled with CUDA. Cannot visit cuda current"
"stream."));
return nullptr;
#endif
}
} // namespace stream
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <cstdint>
#include <functional>
#include <memory>
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
......@@ -51,24 +52,28 @@ class CUDAStream final {
const StreamFlag& flag = StreamFlag::kDefaultFlag) {
Init(place, priority, flag);
}
explicit CUDAStream(gpuStream_t stream, const Place& place)
: place_(place), stream_(stream) {
owned_stream_ = false;
callback_manager_.reset(new StreamCallbackManager<gpuStream_t>(stream_));
}
virtual ~CUDAStream() { Destroy(); }
bool Init(const Place& place, const Priority& priority = Priority::kNormal,
const StreamFlag& flag = StreamFlag::kDefaultFlag);
template <typename Callback>
void AddCallback(Callback&& callback) const {
void AddCallback(std::function<void()> callback) const {
callback_manager_->AddCallback(callback);
}
template <typename Callback>
#ifdef PADDLE_WITH_HIP
void RecordEvent(hipEvent_t ev, Callback callback) const {
void RecordEvent(hipEvent_t ev, const std::function<void()>& callback) const {
callback();
PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(ev, stream_));
}
#else
void RecordEvent(cudaEvent_t ev, Callback callback) const {
void RecordEvent(cudaEvent_t ev,
const std::function<void()>& callback) const {
callback();
PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(ev, stream_));
}
......@@ -149,6 +154,7 @@ class CUDAStream final {
};
CUDAStream* get_current_stream(int deviceId);
// NOTE: There is a problem with this interface and it needs to be fixed
CUDAStream* set_current_stream(CUDAStream* stream);
} // namespace stream
......
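The header above adds a non-owning CUDAStream constructor (owned_stream_ is left false), which is what the rewritten CUDADeviceContext constructor uses to wrap the stream created by pten::GPUContext. A minimal sketch of wrapping an externally owned raw stream; the helper is illustrative and not part of the patch:

#include <memory>

#include "paddle/fluid/platform/stream/cuda_stream.h"

// Wrap a raw stream without taking ownership: destroying the wrapper does
// not destroy `raw`, because this constructor does not set owned_stream_.
std::unique_ptr<paddle::platform::stream::CUDAStream> WrapExistingStream(
    gpuStream_t raw, const paddle::platform::Place& place) {
  return std::make_unique<paddle::platform::stream::CUDAStream>(raw, place);
}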
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/platform/transform.h"
......@@ -57,6 +58,10 @@ TEST(Transform, CPUUnary) {
TEST(Transform, GPUUnary) {
CUDAPlace gpu0(0);
CUDADeviceContext ctx(gpu0);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu0, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4};
auto gpu_allocation = Alloc(gpu0, sizeof(float) * 4);
float* gpu_buf = static_cast<float*>(gpu_allocation->ptr());
......@@ -84,6 +89,10 @@ TEST(Transform, GPUBinary) {
int buf[4] = {1, 2, 3, 4};
CUDAPlace gpu0(0);
CUDADeviceContext ctx(gpu0);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu0, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
auto gpu_allocation = Alloc(gpu0, sizeof(buf));
int* gpu_buf = static_cast<int*>(gpu_allocation->ptr());
Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream());
......
......@@ -34,7 +34,7 @@ void BindCudaStream(py::module *m_ptr) {
return paddle::platform::stream::get_current_stream(deviceId);
#else
PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with CUDA. Cannot visit cuda current "
"Paddle is not compiled with CUDA. Cannot visit cuda current"
"stream."));
#endif
},
......@@ -119,7 +119,7 @@ void BindCudaStream(py::module *m_ptr) {
[](paddle::platform::stream::CUDAStream &self,
paddle::platform::stream::CUDAStream &stream) {
paddle::platform::CudaEvent event;
event.Record(stream);
event.Record(stream.raw_stream());
self.WaitEvent(event.GetRawCudaEvent());
},
......@@ -179,7 +179,7 @@ void BindCudaStream(py::module *m_ptr) {
if (event == nullptr) {
event = new paddle::platform::CudaEvent();
}
event->Record(self);
event->Record(self.raw_stream());
return event;
},
R"DOC(
......@@ -321,7 +321,7 @@ void BindCudaStream(py::module *m_ptr) {
if (stream == nullptr) {
stream = paddle::platform::stream::get_current_stream(-1);
}
self.Record(*stream);
self.Record(stream->raw_stream());
},
R"DOC(
Records the event in the given stream.
......
......@@ -1596,7 +1596,20 @@ All parameter, weight, gradient are variables in Paddle.
.def_static("create",
[](paddle::platform::CPUPlace& place)
-> paddle::platform::DeviceContext* {
return new paddle::platform::CPUDeviceContext();
auto* context = new paddle::platform::CPUDeviceContext();
context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place)
.get());
context->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
context->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place)
.get());
return context;
})
.def_static("create",
[](paddle::platform::XPUPlace& place)
......@@ -1607,7 +1620,20 @@ All parameter, weight, gradient are variables in Paddle.
"Cannot use XPUPlace in CPU/GPU version, "
"Please recompile or reinstall Paddle with XPU support."));
#else
return new paddle::platform::XPUDeviceContext(place);
auto* context = new paddle::platform::XPUDeviceContext(place);
context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place)
.get());
context->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
context->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place)
.get());
return context;
#endif
})
.def_static("create",
......@@ -1643,7 +1669,21 @@ All parameter, weight, gradient are variables in Paddle.
"Cannot use CUDAPlace in CPU only version, "
"Please recompile or reinstall Paddle with CUDA support."));
#else
return new paddle::platform::CUDADeviceContext(place);
auto* context = new paddle::platform::CUDADeviceContext(place);
context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, context->stream())
.get());
context->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
context->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place)
.get());
context->PartialInitWithAllocator();
return context;
#endif
})
.def_static("create",
......
......@@ -2,6 +2,10 @@ add_subdirectory(dynload)
add_subdirectory(cpu)
if(WITH_GPU OR WITH_ROCM)
add_subdirectory(gpu)
endif()
if(WITH_XPU)
add_subdirectory(xpu)
endif()
......@@ -11,3 +15,7 @@ cc_library(pten_context SRCS all_context.cc DEPS device_context cpu_context)
if(WITH_XPU)
add_dependencies(pten_context xpu_context)
endif()
if(WITH_GPU)
add_dependencies(pten_context gpu_context)
endif()
......@@ -15,75 +15,59 @@
#include "paddle/pten/backends/cpu/cpu_context.h"
#include "paddle/pten/api/ext/exception.h"
#include "paddle/pten/common/place.h"
// NOTE: The paddle framework should add a WITH_EIGEN option to support
// compiling without Eigen.
#include "paddle/pten/core/device_context.h"
#include "unsupported/Eigen/CXX11/Tensor"
namespace pten {
struct CPUContext::CPUImpl {
CPUImpl() { device_ = new Eigen::DefaultDevice(); }
struct CPUContext::Impl {
Impl() : place_(CPUPlace()) {}
// Users need to manage external resources.
explicit CPUImpl(const CPUContextResource& ctx_res) : res_(ctx_res) {
device_ = res_.device;
}
explicit Impl(const Place& place) : place_(place) {}
~CPUImpl() {
if (res_.device == nullptr && device_ != nullptr) {
delete device_;
device_ = nullptr;
~Impl() {
if (owned_) {
delete eigen_device_;
}
}
Eigen::DefaultDevice* GetEigenDevice() const {
PD_CHECK(device_ != nullptr, "the eigen_device is nullptr.");
return device_;
void Init() {
owned_ = true;
eigen_device_ = new Eigen::DefaultDevice();
}
void SetEigenDevice(Eigen::DefaultDevice* device) {
if (device == nullptr) {
return;
}
res_.device = device;
device_ = device;
Eigen::DefaultDevice* GetEigenDevice() const {
PD_CHECK(eigen_device_ != nullptr, "the cpu eigen_device is nullptr.");
return eigen_device_;
}
Place GetPlace() const { return place_; }
Eigen::DefaultDevice* device_{nullptr};
CPUContextResource res_;
CPUPlace place_;
bool owned_{false};
Eigen::DefaultDevice* eigen_device_{nullptr};
Place place_;
};
CPUContext::CPUContext() : DeviceContext() {
cpu_impl_ = std::make_unique<CPUImpl>();
}
CPUContext::CPUContext(const CPUContext& other) : DeviceContext() {
cpu_impl_ = std::make_unique<CPUImpl>();
cpu_impl_->SetEigenDevice(other.eigen_device());
}
CPUContext::CPUContext()
: DeviceContext(), impl_(std::make_unique<CPUContext::Impl>()) {}
CPUContext::CPUContext(CPUContext&& other) : DeviceContext() {
cpu_impl_ = std::move(other.cpu_impl_);
}
CPUContext::CPUContext(const Place& place)
: DeviceContext(), impl_(std::make_unique<CPUContext::Impl>(place)) {}
CPUContext::~CPUContext() = default;
CPUContext::CPUContext(const CPUContextResource& ctx_res) : DeviceContext() {
cpu_impl_ = std::make_unique<CPUImpl>(ctx_res);
}
void CPUContext::Init() { impl_->Init(); }
Eigen::DefaultDevice* CPUContext::eigen_device() const {
return cpu_impl_->GetEigenDevice();
return impl_->GetEigenDevice();
}
const Place& CPUContext::GetPlace() const { return impl_->place_; }
void CPUContext::SetEigenDevice(Eigen::DefaultDevice* device) {
cpu_impl_->SetEigenDevice(device);
impl_->eigen_device_ = device;
}
Place CPUContext::GetPlace() const { return cpu_impl_->GetPlace(); }
} // namespace pten
......@@ -24,37 +24,29 @@ limitations under the License. */
namespace pten {
struct CPUContextResource {
Eigen::DefaultDevice* device{nullptr};
};
class CPUContext : public DeviceContext {
public:
// NOTE: DeviceContext hold resources. Used in training scenarios.
CPUContext();
// NOTE: Share the same underlying resources, please ensure that resources are
// not released.
CPUContext(const CPUContext&);
CPUContext(CPUContext&&);
~CPUContext();
explicit CPUContext(const Place&);
virtual ~CPUContext();
Eigen::DefaultDevice* eigen_device() const;
// TODO(wilber): Whether the interface should be preserved.
Place GetPlace() const override;
const Place& GetPlace() const override;
public:
// NOTE: External users manage resources. Used in inference scenarios.
explicit CPUContext(const CPUContextResource& ctx_res);
// NOTE: DeviceContext holds resources. Used in training scenarios.
// This is the interface used in training: DeviceContext initializes
// all resources and deletes them on destruction.
void Init();
protected:
// NOTE: External users manage resources. Used in inference scenarios.
// The Set interface is for inference only; DeviceContext marks the
// resource as external and does not delete it on destruction.
void SetEigenDevice(Eigen::DefaultDevice* device);
private:
struct CPUImpl;
std::unique_ptr<CPUImpl> cpu_impl_;
struct Impl;
std::unique_ptr<Impl> impl_;
};
} // namespace pten
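The reworked CPUContext separates two lifecycles: Init() for the training path, where the context allocates and owns its Eigen::DefaultDevice (this is what the fluid CPUDeviceContext constructors above call), and the protected SetEigenDevice() for the inference path, where a subclass injects an externally owned device. A minimal sketch of the owning path; the helper is illustrative and not part of the patch:

#include <memory>

#include "paddle/pten/backends/cpu/cpu_context.h"

// Owning (training-style) usage: the Eigen device exists only after Init().
std::unique_ptr<pten::CPUContext> MakeOwnedCpuContext() {
  auto ctx = std::make_unique<pten::CPUContext>();
  ctx->Init();                // allocates the Eigen::DefaultDevice it owns
  (void)ctx->eigen_device();  // safe to use once Init() has run
  return ctx;
}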
if(WITH_GPU)
add_subdirectory(cuda)
nv_library(pten_gpu_info SRCS gpu_info.cc DEPS pten_cuda_info gflags glog enforce pten_dynload_cuda)
elseif(WITH_ROCM)
add_subdirectory(rocm)
hip_library(pten_gpu_info SRCS gpu_info.cc DEPS pten_rocm_info gflags glog enforce pten_dynload_cuda)
endif()
cc_library(gpu_context SRCS gpu_context.cc DEPS pten_device_context pten_gpu_info eigen3)
nv_library(pten_cuda_info SRCS cuda_info.cc DEPS gflags glog enforce pten_dynload_cuda)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
namespace pten {
namespace backends {
namespace gpu {
/*
* Summary: Grid stride looping macro in CUDA kernel
*
* [ Why need this macro? ]
*
* The original looping in CUDA kernel is:
*
* `for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
* i += blockDim.x * gridDim.x)`
*
* This for condition is risky. The value of `blockIdx.x * blockDim.x`
* may be large, e.g. over 1G. The first iteration is fine, but once
* `i += blockDim.x * gridDim.x` is executed, the value of i can exceed
* INT_MAX and overflow to a negative value. At that point the loop
* condition `i < (n)` is still satisfied, so the kernel makes an illegal
* access to CUDA memory.
*
* Here is a real example in ERNIE that triggers the above error.
* The related data are:
* - blockIdx.x = 2172938
* - blockDim.x = 512
* - blockIdx.x * blockDim.x = 1112543864
* - INT_MAX = 2147483647
*
* So we polish the for condition as follows; the int64_t __index__
* prevents overflow in the loop increment.
*
* Parameters:
* - i: loop index
* - num: total element numbers
*
* Examples:
* template <typename T>
* __global__ void Scale(T* logit_grad, const T* loss_grad, const int num,
* const int d, const int remain) {
* CUDA_KERNEL_LOOP(index, num) {
* int idx_n = index / d;
* int idx_remain = index % remain;
* logit_grad[index] *= loss_grad[idx_n * remain + idx_remain];
* }
* }
*
*/
#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \
int64_t __index__ = blockIdx.x * blockDim.x + threadIdx.x; \
for (index_type i = __index__; __index__ < (num); \
__index__ += blockDim.x * gridDim.x, i = __index__)
} // namespace gpu
} // namespace backends
} // namespace pten
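A short usage sketch of the macro defined above; the kernel is illustrative, not part of the patch, and assumes this header is on the include path:

// Scale `num` elements in place. The 64-bit __index__ computed by the macro
// keeps the loop safe even when blockIdx.x * blockDim.x exceeds INT_MAX.
template <typename T>
__global__ void ScaleKernel(T* data, T factor, int64_t num) {
  CUDA_KERNEL_LOOP_TYPE(i, num, int64_t) { data[i] *= factor; }
}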
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -12,20 +12,19 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/pten/backends/gpu/gpu_info.h"
// TODO(pten): remove fluid headers.
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/lock_guard_ptr.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/monitor.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/split.h"
static std::once_flag g_device_props_size_init_flag;
static std::vector<std::unique_ptr<std::once_flag>> g_device_props_init_flags;
static std::vector<paddle::gpuDeviceProp> g_device_props;
static std::vector<pten::gpuDeviceProp> g_device_props;
namespace pten {
namespace backends {
namespace gpu {
namespace paddle {
namespace platform {
int DnnVersion() {
if (!dynload::HasCUDNN()) return -1;
return dynload::cudnnGetVersion();
......@@ -75,11 +74,13 @@ int GetGPUDeviceCount() {
}
int GetGPUComputeCapability(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
id,
GetGPUDeviceCount()));
int major, minor;
auto major_error_code =
cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, id);
......@@ -92,22 +93,26 @@ int GetGPUComputeCapability(int id) {
}
int GetGPURuntimeVersion(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
id,
GetGPUDeviceCount()));
int runtime_version = 0;
PADDLE_ENFORCE_GPU_SUCCESS(cudaRuntimeGetVersion(&runtime_version));
return runtime_version;
}
int GetGPUDriverVersion(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
id,
GetGPUDeviceCount()));
int driver_version = 0;
PADDLE_ENFORCE_GPU_SUCCESS(cudaDriverGetVersion(&driver_version));
return driver_version;
......@@ -120,11 +125,13 @@ bool TensorCoreAvailable() {
}
int GetGPUMultiProcessors(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
id,
GetGPUDeviceCount()));
int count;
PADDLE_ENFORCE_GPU_SUCCESS(
cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id));
......@@ -132,11 +139,13 @@ int GetGPUMultiProcessors(int id) {
}
int GetGPUMaxThreadsPerMultiProcessor(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
id,
GetGPUDeviceCount()));
int count;
PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceGetAttribute(
&count, cudaDevAttrMaxThreadsPerMultiProcessor, id));
......@@ -145,11 +154,13 @@ int GetGPUMaxThreadsPerMultiProcessor(int id) {
}
int GetGPUMaxThreadsPerBlock(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
id,
GetGPUDeviceCount()));
int count;
PADDLE_ENFORCE_GPU_SUCCESS(
cudaDeviceGetAttribute(&count, cudaDevAttrMaxThreadsPerBlock, id));
......@@ -162,32 +173,34 @@ int GetCurrentDeviceId() {
return device_id;
}
dim3 GetGpuMaxGridDimSize(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
std::array<int, 3> GetGpuMaxGridDimSize(int id) {
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
dim3 ret;
id,
GetGPUDeviceCount()));
std::array<int, 3> ret;
int size;
auto error_code_x = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimX, id);
PADDLE_ENFORCE_GPU_SUCCESS(error_code_x);
ret.x = size;
ret[0] = size;
auto error_code_y = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimY, id);
PADDLE_ENFORCE_GPU_SUCCESS(error_code_y);
ret.y = size;
ret[1] = size;
auto error_code_z = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimZ, id);
PADDLE_ENFORCE_GPU_SUCCESS(error_code_z);
ret.z = size;
ret[2] = size;
return ret;
}
const gpuDeviceProp &GetDeviceProperties(int id) {
std::call_once(g_device_props_size_init_flag, [&] {
int gpu_num = 0;
gpu_num = platform::GetGPUDeviceCount();
gpu_num = GetGPUDeviceCount();
g_device_props_init_flags.resize(gpu_num);
g_device_props.resize(gpu_num);
for (int i = 0; i < gpu_num; ++i) {
......@@ -196,16 +209,17 @@ const gpuDeviceProp &GetDeviceProperties(int id) {
});
if (id == -1) {
id = platform::GetCurrentDeviceId();
id = GetCurrentDeviceId();
}
if (id < 0 || id >= static_cast<int>(g_device_props.size())) {
PADDLE_THROW(platform::errors::OutOfRange(
PADDLE_THROW(paddle::platform::errors::OutOfRange(
"The device id %d is out of range [0, %d), where %d is the number of "
"devices on this machine. Because the device id should be greater than "
"or equal to zero and smaller than the number of gpus. Please input "
"appropriate device again!",
id, static_cast<int>(g_device_props.size()),
id,
static_cast<int>(g_device_props.size()),
static_cast<int>(g_device_props.size())));
}
......@@ -219,32 +233,43 @@ const gpuDeviceProp &GetDeviceProperties(int id) {
void SetDeviceId(int id) {
// TODO(qijun): find a better way to cache the cuda device count
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
id,
GetGPUDeviceCount()));
PADDLE_RETRY_CUDA_SUCCESS(cudaSetDevice(id));
}
void GpuMemcpyAsync(void *dst, const void *src, size_t count,
gpuMemcpyKind kind, gpuStream_t stream) {
void GpuMemcpyAsync(void *dst,
const void *src,
size_t count,
gpuMemcpyKind kind,
gpuStream_t stream) {
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(dst, src, count, kind, stream));
}
void GpuMemcpySync(void *dst, const void *src, size_t count,
void GpuMemcpySync(void *dst,
const void *src,
size_t count,
gpuMemcpyKind kind) {
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(dst, src, count, kind));
}
void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
int src_device, size_t count, gpuStream_t stream) {
void GpuMemcpyPeerAsync(void *dst,
int dst_device,
const void *src,
int src_device,
size_t count,
gpuStream_t stream) {
PADDLE_ENFORCE_GPU_SUCCESS(
cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream));
}
void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
int src_device, size_t count) {
void GpuMemcpyPeerSync(
void *dst, int dst_device, const void *src, int src_device, size_t count) {
PADDLE_ENFORCE_GPU_SUCCESS(
cudaMemcpyPeer(dst, dst_device, src, src_device, count));
}
......@@ -264,5 +289,7 @@ void GpuDestroyStream(gpuStream_t stream) {
void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); }
gpuError_t GpuGetLastError() { return cudaGetLastError(); }
} // namespace platform
} // namespace paddle
} // namespace gpu
} // namespace backends
} // namespace pten
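The migration above also changes GetGpuMaxGridDimSize to return std::array<int, 3> instead of dim3. A minimal caller-side sketch under that assumption (the helper name below is hypothetical; the header path follows the layout introduced in this patch):
#include <array>
#include "paddle/pten/backends/gpu/gpu_info.h"
// Hypothetical helper, for illustration only: clamp a requested grid.x against
// the device limit, which now comes back as a plain int instead of dim3::x.
static int ClampGridDimX(int device_id, int requested) {
  std::array<int, 3> max_grid =
      pten::backends::gpu::GetGpuMaxGridDimSize(device_id);
  return requested < max_grid[0] ? requested : max_grid[0];
}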
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// Forward-declares CUDA API types used in platform-agnostic wrapper headers.
#pragma once
/// Forward declaration of Eigen types.
namespace Eigen {
struct GpuDevice;
} // namespace Eigen
/// Forward declaration of CUDA types.
// Forward declaration of CUDA runtime types.
using cudaStream_t = struct CUstream_st *;
using cudaEvent_t = struct CUevent_st *;
// Forward declaration of cuDNN types.
using cudnnHandle_t = struct cudnnContext *;
using cudnnTensorDescriptor_t = struct cudnnTensorStruct *;
using cudnnConvolutionDescriptor_t = struct cudnnConvolutionStruct *;
using cudnnPoolingDescriptor_t = struct cudnnPoolingStruct *;
using cudnnFilterDescriptor_t = struct cudnnFilterStruct *;
using cudnnLRNDescriptor_t = struct cudnnLRNStruct *;
using cudnnActivationDescriptor_t = struct cudnnActivationStruct *;
using cudnnSpatialTransformerDescriptor_t =
struct cudnnSpatialTransformerStruct *;
using cudnnOpTensorDescriptor_t = struct cudnnOpTensorStruct *;
using cudnnReduceTensorDescriptor_t = struct cudnnReduceTensorStruct *;
using cudnnCTCLossDescriptor_t = struct cudnnCTCLossStruct *;
using cudnnTensorTransformDescriptor_t = struct cudnnTensorTransformStruct *;
using cudnnDropoutDescriptor_t = struct cudnnDropoutStruct *;
using cudnnRNNDescriptor_t = struct cudnnRNNStruct *;
using cudnnPersistentRNNPlan_t = struct cudnnPersistentRNNPlan *;
using cudnnRNNDataDescriptor_t = struct cudnnRNNDataStruct *;
using cudnnAlgorithmDescriptor_t = struct cudnnAlgorithmStruct *;
using cudnnAlgorithmPerformance_t = struct cudnnAlgorithmPerformanceStruct *;
using cudnnSeqDataDescriptor_t = struct cudnnSeqDataStruct *;
using cudnnAttnDescriptor_t = struct cudnnAttnStruct *;
using cudnnFusedOpsConstParamPack_t = struct cudnnFusedOpsConstParamStruct *;
using cudnnFusedOpsVariantParamPack_t =
struct cudnnFusedOpsVariantParamStruct *;
using cudnnFusedOpsPlan_t = struct cudnnFusedOpsPlanStruct *;
// Forward declaration of cuBLAS types.
using cublasHandle_t = struct cublasContext *;
// Forward declaration of cuSOLVER types.
using cusolverDnHandle_t = struct cusolverDnContext *;
// Forward declaration of cuSparse types.
using cusparseHandle_t = struct cusparseContext *;
// Forward declaration of cuFFT types.
using cufftHandle = int;
// Forward declaration of NCCL types.
using ncclComm_t = struct ncclComm *;
/// Forward declaration of ROCM types.
#include <cstddef>
using hipDevice_t = int;
using hipCtx_t = struct ihipCtx_t *;
using hipModule_t = struct ihipModule_t *;
using hipStream_t = struct ihipStream_t *;
using hipEvent_t = struct ihipEvent_t *;
using hipFunction_t = struct ihipModuleSymbol_t *;
// Forward declaration of MIOpen types.
using miopenHandle_t = struct miopenHandle *;
using miopenAcceleratorQueue_t = hipStream_t;
using miopenFusionOpDescriptor_t = struct miopenFusionOpDescriptor *;
using miopenTensorDescriptor_t = struct miopenTensorDescriptor *;
using miopenConvolutionDescriptor_t = struct miopenConvolutionDescriptor *;
using miopenPoolingDescriptor_t = struct miopenPoolingDescriptor *;
using miopenLRNDescriptor_t = struct miopenLRNDescriptor *;
using miopenActivationDescriptor_t = struct miopenActivationDescriptor *;
using miopenRNNDescriptor_t = struct miopenRNNDescriptor *;
using miopenCTCLossDescriptor_t = struct miopenCTCLossDescriptor *;
using miopenDropoutDescriptor_t = struct miopenDropoutDescriptor *;
using miopenFusionPlanDescriptor_t = struct miopenFusionPlanDescriptor *;
using miopenOperatorDescriptor_t = struct miopenOperatorDescriptor *;
using miopenOperatorArgs_t = struct miopenOperatorArgs *;
using miopenAllocatorFunction = void *(*)(void *context, size_t sizeBytes);
// using miopenDeallocatorFunction = void *(*)(void *context, void *memory);
// struct miopenConvAlgoPerf_t;
// struct miopenConvSolution_t;
// Forward declaration of rocBLAS types.
using rocblas_handle = struct _rocblas_handle *;
// Forward declaration of hipfft types.
using hipfftHandle = struct hipfftHandle_t *;
// Forward declaration of rocSOLVER types.
using rocsolver_handle = rocblas_handle;
// Forward declaration of rocSparse types.
using rocsparse_handle = struct _rocsparse_handle *;
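These forward declarations let platform-agnostic wrapper headers mention the opaque handle types without pulling in the vendor headers. A small sketch assuming only forwards.h is included (the struct name is hypothetical):
#include "paddle/pten/backends/gpu/forwards.h"
// Illustration only: the members are opaque pointers, so this declaration
// compiles without including cudnn.h or cublas_v2.h.
struct GpuHandleHolder {
  cudnnHandle_t dnn = nullptr;
  cublasHandle_t blas = nullptr;
};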
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -14,13 +14,162 @@ limitations under the License. */
#pragma once
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/platform/device_context.h"
#include <array>
#include <functional>
#include "paddle/pten/backends/gpu/forwards.h"
#include "paddle/pten/backends/gpu/gpu_decls.h"
#include "paddle/pten/backends/gpu/gpu_helper.h"
#include "paddle/pten/common/place.h"
#include "paddle/pten/core/device_context.h"
namespace pten {
using GPUContext = paddle::platform::CUDADeviceContext;
} // namespace pten
#endif
class DnnWorkspaceHandle;
class GPUContext : public DeviceContext {
public:
GPUContext();
explicit GPUContext(const GPUPlace& place);
virtual ~GPUContext();
/*! \brief Return place in the device context. */
const Place& GetPlace() const override;
/*! \brief Return gpu stream in the device context. */
gpuStream_t stream() const;
/*! \brief Return cudnn handle in the device context. */
dnnHandle_t cudnn_handle() const;
/*! \brief Return cublas handle in the device context. */
blasHandle_t cublas_handle() const;
/*! \brief Return cusolver handle in the device context. */
solverHandle_t cusolver_dn_handle() const;
/*! \brief Return cusparse handle in the device context. */
sparseHandle_t cusparse_handle() const;
/*! \brief Wait for all operations completion in the stream. */
void Wait() const override;
/*! \brief Wait for event in the stream. */
void WaitEvent(gpuEvent_t ev) const;
/*! \brief Check whether tensor core is supported */
bool tensor_core_available() const;
/*! \brief Return compute capability in the device context. */
int GetComputeCapability() const;
/*! \brief Return the max physical thread count in the device context */
int GetMaxPhysicalThreadCount() const;
/*! \brief Return the SM count in the device context */
int GetSMCount() const;
/*! \brief Return the Max thread num of block in the device context */
int GetMaxThreadsPerBlock() const;
/*! \brief Return the max grid dim size in the device context */
std::array<int, 3> GetCUDAMaxGridDimSize() const;
/*! \brief Return eigen device in the device context. */
Eigen::GpuDevice* eigen_device() const;
/*! \brief Return a cudnn workspace handle with which multiple cudnn
* functions can be called without interruption from other threads.
* Once the first cudnn function is called through the handle, a lock
* is acquired to prevent other threads from accessing the workspace;
* the lock is released when the handle is destructed.
*/
DnnWorkspaceHandle* cudnn_workspace_handle();
public:
/*! \brief Call cublas function safely. */
void CublasCall(const std::function<void(blasHandle_t)>&) const;
/*! \brief Call cublas function with Tensor Core safely. If
Tensor Core is not available, use DEFAULT_MATH instead. */
void TensorCoreCublasCallIfAvailable(
const std::function<void(blasHandle_t)>&) const;
/*! \brief Call cusparse function safely. */
void CusparseCall(const std::function<void(sparseHandle_t)>&) const;
void RecordEvent(gpuEvent_t ev, const std::function<void()>& callback) const;
void RecordEvent(gpuEvent_t ev) const;
void AddStreamCallback(const std::function<void()>& callback) const;
void WaitStreamCallback() const;
public:
/*! \brief Return nccl communicators. */
ncclComm_t nccl_comm() const;
/*! \brief Set nccl communicators. */
void set_nccl_comm(ncclComm_t comm);
public:
// NOTE: DeviceContext holds the resources. Used in training scenarios:
// through this interface DeviceContext initializes all resources and
// deletes them when it is destructed.
// Note that you must set the Allocator before calling the Init function.
void Init();
// TODO(wilber): Why does the GetAllocator interface require a stream
// parameter?
// The trick methods below temporarily bypass this problem and should be
// removed later.
// Trick implementation: partially initializes the context when the
// SetAllocator interface has not been called.
void PartialInitWithoutAllocator();
// Trick implementation: initializes the resources that require an Allocator
// after the SetAllocator interface has been called.
void PartialInitWithAllocator();
protected:
// NOTE: External users manage the resources. Used in inference scenarios:
// the Set interfaces are for inference only; DeviceContext marks such
// resources as external and does not delete them when it is destructed.
void SetStream(gpuStream_t);
void SetEigenDevice(Eigen::GpuDevice*);
void SetBlasHandle(blasHandle_t);
void SetDnnHandle(dnnHandle_t);
void SetSolverHandle(solverHandle_t);
void SetSparseHandle(sparseHandle_t);
void SetDnnWorkspaceHandle(DnnWorkspaceHandle*);
void SetComputeCapability(int val);
void SetMaxThreadsPerMultiProcessor(int val);
void SetMultiProcessors(int val);
void SetMaxThreadsPerBlock(int val);
void SetMaxGridDimSize(const std::array<int, 3>& val);
void SetDriverVersion(int val);
void SetRuntimeVersion(int val);
private:
struct Impl;
std::unique_ptr<Impl> impl_;
};
} // namespace pten
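The notes above describe two usage modes for GPUContext: a training path where the context owns its resources (set the Allocator, then Init), and an inference path where external code injects resources through the protected Set* interfaces. A minimal sketch of the training-style path, assuming SetAllocator is inherited from the DeviceContext base class and pten::Allocator is the expected allocator type:
#include "paddle/pten/backends/gpu/gpu_context.h"
// Sketch only: `alloc` stands for whatever allocator implementation the caller
// already owns; per the note above it must be set before Init().
void InitTrainingContext(const pten::GPUPlace& place,
                         const pten::Allocator* alloc /* assumed type */) {
  pten::GPUContext ctx(place);
  ctx.SetAllocator(alloc);  // assumed to come from the pten::DeviceContext base
  ctx.Init();               // the context now owns its stream and library handles
}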
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/pten/backends/gpu/forwards.h"
namespace pten {
#ifdef PADDLE_WITH_HIP
#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \
using GPU_TYPE = ROCM_TYPE;
#else // PADDLE_WITH_CUDA
#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \
using GPU_TYPE = CUDA_TYPE;
#endif
DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t);
DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t);
DECLARE_TYPE_FOR_GPU(dnnActivationDescriptor,
cudnnActivationStruct,
miopenActivationDescriptor);
DECLARE_TYPE_FOR_GPU(dnnTensorDescriptor,
cudnnTensorStruct,
miopenTensorDescriptor);
DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor,
cudnnFilterStruct,
miopenTensorDescriptor);
DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor_t,
cudnnFilterDescriptor_t,
miopenTensorDescriptor_t);
DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor,
cudnnConvolutionStruct,
miopenConvolutionDescriptor);
DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor_t,
cudnnConvolutionDescriptor_t,
miopenConvolutionDescriptor_t);
DECLARE_TYPE_FOR_GPU(dnnPoolingDescriptor_t,
cudnnPoolingDescriptor_t,
miopenPoolingDescriptor_t);
DECLARE_TYPE_FOR_GPU(dnnDropoutDescriptor_t,
cudnnDropoutDescriptor_t,
miopenDropoutDescriptor_t);
DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t);
DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle);
DECLARE_TYPE_FOR_GPU(solverHandle_t, cusolverDnHandle_t, rocsolver_handle);
DECLARE_TYPE_FOR_GPU(sparseHandle_t, cusparseHandle_t, rocsparse_handle);
#undef DECLARE_TYPE_FOR_GPU
using CUDAGraphID = unsigned long long; // NOLINT
} // namespace pten
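With these aliases, code layered above the backend can be written once against the gpu*/dnn*/blas* names and resolve to the CUDA or ROCM types at compile time. A hypothetical declaration-only sketch:
#include "paddle/pten/backends/gpu/gpu_decls.h"
namespace pten {
// Hypothetical interface, for illustration: under CUDA this resolves to
// (cudaStream_t, cudnnHandle_t, cublasHandle_t), under HIP to
// (hipStream_t, miopenHandle_t, rocblas_handle).
void BindExternalHandles(gpuStream_t stream, dnnHandle_t dnn, blasHandle_t blas);
}  // namespace pten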
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#ifdef PADDLE_WITH_HIP
#include "paddle/pten/backends/gpu/rocm/rocm_helper.h"
#else
#include "paddle/pten/backends/gpu/cuda/cuda_helper.h"
#endif
#define CUDA_KERNEL_LOOP(i, num) CUDA_KERNEL_LOOP_TYPE(i, num, int)
#endif
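CUDA_KERNEL_LOOP is the usual grid-stride loop helper with the loop index typed as int. A hypothetical kernel sketch of its intended use:
#include "paddle/pten/backends/gpu/gpu_helper.h"
// Illustration only: scales `num` elements; the macro strides the index across
// the whole grid, so any launch configuration covers the full range.
template <typename T>
__global__ void ScaleKernel(T* data, T factor, int num) {
  CUDA_KERNEL_LOOP(i, num) { data[i] *= factor; }
}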
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/gpu/gpu_info.h"
#include <vector>
#include "gflags/gflags.h"
DECLARE_string(selected_gpus);
namespace pten {
namespace backends {
namespace gpu {
static inline std::vector<std::string> Split(std::string const& original,
char separator) {
std::vector<std::string> results;
std::string token;
std::istringstream is(original);
while (std::getline(is, token, separator)) {
if (!token.empty()) {
results.push_back(token);
}
}
return results;
}
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetSelectedDevices() {
// use user specified GPUs in single-node multi-process mode.
std::vector<int> devices;
if (!FLAGS_selected_gpus.empty()) {
auto devices_str = Split(FLAGS_selected_gpus, ',');
for (auto id : devices_str) {
devices.push_back(atoi(id.c_str()));
}
} else {
int count = GetGPUDeviceCount();
for (int i = 0; i < count; ++i) {
devices.push_back(i);
}
}
return devices;
}
} // namespace gpu
} // namespace backends
} // namespace pten
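GetSelectedDevices either parses the selected_gpus flag or falls back to every visible device. A short usage sketch with a hypothetical helper:
#include <vector>
#include "paddle/pten/backends/gpu/gpu_info.h"
// Hypothetical helper, for illustration: with --selected_gpus=0,2 this returns 2;
// with the flag left empty it counts every device reported by the runtime.
int CountSelectedDevices() {
  return static_cast<int>(pten::backends::gpu::GetSelectedDevices().size());
}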