未验证 提交 a821c4a9 编写于 作者: W Wilber 提交者: GitHub

[PTEN] Add Gpu context (#39305)

上级 dcff7fa8
......@@ -33,7 +33,7 @@ namespace distributed {
template <typename T>
inline paddle::operators::math::BlasT<paddle::platform::CPUDeviceContext, T>
GetBlas() {
auto cpu_ctx = paddle::platform::CPUDeviceContext();
paddle::platform::CPUDeviceContext cpu_ctx;
return paddle::operators::math::GetBlas<paddle::platform::CPUDeviceContext,
T>(cpu_ctx);
}
......
......@@ -1155,7 +1155,7 @@ void GeoCommunicator::SendDense(const CommContext &send_ctx) {
auto &t_latest = var_latest->Get<framework::LoDTensor>();
auto t_timestamp = var_timestamp->GetMutable<framework::LoDTensor>();
auto cpu_ctx = paddle::platform::CPUDeviceContext();
paddle::platform::CPUDeviceContext cpu_ctx;
auto *var_delta = delta_scope_->Var(varname);
auto *t_delta = var_delta->GetMutable<framework::LoDTensor>();
t_delta->mutable_data<float>(t_latest.dims(), cpu_ctx.GetPlace());
......@@ -1185,7 +1185,7 @@ void GeoCommunicator::RecvDense(const CommContext &send_ctx) {
RpcRecvDense(varnames, table_id, pserver_scope_.get());
// 2.1 pserver - old => delta; 2.2 latest + old => latest 2.3 old => pserver
auto cpu_ctx = paddle::platform::CPUDeviceContext();
paddle::platform::CPUDeviceContext cpu_ctx;
for (auto &varname : varnames) {
auto *var_latest = recv_scope_->FindVar(varname);
auto t_latest = var_latest->GetMutable<framework::LoDTensor>();
......@@ -1292,7 +1292,7 @@ void GeoCommunicator::SendSparse(const std::string &varname,
auto *t_old = var_old->GetMutable<framework::LoDTensor>();
auto dims1 = t_latest.dims()[1];
auto cpu_ctx = paddle::platform::CPUDeviceContext();
paddle::platform::CPUDeviceContext cpu_ctx;
auto *var_delta = delta_scope_->Var(varname);
auto *t_delta = var_delta->GetMutable<pten::SelectedRows>();
......@@ -1370,7 +1370,7 @@ void GeoCommunicator::RecvSparse(const std::string &varname, int table_id,
std::vector<float> v_delta;
v_delta.resize(numel);
auto cpu_ctx = paddle::platform::CPUDeviceContext();
paddle::platform::CPUDeviceContext cpu_ctx;
auto blas =
paddle::operators::math::GetBlas<platform::CPUDeviceContext, float>(
cpu_ctx);
......
......@@ -179,7 +179,7 @@ inline void MergeVars(const std::string &var_name,
}
// set output tensor to 0.
auto cpu_ctx = paddle::platform::CPUDeviceContext();
paddle::platform::CPUDeviceContext cpu_ctx;
paddle::operators::math::SetConstant<paddle::platform::CPUDeviceContext, T>
constant_functor;
constant_functor(cpu_ctx, out_t, static_cast<T>(0));
......@@ -204,7 +204,7 @@ inline void MergeVars(const std::string &var_name,
for (auto &var : vars) {
inputs.push_back(&var->Get<pten::SelectedRows>());
}
auto dev_ctx = paddle::platform::CPUDeviceContext();
paddle::platform::CPUDeviceContext dev_ctx;
if (merge_add) {
paddle::operators::math::scatter::MergeAdd<
paddle::platform::CPUDeviceContext, T>
......
......@@ -21,7 +21,10 @@ TEST(DataTypeTransform, GPUTransform) {
auto cpu_place = paddle::platform::CPUPlace();
auto gpu_place = paddle::platform::CUDAPlace(0);
paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
auto kernel_fp16 = paddle::framework::OpKernelType(
paddle::framework::proto::VarType::FP16, gpu_place,
paddle::framework::DataLayout::kAnyLayout,
......
......@@ -1361,7 +1361,7 @@ void ParallelExecutor::PrepareNCCLCommunicator(Scope *global_scope) {
auto *dev_ctx = static_cast<platform::XPUDeviceContext *>(
pool.Get(member_->places_[dev_id]));
auto &bkcl_ctx = bkcl_ctxs->at(member_->places_[dev_id]);
dev_ctx->set_bkcl_context(bkcl_ctx.comm());
dev_ctx->SetBkclContext(bkcl_ctx.comm());
}
#else
PADDLE_THROW(
......
......@@ -77,6 +77,13 @@ struct ConvertToPtenContext<platform::CPUDeviceContext> {
using TYPE = pten::CPUContext;
};
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <>
struct ConvertToPtenContext<platform::CUDADeviceContext> {
using TYPE = pten::GPUContext;
};
#endif
#ifdef PADDLE_WITH_XPU
template <>
struct ConvertToPtenContext<platform::XPUDeviceContext> {
......
......@@ -1085,7 +1085,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
is.seekg(seekg, is.cur);
void* buf;
auto ctx = platform::CPUDeviceContext();
platform::CPUDeviceContext ctx;
size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
platform::is_xpu_place(dev_ctx.GetPlace()) ||
......@@ -1155,7 +1155,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
tensor->Resize(framework::make_ddim(dims));
void* buf;
auto ctx = platform::CPUDeviceContext();
platform::CPUDeviceContext ctx;
size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
platform::is_xpu_place(dev_ctx.GetPlace()) ||
......@@ -1432,4 +1432,4 @@ std::ostream& operator<<(std::ostream& os, const pten::DenseTensor& t) {
VLOG(1) << "PrintVar: unrecognized data type:" << t.type();
return os;
}
}
} // namespace pten
......@@ -73,6 +73,10 @@ TEST(TensorCopy, Tensor) {
// CPU Tensor to GPU Tensor
auto gpu_place = new platform::CUDAPlace(0);
platform::CUDADeviceContext gpu_ctx(*gpu_place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*gpu_place, gpu_ctx.stream())
.get());
gpu_ctx.PartialInitWithAllocator();
TensorCopy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
// GPU Tensor to CPU Tensor
......@@ -166,6 +170,10 @@ TEST(TensorFromVector, Tensor) {
gpu_tensor.Resize(paddle::framework::make_ddim({3, 3}));
auto gpu_place = new paddle::platform::CUDAPlace();
paddle::platform::CUDADeviceContext gpu_ctx(*gpu_place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*gpu_place, gpu_ctx.stream())
.get());
gpu_ctx.PartialInitWithAllocator();
paddle::framework::TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
// Copy from GPU to CPU tensor for comparison
paddle::framework::TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
......@@ -230,6 +238,10 @@ TEST(TensorToVector, Tensor) {
paddle::framework::Tensor gpu_tensor;
paddle::platform::CUDAPlace place;
paddle::platform::CUDADeviceContext gpu_ctx(place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, gpu_ctx.stream())
.get());
gpu_ctx.PartialInitWithAllocator();
paddle::framework::TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
std::vector<int> dst;
......@@ -267,6 +279,10 @@ TEST(TensorToVector, Tensor_bool) {
paddle::framework::Tensor gpu_tensor;
paddle::platform::CUDAPlace place;
paddle::platform::CUDADeviceContext gpu_ctx(place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, gpu_ctx.stream())
.get());
gpu_ctx.PartialInitWithAllocator();
paddle::framework::TensorFromVector<bool>(src_vec, gpu_ctx, &gpu_tensor);
std::vector<bool> dst;
......@@ -493,6 +509,10 @@ TEST(Tensor, FromAndToStream) {
auto gpu_place = new platform::CUDAPlace();
platform::CUDADeviceContext gpu_ctx(*gpu_place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*gpu_place, gpu_ctx.stream())
.get());
gpu_ctx.PartialInitWithAllocator();
TensorCopy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
......
......@@ -46,6 +46,17 @@ void GLOOParallelContext::Init() {
gloo_wrapper->Init();
device_ = std::unique_ptr<platform::CPUDeviceContext>(
new platform::CPUDeviceContext(platform::CPUPlace()));
device_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(platform::CPUPlace())
.get());
device_->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
device_->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(platform::CPUPlace())
.get());
}
void GLOOParallelContext::InitWithRingID(int ring_id) {
......
......@@ -77,6 +77,10 @@ void make_fake_model(std::string* model, std::string* param) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
#else
platform::CPUPlace place;
platform::CPUDeviceContext ctx(place);
......
......@@ -27,6 +27,18 @@ class TensorRTEngineTest : public ::testing::Test {
protected:
void SetUp() override {
ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0));
ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(platform::CUDAPlace(0), ctx_->stream())
.get());
ctx_->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
ctx_->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(platform::CUDAPlace(0))
.get());
ctx_->PartialInitWithAllocator();
engine_ = new TensorRTEngine(10, 1 << 10);
engine_->InitNetwork();
......
......@@ -18,6 +18,7 @@
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cuda_allocator.h"
#include "paddle/fluid/memory/allocation/locked_allocator.h"
......@@ -44,6 +45,10 @@ TEST(BestFitAllocator, concurrent_cuda) {
platform::CUDAPlace gpu(0);
platform::CUDADeviceContext dev_ctx(gpu);
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu, dev_ctx.stream())
.get());
dev_ctx.PartialInitWithAllocator();
auto th_main = [&](std::random_device::result_type seed) {
std::default_random_engine engine(seed);
......
......@@ -25,6 +25,7 @@
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/device_context.h"
......@@ -105,8 +106,21 @@ TEST(Malloc, CUDADeviceContextMultiStream) {
main_stream_alloc_ptr.reset();
for (int i = 0; i < NUM_STREAMS; ++i) {
dev_ctx.push_back(std::unique_ptr<platform::CUDADeviceContext>(
new platform::CUDADeviceContext(place)));
auto ctx = std::unique_ptr<platform::CUDADeviceContext>(
new platform::CUDADeviceContext(place));
ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx->stream())
.get());
ctx->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
ctx->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place)
.get());
ctx->PartialInitWithAllocator();
dev_ctx.emplace_back(std::move(ctx));
MultiStreamCompute(&data[i], &second_data[i], *dev_ctx[i]);
}
......@@ -144,8 +158,21 @@ TEST(Malloc, CUDADeviceContextMultiThreadMultiStream) {
main_stream_alloc_ptr.reset();
for (int i = 0; i < NUM_STREAMS; ++i) {
dev_ctx.push_back(std::unique_ptr<platform::CUDADeviceContext>(
new platform::CUDADeviceContext(place)));
auto ctx = std::unique_ptr<platform::CUDADeviceContext>(
new platform::CUDADeviceContext(place));
ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx->stream())
.get());
ctx->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
ctx->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place)
.get());
ctx->PartialInitWithAllocator();
dev_ctx.emplace_back(std::move(ctx));
threads.push_back(std::thread(MultiStreamCompute, &data[i], &second_data[i],
std::cref(*dev_ctx[i])));
}
......
......@@ -110,7 +110,7 @@ void ComputeFullArg(const platform::CUDADeviceContext& ctx, const Tensor& input,
return block_size;
};
int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize().x;
int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0];
int64_t height = pre * post;
int64_t width = n;
int64_t grid_size = height < max_grid_dimx ? height : max_grid_dimx;
......
......@@ -131,7 +131,7 @@ void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input,
int block_size = ComputeBlockSize(num_cols);
int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x;
int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0];
// actually, int num_rows < max_grid_size
int grid_size = num_rows < maxGridDimX ? num_rows : maxGridDimX;
// Init a index array
......@@ -212,7 +212,7 @@ void ArgFullAssign(const platform::CUDADeviceContext& ctx, const Tensor* dO,
int block_size = ComputeBlockSize(num_cols);
int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x;
int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0];
// actually, int num_rows < max_grid_size
int grid_size = num_rows < maxGridDimX ? num_rows : maxGridDimX;
FillGrad<<<grid_size, block_size, 0, cu_stream>>>(
......
......@@ -90,8 +90,8 @@ class CUDABroadcastTensorsGradOpKernel : public framework::OpKernel<T> {
// reduce_sum implementation on CUDA
auto stream = context.cuda_device_context().stream();
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
*input_tensor, output_tensor, kps::IdentityFunctor<T>(),
reduce_dims_vec, stream);
context.cuda_device_context(), *input_tensor, output_tensor,
kps::IdentityFunctor<T>(), reduce_dims_vec, stream);
}
}
}
......
......@@ -115,7 +115,8 @@ class MatrixReduceSumFunctor<platform::CUDADeviceContext, T> {
}
gpuStream_t stream = ctx.cuda_device_context().stream();
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
in, out, kps::IdentityFunctor<T>(), out_reduce_dims, stream);
ctx.cuda_device_context(), in, out, kps::IdentityFunctor<T>(),
out_reduce_dims, stream);
}
};
......
......@@ -77,7 +77,7 @@ class ClipByNormKernel<platform::CUDADeviceContext, platform::float16>
{1}, dev_ctx);
TensorReduceFunctorImpl<platform::float16, float, kps::AddFunctor,
kps::SquareFunctor<platform::float16, float>>(
*input, &tmp, kps::SquareFunctor<platform::float16, float>(),
dev_ctx, *input, &tmp, kps::SquareFunctor<platform::float16, float>(),
reduce_dims, dev_ctx.stream());
auto tmp_eigen = EigenVector<float>::Flatten(tmp);
auto x_norm = tmp_eigen.sqrt();
......
......@@ -65,7 +65,8 @@ class CompareReduceOpKernel
auto stream = context.cuda_device_context().stream();
TensorReduceFunctorImpl<bool, bool, BitwiseAdd,
kps::IdentityFunctor<bool>>(
tmp, z, kps::IdentityFunctor<bool>(), reduce_dims, stream);
context.cuda_device_context(), tmp, z, kps::IdentityFunctor<bool>(),
reduce_dims, stream);
}
}
};
......
......@@ -131,12 +131,20 @@ void Compare2(f::Scope* scope, const p::DeviceContext& ctx,
TEST(copy_cross_scope, CUDA_fp32) {
f::Scope scope;
p::CUDADeviceContext ctx(p::CUDAPlace(0));
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(p::CUDAPlace(0), ctx.stream())
.get());
ctx.PartialInitWithAllocator();
Compare1<float>(&scope, ctx, "copy_cross_scope");
}
TEST(copy_cross_scope_to_main_scope, CUDA_fp32) {
f::Scope scope;
p::CUDADeviceContext ctx(p::CUDAPlace(0));
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(p::CUDAPlace(0), ctx.stream())
.get());
ctx.PartialInitWithAllocator();
Compare2<float>(&scope, ctx, "copy_cross_scope");
}
#elif PADDLE_WITH_ASCEND_CL
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_mul_op.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
......@@ -51,8 +52,8 @@ class ElementwiseMulKernel<platform::CUDADeviceContext, T>
auto pt_x = paddle::experimental::MakePtenDenseTensor(*x_lod);
auto pt_y = paddle::experimental::MakePtenDenseTensor(*y_lod);
auto pt_z = paddle::experimental::MakePtenDenseTensor(*z_lod);
pten::MultiplyRawKernel<T>(cuda_ctx, *pt_x.get(), *pt_y.get(), axis,
pt_z.get());
pten::MultiplyRawKernel<T>(static_cast<const pten::GPUContext&>(cuda_ctx),
*pt_x.get(), *pt_y.get(), axis, pt_z.get());
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"X's type[%s] is not supported by elementwise_op. X's type should be "
......
......@@ -1189,7 +1189,8 @@ void ReduceWrapper(const platform::CUDADeviceContext &dev_ctx, int axis,
framework::Tensor *src, framework::Tensor *dst) {
std::vector<int> reduce_dims = GetReduceDim(dst->dims(), src->dims(), axis);
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
*src, dst, kps::IdentityFunctor<T>(), reduce_dims, dev_ctx.stream());
dev_ctx, *src, dst, kps::IdentityFunctor<T>(), reduce_dims,
dev_ctx.stream());
}
template <ElementwiseType ET, typename T, typename Functor>
......
......@@ -275,6 +275,18 @@ class TestFeedForward {
output_size_ = 3 * num_head_ * dim_head_;
input_size_ = dim_embed_;
ctx_ = new platform::CUDADeviceContext(place_);
ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place_, ctx_->stream())
.get());
ctx_->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
ctx_->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place_)
.get());
ctx_->PartialInitWithAllocator();
size_src_ = bsz_seq_ * dim_embed_; // src: [bs, seq_len, em_dim]
size_weight_ = dim_embed_ * output_size_; // weight: [output_size, em_dim]
......
......@@ -166,7 +166,8 @@ class AttnMatMul {
if (support_case_1 || support_case_2) {
gpuStream_t stream = dev_ctx_.stream();
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
*d_output, d_bias, kps::IdentityFunctor<T>(), {0, 1}, stream);
dev_ctx_, *d_output, d_bias, kps::IdentityFunctor<T>(), {0, 1},
stream);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Only support reduce when the input dims are [0,1,2,3,4] and "
......
......@@ -108,7 +108,7 @@ static bool TryLaunchFP16FastGeluFwdVectorizeCUDAKernel(
is_aligned(y, kAlignment)) { \
size_t thread = std::min<size_t>(512, dev_ctx.GetMaxThreadsPerBlock()); \
size_t block = (n / __vec_size + thread - 1) / thread; \
block = std::min<size_t>(block, dev_ctx.GetCUDAMaxGridDimSize().x); \
block = std::min<size_t>(block, dev_ctx.GetCUDAMaxGridDimSize()[0]); \
VLOG(10) << "Use FP16 fast gelu fwd kernel, block = " << block \
<< " , thread = " << thread; \
FP16FastGeluFwdCUDAKernel< \
......@@ -144,7 +144,7 @@ static bool TryLaunchFP16FastGeluBwdVectorizeCUDAKernel(
is_aligned(x_g, kAlignment)) { \
size_t thread = std::min<size_t>(512, dev_ctx.GetMaxThreadsPerBlock()); \
size_t block = (n / __vec_size + thread - 1) / thread; \
block = std::min<size_t>(block, dev_ctx.GetCUDAMaxGridDimSize().x); \
block = std::min<size_t>(block, dev_ctx.GetCUDAMaxGridDimSize()[0]); \
VLOG(10) << "Use FP16 fast gelu bwd kernel, block = " << block \
<< " , thread = " << thread; \
FP16FastGeluBwdCUDAKernel< \
......
......@@ -260,7 +260,7 @@ void FillHashTable(const framework::ExecutionContext& ctx, const T* input,
int block = 1024;
#endif
const auto& dev_ctx = ctx.cuda_device_context();
int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x;
int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
int grid_tmp = (num_input + block - 1) / block;
int grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
// 1. Insert data into keys and values.
......@@ -334,7 +334,7 @@ void ReindexFunc(const framework::ExecutionContext& ctx,
int block = 1024;
#endif
const auto& dev_ctx = ctx.cuda_device_context();
int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x;
int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
int64_t grid_tmp = (outputs->size() + block - 1) / block;
int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
ReindexSrcOutput<
......
......@@ -197,7 +197,7 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(
#endif
int64_t n = slice_size * index_size;
const auto& dev_ctx = ctx.cuda_device_context();
int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x;
int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
int64_t grid_tmp = (n + block - 1) / block;
int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
int64_t input_size = src_dims[0];
......@@ -320,7 +320,7 @@ void GraphSendRecvGradOpCUDAKernelLaunchHelper(
#endif
int64_t n = slice_size * index_size;
const auto& dev_ctx = ctx.cuda_device_context();
int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x;
int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
int64_t grid_tmp = (n + block - 1) / block;
int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
int64_t input_size = src_dims[0];
......
......@@ -92,7 +92,7 @@ struct OneHotGenerator<platform::CUDADeviceContext, T> {
const int size_from_axis = SizeFromAxis(axis, X.dims());
const int size_out_axis = SizeOutAxis(axis, X.dims());
constexpr int thread_size = 512;
int64_t max_grid_dimx = context.GetCUDAMaxGridDimSize().x;
int64_t max_grid_dimx = context.GetCUDAMaxGridDimSize()[0];
int64_t height = size_to_axis * size_out_axis;
int block_size = height < max_grid_dimx ? height : max_grid_dimx;
......
......@@ -27,10 +27,10 @@ namespace operators {
namespace {
void LimitGridDim(const framework::ExecutionContext& ctx, dim3* grid_dim) {
dim3 max_grid_dim = ctx.template device_context<platform::CUDADeviceContext>()
auto max_grid_dim = ctx.template device_context<platform::CUDADeviceContext>()
.GetCUDAMaxGridDimSize();
grid_dim->x = grid_dim->x < max_grid_dim.x ? grid_dim->x : max_grid_dim.x;
grid_dim->y = grid_dim->y < max_grid_dim.y ? grid_dim->y : max_grid_dim.y;
grid_dim->x = grid_dim->x < max_grid_dim[0] ? grid_dim->x : max_grid_dim[0];
grid_dim->y = grid_dim->y < max_grid_dim[1] ? grid_dim->y : max_grid_dim[1];
}
}
......
......@@ -45,11 +45,11 @@ inline platform::GpuLaunchConfig GetGpuLaunchConfig3D(
int block_y = std::min(GetLastPow2(height), max_threads / block_x);
int block_z = std::min(num_img, max_threads / block_x / block_y);
dim3 max_grid_dim = context.GetCUDAMaxGridDimSize();
int grid_x = std::min<int>(max_grid_dim.x, platform::DivUp(width, block_x));
int grid_y = std::min<int>(max_grid_dim.y, platform::DivUp(height, block_y));
auto max_grid_dim = context.GetCUDAMaxGridDimSize();
int grid_x = std::min<int>(max_grid_dim[0], platform::DivUp(width, block_x));
int grid_y = std::min<int>(max_grid_dim[1], platform::DivUp(height, block_y));
int grid_z =
std::min<int>(max_grid_dim.z, platform::DivUp(num_img, block_z * 4));
std::min<int>(max_grid_dim[2], platform::DivUp(num_img, block_z * 4));
const int capability = context.GetComputeCapability();
platform::GpuLaunchConfig config;
......
......@@ -306,11 +306,11 @@ struct KronGradOpFunctor {
auto stream = dev_ctx.stream(); // it is a cuda device_context
if (dx) {
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
dout_x, dx, kps::IdentityFunctor<T>(), {1}, stream);
dev_ctx, dout_x, dx, kps::IdentityFunctor<T>(), {1}, stream);
}
if (dy) {
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
dout_y, dy, kps::IdentityFunctor<T>(), {1}, stream);
dev_ctx, dout_y, dy, kps::IdentityFunctor<T>(), {1}, stream);
}
#else
auto* place = dev_ctx.eigen_device();
......
......@@ -54,7 +54,7 @@ bool SortKthvalue(const platform::CUDADeviceContext& ctx,
input_indices.mutable_data<int64_t>(ctx.GetPlace());
size_t temp_storage_bytes = -1;
int block_size = getBlockSize(num_cols);
unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x;
unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0];
unsigned int grid_size = num_rows < maxGridDimX
? static_cast<unsigned int>(num_rows)
: maxGridDimX;
......
......@@ -72,6 +72,10 @@ TEST(LiteEngineOp, engine_op) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
#else
platform::CPUPlace place;
platform::CPUDeviceContext ctx(place);
......
......@@ -299,7 +299,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
T* logits_max_buff = logits_max.mutable_data<T>(place);
TensorReduceFunctorImpl<T, T, kps::MaxFunctor, kps::IdentityFunctor<T>>(
softmax_2d, &logits_max, kps::IdentityFunctor<T>(), {1},
dev_ctx, softmax_2d, &logits_max, kps::IdentityFunctor<T>(), {1},
dev_ctx.stream());
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
......@@ -321,7 +321,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
T* sum_exp_logits_buff = sum_exp_logits.mutable_data<T>(place);
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::ExpFunctor<T>>(
softmax_2d, &sum_exp_logits, kps::ExpFunctor<T>(), {1},
dev_ctx, softmax_2d, &sum_exp_logits, kps::ExpFunctor<T>(), {1},
dev_ctx.stream());
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
......
......@@ -15,6 +15,8 @@ limitations under the License. */
#include "paddle/fluid/operators/math/beam_search.h"
#include <gtest/gtest.h>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
void PrepareCPUTensors(paddle::framework::LoDTensor* ids,
paddle::framework::LoDTensor* scores,
......@@ -129,6 +131,83 @@ void TestBeamSearch() {
delete context;
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <>
void TestBeamSearch<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace>() {
paddle::framework::LoDTensor ids;
paddle::framework::LoDTensor scores;
paddle::framework::LoDTensor pre_ids;
paddle::framework::LoDTensor pre_scores;
auto* place = new paddle::platform::CUDAPlace();
auto* context = new paddle::platform::CUDADeviceContext(*place);
context->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*place, context->stream())
.get());
context->PartialInitWithAllocator();
if (paddle::platform::is_cpu_place(*place)) {
PrepareCPUTensors(&ids, &scores, &pre_ids, &pre_scores);
} else {
paddle::framework::LoDTensor cpu_ids;
paddle::framework::LoDTensor cpu_scores;
paddle::framework::LoDTensor cpu_pre_ids;
paddle::framework::LoDTensor cpu_pre_scores;
PrepareCPUTensors(&cpu_ids, &cpu_scores, &cpu_pre_ids, &cpu_pre_scores);
paddle::framework::TensorCopySync(cpu_ids, *place, &ids);
paddle::framework::TensorCopySync(cpu_scores, *place, &scores);
paddle::framework::TensorCopySync(cpu_pre_ids, *place, &pre_ids);
paddle::framework::TensorCopySync(cpu_pre_scores, *place, &pre_scores);
ids.set_lod(cpu_ids.lod());
scores.set_lod(cpu_scores.lod());
pre_ids.set_lod(cpu_pre_ids.lod());
pre_scores.set_lod(cpu_pre_scores.lod());
}
paddle::framework::LoDTensor selected_ids;
paddle::framework::LoDTensor selected_scores;
paddle::framework::LoDTensor parent_idx;
size_t level = 0;
size_t beam_size = 2;
int end_id = 0;
paddle::operators::math::BeamSearchFunctor<
paddle::platform::CUDADeviceContext, float>
beamsearch;
beamsearch(*context, &pre_ids, &pre_scores, &ids, &scores, &selected_ids,
&selected_scores, &parent_idx, level, beam_size, end_id, true);
ASSERT_EQ(selected_ids.lod(), selected_scores.lod());
paddle::framework::LoDTensor cpu_selected_ids;
paddle::framework::LoDTensor cpu_selected_scores;
if (paddle::platform::is_cpu_place(*place)) {
cpu_selected_ids = selected_ids;
cpu_selected_scores = selected_scores;
} else {
paddle::framework::TensorCopySync(
selected_ids, paddle::platform::CPUPlace(), &cpu_selected_ids);
paddle::framework::TensorCopySync(
selected_scores, paddle::platform::CPUPlace(), &cpu_selected_scores);
cpu_selected_ids.set_lod(selected_ids.lod());
cpu_selected_scores.set_lod(selected_scores.lod());
}
std::vector<int64_t> expected_ids({4, 5, 3, 8});
std::vector<float> expected_scores({0.6f, 0.5f, 0.9f, 0.7f});
for (int i = 0; i < 4; i++) {
ASSERT_EQ(expected_ids[i], cpu_selected_ids.data<int64_t>()[i]);
ASSERT_EQ(expected_scores[i], cpu_selected_scores.data<float>()[i]);
}
delete place;
delete context;
}
#endif
TEST(BeamSearch, CPU) {
TestBeamSearch<paddle::platform::CPUDeviceContext,
paddle::platform::CPUPlace>();
......
......@@ -18,6 +18,7 @@
#include "paddle/fluid/platform/dynload/cublas.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
DECLARE_bool(enable_cublas_tensor_op_math);
......@@ -92,6 +93,32 @@ struct CUBlas<float> {
#endif
}
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template <typename... ARGS>
static void GEMM_EX(pten::GPUContext *dev_ctx, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const float *alpha, const void *A, cudaDataType_t Atype,
int lda, const void *B, cudaDataType_t Btype, int ldb,
const float *beta, void *C, cudaDataType_t Ctype,
int ldc) {
// Because the gcc 4.8 doesn't expand template parameter pack that
// appears in a lambda-expression, I can not use template parameter pack
// here.
#if CUDA_VERSION >= 8000
VLOG(5) << "use_tensor_op_math: "
<< (dev_ctx->tensor_core_available() ? "True" : "False");
dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasSgemmEx(
handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb,
beta, C, Ctype, ldc));
});
#else
PADDLE_THROW(platform::errors::Unimplemented(
"cublasSgemmEx is not supported on cuda <= 7.5"));
#endif
}
template <typename... ARGS>
static void TRSM(ARGS... args) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasStrsm(args...));
......@@ -273,6 +300,37 @@ struct CUBlas<platform::float16> {
"cublasGemmEx is not supported on cuda <= 7.5"));
#endif
}
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template <typename... ARGS>
static void GEMM_EX(pten::GPUContext *dev_ctx, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const void *alpha, const void *A, cudaDataType_t Atype,
int lda, const void *B, cudaDataType_t Btype, int ldb,
const void *beta, void *C, cudaDataType_t Ctype, int ldc,
cudaDataType_t computeType) {
#if CUDA_VERSION >= 8000
cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
#if CUDA_VERSION >= 9000
bool use_tensor_op_math = dev_ctx->tensor_core_available();
if (use_tensor_op_math) {
algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
}
VLOG(5) << "use_tensor_op_math: "
<< (use_tensor_op_math ? "True" : "False");
#endif // CUDA_VERSION >= 9000
dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmEx(
handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb,
beta, C, Ctype, ldc, computeType, algo));
});
#else
PADDLE_THROW(platform::errors::Unimplemented(
"cublasGemmEx is not supported on cuda <= 7.5"));
#endif
}
};
template <>
......@@ -388,6 +446,37 @@ struct CUBlas<platform::complex<float>> {
#endif
}
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template <typename... ARGS>
static void GEMM_EX(pten::GPUContext *dev_ctx, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const void *alpha, const void *A, cudaDataType_t Atype,
int lda, const void *B, cudaDataType_t Btype, int ldb,
const void *beta, void *C, cudaDataType_t Ctype, int ldc,
cudaDataType_t computeType) {
#if CUDA_VERSION >= 8000
cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
#if CUDA_VERSION >= 9000
bool use_tensor_op_math = dev_ctx->tensor_core_available();
if (use_tensor_op_math) {
algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
}
VLOG(5) << "use_tensor_op_math: "
<< (use_tensor_op_math ? "True" : "False");
#endif // CUDA_VERSION >= 9000
dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmEx(
handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb,
beta, C, Ctype, ldc, computeType, algo));
});
#else
PADDLE_THROW(platform::errors::Unimplemented(
"cublasGemmEx is not supported on cuda <= 7.5"));
#endif
}
static void TRSM_BATCH(cublasHandle_t handle, cublasSideMode_t side,
cublasFillMode_t uplo, cublasOperation_t transa,
cublasDiagType_t diag, int m, int n,
......@@ -529,6 +618,37 @@ struct CUBlas<platform::complex<double>> {
"cublasGemmEx is not supported on cuda <= 7.5"));
#endif
}
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template <typename... ARGS>
static void GEMM_EX(pten::GPUContext *dev_ctx, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const void *alpha, const void *A, cudaDataType_t Atype,
int lda, const void *B, cudaDataType_t Btype, int ldb,
const void *beta, void *C, cudaDataType_t Ctype, int ldc,
cudaDataType_t computeType) {
#if CUDA_VERSION >= 8000
cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
#if CUDA_VERSION >= 9000
bool use_tensor_op_math = dev_ctx->tensor_core_available();
if (use_tensor_op_math) {
algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
}
VLOG(5) << "use_tensor_op_math: "
<< (use_tensor_op_math ? "True" : "False");
#endif // CUDA_VERSION >= 9000
dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmEx(
handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb,
beta, C, Ctype, ldc, computeType, algo));
});
#else
PADDLE_THROW(platform::errors::Unimplemented(
"cublasGemmEx is not supported on cuda <= 7.5"));
#endif
}
};
template <>
......@@ -564,6 +684,39 @@ void Blas<platform::CUDADeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
#endif // CUDA_VERSION >= 8000
}
template <>
template <typename T>
void Blas<pten::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M, int N, int K,
T alpha, const T *A, const T *B, T beta,
T *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
#if CUDA_VERSION >= 8000
if (FLAGS_enable_cublas_tensor_op_math && std::is_same<T, float>::value) {
auto &cuda_ctx = const_cast<pten::GPUContext &>(context_);
CUBlas<T>::GEMM_EX(&cuda_ctx, cuTransB, cuTransA, N, M, K, &alpha, B,
CUDA_R_32F, ldb, A, CUDA_R_32F, lda, &beta, C,
CUDA_R_32F, N);
} else {
#endif // CUDA_VERSION >= 8000
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A,
lda, &beta, C, N);
});
#if CUDA_VERSION >= 8000
}
#endif // CUDA_VERSION >= 8000
}
template <>
template <>
inline void Blas<platform::CUDADeviceContext>::GEMM(
......@@ -611,6 +764,55 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
#endif // CUDA_VERSION >= 8000
}
template <>
template <>
inline void Blas<pten::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M, int N,
int K, platform::float16 alpha,
const platform::float16 *A,
const platform::float16 *B,
platform::float16 beta,
platform::float16 *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(
context_.GetComputeCapability(), 53,
platform::errors::InvalidArgument(
"cublas fp16 gemm requires GPU compute capability >= 53,"
"but received %d",
context_.GetComputeCapability()));
float h_alpha = static_cast<float>(alpha);
float h_beta = static_cast<float>(beta);
#if CUDA_VERSION >= 8000
// cublasHgemm does true FP16 computation which is slow for non-Volta
// GPUs. So use cublasGemmEx instead which does pesudo FP16 computation:
// input/output in fp16, computation in fp32, which can also be accelerated
// using tensor cores in volta GPUs.
auto &cuda_ctx = const_cast<pten::GPUContext &>(context_);
CUBlas<platform::float16>::GEMM_EX(
&cuda_ctx, cuTransB, cuTransA, N, M, K, &h_alpha, B, CUDA_R_16F, ldb, A,
CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N, CUDA_R_32F);
#else
// CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<platform::float16>::GEMM(handle, cuTransB, cuTransA, N, M, K,
&h_alpha, h_B, ldb, h_A, lda, &h_beta, h_C,
N);
});
#endif // CUDA_VERSION >= 8000
}
template <>
template <>
inline void Blas<platform::CUDADeviceContext>::GEMM(
......@@ -659,6 +861,56 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
#endif // CUDA_VERSION >= 8000
}
template <>
template <>
inline void Blas<pten::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M, int N,
int K, platform::complex<float> alpha,
const platform::complex<float> *A,
const platform::complex<float> *B,
platform::complex<float> beta,
platform::complex<float> *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(
context_.GetComputeCapability(), 53,
platform::errors::InvalidArgument(
"cublas complex64 gemm requires GPU compute capability >= 53,"
"but received %d",
context_.GetComputeCapability()));
thrust::complex<float> c_alpha =
thrust::complex<float>(alpha.real, alpha.imag);
thrust::complex<float> c_beta = thrust::complex<float>(beta.real, beta.imag);
#if CUDA_VERSION >= 8000
// cublasHgemm does true FP16 computation which is slow for non-Volta
// GPUs. So use cublasGemmEx instead which does pesudo FP16 computation:
// input/output in fp16, computation in fp32, which can also be accelerated
// using tensor cores in volta GPUs.
auto &cuda_ctx = const_cast<pten::GPUContext &>(context_);
CUBlas<platform::complex<float>>::GEMM_EX(
&cuda_ctx, cuTransB, cuTransA, N, M, K, &c_alpha, B, CUDA_C_32F, ldb, A,
CUDA_C_32F, lda, &c_beta, C, CUDA_C_32F, N, CUDA_C_32F);
#else
// CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<platform::complex<float>>::GEMM(handle, cuTransB, cuTransA, N, M, K,
&c_alpha, h_B, ldb, h_A, lda,
&c_beta, h_C, N);
});
#endif // CUDA_VERSION >= 8000
}
template <>
template <>
inline void Blas<platform::CUDADeviceContext>::GEMM(
......@@ -708,6 +960,57 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
#endif // CUDA_VERSION >= 8000
}
template <>
template <>
inline void Blas<pten::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M, int N,
int K, platform::complex<double> alpha,
const platform::complex<double> *A,
const platform::complex<double> *B,
platform::complex<double> beta,
platform::complex<double> *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(
context_.GetComputeCapability(), 53,
platform::errors::InvalidArgument(
"cublas complex128 gemm requires GPU compute capability >= 53,"
"but received %d",
context_.GetComputeCapability()));
thrust::complex<double> c_alpha =
thrust::complex<double>(alpha.real, alpha.imag);
thrust::complex<double> c_beta =
thrust::complex<double>(beta.real, beta.imag);
#if CUDA_VERSION >= 8000
// cublasHgemm does true FP16 computation which is slow for non-Volta
// GPUs. So use cublasGemmEx instead which does pesudo FP16 computation:
// input/output in fp16, computation in fp32, which can also be accelerated
// using tensor cores in volta GPUs.
auto &cuda_ctx = const_cast<pten::GPUContext &>(context_);
CUBlas<platform::complex<double>>::GEMM_EX(
&cuda_ctx, cuTransB, cuTransA, N, M, K, &c_alpha, B, CUDA_C_64F, ldb, A,
CUDA_C_64F, lda, &c_beta, C, CUDA_C_64F, N, CUDA_C_64F);
#else
// CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<platform::complex<double>>::GEMM(handle, cuTransB, cuTransA, N, M, K,
&c_alpha, h_B, ldb, h_A, lda,
&c_beta, h_C, N);
});
#endif // CUDA_VERSION >= 8000
}
template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::GEMM(bool transA, bool transB, int M,
......@@ -738,6 +1041,35 @@ void Blas<platform::CUDADeviceContext>::GEMM(bool transA, bool transB, int M,
#endif // CUDA_VERSION >= 8000
}
template <>
template <typename T>
void Blas<pten::GPUContext>::GEMM(bool transA, bool transB, int M, int N, int K,
T alpha, const T *A, int lda, const T *B,
int ldb, T beta, T *C, int ldc) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
#if CUDA_VERSION >= 8000
if (FLAGS_enable_cublas_tensor_op_math && std::is_same<T, float>::value) {
auto &cuda_ctx = const_cast<pten::GPUContext &>(context_);
CUBlas<T>::GEMM_EX(&cuda_ctx, cuTransB, cuTransA, N, M, K, &alpha, B,
CUDA_R_32F, ldb, A, CUDA_R_32F, lda, &beta, C,
CUDA_R_32F, ldc);
} else {
#endif // CUDA_VERSION >= 8000
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A,
lda, &beta, C, ldc);
});
#if CUDA_VERSION >= 8000
}
#endif // CUDA_VERSION >= 8000
}
template <>
template <>
inline void Blas<platform::CUDADeviceContext>::GEMM(
......@@ -755,6 +1087,25 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
});
}
template <>
template <>
inline void Blas<pten::GPUContext>::GEMM(bool transA, bool transB, int M, int N,
int K, platform::float16 alpha,
const platform::float16 *A, int lda,
const platform::float16 *B, int ldb,
platform::float16 beta,
platform::float16 *C, int ldc) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<platform::float16>::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha,
B, ldb, A, lda, &beta, C, ldc);
});
}
template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::AXPY(int n, T alpha, const T *x,
......@@ -764,6 +1115,14 @@ void Blas<platform::CUDADeviceContext>::AXPY(int n, T alpha, const T *x,
});
}
template <>
template <typename T>
void Blas<pten::GPUContext>::AXPY(int n, T alpha, const T *x, T *y) const {
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::AXPY(handle, n, &alpha, x, 1, y, 1);
});
}
template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::SCAL(int n, const T alpha, T *x) const {
......@@ -771,6 +1130,13 @@ void Blas<platform::CUDADeviceContext>::SCAL(int n, const T alpha, T *x) const {
[&](cublasHandle_t handle) { CUBlas<T>::SCAL(handle, n, &alpha, x, 1); });
}
template <>
template <typename T>
void Blas<pten::GPUContext>::SCAL(int n, const T alpha, T *x) const {
context_.CublasCall(
[&](cublasHandle_t handle) { CUBlas<T>::SCAL(handle, n, &alpha, x, 1); });
}
template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::VCOPY(int n, const T *x, T *y) const {
......@@ -778,6 +1144,13 @@ void Blas<platform::CUDADeviceContext>::VCOPY(int n, const T *x, T *y) const {
[&](cublasHandle_t handle) { CUBlas<T>::VCOPY(handle, n, x, 1, y, 1); });
}
template <>
template <typename T>
void Blas<pten::GPUContext>::VCOPY(int n, const T *x, T *y) const {
context_.CublasCall(
[&](cublasHandle_t handle) { CUBlas<T>::VCOPY(handle, n, x, 1, y, 1); });
}
template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::GEMV(bool trans_a, int M, int N,
......@@ -790,6 +1163,17 @@ void Blas<platform::CUDADeviceContext>::GEMV(bool trans_a, int M, int N,
});
}
template <>
template <typename T>
void Blas<pten::GPUContext>::GEMV(bool trans_a, int M, int N, T alpha,
const T *A, const T *B, T beta, T *C) const {
cublasOperation_t cuTransA = !trans_a ? CUBLAS_OP_T : CUBLAS_OP_N;
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1);
});
}
template <>
template <>
inline void Blas<platform::CUDADeviceContext>::GEMV(
......@@ -806,6 +1190,24 @@ inline void Blas<platform::CUDADeviceContext>::GEMV(
}
}
template <>
template <>
inline void Blas<pten::GPUContext>::GEMV(bool trans_a, int M, int N,
platform::float16 alpha,
const platform::float16 *A,
const platform::float16 *B,
platform::float16 beta,
platform::float16 *C) const {
// Because cublas doesn't support half gemv, we use cublasHgemm to achieve it.
if (trans_a) {
this->template GEMM<platform::float16>(CblasNoTrans, CblasNoTrans, 1, N, M,
alpha, B, A, beta, C);
} else {
this->template GEMM<platform::float16>(CblasNoTrans, CblasNoTrans, M, 1, N,
alpha, A, B, beta, C);
}
}
template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::BatchedGEMM(
......@@ -854,6 +1256,56 @@ void Blas<platform::CUDADeviceContext>::BatchedGEMM(
#endif // CUDA_VERSION >= 9010
}
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M, int N,
int K, T alpha, const T *A, const T *B,
T beta, T *C, int batchCount,
int64_t strideA,
int64_t strideB) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
int ldc = N;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
const int64_t strideC = M * N;
#if CUDA_VERSION >= 9010
if ((FLAGS_enable_cublas_tensor_op_math && (std::is_same<T, float>::value)) ||
std::is_same<T, paddle::platform::float16>::value) {
cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
bool use_tensor_op_math = context_.tensor_core_available();
if (use_tensor_op_math) {
algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
}
VLOG(5) << "use_tensor_op_math: "
<< (use_tensor_op_math ? "True" : "False");
auto fp = std::is_same<T, float>::value ? CUDA_R_32F : CUDA_R_16F;
context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmStridedBatchedEx(
handle, cuTransB, cuTransA, N, M, K, &alpha, B, fp, ldb, strideB, A,
fp, lda, strideA, &beta, C, fp, ldc, strideC, batchCount, fp, algo));
});
} else {
#endif // CUDA_VERSION >= 9010
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GEMM_STRIDED_BATCH(handle, cuTransB, cuTransA, N, M, K, &alpha,
B, ldb, strideB, A, lda, strideA, &beta, C,
ldc, strideC, batchCount);
});
#if CUDA_VERSION >= 9010
}
#endif // CUDA_VERSION >= 9010
}
template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::BatchedGEMM(
......@@ -865,6 +1317,19 @@ void Blas<platform::CUDADeviceContext>::BatchedGEMM(
}
}
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M, int N,
int K, T alpha, const T **A,
const T **B, T beta, T **C,
int batchCount) const {
for (int k = 0; k < batchCount; ++k) {
this->template GEMM<T>(transA, transB, M, N, K, alpha, A[k], B[k], beta,
C[k]);
}
}
template <>
template <>
inline void Blas<platform::CUDADeviceContext>::BatchedGEMM(
......@@ -878,6 +1343,19 @@ inline void Blas<platform::CUDADeviceContext>::BatchedGEMM(
}
}
template <>
template <>
inline void Blas<pten::GPUContext>::BatchedGEMM(
CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
platform::float16 alpha, const platform::float16 **A,
const platform::float16 **B, platform::float16 beta, platform::float16 **C,
int batchCount) const {
for (int k = 0; k < batchCount; ++k) {
this->template GEMM<platform::float16>(transA, transB, M, N, K, alpha, A[k],
B[k], beta, C[k]);
}
}
template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo,
......@@ -903,6 +1381,30 @@ void Blas<platform::CUDADeviceContext>::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo,
});
}
template <>
template <typename T>
void Blas<pten::GPUContext>::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo,
CBLAS_TRANSPOSE transA, CBLAS_DIAG diag,
int M, int N, T alpha, const T *A, int lda,
T *B, int ldb) const {
// solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'`
// where ' stands for transpose
cublasSideMode_t cuSide =
(side == CblasLeft) ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT;
cublasFillMode_t cuUplo =
(uplo == CblasLower) ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER;
// use CUBLAS_OP_C (conjugate transpose) for complex
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasDiagType_t cuDiag =
(diag == CblasUnit) ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT;
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::TRSM(handle, cuSide, cuUplo, cuTransA, cuDiag, N, M, &alpha, A,
lda, B, ldb);
});
}
template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::BatchedGETRF(int n, T **a, int *ipiv,
......@@ -913,6 +1415,15 @@ void Blas<platform::CUDADeviceContext>::BatchedGETRF(int n, T **a, int *ipiv,
});
}
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedGETRF(int n, T **a, int *ipiv, int *info,
int batch_size) const {
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size);
});
}
template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::BatchedGETRI(int n, const T **a,
......@@ -931,6 +1442,23 @@ void Blas<platform::CUDADeviceContext>::BatchedGETRI(int n, const T **a,
});
}
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedGETRI(int n, const T **a, const int *ipiv,
T **a_inv, int *info,
int batch_size) const {
PADDLE_ENFORCE_NE(
a_inv, a,
platform::errors::InvalidArgument(
"cuBLAS fuction 'cublas<S/D>getrfBatched' cannot be executed "
"in-place. The memory space of output matrix (address: %p) cannot "
"overlap memory space of input matrix (address: %p).",
a_inv, a));
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GETRI_BATCH(handle, n, a, n, ipiv, a_inv, n, info, batch_size);
});
}
template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::BatchedMatInv(int n, const T **a,
......@@ -941,6 +1469,15 @@ void Blas<platform::CUDADeviceContext>::BatchedMatInv(int n, const T **a,
});
}
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedMatInv(int n, const T **a, T **a_inv,
int *info, int batch_size) const {
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size);
});
}
template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::BatchedGETRS(
......@@ -955,6 +1492,21 @@ void Blas<platform::CUDADeviceContext>::BatchedGETRS(
});
}
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedGETRS(CBLAS_TRANSPOSE trans, int n,
int nrhs, const T **a, int lda,
int *ipiv, T **b, int ldb, int *info,
int batch_size) const {
// use CUBLAS_OP_C (conjugate transpose) for complex
cublasOperation_t cuTrans =
(trans == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GETRS_BATCH(handle, cuTrans, n, nrhs, a, lda, ipiv, b, ldb, info,
batch_size);
});
}
template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::BatchedTRSM(
......@@ -979,6 +1531,31 @@ void Blas<platform::CUDADeviceContext>::BatchedTRSM(
});
}
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedTRSM(CBLAS_SIDE side, CBLAS_UPLO uplo,
CBLAS_TRANSPOSE transA,
CBLAS_DIAG diag, int M, int N, T alpha,
const T **A, int lda, T **B, int ldb,
int batch_size) const {
// solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'`
// where ' stands for transpose
cublasSideMode_t cuSide =
(side == CblasLeft) ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT;
cublasFillMode_t cuUplo =
(uplo == CblasLower) ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER;
// use CUBLAS_OP_C (conjugate transpose) for complex
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasDiagType_t cuDiag =
(diag == CblasUnit) ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT;
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::TRSM_BATCH(handle, cuSide, cuUplo, cuTransA, cuDiag, N, M,
&alpha, A, lda, B, ldb, batch_size);
});
}
} // namespace math
} // namespace operators
} // namespace paddle
......@@ -17,6 +17,7 @@
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/dynload/rocblas.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
DECLARE_bool(enable_cublas_tensor_op_math);
......@@ -221,6 +222,20 @@ struct CUBlas<platform::float16> {
beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0));
});
}
template <typename... ARGS>
static void GEMM_EX(pten::GPUContext *dev_ctx, rocblas_operation transa,
rocblas_operation transb, int m, int n, int k,
const void *alpha, const void *A, rocblas_datatype Atype,
int lda, const void *B, rocblas_datatype Btype, int ldb,
const void *beta, void *C, rocblas_datatype Ctype,
int ldc, rocblas_datatype computeType) {
rocblas_gemm_algo algo = rocblas_gemm_algo_standard;
dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_gemm_ex(
handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb,
beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0));
});
}
};
template <>
......@@ -305,6 +320,20 @@ struct CUBlas<platform::complex<float>> {
beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0));
});
}
template <typename... ARGS>
static void GEMM_EX(pten::GPUContext *dev_ctx, rocblas_operation transa,
rocblas_operation transb, int m, int n, int k,
const void *alpha, const void *A, rocblas_datatype Atype,
int lda, const void *B, rocblas_datatype Btype, int ldb,
const void *beta, void *C, rocblas_datatype Ctype,
int ldc, rocblas_datatype computeType) {
rocblas_gemm_algo algo = rocblas_gemm_algo_standard;
dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_gemm_ex(
handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb,
beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0));
});
}
};
template <>
......@@ -389,6 +418,20 @@ struct CUBlas<platform::complex<double>> {
beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0));
});
}
template <typename... ARGS>
static void GEMM_EX(pten::GPUContext *dev_ctx, rocblas_operation transa,
rocblas_operation transb, int m, int n, int k,
const void *alpha, const void *A, rocblas_datatype Atype,
int lda, const void *B, rocblas_datatype Btype, int ldb,
const void *beta, void *C, rocblas_datatype Ctype,
int ldc, rocblas_datatype computeType) {
rocblas_gemm_algo algo = rocblas_gemm_algo_standard;
dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_gemm_ex(
handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb,
beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0));
});
}
};
template <>
......@@ -412,6 +455,27 @@ void Blas<platform::CUDADeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
&beta, C, N);
});
}
template <>
template <typename T>
void Blas<pten::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M, int N, int K,
T alpha, const T *A, const T *B, T beta,
T *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
rocblas_operation cuTransA = (transA == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
rocblas_operation cuTransB = (transB == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda,
&beta, C, N);
});
}
template <>
template <>
......@@ -448,6 +512,43 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
rocblas_datatype_f16_r, ldb, A, rocblas_datatype_f16_r, lda, &h_beta, C,
rocblas_datatype_f16_r, N, rocblas_datatype_f32_r);
}
template <>
template <>
inline void Blas<pten::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M, int N,
int K, platform::float16 alpha,
const platform::float16 *A,
const platform::float16 *B,
platform::float16 beta,
platform::float16 *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
rocblas_operation cuTransA = (transA == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
rocblas_operation cuTransB = (transB == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(
context_.GetComputeCapability(), 53,
platform::errors::InvalidArgument(
"cublas fp16 gemm requires GPU compute capability >= 53,"
"but received %d",
context_.GetComputeCapability()));
float h_alpha = static_cast<float>(alpha);
float h_beta = static_cast<float>(beta);
auto &cuda_ctx = const_cast<pten::GPUContext &>(context_);
CUBlas<platform::float16>::GEMM_EX(
&cuda_ctx, cuTransB, cuTransA, N, M, K, &h_alpha, B,
rocblas_datatype_f16_r, ldb, A, rocblas_datatype_f16_r, lda, &h_beta, C,
rocblas_datatype_f16_r, N, rocblas_datatype_f32_r);
}
template <>
template <>
......@@ -485,6 +586,44 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
rocblas_datatype_f32_c, ldb, A, rocblas_datatype_f32_c, lda, &c_beta, C,
rocblas_datatype_f32_c, N, rocblas_datatype_f32_c);
}
template <>
template <>
inline void Blas<pten::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M, int N,
int K, platform::complex<float> alpha,
const platform::complex<float> *A,
const platform::complex<float> *B,
platform::complex<float> beta,
platform::complex<float> *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
rocblas_operation cuTransA = (transA == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
rocblas_operation cuTransB = (transB == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(
context_.GetComputeCapability(), 53,
platform::errors::InvalidArgument(
"cublas complex64 gemm requires GPU compute capability >= 53,"
"but received %d",
context_.GetComputeCapability()));
thrust::complex<float> c_alpha =
thrust::complex<float>(alpha.real, alpha.imag);
thrust::complex<float> c_beta = thrust::complex<float>(beta.real, beta.imag);
auto &cuda_ctx = const_cast<pten::GPUContext &>(context_);
CUBlas<platform::complex<float>>::GEMM_EX(
&cuda_ctx, cuTransB, cuTransA, N, M, K, &c_alpha, B,
rocblas_datatype_f32_c, ldb, A, rocblas_datatype_f32_c, lda, &c_beta, C,
rocblas_datatype_f32_c, N, rocblas_datatype_f32_c);
}
template <>
template <>
......@@ -523,6 +662,45 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
rocblas_datatype_f64_c, ldb, A, rocblas_datatype_f64_c, lda, &c_beta, C,
rocblas_datatype_f64_c, N, rocblas_datatype_f64_c);
}
template <>
template <>
inline void Blas<pten::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M, int N,
int K, platform::complex<double> alpha,
const platform::complex<double> *A,
const platform::complex<double> *B,
platform::complex<double> beta,
platform::complex<double> *C) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
rocblas_operation cuTransA = (transA == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
rocblas_operation cuTransB = (transB == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(
context_.GetComputeCapability(), 53,
platform::errors::InvalidArgument(
"cublas complex128 gemm requires GPU compute capability >= 53,"
"but received %d",
context_.GetComputeCapability()));
thrust::complex<double> c_alpha =
thrust::complex<double>(alpha.real, alpha.imag);
thrust::complex<double> c_beta =
thrust::complex<double>(beta.real, beta.imag);
auto &cuda_ctx = const_cast<pten::GPUContext &>(context_);
CUBlas<platform::complex<double>>::GEMM_EX(
&cuda_ctx, cuTransB, cuTransA, N, M, K, &c_alpha, B,
rocblas_datatype_f64_c, ldb, A, rocblas_datatype_f64_c, lda, &c_beta, C,
rocblas_datatype_f64_c, N, rocblas_datatype_f64_c);
}
template <>
template <typename T>
......@@ -541,6 +719,22 @@ void Blas<platform::CUDADeviceContext>::GEMM(bool transA, bool transB, int M,
&beta, C, ldc);
});
}
template <>
template <typename T>
void Blas<pten::GPUContext>::GEMM(bool transA, bool transB, int M, int N, int K,
T alpha, const T *A, int lda, const T *B,
int ldb, T beta, T *C, int ldc) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
rocblas_operation cuTransA =
transA ? rocblas_operation_transpose : rocblas_operation_none;
rocblas_operation cuTransB =
transB ? rocblas_operation_transpose : rocblas_operation_none;
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda,
&beta, C, ldc);
});
}
template <>
template <>
......@@ -560,6 +754,26 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
B, ldb, A, lda, &beta, C, ldc);
});
}
template <>
template <>
inline void Blas<pten::GPUContext>::GEMM(bool transA, bool transB, int M, int N,
int K, platform::float16 alpha,
const platform::float16 *A, int lda,
const platform::float16 *B, int ldb,
platform::float16 beta,
platform::float16 *C, int ldc) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
rocblas_operation cuTransA =
transA ? rocblas_operation_transpose : rocblas_operation_none;
rocblas_operation cuTransB =
transB ? rocblas_operation_transpose : rocblas_operation_none;
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<platform::float16>::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha,
B, ldb, A, lda, &beta, C, ldc);
});
}
template <>
template <typename T>
......@@ -569,6 +783,13 @@ void Blas<platform::CUDADeviceContext>::AXPY(int n, T alpha, const T *x,
CUBlas<T>::AXPY(handle, n, &alpha, x, 1, y, 1);
});
}
template <>
template <typename T>
void Blas<pten::GPUContext>::AXPY(int n, T alpha, const T *x, T *y) const {
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::AXPY(handle, n, &alpha, x, 1, y, 1);
});
}
template <>
template <typename T>
......@@ -576,6 +797,12 @@ void Blas<platform::CUDADeviceContext>::SCAL(int n, const T alpha, T *x) const {
context_.CublasCall(
[&](rocblas_handle handle) { CUBlas<T>::SCAL(handle, n, &alpha, x, 1); });
}
template <>
template <typename T>
void Blas<pten::GPUContext>::SCAL(int n, const T alpha, T *x) const {
context_.CublasCall(
[&](rocblas_handle handle) { CUBlas<T>::SCAL(handle, n, &alpha, x, 1); });
}
template <>
template <typename T>
......@@ -583,6 +810,12 @@ void Blas<platform::CUDADeviceContext>::VCOPY(int n, const T *x, T *y) const {
context_.CublasCall(
[&](rocblas_handle handle) { CUBlas<T>::VCOPY(handle, n, x, 1, y, 1); });
}
template <>
template <typename T>
void Blas<pten::GPUContext>::VCOPY(int n, const T *x, T *y) const {
context_.CublasCall(
[&](rocblas_handle handle) { CUBlas<T>::VCOPY(handle, n, x, 1, y, 1); });
}
template <>
template <typename T>
......@@ -596,6 +829,17 @@ void Blas<platform::CUDADeviceContext>::GEMV(bool trans_a, int M, int N,
CUBlas<T>::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1);
});
}
template <>
template <typename T>
void Blas<pten::GPUContext>::GEMV(bool trans_a, int M, int N, T alpha,
const T *A, const T *B, T beta, T *C) const {
rocblas_operation cuTransA =
!trans_a ? rocblas_operation_transpose : rocblas_operation_none;
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1);
});
}
template <>
template <>
......@@ -612,6 +856,23 @@ inline void Blas<platform::CUDADeviceContext>::GEMV(
alpha, A, B, beta, C);
}
}
template <>
template <>
inline void Blas<pten::GPUContext>::GEMV(bool trans_a, int M, int N,
platform::float16 alpha,
const platform::float16 *A,
const platform::float16 *B,
platform::float16 beta,
platform::float16 *C) const {
// Because cublas doesn't support half gemv, we use cublasHgemm to achieve it.
if (trans_a) {
this->template GEMM<platform::float16>(CblasNoTrans, CblasNoTrans, 1, N, M,
alpha, B, A, beta, C);
} else {
this->template GEMM<platform::float16>(CblasNoTrans, CblasNoTrans, M, 1, N,
alpha, A, B, beta, C);
}
}
template <>
template <typename T>
......@@ -637,6 +898,32 @@ void Blas<platform::CUDADeviceContext>::BatchedGEMM(
ldc, strideC, batchCount);
});
}
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M, int N,
int K, T alpha, const T *A, const T *B,
T beta, T *C, int batchCount,
int64_t strideA,
int64_t strideB) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
int ldc = N;
rocblas_operation cuTransA = (transA == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
rocblas_operation cuTransB = (transB == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
const int64_t strideC = M * N;
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::GEMM_STRIDED_BATCH(handle, cuTransB, cuTransA, N, M, K, &alpha,
B, ldb, strideB, A, lda, strideA, &beta, C,
ldc, strideC, batchCount);
});
}
template <>
template <typename T>
......@@ -648,6 +935,18 @@ void Blas<platform::CUDADeviceContext>::BatchedGEMM(
C[k]);
}
}
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M, int N,
int K, T alpha, const T **A,
const T **B, T beta, T **C,
int batchCount) const {
for (int k = 0; k < batchCount; ++k) {
this->template GEMM<T>(transA, transB, M, N, K, alpha, A[k], B[k], beta,
C[k]);
}
}
template <>
template <>
......@@ -661,6 +960,18 @@ inline void Blas<platform::CUDADeviceContext>::BatchedGEMM(
B[k], beta, C[k]);
}
}
template <>
template <>
inline void Blas<pten::GPUContext>::BatchedGEMM(
CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
platform::float16 alpha, const platform::float16 **A,
const platform::float16 **B, platform::float16 beta, platform::float16 **C,
int batchCount) const {
for (int k = 0; k < batchCount; ++k) {
this->template GEMM<platform::float16>(transA, transB, M, N, K, alpha, A[k],
B[k], beta, C[k]);
}
}
template <>
template <typename T>
......@@ -687,6 +998,30 @@ void Blas<platform::CUDADeviceContext>::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo,
lda, B, ldb);
});
}
template <>
template <typename T>
void Blas<pten::GPUContext>::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo,
CBLAS_TRANSPOSE transA, CBLAS_DIAG diag,
int M, int N, T alpha, const T *A, int lda,
T *B, int ldb) const {
// solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'`
// where ' stands for transpose
rocblas_side cuSide =
(side == CblasLeft) ? rocblas_side_right : rocblas_side_left;
rocblas_fill cuUplo =
(uplo == CblasLower) ? rocblas_fill_upper : rocblas_fill_lower;
// use CUBLAS_OP_C (conjugate transpose) for complex
rocblas_operation cuTransA = (transA == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
rocblas_diagonal cuDiag =
(diag == CblasUnit) ? rocblas_diagonal_unit : rocblas_diagonal_non_unit;
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::TRSM(handle, cuSide, cuUplo, cuTransA, cuDiag, N, M, &alpha, A,
lda, B, ldb);
});
}
template <>
template <typename T>
......@@ -697,6 +1032,14 @@ void Blas<platform::CUDADeviceContext>::BatchedGETRF(int n, T **a, int *ipiv,
CUBlas<T>::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size);
});
}
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedGETRF(int n, T **a, int *ipiv, int *info,
int batch_size) const {
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size);
});
}
template <>
template <typename T>
......@@ -715,6 +1058,22 @@ void Blas<platform::CUDADeviceContext>::BatchedGETRI(int n, const T **a,
CUBlas<T>::GETRI_BATCH(handle, n, a, n, ipiv, a_inv, n, info, batch_size);
});
}
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedGETRI(int n, const T **a, const int *ipiv,
T **a_inv, int *info,
int batch_size) const {
PADDLE_ENFORCE_NE(
a_inv, a,
platform::errors::InvalidArgument(
"cuBLAS fuction 'cublas<S/D>getrfBatched' cannot be executed "
"in-place. The memory space of output matrix (address: %p) cannot "
"overlap memory space of input matrix (address: %p).",
a_inv, a));
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::GETRI_BATCH(handle, n, a, n, ipiv, a_inv, n, info, batch_size);
});
}
template <>
template <typename T>
......@@ -725,6 +1084,14 @@ void Blas<platform::CUDADeviceContext>::BatchedMatInv(int n, const T **a,
CUBlas<T>::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size);
});
}
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedMatInv(int n, const T **a, T **a_inv,
int *info, int batch_size) const {
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size);
});
}
template <>
template <typename T>
......@@ -739,6 +1106,20 @@ void Blas<platform::CUDADeviceContext>::BatchedGETRS(
batch_size);
});
}
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedGETRS(CBLAS_TRANSPOSE trans, int n,
int nrhs, const T **a, int lda,
int *ipiv, T **b, int ldb, int *info,
int batch_size) const {
rocblas_operation cuTrans = (trans == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::GETRS_BATCH(handle, cuTrans, n, nrhs, a, lda, ipiv, b, ldb, info,
batch_size);
});
}
template <>
template <typename T>
......@@ -764,6 +1145,31 @@ void Blas<platform::CUDADeviceContext>::BatchedTRSM(
&alpha, A, lda, B, ldb, batch_size);
});
}
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedTRSM(CBLAS_SIDE side, CBLAS_UPLO uplo,
CBLAS_TRANSPOSE transA,
CBLAS_DIAG diag, int M, int N, T alpha,
const T **A, int lda, T **B, int ldb,
int batch_size) const {
// solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'`
// where ' stands for transpose
rocblas_side cuSide =
(side == CblasLeft) ? rocblas_side_right : rocblas_side_left;
rocblas_fill cuUplo =
(uplo == CblasLower) ? rocblas_fill_upper : rocblas_fill_lower;
// use CUBLAS_OP_C (conjugate transpose) for complex
rocblas_operation cuTransA = (transA == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
rocblas_diagonal cuDiag =
(diag == CblasUnit) ? rocblas_diagonal_unit : rocblas_diagonal_non_unit;
context_.CublasCall([&](rocblas_handle handle) {
CUBlas<T>::TRSM_BATCH(handle, cuSide, cuUplo, cuTransA, cuDiag, N, M,
&alpha, A, lda, B, ldb, batch_size);
});
}
} // namespace math
} // namespace operators
......
......@@ -16,6 +16,8 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
/**
* case 1:
......@@ -441,6 +443,31 @@ void TestConcatMain() {
delete context;
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <>
void TestConcatMain<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace>() {
auto* context =
new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace());
context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CUDAPlace(), context->stream())
.get());
context->PartialInitWithAllocator();
ConcatCase1<paddle::platform::CUDADeviceContext, paddle::platform::CUDAPlace>(
context);
ConcatCase2<paddle::platform::CUDADeviceContext, paddle::platform::CUDAPlace>(
context);
ConcatCase3<paddle::platform::CUDADeviceContext, paddle::platform::CUDAPlace>(
context);
ConcatCase4<paddle::platform::CUDADeviceContext, paddle::platform::CUDAPlace>(
context);
delete context;
}
#endif
TEST(math, concat) {
TestConcatMain<paddle::platform::CPUDeviceContext,
paddle::platform::CPUPlace>();
......
......@@ -24,6 +24,11 @@ void TestNNZ(const std::vector<T>& dense_data, const int correct_nnz,
const int rows, const int cols) {
paddle::platform::CUDADeviceContext* context =
new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace());
context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CUDAPlace(), context->stream())
.get());
context->PartialInitWithAllocator();
auto sparse =
paddle::operators::math::GetSparse<paddle::platform::CUDADeviceContext,
T>(*context);
......@@ -61,6 +66,11 @@ void TestDenseToSparse(const std::vector<T>& correct_dense_data,
const std::string& mode) {
paddle::platform::CUDADeviceContext* context =
new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace());
context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CUDAPlace(), context->stream())
.get());
context->PartialInitWithAllocator();
// get sparse
auto sparse =
paddle::operators::math::GetSparse<paddle::platform::CUDADeviceContext,
......
......@@ -15,6 +15,8 @@ limitations under the License. */
#include "paddle/fluid/operators/math/im2col.h"
#include <gtest/gtest.h>
#include "paddle/fluid/operators/math/im2col_cfo_cpu.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
template <typename DeviceContext, typename Place>
void testIm2col() {
......@@ -60,6 +62,7 @@ void testIm2col() {
auto* place = new Place();
DeviceContext* context = new DeviceContext(*place);
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
......@@ -164,6 +167,165 @@ void testIm2col() {
delete context;
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <>
void testIm2col<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace>() {
paddle::framework::Tensor input_tmp;
paddle::framework::Tensor input;
paddle::framework::Tensor output_cfo;
paddle::framework::Tensor output_ocf;
paddle::framework::Tensor output_tmp;
/**
* input = [0, 1, 2,
* 3, 4, 5]
*
* output_cfo = [0, 1
* 1, 2
* 3, 4
* 4, 5]
*
* output_ocf = [0, 1, 3, 4
* 1, 2, 4, 5]
*
* col2im_cfo = [0, 2, 2
* 3, 4, 5]
*
* col2im_ocf = [0, 2, 2
* 3, 4, 5]
*/
int input_height = 2;
int input_width = 3;
int filter_size = 2;
std::vector<int> stride({1, 1}); // stride_y, stride_x
std::vector<int> padding(
{0, 0, 0, 0}); // up_pad, left_pad, down_pad, right_pad
std::vector<int> dilation({1, 1}); // dilation_y, dilation_x
int output_height =
(input_height - filter_size + padding[0] + padding[1]) / stride[0] + 1;
int output_width =
(input_width - filter_size + padding[2] + padding[3]) / stride[1] + 1;
float* input_ptr = input_tmp.mutable_data<float>(
{1, input_height, input_width}, paddle::platform::CPUPlace());
float arr[6] = {0, 1, 2, 3, 4, 5};
memcpy(input_ptr, arr, 6 * sizeof(float));
auto* place = new paddle::platform::CUDAPlace();
auto* context = new paddle::platform::CUDADeviceContext(*place);
context->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*place, context->stream())
.get());
context->PartialInitWithAllocator();
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
paddle::framework::TensorCopySync(input_tmp, *place, &input);
}
output_cfo.mutable_data<float>(
{1, filter_size, filter_size, output_height, output_width}, *place);
output_ocf.mutable_data<float>(
{output_height, output_width, 1, filter_size, filter_size}, *place);
// Im2Col
paddle::operators::math::Im2ColFunctor<
paddle::operators::math::ColFormat::kCFO,
paddle::platform::CUDADeviceContext, float>
im2col;
paddle::operators::math::Im2ColFunctor<
paddle::operators::math::ColFormat::kOCF,
paddle::platform::CUDADeviceContext, float>
im2col_ocf;
im2col(*context, input, dilation, stride, padding, &output_cfo);
im2col_ocf(*context, input, dilation, stride, padding, &output_ocf);
float out_cfo_data[] = {0, 1, 1, 2, 3, 4, 4, 5};
float out_ocf_data[] = {0, 1, 3, 4, 1, 2, 4, 5};
float* out_cfo_ptr;
if (paddle::platform::is_cpu_place(*place)) {
out_cfo_ptr = output_cfo.data<float>();
} else {
paddle::framework::TensorCopySync(output_cfo, paddle::platform::CPUPlace(),
&output_tmp);
out_cfo_ptr = output_tmp.data<float>();
}
for (int i = 0; i < 6; ++i) {
EXPECT_EQ(out_cfo_ptr[i], out_cfo_data[i]);
}
float* out_ocf_ptr;
if (paddle::platform::is_cpu_place(*place)) {
out_ocf_ptr = output_ocf.data<float>();
} else {
paddle::framework::TensorCopySync(output_ocf, paddle::platform::CPUPlace(),
&output_tmp);
out_ocf_ptr = output_tmp.data<float>();
}
for (int i = 0; i < 6; ++i) {
EXPECT_EQ(out_ocf_ptr[i], out_ocf_data[i]);
}
// Col2Im: kCFO
paddle::operators::math::Col2ImFunctor<
paddle::operators::math::ColFormat::kCFO,
paddle::platform::CUDADeviceContext, float>
col2im;
paddle::operators::math::Col2ImFunctor<
paddle::operators::math::ColFormat::kOCF,
paddle::platform::CUDADeviceContext, float>
col2im_ocf;
float col2im_data[] = {0, 2, 2, 3, 8, 5};
memset(input_ptr, 0, 6 * sizeof(float));
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
paddle::framework::TensorCopySync(input_tmp, *place, &input);
}
col2im(*context, output_cfo, dilation, stride, padding, &input);
float* in_ptr;
if (paddle::platform::is_cpu_place(*place)) {
in_ptr = input.data<float>();
} else {
paddle::framework::TensorCopySync(input, paddle::platform::CPUPlace(),
&input_tmp);
in_ptr = input_tmp.data<float>();
}
for (int i = 0; i < 6; ++i) {
EXPECT_EQ(in_ptr[i], col2im_data[i]);
}
// Col2Im: kOCF
memset(input_ptr, 0, 6 * sizeof(float));
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
paddle::framework::TensorCopySync(input_tmp, *place, &input);
}
col2im_ocf(*context, output_ocf, dilation, stride, padding, &input);
if (paddle::platform::is_cpu_place(*place)) {
in_ptr = input.data<float>();
} else {
paddle::framework::TensorCopySync(input, paddle::platform::CPUPlace(),
&input_tmp);
in_ptr = input_tmp.data<float>();
}
for (int i = 0; i < 6; ++i) {
EXPECT_EQ(in_ptr[i], col2im_data[i]);
}
delete place;
delete context;
}
#endif
TEST(math, im2col) {
testIm2col<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
......
......@@ -194,7 +194,7 @@ static void InclusiveScanInnerDim(const T *x, T *y, size_t outer_dim,
constexpr size_t kThreadNumY = 32;
size_t grid_dim = (outer_dim + kThreadNumY - 1) / kThreadNumY;
grid_dim = std::min<size_t>(grid_dim, dev_ctx.GetCUDAMaxGridDimSize().x);
grid_dim = std::min<size_t>(grid_dim, dev_ctx.GetCUDAMaxGridDimSize()[0]);
dim3 thread_dims(kThreadNumX, kThreadNumY);
if (reverse) {
InclusiveScanInnerDimCUDAKernel<
......
......@@ -21,6 +21,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/math_function_impl.h"
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#include "paddle/pten/kernels/funcs/eigen/common.h"
namespace paddle {
......@@ -44,6 +45,18 @@ template struct SetConstant<platform::CUDADeviceContext,
template struct SetConstant<platform::CUDADeviceContext,
platform::complex<double>>;
template struct SetConstant<pten::GPUContext, platform::float16>;
template struct SetConstant<pten::GPUContext, platform::bfloat16>;
template struct SetConstant<pten::GPUContext, float>;
template struct SetConstant<pten::GPUContext, double>;
template struct SetConstant<pten::GPUContext, uint8_t>;
template struct SetConstant<pten::GPUContext, int>;
template struct SetConstant<pten::GPUContext, int16_t>;
template struct SetConstant<pten::GPUContext, int64_t>;
template struct SetConstant<pten::GPUContext, bool>;
template struct SetConstant<pten::GPUContext, platform::complex<float>>;
template struct SetConstant<pten::GPUContext, platform::complex<double>>;
template struct SetConstant<platform::CUDAPinnedDeviceContext,
platform::float16>;
template struct SetConstant<platform::CUDAPinnedDeviceContext,
......
......@@ -223,6 +223,7 @@ TEST(math_funciton, set_constant) {
t.Resize({10, 10});
t.mutable_data<int>(paddle::platform::CPUPlace());
auto* ctx = new paddle::platform::CPUDeviceContext();
ctx->Init();
paddle::operators::math::set_constant(*ctx, &t, 10);
for (int64_t i = 0; i < t.numel(); ++i) {
PADDLE_ENFORCE_EQ(10, t.data<int>()[i],
......
......@@ -46,6 +46,10 @@ TEST(math_function, notrans_mul_trans_fp32) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
float arr[6] = {0, 1, 2, 3, 4, 5};
......@@ -78,6 +82,10 @@ TEST(math_function, notrans_mul_trans_fp16) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
// fp16 GEMM in cublas requires GPU compute capability >= 53
if (context.GetComputeCapability() < 53) {
......@@ -117,6 +125,10 @@ TEST(math_function, trans_mul_notrans_fp32) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
float arr[6] = {0, 1, 2, 3, 4, 5};
......@@ -155,6 +167,10 @@ TEST(math_function, trans_mul_notrans_fp16) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
// fp16 GEMM in cublas requires GPU compute capability >= 53
if (context.GetComputeCapability() < 53) {
......@@ -200,6 +216,10 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
int m = 2;
int n = 3;
......@@ -254,6 +274,10 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
// fp16 GEMM in cublas requires GPU compute capability >= 53
if (context.GetComputeCapability() < 53) {
......@@ -316,6 +340,10 @@ TEST(math_function, gemm_trans_cublas_fp32) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
int m = 2;
int n = 3;
......@@ -364,6 +392,10 @@ TEST(math_function, gemm_trans_cublas_fp16) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
// fp16 GEMM in cublas requires GPU compute capability >= 53
if (context.GetComputeCapability() < 53) {
......@@ -418,6 +450,10 @@ void GemvTest(int m, int n, bool trans) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
T* data_a = mat_a.mutable_data<T>({m, n}, cpu_place);
T* data_b = vec_b.mutable_data<T>({trans ? m : n}, cpu_place);
......
......@@ -15,6 +15,8 @@ limitations under the License. */
#include "paddle/fluid/operators/math/vol2col.h"
#include <gtest/gtest.h>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
template <typename DeviceContext, typename Place>
void testVol2col() {
......@@ -25,7 +27,6 @@ void testVol2col() {
auto* place = new Place();
DeviceContext* context = new DeviceContext(*place);
/**
* input = [[0, 1, 2,
* 3, 4, 5]
......@@ -123,6 +124,124 @@ void testVol2col() {
delete context;
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <>
void testVol2col<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace>() {
paddle::framework::Tensor input;
paddle::framework::Tensor input_tmp;
paddle::framework::Tensor output;
paddle::framework::Tensor output_tmp;
auto* place = new paddle::platform::CUDAPlace();
auto* context = new paddle::platform::CUDADeviceContext(*place);
context->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*place, context->stream())
.get());
context->PartialInitWithAllocator();
/**
* input = [[0, 1, 2,
* 3, 4, 5]
* [6, 7, 8,
* 9, 10, 11]]
*
* output = [0, 1
* 1, 2
* 3, 4
* 4, 5
* 6, 7
* 7, 8
* 9, 10
* 10, 11]
*
* col2vol = [[0, 2, 2,
* 3, 8, 5]
* [6, 14, 8,
* 9, 20, 11]]
*
*/
int input_depth = 2;
int input_height = 2;
int input_width = 3;
int filter_size = 2;
std::vector<int> strides({1, 1, 1});
std::vector<int> paddings({0, 0, 0});
std::vector<int> dilations({1, 1, 1});
int output_depth =
(input_depth - filter_size + 2 * paddings[0]) / strides[0] + 1;
int output_height =
(input_height - filter_size + 2 * paddings[1]) / strides[1] + 1;
int output_width =
(input_width - filter_size + 2 * paddings[2]) / strides[2] + 1;
// Vol2Col test
float* input_ptr =
input_tmp.mutable_data<float>({1, input_depth, input_height, input_width},
paddle::platform::CPUPlace());
float arr[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
memcpy(input_ptr, arr, 12 * sizeof(float));
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
paddle::framework::TensorCopySync(input_tmp, *place, &input);
}
output.mutable_data<float>({1, filter_size, filter_size, filter_size,
output_depth, output_height, output_width},
*place);
paddle::operators::math::Vol2ColFunctor<paddle::platform::CUDADeviceContext,
float>
vol2col;
vol2col(*context, input, dilations, strides, paddings, &output);
float vol_2_col[] = {0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11};
float* out_cfo_ptr;
if (paddle::platform::is_cpu_place(*place)) {
out_cfo_ptr = output.data<float>();
} else {
paddle::framework::TensorCopySync(output, paddle::platform::CPUPlace(),
&output_tmp);
out_cfo_ptr = output_tmp.data<float>();
}
for (int i = 0; i < 16; ++i) {
EXPECT_EQ(out_cfo_ptr[i], vol_2_col[i]);
}
// Col2Vol test
float col_2_vol[] = {0, 2, 2, 3, 8, 5, 6, 14, 8, 9, 20, 11};
memset(input_ptr, 0, 12 * sizeof(float));
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
paddle::framework::TensorCopySync(input_tmp, *place, &input);
}
paddle::operators::math::Col2VolFunctor<paddle::platform::CUDADeviceContext,
float>
col2vol;
col2vol(*context, output, dilations, strides, paddings, &input);
float* in_ptr;
if (paddle::platform::is_cpu_place(*place)) {
in_ptr = input.data<float>();
} else {
paddle::framework::TensorCopySync(input, paddle::platform::CPUPlace(),
&input_tmp);
in_ptr = input_tmp.data<float>();
}
for (int i = 0; i < 12; ++i) {
EXPECT_EQ(in_ptr[i], col_2_vol[i]);
}
delete place;
delete context;
}
#endif
TEST(math, vol2col) {
testVol2col<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
......
......@@ -66,7 +66,8 @@ class MeanCUDAKernel : public framework::OpKernel<T> {
reduce_dims.push_back(i);
}
TensorReduceFunctorImpl<T, T, kernel_primitives::AddFunctor, Div>(
*input, output, Div(numel), reduce_dims, stream);
context.cuda_device_context(), *input, output, Div(numel), reduce_dims,
stream);
}
};
......
......@@ -57,7 +57,12 @@ class NCCLTester : public ::testing::Test {
paddle::platform::CPUPlace cpu_place;
for (size_t i = 0; i < gpu_list_.size(); ++i) {
p::CUDAPlace place(i);
dev_ctxs_.emplace_back(new p::CUDADeviceContext(place));
auto *ctx = new p::CUDADeviceContext(place);
ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx->stream())
.get());
ctx->PartialInitWithAllocator();
dev_ctxs_.emplace_back(ctx);
}
NCCLInitOp();
......
......@@ -106,16 +106,20 @@ class PnormCUDAKernel : public framework::OpKernel<T> {
using MT = typename details::MPTypeTrait<T>::Type;
if (porder == 0) {
TensorReduceFunctorImpl<T, T, kps::AddFunctor, NonzeroFunctor<T>>(
*in_x, out_norm, NonzeroFunctor<T>(), reduce_axis, stream);
ctx.cuda_device_context(), *in_x, out_norm, NonzeroFunctor<T>(),
reduce_axis, stream);
} else if (porder == INFINITY) {
TensorReduceFunctorImpl<T, T, kps::MaxFunctor, AbsFunctor<T>>(
*in_x, out_norm, AbsFunctor<T>(), reduce_axis, stream);
ctx.cuda_device_context(), *in_x, out_norm, AbsFunctor<T>(),
reduce_axis, stream);
} else if (porder == -INFINITY) {
TensorReduceFunctorImpl<T, T, kps::MinFunctor, AbsFunctor<T>>(
*in_x, out_norm, AbsFunctor<T>(), reduce_axis, stream);
ctx.cuda_device_context(), *in_x, out_norm, AbsFunctor<T>(),
reduce_axis, stream);
} else {
TensorReduceFunctorImpl<T, T, kps::AddFunctor, UnsignedPowFunctor<T>>(
*in_x, out_norm, UnsignedPowFunctor<T>(porder), reduce_axis, stream);
ctx.cuda_device_context(), *in_x, out_norm,
UnsignedPowFunctor<T>(porder), reduce_axis, stream);
const framework::Tensor* tmp_norm = out_norm;
std::vector<const framework::Tensor*> ins = {tmp_norm};
......
......@@ -208,8 +208,8 @@ class PoolKernel : public framework::OpKernel<T> {
auto stream = dev_ctx.stream();
TensorReduceFunctorImpl<T, T, kps::AddFunctor,
kps::DivideFunctor<T>>(
*in_x, out, kps::DivideFunctor<T>(reduce_num), reduce_dim,
stream);
dev_ctx, *in_x, out, kps::DivideFunctor<T>(reduce_num),
reduce_dim, stream);
#else // for cpu
paddle::operators::math::Pool2dFunctor<
DeviceContext, paddle::operators::math::AvgPool<T>, T>
......
......@@ -186,7 +186,8 @@ class CUDAPReluGradKernel : public framework::OpKernel<T> {
}
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
dalpha_tmp, dalpha, kps::IdentityFunctor<T>(), reduce_dims, stream);
context.cuda_device_context(), dalpha_tmp, dalpha,
kps::IdentityFunctor<T>(), reduce_dims, stream);
}
};
......
......@@ -222,6 +222,10 @@ TEST(SENDANDRECV, GPU) {
framework::Scope* scope = (*micro_scope)[0];
platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
framework::Executor exe(place);
// create var on local scope
......
......@@ -39,14 +39,16 @@ namespace operators {
template <typename Tx, typename Ty, template <typename> class ReduceOp,
typename TransformOp>
void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y,
void TensorReduceFunctorImpl(const platform::CUDADeviceContext& dev_ctx,
const framework::Tensor& x, framework::Tensor* y,
const TransformOp& transform,
const std::vector<int>& origin_reduce_dims,
gpuStream_t stream) {
y->mutable_data<Ty>(x.place());
pten::kernels::TensorReduceFunctorImpl<Tx, Ty, ReduceOp, TransformOp>(
x, y, transform, origin_reduce_dims, stream);
static_cast<const pten::GPUContext&>(dev_ctx), x, y, transform,
origin_reduce_dims, stream);
}
} // namespace operators
......
......@@ -156,7 +156,8 @@ class CUDARenormKernel : public framework::OpKernel<T> {
cuda_ctx, ins, &outs, func);
std::vector<int> reduce_axis = {0, 2};
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
pow_value, &dim_value, kps::IdentityFunctor<T>(), reduce_axis, stream);
cuda_ctx, pow_value, &dim_value, kps::IdentityFunctor<T>(), reduce_axis,
stream);
RenormKernelFunc3<T><<<grid2, block2, 0, stream>>>(
numel, dim_value.mutable_data<T>(context.GetPlace()), p, max_norm);
RenormKernelFunc4<T><<<grid, block, 0, stream>>>(
......@@ -213,10 +214,11 @@ class CUDAGradRenormKernel : public framework::OpKernel<T> {
dim_divisor);
std::vector<int> reduce_axis = {0, 2};
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
pow_value, &dim_value, kps::IdentityFunctor<T>(), reduce_axis, stream);
ctx.cuda_device_context(), pow_value, &dim_value,
kps::IdentityFunctor<T>(), reduce_axis, stream);
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
mul_value, &weight_derivative, kps::IdentityFunctor<T>(), reduce_axis,
stream);
ctx.cuda_device_context(), mul_value, &weight_derivative,
kps::IdentityFunctor<T>(), reduce_axis, stream);
RenormGradKernelFunc2<T><<<grid, block, 0, stream>>>(
x_data, dout_data, dx_data, numel,
dim_value.mutable_data<T>(ctx.GetPlace()),
......
......@@ -389,7 +389,8 @@ class ReshapeKernel {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(ctx.GetPlace())) {
auto &dev_ctx = ctx.device_context<platform::CUDADeviceContext>();
pten::ReshapeKernel(dev_ctx, *in, pt_scalar_shape, out);
pten::ReshapeKernel(static_cast<const pten::GPUContext &>(dev_ctx), *in,
pt_scalar_shape, out);
}
#endif
#ifdef PADDLE_WITH_XPU
......@@ -417,7 +418,8 @@ class ReshapeGradKernel {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(ctx.GetPlace())) {
auto &dev_ctx = ctx.device_context<platform::CUDADeviceContext>();
pten::ReshapeGradKernel(dev_ctx, *d_out, d_x);
pten::ReshapeGradKernel(static_cast<const pten::GPUContext &>(dev_ctx),
*d_out, d_x);
}
#endif
#ifdef PADDLE_WITH_XPU
......@@ -445,7 +447,8 @@ class ReshapeDoubleGradKernel {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(ctx.GetPlace())) {
auto &dev_ctx = ctx.device_context<platform::CUDADeviceContext>();
pten::ReshapeDoubleGradKernel(dev_ctx, *dd_x, dd_out);
pten::ReshapeDoubleGradKernel(
static_cast<const pten::GPUContext &>(dev_ctx), *dd_x, dd_out);
}
#endif
#ifdef PADDLE_WITH_XPU
......
......@@ -183,8 +183,7 @@ void GPUScatterGradForX(const platform::DeviceContext& ctx, const Tensor& index,
int64_t max_grid_dimx =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx)
.GetCUDAMaxGridDimSize()
.x;
.GetCUDAMaxGridDimSize()[0];
int64_t grid = height < max_grid_dimx ? height : max_grid_dimx;
ScatterInitCUDAKernel<T, IndexT><<<
......
......@@ -46,7 +46,8 @@ void ReduceSumForSolve(const Tensor* input, Tensor* output,
#if defined(__NVCC__) || defined(__HIPCC__)
auto stream = ctx.cuda_device_context().stream();
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
*input, output, kps::IdentityFunctor<T>(), reduce_dims, stream);
ctx.cuda_device_context(), *input, output, kps::IdentityFunctor<T>(),
reduce_dims, stream);
#else
ReduceKernelFunctor<DeviceContext, T, ops::SumFunctor>(
input, output, reduce_dims, keep_dim, false, ctx)
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/fluid/operators/strided_memcpy.h"
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
namespace paddle {
namespace operators {
......@@ -86,6 +87,10 @@ TEST(StridedMemcpy, GPUCrop) {
platform::CPUPlace cpu;
platform::CUDADeviceContext ctx(gpu0);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu0, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
auto src_allocation = memory::Alloc(gpu0, sizeof(src));
......@@ -124,6 +129,10 @@ TEST(StridedMemcpy, GPUConcat) {
platform::CUDAPlace gpu0(0);
platform::CPUPlace cpu;
platform::CUDADeviceContext ctx(gpu0);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu0, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
auto gpu_src_allocation = memory::Alloc(gpu0, sizeof(src));
int* gpu_src = reinterpret_cast<int*>(gpu_src_allocation->ptr());
memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
......
......@@ -37,6 +37,10 @@ void CreateCUDATensor(framework::Scope* scope, const std::string& name,
tensor->Resize(dims);
platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
inference::tensorrt::RandomizeTensor(tensor, place, ctx);
}
......@@ -133,6 +137,10 @@ void DynamicShapeTest(bool allow_build_at_runtime) {
framework::Scope scope;
platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
// Prepare variables.
if (allow_build_at_runtime)
CreateCUDATensor(&scope, "x", std::vector<int64_t>({3, 4, 1, 1}));
......@@ -159,6 +167,10 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
framework::Scope scope;
platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
auto* block_ = program.Proto()->add_blocks();
block_->set_idx(0);
......
......@@ -411,7 +411,7 @@ bool SortTopk(const platform::CUDADeviceContext& ctx,
};
int block_size = ComputeBlockSize(num_cols);
unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x;
unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0];
// actually, int num_rows < max_grid_size
unsigned int grid_size = num_rows < maxGridDimX
? static_cast<unsigned int>(num_rows)
......
......@@ -40,7 +40,8 @@ class TraceCUDAKernel : public framework::OpKernel<T> {
std::vector<int> reduce_dims;
reduce_dims.push_back(out->dims().size());
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
diag, out, kps::IdentityFunctor<T>(), reduce_dims, stream);
context.cuda_device_context(), diag, out, kps::IdentityFunctor<T>(),
reduce_dims, stream);
} else {
math::SetConstant<DeviceContext, T> functor;
functor(context.device_context<DeviceContext>(), out, static_cast<T>(0));
......
......@@ -45,7 +45,8 @@ class MatrixReduceSumFunctor<platform::CUDADeviceContext, T> {
}
gpuStream_t stream = ctx.cuda_device_context().stream();
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
in, out, kps::IdentityFunctor<T>(), out_reduce_dims, stream);
ctx.cuda_device_context(), in, out, kps::IdentityFunctor<T>(),
out_reduce_dims, stream);
}
};
......
......@@ -148,7 +148,7 @@ struct Argmax<platform::CUDADeviceContext, T, IndType> {
}
const auto& dev_ctx = ctx.cuda_device_context();
auto cu_stream = dev_ctx.stream();
int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x;
int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
int64_t height = pre * post;
int64_t width = n;
int64_t grid_size = height < max_grid_dimx ? height : max_grid_dimx;
......
......@@ -138,6 +138,7 @@ if(WITH_CNCL)
endif()
if(WITH_GPU OR WITH_ROCM)
target_link_libraries(device_context gpu_info gpu_context pten_gpu_info)
target_link_libraries(device_context gpu_resource_pool)
endif()
......
......@@ -66,6 +66,10 @@ TEST(bfloat16, lod_tensor_on_gpu) {
// CPU LoDTensor to GPU LoDTensor
CUDAPlace gpu_place(0);
CUDADeviceContext gpu_ctx(gpu_place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, gpu_ctx.stream())
.get());
gpu_ctx.PartialInitWithAllocator();
framework::TensorCopy(src_tensor, gpu_place, gpu_ctx, &gpu_tensor);
// GPU LoDTensor to CPU LoDTensor
......
......@@ -15,6 +15,7 @@
#include "paddle/fluid/platform/collective_helper.h"
#include <utility>
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h"
......@@ -187,6 +188,18 @@ NCCLComm* NCCLCommContext::AssignNCCLComm(ncclComm_t comm, int nranks, int rank,
int dev_id, int ring_id) {
std::unique_ptr<CUDADeviceContext> dev_ctx(
new CUDADeviceContext(CUDAPlace(dev_id)));
dev_ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(CUDAPlace(dev_id), dev_ctx->stream())
.get());
dev_ctx->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(CUDAPlace(dev_id))
.get());
dev_ctx->PartialInitWithAllocator();
std::shared_ptr<platform::CudaEventObject> compute_event(
platform::CudaEventResourcePool::Instance().New(dev_id));
......@@ -329,7 +342,7 @@ BKCLComm* BKCLCommContext::AssignBKCLComm(BKCLContext_t comm, int nranks,
auto* dev_ctx = static_cast<platform::XPUDeviceContext*>(
platform::DeviceContextPool::Instance().Get(
platform::XPUPlace(dev_id)));
dev_ctx->set_bkcl_context(comm);
dev_ctx->SetBkclContext(comm);
}
return comm_map_[ring_id][dev_id].get();
......
IF(WITH_GPU)
add_subdirectory(cuda)
nv_library(gpu_info SRCS gpu_info.cc DEPS cuda_info gflags glog enforce monitor dynload_cuda)
nv_library(gpu_info SRCS gpu_info.cc DEPS pten_gpu_info gflags glog enforce monitor dynload_cuda)
nv_test(cuda_helper_test SRCS cuda_helper_test.cu)
nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda)
ELSEIF(WITH_ROCM)
add_subdirectory(rocm)
hip_library(gpu_info SRCS gpu_info.cc DEPS rocm_info gflags glog enforce monitor dynload_cuda)
hip_library(gpu_info SRCS gpu_info.cc DEPS pten_gpu_info gflags glog enforce monitor dynload_cuda)
hip_test(cuda_helper_test SRCS cuda_helper_test.cu)
hip_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda)
......
nv_library(cuda_info SRCS cuda_info.cc DEPS gflags glog enforce monitor dynload_cuda)
nv_library(cuda_graph SRCS cuda_graph.cc DEPS enforce allocator_facade)
nv_library(cuda_profiler SRCS cuda_profiler.cc DEPS enforce)
......
......@@ -14,8 +14,10 @@
#pragma once
#include <functional>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/device/gpu/gpu_types.h"
#include "paddle/fluid/platform/dynload/cublas.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h"
......@@ -96,8 +98,7 @@ class CublasHandleHolder {
PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasDestroy(handle_));
}
template <typename Callback>
inline void Call(Callback&& callback) const {
inline void Call(const std::function<void(blasHandle_t)>& callback) const {
std::lock_guard<std::mutex> guard(mtx_);
callback(handle_);
}
......
......@@ -14,11 +14,13 @@ limitations under the License. */
#pragma once
#include <functional>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/cusparse.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/pten/backends/gpu/gpu_decls.h"
namespace paddle {
namespace platform {
......@@ -45,8 +47,8 @@ class CusparseHandleHolder {
#endif
}
template <typename Callback>
inline void Call(Callback&& callback) const {
inline void Call(
const std::function<void(pten::sparseHandle_t)>& callback) const {
std::lock_guard<std::mutex> guard(mtx_);
callback(handle_);
}
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include <array>
#include <cstdlib>
#include <mutex>
#include <set>
......@@ -39,11 +40,12 @@ limitations under the License. */
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/split.h"
#include "paddle/pten/backends/gpu/gpu_info.h"
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb);
DECLARE_uint64(reallocate_gpu_memory_in_mb);
DECLARE_bool(enable_cublas_tensor_op_math);
DECLARE_string(selected_gpus);
DECLARE_uint64(gpu_memory_limit_mb);
constexpr static float fraction_reserve_gpu_memory = 0.05f;
......@@ -51,23 +53,6 @@ constexpr static float fraction_reserve_gpu_memory = 0.05f;
USE_GPU_MEM_STAT;
namespace paddle {
namespace platform {
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetSelectedDevices() {
// use user specified GPUs in single-node multi-process mode.
std::vector<int> devices;
if (!FLAGS_selected_gpus.empty()) {
auto devices_str = paddle::string::Split(FLAGS_selected_gpus, ',');
for (auto id : devices_str) {
devices.push_back(atoi(id.c_str()));
}
} else {
int count = GetGPUDeviceCount();
for (int i = 0; i < count; ++i) {
devices.push_back(i);
}
}
return devices;
}
void GpuMemoryUsage(size_t *available, size_t *total) {
size_t actual_available, actual_total;
......@@ -382,5 +367,91 @@ void *GetGpuBasePtr(void *ptr, int dev_id) {
return RecordedGpuMallocHelper::Instance(dev_id)->GetBasePtr(ptr);
}
int DnnVersion() { return pten::backends::gpu::DnnVersion(); }
int GetGPUDeviceCount() { return pten::backends::gpu::GetGPUDeviceCount(); }
int GetGPUComputeCapability(int id) {
return pten::backends::gpu::GetGPUComputeCapability(id);
}
int GetGPURuntimeVersion(int id) {
return pten::backends::gpu::GetGPURuntimeVersion(id);
}
int GetGPUDriverVersion(int id) {
return pten::backends::gpu::GetGPUDriverVersion(id);
}
bool TensorCoreAvailable() {
return pten::backends::gpu::TensorCoreAvailable();
}
int GetGPUMultiProcessors(int id) {
return pten::backends::gpu::GetGPUMultiProcessors(id);
}
int GetGPUMaxThreadsPerMultiProcessor(int id) {
return pten::backends::gpu::GetGPUMaxThreadsPerMultiProcessor(id);
}
int GetGPUMaxThreadsPerBlock(int id) {
return pten::backends::gpu::GetGPUMaxThreadsPerBlock(id);
}
int GetCurrentDeviceId() { return pten::backends::gpu::GetCurrentDeviceId(); }
std::array<int, 3> GetGpuMaxGridDimSize(int id) {
return pten::backends::gpu::GetGpuMaxGridDimSize(id);
}
std::vector<int> GetSelectedDevices() {
return pten::backends::gpu::GetSelectedDevices();
}
const gpuDeviceProp &GetDeviceProperties(int id) {
return pten::backends::gpu::GetDeviceProperties(id);
}
void SetDeviceId(int device_id) { pten::backends::gpu::SetDeviceId(device_id); }
gpuError_t GpuGetLastError() { return pten::backends::gpu::GpuGetLastError(); }
void GpuStreamSync(gpuStream_t stream) {
pten::backends::gpu::GpuStreamSync(stream);
}
void GpuDestroyStream(gpuStream_t stream) {
pten::backends::gpu::GpuDestroyStream(stream);
}
void GpuDeviceSync() { pten::backends::gpu::GpuDeviceSync(); }
void GpuMemcpyAsync(void *dst, const void *src, size_t count,
gpuMemcpyKind kind, gpuStream_t stream) {
pten::backends::gpu::GpuMemcpyAsync(dst, src, count, kind, stream);
}
void GpuMemcpySync(void *dst, const void *src, size_t count,
gpuMemcpyKind kind) {
pten::backends::gpu::GpuMemcpySync(dst, src, count, kind);
}
void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
int src_device, size_t count, gpuStream_t stream) {
pten::backends::gpu::GpuMemcpyPeerAsync(dst, dst_device, src, src_device,
count, stream);
}
void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
int src_device, size_t count) {
pten::backends::gpu::GpuMemcpyPeerSync(dst, dst_device, src, src_device,
count);
}
void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) {
pten::backends::gpu::GpuMemsetAsync(dst, value, count, stream);
}
} // namespace platform
} // namespace paddle
......@@ -14,6 +14,7 @@ limitations under the License. */
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include <stddef.h>
#include <array>
#include <string>
#include <vector>
......@@ -52,7 +53,7 @@ int GetGPUMaxThreadsPerBlock(int id);
int GetCurrentDeviceId();
//! Get the maximum GridDim size for GPU buddy allocator.
dim3 GetGpuMaxGridDimSize(int);
std::array<int, 3> GetGpuMaxGridDimSize(int);
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetSelectedDevices();
......@@ -110,7 +111,7 @@ void GpuStreamSync(gpuStream_t stream);
void GpuDestroyStream(gpuStream_t stream);
// ! Blocks until device has completed all operations.
void GpuDeviceync();
void GpuDeviceSync();
//! CudaMalloc with recorded info
gpuError_t RecordedGpuMalloc(void **ptr, size_t size, int dev_id);
......
......@@ -83,8 +83,21 @@ struct NCCLContext {
std::unique_ptr<CUDADeviceContext> ctx_;
ncclComm_t comm_;
explicit NCCLContext(int dev_id)
: ctx_(new CUDADeviceContext(CUDAPlace(dev_id))), comm_{nullptr} {}
explicit NCCLContext(int dev_id) : comm_{nullptr} {
ctx_.reset(new CUDADeviceContext(CUDAPlace(dev_id)));
ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(CUDAPlace(dev_id), ctx_->stream())
.get());
ctx_->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
ctx_->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(CUDAPlace(dev_id))
.get());
ctx_->PartialInitWithAllocator();
}
gpuStream_t stream() const { return ctx_->stream(); }
ncclComm_t comm() const { return comm_; }
......
hip_library(rocm_info SRCS rocm_info.cc DEPS gflags glog enforce monitor dynload_cuda)
hip_test(miopen_helper_test SRCS miopen_helper_test.cc DEPS dynload_cuda)
......@@ -10,8 +10,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/device_context.h"
#include <functional>
#include <memory>
#include <set>
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/stream/cuda_stream.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
......@@ -149,16 +153,17 @@ inline void EmplaceDeviceContext(
cuda_ctx,
platform::errors::InvalidArgument(
"Failed to dynamic_cast dev_ctx into CUDADeviceContext."));
dev_ctx->SetDeviceAllocator(
memory::allocation::AllocatorFacade::Instance()
.GetAllocator(p, cuda_ctx->context()->RawStream())
.get());
// Note: A trick method to init context, why GetAllocator interface
// needs a stream parameter?
dev_ctx->SetAllocator(memory::allocation::AllocatorFacade::Instance()
.GetAllocator(p, cuda_ctx->stream())
.get());
cuda_ctx->PartialInitWithAllocator();
#endif
} else {
dev_ctx->SetDeviceAllocator(
memory::allocation::AllocatorFacade::Instance()
.GetAllocator(p)
.get());
dev_ctx->SetAllocator(memory::allocation::AllocatorFacade::Instance()
.GetAllocator(p)
.get());
}
dev_ctx->SetHostAllocator(
memory::allocation::AllocatorFacade::Instance()
......@@ -251,14 +256,18 @@ DeviceContextPool::DeviceContextPool(
}
}
CPUDeviceContext::CPUDeviceContext() : pten::CPUContext() {}
CPUDeviceContext::CPUDeviceContext() : pten::CPUContext() {
pten::CPUContext::Init();
}
CPUDeviceContext::CPUDeviceContext(CPUPlace place) : pten::CPUContext() {}
CPUDeviceContext::CPUDeviceContext(CPUPlace place) : pten::CPUContext(place) {
pten::CPUContext::Init();
}
#ifdef PADDLE_WITH_IPU
IPUDeviceContext::IPUDeviceContext(IPUPlace place) : place_(place) {}
Place IPUDeviceContext::GetPlace() const { return place_; }
const Place& IPUDeviceContext::GetPlace() const { return place_; }
void IPUDeviceContext::Wait() const {
/*! \brief Wait for all operations completion in the stream. */
......@@ -268,11 +277,14 @@ IPUDeviceContext::~IPUDeviceContext() {}
#endif
#ifdef PADDLE_WITH_XPU
XPUDeviceContext::XPUDeviceContext() : pten::XPUContext() {}
XPUDeviceContext::XPUDeviceContext() : pten::XPUContext() {
pten::XPUContext::Init();
}
XPUDeviceContext::~XPUDeviceContext() {}
XPUDeviceContext::XPUDeviceContext(XPUPlace place) : pten::XPUContext(place) {
pten::XPUContext::Init();
LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: "
<< static_cast<int>(place.device);
}
......@@ -302,7 +314,7 @@ void NPUDeviceContext::Wait() const {
aclrtStream NPUDeviceContext::stream() const { return stream_->raw_stream(); }
Place NPUDeviceContext::GetPlace() const { return place_; }
const Place& NPUDeviceContext::GetPlace() const { return place_; }
aclrtContext NPUDeviceContext::context() const { return context_; }
......@@ -319,7 +331,7 @@ Eigen::DefaultDevice* NPUPinnedDeviceContext::eigen_device() const {
return eigen_device_.get();
}
Place NPUPinnedDeviceContext::GetPlace() const { return place_; }
const Place& NPUPinnedDeviceContext::GetPlace() const { return place_; }
#endif
......@@ -470,102 +482,28 @@ CUDAContext::~CUDAContext() {
#endif
}
CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
CUDADeviceGuard guard(place_.device);
compute_capability_ = GetGPUComputeCapability(place_.device);
multi_process_ = GetGPUMultiProcessors(place_.device);
max_threads_per_mp_ = GetGPUMaxThreadsPerMultiProcessor(place_.device);
max_grid_dim_size_ = GetGpuMaxGridDimSize(place_.device);
max_threads_per_block_ = GetGPUMaxThreadsPerBlock(place_.device);
driver_version_ = GetGPUDriverVersion(place_.device);
runtime_version_ = GetGPURuntimeVersion(place_.device);
LOG_FIRST_N(WARNING, 1) << "Please NOTE: device: "
<< static_cast<int>(place_.device)
<< ", GPU Compute Capability: "
<< compute_capability_ / 10 << "."
<< compute_capability_ % 10
<< ", Driver API Version: " << driver_version_ / 1000
<< "." << (driver_version_ % 100) / 10
<< ", Runtime API Version: "
<< runtime_version_ / 1000 << "."
<< (runtime_version_ % 100) / 10;
#ifdef PADDLE_WITH_HIP
size_t version_major, version_minor, version_patch;
PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenGetVersion(
&version_major, &version_minor, &version_patch));
LOG_FIRST_N(WARNING, 1) << "device: " << static_cast<int>(place_.device)
<< ", MIOpen Version: " << version_major << "."
<< version_minor << "." << version_patch;
#else
size_t cudnn_dso_ver = dynload::cudnnGetVersion();
LOG_FIRST_N(WARNING, 1) << "device: " << static_cast<int>(place_.device)
<< ", cuDNN Version: " << cudnn_dso_ver / 1000 << "."
<< (cudnn_dso_ver % 1000) / 100 << ".";
#endif
{
// Check CUDA/CUDNN version compatiblity
auto local_cuda_version =
(driver_version_ / 1000) * 10 + (driver_version_ % 100) / 10;
#ifdef PADDLE_WITH_HIP
auto compile_cuda_version = (HIP_VERSION / 100) * 10 + (HIP_VERSION % 10);
#else
auto compile_cuda_version =
(CUDA_VERSION / 1000) * 10 + (CUDA_VERSION % 100) / 10;
#endif
if (local_cuda_version < compile_cuda_version) {
LOG_FIRST_N(WARNING, 1)
<< "WARNING: device: " << static_cast<int>(place_.device)
<< ". The installed Paddle is compiled with CUDA "
<< compile_cuda_version / 10 << "." << compile_cuda_version % 10
<< ", but CUDA runtime version in your machine is "
<< local_cuda_version / 10 << "." << local_cuda_version % 10
<< ", which may cause serious incompatible bug. "
<< "Please recompile or reinstall Paddle with compatible CUDA "
"version.";
}
}
default_ctx_.reset(new CUDAContext(place_));
}
CUDADeviceContext::~CUDADeviceContext() {
SetDeviceId(place_.device);
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
if (nccl_comm_) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclCommDestroy(nccl_comm_));
}
#endif
}
Place CUDADeviceContext::GetPlace() const { return place_; }
void CUDADeviceContext::Wait() const { context()->Stream()->Wait(); }
int CUDADeviceContext::GetComputeCapability() const {
return compute_capability_;
}
int CUDADeviceContext::GetMaxPhysicalThreadCount() const {
return multi_process_ * max_threads_per_mp_;
CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
: pten::GPUContext(place) {
pten::GPUContext::PartialInitWithoutAllocator();
cuda_stream_.reset(
new stream::CUDAStream(pten::GPUContext::stream(), this->GetPlace()));
}
int CUDADeviceContext::GetSMCount() const { return multi_process_; }
int CUDADeviceContext::GetMaxThreadsPerBlock() const {
return max_threads_per_block_;
}
CUDADeviceContext::~CUDADeviceContext() = default;
Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
return context()->EigenDevice().get();
}
bool CUDADeviceContext::tensor_core_available() const {
return context()->CublasTensorCoreHandle() != nullptr;
if (thread_ctx_.count(this)) {
return context()->EigenDevice().get();
}
return pten::GPUContext::eigen_device();
}
dim3 CUDADeviceContext::GetCUDAMaxGridDimSize() const {
return max_grid_dim_size_;
void CUDADeviceContext::Wait() const {
if (thread_ctx_.count(this)) {
context()->Stream()->Wait();
return;
}
pten::GPUContext::Wait();
}
#ifdef PADDLE_WITH_HIP
......@@ -573,33 +511,96 @@ miopenHandle_t CUDADeviceContext::cudnn_handle() const {
#else
cudnnHandle_t CUDADeviceContext::cudnn_handle() const {
#endif
return context()->CudnnHandle();
if (thread_ctx_.count(this)) {
return context()->CudnnHandle();
}
return pten::GPUContext::cudnn_handle();
}
#ifdef PADDLE_WITH_HIP
rocblas_handle CUDADeviceContext::cublas_handle() const {
return context()->CublasHandle()->GetCublasHandle();
if (thread_ctx_.count(this)) {
return context()->CublasHandle()->GetCublasHandle();
}
return pten::GPUContext::cublas_handle();
}
#else
cublasHandle_t CUDADeviceContext::cublas_handle() const {
return context()->CublasHandle()->GetCublasHandle();
if (thread_ctx_.count(this)) {
return context()->CublasHandle()->GetCublasHandle();
}
return pten::GPUContext::cublas_handle();
}
cusparseHandle_t CUDADeviceContext::cusparse_handle() const {
return context()->CusparseHandle()->GetCusparseHandle();
if (thread_ctx_.count(this)) {
return context()->CusparseHandle()->GetCusparseHandle();
}
return pten::GPUContext::cusparse_handle();
}
cusolverDnHandle_t CUDADeviceContext::cusolver_dn_handle() const {
if (thread_ctx_.count(this)) {
return context()->CusolverDnHandle();
}
return pten::GPUContext::cusolver_dn_handle();
}
#endif
void CUDADeviceContext::RecordEvent(
gpuEvent_t ev, const std::function<void()>& callback) const {
if (thread_ctx_.count(this)) {
context()->Stream()->RecordEvent(ev, callback);
return;
}
pten::GPUContext::RecordEvent(ev, callback);
}
void CUDADeviceContext::AddStreamCallback(
const std::function<void()>& callback) const {
if (thread_ctx_.count(this)) {
context()->Stream()->AddCallback(callback);
return;
}
pten::GPUContext::AddStreamCallback(callback);
}
void CUDADeviceContext::WaitStreamCallback() const {
if (thread_ctx_.count(this)) {
context()->Stream()->WaitCallback();
return;
}
pten::GPUContext::WaitStreamCallback();
}
CudnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const {
return CudnnWorkspaceHandle(*this, &cudnn_handle_mtx_);
}
#ifndef PADDLE_WITH_HIP
cusolverDnHandle_t CUDADeviceContext::cusolver_dn_handle() const {
return context()->CusolverDnHandle();
gpuStream_t CUDADeviceContext::stream() const {
if (thread_ctx_.count(this)) {
return context()->RawStream();
}
return pten::GPUContext::stream();
}
#endif
gpuStream_t CUDADeviceContext::stream() const { return context()->RawStream(); }
std::shared_ptr<CUDAContext> CUDADeviceContext::context() const {
if (!thread_ctx_.count(this)) {
PADDLE_THROW(platform::errors::PermissionDenied(
"CUDADeviceContext call context() failed, make sure in the "
"thread_local semantic."));
}
return thread_ctx_.at(this);
}
stream::CUDAStream* CUDADeviceContext::GetCudaStream() const {
return cuda_stream_.get();
}
stream::CUDAStream* CUDADeviceContext::SetCudaStream(
stream::CUDAStream* new_stream_ptr) {
auto* old_stream_ptr = cuda_stream_.release();
cuda_stream_.reset(new_stream_ptr);
return old_stream_ptr;
}
CUDAPinnedDeviceContext::CUDAPinnedDeviceContext() {
eigen_device_.reset(new Eigen::DefaultDevice());
......@@ -614,7 +615,7 @@ Eigen::DefaultDevice* CUDAPinnedDeviceContext::eigen_device() const {
return eigen_device_.get();
}
Place CUDAPinnedDeviceContext::GetPlace() const { return place_; }
const Place& CUDAPinnedDeviceContext::GetPlace() const { return place_; }
#endif
#ifdef PADDLE_WITH_MKLDNN
......
......@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <functional>
#include <future> // NOLINT
#include <memory>
#include <mutex> // NOLINT
......@@ -18,7 +19,9 @@ limitations under the License. */
#include <utility>
#include <vector>
#include "paddle/fluid/platform/device/gpu/gpu_types.h"
#include "paddle/pten/backends/cpu/cpu_context.h"
#include "paddle/pten/backends/gpu/gpu_decls.h"
#include "paddle/pten/core/device_context.h"
#include "paddle/fluid/memory/malloc.h"
......@@ -28,6 +31,7 @@ limitations under the License. */
#include "paddle/fluid/platform/dynload/cudnn.h"
#include "paddle/fluid/platform/dynload/cusolver.h"
#include "paddle/fluid/platform/dynload/cusparse.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/dynload/nccl.h"
#endif
......@@ -38,6 +42,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_helper.h" // NOLINT
#include "paddle/fluid/platform/dynload/miopen.h"
#include "paddle/fluid/platform/dynload/rocblas.h"
#include "paddle/pten/backends/gpu/gpu_context.h" // NOLINT
#if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/platform/dynload/rccl.h"
#endif
......@@ -145,7 +150,7 @@ class IPUDeviceContext : public DeviceContext {
explicit IPUDeviceContext(IPUPlace place);
virtual ~IPUDeviceContext();
Eigen::DefaultDevice* eigen_device() const { return nullptr; }
Place GetPlace() const override;
const Place& GetPlace() const override;
/*! \brief Wait for all operations completion in the stream. */
void Wait() const override;
......@@ -187,7 +192,7 @@ class NPUDeviceContext : public DeviceContext {
explicit NPUDeviceContext(NPUPlace place);
virtual ~NPUDeviceContext();
Eigen::DefaultDevice* eigen_device() const { return nullptr; }
Place GetPlace() const override;
const Place& GetPlace() const override;
aclrtContext context() const;
/*! \brief Wait for all operations completion in the stream. */
......@@ -247,7 +252,7 @@ class NPUPinnedDeviceContext : public DeviceContext {
NPUPinnedDeviceContext();
explicit NPUPinnedDeviceContext(NPUPinnedPlace place);
Place GetPlace() const override;
const Place& GetPlace() const override;
Eigen::DefaultDevice* eigen_device() const;
......@@ -326,20 +331,20 @@ class CUDAContext {
#endif
/*! \brief Call cublas function safely. */
template <typename Callback>
inline void CublasCall(Callback&& callback) const {
inline void CublasCall(
const std::function<void(blasHandle_t)>& callback) const {
if (cublas_tf32_tensor_core_handle_) {
cublas_tf32_tensor_core_handle_->Call(std::forward<Callback>(callback));
cublas_tf32_tensor_core_handle_->Call(callback);
} else {
cublas_handle_->Call(std::forward<Callback>(callback));
cublas_handle_->Call(callback);
}
}
#ifndef PADDLE_WITH_HIP
/*! \brief Call cusparse function safely. */
template <typename Callback>
inline void CusparseCall(Callback&& callback) const {
cusparse_handle_->Call(std::forward<Callback>(callback));
inline void CusparseCall(
const std::function<void(pten::sparseHandle_t)>& callback) const {
cusparse_handle_->Call(callback);
}
#endif
......@@ -348,12 +353,12 @@ class CUDAContext {
/*! \brief Call cublas function with Tensor Core safely. If
Tensor Core is not available, use DEFAULT_MATH instead. */
template <typename Callback>
inline void TensorCoreCublasCallIfAvailable(Callback&& callback) const {
inline void TensorCoreCublasCallIfAvailable(
const std::function<void(blasHandle_t)>& callback) const {
if (cublas_tensor_core_handle_) {
cublas_tensor_core_handle_->Call(std::forward<Callback>(callback));
cublas_tensor_core_handle_->Call(callback);
} else {
cublas_handle_->Call(std::forward<Callback>(callback));
cublas_handle_->Call(callback);
}
}
......@@ -491,7 +496,7 @@ class CUDAContext {
DISABLE_COPY_AND_ASSIGN(CUDAContext);
};
class CUDADeviceContext : public DeviceContext {
class CUDADeviceContext : public pten::GPUContext {
public:
explicit CUDADeviceContext(CUDAPlace place);
virtual ~CUDADeviceContext();
......@@ -499,49 +504,40 @@ class CUDADeviceContext : public DeviceContext {
/*! \brief Wait for all operations completion in the stream. */
void Wait() const override;
/*! \brief Return place in the device context. */
Place GetPlace() const override;
/*! \brief Return compute capability in the device context. */
int GetComputeCapability() const;
/*! \brief Return the max physical thread count in the device context */
int GetMaxPhysicalThreadCount() const;
/*! \brief Return the SM count in the device context */
int GetSMCount() const;
/*! \brief Return the Max thread num of block in the device context */
int GetMaxThreadsPerBlock() const;
/*! \brief Return the max grid dim size in the device context */
dim3 GetCUDAMaxGridDimSize() const;
/*! \brief Return eigen device in the device context. */
Eigen::GpuDevice* eigen_device() const;
/*! \brief Call cublas function safely. */
template <typename Callback>
inline void CublasCall(Callback&& callback) const {
inline void CublasCall(
const std::function<void(blasHandle_t)>& callback) const {
if (!thread_ctx_.count(this)) {
pten::GPUContext::CublasCall(callback);
return;
}
return context()->CublasCall(callback);
}
#ifndef PADDLE_WITH_HIP
/*! \brief Call cusparse function safely. */
template <typename Callback>
inline void CusparseCall(Callback&& callback) const {
return context()->CusparseCall(callback);
inline void CusparseCall(
const std::function<void(pten::sparseHandle_t)>& callback) const {
if (!thread_ctx_.count(this)) {
pten::GPUContext::CusparseCall(callback);
return;
}
context()->CusparseCall(callback);
}
#endif
/*! \brief Check whether tensor core is supported */
bool tensor_core_available() const;
/*! \brief Call cublas function with Tensor Core safely. If
Tensor Core is not available, use DEFAULT_MATH instead. */
template <typename Callback>
inline void TensorCoreCublasCallIfAvailable(Callback&& callback) const {
return context()->TensorCoreCublasCallIfAvailable(callback);
inline void TensorCoreCublasCallIfAvailable(
const std::function<void(blasHandle_t)>& callback) const {
if (!thread_ctx_.count(this)) {
pten::GPUContext::TensorCoreCublasCallIfAvailable(callback);
return;
}
context()->TensorCoreCublasCallIfAvailable(callback);
}
/*! \brief Return cudnn handle in the device context. */
......@@ -559,6 +555,10 @@ class CUDADeviceContext : public DeviceContext {
cusparseHandle_t cusparse_handle() const;
#endif
#ifndef PADDLE_WITH_HIP
cusolverDnHandle_t cusolver_dn_handle() const;
#endif
/*! \brief Return a cudnn workspace handle to call multiple cudnn
* functions without interrupting by other threads.
* Once the first cudnn function is called by the handle, a lock
......@@ -568,60 +568,33 @@ class CUDADeviceContext : public DeviceContext {
* sequential cudnn function calls. */
CudnnWorkspaceHandle cudnn_workspace_handle() const;
#ifndef PADDLE_WITH_HIP
cusolverDnHandle_t cusolver_dn_handle() const;
#endif
/*! \brief Return cuda stream in the device context. */
gpuStream_t stream() const;
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
/*! \brief Return nccl communicators. */
ncclComm_t nccl_comm() const { return nccl_comm_; }
/*! \brief Set nccl communicators. */
void set_nccl_comm(ncclComm_t comm) { nccl_comm_ = comm; }
#endif
template <typename Callback>
void RecordEvent(gpuEvent_t ev, Callback callback) const {
return context()->Stream()->RecordEvent(ev, callback);
}
template <typename Callback>
void AddStreamCallback(Callback&& callback) const {
return context()->Stream()->AddCallback(callback);
}
void RecordEvent(gpuEvent_t ev, const std::function<void()>& callback) const;
void WaitStreamCallback() const {
return context()->Stream()->WaitCallback();
}
void AddStreamCallback(const std::function<void()>& callback) const;
void ResetDefaultContext(const stream::Priority& priority) {
default_ctx_.reset(new CUDAContext(place_, priority));
}
void WaitStreamCallback() const;
void ResetThreadContext(const stream::Priority& priority) {
std::lock_guard<std::mutex> guard(ctx_mtx_);
thread_ctx_[this].reset(new CUDAContext(place_, priority));
thread_ctx_[this].reset(new CUDAContext(this->GetPlace(), priority));
}
std::shared_ptr<CUDAContext> context() const {
if (!thread_ctx_.count(this)) {
return default_ctx_;
}
return thread_ctx_.at(this);
}
std::shared_ptr<CUDAContext> context() const;
// Note: Can only be used under thread_local semantics.
void SetThreadLocalStream(const gpuStream_t stream) {
thread_ctx_.at(this)->SetStream(stream);
}
private:
CUDAPlace place_;
std::shared_ptr<CUDAContext> default_ctx_;
// NOTE: Just for compatibility with the past, please delete if there is an
// elegant way.
stream::CUDAStream* GetCudaStream() const;
stream::CUDAStream* SetCudaStream(stream::CUDAStream*);
private:
// The thread_local static variable will be released before the
// global static variable, so avoid using it in dtor.
static thread_local std::unordered_map<const CUDADeviceContext*,
......@@ -631,22 +604,9 @@ class CUDADeviceContext : public DeviceContext {
mutable std::mutex cudnn_handle_mtx_;
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
// NCCL communicator (single process version) for NCCL collective operations.
// NCCL collective operations provides fast collectives over multiple GPUs
// both within and across nodes.
// But, this collectives is used for collectives over multiple GPUs within
// nodes.
ncclComm_t nccl_comm_{nullptr};
#endif
int compute_capability_;
int runtime_version_;
int driver_version_;
int multi_process_;
int max_threads_per_mp_;
int max_threads_per_block_;
dim3 max_grid_dim_size_;
// NOTE: Just for compatibility with the past, please delete if there is an
// elegant way.
std::unique_ptr<stream::CUDAStream> cuda_stream_;
DISABLE_COPY_AND_ASSIGN(CUDADeviceContext);
};
......@@ -711,7 +671,7 @@ class CUDAPinnedDeviceContext : public DeviceContext {
CUDAPinnedDeviceContext();
explicit CUDAPinnedDeviceContext(CUDAPinnedPlace place);
Place GetPlace() const override;
const Place& GetPlace() const override;
Eigen::DefaultDevice* eigen_device() const;
......
......@@ -17,6 +17,7 @@ limitations under the License. */
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
TEST(Device, Init) {
using paddle::platform::DeviceContext;
......@@ -26,6 +27,20 @@ TEST(Device, Init) {
int count = paddle::platform::GetGPUDeviceCount();
for (int i = 0; i < count; i++) {
CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i));
device_context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(CUDAPlace(i), device_context->stream())
.get());
device_context->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
device_context->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(CUDAPlace(i))
.get());
device_context->PartialInitWithAllocator();
Eigen::GpuDevice* gpu_device = device_context->eigen_device();
ASSERT_NE(nullptr, gpu_device);
delete device_context;
......@@ -39,6 +54,19 @@ TEST(Device, CUDADeviceContext) {
int count = paddle::platform::GetGPUDeviceCount();
for (int i = 0; i < count; i++) {
CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i));
device_context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(CUDAPlace(i), device_context->stream())
.get());
device_context->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
device_context->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(CUDAPlace(i))
.get());
device_context->PartialInitWithAllocator();
Eigen::GpuDevice* gpu_device = device_context->eigen_device();
ASSERT_NE(nullptr, gpu_device);
#ifdef PADDLE_WITH_HIP
......
......@@ -53,7 +53,7 @@ void DeviceEventRecordCUDA(DeviceEvent* event, const DeviceContext* context) {
platform::errors::PreconditionNotMet(
"Failed to dynamic_cast context into CUDADeviceContext."));
wrapper->inner_event_.Record(*cuda_dev_ctx->context()->Stream());
wrapper->inner_event_.Record(cuda_dev_ctx->stream());
}
bool DeviceEventQueryCUDA(const DeviceEvent* event) {
......@@ -82,8 +82,7 @@ void DeviceEventCUDAWaitCUDA(const DeviceEvent* event,
platform::errors::PreconditionNotMet(
"Failed to dynamic_cast context into CUDADeviceContext."));
// calling cudaStreamWaitEvent(stream, event, 0)
cuda_dev_ctx->context()->Stream()->WaitEvent(
wrapper->inner_event_.GetRawCudaEvent());
cuda_dev_ctx->WaitEvent(wrapper->inner_event_.GetRawCudaEvent());
}
void DeviceEventCPUWaitCUDA(const DeviceEvent* event,
......
......@@ -15,6 +15,7 @@
#include "paddle/fluid/platform/device_event.h"
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/fluid/platform/place.h"
using ::paddle::platform::kCUDA;
using ::paddle::platform::kCPU;
......@@ -38,9 +39,11 @@ TEST(DeviceEvent, CUDA) {
// case 1. test for event_creator
DeviceEvent event(place);
ASSERT_NE(event.GetEvent().get(), nullptr);
bool status = event.Query();
ASSERT_EQ(status, true);
// case 2. test for event_recorder
event.Record(context);
bool status = event.Query();
status = event.Query();
ASSERT_EQ(status, false);
// case 3. test for event_finisher
event.Finish();
......
......@@ -539,7 +539,7 @@ inline void retry_sleep(unsigned milliseconds) {
::paddle::platform::details::ExternalApiType< \
__CUDA_STATUS_TYPE__>::kSuccess; \
while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \
retry_sleep(FLAGS_gpu_allocator_retry_time); \
paddle::platform::retry_sleep(FLAGS_gpu_allocator_retry_time); \
__cond__ = (COND); \
++retry_count; \
} \
......@@ -727,7 +727,7 @@ inline void retry_sleep(unsigned millisecond) {
::paddle::platform::details::ExternalApiType< \
__CUDA_STATUS_TYPE__>::kSuccess; \
while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \
retry_sleep(FLAGS_gpu_allocator_retry_time); \
::paddle::platform::retry_sleep(FLAGS_gpu_allocator_retry_time); \
__cond__ = (COND); \
++retry_count; \
} \
......
......@@ -152,11 +152,11 @@ class CudaEvent {
#endif
}
void Record(const paddle::platform::stream::CUDAStream &stream) {
void Record(gpuStream_t stream) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, stream.raw_stream()));
PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, stream));
#else
PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, stream.raw_stream()));
PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, stream));
#endif
}
......
......@@ -328,6 +328,10 @@ TEST(float16, lod_tensor_on_gpu) {
// CPU LoDTensor to GPU LoDTensor
CUDAPlace gpu_place(0);
CUDADeviceContext gpu_ctx(gpu_place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, gpu_ctx.stream())
.get());
gpu_ctx.PartialInitWithAllocator();
framework::TensorCopy(src_tensor, gpu_place, gpu_ctx, &gpu_tensor);
// GPU LoDTensor to CPU LoDTensor
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/pten/backends/cpu/cpu_context.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
namespace paddle {
namespace platform {
......@@ -72,6 +73,7 @@ __global__ static void ForRangeElemwiseOp(Function func, size_t limit) {
}
}
// NOTE: After the pten kernel is migrated, it needs to be deleted.
template <>
struct ForRange<CUDADeviceContext> {
ForRange(const CUDADeviceContext& dev_ctx, size_t limit)
......@@ -106,6 +108,40 @@ struct ForRange<CUDADeviceContext> {
size_t limit_;
};
template <>
struct ForRange<pten::GPUContext> {
ForRange(const pten::GPUContext& dev_ctx, size_t limit)
: dev_ctx_(dev_ctx), limit_(static_cast<size_t>(limit)) {}
template <typename Function>
inline void operator()(Function func) const {
#ifdef __HIPCC__
// HIP will throw core dump when threads > 256
constexpr int num_threads = 256;
#elif WITH_NV_JETSON
// JETSON_NANO will throw core dump when threads > 128
int num_thread = 256;
platform::ChangeThreadNum(dev_ctx_, &num_thread, 128);
const int num_threads = num_thread;
#else
constexpr int num_threads = 1024;
#endif
size_t block_size = limit_ <= num_threads ? limit_ : num_threads;
size_t grid_size = (limit_ + num_threads - 1) / num_threads;
if (grid_size == 1) {
ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>(
func);
} else {
ForRangeElemwiseOp<<<grid_size, block_size, 0, dev_ctx_.stream()>>>(
func, limit_);
}
}
const pten::GPUContext& dev_ctx_;
size_t limit_;
};
#endif
} // namespace platform
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/fluid/platform/stream/cuda_stream.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/device/gpu/gpu_types.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
......@@ -116,11 +117,8 @@ CUDAStream* get_current_stream(int deviceId) {
platform::Place device = CUDAPlace(deviceId);
auto stream = static_cast<platform::CUDADeviceContext*>(pool.Get(device))
->context()
->Stream()
.get();
return stream;
return static_cast<platform::CUDADeviceContext*>(pool.Get(device))
->GetCudaStream();
#else
PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with CUDA. Cannot visit cuda current stream."));
......@@ -133,12 +131,12 @@ CUDAStream* set_current_stream(CUDAStream* stream) {
auto& device = stream->GetPlace();
auto& pool = platform::DeviceContextPool::Instance();
return static_cast<platform::CUDADeviceContext*>(pool.Get(device))
->context()
->SetStream(stream);
->SetCudaStream(stream);
#else
PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with CUDA. Cannot visit cuda current stream."));
return nullptr;
"Paddle is not compiled with CUDA. Cannot visit cuda current"
"stream."));
return CUDAStream(nullptr);
#endif
}
} // namespace stream
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <cstdint>
#include <functional>
#include <memory>
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
......@@ -51,24 +52,28 @@ class CUDAStream final {
const StreamFlag& flag = StreamFlag::kDefaultFlag) {
Init(place, priority, flag);
}
explicit CUDAStream(gpuStream_t stream, const Place& place)
: place_(place), stream_(stream) {
owned_stream_ = false;
callback_manager_.reset(new StreamCallbackManager<gpuStream_t>(stream_));
}
virtual ~CUDAStream() { Destroy(); }
bool Init(const Place& place, const Priority& priority = Priority::kNormal,
const StreamFlag& flag = StreamFlag::kDefaultFlag);
template <typename Callback>
void AddCallback(Callback&& callback) const {
void AddCallback(std::function<void()> callback) const {
callback_manager_->AddCallback(callback);
}
template <typename Callback>
#ifdef PADDLE_WITH_HIP
void RecordEvent(hipEvent_t ev, Callback callback) const {
void RecordEvent(hipEvent_t ev, const std::function<void()>& callback) const {
callback();
PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(ev, stream_));
}
#else
void RecordEvent(cudaEvent_t ev, Callback callback) const {
void RecordEvent(cudaEvent_t ev,
const std::function<void()>& callback) const {
callback();
PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(ev, stream_));
}
......@@ -149,6 +154,7 @@ class CUDAStream final {
};
CUDAStream* get_current_stream(int deviceId);
// NOTE: There is a problem with the interface and needs to be fixed
CUDAStream* set_current_stream(CUDAStream* stream);
} // namespace stream
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/platform/transform.h"
......@@ -57,6 +58,10 @@ TEST(Transform, CPUUnary) {
TEST(Transform, GPUUnary) {
CUDAPlace gpu0(0);
CUDADeviceContext ctx(gpu0);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu0, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4};
auto gpu_allocation = Alloc(gpu0, sizeof(float) * 4);
float* gpu_buf = static_cast<float*>(gpu_allocation->ptr());
......@@ -84,6 +89,10 @@ TEST(Transform, GPUBinary) {
int buf[4] = {1, 2, 3, 4};
CUDAPlace gpu0(0);
CUDADeviceContext ctx(gpu0);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu0, ctx.stream())
.get());
ctx.PartialInitWithAllocator();
auto gpu_allocation = Alloc(gpu0, sizeof(buf));
int* gpu_buf = static_cast<int*>(gpu_allocation->ptr());
Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream());
......
......@@ -34,7 +34,7 @@ void BindCudaStream(py::module *m_ptr) {
return paddle::platform::stream::get_current_stream(deviceId);
#else
PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with CUDA. Cannot visit cuda current "
"Paddle is not compiled with CUDA. Cannot visit cuda current"
"stream."));
#endif
},
......@@ -119,7 +119,7 @@ void BindCudaStream(py::module *m_ptr) {
[](paddle::platform::stream::CUDAStream &self,
paddle::platform::stream::CUDAStream &stream) {
paddle::platform::CudaEvent event;
event.Record(stream);
event.Record(stream.raw_stream());
self.WaitEvent(event.GetRawCudaEvent());
},
......@@ -179,7 +179,7 @@ void BindCudaStream(py::module *m_ptr) {
if (event == nullptr) {
event = new paddle::platform::CudaEvent();
}
event->Record(self);
event->Record(self.raw_stream());
return event;
},
R"DOC(
......@@ -321,7 +321,7 @@ void BindCudaStream(py::module *m_ptr) {
if (stream == nullptr) {
stream = paddle::platform::stream::get_current_stream(-1);
}
self.Record(*stream);
self.Record(stream->raw_stream());
},
R"DOC(
Records the event in the given stream.
......
......@@ -1596,7 +1596,20 @@ All parameter, weight, gradient are variables in Paddle.
.def_static("create",
[](paddle::platform::CPUPlace& place)
-> paddle::platform::DeviceContext* {
return new paddle::platform::CPUDeviceContext();
auto* context = new paddle::platform::CPUDeviceContext();
context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place)
.get());
context->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
context->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place)
.get());
return context;
})
.def_static("create",
[](paddle::platform::XPUPlace& place)
......@@ -1607,7 +1620,20 @@ All parameter, weight, gradient are variables in Paddle.
"Cannot use XPUPlace in CPU/GPU version, "
"Please recompile or reinstall Paddle with XPU support."));
#else
return new paddle::platform::XPUDeviceContext(place);
auto* context = new paddle::platform::XPUDeviceContext(place);
context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place)
.get());
context->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
context->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place)
.get());
return context;
#endif
})
.def_static("create",
......@@ -1643,7 +1669,21 @@ All parameter, weight, gradient are variables in Paddle.
"Cannot use CUDAPlace in CPU only version, "
"Please recompile or reinstall Paddle with CUDA support."));
#else
return new paddle::platform::CUDADeviceContext(place);
auto* context = new paddle::platform::CUDADeviceContext(place);
context->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, context->stream())
.get());
context->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
context->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place)
.get());
context->PartialInitWithAllocator();
return context;
#endif
})
.def_static("create",
......
......@@ -2,6 +2,10 @@ add_subdirectory(dynload)
add_subdirectory(cpu)
if(WITH_GPU OR WITH_ROCM)
add_subdirectory(gpu)
endif()
if(WITH_XPU)
add_subdirectory(xpu)
endif()
......@@ -11,3 +15,7 @@ cc_library(pten_context SRCS all_context.cc DEPS device_context cpu_context)
if(WITH_XPU)
add_dependencies(pten_context xpu_context)
endif()
if(WITH_GPU)
add_dependencies(pten_context gpu_context)
endif()
......@@ -15,75 +15,59 @@
#include "paddle/pten/backends/cpu/cpu_context.h"
#include "paddle/pten/api/ext/exception.h"
#include "paddle/pten/common/place.h"
// NOTE: The paddle framework should add WITH_EIGEN option to support compile
// without eigen.
#include "paddle/pten/core/device_context.h"
#include "unsupported/Eigen/CXX11/Tensor"
namespace pten {
struct CPUContext::CPUImpl {
CPUImpl() { device_ = new Eigen::DefaultDevice(); }
struct CPUContext::Impl {
Impl() : place_(CPUPlace()) {}
// Users need to manage external resources.
explicit CPUImpl(const CPUContextResource& ctx_res) : res_(ctx_res) {
device_ = res_.device;
}
explicit Impl(const Place& place) : place_(place) {}
~CPUImpl() {
if (res_.device == nullptr && device_ != nullptr) {
delete device_;
device_ = nullptr;
~Impl() {
if (owned_) {
delete eigen_device_;
}
}
Eigen::DefaultDevice* GetEigenDevice() const {
PD_CHECK(device_ != nullptr, "the eigen_device is nullptr.");
return device_;
void Init() {
owned_ = true;
eigen_device_ = new Eigen::DefaultDevice();
}
void SetEigenDevice(Eigen::DefaultDevice* device) {
if (device == nullptr) {
return;
}
res_.device = device;
device_ = device;
Eigen::DefaultDevice* GetEigenDevice() const {
PD_CHECK(eigen_device_ != nullptr, "the cpu eigen_device is nullptr.");
return eigen_device_;
}
Place GetPlace() const { return place_; }
Eigen::DefaultDevice* device_{nullptr};
CPUContextResource res_;
CPUPlace place_;
bool owned_{false};
Eigen::DefaultDevice* eigen_device_{nullptr};
Place place_;
};
CPUContext::CPUContext() : DeviceContext() {
cpu_impl_ = std::make_unique<CPUImpl>();
}
CPUContext::CPUContext(const CPUContext& other) : DeviceContext() {
cpu_impl_ = std::make_unique<CPUImpl>();
cpu_impl_->SetEigenDevice(other.eigen_device());
}
CPUContext::CPUContext()
: DeviceContext(), impl_(std::make_unique<CPUContext::Impl>()) {}
CPUContext::CPUContext(CPUContext&& other) : DeviceContext() {
cpu_impl_ = std::move(other.cpu_impl_);
}
CPUContext::CPUContext(const Place& place)
: DeviceContext(), impl_(std::make_unique<CPUContext::Impl>(place)) {}
CPUContext::~CPUContext() = default;
CPUContext::CPUContext(const CPUContextResource& ctx_res) : DeviceContext() {
cpu_impl_ = std::make_unique<CPUImpl>(ctx_res);
}
void CPUContext::Init() { impl_->Init(); }
Eigen::DefaultDevice* CPUContext::eigen_device() const {
return cpu_impl_->GetEigenDevice();
return impl_->GetEigenDevice();
}
const Place& CPUContext::GetPlace() const { return impl_->place_; }
void CPUContext::SetEigenDevice(Eigen::DefaultDevice* device) {
cpu_impl_->SetEigenDevice(device);
impl_->eigen_device_ = device;
}
Place CPUContext::GetPlace() const { return cpu_impl_->GetPlace(); }
} // namespace pten
......@@ -24,37 +24,29 @@ limitations under the License. */
namespace pten {
struct CPUContextResource {
Eigen::DefaultDevice* device{nullptr};
};
class CPUContext : public DeviceContext {
public:
// NOTE: DeviceContext hold resources. Used in training scenarios.
CPUContext();
// NOTE: Share the same underlying resources, please ensure that resources are
// not released.
CPUContext(const CPUContext&);
CPUContext(CPUContext&&);
~CPUContext();
explicit CPUContext(const Place&);
virtual ~CPUContext();
Eigen::DefaultDevice* eigen_device() const;
// TODO(wilber): Whether the interface should be preserved.
Place GetPlace() const override;
const Place& GetPlace() const override;
public:
// NOTE: External users manage resources. Used in inference scenarios.
explicit CPUContext(const CPUContextResource& ctx_res);
// NOTE: DeviceContext hold resources. Used in training scenarios.
// The interface used by the training scene, DeviceContext will initialize
// all resources and delete them when destructing.
void Init();
protected:
// NOTE: External users manage resources. Used in inference scenarios.
// The Set interface is for inference only, DeviceContext will mark the
// resource as external, and will not delete any resource when destructing.
void SetEigenDevice(Eigen::DefaultDevice* device);
private:
struct CPUImpl;
std::unique_ptr<CPUImpl> cpu_impl_;
struct Impl;
std::unique_ptr<Impl> impl_;
};
} // namespace pten
if(WITH_GPU)
add_subdirectory(cuda)
nv_library(pten_gpu_info SRCS gpu_info.cc DEPS pten_cuda_info gflags glog enforce pten_dynload_cuda)
elseif(WITH_ROCM)
add_subdirectory(rocm)
hip_library(pten_gpu_info SRCS gpu_info.cc DEPS pten_rocm_info gflags glog enforce pten_dynload_cuda)
endif()
cc_library(gpu_context SRCS gpu_context.cc DEPS pten_device_context pten_gpu_info eigen3)
nv_library(pten_cuda_info SRCS cuda_info.cc DEPS gflags glog enforce pten_dynload_cuda)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
namespace pten {
namespace backends {
namespace gpu {
/*
* Summary: Grid stride looping macro in CUDA kernel
*
* [ Why need this macro? ]
*
* The original looping in CUDA kernel is:
*
* `for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
* i += blockDim.x * gridDim.x)`
*
* This for condition is risky. The value of `blockIdx.x * blockDim.x`
* may be large, such as over 1GB, the first iteration is no problem here,
* but when `i += blockDim.x * gridDim.x` is executed, the value of i
* will greater than INT_MAX and overflow becomes negative value, at
* this time, the cycle condition `i < (n)` is still satisfied, so it
* will cause illegal access to cuda memory.
*
* Here is a real example in ERINE, it will trigger above error.
* The related data are:
* - blockIdx.x = 2172938
* - blockDim.x = 512
* - blockIdx.x * blockDim.x = 1112543864
* - INT_MAX = 2147483647
*
* So we polish the for condition as follow, the int64_t __index__ will
* prevent overflow in the loop increment.
*
* Parameters:
* - i: loop index
* - num: total element numbers
*
* Examples:
* template <typename T>
* __global__ void Scale(T* logit_grad, const T* loss_grad, const int num,
* const int d, const int remain) {
* CUDA_KERNEL_LOOP(index, num) {
* int idx_n = index / d;
* int idx_remain = index % remain;
* logit_grad[index] *= loss_grad[idx_n * remain + idx_remain];
* }
* }
*
*/
#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \
int64_t __index__ = blockIdx.x * blockDim.x + threadIdx.x; \
for (index_type i = __index__; __index__ < (num); \
__index__ += blockDim.x * gridDim.x, i = __index__)
} // namespace gpu
} // namespace backends
} // namespace pten
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -12,20 +12,19 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/pten/backends/gpu/gpu_info.h"
// TODO(pten): remove fluid headers.
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/lock_guard_ptr.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/monitor.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/split.h"
static std::once_flag g_device_props_size_init_flag;
static std::vector<std::unique_ptr<std::once_flag>> g_device_props_init_flags;
static std::vector<paddle::gpuDeviceProp> g_device_props;
static std::vector<pten::gpuDeviceProp> g_device_props;
namespace pten {
namespace backends {
namespace gpu {
namespace paddle {
namespace platform {
int DnnVersion() {
if (!dynload::HasCUDNN()) return -1;
return dynload::cudnnGetVersion();
......@@ -75,11 +74,13 @@ int GetGPUDeviceCount() {
}
int GetGPUComputeCapability(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
id,
GetGPUDeviceCount()));
int major, minor;
auto major_error_code =
cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, id);
......@@ -92,22 +93,26 @@ int GetGPUComputeCapability(int id) {
}
int GetGPURuntimeVersion(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
id,
GetGPUDeviceCount()));
int runtime_version = 0;
PADDLE_ENFORCE_GPU_SUCCESS(cudaRuntimeGetVersion(&runtime_version));
return runtime_version;
}
int GetGPUDriverVersion(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
id,
GetGPUDeviceCount()));
int driver_version = 0;
PADDLE_ENFORCE_GPU_SUCCESS(cudaDriverGetVersion(&driver_version));
return driver_version;
......@@ -120,11 +125,13 @@ bool TensorCoreAvailable() {
}
int GetGPUMultiProcessors(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
id,
GetGPUDeviceCount()));
int count;
PADDLE_ENFORCE_GPU_SUCCESS(
cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id));
......@@ -132,11 +139,13 @@ int GetGPUMultiProcessors(int id) {
}
int GetGPUMaxThreadsPerMultiProcessor(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
id,
GetGPUDeviceCount()));
int count;
PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceGetAttribute(
&count, cudaDevAttrMaxThreadsPerMultiProcessor, id));
......@@ -145,11 +154,13 @@ int GetGPUMaxThreadsPerMultiProcessor(int id) {
}
int GetGPUMaxThreadsPerBlock(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
id,
GetGPUDeviceCount()));
int count;
PADDLE_ENFORCE_GPU_SUCCESS(
cudaDeviceGetAttribute(&count, cudaDevAttrMaxThreadsPerBlock, id));
......@@ -162,32 +173,34 @@ int GetCurrentDeviceId() {
return device_id;
}
dim3 GetGpuMaxGridDimSize(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
std::array<int, 3> GetGpuMaxGridDimSize(int id) {
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
dim3 ret;
id,
GetGPUDeviceCount()));
std::array<int, 3> ret;
int size;
auto error_code_x = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimX, id);
PADDLE_ENFORCE_GPU_SUCCESS(error_code_x);
ret.x = size;
ret[0] = size;
auto error_code_y = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimY, id);
PADDLE_ENFORCE_GPU_SUCCESS(error_code_y);
ret.y = size;
ret[1] = size;
auto error_code_z = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimZ, id);
PADDLE_ENFORCE_GPU_SUCCESS(error_code_z);
ret.z = size;
ret[2] = size;
return ret;
}
const gpuDeviceProp &GetDeviceProperties(int id) {
std::call_once(g_device_props_size_init_flag, [&] {
int gpu_num = 0;
gpu_num = platform::GetGPUDeviceCount();
gpu_num = GetGPUDeviceCount();
g_device_props_init_flags.resize(gpu_num);
g_device_props.resize(gpu_num);
for (int i = 0; i < gpu_num; ++i) {
......@@ -196,16 +209,17 @@ const gpuDeviceProp &GetDeviceProperties(int id) {
});
if (id == -1) {
id = platform::GetCurrentDeviceId();
id = GetCurrentDeviceId();
}
if (id < 0 || id >= static_cast<int>(g_device_props.size())) {
PADDLE_THROW(platform::errors::OutOfRange(
PADDLE_THROW(paddle::platform::errors::OutOfRange(
"The device id %d is out of range [0, %d), where %d is the number of "
"devices on this machine. Because the device id should be greater than "
"or equal to zero and smaller than the number of gpus. Please input "
"appropriate device again!",
id, static_cast<int>(g_device_props.size()),
id,
static_cast<int>(g_device_props.size()),
static_cast<int>(g_device_props.size())));
}
......@@ -219,32 +233,43 @@ const gpuDeviceProp &GetDeviceProperties(int id) {
void SetDeviceId(int id) {
// TODO(qijun): find a better way to cache the cuda device count
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
id,
GetGPUDeviceCount()));
PADDLE_RETRY_CUDA_SUCCESS(cudaSetDevice(id));
}
void GpuMemcpyAsync(void *dst, const void *src, size_t count,
gpuMemcpyKind kind, gpuStream_t stream) {
void GpuMemcpyAsync(void *dst,
const void *src,
size_t count,
gpuMemcpyKind kind,
gpuStream_t stream) {
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(dst, src, count, kind, stream));
}
void GpuMemcpySync(void *dst, const void *src, size_t count,
void GpuMemcpySync(void *dst,
const void *src,
size_t count,
gpuMemcpyKind kind) {
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(dst, src, count, kind));
}
void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
int src_device, size_t count, gpuStream_t stream) {
void GpuMemcpyPeerAsync(void *dst,
int dst_device,
const void *src,
int src_device,
size_t count,
gpuStream_t stream) {
PADDLE_ENFORCE_GPU_SUCCESS(
cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream));
}
void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
int src_device, size_t count) {
void GpuMemcpyPeerSync(
void *dst, int dst_device, const void *src, int src_device, size_t count) {
PADDLE_ENFORCE_GPU_SUCCESS(
cudaMemcpyPeer(dst, dst_device, src, src_device, count));
}
......@@ -264,5 +289,7 @@ void GpuDestroyStream(gpuStream_t stream) {
void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); }
gpuError_t GpuGetLastError() { return cudaGetLastError(); }
} // namespace platform
} // namespace paddle
} // namespace gpu
} // namespace backends
} // namespace pten
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// Forward-declares CUDA API types used in platform-agnostic wrapper headers.
#pragma once
/// Forward declaration of Eigen types.
namespace Eigen {
struct GpuDevice;
} // namespace Eigen
/// Forward declaration of CUDA types.
// Forward declaration of CUDA runtime types.
using cudaStream_t = struct CUstream_st *;
using cudaEvent_t = struct CUevent_st *;
// Forward declaration of cuDNN types.
using cudnnHandle_t = struct cudnnContext *;
using cudnnTensorDescriptor_t = struct cudnnTensorStruct *;
using cudnnConvolutionDescriptor_t = struct cudnnConvolutionStruct *;
using cudnnPoolingDescriptor_t = struct cudnnPoolingStruct *;
using cudnnFilterDescriptor_t = struct cudnnFilterStruct *;
using cudnnLRNDescriptor_t = struct cudnnLRNStruct *;
using cudnnActivationDescriptor_t = struct cudnnActivationStruct *;
using cudnnSpatialTransformerDescriptor_t =
struct cudnnSpatialTransformerStruct *;
using cudnnOpTensorDescriptor_t = struct cudnnOpTensorStruct *;
using cudnnReduceTensorDescriptor_t = struct cudnnReduceTensorStruct *;
using cudnnCTCLossDescriptor_t = struct cudnnCTCLossStruct *;
using cudnnTensorTransformDescriptor_t = struct cudnnTensorTransformStruct *;
using cudnnDropoutDescriptor_t = struct cudnnDropoutStruct *;
using cudnnRNNDescriptor_t = struct cudnnRNNStruct *;
using cudnnPersistentRNNPlan_t = struct cudnnPersistentRNNPlan *;
using cudnnRNNDataDescriptor_t = struct cudnnRNNDataStruct *;
using cudnnAlgorithmDescriptor_t = struct cudnnAlgorithmStruct *;
using cudnnAlgorithmPerformance_t = struct cudnnAlgorithmPerformanceStruct *;
using cudnnSeqDataDescriptor_t = struct cudnnSeqDataStruct *;
using cudnnAttnDescriptor_t = struct cudnnAttnStruct *;
using cudnnFusedOpsConstParamPack_t = struct cudnnFusedOpsConstParamStruct *;
using cudnnFusedOpsVariantParamPack_t =
struct cudnnFusedOpsVariantParamStruct *;
using cudnnFusedOpsPlan_t = struct cudnnFusedOpsPlanStruct *;
// Forward declaration of cuBLAS types.
using cublasHandle_t = struct cublasContext *;
// Forward declaration of cuSOLVER types.
using cusolverDnHandle_t = struct cusolverDnContext *;
// Forward declaration of cuSparse types.
using cusparseHandle_t = struct cusparseContext *;
// Forward declaration of cuFFT types.
using cufftHandle = int;
// Forward declaration of NCCL types.
using ncclComm_t = struct ncclComm *;
/// Forward declaration of ROCM types.
#include <cstddef>
using hipDevice_t = int;
using hipCtx_t = struct ihipCtx_t *;
using hipModule_t = struct ihipModule_t *;
using hipStream_t = struct ihipStream_t *;
using hipEvent_t = struct ihipEvent_t *;
using hipFunction_t = struct ihipModuleSymbol_t *;
// Forward declaration of MIOpen types.
using miopenHandle_t = struct miopenHandle *;
using miopenAcceleratorQueue_t = hipStream_t;
using miopenFusionOpDescriptor_t = struct miopenFusionOpDescriptor *;
using miopenTensorDescriptor_t = struct miopenTensorDescriptor *;
using miopenConvolutionDescriptor_t = struct miopenConvolutionDescriptor *;
using miopenPoolingDescriptor_t = struct miopenPoolingDescriptor *;
using miopenLRNDescriptor_t = struct miopenLRNDescriptor *;
using miopenActivationDescriptor_t = struct miopenActivationDescriptor *;
using miopenRNNDescriptor_t = struct miopenRNNDescriptor *;
using miopenCTCLossDescriptor_t = struct miopenCTCLossDescriptor *;
using miopenDropoutDescriptor_t = struct miopenDropoutDescriptor *;
using miopenFusionPlanDescriptor_t = struct miopenFusionPlanDescriptor *;
using miopenOperatorDescriptor_t = struct miopenOperatorDescriptor *;
using miopenOperatorArgs_t = struct miopenOperatorArgs *;
using miopenAllocatorFunction = void *(*)(void *context, size_t sizeBytes);
// using miopenDeallocatorFunction = void *(*)(void *context, void *memory);
// struct miopenConvAlgoPerf_t;
// struct miopenConvSolution_t;
// Forward declaration of rocBLAS types.
using rocblas_handle = struct _rocblas_handle *;
// Forward declaration of hipfft types.
using hipfftHandle = struct hipfftHandle_t *;
// Forward declaration of rocSOLVER types.
using rocsolver_handle = rocblas_handle;
// Forward declaration of rocSparse types.
using rocsparse_handle = struct _rocsparse_handle *;
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/gpu/gpu_context.h"
#include <array>
#include <functional>
#include <future>
#include <memory>
#include <mutex>
#include "paddle/pten/api/ext/exception.h"
#include "paddle/pten/backends/gpu/gpu_decls.h"
#include "paddle/pten/backends/gpu/gpu_info.h"
#include "paddle/pten/common/float16.h"
#include "paddle/pten/common/place.h"
#include "paddle/pten/core/allocator.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/pten/backends/dynload/cublas.h"
#include "paddle/pten/backends/dynload/cudnn.h"
#include "paddle/pten/backends/dynload/cusolver.h"
#include "paddle/pten/backends/dynload/cusparse.h"
#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
#include "paddle/pten/backends/dynload/nccl.h"
#endif // !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
#endif // PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_HIP
#include "paddle/pten/backends/dynload/miopen.h"
#include "paddle/pten/backends/dynload/rocblas.h"
#if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL)
#include "paddle/pten/backends/dynload/rccl.h"
#endif // !defined(__APPLE__) && defined(PADDLE_WITH_RCCL)
#endif // PADDLE_WITH_HIP
// NOTE: The paddle framework should add WITH_EIGEN option to support compile
// without eigen.
#include "unsupported/Eigen/CXX11/Tensor"
// TODO(pten): remove fluid header.
#include "paddle/fluid/platform/enforce.h"
namespace pten {
namespace internal {
class EigenGpuStreamDevice : public Eigen::StreamInterface {
public:
EigenGpuStreamDevice() : scratch_(nullptr), semaphore_(nullptr) {
Eigen::initializeDeviceProp();
}
~EigenGpuStreamDevice() override {}
void Reinitialize(gpuStream_t cuda_stream,
Allocator* allocator,
GPUPlace place) {
stream_ = cuda_stream;
place_ = place;
allocator_ = allocator;
device_prop_ = &Eigen::m_deviceProperties[place.device];
}
const gpuStream_t& stream() const override { return stream_; }
const gpuDeviceProp& deviceProperties() const override {
return *device_prop_;
}
void* allocate(size_t num_bytes) const override {
if (UNLIKELY(num_bytes == 0)) {
return nullptr;
}
auto buf = allocator_->Allocate(num_bytes);
VLOG(4) << "Eigen allocated at " << buf->ptr() << " requested "
<< num_bytes;
void* retv = buf->ptr();
{
std::lock_guard<std::mutex> lock(mtx_);
allocations_.emplace(retv, std::move(buf));
}
return retv;
}
void deallocate(void* buffer) const override {
if (LIKELY(buffer)) {
std::lock_guard<std::mutex> lock(mtx_);
allocations_.erase(buffer);
}
}
void* scratchpad() const override {
if (scratch_ == NULL) {
scratch_ = allocate(Eigen::kGpuScratchSize + sizeof(unsigned int));
}
return scratch_;
}
unsigned int* semaphore() const override {
if (semaphore_ == NULL) {
char* scratch = static_cast<char*>(scratchpad()) + Eigen::kGpuScratchSize;
semaphore_ = reinterpret_cast<unsigned int*>(scratch);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream_));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream_));
#endif
}
return semaphore_;
}
private:
GPUPlace place_;
gpuStream_t stream_; // not owned;
Allocator* allocator_; // not owned;
const gpuDeviceProp* device_prop_; // not owned;
mutable void* scratch_;
mutable unsigned int* semaphore_;
mutable std::mutex mtx_; // to protect allocations_
mutable std::unordered_map<void*, Allocator::AllocationPtr> allocations_;
};
#ifdef PADDLE_WITH_HIP
static void StreamCallbackFunc(gpuStream_t stream,
gpuError_t status,
void* user_data)
#endif
#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 10000
static void CUDART_CB StreamCallbackFunc(void* user_data)
#else
static void CUDART_CB
StreamCallbackFunc(cudaStream_t stream, cudaError_t status, void* user_data)
#endif
#endif
{
std::unique_ptr<std::function<void()>> func(
reinterpret_cast<std::function<void()>*>(user_data));
(*func)();
}
} // namespace internal
class DnnWorkspaceHandle {
public:
explicit inline DnnWorkspaceHandle(Allocator* allocator)
: allocator_(allocator) {}
inline void RunFunc(const std::function<void(void*)>& cudnn_func,
size_t required_workspace_bytes) {
if (required_workspace_bytes > WorkspaceSize()) {
ReallocWorkspace(required_workspace_bytes);
}
VLOG(2) << "Cudnn workspace size at RunFunc: "
<< static_cast<double>(WorkspaceSize()) / (1 << 20) << " MB";
{
std::lock_guard<std::mutex> guard(mtx_);
cudnn_func(allocation_ ? allocation_->ptr() : nullptr);
}
}
/*! \brief Thread which call RunFuncSync() would release gpu memory after
* running the function. Currently this function is only used when cudnn
* exhaustive searching and callers have to guarantee that the input function
* is host blocking */
inline void RunFuncSync(const std::function<void(void*)>& cudnn_func,
size_t required_workspace_bytes) {
RunFunc(cudnn_func, required_workspace_bytes);
ResetWorkspace();
}
inline size_t WorkspaceSize() {
if (allocation_ == nullptr) {
return 0;
}
return allocation_->size();
}
void ResetWorkspace() { allocation_ = nullptr; }
void ReallocWorkspace(size_t required_workspace_bytes) {
if (required_workspace_bytes <= WorkspaceSize()) return;
// reset allocation first before re-allocate to save memory
allocation_.reset();
allocation_ = allocator_->Allocate(required_workspace_bytes);
}
private:
Allocator::AllocationPtr allocation_{nullptr};
Allocator* allocator_{nullptr};
std::mutex mtx_;
};
struct GPUContext::Impl {
void Init() {
owned_ = true;
backends::gpu::GPUDeviceGuard guard(place_.device);
InitGpuProperties();
InitStream();
InitEigenDevice();
InitBlasHandle();
InitDNNHandle();
InitSolverHandle();
InitSparseHandle();
InitDnnWorkspace();
}
void PartialInitWithoutAllocator() {
owned_ = true;
backends::gpu::GPUDeviceGuard guard(place_.device);
InitGpuProperties();
InitStream();
InitBlasHandle();
InitDNNHandle();
InitSolverHandle();
InitSparseHandle();
}
void PartialInitWithAllocator() {
owned_ = true;
backends::gpu::GPUDeviceGuard guard(place_.device);
InitEigenDevice();
InitDnnWorkspace();
}
Impl() : place_(GPUPlace()) {}
explicit Impl(const GPUPlace& place) : place_(place) {}
~Impl() {
backends::gpu::GPUDeviceGuard guard(place_.device);
DestoryInternalWorkspace();
DestoryInternalEigenDevice();
DestroyInternalSparseHandle();
DestroyInternalSolverHandle();
DestroyInternalDnnHandle();
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
if (nccl_comm_) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclCommDestroy(nccl_comm_));
}
#endif
DestroyInternalBlasHandle();
DestoryInternalStream();
}
const Place& GetPlace() const { return place_; }
bool IsTensorCoreAvailable() const {
return blas_tensor_core_handle_ != nullptr;
}
void InitGpuProperties() {
backends::gpu::GPUDeviceGuard guard(place_.GetDeviceId());
compute_capability_ =
backends::gpu::GetGPUComputeCapability(place_.GetDeviceId());
multi_process_ = backends::gpu::GetGPUMultiProcessors(place_.GetDeviceId());
max_threads_per_mp_ =
backends::gpu::GetGPUMaxThreadsPerMultiProcessor(place_.GetDeviceId());
max_grid_dim_size_ =
backends::gpu::GetGpuMaxGridDimSize(place_.GetDeviceId());
max_threads_per_block_ =
backends::gpu::GetGPUMaxThreadsPerBlock(place_.GetDeviceId());
driver_version_ = backends::gpu::GetGPUDriverVersion(place_.GetDeviceId());
runtime_version_ =
backends::gpu::GetGPURuntimeVersion(place_.GetDeviceId());
// TODO(wilber): glog may be replaced in the future?
LOG_FIRST_N(WARNING, 1)
<< "Please NOTE: device: " << static_cast<int>(place_.device)
<< ", GPU Compute Capability: " << compute_capability_ / 10 << "."
<< compute_capability_ % 10
<< ", Driver API Version: " << driver_version_ / 1000 << "."
<< (driver_version_ % 100) / 10
<< ", Runtime API Version: " << runtime_version_ / 1000 << "."
<< (runtime_version_ % 100) / 10;
#ifdef PADDLE_WITH_HIP
size_t miopen_major, miopen_minor, miopen_patch;
PADDLE_ENFORCE_GPU_SUCCESS(
dynload::miopenGetVersion(&miopen_major, &miopen_minor, &miopen_patch));
auto cudnn_dso_ver =
(miopen_major * 1000 + miopen_minor * 10 + miopen_patch) / 10;
auto compile_miopen_version = MIOPEN_VERSION / 10;
if (cudnn_dso_ver < static_cast<size_t>(compile_miopen_version)) {
LOG_FIRST_N(WARNING, 1)
<< "WARNING: device: " << static_cast<int>(place_.device)
<< ". The installed Paddle is compiled with MIOPEN "
<< compile_miopen_version / 100 << "." << compile_miopen_version % 100
<< ", but MIOPEN version in your machine is " << cudnn_dso_ver / 100
<< "." << cudnn_dso_ver % 100
<< ", which may cause serious incompatible bug. "
<< "Please recompile or reinstall Paddle with compatible MIOPEN "
"version.";
}
#else
size_t cudnn_dso_ver = dynload::cudnnGetVersion();
LOG_FIRST_N(WARNING, 1) << "device: " << static_cast<int>(place_.device)
<< ", cuDNN Version: " << cudnn_dso_ver / 1000
<< "." << (cudnn_dso_ver % 1000) / 100 << ".";
// Check CUDA/CUDNN version compatiblity
auto local_cuda_version =
(driver_version_ / 1000) * 10 + (driver_version_ % 100) / 10;
auto compile_cuda_version =
(CUDA_VERSION / 1000) * 10 + (CUDA_VERSION % 100) / 10;
if (local_cuda_version < compile_cuda_version) {
LOG_FIRST_N(WARNING, 1)
<< "WARNING: device: " << static_cast<int>(place_.device)
<< ". The installed Paddle is compiled with CUDA "
<< compile_cuda_version / 10 << "." << compile_cuda_version % 10
<< ", but CUDA runtime version in your machine is "
<< local_cuda_version / 10 << "." << local_cuda_version % 10
<< ", which may cause serious incompatible bug. "
<< "Please recompile or reinstall Paddle with compatible CUDA "
"version.";
}
#endif
}
void InitDnnWorkspace() {
PD_CHECK(allocator_ != nullptr,
"the device allocator for gpu context is nullptr.");
workspace_ = new DnnWorkspaceHandle(allocator_);
}
void DestoryInternalWorkspace() {
if (owned_ && workspace_ != nullptr) {
delete workspace_;
stream_ = nullptr;
}
}
DnnWorkspaceHandle* GetDnnWorkspace() {
PD_CHECK(workspace_ != nullptr, "the gpu cudnn workspace is nullptr.");
return workspace_;
}
void InitStream() {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
hipStreamCreateWithPriority(&stream_, hipStreamDefault, 0));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
cudaStreamCreateWithPriority(&stream_, cudaStreamDefault, 0));
#endif
}
void DestoryInternalStream() {
if (owned_ && stream_ != nullptr) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream_));
#else
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream_));
#endif
}
stream_ = nullptr;
}
void SetStream(gpuStream_t stream) { stream_ = stream; }
gpuStream_t GetStream() const {
PD_CHECK(stream_ != nullptr, "the gpu stream is nullptr.");
return stream_;
}
void InitEigenDevice() {
PD_CHECK(allocator_ != nullptr,
"the allocator for eigen device is nullptr.");
eigen_stream_.reset(new internal::EigenGpuStreamDevice());
eigen_stream_->Reinitialize(stream_, allocator_, place_);
eigen_device_ = new Eigen::GpuDevice(eigen_stream_.get());
}
void DestoryInternalEigenDevice() {
if (owned_ && eigen_device_ != nullptr) {
delete eigen_device_;
eigen_device_ = nullptr;
}
}
void SetEigenDevice(Eigen::GpuDevice* device) { eigen_device_ = device; }
Eigen::GpuDevice* eigen_device() const {
PD_CHECK(eigen_device_ != nullptr, "the gpu eigen_device is nullptr.");
return eigen_device_;
}
void InitBlasHandle() {
#ifdef PADDLE_WITH_HIP
pten::dynload::rocblas_create_handle(&blas_handle_);
pten::dynload::rocblas_set_stream(blas_handle_, stream_);
#else // PADDLE_WITH_CUDA
PADDLE_RETRY_CUDA_SUCCESS(pten::dynload::cublasCreate(&blas_handle_));
PADDLE_RETRY_CUDA_SUCCESS(
pten::dynload::cublasSetStream(blas_handle_, stream_));
#if CUDA_VERSION >= 9000
PADDLE_RETRY_CUDA_SUCCESS(
pten::dynload::cublasCreate(&blas_tensor_core_handle_));
PADDLE_RETRY_CUDA_SUCCESS(
pten::dynload::cublasSetStream(blas_tensor_core_handle_, stream_));
PADDLE_RETRY_CUDA_SUCCESS(pten::dynload::cublasSetMathMode(
blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH));
#if CUDA_VERSION >= 11000
PADDLE_RETRY_CUDA_SUCCESS(
pten::dynload::cublasCreate(&blas_tf32_tensor_core_handle_));
PADDLE_RETRY_CUDA_SUCCESS(
pten::dynload::cublasSetStream(blas_tf32_tensor_core_handle_, stream_));
PADDLE_RETRY_CUDA_SUCCESS(pten::dynload::cublasSetMathMode(
blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH));
#endif // CUDA_VERSION >= 11000
#endif // CUDA_VERSION >= 9000
#endif // PADDLE_WITH_HIP
}
void DestroyInternalBlasHandle() {
#ifdef PADDLE_WITH_HIP
if (owned_ && blas_handle_ != nullptr) {
pten::dynload::rocblas_destroy_handle(blas_handle_);
blas_handle_ = nullptr;
}
#else
if (owned_ && blas_handle_ != nullptr) {
pten::dynload::cublasDestroy(blas_handle_);
blas_handle_ = nullptr;
}
if (owned_ && blas_tensor_core_handle_ != nullptr) {
pten::dynload::cublasDestroy(blas_tensor_core_handle_);
blas_tensor_core_handle_ = nullptr;
}
if (owned_ && blas_tf32_tensor_core_handle_ != nullptr) {
pten::dynload::cublasDestroy(blas_tf32_tensor_core_handle_);
blas_tf32_tensor_core_handle_ = nullptr;
}
#endif // PADDLE_WITH_HIP
}
blasHandle_t GetBlasHandle() const {
PD_CHECK(blas_handle_ != nullptr, "the gpu blas handle is nullptr.");
return blas_handle_;
}
void SetBlasHandle(blasHandle_t blas) { blas_handle_ = blas; }
void InitDNNHandle() {
if (pten::dynload::HasCUDNN()) {
#ifdef PADDLE_WITH_HIP
size_t miopen_major, miopen_minor, miopen_patch;
PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenGetVersion(
&miopen_major, &miopen_minor, &miopen_patch));
auto local_miopen_version =
(miopen_major * 1000 + miopen_minor * 10 + miopen_patch) / 10;
auto compile_miopen_version = MIOPEN_VERSION / 10;
if (local_miopen_version < static_cast<size_t>(compile_miopen_version)) {
LOG_FIRST_N(WARNING, 1)
<< "WARNING: device: " << place_.device
<< ". The installed Paddle is compiled with MIOPEN "
<< compile_miopen_version / 100 << "."
<< compile_miopen_version % 100
<< ", but MIOPEN version in your machine is "
<< local_miopen_version / 100 << "." << local_miopen_version % 100
<< ", which may cause serious incompatible bug. "
<< "Please recompile or reinstall Paddle with compatible MIOPEN "
"version.";
}
PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreate(&dnn_handle_));
PADDLE_ENFORCE_GPU_SUCCESS(
dynload::miopenSetStream(dnn_handle_, stream_));
#else
auto local_cudnn_version = pten::dynload::cudnnGetVersion() / 100;
auto compile_cudnn_version = CUDNN_VERSION / 100;
if (local_cudnn_version < static_cast<size_t>(compile_cudnn_version)) {
LOG_FIRST_N(WARNING, 1)
<< "WARNING: device: " << place_.device
<< ". The installed Paddle is compiled with CUDNN "
<< compile_cudnn_version / 10 << "." << compile_cudnn_version % 10
<< ", but CUDNN version in your machine is "
<< local_cudnn_version / 10 << "." << local_cudnn_version % 10
<< ", which may cause serious incompatible bug. "
<< "Please recompile or reinstall Paddle with compatible CUDNN "
"version.";
}
PADDLE_RETRY_CUDA_SUCCESS(pten::dynload::cudnnCreate(&dnn_handle_));
PADDLE_RETRY_CUDA_SUCCESS(
pten::dynload::cudnnSetStream(dnn_handle_, stream_));
#endif
} else {
dnn_handle_ = nullptr;
}
}
dnnHandle_t GetDnnHandle() {
PD_CHECK(dnn_handle_ != nullptr, "the gpu dnn handle is nullptr.");
return dnn_handle_;
}
void DestroyInternalDnnHandle() {
#ifdef PADDLE_WITH_HIP
if (owned_ && dnn_handle_ != nullptr) {
PADDLE_ENFORCE_GPU_SUCCESS(pten::dynload::miopenDestroy(dnn_handle_));
dnn_handle_ = nullptr;
}
#else
if (owned_ && dnn_handle_ != nullptr) {
PADDLE_ENFORCE_GPU_SUCCESS(pten::dynload::cudnnDestroy(dnn_handle_));
dnn_handle_ = nullptr;
}
#endif // PADDLE_WITH_HIP
}
void SetDnnHandle(dnnHandle_t handle) { dnn_handle_ = handle; }
void InitSolverHandle() {
#ifndef PADDLE_WITH_HIP
PADDLE_RETRY_CUDA_SUCCESS(pten::dynload::cusolverDnCreate(&solver_handle_));
PADDLE_RETRY_CUDA_SUCCESS(
pten::dynload::cusolverDnSetStream(solver_handle_, stream_));
#endif
}
void DestroyInternalSolverHandle() {
#ifndef PADDLE_WITH_HIP
if (owned_ && solver_handle_ != nullptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
pten::dynload::cusolverDnDestroy(solver_handle_));
solver_handle_ = nullptr;
}
#endif
}
solverHandle_t GetSolverHandle() const {
PD_CHECK(solver_handle_ != nullptr, "the gpu solver handle is nullptr.");
return solver_handle_;
}
void SetSolverHandle(solverHandle_t handle) { solver_handle_ = handle; }
void InitSparseHandle() {
// ROCM is not yet supported
#if defined(PADDLE_WITH_CUDA)
// The generic APIs is supported from CUDA10.1
#if CUDA_VERSION >= 10010
PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseCreate(&sparse_handle_));
PADDLE_RETRY_CUDA_SUCCESS(
dynload::cusparseSetStream(sparse_handle_, stream_));
#endif
#endif
}
void DestroyInternalSparseHandle() {
#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 10010
if (owned_ && sparse_handle_ != nullptr) {
PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseDestroy(sparse_handle_));
sparse_handle_ = nullptr;
}
#endif
#endif
}
sparseHandle_t GetSparseHandle() const {
PD_CHECK(sparse_handle_ != nullptr, "the gpu sparse handle is nullptr.");
return sparse_handle_;
}
void SetSparseHandle(sparseHandle_t handle) { sparse_handle_ = handle; }
void Wait() const {
#ifdef PADDLE_WITH_HIP
hipError_t e_sync = hipSuccess;
#if !defined(_WIN32)
e_sync = hipStreamSynchronize(stream_);
#else
while (e_sync = hipStreamQuery(stream_)) {
if (e_sync == hipErrorNotReady) continue;
break;
}
#endif // !defined(_WIN32)
#else // PADDLE_WITH_HIP
cudaError_t e_sync = cudaSuccess;
#if !defined(_WIN32)
e_sync = cudaStreamSynchronize(stream_);
#else
while (e_sync = cudaStreamQuery(stream_)) {
if (e_sync == cudaErrorNotReady) continue;
break;
}
#endif // !defined(_WIN32)
#endif // PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(e_sync);
}
void WaitEvent(gpuEvent_t ev) const {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(stream_, ev, 0));
#else
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(stream_, ev, 0));
#endif
}
ncclComm_t GetNcclComm() const {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
// PD_CHECK(nccl_comm_ != nullptr, "the gpu nccl_comm is nullptr.");
return nccl_comm_;
#endif
return nullptr;
}
void SetNcclComm(ncclComm_t comm) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
nccl_comm_ = comm;
#endif
}
inline void CublasCall(
const std::function<void(blasHandle_t)>& callback) const {
if (blas_tf32_tensor_core_handle_ != nullptr) {
std::lock_guard<std::mutex> guard(blas_tf32_mtx_);
callback(blas_tf32_tensor_core_handle_);
} else {
std::lock_guard<std::mutex> guard(blas_mtx_);
callback(blas_handle_);
}
}
inline void TensorCoreCublasCallIfAvailable(
const std::function<void(blasHandle_t)>& callback) const {
if (blas_tensor_core_handle_ != nullptr) {
std::lock_guard<std::mutex> guard(blas_tensor_core_mtx_);
callback(blas_tensor_core_handle_);
} else {
std::lock_guard<std::mutex> guard(blas_mtx_);
callback(blas_handle_);
}
}
inline void CusparseCall(
const std::function<void(sparseHandle_t)>& callback) const {
std::lock_guard<std::mutex> guard(sparse_mtx_);
callback(sparse_handle_);
}
void RecordEvent(gpuEvent_t ev, const std::function<void()>& callback) const {
callback();
RecordEvent(ev);
}
void RecordEvent(gpuEvent_t ev) const {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(ev, stream_));
#else
PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(ev, stream_));
#endif
}
void AddStreamCallback(const std::function<void()>& callback) const {
// TODO(wilber): Do we need ThreadPool?
auto* func = new std::function<void()>([this, callback] {
std::lock_guard<std::mutex> lock(stream_call_back_mtx_);
last_future_ = std::async(std::launch::deferred, [&]() { callback(); });
});
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
hipStreamAddCallback(stream_, internal::StreamCallbackFunc, func, 0));
#endif
#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 10000
PADDLE_ENFORCE_GPU_SUCCESS(
cudaLaunchHostFunc(stream_, internal::StreamCallbackFunc, func));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
cudaStreamAddCallback(stream_, internal::StreamCallbackFunc, func, 0));
#endif
#endif
}
void WaitStreamCallback() const {
#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_CUDA)
pten::backends::gpu::GpuStreamSync(stream_);
#endif
{
std::lock_guard<std::mutex> lock(stream_call_back_mtx_);
if (last_future_.valid()) {
last_future_.wait();
}
}
}
bool owned_{false};
Place place_;
int compute_capability_;
int runtime_version_;
int driver_version_;
int multi_process_;
int max_threads_per_mp_;
int max_threads_per_block_;
std::array<int, 3> max_grid_dim_size_;
gpuStream_t stream_{nullptr};
Eigen::GpuDevice* eigen_device_{nullptr};
blasHandle_t blas_handle_{nullptr};
blasHandle_t blas_tensor_core_handle_{nullptr};
blasHandle_t blas_tf32_tensor_core_handle_{nullptr};
dnnHandle_t dnn_handle_{nullptr};
solverHandle_t solver_handle_{nullptr};
sparseHandle_t sparse_handle_{nullptr};
DnnWorkspaceHandle* workspace_{nullptr};
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
// NCCL communicator (single process version) for NCCL collective operations.
// NCCL collective operations provides fast collectives over multiple GPUs
// both within and across nodes.
// But, this collectives is used for collectives over multiple GPUs within
// nodes.
// NOTE: Distributed communicator, distributed framework manages its
// resources.
ncclComm_t nccl_comm_{nullptr};
#endif
mutable std::mutex blas_mtx_;
mutable std::mutex blas_tensor_core_mtx_;
mutable std::mutex blas_tf32_mtx_;
mutable std::mutex sparse_mtx_;
mutable std::mutex stream_call_back_mtx_;
mutable std::future<void> last_future_;
Allocator* allocator_{nullptr}; // external resource.
// A internal resouce to initinalize eigen_device.
std::unique_ptr<internal::EigenGpuStreamDevice> eigen_stream_{nullptr};
};
GPUContext::GPUContext() : DeviceContext(), impl_(std::make_unique<Impl>()) {}
GPUContext::GPUContext(const GPUPlace& place)
: DeviceContext(), impl_(std::make_unique<Impl>(place)) {}
GPUContext::~GPUContext() = default;
const Place& GPUContext::GetPlace() const { return impl_->GetPlace(); }
gpuStream_t GPUContext::stream() const { return impl_->GetStream(); }
dnnHandle_t GPUContext::cudnn_handle() const { return impl_->GetDnnHandle(); }
blasHandle_t GPUContext::cublas_handle() const {
return impl_->GetBlasHandle();
}
solverHandle_t GPUContext::cusolver_dn_handle() const {
return impl_->GetSolverHandle();
}
sparseHandle_t GPUContext::cusparse_handle() const {
return impl_->GetSparseHandle();
}
void GPUContext::Wait() const { impl_->Wait(); }
void GPUContext::WaitEvent(gpuEvent_t ev) const { impl_->WaitEvent(ev); }
bool GPUContext::tensor_core_available() const {
return impl_->IsTensorCoreAvailable();
}
int GPUContext::GetComputeCapability() const {
return impl_->compute_capability_;
}
int GPUContext::GetMaxPhysicalThreadCount() const {
return impl_->multi_process_ * impl_->max_threads_per_mp_;
}
int GPUContext::GetSMCount() const { return impl_->multi_process_; }
int GPUContext::GetMaxThreadsPerBlock() const {
return impl_->max_threads_per_block_;
}
std::array<int, 3> GPUContext::GetCUDAMaxGridDimSize() const {
return impl_->max_grid_dim_size_;
}
Eigen::GpuDevice* GPUContext::eigen_device() const {
return impl_->eigen_device();
}
DnnWorkspaceHandle* GPUContext::cudnn_workspace_handle() {
return impl_->GetDnnWorkspace();
}
void GPUContext::CublasCall(
const std::function<void(blasHandle_t)>& callback) const {
impl_->CublasCall(callback);
}
void GPUContext::TensorCoreCublasCallIfAvailable(
const std::function<void(blasHandle_t)>& callback) const {
impl_->TensorCoreCublasCallIfAvailable(callback);
}
void GPUContext::CusparseCall(
const std::function<void(sparseHandle_t)>& callback) const {
impl_->CusparseCall(callback);
}
void GPUContext::RecordEvent(gpuEvent_t ev,
const std::function<void()>& callback) const {
impl_->RecordEvent(ev, callback);
}
void GPUContext::RecordEvent(gpuEvent_t ev) const { impl_->RecordEvent(ev); }
void GPUContext::AddStreamCallback(
const std::function<void()>& callback) const {
impl_->AddStreamCallback(callback);
}
void GPUContext::WaitStreamCallback() const { impl_->WaitStreamCallback(); }
ncclComm_t GPUContext::nccl_comm() const { return impl_->GetNcclComm(); }
void GPUContext::set_nccl_comm(ncclComm_t comm) { impl_->SetNcclComm(comm); }
void GPUContext::Init() {
impl_->allocator_ = const_cast<Allocator*>(&this->GetAllocator());
impl_->Init();
}
void GPUContext::SetStream(gpuStream_t stream) { impl_->SetStream(stream); }
void GPUContext::SetEigenDevice(Eigen::GpuDevice* device) {
impl_->SetEigenDevice(device);
}
void GPUContext::SetBlasHandle(blasHandle_t blas) {
impl_->SetBlasHandle(blas);
}
void GPUContext::SetDnnHandle(dnnHandle_t handle) {
impl_->SetDnnHandle(handle);
}
void GPUContext::SetSolverHandle(solverHandle_t handle) {
impl_->SetSolverHandle(handle);
}
void GPUContext::SetSparseHandle(sparseHandle_t handle) {
impl_->SetSparseHandle(handle);
}
void GPUContext::SetDnnWorkspaceHandle(DnnWorkspaceHandle* handle) {
impl_->workspace_ = handle;
}
void GPUContext::PartialInitWithoutAllocator() {
impl_->PartialInitWithoutAllocator();
}
void GPUContext::PartialInitWithAllocator() {
impl_->allocator_ = const_cast<Allocator*>(&this->GetAllocator());
impl_->PartialInitWithAllocator();
}
void GPUContext::SetComputeCapability(int val) {
impl_->compute_capability_ = val;
}
void GPUContext::SetMaxThreadsPerMultiProcessor(int val) {
impl_->max_threads_per_mp_ = val;
}
void GPUContext::SetMultiProcessors(int val) { impl_->multi_process_ = val; }
void GPUContext::SetMaxThreadsPerBlock(int val) {
impl_->max_threads_per_block_ = val;
}
void GPUContext::SetMaxGridDimSize(const std::array<int, 3>& val) {
impl_->max_grid_dim_size_ = val;
}
void GPUContext::SetDriverVersion(int val) { impl_->driver_version_ = val; }
void GPUContext::SetRuntimeVersion(int val) { impl_->runtime_version_ = val; }
} // namespace pten
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -14,13 +14,162 @@ limitations under the License. */
#pragma once
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/platform/device_context.h"
#include <array>
#include <functional>
#include "paddle/pten/backends/gpu/forwards.h"
#include "paddle/pten/backends/gpu/gpu_decls.h"
#include "paddle/pten/backends/gpu/gpu_helper.h"
#include "paddle/pten/common/place.h"
#include "paddle/pten/core/device_context.h"
namespace pten {
using GPUContext = paddle::platform::CUDADeviceContext;
} // namespace pten
#endif
class DnnWorkspaceHandle;
class GPUContext : public DeviceContext {
public:
GPUContext();
explicit GPUContext(const GPUPlace& place);
virtual ~GPUContext();
/*! \brief Return place in the device context. */
const Place& GetPlace() const override;
/*! \brief Return gpu stream in the device context. */
gpuStream_t stream() const;
/*! \brief Return cudnn handle in the device context. */
dnnHandle_t cudnn_handle() const;
/*! \brief Return cublas handle in the device context. */
blasHandle_t cublas_handle() const;
/*! \brief Return cusolver handle in the device context. */
solverHandle_t cusolver_dn_handle() const;
/*! \brief Return cusparse handle in the device context. */
sparseHandle_t cusparse_handle() const;
/*! \brief Wait for all operations completion in the stream. */
void Wait() const override;
/*! \brief Wait for event in the stream. */
void WaitEvent(gpuEvent_t ev) const;
/*! \brief Check whether tensor core is supported */
bool tensor_core_available() const;
/*! \brief Return compute capability in the device context. */
int GetComputeCapability() const;
/*! \brief Return the max physical thread count in the device context */
int GetMaxPhysicalThreadCount() const;
/*! \brief Return the SM count in the device context */
int GetSMCount() const;
/*! \brief Return the Max thread num of block in the device context */
int GetMaxThreadsPerBlock() const;
/*! \brief Return the max grid dim size in the device context */
std::array<int, 3> GetCUDAMaxGridDimSize() const;
/*! \brief Return eigen device in the device context. */
Eigen::GpuDevice* eigen_device() const;
/*! \brief Return a cudnn workspace handle to call multiple cudnn
* functions without interrupting by other threads.
* Once the first cudnn function is called by the handle, a lock
* would be acquired to prevent other threads from accessing the
* workspace. Once the handle is destructed, the lock would be released.
*/
DnnWorkspaceHandle* cudnn_workspace_handle();
public:
/*! \brief Call cublas function safely. */
void CublasCall(const std::function<void(blasHandle_t)>&) const;
/*! \brief Call cublas function with Tensor Core safely. If
Tensor Core is not available, use DEFAULT_MATH instead. */
void TensorCoreCublasCallIfAvailable(
const std::function<void(blasHandle_t)>&) const;
/*! \brief Call cusparse function safely. */
void CusparseCall(const std::function<void(sparseHandle_t)>&) const;
void RecordEvent(gpuEvent_t ev, const std::function<void()>& callback) const;
void RecordEvent(gpuEvent_t ev) const;
void AddStreamCallback(const std::function<void()>& callback) const;
void WaitStreamCallback() const;
public:
/*! \brief Return nccl communicators. */
ncclComm_t nccl_comm() const;
/*! \brief Set nccl communicators. */
void set_nccl_comm(ncclComm_t comm);
public:
// NOTE: DeviceContext hold resources. Used in training scenarios.
// The interface used by the training scene, DeviceContext will initialize
// all resources and delete them when destructing.
// Note that you must set the Allocator before calling Init function.
void Init();
// TODO(wilber): Why does the GetAllocator interface require a stream
// parameter?
// The temporary trick method bypasses this problem, and the following
// interfaces
// need to be deleted later.
// Note that this is a trick implementation, which can be used to partially
// initialize when the SetAllocator interface is not called.
void PartialInitWithoutAllocator();
// Note that this is a trick implementation that can be used to initialize
// resources that require an Allocator when the SetAllocator interface is
// called.
void PartialInitWithAllocator();
protected:
// NOTE: External users manage resources. Used in inference scenarios.
// The Set interface is for inference only, DeviceContext will mark the
// resource as external, and will not delete any resource when destructing.
void SetStream(gpuStream_t);
void SetEigenDevice(Eigen::GpuDevice*);
void SetBlasHandle(blasHandle_t);
void SetDnnHandle(dnnHandle_t);
void SetSolverHandle(solverHandle_t);
void SetSparseHandle(sparseHandle_t);
void SetDnnWorkspaceHandle(DnnWorkspaceHandle*);
void SetComputeCapability(int val);
void SetMaxThreadsPerMultiProcessor(int val);
void SetMultiProcessors(int val);
void SetMaxThreadsPerBlock(int val);
void SetMaxGridDimSize(const std::array<int, 3>& val);
void SetDriverVersion(int val);
void SetRuntimeVersion(int val);
private:
struct Impl;
std::unique_ptr<Impl> impl_;
};
} // namespace pten
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/pten/backends/gpu/forwards.h"
namespace pten {
#ifdef PADDLE_WITH_HIP
#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \
using GPU_TYPE = ROCM_TYPE;
#else // PADDLE_WITH_CDUA
#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \
using GPU_TYPE = CUDA_TYPE;
#endif
DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t);
DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t);
DECLARE_TYPE_FOR_GPU(dnnActivationDescriptor,
cudnnActivationStruct,
miopenActivationDescriptor);
DECLARE_TYPE_FOR_GPU(dnnTensorDescriptor,
cudnnTensorStruct,
miopenTensorDescriptor);
DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor,
cudnnFilterStruct,
miopenTensorDescriptor);
DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor_t,
cudnnFilterDescriptor_t,
miopenTensorDescriptor_t);
DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor,
cudnnConvolutionStruct,
miopenConvolutionDescriptor);
DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor_t,
cudnnConvolutionDescriptor_t,
miopenConvolutionDescriptor_t);
DECLARE_TYPE_FOR_GPU(dnnPoolingDescriptor_t,
cudnnPoolingDescriptor_t,
miopenPoolingDescriptor_t);
DECLARE_TYPE_FOR_GPU(dnnDropoutDescriptor_t,
cudnnDropoutDescriptor_t,
miopenDropoutDescriptor_t);
DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t);
DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle);
DECLARE_TYPE_FOR_GPU(solverHandle_t, cusolverDnHandle_t, rocsolver_handle);
DECLARE_TYPE_FOR_GPU(sparseHandle_t, cusparseHandle_t, rocsparse_handle);
#undef DECLARE_TYPE_FOR_GPU
using CUDAGraphID = unsigned long long; // NOLINT
} // namespace pten
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#ifdef PADDLE_WITH_HIP
#include "paddle/pten/backends/gpu/rocm/rocm_helper.h"
#else
#include "paddle/pten/backends/gpu/cuda/cuda_helper.h"
#endif
#define CUDA_KERNEL_LOOP(i, num) CUDA_KERNEL_LOOP_TYPE(i, num, int)
#endif
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/gpu/gpu_info.h"
#include <vector>
#include "gflags/gflags.h"
DECLARE_string(selected_gpus);
namespace pten {
namespace backends {
namespace gpu {
static inline std::vector<std::string> Split(std::string const& original,
char separator) {
std::vector<std::string> results;
std::string token;
std::istringstream is(original);
while (std::getline(is, token, separator)) {
if (!token.empty()) {
results.push_back(token);
}
}
return results;
}
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetSelectedDevices() {
// use user specified GPUs in single-node multi-process mode.
std::vector<int> devices;
if (!FLAGS_selected_gpus.empty()) {
auto devices_str = Split(FLAGS_selected_gpus, ',');
for (auto id : devices_str) {
devices.push_back(atoi(id.c_str()));
}
} else {
int count = GetGPUDeviceCount();
for (int i = 0; i < count; ++i) {
devices.push_back(i);
}
}
return devices;
}
} // namespace gpu
} // namespace backends
} // namespace pten
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include <stddef.h>
#include <array>
#include <string>
#include <vector>
#include "paddle/pten/backends/gpu/gpu_types.h"
namespace pten {
namespace backends {
namespace gpu {
//! Get the version of dnn
int DnnVersion();
//! Get the total number of GPU devices in system.
int GetGPUDeviceCount();
//! Get the compute capability of the ith GPU (format: major * 10 + minor)
int GetGPUComputeCapability(int id);
//! Get the runtime version of the ith GPU
int GetGPURuntimeVersion(int id);
//! Get the driver version of the ith GPU
int GetGPUDriverVersion(int id);
//! Wheter the current device support TensorCore
bool TensorCoreAvailable();
//! Get the MultiProcessors of the ith GPU.
int GetGPUMultiProcessors(int id);
//! Get the MaxThreads of each MultiProcessor of the ith GPU.
int GetGPUMaxThreadsPerMultiProcessor(int id);
//! Get the MaxThreads of each block of the ith GPU.
int GetGPUMaxThreadsPerBlock(int id);
//! Get the current GPU device id in system.
int GetCurrentDeviceId();
//! Get the maximum GridDim size for GPU buddy allocator.
std::array<int, 3> GetGpuMaxGridDimSize(int);
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetSelectedDevices();
//! Get the properties of the ith GPU device.
const gpuDeviceProp &GetDeviceProperties(int id);
//! Set the GPU device id for next execution.
void SetDeviceId(int device_id);
//! Copy memory from address src to dst asynchronously.
void GpuMemcpyAsync(void *dst,
const void *src,
size_t count,
gpuMemcpyKind kind,
gpuStream_t stream);
//! Copy memory from address src to dst synchronously.
void GpuMemcpySync(void *dst,
const void *src,
size_t count,
gpuMemcpyKind kind);
//! Copy memory from one device to another device asynchronously.
void GpuMemcpyPeerAsync(void *dst,
int dst_device,
const void *src,
int src_device,
size_t count,
gpuStream_t stream);
//! Copy memory from one device to another device synchronously.
void GpuMemcpyPeerSync(
void *dst, int dst_device, const void *src, int src_device, size_t count);
//! Set memory dst with value count size asynchronously
void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream);
//! Blocks until stream has completed all operations.
void GpuStreamSync(gpuStream_t stream);
void GpuDestroyStream(gpuStream_t stream);
// ! Blocks until device has completed all operations.
void GpuDeviceSync();
gpuError_t GpuGetLastError();
class GPUDeviceGuard {
public:
explicit inline GPUDeviceGuard(int dev_id) {
int prev_id = GetCurrentDeviceId();
if (prev_id != dev_id) {
prev_id_ = prev_id;
SetDeviceId(dev_id);
}
}
inline ~GPUDeviceGuard() {
if (prev_id_ != -1) {
SetDeviceId(prev_id_);
}
}
GPUDeviceGuard(const GPUDeviceGuard &o) = delete;
GPUDeviceGuard &operator=(const GPUDeviceGuard &o) = delete;
private:
int prev_id_{-1};
};
} // namespace gpu
} // namespace backends
} // namespace pten
#endif
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Used for compute gpu launch parameter config
#pragma once
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h>
#else
#include <hip/hip_runtime.h>
#endif
#include <stddef.h>
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/pten/backends/gpu/gpu_context.h"
#ifdef __HIPCC__
// HIP results in error or nan if > 256
#define PREDEFINED_BLOCK_SIZE 256
#else
/* CUDA performs better as thread_per_block
num is between [64, 512] */
#define PREDEFINED_BLOCK_SIZE 512
#endif
namespace pten {
namespace backends {
namespace gpu {
inline int DivUp(int a, int b) { return (a + b - 1) / b; }
/* https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
for round integer value into next highest power of 2. */
static inline int RoundToPowerOfTwo(int n) {
n--;
n |= (n >> 1);
n |= (n >> 2);
n |= (n >> 4);
n |= (n >> 8);
n |= (n >> 16);
#ifdef __HIPCC__
return std::min(256, std::max(32, (n + 1)));
#else
return std::min(1024, std::max(32, (n + 1)));
#endif
}
#ifdef WITH_NV_JETSON
// The number of threads cannot be assigned 1024 in some cases when the device
// is nano or tx2 .
inline void ChangeThreadNum(const pten::CUDAContext& context,
int* num_thread,
int alternative_num_thread = 512) {
if (context.GetComputeCapability() == 53 ||
context.GetComputeCapability() == 62) {
*num_thread = alternative_num_thread;
}
}
#endif
struct GpuLaunchConfig {
public:
GpuLaunchConfig() {}
size_t GetThreadNum() const { return GetBlockSize() * GetGridSize(); }
size_t GetGridSize() const {
return block_per_grid.x * block_per_grid.y * block_per_grid.z;
}
size_t GetBlockSize() const {
return thread_per_block.x * thread_per_block.y * thread_per_block.z;
}
int compute_capability = 0;
dim3 thread_per_block = dim3(1, 1, 1);
dim3 block_per_grid = dim3(1, 1, 1);
};
/* According to NVIDIA, if number of threads per block is 64/128/256/512,
* cuda performs better. And number of blocks should be greater (at least
* 2x~4x) than number of SMs. Hence, SM count is took into account within
* this function to determine the right number of threads per block. */
inline GpuLaunchConfig GetGpuLaunchConfig1D(const pten::GPUContext& context,
int64_t numel,
int vec_size = 1) {
PADDLE_ENFORCE_GT(numel,
0,
paddle::platform::errors::InvalidArgument(
"element quantity should be greater than 0,"
" but received value is: %d.",
numel));
// Get compute_capability
const int capability = context.GetComputeCapability();
/* If thread number per block is 64/128/256/512, cuda performs better.*/
int limit_threads =
std::min(PREDEFINED_BLOCK_SIZE, context.GetMaxThreadsPerBlock());
#ifdef WITH_NV_JETSON
if (capability == 53 || capability == 62) {
limit_threads = 512;
}
#endif
int threads = limit_threads;
int sm_count = context.GetSMCount();
int active_threads_num = numel / vec_size;
if (active_threads_num / (sm_count << 1) < limit_threads) {
// Round up threads number into an exponential multiple of 2, while number
// of acitve blocks is about twice of SM, to acquire better performance.
threads = RoundToPowerOfTwo(active_threads_num / (sm_count << 1));
} else if (active_threads_num / (sm_count << 2) < limit_threads) {
// Round up threads number into an exponential multiple of 2, while number
// of acitve blocks is about 4 times of SM, to acquire better performance.
threads = RoundToPowerOfTwo(active_threads_num / (sm_count << 2));
}
// Number of threads per block shall be larger than 64.
threads = std::max(64, threads);
int blocks = DivUp(DivUp(numel, vec_size), threads);
GpuLaunchConfig config;
config.thread_per_block.x = threads;
config.block_per_grid.x = blocks;
config.compute_capability = capability;
return config;
}
inline GpuLaunchConfig GetGpuLaunchConfig2D(const pten::GPUContext& context,
int x_dim,
int y_dim) {
PADDLE_ENFORCE_GT(x_dim,
0,
paddle::platform::errors::InvalidArgument(
"x dim number should greater than 0,"
" but received value is: %d",
x_dim));
PADDLE_ENFORCE_GT(y_dim,
0,
paddle::platform::errors::InvalidArgument(
"y dim number should greater than 0,"
" but received value is: %d",
y_dim));
const int kThreadsPerBlock = 256;
int block_cols = (std::min)(x_dim, kThreadsPerBlock);
int block_rows = (std::max)(kThreadsPerBlock / block_cols, 1);
int max_physical_threads = context.GetMaxPhysicalThreadCount();
const int max_blocks = (std::max)(max_physical_threads / kThreadsPerBlock, 1);
GpuLaunchConfig config;
// Noticed, block size is not align to 32, if needed do it yourself.
config.thread_per_block = dim3(block_cols, block_rows, 1);
int grid_x = (std::min)(DivUp(x_dim, block_cols), max_blocks);
int grid_y =
(std::min)(max_blocks / grid_x, (std::max)(y_dim / block_rows, 1));
config.block_per_grid = dim3(grid_x, grid_y, 1);
return config;
}
} // namespace gpu
} // namespace backends
} // namespace pten
#endif
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/pten/backends/gpu/forwards.h"
#include "paddle/pten/backends/gpu/gpu_decls.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#ifdef PADDLE_WITH_HIP
#include "paddle/pten/backends/dynload/miopen.h"
#include "paddle/pten/backends/dynload/rocblas.h"
#else // PADDLE_WITH_CUDA
#include "paddle/pten/backends/dynload/cublas.h"
#include "paddle/pten/backends/dynload/cudnn.h"
#endif
namespace pten {
#ifdef PADDLE_WITH_HIP
#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \
using GPU_TYPE = ROCM_TYPE;
#else // PADDLE_WITH_CDUA
#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \
using GPU_TYPE = CUDA_TYPE;
#endif
DECLARE_TYPE_FOR_GPU(gpuError_t, cudaError_t, hipError_t);
DECLARE_TYPE_FOR_GPU(gpuMemcpyKind, cudaMemcpyKind, hipMemcpyKind);
DECLARE_TYPE_FOR_GPU(gpuDeviceProp, cudaDeviceProp, hipDeviceProp_t);
DECLARE_TYPE_FOR_GPU(dnnDataType_t, cudnnDataType_t, miopenDataType_t);
DECLARE_TYPE_FOR_GPU(dnnPoolingMode_t, cudnnPoolingMode_t, miopenPoolingMode_t);
DECLARE_TYPE_FOR_GPU(dnnTensorFormat_t,
cudnnTensorFormat_t,
miopenTensorFormat_t);
DECLARE_TYPE_FOR_GPU(dnnActivationMode_t,
cudnnActivationMode_t,
miopenActivationMode_t);
#undef DECLARE_TYPE_FOR_GPU
#ifdef PADDLE_WITH_HIP
#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \
constexpr auto GPU_CV = ROCM_CV;
#else // PADDLE_WITH_CUDA
#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \
constexpr auto GPU_CV = CUDA_CV;
#endif
DECLARE_CONSTANT_FOR_GPU(gpuErrorOutOfMemory,
cudaErrorMemoryAllocation,
hipErrorOutOfMemory);
DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, cudaErrorNotReady, hipErrorNotReady);
DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess);
#undef DECLARE_CONSTANT_FOR_GPU
} // namespace pten
#endif // defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
hip_library(pten_rocm_info SRCS rocm_info.cc DEPS gflags glog enforce pten_dynload_cuda)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
namespace pten {
namespace backends {
namespace gpu {
/*
* Summary: Grid stride looping macro in CUDA kernel
*
* [ Why need this macro? ]
*
* The original looping in CUDA kernel is:
*
* `for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
* i += blockDim.x * gridDim.x)`
*
* This for condition is risky. The value of `blockIdx.x * blockDim.x`
* may be large, such as over 1GB, the first iteration is no problem here,
* but when `i += blockDim.x * gridDim.x` is executed, the value of i
* will greater than INT_MAX and overflow becomes negative value, at
* this time, the cycle condition `i < (n)` is still satisfied, so it
* will cause illegal access to cuda memory.
*
* Here is a real example in ERINE, it will trigger above error.
* The related data are:
* - blockIdx.x = 2172938
* - blockDim.x = 512
* - blockIdx.x * blockDim.x = 1112543864
* - INT_MAX = 2147483647
*
* So we polish the for condition as follow, the int64_t __index__ will
* prevent overflow in the loop increment.
*
* Parameters:
* - i: loop index
* - num: total element numbers
*
* Examples:
* template <typename T>
* __global__ void Scale(T* logit_grad, const T* loss_grad, const int num,
* const int d, const int remain) {
* CUDA_KERNEL_LOOP(index, num) {
* int idx_n = index / d;
* int idx_remain = index % remain;
* logit_grad[index] *= loss_grad[idx_n * remain + idx_remain];
* }
* }
*
*/
#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \
int64_t __index__ = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; \
for (index_type i = __index__; __index__ < (num); \
__index__ += hipBlockDim_x * hipGridDim_x, i = __index__)
} // namespace gpu
} // namespace backends
} // namespace pten
......@@ -12,20 +12,20 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include <array>
#include "paddle/pten/backends/gpu/gpu_info.h"
// TODO(pten): remove fluid headers.
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/lock_guard_ptr.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/monitor.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/split.h"
static std::once_flag g_device_props_size_init_flag;
static std::vector<std::unique_ptr<std::once_flag>> g_device_props_init_flags;
static std::vector<paddle::gpuDeviceProp> g_device_props;
namespace paddle {
namespace platform {
namespace pten {
namespace backends {
namespace gpu {
int DnnVersion() {
if (!dynload::HasCUDNN()) return -1;
size_t version_major, version_minor, version_patch;
......@@ -78,11 +78,13 @@ int GetGPUDeviceCount() {
}
int GetGPUComputeCapability(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
id,
GetGPUDeviceCount()));
int major, minor;
auto major_error_code = hipDeviceGetAttribute(
&major, hipDeviceAttributeComputeCapabilityMajor, id);
......@@ -95,22 +97,26 @@ int GetGPUComputeCapability(int id) {
}
int GetGPURuntimeVersion(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
id,
GetGPUDeviceCount()));
int runtime_version = 0;
PADDLE_ENFORCE_GPU_SUCCESS(hipRuntimeGetVersion(&runtime_version));
return runtime_version;
}
int GetGPUDriverVersion(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
id,
GetGPUDeviceCount()));
int driver_version = 0;
PADDLE_ENFORCE_GPU_SUCCESS(hipDriverGetVersion(&driver_version));
return driver_version;
......@@ -119,11 +125,13 @@ int GetGPUDriverVersion(int id) {
bool TensorCoreAvailable() { return false; }
int GetGPUMultiProcessors(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
id,
GetGPUDeviceCount()));
int count;
PADDLE_ENFORCE_GPU_SUCCESS(
hipDeviceGetAttribute(&count, hipDeviceAttributeMultiprocessorCount, id));
......@@ -131,11 +139,13 @@ int GetGPUMultiProcessors(int id) {
}
int GetGPUMaxThreadsPerMultiProcessor(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
id,
GetGPUDeviceCount()));
int count;
PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceGetAttribute(
&count, hipDeviceAttributeMaxThreadsPerMultiProcessor, id));
......@@ -144,11 +154,13 @@ int GetGPUMaxThreadsPerMultiProcessor(int id) {
}
int GetGPUMaxThreadsPerBlock(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
id,
GetGPUDeviceCount()));
int count;
PADDLE_ENFORCE_GPU_SUCCESS(
hipDeviceGetAttribute(&count, hipDeviceAttributeMaxThreadsPerBlock, id));
......@@ -161,35 +173,37 @@ int GetCurrentDeviceId() {
return device_id;
}
dim3 GetGpuMaxGridDimSize(int id) {
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
std::array<int, 3> GetGpuMaxGridDimSize(int id) {
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
dim3 ret;
id,
GetGPUDeviceCount()));
std::array<int, 3> ret;
int size;
auto error_code_x =
hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimX, id);
PADDLE_ENFORCE_GPU_SUCCESS(error_code_x);
ret.x = size;
ret[0] = size;
auto error_code_y =
hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimY, id);
PADDLE_ENFORCE_GPU_SUCCESS(error_code_y);
ret.y = size;
ret[1] = size;
auto error_code_z =
hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimZ, id);
PADDLE_ENFORCE_GPU_SUCCESS(error_code_z);
ret.z = size;
ret[2] = size;
return ret;
}
const gpuDeviceProp &GetDeviceProperties(int id) {
std::call_once(g_device_props_size_init_flag, [&] {
int gpu_num = 0;
gpu_num = platform::GetGPUDeviceCount();
gpu_num = GetGPUDeviceCount();
g_device_props_init_flags.resize(gpu_num);
g_device_props.resize(gpu_num);
for (int i = 0; i < gpu_num; ++i) {
......@@ -198,16 +212,17 @@ const gpuDeviceProp &GetDeviceProperties(int id) {
});
if (id == -1) {
id = platform::GetCurrentDeviceId();
id = GetCurrentDeviceId();
}
if (id < 0 || id >= static_cast<int>(g_device_props.size())) {
PADDLE_THROW(platform::errors::OutOfRange(
PADDLE_THROW(paddle::platform::errors::OutOfRange(
"The device id %d is out of range [0, %d), where %d is the number of "
"devices on this machine. Because the device id should be greater than "
"or equal to zero and smaller than the number of gpus. Please input "
"appropriate device again!",
id, static_cast<int>(g_device_props.size()),
id,
static_cast<int>(g_device_props.size()),
static_cast<int>(g_device_props.size())));
}
......@@ -220,32 +235,43 @@ const gpuDeviceProp &GetDeviceProperties(int id) {
void SetDeviceId(int id) {
// TODO(qijun): find a better way to cache the cuda device count
PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
platform::errors::InvalidArgument(
PADDLE_ENFORCE_LT(id,
GetGPUDeviceCount(),
paddle::platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetGPUDeviceCount()));
id,
GetGPUDeviceCount()));
PADDLE_RETRY_CUDA_SUCCESS(hipSetDevice(id));
}
void GpuMemcpyAsync(void *dst, const void *src, size_t count,
gpuMemcpyKind kind, gpuStream_t stream) {
void GpuMemcpyAsync(void *dst,
const void *src,
size_t count,
gpuMemcpyKind kind,
gpuStream_t stream) {
PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(dst, src, count, kind, stream));
}
void GpuMemcpySync(void *dst, const void *src, size_t count,
void GpuMemcpySync(void *dst,
const void *src,
size_t count,
gpuMemcpyKind kind) {
PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpy(dst, src, count, kind));
}
void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
int src_device, size_t count, gpuStream_t stream) {
void GpuMemcpyPeerAsync(void *dst,
int dst_device,
const void *src,
int src_device,
size_t count,
gpuStream_t stream) {
PADDLE_ENFORCE_GPU_SUCCESS(
hipMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream));
}
void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
int src_device, size_t count) {
void GpuMemcpyPeerSync(
void *dst, int dst_device, const void *src, int src_device, size_t count) {
PADDLE_ENFORCE_GPU_SUCCESS(
hipMemcpyPeer(dst, dst_device, src, src_device, count));
}
......@@ -265,5 +291,7 @@ void GpuDestroyStream(gpuStream_t stream) {
void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); }
gpuError_t GpuGetLastError() { return hipGetLastError(); }
} // namespace platform
} // namespace paddle
} // namespace gpu
} // namespace backends
} // namespace pten
......@@ -13,8 +13,11 @@
// limitations under the License.
#include "paddle/pten/backends/xpu/xpu_context.h"
#include <memory>
#include "paddle/pten/api/ext/exception.h"
#include "paddle/pten/common/place.h"
#include "xpu/runtime.h"
#include "xpu/runtime_ex.h"
......@@ -24,12 +27,11 @@ namespace xpu = baidu::xpu::api;
namespace pten {
struct XPUContext::XPUImpl {
void SetL3Cache() {
struct XPUContext::Impl {
void SetL3Cache(int l3_size = 14155776) {
const int MAX_XPU_NUM = 16;
static void* l3ptrs[MAX_XPU_NUM] = {nullptr};
int l3_size = 13.5 * 1024 * 1024;
if (std::getenv("XPU_PADDLE_L3_SIZE") != nullptr) {
l3_size = atoi(std::getenv("XPU_PADDLE_L3_SIZE"));
}
......@@ -52,48 +54,28 @@ struct XPUContext::XPUImpl {
}
}
XPUImpl() {
context_ = xpu::create_context();
xpu_version_ = backends::xpu::get_xpu_version(place_.device);
}
explicit XPUImpl(XPUPlace place) : place_(place) {
backends::xpu::XPUDeviceGuard guard(place_.GetDeviceId());
LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: "
<< static_cast<int>(place_.device);
context_ = xpu::create_context();
xpu_version_ = backends::xpu::get_xpu_version(place_.device);
SetL3Cache();
}
Impl() : place_(XPUPlace()) {}
// Users need to manage external resources.
explicit XPUImpl(const XPUContextResource& ctx_res,
const XPUPlace& place = XPUPlace(0))
: res_(ctx_res), place_(place) {
context_ = res_.context;
xpu_version_ = backends::xpu::get_xpu_version(place_.device);
SetL3Cache();
}
explicit Impl(const Place& place) : place_(place) {}
~XPUImpl() {
if (res_.context == nullptr && context_ != nullptr) {
~Impl() {
if (owned_ && context_ != nullptr) {
xpu::destroy_context(context_);
context_ = nullptr;
}
}
Place GetPlace() const { return place_; }
backends::xpu::XPUVersion GetXpuVersion() const { return xpu_version_; }
const Place& GetPlace() const { return place_; }
xpu::Context* GetXContext() const {
PD_CHECK(context_ != nullptr, "the xpu context is nullptr.");
return context_;
}
xpu::BKCLContext_t GetBkclContext() const { return bkcl_context_; }
xpu::BKCLContext_t GetBkclContext() const {
PD_CHECK(bkcl_context_ != nullptr, "the xpu bkcl_context is nullptr.");
return bkcl_context_;
}
void Wait() const {
backends::xpu::SetXPUDeviceId(place_.GetDeviceId());
......@@ -101,53 +83,41 @@ struct XPUContext::XPUImpl {
xpu_wait(context_->xpu_stream);
}
void SetXContext(xpu::Context* context) {
if (context == nullptr) {
return;
}
res_.context = context;
context_ = context;
void Init() {
owned_ = true;
backends::xpu::XPUDeviceGuard guard(place_.GetDeviceId());
LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: "
<< static_cast<int>(place_.device);
context_ = xpu::create_context();
xpu_version_ = backends::xpu::get_xpu_version(place_.device);
SetL3Cache();
}
void SetXContext(xpu::Context* context) { context_ = context; }
void SetBkclContext(xpu::BKCLContext_t context) { bkcl_context_ = context; }
XPUContextResource res_;
XPUPlace place_;
bool owned_{false};
Place place_;
backends::xpu::XPUVersion xpu_version_;
xpu::Context* context_{nullptr};
// NOTE: Distributed communicator, distributed framework manages its
// resources, XPUContext only holds references.
xpu::BKCLContext_t bkcl_context_{nullptr};
};
XPUContext::XPUContext() : DeviceContext() {
impl_ = std::make_unique<XPUImpl>();
}
XPUContext::XPUContext() : DeviceContext(), impl_(std::make_unique<Impl>()) {}
XPUContext::XPUContext(const XPUPlace& place) {
impl_ = std::make_unique<XPUImpl>(place);
}
XPUContext::XPUContext(const XPUContext& other) : DeviceContext() {
impl_ = std::make_unique<XPUImpl>();
impl_->SetXContext(other.x_context());
impl_->SetBkclContext(other.bkcl_context());
}
XPUContext::XPUContext(XPUContext&& other) : DeviceContext() {
impl_ = std::move(other.impl_);
}
XPUContext::XPUContext(const XPUPlace& place)
: DeviceContext(), impl_(std::make_unique<Impl>(place)) {}
XPUContext::~XPUContext() = default;
XPUContext::XPUContext(const XPUContextResource& ctx_res) : DeviceContext() {
impl_ = std::make_unique<XPUImpl>(ctx_res);
}
Place XPUContext::GetPlace() const { return impl_->GetPlace(); }
const Place& XPUContext::GetPlace() const { return impl_->GetPlace(); }
backends::xpu::XPUVersion XPUContext::xpu_version() const {
return impl_->GetXpuVersion();
return impl_->xpu_version_;
}
xpu::Context* XPUContext::x_context() const { return impl_->GetXContext(); }
......@@ -158,12 +128,16 @@ xpu::BKCLContext_t XPUContext::bkcl_context() const {
void XPUContext::Wait() const { impl_->Wait(); }
void XPUContext::set_x_context(xpu::Context* context) {
void XPUContext::SetXContext(xpu::Context* context) {
impl_->SetXContext(context);
}
void XPUContext::set_bkcl_context(xpu::BKCLContext_t context) {
void XPUContext::SetL3Cache(int l3_size) { impl_->SetL3Cache(l3_size); }
void XPUContext::SetBkclContext(xpu::BKCLContext_t context) {
impl_->SetBkclContext(context);
}
void XPUContext::Init() { impl_->Init(); }
} // namespace pten
......@@ -26,26 +26,15 @@ namespace xpu = baidu::xpu::api;
namespace pten {
struct XPUContextResource {
xpu::Context* context{nullptr};
};
class XPUContext : public DeviceContext {
public:
// NOTE: DeviceContext hold resources. Used in training scenarios.
XPUContext();
explicit XPUContext(const XPUPlace&);
// NOTE: Share the same underlying resources, please ensure that resources are
// not released.
XPUContext(const XPUContext&);
XPUContext(XPUContext&&);
virtual ~XPUContext();
Place GetPlace() const override;
const Place& GetPlace() const override;
backends::xpu::XPUVersion xpu_version() const;
......@@ -53,21 +42,28 @@ class XPUContext : public DeviceContext {
// Return bkcl context.
xpu::BKCLContext_t bkcl_context() const;
void SetBkclContext(xpu::BKCLContext_t context);
// Wait for all operations completion in the stream.
void Wait() const override;
public:
// NOTE: External users manage resources. Used in inference scenarios.
explicit XPUContext(const XPUContextResource&);
// NOTE: DeviceContext hold resources. Used in training scenarios.
// The interface used by the training scene, DeviceContext will initialize
// all resources and delete them when destructing.
void Init();
void set_x_context(xpu::Context*);
public:
// NOTE: External users manage resources. Used in inference scenarios.
// The Set interface is for inference only, DeviceContext will mark the
// resource as external, and will not delete any resource when destructing.
void SetXContext(xpu::Context*);
void set_bkcl_context(xpu::BKCLContext_t context);
void SetL3Cache(int l3_size = 14155776);
private:
struct XPUImpl;
std::unique_ptr<XPUImpl> impl_;
struct Impl;
std::unique_ptr<Impl> impl_;
};
} // namespace pten
......@@ -23,7 +23,7 @@ struct DeviceContext::Impl {
Impl() = default;
~Impl() = default;
void SetDeviceAllocator(const Allocator* allocator) {
void SetAllocator(const Allocator* allocator) {
PADDLE_ENFORCE_NOT_NULL(
allocator,
pten::errors::InvalidArgument(
......@@ -47,7 +47,7 @@ struct DeviceContext::Impl {
zero_allocator_ = allocator;
}
const Allocator& GetDeviceAllocator() const {
const Allocator& GetAllocator() const {
PADDLE_ENFORCE_NOT_NULL(
device_allocator_,
pten::errors::InvalidArgument("Required device_allocator_ shall not be "
......@@ -124,7 +124,7 @@ DeviceContext::DeviceContext() { impl_ = std::make_unique<Impl>(); }
DeviceContext::DeviceContext(const DeviceContext& other) {
impl_->SetHostAllocator(&other.GetHostAllocator());
impl_->SetDeviceAllocator(&other.GetDeviceAllocator());
impl_->SetAllocator(&other.GetAllocator());
impl_->SetZeroAllocator(&other.GetZeroAllocator());
}
......@@ -134,12 +134,12 @@ DeviceContext::DeviceContext(DeviceContext&& other) {
DeviceContext::~DeviceContext() = default;
void DeviceContext::SetDeviceAllocator(const Allocator* allocator) {
impl_->SetDeviceAllocator(allocator);
void DeviceContext::SetAllocator(const Allocator* allocator) {
impl_->SetAllocator(allocator);
}
const Allocator& DeviceContext::GetDeviceAllocator() const {
return impl_->GetDeviceAllocator();
const Allocator& DeviceContext::GetAllocator() const {
return impl_->GetAllocator();
}
void DeviceContext::SetHostAllocator(const Allocator* allocator) {
......
......@@ -60,7 +60,7 @@ class DeviceContext {
*
* @param allocator
*/
void SetDeviceAllocator(const Allocator*);
void SetAllocator(const Allocator*);
/**
* @brief Set the host Allocator object.
......@@ -81,7 +81,7 @@ class DeviceContext {
*
* @return Allocator
*/
const Allocator& GetDeviceAllocator() const;
const Allocator& GetAllocator() const;
/**
* @brief Get the const device-related Allocator object.
......@@ -114,7 +114,7 @@ class DeviceContext {
// TODO(wilber): Just for the convenience of migrating the code, it will be
// modified or removed later.
virtual Place GetPlace() const = 0;
virtual const Place& GetPlace() const = 0;
// TODO(wilber): The fluid framework uses wait() in many places, how to delete
// this API interface.
virtual void Wait() const {}
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#ifdef __HIPCC__
#define ELEMENTWISE_BLOCK_SIZE 256
......@@ -31,7 +32,7 @@ namespace funcs {
* 2x~4x) than number of SMs. Hence, SM count is took into account within
* this function to determine the right number of threads per block.
*/
inline int GetThreadsConfig(const paddle::platform::CUDADeviceContext &ctx,
inline int GetThreadsConfig(const pten::GPUContext &ctx,
int64_t numel,
int vec_size) {
int threads = ELEMENTWISE_BLOCK_SIZE;
......
......@@ -23,8 +23,8 @@ limitations under the License. */
#if defined(__NVCC__) || defined(__HIPCC__)
#include "paddle/fluid/platform/aligned_vector.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/function_traits.h"
#include "paddle/pten/backends/gpu/gpu_launch_config.h"
#include "paddle/pten/kernels/primitive/kernel_primitives.h"
namespace kps = pten::kps;
......@@ -646,7 +646,8 @@ void ElementwiseCudaKernel(const KPDevice &ctx,
VecSize><<<grid_size, block_size, 0, stream>>>(
ins_data, outs_data, numel, main_offset, func);
#else
auto gpu_config = GetGpuLaunchConfig1D(ctx, numel, VecSize);
auto gpu_config =
pten::backends::gpu::GetGpuLaunchConfig1D(ctx, numel, VecSize);
int main_offset = (numel / (VecSize * gpu_config.GetBlockSize())) * VecSize *
gpu_config.GetBlockSize();
auto stream = ctx.stream();
......
......@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#include "paddle/pten/core/ddim.h"
......
......@@ -237,12 +237,11 @@ __global__ void SplitKernel(const T* input_data,
SplitKernelDetail<T>(input_data, in_row, in_col, fixed_out_col, outputs_data);
}
static inline void GetBlockDims(
const paddle::platform::CUDADeviceContext& context,
int64_t num_rows,
int64_t num_cols,
dim3* block_dims,
dim3* grid_dims) {
static inline void GetBlockDims(const pten::GPUContext& context,
int64_t num_rows,
int64_t num_cols,
dim3* block_dims,
dim3* grid_dims) {
// Set the thread block and grid according to CurrentDeviceId
const int kThreadsPerBlock = 1024;
int block_cols = kThreadsPerBlock;
......
......@@ -87,9 +87,7 @@ void Copy(const Context& dev_ctx,
ctx_gpu_place));
auto stream =
blocking ? nullptr
: reinterpret_cast<const paddle::platform::CUDADeviceContext&>(
dev_ctx)
.stream();
: reinterpret_cast<const pten::GPUContext&>(dev_ctx).stream();
paddle::memory::Copy(
dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
} else if (paddle::platform::is_cpu_place(src_place) && // NOLINT
......@@ -113,9 +111,7 @@ void Copy(const Context& dev_ctx,
ctx_gpu_place));
auto stream =
blocking ? nullptr
: reinterpret_cast<const paddle::platform::CUDADeviceContext&>(
dev_ctx)
.stream();
: reinterpret_cast<const pten::GPUContext&>(dev_ctx).stream();
paddle::memory::Copy(
dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
} else if (paddle::platform::is_gpu_place(src_place) && // NOLINT
......@@ -140,9 +136,7 @@ void Copy(const Context& dev_ctx,
ctx_gpu_place.device));
auto stream =
blocking ? nullptr
: reinterpret_cast<const paddle::platform::CUDADeviceContext&>(
dev_ctx)
.stream();
: reinterpret_cast<const pten::GPUContext&>(dev_ctx).stream();
paddle::memory::Copy(
dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
} else if (paddle::platform::is_cuda_pinned_place(src_place) && // NOLINT
......@@ -167,9 +161,7 @@ void Copy(const Context& dev_ctx,
ctx_gpu_place.device));
auto stream =
blocking ? nullptr
: reinterpret_cast<const paddle::platform::CUDADeviceContext&>(
dev_ctx)
.stream();
: reinterpret_cast<const pten::GPUContext&>(dev_ctx).stream();
paddle::memory::Copy(
dst_gpu_place, dst_ptr, src_cuda_pinned_place, src_ptr, size, stream);
} else if (paddle::platform::is_gpu_place(src_place) && // NOLINT
......@@ -185,9 +177,7 @@ void Copy(const Context& dev_ctx,
ctx_place));
auto stream =
blocking ? nullptr
: reinterpret_cast<const paddle::platform::CUDADeviceContext&>(
dev_ctx)
.stream();
: reinterpret_cast<const pten::GPUContext&>(dev_ctx).stream();
if (paddle::platform::is_same_place(src_place, dst_place)) {
paddle::memory::Copy(
dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
......
......@@ -2020,7 +2020,7 @@ void default_elementwise_add_grad(const GPUContext &ctx,
T,
kps::AddFunctor,
kps::IdentityFunctor<T>>(
dout, dx, kps::IdentityFunctor<T>(), reduce_dims, stream);
ctx, dout, dx, kps::IdentityFunctor<T>(), reduce_dims, stream);
}
}
// dy
......@@ -2038,7 +2038,7 @@ void default_elementwise_add_grad(const GPUContext &ctx,
T,
kps::AddFunctor,
kps::IdentityFunctor<T>>(
dout, dy, kps::IdentityFunctor<T>(), reduce_dims, stream);
ctx, dout, dy, kps::IdentityFunctor<T>(), reduce_dims, stream);
}
}
}
......@@ -2137,7 +2137,7 @@ void default_elementwise_sub_grad(const GPUContext &ctx,
T,
kps::AddFunctor,
kps::IdentityFunctor<T>>(
dout, dx, kps::IdentityFunctor<T>(), reduce_dims, stream);
ctx, dout, dx, kps::IdentityFunctor<T>(), reduce_dims, stream);
}
}
// dy
......@@ -2161,7 +2161,7 @@ void default_elementwise_sub_grad(const GPUContext &ctx,
T,
kps::AddFunctor,
kps::InverseFunctor<T>>(
dout, dy, kps::InverseFunctor<T>(), reduce_dims, stream);
ctx, dout, dy, kps::InverseFunctor<T>(), reduce_dims, stream);
}
}
}
......
......@@ -1064,7 +1064,8 @@ template <typename Tx,
typename Ty,
template <typename> class ReduceOp,
typename TransformOp>
void TensorReduceFunctorImpl(const pten::DenseTensor& x,
void TensorReduceFunctorImpl(const pten::GPUContext& dev_ctx,
const pten::DenseTensor& x,
pten::DenseTensor* y,
const TransformOp& transform,
const std::vector<int>& origin_reduce_dims,
......@@ -1088,13 +1089,11 @@ void TensorReduceFunctorImpl(const pten::DenseTensor& x,
auto x_data = x.data<Tx>();
auto y_data = y->data<Ty>();
auto* dev_ctx = static_cast<paddle::platform::CUDADeviceContext*>(
paddle::platform::DeviceContextPool::Instance().Get(x.place()));
if (config.reduce_num == 1) {
std::vector<const DenseTensor*> inputs = {&x};
std::vector<DenseTensor*> outputs = {y};
funcs::LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary, Tx, Ty>(
*dev_ctx, inputs, &outputs, transform);
dev_ctx, inputs, &outputs, transform);
return;
}
......@@ -1244,13 +1243,23 @@ void Reduce(const GPUContext& dev_ctx,
data_t,
ReduceOp,
TransformOp<T, MPType>>(
x, out, TransformOp<T, MPType>(reduce_num), reduce_dims, stream);
dev_ctx,
x,
out,
TransformOp<T, MPType>(reduce_num),
reduce_dims,
stream);
}));
} else {
using MPType = typename kps::details::MPTypeTrait<T>::Type;
pten::kernels::
TensorReduceFunctorImpl<T, T, ReduceOp, TransformOp<T, MPType>>(
x, out, TransformOp<T, MPType>(reduce_num), reduce_dims, stream);
dev_ctx,
x,
out,
TransformOp<T, MPType>(reduce_num),
reduce_dims,
stream);
}
}
} // namespace pten
......
......@@ -60,9 +60,11 @@ struct ReduceSumForMatmulGrad<GPUContext, T> {
DenseTensor* output,
const std::vector<int>& reduce_dims) {
auto stream = dev_ctx.stream();
kernels::
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
input, output, kps::IdentityFunctor<T>(), reduce_dims, stream);
kernels::TensorReduceFunctorImpl<T,
T,
kps::AddFunctor,
kps::IdentityFunctor<T>>(
dev_ctx, input, output, kps::IdentityFunctor<T>(), reduce_dims, stream);
}
};
#endif
......
......@@ -18,6 +18,7 @@ limitations under the License. */
#include "paddle/pten/api/include/api.h"
#include "paddle/pten/api/lib/utils/allocator.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/core/kernel_registry.h"
#include "paddle/pten/kernels/copy_kernel.h"
......@@ -122,7 +123,7 @@ TEST(API, matmul_cuda) {
auto& pool = paddle::platform::DeviceContextPool::Instance();
auto place = paddle::platform::CUDAPlace();
auto* dev_ctx = pool.GetByPlace(place);
auto* dev_ctx = static_cast<const pten::GPUContext*>(pool.GetByPlace(place));
pten::Copy(*dev_ctx, *ref_x.get(), false, dense_x.get());
pten::Copy(*dev_ctx, *ref_y.get(), false, dense_y.get());
......
......@@ -47,6 +47,7 @@ TEST(API, to_sparse_coo) {
std::copy(&dense_data[0][0], &dense_data[0][0] + 9, dense_x_data);
pten::CPUContext dev_ctx_cpu;
dev_ctx_cpu.Init();
// 1. test dense_to_sparse_coo
paddle::experimental::Tensor x(dense_x);
......
......@@ -25,43 +25,29 @@ limitations under the License. */
namespace pten {
namespace tests {
class InferenceCPUContext : public CPUContext {
public:
void SetEigenDevice(Eigen::DefaultDevice* eigen_device) {
CPUContext::SetEigenDevice(eigen_device);
}
};
TEST(DeviceContext, cpu_context) {
std::cout << "test training scenarios" << std::endl;
{
pten::CPUContext ctx;
ctx.Init();
EXPECT_TRUE(ctx.eigen_device() != nullptr);
}
std::cout << "test inference scenarios" << std::endl;
Eigen::DefaultDevice* device = new Eigen::DefaultDevice();
{
pten::CPUContextResource ctx_res{device};
pten::CPUContext ctx(ctx_res);
EXPECT_TRUE(ctx.eigen_device() != nullptr);
}
{
pten::CPUContextResource ctx_res{nullptr};
pten::CPUContext ctx(ctx_res);
InferenceCPUContext ctx;
ctx.SetEigenDevice(device);
EXPECT_TRUE(ctx.eigen_device() != nullptr);
}
delete device;
std::cout << "test copy constructor" << std::endl;
{
pten::CPUContext ctx1;
pten::CPUContext ctx2(ctx1);
EXPECT_EQ(ctx1.eigen_device(), ctx2.eigen_device());
}
std::cout << "test move constructor" << std::endl;
{
pten::CPUContext ctx1 = pten::CPUContext();
auto* eigen_device1 = ctx1.eigen_device();
pten::CPUContext ctx2(std::move(ctx1));
auto* eigen_device2 = ctx2.eigen_device();
EXPECT_EQ(eigen_device1, eigen_device2);
}
}
} // namespace tests
......
......@@ -50,10 +50,10 @@ TEST(DEV_API, cast) {
}
pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
pten::DataType out_dtype = pten::DataType::FLOAT64;
// 2. test API
......
......@@ -59,10 +59,10 @@ TEST(DEV_API, concat) {
// 2. test API
pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto out = pten::Concat<float>(dev_ctx, inputs, 0);
// 3. check result
......
......@@ -46,10 +46,10 @@ TEST(DEV_API, conj) {
}
pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
// 2. test API
auto out = pten::Conj<paddle::complex64>(dev_ctx, dense_x);
......
......@@ -58,10 +58,10 @@ TEST(DEV_API, copy) {
std::cout << typeid(a).name() << std::endl;
// 2. test API
pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
pten::Copy(dev_ctx, *(dense_src.get()), false, dense_dst.get());
// 3. check result
......
......@@ -33,10 +33,10 @@ using DDim = pten::framework::DDim;
TEST(DEV_API, empty) {
// 1. create input
pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
// 2. test API
auto out = pten::Empty<float>(dev_ctx, {3, 2}, pten::DataType::INT32);
......@@ -64,10 +64,10 @@ TEST(DEV_API, empty_like) {
// 2. test API
pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto out = pten::EmptyLike<float>(dev_ctx, dense_x);
// 3. check result
......@@ -84,10 +84,10 @@ TEST(DEV_API, full) {
// 2. test API
pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto out = pten::Full<float>(dev_ctx, {3, 2}, val, pten::DataType::FLOAT32);
// 3. check result
......@@ -118,10 +118,10 @@ TEST(DEV_API, full_like) {
float val = 1.0;
pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
// 2. test API
auto out = pten::FullLike<float>(dev_ctx, dense_x, val);
......
......@@ -60,10 +60,10 @@ TEST(DEV_API, dot) {
// 2. test API
pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto out = pten::Dot<float>(dev_ctx, dense_x, dense_y);
// 3. check result
......
......@@ -62,10 +62,10 @@ TEST(DEV_API, add) {
// 2. test API
pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto dense_out = pten::Add<float>(dev_ctx, dense_x, dense_y);
// 3. check result
......@@ -116,10 +116,10 @@ TEST(DEV_API, subtract) {
// 2. test API
pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto dense_out = pten::Subtract<float>(dev_ctx, dense_x, dense_y);
// 3. check result
......@@ -170,10 +170,10 @@ TEST(DEV_API, divide) {
// 2. test API
pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto dense_out = pten::Divide<float>(dev_ctx, dense_x, dense_y);
// 3. check result
......@@ -224,10 +224,10 @@ TEST(DEV_API, multiply) {
// 2. test API
pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto dense_out = pten::Multiply<float>(dev_ctx, dense_x, dense_y);
// 3. check result
......
......@@ -56,10 +56,10 @@ TEST(DEV_API, flatten) {
}
int start_axis = 1, stop_axis = 2;
pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
// 2. test API
auto out = pten::Flatten<float>(dev_ctx, dense_x, start_axis, stop_axis);
......
......@@ -55,10 +55,10 @@ TEST(DEV_API, dot) {
// 2. test API
pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto out = Matmul<float, CPUContext>(dev_ctx, dense_x, dense_y, false, false);
// 3. check result
......
......@@ -49,10 +49,10 @@ TEST(DEV_API, mean) {
// 2. test API
pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto out = pten::Mean<float>(dev_ctx, dense_x, dims, false);
// 3. check result
......
......@@ -48,10 +48,10 @@ TEST(DEV_API, reshape) {
// 2. test API
pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto out = pten::Reshape<float>(dev_ctx, dense_x, shape);
// 3. check result
std::vector<int64_t> expect_shape = {12, 3};
......
......@@ -49,10 +49,11 @@ TEST(DEV_API, scale) {
// 2. test API
pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto out =
pten::Scale<float>(dev_ctx, dense_x, scale, bias, bias_after_scale);
......@@ -92,10 +93,11 @@ TEST(DEV_API, scale_host) {
// 2. test API
pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto out =
pten::Scale<float>(dev_ctx, dense_x, scale, bias, bias_after_scale);
......
......@@ -15,6 +15,8 @@ limitations under the License. */
#include <gtest/gtest.h>
#include <memory>
#include "paddle/pten/backends/gpu/gpu_context.h"
#include "paddle/pten/common/place.h"
#include "paddle/pten/kernels/copy_kernel.h"
#include "paddle/pten/kernels/sparse/sparse_utils_kernel.h"
......@@ -22,6 +24,8 @@ limitations under the License. */
#include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/core/kernel_registry.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
namespace pten {
namespace tests {
......@@ -38,9 +42,8 @@ inline void CheckResult(
ASSERT_EQ(coo.nnz(), non_zero_num);
#if defined(PADDLE_WITH_CUDA)
if (coo.place() == paddle::platform::CUDAPlace()) {
const auto* dev_ctx_cuda =
static_cast<const paddle::platform::CUDADeviceContext*>(dev_ctx);
if (coo.place() == pten::GPUPlace()) {
const auto* dev_ctx_cuda = static_cast<const pten::GPUContext*>(dev_ctx);
DenseTensor indices(
alloc.get(),
DenseTensorMeta(
......@@ -86,6 +89,8 @@ void TestDenseToSparseCoo(const DenseTensor& dense_x,
paddle::platform::CPUPlace());
pten::CPUContext dev_ctx_cpu;
dev_ctx_cpu.Init();
// 1. test cpu
auto cpu_sparse_out =
sparse::DenseToSparseCoo<T>(dev_ctx_cpu, dense_x, sparse_dim);
......@@ -98,9 +103,21 @@ void TestDenseToSparseCoo(const DenseTensor& dense_x,
// 2. test cuda
#if defined(PADDLE_WITH_CUDA)
paddle::platform::DeviceContextPool& pool =
paddle::platform::DeviceContextPool::Instance();
auto* dev_ctx_cuda = pool.GetByPlace(paddle::platform::CUDAPlace());
// paddle::platform::DeviceContextPool& pool =
// paddle::platform::DeviceContextPool::Instance();
// auto* dev_ctx_cuda = pool.GetByPlace(paddle::platform::CUDAPlace());
pten::GPUContext dev_ctx_gpu;
dev_ctx_gpu.PartialInitWithoutAllocator();
dev_ctx_gpu.SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(dev_ctx_gpu.GetPlace(), dev_ctx_gpu.stream())
.get());
dev_ctx_gpu.SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(pten::CPUPlace())
.get());
dev_ctx_gpu.PartialInitWithAllocator();
const auto cuda_alloc =
std::make_shared<paddle::experimental::DefaultAllocator>(
paddle::platform::CUDAPlace());
......@@ -108,10 +125,10 @@ void TestDenseToSparseCoo(const DenseTensor& dense_x,
cuda_alloc.get(),
DenseTensorMeta(dense_x.dtype(), dense_x.dims(), dense_x.layout()));
pten::Copy(*dev_ctx_cuda, dense_x, true, &d_dense_x);
pten::Copy(dev_ctx_gpu, dense_x, true, &d_dense_x);
auto sparse_out =
sparse::DenseToSparseCoo<T>(*dev_ctx_cuda, d_dense_x, sparse_dim);
CheckResult<T, int64_t>(dev_ctx_cuda,
sparse::DenseToSparseCoo<T>(dev_ctx_gpu, d_dense_x, sparse_dim);
CheckResult<T, int64_t>(&dev_ctx_gpu,
sparse_out,
non_zero_data,
indices_data,
......@@ -295,20 +312,32 @@ void TestSparseCsrToCoo(const DDim& dense_dims,
alloc);
// 2. test cuda
#if defined(PADDLE_WITH_CUDA)
pten::GPUContext dev_ctx_gpu;
dev_ctx_gpu.PartialInitWithoutAllocator();
dev_ctx_gpu.SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(dev_ctx_gpu.GetPlace(), dev_ctx_gpu.stream())
.get());
dev_ctx_gpu.SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(pten::CPUPlace())
.get());
dev_ctx_gpu.PartialInitWithAllocator();
const auto cuda_alloc =
std::make_shared<paddle::experimental::DefaultAllocator>(
paddle::platform::CUDAPlace());
auto& pool = paddle::platform::DeviceContextPool::Instance();
auto* dev_ctx_cuda = pool.GetByPlace(paddle::platform::CUDAPlace());
// auto& pool = paddle::platform::DeviceContextPool::Instance();
// auto* dev_ctx_cuda = pool.GetByPlace(paddle::platform::CUDAPlace());
pten::DenseTensor d_crows(cuda_alloc.get(), crows_meta);
pten::DenseTensor d_cols(cuda_alloc.get(), cols_meta);
pten::DenseTensor d_values(cuda_alloc.get(), values_meta);
pten::Copy(*dev_ctx_cuda, crows, true, &d_crows);
pten::Copy(*dev_ctx_cuda, cols, true, &d_cols);
pten::Copy(*dev_ctx_cuda, values, true, &d_values);
pten::Copy(dev_ctx_gpu, crows, true, &d_crows);
pten::Copy(dev_ctx_gpu, cols, true, &d_cols);
pten::Copy(dev_ctx_gpu, values, true, &d_values);
pten::SparseCsrTensor d_csr(d_crows, d_cols, d_values, dense_dims);
auto cuda_sparse_out = sparse::SparseCsrToCoo<T>(*dev_ctx_cuda, d_csr);
CheckResult<T, int64_t>(dev_ctx_cuda,
auto cuda_sparse_out = sparse::SparseCsrToCoo<T>(dev_ctx_gpu, d_csr);
CheckResult<T, int64_t>(&dev_ctx_gpu,
cuda_sparse_out,
non_zero_data,
indices_data,
......
......@@ -47,10 +47,11 @@ TEST(DEV_API, sum) {
std::vector<int64_t> axis = {0, 1};
pten::CPUContext dev_ctx;
dev_ctx.SetDeviceAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
// 2. test API
auto out =
pten::Sum<float>(dev_ctx, dense_x, axis, pten::DataType::FLOAT32, false);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册