Unverified commit bdae5481, authored by limingshu, committed by GitHub

Combination of multiple paddle::memory::allocate operations into one for ops (#49126)

* A first attempt at cudaLaunchCooperativeKernel

* fix bugs

* Totally replace the lar cuda kernel

* Fix bugs

* fix code according to comments

* fix code according to review comments

* adding some function overload

* relocate the power operation.

* add bf16 support for index select relevant ops

* revert bf16 type change.

* add changes for more op

* fix code writing bugs
Parent af673090
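The change follows one pattern throughout: instead of issuing a separate paddle::memory::Alloc for each temporary array, it makes a single allocation sized for all of them and carves out sub-buffers with pointer offsets. Below is a minimal standalone sketch of that pattern, written with plain CUDA runtime calls instead of Paddle's allocator so it compiles on its own; the buffer names and sizes are illustrative only and are not part of the commit.

#include <cuda_runtime.h>
#include <vector>

// Fuse three equally sized device buffers into one allocation and
// address them via offsets, mirroring the bytes * 3 allocation in the
// first hunk below.
int main() {
  const int max_dim = 8;                       // illustrative size
  const size_t bytes = max_dim * sizeof(int);  // per-array size
  const size_t tmp_total_bytes = bytes * 3;    // one allocation for all three

  int *tmp_buffer = nullptr;
  cudaMalloc(&tmp_buffer, tmp_total_bytes);

  // Carve the single allocation into three sub-buffers.
  int *x_strides_array_gpu = tmp_buffer;
  int *y_strides_array_gpu = x_strides_array_gpu + max_dim;
  int *out_dims_array_gpu = y_strides_array_gpu + max_dim;

  std::vector<int> x_strides(max_dim, 1), y_strides(max_dim, 1), out_dims(max_dim, 1);
  cudaMemcpy(x_strides_array_gpu, x_strides.data(), bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(y_strides_array_gpu, y_strides.data(), bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(out_dims_array_gpu, out_dims.data(), bytes, cudaMemcpyHostToDevice);

  cudaFree(tmp_buffer);  // one free instead of three
  return 0;
}

The diff hunks that follow apply the same idea inside CommonGradBroadcastCUDA, MatrixInverseFunctor, and MatrixEighFunctor.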
@@ -1530,37 +1530,31 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
   ComputeBroadcastKernelSize(
       y_dims_array, out_dims_array, &y_blocks, &y_threads, max_dim);
-  auto x_strides_array_tmp = paddle::memory::Alloc(
+  // One part buffer for x_strides_array, rest for y_strides_array and
+  // out_dims_array.
+  size_t tmp_total_bytes = bytes * 3;
+  auto tmp_buffer = paddle::memory::Alloc(
       ctx.GetPlace(),
-      bytes,
+      tmp_total_bytes,
       phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
-  int *x_strides_array_gpu =
-      reinterpret_cast<int *>(x_strides_array_tmp->ptr());
+  int *x_strides_array_gpu = reinterpret_cast<int *>(tmp_buffer->ptr());
+  int *y_strides_array_gpu =
+      reinterpret_cast<int *>(x_strides_array_gpu + max_dim);
+  int *out_dims_array_gpu =
+      reinterpret_cast<int *>(y_strides_array_gpu + max_dim);
   paddle::memory::Copy(gplace,
                        x_strides_array_gpu,
                        cplace,
                        x_strides_array.data(),
                        bytes,
                        ctx.stream());
-  auto y_strides_array_tmp = paddle::memory::Alloc(
-      ctx.GetPlace(),
-      bytes,
-      phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
-  int *y_strides_array_gpu =
-      reinterpret_cast<int *>(y_strides_array_tmp->ptr());
   paddle::memory::Copy(gplace,
                        y_strides_array_gpu,
                        cplace,
                        y_strides_array.data(),
                        bytes,
                        ctx.stream());
-  auto out_dims_array_tmp = paddle::memory::Alloc(
-      ctx.GetPlace(),
-      bytes,
-      phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
-  int *out_dims_array_gpu = reinterpret_cast<int *>(out_dims_array_tmp->ptr());
   paddle::memory::Copy(
       gplace, out_dims_array_gpu, cplace, out_dims_array, bytes, ctx.stream());
@@ -1569,24 +1563,21 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
   int x_block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, x_threads);
   int y_block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, y_threads);
   if (dx) {
-    auto x_strides_order_tmp = paddle::memory::Alloc(
+    size_t dx_total_bytes = bytes * 2;
+    auto dx_tmp_buffer = paddle::memory::Alloc(
         ctx.GetPlace(),
-        bytes,
+        dx_total_bytes,
         phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
-    int *x_strides_order_gpu =
-        reinterpret_cast<int *>(x_strides_order_tmp->ptr());
+    int *x_strides_order_gpu = reinterpret_cast<int *>(dx_tmp_buffer->ptr());
+    int *x_dims_order_gpu =
+        reinterpret_cast<int *>(x_strides_order_gpu + max_dim);
     paddle::memory::Copy(gplace,
                          x_strides_order_gpu,
                          cplace,
                          x_strides_order.data(),
                          bytes,
                          ctx.stream());
-    auto x_dims_order_tmp = paddle::memory::Alloc(
-        ctx.GetPlace(),
-        bytes,
-        phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
-    int *x_dims_order_gpu = reinterpret_cast<int *>(x_dims_order_tmp->ptr());
     paddle::memory::Copy(gplace,
                          x_dims_order_gpu,
                          cplace,
@@ -1610,24 +1601,22 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
                          dx_op);
   }
   if (dy) {
-    auto y_strides_order_tmp = paddle::memory::Alloc(
+    // One part buffer for y_strides_order_gpu, the other for y_dims_order_gpu
+    size_t dy_total_bytes = bytes * 2;
+    auto dy_tmp_buffer = paddle::memory::Alloc(
         ctx.GetPlace(),
-        bytes,
+        dy_total_bytes,
         phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
-    int *y_strides_order_gpu =
-        reinterpret_cast<int *>(y_strides_order_tmp->ptr());
+    int *y_strides_order_gpu = reinterpret_cast<int *>(dy_tmp_buffer->ptr());
+    int *y_dims_order_gpu =
+        reinterpret_cast<int *>(y_strides_order_gpu + max_dim);
     paddle::memory::Copy(gplace,
                          y_strides_order_gpu,
                          cplace,
                          y_strides_order.data(),
                          bytes,
                          ctx.stream());
-    auto y_dims_order_tmp = paddle::memory::Alloc(
-        ctx.GetPlace(),
-        bytes,
-        phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
-    int *y_dims_order_gpu = reinterpret_cast<int *>(y_dims_order_tmp->ptr());
     paddle::memory::Copy(gplace,
                          y_dims_order_gpu,
                          cplace,
......
@@ -55,11 +55,14 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
     cpu_ptrs[i + batch_size] = a_inv->data<T>() + i * n * n;
   }
-  // Copy the addresses of A and A_inv from host to device.
+  // Copy the addresses of A and A_inv from host to device,
+  // and allocate device memory for info and pivots.
+  int num_ints = n < 32 ? batch_size : batch_size * (n + 1);
+  size_t total_bytes = cpu_ptrs.size() * sizeof(T*) + num_ints * sizeof(int);
   paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
       paddle::memory::Alloc(
           dev_ctx.GetPlace(),
-          cpu_ptrs.size() * sizeof(T*),
+          total_bytes,
           phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
   paddle::memory::Copy(dev_ctx.GetPlace(),
                        tmp_gpu_ptrs_data->ptr(),
@@ -67,20 +70,12 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
                        static_cast<void*>(cpu_ptrs.data()),
                        cpu_ptrs.size() * sizeof(T*),
                        dev_ctx.stream());
-  T** gpu_inv_ptrs =
-      reinterpret_cast<T**>(tmp_gpu_ptrs_data->ptr()) + batch_size;
-  // Allocate device memory for info and pivots.
-  int num_ints = n < 32 ? batch_size : batch_size * (n + 1);
-  paddle::memory::allocation::AllocationPtr tmp_gpu_info_data =
-      paddle::memory::Alloc(
-          dev_ctx.GetPlace(),
-          num_ints * sizeof(int),
-          phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
-  int* gpu_info_ptr = reinterpret_cast<int*>(tmp_gpu_info_data->ptr());
+  T** gpu_inv_pivot_info = reinterpret_cast<T**>(tmp_gpu_ptrs_data->ptr());
+  T** gpu_inv_ptrs = gpu_inv_pivot_info + batch_size;
+  int* gpu_info_ptr =
+      reinterpret_cast<int*>(gpu_inv_pivot_info + cpu_ptrs.size());
   auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
   std::vector<int> info;  // only for singular checking
   info.resize(batch_size);
   // This functions in cuBLAS is intended to be used for matrices of small
@@ -100,8 +95,7 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
   // This function performs the LU factorization of each matrix A by the
   // equation P * A = L * U. L and U are written back to original matrix A,
   // and diagonal elements of L are discarded.
-  int* gpu_pivot_ptr =
-      reinterpret_cast<int*>(tmp_gpu_info_data->ptr()) + batch_size;
+  int* gpu_pivot_ptr = gpu_info_ptr + batch_size;
   blas.BatchedGETRF(n,
                     reinterpret_cast<T**>(tmp_gpu_ptrs_data->ptr()),
                     gpu_pivot_ptr,
......
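The MatrixInverseFunctor hunks above fold what used to be two device allocations (the pointer array and the info/pivot array) into one: the 2 * batch_size pointers come first and the num_ints integers sit directly after them. A standalone sketch of that layout, again using plain CUDA and illustrative values (T, batch_size, and n here are placeholders, not values from the commit):

#include <cuda_runtime.h>

// Single allocation holding [A ptrs | A_inv ptrs | info | pivots],
// sized as in the matrix_inverse hunk above: pointer section first,
// then the integer section.
int main() {
  using T = float;                   // placeholder element type
  const int batch_size = 4, n = 64;  // illustrative shapes
  const int num_ints = n < 32 ? batch_size : batch_size * (n + 1);
  const size_t ptr_bytes = 2 * batch_size * sizeof(T*);
  const size_t total_bytes = ptr_bytes + num_ints * sizeof(int);

  void *buf = nullptr;
  cudaMalloc(&buf, total_bytes);

  // Pointer section: addresses of A and A_inv for each batch entry.
  T **gpu_a_ptrs = reinterpret_cast<T **>(buf);
  T **gpu_inv_ptrs = gpu_a_ptrs + batch_size;
  // Integer section: per-batch info flags, then the pivot indices.
  int *gpu_info_ptr = reinterpret_cast<int *>(gpu_a_ptrs + 2 * batch_size);
  int *gpu_pivot_ptr = gpu_info_ptr + batch_size;

  (void)gpu_inv_ptrs;
  (void)gpu_pivot_ptr;
  cudaFree(buf);  // a single free releases every sub-buffer
  return 0;
}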
@@ -354,12 +354,6 @@ struct MatrixEighFunctor<GPUContext, T> {
         has_vectors ? CUSOLVER_EIG_MODE_VECTOR : CUSOLVER_EIG_MODE_NOVECTOR;
     ValueType *out_value = dev_ctx.template Alloc<ValueType>(eigen_values);
-    auto info = paddle::memory::Alloc(
-        dev_ctx.GetPlace(),
-        sizeof(int) * batch_size,
-        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
-    auto *info_ptr = reinterpret_cast<int *>(info->ptr());
     DenseTensor input_trans = phi::TransposeLast2Dim<T>(dev_ctx, input);
     T *input_vector = input_trans.data<T>();
@@ -410,11 +404,13 @@ struct MatrixEighFunctor<GPUContext, T> {
           out_value,
           &workspace_size);
     }
+    size_t total_bytes = sizeof(T) * workspace_size + sizeof(int) * batch_size;
     auto work = paddle::memory::Alloc(
         dev_ctx.GetPlace(),
-        sizeof(T) * workspace_size,
+        total_bytes,
         phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
     auto *work_ptr = reinterpret_cast<T *>(work->ptr());
+    auto *info_ptr = reinterpret_cast<int *>(work_ptr + workspace_size);
     for (auto i = 0; i < batch_size; ++i) {
       auto *input_data = input_vector + i * vector_stride;
......