Unverified commit bdae5481, authored by limingshu, committed by GitHub

Combination of multiple paddle::memory::allocate operations into one for ops (#49126)

* A first attempt at cudaLaunchCooperativeKernel

* fix bugs

* Totally replace the lar cuda kernel

* Fix bugs

* fix code according to comments

* fix code according to review comments

* adding some function overload

* relocate the power operation.

* add bf16 support for index select relevant ops

* revert bf16 type change.

* add changes for more op

* fix code writing bugs
Parent af673090
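The change follows one pattern throughout: instead of issuing a separate paddle::memory::Alloc for each temporary array, it makes a single allocation sized for all of them and carves out sub-buffers with pointer offsets. Below is a minimal standalone sketch of that pattern, written with plain CUDA runtime calls instead of Paddle's allocator so it compiles on its own; the buffer names and sizes are illustrative only and are not part of the commit.

#include <cuda_runtime.h>
#include <vector>

// Fuse three equally sized device buffers into one allocation and
// address them via offsets, mirroring the bytes * 3 allocation in the
// first hunk below.
int main() {
  const int max_dim = 8;                       // illustrative size
  const size_t bytes = max_dim * sizeof(int);  // per-array size
  const size_t tmp_total_bytes = bytes * 3;    // one allocation for all three

  int *tmp_buffer = nullptr;
  cudaMalloc(&tmp_buffer, tmp_total_bytes);

  // Carve the single allocation into three sub-buffers.
  int *x_strides_array_gpu = tmp_buffer;
  int *y_strides_array_gpu = x_strides_array_gpu + max_dim;
  int *out_dims_array_gpu = y_strides_array_gpu + max_dim;

  std::vector<int> x_strides(max_dim, 1), y_strides(max_dim, 1), out_dims(max_dim, 1);
  cudaMemcpy(x_strides_array_gpu, x_strides.data(), bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(y_strides_array_gpu, y_strides.data(), bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(out_dims_array_gpu, out_dims.data(), bytes, cudaMemcpyHostToDevice);

  cudaFree(tmp_buffer);  // one free instead of three
  return 0;
}

The diff hunks that follow apply the same idea inside CommonGradBroadcastCUDA, MatrixInverseFunctor, and MatrixEighFunctor.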
@@ -1530,37 +1530,31 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
   ComputeBroadcastKernelSize(
       y_dims_array, out_dims_array, &y_blocks, &y_threads, max_dim);
-  auto x_strides_array_tmp = paddle::memory::Alloc(
+  // One part buffer for x_strides_array, rest for y_strides_array and
+  // out_dims_array.
+  size_t tmp_total_bytes = bytes * 3;
+  auto tmp_buffer = paddle::memory::Alloc(
       ctx.GetPlace(),
-      bytes,
+      tmp_total_bytes,
       phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
-  int *x_strides_array_gpu =
-      reinterpret_cast<int *>(x_strides_array_tmp->ptr());
+  int *x_strides_array_gpu = reinterpret_cast<int *>(tmp_buffer->ptr());
+  int *y_strides_array_gpu =
+      reinterpret_cast<int *>(x_strides_array_gpu + max_dim);
+  int *out_dims_array_gpu =
+      reinterpret_cast<int *>(y_strides_array_gpu + max_dim);
   paddle::memory::Copy(gplace,
                        x_strides_array_gpu,
                        cplace,
                        x_strides_array.data(),
                        bytes,
                        ctx.stream());
-  auto y_strides_array_tmp = paddle::memory::Alloc(
-      ctx.GetPlace(),
-      bytes,
-      phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
-  int *y_strides_array_gpu =
-      reinterpret_cast<int *>(y_strides_array_tmp->ptr());
   paddle::memory::Copy(gplace,
                        y_strides_array_gpu,
                        cplace,
                        y_strides_array.data(),
                        bytes,
                        ctx.stream());
-  auto out_dims_array_tmp = paddle::memory::Alloc(
-      ctx.GetPlace(),
-      bytes,
-      phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
-  int *out_dims_array_gpu = reinterpret_cast<int *>(out_dims_array_tmp->ptr());
   paddle::memory::Copy(
       gplace, out_dims_array_gpu, cplace, out_dims_array, bytes, ctx.stream());
@@ -1569,24 +1563,21 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
   int x_block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, x_threads);
   int y_block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, y_threads);
   if (dx) {
-    auto x_strides_order_tmp = paddle::memory::Alloc(
+    size_t dx_total_bytes = bytes * 2;
+    auto dx_tmp_buffer = paddle::memory::Alloc(
         ctx.GetPlace(),
-        bytes,
+        dx_total_bytes,
         phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
-    int *x_strides_order_gpu =
-        reinterpret_cast<int *>(x_strides_order_tmp->ptr());
+    int *x_strides_order_gpu = reinterpret_cast<int *>(dx_tmp_buffer->ptr());
+    int *x_dims_order_gpu =
+        reinterpret_cast<int *>(x_strides_order_gpu + max_dim);
     paddle::memory::Copy(gplace,
                          x_strides_order_gpu,
                          cplace,
                          x_strides_order.data(),
                          bytes,
                          ctx.stream());
-    auto x_dims_order_tmp = paddle::memory::Alloc(
-        ctx.GetPlace(),
-        bytes,
-        phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
-    int *x_dims_order_gpu = reinterpret_cast<int *>(x_dims_order_tmp->ptr());
     paddle::memory::Copy(gplace,
                          x_dims_order_gpu,
                          cplace,
@@ -1610,24 +1601,22 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
                          dx_op);
   }
   if (dy) {
-    auto y_strides_order_tmp = paddle::memory::Alloc(
+    // One part buffer for y_strides_order_gpu, the other for y_dims_order_gpu
+    size_t dy_total_bytes = bytes * 2;
+    auto dy_tmp_buffer = paddle::memory::Alloc(
         ctx.GetPlace(),
-        bytes,
+        dy_total_bytes,
         phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
-    int *y_strides_order_gpu =
-        reinterpret_cast<int *>(y_strides_order_tmp->ptr());
+    int *y_strides_order_gpu = reinterpret_cast<int *>(dy_tmp_buffer->ptr());
+    int *y_dims_order_gpu =
+        reinterpret_cast<int *>(y_strides_order_gpu + max_dim);
     paddle::memory::Copy(gplace,
                          y_strides_order_gpu,
                          cplace,
                          y_strides_order.data(),
                          bytes,
                          ctx.stream());
-    auto y_dims_order_tmp = paddle::memory::Alloc(
-        ctx.GetPlace(),
-        bytes,
-        phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
-    int *y_dims_order_gpu = reinterpret_cast<int *>(y_dims_order_tmp->ptr());
     paddle::memory::Copy(gplace,
                          y_dims_order_gpu,
                          cplace,
......
@@ -55,11 +55,14 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
     cpu_ptrs[i + batch_size] = a_inv->data<T>() + i * n * n;
   }
-  // Copy the addresses of A and A_inv from host to device.
+  // Copy the addresses of A and A_inv from host to device,
+  // and allocate device memory for info and pivots.
+  int num_ints = n < 32 ? batch_size : batch_size * (n + 1);
+  size_t total_bytes = cpu_ptrs.size() * sizeof(T*) + num_ints * sizeof(int);
   paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
       paddle::memory::Alloc(
           dev_ctx.GetPlace(),
-          cpu_ptrs.size() * sizeof(T*),
+          total_bytes,
           phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
   paddle::memory::Copy(dev_ctx.GetPlace(),
                        tmp_gpu_ptrs_data->ptr(),
@@ -67,20 +70,12 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
                        static_cast<void*>(cpu_ptrs.data()),
                        cpu_ptrs.size() * sizeof(T*),
                        dev_ctx.stream());
-  T** gpu_inv_ptrs =
-      reinterpret_cast<T**>(tmp_gpu_ptrs_data->ptr()) + batch_size;
-  // Allocate device memory for info and pivots.
-  int num_ints = n < 32 ? batch_size : batch_size * (n + 1);
-  paddle::memory::allocation::AllocationPtr tmp_gpu_info_data =
-      paddle::memory::Alloc(
-          dev_ctx.GetPlace(),
-          num_ints * sizeof(int),
-          phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
-  int* gpu_info_ptr = reinterpret_cast<int*>(tmp_gpu_info_data->ptr());
+  T** gpu_inv_pivot_info = reinterpret_cast<T**>(tmp_gpu_ptrs_data->ptr());
+  T** gpu_inv_ptrs = gpu_inv_pivot_info + batch_size;
+  int* gpu_info_ptr =
+      reinterpret_cast<int*>(gpu_inv_pivot_info + cpu_ptrs.size());
   auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
   std::vector<int> info;  // only for singular checking
   info.resize(batch_size);
   // This functions in cuBLAS is intended to be used for matrices of small
@@ -100,8 +95,7 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
   // This function performs the LU factorization of each matrix A by the
   // equation P * A = L * U. L and U are written back to original matrix A,
   // and diagonal elements of L are discarded.
-  int* gpu_pivot_ptr =
-      reinterpret_cast<int*>(tmp_gpu_info_data->ptr()) + batch_size;
+  int* gpu_pivot_ptr = gpu_info_ptr + batch_size;
   blas.BatchedGETRF(n,
                     reinterpret_cast<T**>(tmp_gpu_ptrs_data->ptr()),
                     gpu_pivot_ptr,
......
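The MatrixInverseFunctor hunks above fold what used to be two device allocations (the pointer array and the info/pivot array) into one: the 2 * batch_size pointers come first and the num_ints integers sit directly after them. A standalone sketch of that layout, again using plain CUDA and illustrative values (T, batch_size, and n here are placeholders, not values from the commit):

#include <cuda_runtime.h>

// Single allocation holding [A ptrs | A_inv ptrs | info | pivots],
// sized as in the matrix_inverse hunk above: pointer section first,
// then the integer section.
int main() {
  using T = float;                   // placeholder element type
  const int batch_size = 4, n = 64;  // illustrative shapes
  const int num_ints = n < 32 ? batch_size : batch_size * (n + 1);
  const size_t ptr_bytes = 2 * batch_size * sizeof(T*);
  const size_t total_bytes = ptr_bytes + num_ints * sizeof(int);

  void *buf = nullptr;
  cudaMalloc(&buf, total_bytes);

  // Pointer section: addresses of A and A_inv for each batch entry.
  T **gpu_a_ptrs = reinterpret_cast<T **>(buf);
  T **gpu_inv_ptrs = gpu_a_ptrs + batch_size;
  // Integer section: per-batch info flags, then the pivot indices.
  int *gpu_info_ptr = reinterpret_cast<int *>(gpu_a_ptrs + 2 * batch_size);
  int *gpu_pivot_ptr = gpu_info_ptr + batch_size;

  (void)gpu_inv_ptrs;
  (void)gpu_pivot_ptr;
  cudaFree(buf);  // a single free releases every sub-buffer
  return 0;
}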
@@ -354,12 +354,6 @@ struct MatrixEighFunctor<GPUContext, T> {
         has_vectors ? CUSOLVER_EIG_MODE_VECTOR : CUSOLVER_EIG_MODE_NOVECTOR;
     ValueType *out_value = dev_ctx.template Alloc<ValueType>(eigen_values);
-    auto info = paddle::memory::Alloc(
-        dev_ctx.GetPlace(),
-        sizeof(int) * batch_size,
-        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
-    auto *info_ptr = reinterpret_cast<int *>(info->ptr());
     DenseTensor input_trans = phi::TransposeLast2Dim<T>(dev_ctx, input);
     T *input_vector = input_trans.data<T>();
@@ -410,11 +404,13 @@ struct MatrixEighFunctor<GPUContext, T> {
           out_value,
           &workspace_size);
     }
+    size_t total_bytes = sizeof(T) * workspace_size + sizeof(int) * batch_size;
     auto work = paddle::memory::Alloc(
         dev_ctx.GetPlace(),
-        sizeof(T) * workspace_size,
+        total_bytes,
         phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
     auto *work_ptr = reinterpret_cast<T *>(work->ptr());
+    auto *info_ptr = reinterpret_cast<int *>(work_ptr + workspace_size);
     for (auto i = 0; i < batch_size; ++i) {
       auto *input_data = input_vector + i * vector_stride;
......