H2D data transfer optimization for split kernel (#49086)

* profile reduce kernel for fp16 and reduceHigherdim * use reinterpret_cast * fix for CI on ROCm * add Macro for ROCm * ROCm CI config * ROCm CI config * unit test repair * pull * add common_funcs.h * reduceType * Update reduce_function.h * not higher * rename * implementation of matmul using cublasLt instead of cublas * cublasLt bugfix * Update matmul_kernel_impl.h * Update matmul_kernel_impl_via_blasLt.h * for-loop-algo * PR comments changes * add macro * ci unused variable isCublasLt * ci unused variable isCublasLt macro * split matmul to autotune * rewrite the split kernel with segmented_array * rewrite the split kernel with segmented_array * rewrite the split kernel with segmented_array * add some methods for cuda_graph * fix bugs for rocm * change for ci-error * I don't know why ci-model-benchmark reports an error, so I restored the original code to see whether the original code works. * add some changes for passing mode_benchmark and coverage ci * fix ci error * fix ci-rocm error * add some changes for header --------- Co-authored-by: N zhangbopd <1299246947@qq.com> Co-authored-by: N Bo Zhang <105368690+zhangbopd@users.noreply.github.com>

H2D data transfer optimization for split kernel (#49086)
* profile reduce kernel for fp16 and reduceHigherdim * use reinterpret_cast * fix for CI on ROCm * add Macro for ROCm * ROCm CI config * ROCm CI config * unit test repair * pull * add common_funcs.h * reduceType * Update reduce_function.h * not higher * rename * implementation of matmul using cublasLt instead of cublas * cublasLt bugfix * Update matmul_kernel_impl.h * Update matmul_kernel_impl_via_blasLt.h * for-loop-algo * PR comments changes * add macro * ci unused variable isCublasLt * ci unused variable isCublasLt macro * split matmul to autotune * rewrite the split kernel with segmented_array * rewrite the split kernel with segmented_array * rewrite the split kernel with segmented_array * add some methods for cuda_graph * fix bugs for rocm * change for ci-error * I don't know why ci-model-benchmark reports an error, so I restored the original code to see whether the original code works. * add some changes for passing mode_benchmark and coverage ci * fix ci error * fix ci-rocm error * add some changes for header --------- Co-authored-by: N zhangbopd <1299246947@qq.com> Co-authored-by: N Bo Zhang <105368690+zhangbopd@users.noreply.github.com>
057ba778 · limingshu · GitHub · dc1b6511 · 057ba778 · 057ba778
3 changed files
--- a/paddle/phi/kernels/funcs/concat_and_split_functor.cu
+++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cu
--- a/paddle/phi/kernels/funcs/segmented_array.h
+++ b/paddle/phi/kernels/funcs/segmented_array.h
@@ -14,6 +14,7 @@

 #pragma once

+#include "paddle/phi/backends/gpu/cuda/cuda_graph_with_memory_pool.h"
 #include "paddle/phi/core/dense_tensor.h"

 namespace phi {
@@ -34,6 +35,26 @@ enum class SegmentedArraySize {
  kFixed64 = 64,
 };

+template <typename T, SegmentedArraySize Size, int Num = static_cast<int>(Size)>
+struct PADDLE_ALIGN(256) ValueArray {
+ public:
+  T data[Num];
+
+  void Set(T* ptr, const int num) {
+    for (auto i = 0; i < num; ++i) {
+      data[i] = ptr[i];
+    }
+  }
+};
+
+template <typename T>
+struct PADDLE_ALIGN(256) ValueArray<T, SegmentedArraySize::kVariableLength, 0> {
+ public:
+  T* data{nullptr};
+
+  void Set(T* ptr, const int num) { data = ptr; }
+};
+
 template <typename T, SegmentedArraySize Size>
 struct PADDLE_ALIGN(256) ConstPointerArray {
 public:
@@ -62,8 +83,8 @@ struct PADDLE_ALIGN(256) PointerArray {
 public:
  T* data[static_cast<int>(Size)];

-  void Set(const std::vector<T*>& ptrs, T** dev_ptr = nullptr) {
-    for (auto i = 0; i < ptrs.size(); ++i) {
+  void Set(T** ptrs, const int num, T** dev_ptr = nullptr) {
+    for (auto i = 0; i < num; ++i) {
      data[i] = ptrs[i];
    }
  }
@@ -74,9 +95,7 @@ struct PADDLE_ALIGN(256) PointerArray<T, SegmentedArraySize::kVariableLength> {
 public:
  T** data{nullptr};

-  void Set(const std::vector<T*>& ptrs, T** dev_ptr = nullptr) {
-    data = dev_ptr;
-  }
+  void Set(T** ptrs, const int num, T** dev_ptr = nullptr) { data = dev_ptr; }
 };

 #undef PADDLE_ALIGN
@@ -84,13 +103,24 @@ struct PADDLE_ALIGN(256) PointerArray<T, SegmentedArraySize::kVariableLength> {
 template <typename Context>
 struct ArraySetterBase {
 protected:
-  void* AllocAndCopy(const Context& ctx, void* src, size_t num_bytes) {
+  void* AllocAndCopy(const Context& ctx,
+                     void* src,
+                     size_t num_bytes,
+                     bool use_cuda_graph = false) {
    allocation = paddle::memory::Alloc(
        ctx.GetPlace(),
        num_bytes,
        phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
+
+    int8_t* restored = reinterpret_cast<int8_t*>(src);
+#ifdef PADDLE_WITH_CUDA
+    if (use_cuda_graph) {
+      restored = phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph<int8_t>(
+          restored, num_bytes);
+    }
+#endif
    phi::backends::gpu::GpuMemcpyAsync(allocation->ptr(),
-                                       src,
+                                       restored,
                                       num_bytes,
                                       phi::gpuMemcpyHostToDevice,
                                       ctx.stream());
@@ -131,13 +161,28 @@ struct PointerArraySetter : public ArraySetterBase<Context> {
 public:
  PointerArray<T, Size> array;

-  PointerArraySetter(const Context& ctx, std::vector<DenseTensor*>* t) {
+  // need_alloc: whether each tensor's memory is allocated via ctx.Alloc.
+  // use_cuda_graph: whether the tensor data shall be captured by cuda_graph.
+  // pre_alloc_host_buf: optional pinned-memory buffer that temporarily holds
+  // the tensor data pointers.
+  PointerArraySetter(const Context& ctx,
+                     std::vector<DenseTensor*>* t,
+                     bool need_alloc = false,
+                     bool use_cuda_graph = false,
+                     T** pre_alloc_host_buf = nullptr) {
    ptrs.resize(t->size());
+    T** data_ptr = ptrs.data();
+#ifdef PADDLE_WITH_HIP
+    if (pre_alloc_host_buf) {
+      data_ptr = pre_alloc_host_buf;
+    }
+#endif
    for (int i = 0; i < t->size(); ++i) {
      if (t->at(i) && (t->at(i)->numel() > 0)) {
-        ptrs[i] = ctx.template Alloc<T>(t->at(i));
+        data_ptr[i] =
+            need_alloc ? ctx.template Alloc<T>(t->at(i)) : t->at(i)->data<T>();
      } else {
-        ptrs[i] = nullptr;
+        data_ptr[i] = nullptr;
      }
    }

@@ -145,10 +190,9 @@ struct PointerArraySetter : public ArraySetterBase<Context> {
    if (Size == SegmentedArraySize::kVariableLength) {
      size_t num_bytes = t->size() * sizeof(T*);
      dev_ptr = reinterpret_cast<T**>(this->AllocAndCopy(
-          ctx, reinterpret_cast<void*>(ptrs.data()), num_bytes));
+          ctx, reinterpret_cast<void*>(data_ptr), num_bytes, use_cuda_graph));
    }
-
-    array.Set(ptrs, dev_ptr);
+    array.Set(data_ptr, t->size(), dev_ptr);
  }

 private:

--- a/paddle/phi/kernels/funcs/stack_and_unstack.h
+++ b/paddle/phi/kernels/funcs/stack_and_unstack.h
@@ -192,7 +192,7 @@ void LaunchUnStackKernel(const Context& ctx,
          << ", out_col=" << out_col << ", num_splits=" << num_splits;

  auto x_ptr = x.data<T>();
-  PointerArraySetter<Context, T, Size> setter(ctx, outs);
+  PointerArraySetter<Context, T, Size> setter(ctx, outs, /*need_alloc=*/true);

  if (out_col == 1) {
    // For the case axis == (x.dims().size() - 1)