Unverified · Commit 057ba778, authored by limingshu, committed by GitHub

H2D data transfer optimization for split kernel (#49086)

* profile reduce kernel for fp16 and reduceHigherdim

* use reinterpret_cast

* fix for CI on ROCm

* add Macro for ROCm

* ROCm CI config

* ROCm CI config

* unit test repair

* pull

* add common_funcs.h

* reduceType

* Update reduce_function.h

* not higher

* rename

* implement of matmul using cublasLt instead of cublas

* cublasLt bugfix

* Update matmul_kernel_impl.h

* Update matmul_kernel_impl_via_blasLt.h

* for-loop-algo

* PR comments changes

* add macro

* ci unused variable isCublasLt

* ci unused variable isCublasLt macro

* split matmul to autotune

* rewrite the split kernel with segmented_array

* rewrite the split kernel with segmented_array

* rewrite the split kernel with segmented_array

* add some method for cuda_graph

* fix bugs for rocm

* change for ci-error

* ci-model-benchmark reports an unexplained error, so temporarily restore the original code to check whether it passes

* add some changes for passing mode_benchmark and coverage ci

* fix ci error

* fix ci-rocm error

* add some changes for header

---------
Co-authored-by: zhangbopd <1299246947@qq.com>
Co-authored-by: Bo Zhang <105368690+zhangbopd@users.noreply.github.com>
Parent commit: dc1b6511
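The diff below packs the per-output pointers used by the split/unstack kernels into a `PointerArray` that is passed to the kernel by value, so for the common fixed-size cases the pointer table travels inside the kernel-launch arguments and needs no dedicated host-to-device copy; only the variable-length specialization falls back to one batched `GpuMemcpyAsync` of the whole table. A minimal standalone CUDA sketch of that idea follows; all names here are illustrative, not Paddle's API.

// Sketch only: a fixed-size POD struct of output pointers passed by value,
// analogous to PointerArray<T, kFixedN> in the diff below.
#include <cstdio>
#include <cuda_runtime.h>

constexpr int kMaxOuts = 8;

struct OutPtrArray {
  float* data[kMaxOuts];  // fits in the kernel-launch parameter buffer
};

__global__ void SplitKernel(const float* x,
                            OutPtrArray outs,  // by value: no extra H2D copy
                            int cols_per_out,
                            int num_outs) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < cols_per_out * num_outs) {
    int which = idx / cols_per_out;   // which output tensor
    int offset = idx % cols_per_out;  // element inside that output
    outs.data[which][offset] = x[idx];
  }
}

int main() {
  const int num_outs = 4, cols = 256, total = num_outs * cols;
  float* x = nullptr;
  cudaMalloc(reinterpret_cast<void**>(&x), total * sizeof(float));

  OutPtrArray packed{};
  for (int i = 0; i < num_outs; ++i) {
    cudaMalloc(reinterpret_cast<void**>(&packed.data[i]), cols * sizeof(float));
  }

  // One launch; the pointer table rides along with the launch arguments.
  SplitKernel<<<(total + 255) / 256, 256>>>(x, packed, cols, num_outs);
  cudaDeviceSynchronize();
  printf("split finished: %s\n", cudaGetErrorString(cudaGetLastError()));
  return 0;
}

When the number of outputs exceeds the largest fixed size, the `kVariableLength` path in the diff still performs a single batched async copy of the pointer table rather than one transfer per output.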
@@ -14,6 +14,7 @@
 #pragma once

+#include "paddle/phi/backends/gpu/cuda/cuda_graph_with_memory_pool.h"
 #include "paddle/phi/core/dense_tensor.h"

 namespace phi {
@@ -34,6 +35,26 @@ enum class SegmentedArraySize {
   kFixed64 = 64,
 };

+template <typename T, SegmentedArraySize Size, int Num = static_cast<int>(Size)>
+struct PADDLE_ALIGN(256) ValueArray {
+ public:
+  T data[Num];
+
+  void Set(T* ptr, const int num) {
+    for (auto i = 0; i < num; ++i) {
+      data[i] = ptr[i];
+    }
+  }
+};
+
+template <typename T>
+struct PADDLE_ALIGN(256) ValueArray<T, SegmentedArraySize::kVariableLength, 0> {
+ public:
+  T* data{nullptr};
+
+  void Set(T* ptr, const int num) { data = ptr; }
+};
+
 template <typename T, SegmentedArraySize Size>
 struct PADDLE_ALIGN(256) ConstPointerArray {
  public:
@@ -62,8 +83,8 @@ struct PADDLE_ALIGN(256) PointerArray {
  public:
   T* data[static_cast<int>(Size)];

-  void Set(const std::vector<T*>& ptrs, T** dev_ptr = nullptr) {
-    for (auto i = 0; i < ptrs.size(); ++i) {
+  void Set(T** ptrs, const int num, T** dev_ptr = nullptr) {
+    for (auto i = 0; i < num; ++i) {
       data[i] = ptrs[i];
     }
   }
@@ -74,9 +95,7 @@ struct PADDLE_ALIGN(256) PointerArray<T, SegmentedArraySize::kVariableLength> {
  public:
   T** data{nullptr};

-  void Set(const std::vector<T*>& ptrs, T** dev_ptr = nullptr) {
-    data = dev_ptr;
-  }
+  void Set(T** ptrs, const int num, T** dev_ptr = nullptr) { data = dev_ptr; }
 };

 #undef PADDLE_ALIGN
@@ -84,13 +103,24 @@ struct PADDLE_ALIGN(256) PointerArray<T, SegmentedArraySize::kVariableLength> {
 template <typename Context>
 struct ArraySetterBase {
  protected:
-  void* AllocAndCopy(const Context& ctx, void* src, size_t num_bytes) {
+  void* AllocAndCopy(const Context& ctx,
+                     void* src,
+                     size_t num_bytes,
+                     bool use_cuda_graph = false) {
     allocation = paddle::memory::Alloc(
         ctx.GetPlace(),
         num_bytes,
         phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
+
+    int8_t* restored = reinterpret_cast<int8_t*>(src);
+#ifdef PADDLE_WITH_CUDA
+    if (use_cuda_graph) {
+      restored = phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph<int8_t>(
+          restored, num_bytes);
+    }
+#endif
     phi::backends::gpu::GpuMemcpyAsync(allocation->ptr(),
-                                       src,
+                                       restored,
                                        num_bytes,
                                        phi::gpuMemcpyHostToDevice,
                                        ctx.stream());
@@ -131,13 +161,28 @@ struct PointerArraySetter : public ArraySetterBase<Context> {
  public:
   PointerArray<T, Size> array;

-  PointerArraySetter(const Context& ctx, std::vector<DenseTensor*>* t) {
+  // need_alloc : tensor data needs extra buffer or not.
+  // use_cuda_graph: tensor data shall be captured by cuda_graph or not.
+  // pre_alloc_host_buf: tensor data is temporaily stored by pinned memory or
+  // not.
+  PointerArraySetter(const Context& ctx,
+                     std::vector<DenseTensor*>* t,
+                     bool need_alloc = false,
+                     bool use_cuda_graph = false,
+                     T** pre_alloc_host_buf = nullptr) {
     ptrs.resize(t->size());
+    T** data_ptr = ptrs.data();
+#ifdef PADDLE_WITH_HIP
+    if (pre_alloc_host_buf) {
+      data_ptr = pre_alloc_host_buf;
+    }
+#endif
     for (int i = 0; i < t->size(); ++i) {
       if (t->at(i) && (t->at(i)->numel() > 0)) {
-        ptrs[i] = ctx.template Alloc<T>(t->at(i));
+        data_ptr[i] =
+            need_alloc ? ctx.template Alloc<T>(t->at(i)) : t->at(i)->data<T>();
       } else {
-        ptrs[i] = nullptr;
+        data_ptr[i] = nullptr;
       }
     }
@@ -145,10 +190,9 @@ struct PointerArraySetter : public ArraySetterBase<Context> {
     if (Size == SegmentedArraySize::kVariableLength) {
       size_t num_bytes = t->size() * sizeof(T*);
       dev_ptr = reinterpret_cast<T**>(this->AllocAndCopy(
-          ctx, reinterpret_cast<void*>(ptrs.data()), num_bytes));
+          ctx, reinterpret_cast<void*>(data_ptr), num_bytes, use_cuda_graph));
     }
-
-    array.Set(ptrs, dev_ptr);
+    array.Set(data_ptr, t->size(), dev_ptr);
   }

  private:
...
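Two of the new constructor parameters above concern where the host-side pointer table lives. `pre_alloc_host_buf` (taken only under `PADDLE_WITH_HIP` in this diff) lets a caller supply a pre-allocated pinned staging buffer, since an async H2D copy only overlaps other work when its source is page-locked; `use_cuda_graph` routes the source through `RestoreHostMemIfCapturingCUDAGraph` so the host buffer read by the recorded copy stays valid when a captured CUDA graph is replayed. Below is a hedged standalone sketch of the pinned-staging part only; the function and variable names are illustrative and not Paddle's API.

// Sketch only: copy an n-entry pointer table to the device through a reused
// page-locked staging buffer. Not thread-safe.
#include <cuda_runtime.h>

void CopyPointerTable(float** dev_table,
                      float* const* host_ptrs,
                      int n,
                      cudaStream_t stream) {
  static float** pinned = nullptr;  // reused pinned staging buffer
  static int capacity = 0;
  if (n > capacity) {
    if (pinned != nullptr) cudaFreeHost(pinned);
    cudaHostAlloc(reinterpret_cast<void**>(&pinned),
                  n * sizeof(float*),
                  cudaHostAllocDefault);
    capacity = n;
  }
  for (int i = 0; i < n; ++i) {
    pinned[i] = host_ptrs[i];
  }
  // With a pinned source this copy can overlap other stream work, instead of
  // being staged through a driver-managed buffer on every call.
  cudaMemcpyAsync(dev_table, pinned, n * sizeof(float*),
                  cudaMemcpyHostToDevice, stream);
}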
@@ -192,7 +192,7 @@ void LaunchUnStackKernel(const Context& ctx,
           << ", out_col=" << out_col << ", num_splits=" << num_splits;

   auto x_ptr = x.data<T>();
-  PointerArraySetter<Context, T, Size> setter(ctx, outs);
+  PointerArraySetter<Context, T, Size> setter(ctx, outs, /*need_alloc=*/true);

   if (out_col == 1) {
     // For the case axis == (x.dims().size() - 1)
...