Unverified commit 8c3777df, authored by Wilber, committed by GitHub

[multi-stream] Fix split and concat problem. (#43039)

Parent 9bb39d48
@@ -1090,12 +1090,6 @@ CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
     process_level_allocator_enabled = true;
   }
-  // TODO(Jingzhuangzhuang): Fix trt error when allocator_strategy is
-  // auto_growth
-  if (config.tensorrt_engine_enabled()) {
-    gflags.push_back("--allocator_strategy=naive_best_fit");
-  }
   if (framework::InitGflags(gflags)) {
     VLOG(3) << "The following gpu analysis configurations only take effect "
                "for the first predictor: ";
...
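This hunk drops the workaround that forced `--allocator_strategy=naive_best_fit` whenever TensorRT was enabled, so TensorRT predictors now run under the default auto_growth allocator. A minimal sketch of creating such a predictor through the public paddle_infer API (the model path and TensorRT parameters below are placeholders, not taken from this commit):

```cpp
#include "paddle_inference_api.h"  // public paddle_infer C++ API

int main() {
  paddle_infer::Config config;
  config.SetModel("./model_dir");      // placeholder model directory
  config.EnableUseGpu(500 /* MB */, 0 /* device id */);
  // TensorRT subgraph engine; after this commit this configuration no longer
  // forces the predictor onto the naive_best_fit allocator strategy.
  config.EnableTensorRtEngine(1 << 30 /* workspace */, 1 /* max batch */,
                              3 /* min subgraph size */,
                              paddle_infer::PrecisionType::kFloat32,
                              false /* use_static */, false /* use_calib_mode */);
  auto predictor = paddle_infer::CreatePredictor(config);
  return predictor != nullptr ? 0 : 1;
}
```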
@@ -87,9 +87,11 @@ endif()
 if(WITH_GPU)
   if(NOT WIN32)
+    add_definitions("-DPADDLE_WITH_GPU")
     set(CUDA_LIB
         "/usr/local/cuda/lib64/"
         CACHE STRING "CUDA Library")
+    include_directories("${CUDA_LIB}/../include")
   else()
     set(CUDA_LIB
         ""
...
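The two added CMake lines make GPU-only code in the demo/test sources buildable: `PADDLE_WITH_GPU` becomes a preprocessor definition and the CUDA headers next to `${CUDA_LIB}` go on the include path. A hypothetical fragment of the kind of guard this enables (illustrative only, not code from this commit):

```cpp
#include <cstdio>

int main() {
#ifdef PADDLE_WITH_GPU
  // Compiled only when CMake is configured with WITH_GPU, which now defines
  // PADDLE_WITH_GPU and adds ${CUDA_LIB}/../include to the include path.
  std::printf("GPU build\n");
#else
  std::printf("CPU-only build\n");
#endif
  return 0;
}
```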
@@ -157,7 +157,7 @@ TEST(tensorrt_tester_LeViT, multi_thread4_trt_fp32_bz2) {
   for (int i = 0; i < thread_num; ++i) {
     threads.emplace_back(paddle::test::SingleThreadPrediction,
                          pred_pool.Retrive(i), &my_input_data_map,
-                         &infer_output_data, 2);
+                         &infer_output_data, 10);
   }
   // thread join & check outputs
...
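Only the per-thread repeat count changes here (2 to 10), so each of the four threads now runs ten predictions and exercises stream and allocation reuse more heavily. A rough sketch of the same multi-thread pattern with the public API (the paddle_infer calls are real; the loop structure and helper names are illustrative, not the test's exact code):

```cpp
#include <memory>
#include <thread>
#include <vector>
#include "paddle_inference_api.h"

void RunManyPredictions(std::shared_ptr<paddle_infer::Predictor> predictor,
                        int repeats) {
  for (int r = 0; r < repeats; ++r) {
    // ... feed inputs via predictor->GetInputHandle(name), then:
    predictor->Run();
  }
}

int main() {
  paddle_infer::Config config;
  // ... set the model path, EnableUseGpu, EnableTensorRtEngine as sketched above ...
  auto main_predictor = paddle_infer::CreatePredictor(config);

  constexpr int thread_num = 4;
  std::vector<std::shared_ptr<paddle_infer::Predictor>> pool;
  for (int i = 0; i < thread_num; ++i) pool.push_back(main_predictor->Clone());

  std::vector<std::thread> workers;
  for (int i = 0; i < thread_num; ++i)
    workers.emplace_back(RunManyPredictions, pool[i], 10 /* repeats */);
  for (auto& w : workers) w.join();  // thread join & check outputs
  return 0;
}
```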
@@ -648,7 +648,7 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>(
   platform::SetDeviceId(dst_place.device);
   VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
-          << dst_place << " by thream(" << stream << ")";
+          << dst_place << " by stream(" << stream << ")";
   if (stream) {
     platform::RecordEvent record_event(
         "GpuMemcpyAsync:CPU->GPU", platform::TracerEventType::UserDefined, 1);
...
@@ -54,7 +54,9 @@ AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size) {
   auto& desired_dev_ctx =
       static_cast<const platform::CUDADeviceContext&>(dev_ctx);
   if (default_dev_ctx->stream() == desired_dev_ctx.stream()) {
-    return Alloc(place, size);
+    return paddle::memory::Alloc(desired_dev_ctx.GetPlace(), size,
+                                 phi::Stream(reinterpret_cast<phi::StreamId>(
+                                     desired_dev_ctx.stream())));
   } else {
     return allocation::CUDADeviceContextAllocatorPool::Instance().Alloc(
         desired_dev_ctx, size);
...
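The change above routes the allocation through the stream-aware `Alloc` overload even when the requested context shares the default stream, so every block is tagged with the stream that will use it. A minimal sketch of that call in isolation (header paths are approximate; assumes a `phi::GPUContext` is available):

```cpp
#include "paddle/fluid/memory/malloc.h"           // paddle::memory::Alloc
#include "paddle/phi/backends/gpu/gpu_context.h"  // phi::GPUContext
#include "paddle/phi/core/stream.h"               // phi::Stream / phi::StreamId

// Allocate `size` bytes associated with ctx's CUDA stream, so the allocator
// will not hand the block to another stream before this stream is done with it.
paddle::memory::AllocationPtr AllocOnStream(const phi::GPUContext& ctx,
                                            size_t size) {
  return paddle::memory::Alloc(
      ctx.GetPlace(), size,
      phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
}
```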
@@ -504,8 +504,7 @@ struct GPUContext::Impl {
   void AddStreamCallback(const std::function<void()>& callback) const {
     // NOTE(zhiqiu): better use threadpool here, otherwise "std::async" may
-    // launch too
-    // many threads and result in thread oversubscription.
+    // launch too many threads and result in thread oversubscription.
     auto* callback_func = new std::function<void()>(std::move(callback));
     auto* func = new std::function<void()>([this, callback_func] {
       std::lock_guard<std::mutex> lock(stream_call_back_mtx_);
...
@@ -276,10 +276,7 @@ struct ConcatFunctor<phi::GPUContext, T> {
     int64_t out_row = in_row, out_col = 0;
     int inputs_col_num = in_num + 1;
-    std::vector<const T*> inputs_data_vec(in_num);
-    std::vector<int64_t> inputs_col_vec(inputs_col_num);
-    const T** inputs_data = inputs_data_vec.data();
-    int64_t* inputs_col = inputs_col_vec.data();
+    paddle::memory::AllocationPtr data_alloc, col_alloc;
     // There are some differences between hip runtime and NV runtime.
     // In NV, when the pageable memory data less than 64K is transferred from
@@ -289,16 +286,22 @@ struct ConcatFunctor<phi::GPUContext, T> {
     // 3.2.6.1. Concurrent Execution between Host and Device
     // Memory copies from host to device of a memory block of 64 KB or less
 #ifdef PADDLE_WITH_HIP
-    paddle::memory::AllocationPtr data_alloc, col_alloc;
     // TODO(chentianyu03): try to find a method to remove the Alloc function
     data_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(),
                                        in_num * sizeof(T*));
-    inputs_data = reinterpret_cast<const T**>(data_alloc->ptr());
     // TODO(chentianyu03): try to find a method to remove the Alloc function
     col_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(),
                                       inputs_col_num * sizeof(int));
-    inputs_col = reinterpret_cast<int64_t*>(col_alloc->ptr());
+#else
+    // TODO(pinned): cuda-graph not support pinned memory, we just use the cpu
+    // allocator.
+    data_alloc = paddle::memory::Alloc(paddle::platform::CPUPlace(),
+                                       in_num * sizeof(T*));
+    col_alloc = paddle::memory::Alloc(paddle::platform::CPUPlace(),
+                                      (inputs_col_num) * sizeof(int64_t));
 #endif
+    const T** inputs_data = reinterpret_cast<const T**>(data_alloc->ptr());
+    int64_t* inputs_col = reinterpret_cast<int64_t*>(col_alloc->ptr());
     inputs_col[0] = 0;
     bool has_same_shape = true;
@@ -387,7 +390,6 @@ struct ConcatFunctor<phi::GPUContext, T> {
           output->data<T>());
     }
-#ifdef PADDLE_WITH_HIP
     // Prevent the pinned memory value from being covered and release the memory
     // after the launch kernel of the stream is executed (reapply pinned memory
     // next time)
@@ -401,7 +403,6 @@ struct ConcatFunctor<phi::GPUContext, T> {
       paddle::memory::allocation::Allocator::AllocationDeleter(
           col_alloc_released);
     });
-#endif
   }
 };
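The pattern the concat hunks establish (and the split hunks below mirror) is: stage the kernel's pointer and offset arguments in an AllocationPtr-owned host buffer on every platform, and release that buffer only through a stream callback, so a later call cannot overwrite it while the asynchronous host-to-device copy is still in flight. A simplified sketch of that lifetime pattern, distilled from the diff (the function and its names are illustrative; header paths are approximate):

```cpp
#include "paddle/fluid/memory/allocation/allocator.h"  // AllocationDeleter
#include "paddle/fluid/memory/malloc.h"                // paddle::memory::Alloc
#include "paddle/phi/backends/gpu/gpu_context.h"       // phi::GPUContext

// Illustrative only: stage host-side kernel arguments and defer their release
// until the stream has consumed them.
void LaunchWithStagedArgs(const phi::GPUContext& ctx, size_t bytes) {
  // 1. Host buffer owned by an AllocationPtr (pinned memory on HIP, plain CPU
  //    memory on CUDA after this commit, since CUDA graphs reject pinned mem).
  paddle::memory::AllocationPtr staging =
      paddle::memory::Alloc(paddle::platform::CPUPlace(), bytes);

  // 2. ... fill `staging`, copy it to the device asynchronously on
  //    ctx.stream(), launch the kernel ...

  // 3. Hand ownership to a stream callback so the buffer is freed only after
  //    the queued copy and kernel have finished executing.
  auto* staging_released = staging.release();
  ctx.AddStreamCallback([staging_released] {
    paddle::memory::allocation::Allocator::AllocationDeleter(staging_released);
  });
}
```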
@@ -432,10 +433,7 @@ class SplitFunctor<phi::GPUContext, T> {
     bool has_same_shape = true;
     int outputs_cols_num = o_num + 1;
-    std::vector<T*> outputs_data_vec(o_num);
-    std::vector<int64_t> outputs_cols_vec(outputs_cols_num);
-    T** outputs_data = outputs_data_vec.data();
-    int64_t* outputs_cols = outputs_cols_vec.data();
+    paddle::memory::AllocationPtr data_alloc, cols_alloc;
     // There are some differences between hip runtime and NV runtime.
     // In NV, when the pageable memory data less than 64K is transferred from
@@ -445,16 +443,22 @@ class SplitFunctor<phi::GPUContext, T> {
     // 3.2.6.1. Concurrent Execution between Host and Device
     // Memory copies from host to device of a memory block of 64 KB or less
 #ifdef PADDLE_WITH_HIP
-    paddle::memory::AllocationPtr data_alloc, cols_alloc;
     // TODO(chentianyu03): try to find a method to remove the Alloc function
     data_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(),
                                        o_num * sizeof(T*));
-    outputs_data = reinterpret_cast<T**>(data_alloc->ptr());
     // TODO(chentianyu03): try to find a method to remove the Alloc function
     cols_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(),
                                        (outputs_cols_num) * sizeof(int64_t));
-    outputs_cols = reinterpret_cast<int64_t*>(cols_alloc->ptr());
+#else
+    // TODO(pinned): cuda-graph not support pinned memory, we just use the cpu
+    // allocator.
+    data_alloc =
+        paddle::memory::Alloc(paddle::platform::CPUPlace(), o_num * sizeof(T*));
+    cols_alloc = paddle::memory::Alloc(paddle::platform::CPUPlace(),
+                                       (outputs_cols_num) * sizeof(int64_t));
 #endif
+    T** outputs_data = reinterpret_cast<T**>(data_alloc->ptr());
+    int64_t* outputs_cols = reinterpret_cast<int64_t*>(cols_alloc->ptr());
     outputs_cols[0] = 0;
     for (int i = 0; i < o_num; ++i) {
@@ -547,7 +551,7 @@ class SplitFunctor<phi::GPUContext, T> {
                                        static_cast<int>(outputs_cols_num),
                                        dev_out_gpu_data);
     }
-#ifdef PADDLE_WITH_HIP
     // Prevent the pinned memory value from being covered and release the memory
     // after the launch kernel of the stream is executed (reapply pinned memory
     // next time)
@@ -559,7 +563,6 @@ class SplitFunctor<phi::GPUContext, T> {
       paddle::memory::allocation::Allocator::AllocationDeleter(
           cols_alloc_released);
     });
-#endif
   }
 };
...