From 8c3777dfcba3e4d20045087db2957ea34f076aec Mon Sep 17 00:00:00 2001
From: Wilber
Date: Tue, 7 Jun 2022 12:46:06 +0800
Subject: [PATCH] [multi-stream] Fix split and concat problem. (#43039)

---
 .../fluid/inference/api/analysis_predictor.cc |  6 ---
 .../inference/tests/infer_ut/CMakeLists.txt   |  2 +
 .../inference/tests/infer_ut/test_LeViT.cc    |  2 +-
 paddle/fluid/memory/memcpy.cc                 |  2 +-
 paddle/fluid/platform/device_context.cc       |  4 +-
 paddle/phi/backends/gpu/gpu_context.cc        |  3 +-
 .../kernels/funcs/concat_and_split_functor.cu | 39 ++++++++++---------
 7 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 5f9051ff2fd..18229c302db 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -1090,12 +1090,6 @@ CreatePaddlePredictor(
     process_level_allocator_enabled = true;
   }
 
-  // TODO(Jingzhuangzhuang): Fix trt error when allocator_strategy is
-  // auto_growth
-  if (config.tensorrt_engine_enabled()) {
-    gflags.push_back("--allocator_strategy=naive_best_fit");
-  }
-
   if (framework::InitGflags(gflags)) {
     VLOG(3) << "The following gpu analysis configurations only take effect "
                "for the first predictor: ";
diff --git a/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt b/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt
index 5aef30bf335..0aee989367e 100644
--- a/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt
@@ -87,9 +87,11 @@ endif()
 
 if(WITH_GPU)
   if(NOT WIN32)
+    add_definitions("-DPADDLE_WITH_GPU")
     set(CUDA_LIB
         "/usr/local/cuda/lib64/"
         CACHE STRING "CUDA Library")
+    include_directories("${CUDA_LIB}/../include")
   else()
     set(CUDA_LIB
         ""
diff --git a/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc b/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc
index b74d1189b80..b069feaec1a 100644
--- a/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc
+++ b/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc
@@ -157,7 +157,7 @@ TEST(tensorrt_tester_LeViT, multi_thread4_trt_fp32_bz2) {
   for (int i = 0; i < thread_num; ++i) {
     threads.emplace_back(paddle::test::SingleThreadPrediction,
                          pred_pool.Retrive(i), &my_input_data_map,
-                         &infer_output_data, 2);
+                         &infer_output_data, 10);
   }
 
   // thread join & check outputs
diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc
index 3198b4f8d93..c45180f600e 100644
--- a/paddle/fluid/memory/memcpy.cc
+++ b/paddle/fluid/memory/memcpy.cc
@@ -648,7 +648,7 @@ void Copy(
 
   platform::SetDeviceId(dst_place.device);
   VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
-          << dst_place << " by thream(" << stream << ")";
+          << dst_place << " by stream(" << stream << ")";
   if (stream) {
     platform::RecordEvent record_event(
         "GpuMemcpyAsync:CPU->GPU", platform::TracerEventType::UserDefined, 1);
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 0bd606257f5..fd61b813f0a 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -54,7 +54,9 @@ AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size) {
     auto& desired_dev_ctx =
        static_cast<const platform::CUDADeviceContext&>(dev_ctx);
     if (default_dev_ctx->stream() == desired_dev_ctx.stream()) {
-      return Alloc(place, size);
+      return paddle::memory::Alloc(desired_dev_ctx.GetPlace(), size,
+                                   phi::Stream(reinterpret_cast<phi::StreamId>(
+                                       desired_dev_ctx.stream())));
     } else {
       return allocation::CUDADeviceContextAllocatorPool::Instance().Alloc(
           desired_dev_ctx, size);
diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc
index f51f287ee4a..f68e4510390 100644
--- a/paddle/phi/backends/gpu/gpu_context.cc
+++ b/paddle/phi/backends/gpu/gpu_context.cc
@@ -504,8 +504,7 @@ struct GPUContext::Impl {
 
   void AddStreamCallback(const std::function<void()>& callback) const {
     // NOTE(zhiqiu): better use threadpool here, otherwise "std::async" may
-    // launch too
-    // many threads and result in thread oversubscription.
+    // launch too many threads and result in thread oversubscription.
     auto* callback_func = new std::function<void()>(std::move(callback));
     auto* func = new std::function<void()>([this, callback_func] {
       std::lock_guard<std::mutex> lock(stream_call_back_mtx_);
diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.cu b/paddle/phi/kernels/funcs/concat_and_split_functor.cu
index 5abaf6c2ffa..1c9fbffa2ac 100644
--- a/paddle/phi/kernels/funcs/concat_and_split_functor.cu
+++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cu
@@ -276,10 +276,7 @@ struct ConcatFunctor {
     int64_t out_row = in_row, out_col = 0;
 
     int inputs_col_num = in_num + 1;
-    std::vector<const T*> inputs_data_vec(in_num);
-    std::vector<int64_t> inputs_col_vec(inputs_col_num);
-    const T** inputs_data = inputs_data_vec.data();
-    int64_t* inputs_col = inputs_col_vec.data();
+    paddle::memory::AllocationPtr data_alloc, col_alloc;
 
     // There are some differences between hip runtime and NV runtime.
     // In NV, when the pageable memory data less than 64K is transferred from
@@ -289,16 +286,22 @@ struct ConcatFunctor {
     // 3.2.6.1. Concurrent Execution between Host and Device
     // Memory copies from host to device of a memory block of 64 KB or less
 #ifdef PADDLE_WITH_HIP
-    paddle::memory::AllocationPtr data_alloc, col_alloc;
     // TODO(chentianyu03): try to find a method to remove the Alloc function
     data_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(),
                                        in_num * sizeof(T*));
-    inputs_data = reinterpret_cast<const T**>(data_alloc->ptr());
     // TODO(chentianyu03): try to find a method to remove the Alloc function
     col_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(),
                                       inputs_col_num * sizeof(int));
-    inputs_col = reinterpret_cast<int64_t*>(col_alloc->ptr());
+#else
+    // TODO(pinned): cuda-graph not support pinned memory, we just use the cpu
+    // allocator.
+    data_alloc = paddle::memory::Alloc(paddle::platform::CPUPlace(),
+                                       in_num * sizeof(T*));
+    col_alloc = paddle::memory::Alloc(paddle::platform::CPUPlace(),
+                                      (inputs_col_num) * sizeof(int64_t));
 #endif
+    const T** inputs_data = reinterpret_cast<const T**>(data_alloc->ptr());
+    int64_t* inputs_col = reinterpret_cast<int64_t*>(col_alloc->ptr());
 
     inputs_col[0] = 0;
     bool has_same_shape = true;
@@ -387,7 +390,6 @@ struct ConcatFunctor {
           output->data<T>());
     }
 
-#ifdef PADDLE_WITH_HIP
     // Prevent the pinned memory value from being covered and release the memory
     // after the launch kernel of the stream is executed (reapply pinned memory
     // next time)
@@ -401,7 +403,6 @@ struct ConcatFunctor {
       paddle::memory::allocation::Allocator::AllocationDeleter(
           col_alloc_released);
     });
-#endif
   }
 };
 
@@ -432,10 +433,7 @@ class SplitFunctor {
     bool has_same_shape = true;
 
     int outputs_cols_num = o_num + 1;
-    std::vector<T*> outputs_data_vec(o_num);
-    std::vector<int64_t> outputs_cols_vec(outputs_cols_num);
-    T** outputs_data = outputs_data_vec.data();
-    int64_t* outputs_cols = outputs_cols_vec.data();
+    paddle::memory::AllocationPtr data_alloc, cols_alloc;
 
     // There are some differences between hip runtime and NV runtime.
     // In NV, when the pageable memory data less than 64K is transferred from
@@ -445,16 +443,22 @@ class SplitFunctor {
     // 3.2.6.1. Concurrent Execution between Host and Device
     // Memory copies from host to device of a memory block of 64 KB or less
 #ifdef PADDLE_WITH_HIP
-    paddle::memory::AllocationPtr data_alloc, cols_alloc;
     // TODO(chentianyu03): try to find a method to remove the Alloc function
     data_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(),
                                        o_num * sizeof(T*));
-    outputs_data = reinterpret_cast<T**>(data_alloc->ptr());
     // TODO(chentianyu03): try to find a method to remove the Alloc function
    cols_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(),
                                        (outputs_cols_num) * sizeof(int64_t));
-    outputs_cols = reinterpret_cast<int64_t*>(cols_alloc->ptr());
+#else
+    // TODO(pinned): cuda-graph not support pinned memory, we just use the cpu
+    // allocator.
+    data_alloc =
+        paddle::memory::Alloc(paddle::platform::CPUPlace(), o_num * sizeof(T*));
+    cols_alloc = paddle::memory::Alloc(paddle::platform::CPUPlace(),
+                                       (outputs_cols_num) * sizeof(int64_t));
 #endif
+    T** outputs_data = reinterpret_cast<T**>(data_alloc->ptr());
+    int64_t* outputs_cols = reinterpret_cast<int64_t*>(cols_alloc->ptr());
 
     outputs_cols[0] = 0;
     for (int i = 0; i < o_num; ++i) {
@@ -547,7 +551,7 @@ class SplitFunctor {
                      static_cast<int>(outputs_cols_num), dev_out_gpu_data);
     }
-#ifdef PADDLE_WITH_HIP
+
     // Prevent the pinned memory value from being covered and release the memory
     // after the launch kernel of the stream is executed (reapply pinned memory
     // next time)
@@ -559,7 +563,6 @@ class SplitFunctor {
       paddle::memory::allocation::Allocator::AllocationDeleter(
          cols_alloc_released);
     });
-#endif
   }
 };
-- 
GitLab