diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 5f9051ff2fdb9e45249232e4c7d1e26bca462d2b..18229c302db395579bca448376cec02c8e18a448 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -1090,12 +1090,6 @@ CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
       process_level_allocator_enabled = true;
     }
 
-    // TODO(Jingzhuangzhuang): Fix trt error when allocator_strategy is
-    // auto_growth
-    if (config.tensorrt_engine_enabled()) {
-      gflags.push_back("--allocator_strategy=naive_best_fit");
-    }
-
     if (framework::InitGflags(gflags)) {
       VLOG(3) << "The following gpu analysis configurations only take effect "
                  "for the first predictor: ";
diff --git a/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt b/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt
index 5aef30bf335c3d2380b85ff271fed29102ed697f..0aee989367e4b398f3235b34d7b932c0c29fb380 100644
--- a/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt
@@ -87,9 +87,11 @@ endif()
 
 if(WITH_GPU)
   if(NOT WIN32)
+    add_definitions("-DPADDLE_WITH_GPU")
     set(CUDA_LIB
         "/usr/local/cuda/lib64/"
         CACHE STRING "CUDA Library")
+    include_directories("${CUDA_LIB}/../include")
   else()
     set(CUDA_LIB
         ""
diff --git a/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc b/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc
index b74d1189b804be8dd34048c95b11d65a6731896a..b069feaec1ae79f8f82f8c122e78d1c935376388 100644
--- a/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc
+++ b/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc
@@ -157,7 +157,7 @@ TEST(tensorrt_tester_LeViT, multi_thread4_trt_fp32_bz2) {
   for (int i = 0; i < thread_num; ++i) {
     threads.emplace_back(paddle::test::SingleThreadPrediction,
                          pred_pool.Retrive(i), &my_input_data_map,
-                         &infer_output_data, 2);
+                         &infer_output_data, 10);
   }
 
   // thread join & check outputs
diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc
index 3198b4f8d935e3815ba94db945a24ab4df4ca97b..c45180f600e3e8f60034b74ab610b6170c55ae71 100644
--- a/paddle/fluid/memory/memcpy.cc
+++ b/paddle/fluid/memory/memcpy.cc
@@ -648,7 +648,7 @@ void Copy(
   platform::SetDeviceId(dst_place.device);
   VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
-          << dst_place << " by thream(" << stream << ")";
+          << dst_place << " by stream(" << stream << ")";
   if (stream) {
     platform::RecordEvent record_event(
         "GpuMemcpyAsync:CPU->GPU", platform::TracerEventType::UserDefined, 1);
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 0bd606257f541593ed48d780851abca1ff338875..fd61b813f0aa267043e71e6b85a03af7a2207af8 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -54,7 +54,9 @@ AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size) {
     auto& desired_dev_ctx =
         static_cast<const platform::CUDADeviceContext&>(dev_ctx);
     if (default_dev_ctx->stream() == desired_dev_ctx.stream()) {
-      return Alloc(place, size);
+      return paddle::memory::Alloc(desired_dev_ctx.GetPlace(), size,
+                                   phi::Stream(reinterpret_cast<phi::StreamId>(
+                                       desired_dev_ctx.stream())));
     } else {
       return allocation::CUDADeviceContextAllocatorPool::Instance().Alloc(
           desired_dev_ctx, size);
diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc
index f51f287ee4a0848fc41901daeccbdac07eff270a..f68e451039092373985f1292512d80d5fcfdabb1 100644
--- a/paddle/phi/backends/gpu/gpu_context.cc
+++ b/paddle/phi/backends/gpu/gpu_context.cc
@@ -504,8 +504,7 @@ struct GPUContext::Impl {
 
   void AddStreamCallback(const std::function<void()>& callback) const {
     // NOTE(zhiqiu): better use threadpool here, otherwise "std::async" may
-    // launch too
-    // many threads and result in thread oversubscription.
+    // launch too many threads and result in thread oversubscription.
     auto* callback_func = new std::function<void()>(std::move(callback));
     auto* func = new std::function<void()>([this, callback_func] {
       std::lock_guard<std::mutex> lock(stream_call_back_mtx_);
diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.cu b/paddle/phi/kernels/funcs/concat_and_split_functor.cu
index 5abaf6c2ffa87cf4e63d082edbb26294d99b2932..1c9fbffa2ac1955dc27ed03c398a20ab1c5c8b18 100644
--- a/paddle/phi/kernels/funcs/concat_and_split_functor.cu
+++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cu
@@ -276,10 +276,7 @@ struct ConcatFunctor<phi::GPUContext, T> {
     int64_t out_row = in_row, out_col = 0;
 
     int inputs_col_num = in_num + 1;
-    std::vector<const T*> inputs_data_vec(in_num);
-    std::vector<int64_t> inputs_col_vec(inputs_col_num);
-    const T** inputs_data = inputs_data_vec.data();
-    int64_t* inputs_col = inputs_col_vec.data();
+    paddle::memory::AllocationPtr data_alloc, col_alloc;
 
     // There are some differences between hip runtime and NV runtime.
     // In NV, when the pageable memory data less than 64K is transferred from
@@ -289,16 +286,22 @@ struct ConcatFunctor<phi::GPUContext, T> {
     // 3.2.6.1. Concurrent Execution between Host and Device
     // Memory copies from host to device of a memory block of 64 KB or less
 #ifdef PADDLE_WITH_HIP
-    paddle::memory::AllocationPtr data_alloc, col_alloc;
     // TODO(chentianyu03): try to find a method to remove the Alloc function
     data_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(),
                                        in_num * sizeof(T*));
-    inputs_data = reinterpret_cast<const T**>(data_alloc->ptr());
     // TODO(chentianyu03): try to find a method to remove the Alloc function
     col_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(),
                                       inputs_col_num * sizeof(int));
-    inputs_col = reinterpret_cast<int64_t*>(col_alloc->ptr());
+#else
+    // TODO(pinned): cuda-graph not support pinned memory, we just use the cpu
+    // allocator.
+    data_alloc = paddle::memory::Alloc(paddle::platform::CPUPlace(),
+                                       in_num * sizeof(T*));
+    col_alloc = paddle::memory::Alloc(paddle::platform::CPUPlace(),
+                                      (inputs_col_num) * sizeof(int64_t));
 #endif
+    const T** inputs_data = reinterpret_cast<const T**>(data_alloc->ptr());
+    int64_t* inputs_col = reinterpret_cast<int64_t*>(col_alloc->ptr());
 
     inputs_col[0] = 0;
     bool has_same_shape = true;
@@ -387,7 +390,6 @@ struct ConcatFunctor<phi::GPUContext, T> {
           output->data<T>());
     }
 
-#ifdef PADDLE_WITH_HIP
     // Prevent the pinned memory value from being covered and release the memory
     // after the launch kernel of the stream is executed (reapply pinned memory
     // next time)
@@ -401,7 +403,6 @@ struct ConcatFunctor<phi::GPUContext, T> {
       paddle::memory::allocation::Allocator::AllocationDeleter(
           col_alloc_released);
     });
-#endif
   }
 };
 
@@ -432,10 +433,7 @@ class SplitFunctor<phi::GPUContext, T> {
     bool has_same_shape = true;
 
     int outputs_cols_num = o_num + 1;
-    std::vector<T*> outputs_data_vec(o_num);
-    std::vector<int64_t> outputs_cols_vec(outputs_cols_num);
-    T** outputs_data = outputs_data_vec.data();
-    int64_t* outputs_cols = outputs_cols_vec.data();
+    paddle::memory::AllocationPtr data_alloc, cols_alloc;
 
     // There are some differences between hip runtime and NV runtime.
     // In NV, when the pageable memory data less than 64K is transferred from
@@ -445,16 +443,22 @@ class SplitFunctor<phi::GPUContext, T> {
     // 3.2.6.1. Concurrent Execution between Host and Device
     // Memory copies from host to device of a memory block of 64 KB or less
 #ifdef PADDLE_WITH_HIP
-    paddle::memory::AllocationPtr data_alloc, cols_alloc;
    // TODO(chentianyu03): try to find a method to remove the Alloc function
     data_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(),
                                        o_num * sizeof(T*));
-    outputs_data = reinterpret_cast<T**>(data_alloc->ptr());
     // TODO(chentianyu03): try to find a method to remove the Alloc function
     cols_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(),
                                        (outputs_cols_num) * sizeof(int64_t));
-    outputs_cols = reinterpret_cast<int64_t*>(cols_alloc->ptr());
+#else
+    // TODO(pinned): cuda-graph not support pinned memory, we just use the cpu
+    // allocator.
+    data_alloc =
+        paddle::memory::Alloc(paddle::platform::CPUPlace(), o_num * sizeof(T*));
+    cols_alloc = paddle::memory::Alloc(paddle::platform::CPUPlace(),
+                                       (outputs_cols_num) * sizeof(int64_t));
 #endif
+    T** outputs_data = reinterpret_cast<T**>(data_alloc->ptr());
+    int64_t* outputs_cols = reinterpret_cast<int64_t*>(cols_alloc->ptr());
 
     outputs_cols[0] = 0;
     for (int i = 0; i < o_num; ++i) {
@@ -547,7 +551,7 @@ class SplitFunctor<phi::GPUContext, T> {
         static_cast<int>(outputs_cols_num),
         dev_out_gpu_data);
     }
-#ifdef PADDLE_WITH_HIP
+
     // Prevent the pinned memory value from being covered and release the memory
     // after the launch kernel of the stream is executed (reapply pinned memory
     // next time)
@@ -559,7 +563,6 @@ class SplitFunctor<phi::GPUContext, T> {
       paddle::memory::allocation::Allocator::AllocationDeleter(
          cols_alloc_released);
     });
-#endif
   }
 };
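The device_context.cc hunk above changes the fast path so the allocation is tagged with the context's own stream (via phi::Stream) rather than going through the default-stream Alloc(place, size). Below is a minimal sketch of the same stream-ordered allocation idea in plain CUDA, assuming CUDA 11.2+ for cudaMallocAsync/cudaFreeAsync; AllocOnStream and FreeOnStream are hypothetical names, not Paddle's API.

#include <cuda_runtime.h>

// Allocation becomes usable in stream order: kernels queued on `stream`
// after this call may safely touch the returned pointer.
void* AllocOnStream(cudaStream_t stream, size_t size) {
  void* ptr = nullptr;
  cudaMallocAsync(&ptr, size, stream);
  return ptr;
}

// The block is recycled only once prior work on `stream` has finished,
// which is the safety property the stream-tagged Alloc provides.
void FreeOnStream(cudaStream_t stream, void* ptr) {
  cudaFreeAsync(ptr, stream);
}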
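The concat_and_split_functor.cu hunks must keep the host staging buffers (inputs_data/inputs_col, outputs_data/outputs_cols) alive until the launched kernel has consumed them, so the allocations are released from inside an AddStreamCallback, and this change extends that release path to the new non-HIP CPU allocations as well. A minimal sketch of that deferred-release pattern in plain CUDA, using cudaLaunchHostFunc (CUDA 10+) in place of Paddle's AddStreamCallback and AllocationDeleter; LaunchWithStagedCols and ReleaseHostBuffer are hypothetical names.

#include <cuda_runtime.h>
#include <cstdlib>

// Runs on a CUDA driver thread once all work queued on the stream before
// cudaLaunchHostFunc has completed.
static void CUDART_CB ReleaseHostBuffer(void* user_data) {
  std::free(user_data);
}

void LaunchWithStagedCols(cudaStream_t stream, int64_t* dev_cols, int n) {
  size_t bytes = n * sizeof(int64_t);
  int64_t* host_cols = static_cast<int64_t*>(std::malloc(bytes));
  for (int i = 0; i < n; ++i) host_cols[i] = i;  // fill staging data
  // The async copy reads host_cols at some later point in stream order...
  cudaMemcpyAsync(dev_cols, host_cols, bytes, cudaMemcpyHostToDevice, stream);
  // ...so freeing it here would race with the copy; enqueue the free behind
  // it instead, as the diff does with AddStreamCallback.
  cudaLaunchHostFunc(stream, ReleaseHostBuffer, host_cols);
}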