From ac75617ac77a6e872b20dd2ecb409511a5c64247 Mon Sep 17 00:00:00 2001
From: Wilber
Date: Fri, 10 Jun 2022 18:39:47 +0800
Subject: [PATCH] revert PR43039 (#43384)

---
 .../inference/tests/infer_ut/test_LeViT.cc    | 122 +++++++++---------
 .../kernels/funcs/concat_and_split_functor.cu |  38 +++---
 2 files changed, 79 insertions(+), 81 deletions(-)

diff --git a/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc b/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc
index c929869233..87b443278a 100644
--- a/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc
+++ b/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc
@@ -174,67 +174,67 @@ TEST(tensorrt_tester_LeViT, multi_thread4_trt_fp32_bz2) {
 }
 
 #ifdef PADDLE_WITH_GPU
-TEST(tensorrt_tester_LeViT, multi_stream_thread4_trt_fp32_bz2) {
-  int thread_num = 4;
-
-  // init stream
-  std::vector<cudaStream_t> streams(thread_num);
-  for (size_t i = 0; i < thread_num; ++i) {
-    cudaStreamCreate(&streams[i]);
-  }
-
-  // init input data
-  std::map<std::string, paddle::test::Record> my_input_data_map;
-  my_input_data_map["x"] = PrepareInput(2);
-  // init output data
-  std::map<std::string, paddle::test::Record> infer_output_data,
-      truth_output_data;
-  // prepare groudtruth config
-  paddle_infer::Config config, config_no_ir;
-  config_no_ir.SetModel(FLAGS_modeldir + "/inference.pdmodel",
-                        FLAGS_modeldir + "/inference.pdiparams");
-  config_no_ir.SwitchIrOptim(false);
-  // prepare inference config
-  config.SetModel(FLAGS_modeldir + "/inference.pdmodel",
-                  FLAGS_modeldir + "/inference.pdiparams");
-  config.EnableUseGpu(100, 0);
-  config.EnableTensorRtEngine(
-      1 << 20, 2, 50, paddle_infer::PrecisionType::kFloat32, false, false);
-  // get groudtruth by disbale ir
-
-  paddle_infer::services::PredictorPool pred_pool_no_ir(config_no_ir, 1);
-  SingleThreadPrediction(pred_pool_no_ir.Retrive(0), &my_input_data_map,
-                         &truth_output_data, 1);
-
-  // get infer results from multi threads
-  std::vector<std::thread> threads;
-  config.SetExecStream(streams[0]);
-  config.pass_builder()->DeletePass("add_support_int8_pass");
-  auto main_predictor = CreatePredictor(config);
-  std::vector<std::shared_ptr<paddle_infer::Predictor>> predictors;
-  for (size_t i = 0; i < thread_num - 1; ++i) {
-    predictors.push_back(std::move(main_predictor->Clone(streams[i + 1])));
-    LOG(INFO) << "predictors[" << i << "] stream is "
-              << predictors[i]->GetExecStream();
-  }
-  predictors.push_back(std::move(main_predictor));
-  LOG(INFO) << "predictors[" << thread_num - 1 << "] stream is "
-            << predictors[thread_num - 1]->GetExecStream();
-  for (int i = 0; i < thread_num; ++i) {
-    threads.emplace_back(paddle::test::SingleThreadPrediction,
-                         predictors[i].get(), &my_input_data_map,
-                         &infer_output_data, 10);
-  }
-
-  // thread join & check outputs
-  for (int i = 0; i < thread_num; ++i) {
-    LOG(INFO) << "join tid : " << i;
-    threads[i].join();
-    // CompareRecord(&truth_output_data, &infer_output_data);
-  }
-
-  std::cout << "finish multi-thread test" << std::endl;
-}
+// TEST(tensorrt_tester_LeViT, multi_stream_thread4_trt_fp32_bz2) {
+//   int thread_num = 4;
+
+//   // init stream
+//   std::vector<cudaStream_t> streams(thread_num);
+//   for (size_t i = 0; i < thread_num; ++i) {
+//     cudaStreamCreate(&streams[i]);
+//   }
+
+//   // init input data
+//   std::map<std::string, paddle::test::Record> my_input_data_map;
+//   my_input_data_map["x"] = PrepareInput(2);
+//   // init output data
+//   std::map<std::string, paddle::test::Record> infer_output_data,
+//       truth_output_data;
+//   // prepare groudtruth config
+//   paddle_infer::Config config, config_no_ir;
+//   config_no_ir.SetModel(FLAGS_modeldir + "/inference.pdmodel",
+//                         FLAGS_modeldir + "/inference.pdiparams");
+//   config_no_ir.SwitchIrOptim(false);
+//   // prepare inference config
+//   config.SetModel(FLAGS_modeldir + "/inference.pdmodel",
+//                   FLAGS_modeldir + "/inference.pdiparams");
+//   config.EnableUseGpu(100, 0);
+//   config.EnableTensorRtEngine(
+//       1 << 20, 2, 50, paddle_infer::PrecisionType::kFloat32, false, false);
+//   // get groudtruth by disbale ir
+
+//   paddle_infer::services::PredictorPool pred_pool_no_ir(config_no_ir, 1);
+//   SingleThreadPrediction(pred_pool_no_ir.Retrive(0), &my_input_data_map,
+//                          &truth_output_data, 1);
+
+//   // get infer results from multi threads
+//   std::vector<std::thread> threads;
+//   config.SetExecStream(streams[0]);
+//   config.pass_builder()->DeletePass("add_support_int8_pass");
+//   auto main_predictor = CreatePredictor(config);
+//   std::vector<std::shared_ptr<paddle_infer::Predictor>> predictors;
+//   for (size_t i = 0; i < thread_num - 1; ++i) {
+//     predictors.push_back(std::move(main_predictor->Clone(streams[i + 1])));
+//     LOG(INFO) << "predictors[" << i << "] stream is "
+//               << predictors[i]->GetExecStream();
+//   }
+//   predictors.push_back(std::move(main_predictor));
+//   LOG(INFO) << "predictors[" << thread_num - 1 << "] stream is "
+//             << predictors[thread_num - 1]->GetExecStream();
+//   for (int i = 0; i < thread_num; ++i) {
+//     threads.emplace_back(paddle::test::SingleThreadPrediction,
+//                          predictors[i].get(), &my_input_data_map,
+//                          &infer_output_data, 10);
+//   }
+
+//   // thread join & check outputs
+//   for (int i = 0; i < thread_num; ++i) {
+//     LOG(INFO) << "join tid : " << i;
+//     threads[i].join();
+//     CompareRecord(&truth_output_data, &infer_output_data);
+//   }
+
+//   std::cout << "finish multi-thread test" << std::endl;
+// }
 #endif
 
 }  // namespace paddle_infer
diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.cu b/paddle/phi/kernels/funcs/concat_and_split_functor.cu
index 1c9fbffa2a..22dba8297d 100644
--- a/paddle/phi/kernels/funcs/concat_and_split_functor.cu
+++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cu
@@ -276,7 +276,10 @@ struct ConcatFunctor<phi::GPUContext, T> {
     int64_t out_row = in_row, out_col = 0;
 
     int inputs_col_num = in_num + 1;
-    paddle::memory::AllocationPtr data_alloc, col_alloc;
+    std::vector<const T*> inputs_data_vec(in_num);
+    std::vector<int64_t> inputs_col_vec(inputs_col_num);
+    const T** inputs_data = inputs_data_vec.data();
+    int64_t* inputs_col = inputs_col_vec.data();
 
     // There are some differences between hip runtime and NV runtime.
     // In NV, when the pageable memory data less than 64K is transferred from
@@ -286,22 +289,16 @@ struct ConcatFunctor<phi::GPUContext, T> {
     // 3.2.6.1. Concurrent Execution between Host and Device
     // Memory copies from host to device of a memory block of 64 KB or less
 #ifdef PADDLE_WITH_HIP
+    paddle::memory::AllocationPtr data_alloc, col_alloc;
     // TODO(chentianyu03): try to find a method to remove the Alloc function
     data_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(),
                                        in_num * sizeof(T*));
+    inputs_data = reinterpret_cast<const T**>(data_alloc->ptr());
     // TODO(chentianyu03): try to find a method to remove the Alloc function
     col_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(),
                                       inputs_col_num * sizeof(int));
-#else
-    // TODO(pinned): cuda-graph not support pinned memory, we just use the cpu
-    // allocator.
-    data_alloc = paddle::memory::Alloc(paddle::platform::CPUPlace(),
-                                       in_num * sizeof(T*));
-    col_alloc = paddle::memory::Alloc(paddle::platform::CPUPlace(),
-                                      (inputs_col_num) * sizeof(int64_t));
+    inputs_col = reinterpret_cast<int64_t*>(col_alloc->ptr());
 #endif
-    const T** inputs_data = reinterpret_cast<const T**>(data_alloc->ptr());
-    int64_t* inputs_col = reinterpret_cast<int64_t*>(col_alloc->ptr());
 
     inputs_col[0] = 0;
     bool has_same_shape = true;
@@ -390,6 +387,7 @@ struct ConcatFunctor<phi::GPUContext, T> {
           output->data<T>());
     }
 
+#ifdef PADDLE_WITH_HIP
     // Prevent the pinned memory value from being covered and release the memory
     // after the launch kernel of the stream is executed (reapply pinned memory
    // next time)
@@ -403,6 +401,7 @@ struct ConcatFunctor<phi::GPUContext, T> {
       paddle::memory::allocation::Allocator::AllocationDeleter(
           col_alloc_released);
     });
+#endif
   }
 };
 
@@ -433,7 +432,10 @@ class SplitFunctor<phi::GPUContext, T> {
     bool has_same_shape = true;
 
     int outputs_cols_num = o_num + 1;
-    paddle::memory::AllocationPtr data_alloc, cols_alloc;
+    std::vector<T*> outputs_data_vec(o_num);
+    std::vector<int64_t> outputs_cols_vec(outputs_cols_num);
+    T** outputs_data = outputs_data_vec.data();
+    int64_t* outputs_cols = outputs_cols_vec.data();
 
     // There are some differences between hip runtime and NV runtime.
     // In NV, when the pageable memory data less than 64K is transferred from
@@ -443,22 +445,16 @@ class SplitFunctor<phi::GPUContext, T> {
     // 3.2.6.1. Concurrent Execution between Host and Device
     // Memory copies from host to device of a memory block of 64 KB or less
 #ifdef PADDLE_WITH_HIP
+    paddle::memory::AllocationPtr data_alloc, cols_alloc;
    // TODO(chentianyu03): try to find a method to remove the Alloc function
     data_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(),
                                        o_num * sizeof(T*));
+    outputs_data = reinterpret_cast<T**>(data_alloc->ptr());
     // TODO(chentianyu03): try to find a method to remove the Alloc function
     cols_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(),
                                        (outputs_cols_num) * sizeof(int64_t));
-#else
-    // TODO(pinned): cuda-graph not support pinned memory, we just use the cpu
-    // allocator.
-    data_alloc =
-        paddle::memory::Alloc(paddle::platform::CPUPlace(), o_num * sizeof(T*));
-    cols_alloc = paddle::memory::Alloc(paddle::platform::CPUPlace(),
-                                       (outputs_cols_num) * sizeof(int64_t));
+    outputs_cols = reinterpret_cast<int64_t*>(cols_alloc->ptr());
 #endif
-    T** outputs_data = reinterpret_cast<T**>(data_alloc->ptr());
-    int64_t* outputs_cols = reinterpret_cast<int64_t*>(cols_alloc->ptr());
 
     outputs_cols[0] = 0;
     for (int i = 0; i < o_num; ++i) {
@@ -552,6 +548,7 @@ class SplitFunctor<phi::GPUContext, T> {
           dev_out_gpu_data);
     }
 
+#ifdef PADDLE_WITH_HIP
     // Prevent the pinned memory value from being covered and release the memory
     // after the launch kernel of the stream is executed (reapply pinned memory
     // next time)
@@ -563,6 +560,7 @@ class SplitFunctor<phi::GPUContext, T> {
       paddle::memory::allocation::Allocator::AllocationDeleter(
          cols_alloc_released);
     });
+#endif
   }
 };
 
-- 
GitLab
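Reviewer note (not part of the patch): a minimal, self-contained CUDA sketch of the pinned-memory staging pattern that the PADDLE_WITH_HIP branches above restore. The kernel, buffer sizes, and the blocking cudaStreamSynchronize() before the free are illustrative assumptions only; the patched functors instead keep the staging buffers in paddle::memory::AllocationPtr objects and hand them to Allocator::AllocationDeleter after the kernel launch, so no host-side wait is needed.

// pinned_staging_sketch.cu -- hypothetical standalone example, not Paddle code.
#include <cstdio>
#include <cuda_runtime.h>

// Toy kernel standing in for the concat/split kernels touched by the patch.
__global__ void DoubleEach(float* data, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= 2.0f;
}

int main() {
  const int n = 1024;
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // Stage host data in pinned (page-locked) memory. As the comment in the
  // patch points out, the NV runtime already handles small (<= 64 KB) pageable
  // copies asynchronously, while HIP needs pinned memory for an async copy.
  float* staging = nullptr;
  cudaHostAlloc(reinterpret_cast<void**>(&staging), n * sizeof(float),
                cudaHostAllocDefault);
  for (int i = 0; i < n; ++i) staging[i] = 1.0f;

  float* dev = nullptr;
  cudaMalloc(reinterpret_cast<void**>(&dev), n * sizeof(float));
  cudaMemcpyAsync(dev, staging, n * sizeof(float), cudaMemcpyHostToDevice,
                  stream);
  DoubleEach<<<(n + 255) / 256, 256, 0, stream>>>(dev, n);

  // The staging buffer must not be freed or overwritten while the async copy
  // may still be in flight. This sketch simply waits before freeing; the
  // functors in the patch defer the release of the pinned allocation instead.
  cudaStreamSynchronize(stream);
  cudaFreeHost(staging);

  float first = 0.0f;
  cudaMemcpy(&first, dev, sizeof(float), cudaMemcpyDeviceToHost);
  printf("first element after kernel: %f\n", first);  // expect 2.0

  cudaFree(dev);
  cudaStreamDestroy(stream);
  return 0;
}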