Unverified commit ac75617a, authored by Wilber, committed by GitHub

revert PR43039 (#43384)

Parent cdeb3167
@@ -174,67 +174,67 @@ TEST(tensorrt_tester_LeViT, multi_thread4_trt_fp32_bz2) {
 }
 #ifdef PADDLE_WITH_GPU
-TEST(tensorrt_tester_LeViT, multi_stream_thread4_trt_fp32_bz2) {
-  int thread_num = 4;
-  // init stream
-  std::vector<cudaStream_t> streams(thread_num);
-  for (size_t i = 0; i < thread_num; ++i) {
-    cudaStreamCreate(&streams[i]);
-  }
-  // init input data
-  std::map<std::string, paddle::test::Record> my_input_data_map;
-  my_input_data_map["x"] = PrepareInput(2);
-  // init output data
-  std::map<std::string, paddle::test::Record> infer_output_data,
-      truth_output_data;
-  // prepare groudtruth config
-  paddle_infer::Config config, config_no_ir;
-  config_no_ir.SetModel(FLAGS_modeldir + "/inference.pdmodel",
-                        FLAGS_modeldir + "/inference.pdiparams");
-  config_no_ir.SwitchIrOptim(false);
-  // prepare inference config
-  config.SetModel(FLAGS_modeldir + "/inference.pdmodel",
-                  FLAGS_modeldir + "/inference.pdiparams");
-  config.EnableUseGpu(100, 0);
-  config.EnableTensorRtEngine(
-      1 << 20, 2, 50, paddle_infer::PrecisionType::kFloat32, false, false);
-  // get groudtruth by disbale ir
-  paddle_infer::services::PredictorPool pred_pool_no_ir(config_no_ir, 1);
-  SingleThreadPrediction(pred_pool_no_ir.Retrive(0), &my_input_data_map,
-                         &truth_output_data, 1);
-  // get infer results from multi threads
-  std::vector<std::thread> threads;
-  config.SetExecStream(streams[0]);
-  config.pass_builder()->DeletePass("add_support_int8_pass");
-  auto main_predictor = CreatePredictor(config);
-  std::vector<decltype(main_predictor)> predictors;
-  for (size_t i = 0; i < thread_num - 1; ++i) {
-    predictors.push_back(std::move(main_predictor->Clone(streams[i + 1])));
-    LOG(INFO) << "predictors[" << i << "] stream is "
-              << predictors[i]->GetExecStream();
-  }
-  predictors.push_back(std::move(main_predictor));
-  LOG(INFO) << "predictors[" << thread_num - 1 << "] stream is "
-            << predictors[thread_num - 1]->GetExecStream();
-  for (int i = 0; i < thread_num; ++i) {
-    threads.emplace_back(paddle::test::SingleThreadPrediction,
-                         predictors[i].get(), &my_input_data_map,
-                         &infer_output_data, 10);
-  }
-  // thread join & check outputs
-  for (int i = 0; i < thread_num; ++i) {
-    LOG(INFO) << "join tid : " << i;
-    threads[i].join();
-    // CompareRecord(&truth_output_data, &infer_output_data);
-  }
-  std::cout << "finish multi-thread test" << std::endl;
-}
+// TEST(tensorrt_tester_LeViT, multi_stream_thread4_trt_fp32_bz2) {
+//   int thread_num = 4;
+//   // init stream
+//   std::vector<cudaStream_t> streams(thread_num);
+//   for (size_t i = 0; i < thread_num; ++i) {
+//     cudaStreamCreate(&streams[i]);
+//   }
+//   // init input data
+//   std::map<std::string, paddle::test::Record> my_input_data_map;
+//   my_input_data_map["x"] = PrepareInput(2);
+//   // init output data
+//   std::map<std::string, paddle::test::Record> infer_output_data,
+//       truth_output_data;
+//   // prepare groudtruth config
+//   paddle_infer::Config config, config_no_ir;
+//   config_no_ir.SetModel(FLAGS_modeldir + "/inference.pdmodel",
+//                         FLAGS_modeldir + "/inference.pdiparams");
+//   config_no_ir.SwitchIrOptim(false);
+//   // prepare inference config
+//   config.SetModel(FLAGS_modeldir + "/inference.pdmodel",
+//                   FLAGS_modeldir + "/inference.pdiparams");
+//   config.EnableUseGpu(100, 0);
+//   config.EnableTensorRtEngine(
+//       1 << 20, 2, 50, paddle_infer::PrecisionType::kFloat32, false, false);
+//   // get groudtruth by disbale ir
+//   paddle_infer::services::PredictorPool pred_pool_no_ir(config_no_ir, 1);
+//   SingleThreadPrediction(pred_pool_no_ir.Retrive(0), &my_input_data_map,
+//                          &truth_output_data, 1);
+//   // get infer results from multi threads
+//   std::vector<std::thread> threads;
+//   config.SetExecStream(streams[0]);
+//   config.pass_builder()->DeletePass("add_support_int8_pass");
+//   auto main_predictor = CreatePredictor(config);
+//   std::vector<decltype(main_predictor)> predictors;
+//   for (size_t i = 0; i < thread_num - 1; ++i) {
+//     predictors.push_back(std::move(main_predictor->Clone(streams[i + 1])));
+//     LOG(INFO) << "predictors[" << i << "] stream is "
+//               << predictors[i]->GetExecStream();
+//   }
+//   predictors.push_back(std::move(main_predictor));
+//   LOG(INFO) << "predictors[" << thread_num - 1 << "] stream is "
+//             << predictors[thread_num - 1]->GetExecStream();
+//   for (int i = 0; i < thread_num; ++i) {
+//     threads.emplace_back(paddle::test::SingleThreadPrediction,
+//                          predictors[i].get(), &my_input_data_map,
+//                          &infer_output_data, 10);
+//   }
+//   // thread join & check outputs
+//   for (int i = 0; i < thread_num; ++i) {
+//     LOG(INFO) << "join tid : " << i;
+//     threads[i].join();
+//     // CompareRecord(&truth_output_data, &infer_output_data);
+//   }
+//   std::cout << "finish multi-thread test" << std::endl;
+// }
 #endif
 }  // namespace paddle_infer
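Note: the hunk above comments out the whole multi-stream test. For reference, a condensed sketch of the pattern that test exercised follows; it is not part of the commit, it uses only calls that appear in the diff (SetExecStream, CreatePredictor, Clone, GetExecStream), and it assumes "paddle_inference_api.h" as the public header name, with model setup and the worker threads elided.

#include <cuda_runtime.h>
#include <vector>
#include "paddle_inference_api.h"

// Bind an external CUDA stream to the main predictor, then Clone() one
// predictor per extra stream so each worker thread can run on its own stream.
void DemoMultiStreamPredictors(paddle_infer::Config config, int thread_num) {
  std::vector<cudaStream_t> streams(thread_num);
  for (int i = 0; i < thread_num; ++i) {
    cudaStreamCreate(&streams[i]);
  }
  config.SetExecStream(streams[0]);  // the main predictor runs on stream 0
  auto main_predictor = paddle_infer::CreatePredictor(config);
  std::vector<decltype(main_predictor)> predictors;
  for (int i = 0; i + 1 < thread_num; ++i) {
    // Clone(stream) shares the weights but binds the copy to its own stream.
    predictors.push_back(main_predictor->Clone(streams[i + 1]));
  }
  predictors.push_back(std::move(main_predictor));
  // predictors[i]->GetExecStream() now reports each predictor's stream;
  // hand one predictor to each worker thread.
}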
...
@@ -276,7 +276,10 @@ struct ConcatFunctor<phi::GPUContext, T> {
     int64_t out_row = in_row, out_col = 0;
 
     int inputs_col_num = in_num + 1;
-    paddle::memory::AllocationPtr data_alloc, col_alloc;
+    std::vector<const T*> inputs_data_vec(in_num);
+    std::vector<int64_t> inputs_col_vec(inputs_col_num);
+    const T** inputs_data = inputs_data_vec.data();
+    int64_t* inputs_col = inputs_col_vec.data();
 
     // There are some differences between hip runtime and NV runtime.
     // In NV, when the pageable memory data less than 64K is transferred from
@@ -286,22 +289,16 @@ struct ConcatFunctor<phi::GPUContext, T> {
     // 3.2.6.1. Concurrent Execution between Host and Device
     // Memory copies from host to device of a memory block of 64 KB or less
 #ifdef PADDLE_WITH_HIP
+    paddle::memory::AllocationPtr data_alloc, col_alloc;
     // TODO(chentianyu03): try to find a method to remove the Alloc function
     data_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(),
                                        in_num * sizeof(T*));
+    inputs_data = reinterpret_cast<const T**>(data_alloc->ptr());
     // TODO(chentianyu03): try to find a method to remove the Alloc function
     col_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(),
                                       inputs_col_num * sizeof(int));
-#else
-    // TODO(pinned): cuda-graph not support pinned memory, we just use the cpu
-    // allocator.
-    data_alloc = paddle::memory::Alloc(paddle::platform::CPUPlace(),
-                                       in_num * sizeof(T*));
-    col_alloc = paddle::memory::Alloc(paddle::platform::CPUPlace(),
-                                      (inputs_col_num) * sizeof(int64_t));
+    inputs_col = reinterpret_cast<int64_t*>(col_alloc->ptr());
 #endif
-    const T** inputs_data = reinterpret_cast<const T**>(data_alloc->ptr());
-    int64_t* inputs_col = reinterpret_cast<int64_t*>(col_alloc->ptr());
 
     inputs_col[0] = 0;
     bool has_same_shape = true;
@@ -390,6 +387,7 @@ struct ConcatFunctor<phi::GPUContext, T> {
           output->data<T>());
     }
 
+#ifdef PADDLE_WITH_HIP
     // Prevent the pinned memory value from being covered and release the memory
     // after the launch kernel of the stream is executed (reapply pinned memory
     // next time)
@@ -403,6 +401,7 @@ struct ConcatFunctor<phi::GPUContext, T> {
       paddle::memory::allocation::Allocator::AllocationDeleter(
          col_alloc_released);
     });
+#endif
   }
 };
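The two ConcatFunctor hunks above are the substance of the revert: PR43039 had routed these small host-side staging buffers through paddle::memory::Alloc on both paths (CPUPlace under CUDA, per its own TODO, because CUDA graph capture does not support the pinned-memory path), and this commit returns the CUDA path to plain std::vector staging while keeping CUDAPinnedPlace only under PADDLE_WITH_HIP. A minimal, hypothetical sketch of the restored CUDA-path idea (float instead of T, invented helper name) is below.

#include <cuda_runtime.h>
#include <cstdint>
#include <vector>

// Hypothetical helper, simplified from the kernel: the device-pointer table
// and the column offsets are staged in ordinary std::vectors and uploaded
// with cudaMemcpyAsync. A pageable source is copied into the runtime's
// internal staging buffer before cudaMemcpyAsync returns, so the vectors may
// go out of scope afterwards; the 64 KB remark in the diff is about when
// such copies can overlap with other work on the NV vs. HIP runtimes.
void StageConcatArgs(const std::vector<const float*>& inputs,
                     const std::vector<int64_t>& col_offsets,
                     const float** dev_inputs,  // device array, inputs.size()
                     int64_t* dev_cols,         // device array
                     cudaStream_t stream) {
  std::vector<const float*> inputs_data_vec(inputs);  // host staging
  std::vector<int64_t> inputs_col_vec(col_offsets);   // host staging
  cudaMemcpyAsync(dev_inputs, inputs_data_vec.data(),
                  inputs_data_vec.size() * sizeof(const float*),
                  cudaMemcpyHostToDevice, stream);
  cudaMemcpyAsync(dev_cols, inputs_col_vec.data(),
                  inputs_col_vec.size() * sizeof(int64_t),
                  cudaMemcpyHostToDevice, stream);
}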
@@ -433,7 +432,10 @@ class SplitFunctor<phi::GPUContext, T> {
     bool has_same_shape = true;
 
     int outputs_cols_num = o_num + 1;
-    paddle::memory::AllocationPtr data_alloc, cols_alloc;
+    std::vector<T*> outputs_data_vec(o_num);
+    std::vector<int64_t> outputs_cols_vec(outputs_cols_num);
+    T** outputs_data = outputs_data_vec.data();
+    int64_t* outputs_cols = outputs_cols_vec.data();
 
     // There are some differences between hip runtime and NV runtime.
     // In NV, when the pageable memory data less than 64K is transferred from
@@ -443,22 +445,16 @@ class SplitFunctor<phi::GPUContext, T> {
     // 3.2.6.1. Concurrent Execution between Host and Device
     // Memory copies from host to device of a memory block of 64 KB or less
 #ifdef PADDLE_WITH_HIP
+    paddle::memory::AllocationPtr data_alloc, cols_alloc;
     // TODO(chentianyu03): try to find a method to remove the Alloc function
     data_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(),
                                        o_num * sizeof(T*));
+    outputs_data = reinterpret_cast<T**>(data_alloc->ptr());
     // TODO(chentianyu03): try to find a method to remove the Alloc function
     cols_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(),
                                        (outputs_cols_num) * sizeof(int64_t));
-#else
-    // TODO(pinned): cuda-graph not support pinned memory, we just use the cpu
-    // allocator.
-    data_alloc =
-        paddle::memory::Alloc(paddle::platform::CPUPlace(), o_num * sizeof(T*));
-    cols_alloc = paddle::memory::Alloc(paddle::platform::CPUPlace(),
-                                       (outputs_cols_num) * sizeof(int64_t));
+    outputs_cols = reinterpret_cast<int64_t*>(cols_alloc->ptr());
 #endif
-    T** outputs_data = reinterpret_cast<T**>(data_alloc->ptr());
-    int64_t* outputs_cols = reinterpret_cast<int64_t*>(cols_alloc->ptr());
 
     outputs_cols[0] = 0;
     for (int i = 0; i < o_num; ++i) {
@@ -552,6 +548,7 @@ class SplitFunctor<phi::GPUContext, T> {
           dev_out_gpu_data);
     }
 
+#ifdef PADDLE_WITH_HIP
     // Prevent the pinned memory value from being covered and release the memory
    // after the launch kernel of the stream is executed (reapply pinned memory
     // next time)
@@ -563,6 +560,7 @@ class SplitFunctor<phi::GPUContext, T> {
      paddle::memory::allocation::Allocator::AllocationDeleter(
          cols_alloc_released);
     });
+#endif
   }
 };
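The restored #ifdef PADDLE_WITH_HIP guards around the release blocks matter because the pinned staging buffers must not be recycled while the stream may still be reading them, so the raw allocations are released from a stream callback. A generic sketch of that defer-until-the-stream-is-done idea in plain CUDA follows (hypothetical helper, not Paddle's implementation; a heap buffer and std::free stand in for Paddle's allocator, since CUDA API calls such as cudaFreeHost are not permitted inside host callbacks, which is also why for a genuinely pinned buffer the release must be handed off to a separate host thread).

#include <cuda_runtime.h>
#include <cstdlib>
#include <cstring>

// Upload through a heap staging buffer and free it only after all prior
// work on `stream` has finished: cudaLaunchHostFunc runs the callback in
// stream order on a driver thread.
void UploadWithDeferredFree(const void* src, size_t bytes, void* dev_dst,
                            cudaStream_t stream) {
  void* staging = std::malloc(bytes);
  std::memcpy(staging, src, bytes);
  cudaMemcpyAsync(dev_dst, staging, bytes, cudaMemcpyHostToDevice, stream);
  cudaLaunchHostFunc(
      stream, [](void* p) { std::free(p); }, staging);
}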
...