From ac75617ac77a6e872b20dd2ecb409511a5c64247 Mon Sep 17 00:00:00 2001
From: Wilber
Date: Fri, 10 Jun 2022 18:39:47 +0800
Subject: [PATCH] revert PR43039 (#43384)

---
 .../inference/tests/infer_ut/test_LeViT.cc    | 122 +++++++++---------
 .../kernels/funcs/concat_and_split_functor.cu |  38 +++---
 2 files changed, 79 insertions(+), 81 deletions(-)

diff --git a/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc b/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc
index c929869233..87b443278a 100644
--- a/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc
+++ b/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc
@@ -174,67 +174,67 @@ TEST(tensorrt_tester_LeViT, multi_thread4_trt_fp32_bz2) {
 }
 
 #ifdef PADDLE_WITH_GPU
-TEST(tensorrt_tester_LeViT, multi_stream_thread4_trt_fp32_bz2) {
-  int thread_num = 4;
-
-  // init stream
-  std::vector<cudaStream_t> streams(thread_num);
-  for (size_t i = 0; i < thread_num; ++i) {
-    cudaStreamCreate(&streams[i]);
-  }
-
-  // init input data
-  std::map<std::string, paddle::test::Record> my_input_data_map;
-  my_input_data_map["x"] = PrepareInput(2);
-  // init output data
-  std::map<std::string, paddle::test::Record> infer_output_data,
-      truth_output_data;
-  // prepare groudtruth config
-  paddle_infer::Config config, config_no_ir;
-  config_no_ir.SetModel(FLAGS_modeldir + "/inference.pdmodel",
-                        FLAGS_modeldir + "/inference.pdiparams");
-  config_no_ir.SwitchIrOptim(false);
-  // prepare inference config
-  config.SetModel(FLAGS_modeldir + "/inference.pdmodel",
-                  FLAGS_modeldir + "/inference.pdiparams");
-  config.EnableUseGpu(100, 0);
-  config.EnableTensorRtEngine(
-      1 << 20, 2, 50, paddle_infer::PrecisionType::kFloat32, false, false);
-  // get groudtruth by disbale ir
-
-  paddle_infer::services::PredictorPool pred_pool_no_ir(config_no_ir, 1);
-  SingleThreadPrediction(pred_pool_no_ir.Retrive(0), &my_input_data_map,
-                         &truth_output_data, 1);
-
-  // get infer results from multi threads
-  std::vector<std::thread> threads;
-  config.SetExecStream(streams[0]);
-  config.pass_builder()->DeletePass("add_support_int8_pass");
-  auto main_predictor = CreatePredictor(config);
-  std::vector<std::shared_ptr<paddle_infer::Predictor>> predictors;
-  for (size_t i = 0; i < thread_num - 1; ++i) {
-    predictors.push_back(std::move(main_predictor->Clone(streams[i + 1])));
-    LOG(INFO) << "predictors[" << i << "] stream is "
-              << predictors[i]->GetExecStream();
-  }
-  predictors.push_back(std::move(main_predictor));
-  LOG(INFO) << "predictors[" << thread_num - 1 << "] stream is "
-            << predictors[thread_num - 1]->GetExecStream();
-  for (int i = 0; i < thread_num; ++i) {
-    threads.emplace_back(paddle::test::SingleThreadPrediction,
-                         predictors[i].get(), &my_input_data_map,
-                         &infer_output_data, 10);
-  }
-
-  // thread join & check outputs
-  for (int i = 0; i < thread_num; ++i) {
-    LOG(INFO) << "join tid : " << i;
-    threads[i].join();
-    // CompareRecord(&truth_output_data, &infer_output_data);
-  }
-
-  std::cout << "finish multi-thread test" << std::endl;
-}
+// TEST(tensorrt_tester_LeViT, multi_stream_thread4_trt_fp32_bz2) {
+//   int thread_num = 4;
+
+//   // init stream
+//   std::vector<cudaStream_t> streams(thread_num);
+//   for (size_t i = 0; i < thread_num; ++i) {
+//     cudaStreamCreate(&streams[i]);
+//   }
+
+//   // init input data
+//   std::map<std::string, paddle::test::Record> my_input_data_map;
+//   my_input_data_map["x"] = PrepareInput(2);
+//   // init output data
+//   std::map<std::string, paddle::test::Record> infer_output_data,
+//       truth_output_data;
+//   // prepare groudtruth config
+//   paddle_infer::Config config, config_no_ir;
+//   config_no_ir.SetModel(FLAGS_modeldir + "/inference.pdmodel",
+//                         FLAGS_modeldir + "/inference.pdiparams");
+//   config_no_ir.SwitchIrOptim(false);
+//   // prepare inference config
+//   config.SetModel(FLAGS_modeldir + "/inference.pdmodel",
+//                   FLAGS_modeldir + "/inference.pdiparams");
+//   config.EnableUseGpu(100, 0);
+//   config.EnableTensorRtEngine(
+//       1 << 20, 2, 50, paddle_infer::PrecisionType::kFloat32, false, false);
+//   // get groudtruth by disbale ir
+
+//   paddle_infer::services::PredictorPool pred_pool_no_ir(config_no_ir, 1);
+//   SingleThreadPrediction(pred_pool_no_ir.Retrive(0), &my_input_data_map,
+//                          &truth_output_data, 1);
+
+//   // get infer results from multi threads
+//   std::vector<std::thread> threads;
+//   config.SetExecStream(streams[0]);
+//   config.pass_builder()->DeletePass("add_support_int8_pass");
+//   auto main_predictor = CreatePredictor(config);
+//   std::vector<std::shared_ptr<paddle_infer::Predictor>> predictors;
+//   for (size_t i = 0; i < thread_num - 1; ++i) {
+//     predictors.push_back(std::move(main_predictor->Clone(streams[i + 1])));
+//     LOG(INFO) << "predictors[" << i << "] stream is "
+//               << predictors[i]->GetExecStream();
+//   }
+//   predictors.push_back(std::move(main_predictor));
+//   LOG(INFO) << "predictors[" << thread_num - 1 << "] stream is "
+//             << predictors[thread_num - 1]->GetExecStream();
+//   for (int i = 0; i < thread_num; ++i) {
+//     threads.emplace_back(paddle::test::SingleThreadPrediction,
+//                          predictors[i].get(), &my_input_data_map,
+//                          &infer_output_data, 10);
+//   }
+
+//   // thread join & check outputs
+//   for (int i = 0; i < thread_num; ++i) {
+//     LOG(INFO) << "join tid : " << i;
+//     threads[i].join();
+//     CompareRecord(&truth_output_data, &infer_output_data);
+//   }
+
+//   std::cout << "finish multi-thread test" << std::endl;
+// }
 #endif
 
 }  // namespace paddle_infer
diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.cu b/paddle/phi/kernels/funcs/concat_and_split_functor.cu
index 1c9fbffa2a..22dba8297d 100644
--- a/paddle/phi/kernels/funcs/concat_and_split_functor.cu
+++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cu
@@ -276,7 +276,10 @@ struct ConcatFunctor<phi::GPUContext, T> {
     int64_t out_row = in_row, out_col = 0;
 
     int inputs_col_num = in_num + 1;
-    paddle::memory::AllocationPtr data_alloc, col_alloc;
+    std::vector<const T*> inputs_data_vec(in_num);
+    std::vector<int64_t> inputs_col_vec(inputs_col_num);
+    const T** inputs_data = inputs_data_vec.data();
+    int64_t* inputs_col = inputs_col_vec.data();
 
     // There are some differences between hip runtime and NV runtime.
     // In NV, when the pageable memory data less than 64K is transferred from
@@ -286,22 +289,16 @@ struct ConcatFunctor<phi::GPUContext, T> {
     // 3.2.6.1. Concurrent Execution between Host and Device
     // Memory copies from host to device of a memory block of 64 KB or less
 #ifdef PADDLE_WITH_HIP
+    paddle::memory::AllocationPtr data_alloc, col_alloc;
     // TODO(chentianyu03): try to find a method to remove the Alloc function
     data_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(),
                                        in_num * sizeof(T*));
+    inputs_data = reinterpret_cast<const T**>(data_alloc->ptr());
     // TODO(chentianyu03): try to find a method to remove the Alloc function
     col_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(),
                                       inputs_col_num * sizeof(int));
-#else
-    // TODO(pinned): cuda-graph not support pinned memory, we just use the cpu
-    // allocator.
-    data_alloc = paddle::memory::Alloc(paddle::platform::CPUPlace(),
-                                       in_num * sizeof(T*));
-    col_alloc = paddle::memory::Alloc(paddle::platform::CPUPlace(),
-                                      (inputs_col_num) * sizeof(int64_t));
+    inputs_col = reinterpret_cast<int64_t*>(col_alloc->ptr());
 #endif
-    const T** inputs_data = reinterpret_cast<const T**>(data_alloc->ptr());
-    int64_t* inputs_col = reinterpret_cast<int64_t*>(col_alloc->ptr());
 
     inputs_col[0] = 0;
     bool has_same_shape = true;
@@ -390,6 +387,7 @@ struct ConcatFunctor<phi::GPUContext, T> {
           output->data<T>());
     }
 
+#ifdef PADDLE_WITH_HIP
     // Prevent the pinned memory value from being covered and release the memory
     // after the launch kernel of the stream is executed (reapply pinned memory
    // next time)
@@ -403,6 +401,7 @@ struct ConcatFunctor<phi::GPUContext, T> {
       paddle::memory::allocation::Allocator::AllocationDeleter(
           col_alloc_released);
     });
+#endif
   }
 };
 
@@ -433,7 +432,10 @@ class SplitFunctor<phi::GPUContext, T> {
     bool has_same_shape = true;
 
     int outputs_cols_num = o_num + 1;
-    paddle::memory::AllocationPtr data_alloc, cols_alloc;
+    std::vector<T*> outputs_data_vec(o_num);
+    std::vector<int64_t> outputs_cols_vec(outputs_cols_num);
+    T** outputs_data = outputs_data_vec.data();
+    int64_t* outputs_cols = outputs_cols_vec.data();
 
     // There are some differences between hip runtime and NV runtime.
     // In NV, when the pageable memory data less than 64K is transferred from
@@ -443,22 +445,16 @@ class SplitFunctor<phi::GPUContext, T> {
     // 3.2.6.1. Concurrent Execution between Host and Device
     // Memory copies from host to device of a memory block of 64 KB or less
 #ifdef PADDLE_WITH_HIP
+    paddle::memory::AllocationPtr data_alloc, cols_alloc;
    // TODO(chentianyu03): try to find a method to remove the Alloc function
     data_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(),
                                        o_num * sizeof(T*));
+    outputs_data = reinterpret_cast<T**>(data_alloc->ptr());
     // TODO(chentianyu03): try to find a method to remove the Alloc function
     cols_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(),
                                        (outputs_cols_num) * sizeof(int64_t));
-#else
-    // TODO(pinned): cuda-graph not support pinned memory, we just use the cpu
-    // allocator.
-    data_alloc =
-        paddle::memory::Alloc(paddle::platform::CPUPlace(), o_num * sizeof(T*));
-    cols_alloc = paddle::memory::Alloc(paddle::platform::CPUPlace(),
-                                       (outputs_cols_num) * sizeof(int64_t));
+    outputs_cols = reinterpret_cast<int64_t*>(cols_alloc->ptr());
 #endif
-    T** outputs_data = reinterpret_cast<T**>(data_alloc->ptr());
-    int64_t* outputs_cols = reinterpret_cast<int64_t*>(cols_alloc->ptr());
 
     outputs_cols[0] = 0;
     for (int i = 0; i < o_num; ++i) {
@@ -552,6 +548,7 @@ class SplitFunctor<phi::GPUContext, T> {
           dev_out_gpu_data);
     }
 
+#ifdef PADDLE_WITH_HIP
     // Prevent the pinned memory value from being covered and release the memory
     // after the launch kernel of the stream is executed (reapply pinned memory
     // next time)
@@ -563,6 +560,7 @@ class SplitFunctor<phi::GPUContext, T> {
       paddle::memory::allocation::Allocator::AllocationDeleter(
          cols_alloc_released);
     });
+#endif
   }
 };
 
-- 
GitLab
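Reviewer note (not part of the patch): a minimal, self-contained CUDA sketch of the pinned-memory staging pattern that the PADDLE_WITH_HIP branches above restore. The kernel, buffer sizes, and the blocking cudaStreamSynchronize() before the free are illustrative assumptions only; the patched functors instead keep the staging buffers in paddle::memory::AllocationPtr objects and hand them to Allocator::AllocationDeleter after the kernel launch, so no host-side wait is needed.

// pinned_staging_sketch.cu -- hypothetical standalone example, not Paddle code.
#include <cstdio>
#include <cuda_runtime.h>

// Toy kernel standing in for the concat/split kernels touched by the patch.
__global__ void DoubleEach(float* data, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= 2.0f;
}

int main() {
  const int n = 1024;
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // Stage host data in pinned (page-locked) memory. As the comment in the
  // patch points out, the NV runtime already handles small (<= 64 KB) pageable
  // copies asynchronously, while HIP needs pinned memory for an async copy.
  float* staging = nullptr;
  cudaHostAlloc(reinterpret_cast<void**>(&staging), n * sizeof(float),
                cudaHostAllocDefault);
  for (int i = 0; i < n; ++i) staging[i] = 1.0f;

  float* dev = nullptr;
  cudaMalloc(reinterpret_cast<void**>(&dev), n * sizeof(float));
  cudaMemcpyAsync(dev, staging, n * sizeof(float), cudaMemcpyHostToDevice,
                  stream);
  DoubleEach<<<(n + 255) / 256, 256, 0, stream>>>(dev, n);

  // The staging buffer must not be freed or overwritten while the async copy
  // may still be in flight. This sketch simply waits before freeing; the
  // functors in the patch defer the release of the pinned allocation instead.
  cudaStreamSynchronize(stream);
  cudaFreeHost(staging);

  float first = 0.0f;
  cudaMemcpy(&first, dev, sizeof(float), cudaMemcpyDeviceToHost);
  printf("first element after kernel: %f\n", first);  // expect 2.0

  cudaFree(dev);
  cudaStreamDestroy(stream);
  return 0;
}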