PaddlePaddle / Paddle
Commit 2eb3a7a9
Unverified commit 2eb3a7a9
Authored Feb 07, 2020 by Zhaolong Xing; committed by GitHub on Feb 07, 2020
[Cherry-pick] [Fix BUG]: Core when multi thread + clone + paddle-tr #22442 (#22471)
test=release/1.7
Parent 6892deb1
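The scenario behind the crash is a single Paddle-TRT predictor cloned into several worker threads. A minimal sketch of that usage, not part of this commit (the model path, GPU memory pool size, and thread count are placeholders):

#include <thread>
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

int main() {
  paddle::AnalysisConfig config;
  config.SetModel("./resnet50_model");   // placeholder model directory
  config.EnableUseGpu(100, 0);           // 100 MB initial GPU pool, device 0
  config.EnableTensorRtEngine();         // turn on the Paddle-TRT subgraph engine
  config.SwitchUseFeedFetchOps(false);   // required for the ZeroCopy API

  auto main_predictor = paddle::CreatePaddlePredictor(config);

  std::vector<std::thread> workers;
  for (int i = 0; i < 4; ++i) {
    workers.emplace_back([&main_predictor] {
      auto predictor = main_predictor->Clone();  // per-thread clone
      // ... feed inputs with GetInputTensor()/copy_from_cpu(),
      //     then call ZeroCopyRun() as in the test changed below ...
    });
  }
  for (auto &t : workers) t.join();
  return 0;
}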
Showing 4 changed files with 15 additions and 3 deletions (+15 -3)
paddle/fluid/inference/api/details/zero_copy_tensor.cc    +2 -1
paddle/fluid/inference/tensorrt/engine.cc                  +1 -1
paddle/fluid/inference/tensorrt/engine.h                   +2 -1
paddle/fluid/inference/tests/api/trt_quant_int8_test.cc    +10 -0
paddle/fluid/inference/api/details/zero_copy_tensor.cc

@@ -138,7 +138,8 @@ void ZeroCopyTensor::copy_to_cpu(T *data) {
       static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));
   memory::Copy(platform::CPUPlace(), static_cast<void *>(data), gpu_place,
                t_data, ele_num * sizeof(T), dev_ctx->stream());
-  cudaDeviceSynchronize();
+  cudaStreamSynchronize(dev_ctx->stream());
 #else
   PADDLE_THROW("Not compile with CUDA, should not reach here.");
 #endif
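The hunk above swaps a device-wide synchronization for a per-stream one: cudaDeviceSynchronize() blocks until every stream on the GPU is idle, while cudaStreamSynchronize() only waits for the work queued on the given stream. A minimal illustration of the per-stream wait (this helper is hypothetical, not Paddle code):

#include <cuda_runtime.h>

// Hypothetical helper: wait only for the host<-device copy issued on `stream`,
// leaving work queued on other threads' streams untouched.
void WaitForCopy(void *dst, const void *src, size_t bytes, cudaStream_t stream) {
  cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);  // per-stream wait, unlike cudaDeviceSynchronize()
}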
paddle/fluid/inference/tensorrt/engine.cc

@@ -38,13 +38,13 @@ void TensorRTEngine::Execute(int batch_size, std::vector<void *> *buffers,
   const std::thread::id tid = std::this_thread::get_id();
   batch_size_ = batch_size;
   if (infer_context_.find(tid) == infer_context_.end()) {
     std::unique_lock<std::mutex> lock(mutex_);
     PADDLE_ENFORCE_NOT_NULL(
         infer_engine_,
         "You should build engine first and then set the context.");
     infer_context_[tid].reset(infer_engine_->createExecutionContext());
   }
   infer_context_[tid]->enqueue(batch_size, buffers->data(), stream, nullptr);
   cudaStreamSynchronize(stream);
   SetRuntimeBatch(batch_size);
 }
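The hunk above keeps one TensorRT execution context per thread, keyed by std::this_thread::get_id(), and now serializes the lazy creation of those contexts with the new mutex_. A generic sketch of that pattern in isolation (the Resource and MakeResource names are made up, not Paddle code):

#include <memory>
#include <mutex>
#include <thread>
#include <unordered_map>

struct Resource { /* e.g. one TensorRT execution context */ };
std::unique_ptr<Resource> MakeResource() { return std::make_unique<Resource>(); }

class PerThreadPool {
 public:
  // Return this thread's resource, creating it on first use.
  Resource *Get() {
    const std::thread::id tid = std::this_thread::get_id();
    std::unique_lock<std::mutex> lock(mutex_);  // guards the shared map
    auto it = contexts_.find(tid);
    if (it == contexts_.end()) {
      it = contexts_.emplace(tid, MakeResource()).first;
    }
    return it->second.get();
  }

 private:
  std::unordered_map<std::thread::id, std::unique_ptr<Resource>> contexts_;
  std::mutex mutex_;
};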
paddle/fluid/inference/tensorrt/engine.h

@@ -82,7 +82,7 @@ class TensorRTEngine {
   void Build(const DescType& paddle_model);

   void Execute(int batch_size, std::vector<void*>* buffers,
-               cudaStream_t stream);
+               cudaStream_t stream = nullptr);

   // Initialize the inference network, so that TensorRT layers can add to this
   // network.

@@ -216,6 +216,7 @@ class TensorRTEngine {
       infer_context_;
   infer_ptr<nvinfer1::IHostMemory> ihost_memory_;
   std::unordered_map<nvinfer1::ITensor*, float> quant_dynamic_range_;
+  std::mutex mutex_;
 };  // class TensorRTEngine

 #define IS_TRT_VERSION_GE(version) \
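With the defaulted parameter, callers that do not manage their own CUDA stream can omit the last argument to Execute. A hypothetical call site (the engine pointer and device buffers are assumed to be prepared elsewhere, not part of this commit):

#include <vector>
#include "paddle/fluid/inference/tensorrt/engine.h"

// Hypothetical call site: `engine` is a built TensorRTEngine and the two
// pointers are device buffers bound to the network's input and output.
void RunOnce(paddle::inference::tensorrt::TensorRTEngine *engine,
             void *input_gpu, void *output_gpu, int batch_size) {
  std::vector<void *> buffers = {input_gpu, output_gpu};
  engine->Execute(batch_size, &buffers);  // the stream argument now defaults to nullptr
}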
paddle/fluid/inference/tests/api/trt_quant_int8_test.cc

@@ -15,6 +15,7 @@ limitations under the License. */
 #include <gflags/gflags.h>
 #include <glog/logging.h>
 #include <gtest/gtest.h>
+#include <numeric>

 #include "paddle/fluid/inference/tests/api/trt_test_helper.h"

@@ -44,6 +45,15 @@ TEST(quant_int8, resnet50) {
   input_t->copy_from_cpu(input);
   ASSERT_TRUE(predictor->ZeroCopyRun());
+
+  std::vector<float> out_data;
+  auto output_names = predictor->GetOutputNames();
+  auto output_t = predictor->GetOutputTensor(output_names[0]);
+  std::vector<int> output_shape = output_t->shape();
+  int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
+                                std::multiplies<int>());
+  out_data.resize(out_num);
+  output_t->copy_to_cpu(out_data.data());
 }
 }  // namespace inference