diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 6732f2dbe366c252b1cf5bb1d6f842bd153d821e..1ab5366d34c9538ff001b0b1c8cb07c4670b5652 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -94,6 +94,61 @@
 #endif
 
 namespace paddle {
+namespace {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+void UpdatePrivateDeviceContext(InferGPUContext *gpu_context,
+                                GPUContextResource *gpu_resource,
+                                Place place_) {
+  gpu_context->SetAllocator(memory::allocation::AllocatorFacade::Instance()
+                                .GetAllocator(place_, gpu_resource->GetStream())
+                                .get());
+  gpu_context->SetPinnedAllocator(
+      memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CUDAPinnedPlace())
+          .get());
+  gpu_context->SetHostAllocator(memory::allocation::AllocatorFacade::Instance()
+                                    .GetAllocator(platform::CPUPlace())
+                                    .get());
+  gpu_context->SetZeroAllocator(memory::allocation::AllocatorFacade::Instance()
+                                    .GetZeroAllocator(place_)
+                                    .get());
+  gpu_context->SetHostZeroAllocator(
+      memory::allocation::AllocatorFacade::Instance()
+          .GetZeroAllocator(platform::CPUPlace())
+          .get());
+  gpu_context->SetGenerator(
+      phi::DefaultCUDAGenerator(place_.GetDeviceId()).get());
+  gpu_context->SetHostGenerator(phi::DefaultCPUGenerator().get());
+
+  gpu_context->SetStream(gpu_resource->GetStream());
+  gpu_context->SetBlasHandle(gpu_resource->GetBlasHandleCreator());
+  gpu_context->SetBlasTensorCoreHandle(
+      gpu_resource->GetBlasTensorCoreHandleCreator());
+  gpu_context->SetBlasTF32Handle(
+      gpu_resource->GetBlasTF32TensorCoreHandleCreator());
+  gpu_context->SetDnnHandle(gpu_resource->GetDnnHandleCreator());
+  gpu_context->SetSolverHandle(gpu_resource->GetSolverDnHandleCreator());
+  gpu_context->SetSparseHandle(gpu_resource->GetSparseHandleCreator());
+  gpu_context->SetEigenDevice(gpu_resource->GetGpuEigenDevice());
+
+  gpu_context->SetComputeCapability(gpu_resource->GetGpuComputeCapability());
+  gpu_context->SetMaxThreadsPerBlock(gpu_resource->GetGpuMaxThreadsPerBlock());
+  gpu_context->SetMaxThreadsPerMultiProcessor(
+      gpu_resource->GetGpuMaxThreadsPerMp());
+  gpu_context->SetMaxGridDimSize(gpu_resource->GetGpuMaxGridDimSize());
+  gpu_context->SetMultiProcessors(gpu_resource->GetGPUMultiProcessors());
+  gpu_context->SetDriverVersion(gpu_resource->GetGpuDriverVersion());
+  gpu_context->SetRuntimeVersion(gpu_resource->GetGpuRuntimeVersion());
+  VLOG(1) << "thread id is " << std::this_thread::get_id() << ", stream id is "
+          << reinterpret_cast<void *>(gpu_resource->GetStream())
+          << ", allocator ptr is "
+          << reinterpret_cast<void *>(
+                 memory::allocation::AllocatorFacade::Instance()
+                     .GetAllocator(place_, gpu_resource->GetStream())
+                     .get());
+}
+#endif
+}  // namespace
 using inference::Singleton;
 
 #ifdef PADDLE_WITH_TENSORRT
@@ -451,60 +506,7 @@ void AnalysisPredictor::InitDeviceContexts() {
         auto *gpu_resource =
             ResourceManager::Instance().GetGPUResource(predictor_stream_);
         auto *gpu_context = new InferGPUContext(place_);
-        gpu_context->SetAllocator(
-            memory::allocation::AllocatorFacade::Instance()
-                .GetAllocator(place_, gpu_resource->GetStream())
-                .get());
-        gpu_context->SetPinnedAllocator(
-            memory::allocation::AllocatorFacade::Instance()
-                .GetAllocator(paddle::platform::CUDAPinnedPlace())
-                .get());
-        gpu_context->SetHostAllocator(
-            memory::allocation::AllocatorFacade::Instance()
-                .GetAllocator(platform::CPUPlace())
-                .get());
-        gpu_context->SetZeroAllocator(
-            memory::allocation::AllocatorFacade::Instance()
-                .GetZeroAllocator(place_)
-                .get());
-        gpu_context->SetHostZeroAllocator(
-            memory::allocation::AllocatorFacade::Instance()
-                .GetZeroAllocator(platform::CPUPlace())
-                .get());
-        gpu_context->SetGenerator(
-            phi::DefaultCUDAGenerator(place_.GetDeviceId()).get());
-        gpu_context->SetHostGenerator(phi::DefaultCPUGenerator().get());
-
-        gpu_context->SetStream(gpu_resource->GetStream());
-        gpu_context->SetBlasHandle(gpu_resource->GetBlasHandleCreator());
-        gpu_context->SetBlasTensorCoreHandle(
-            gpu_resource->GetBlasTensorCoreHandleCreator());
-        gpu_context->SetBlasTF32Handle(
-            gpu_resource->GetBlasTF32TensorCoreHandleCreator());
-        gpu_context->SetDnnHandle(gpu_resource->GetDnnHandleCreator());
-        gpu_context->SetSolverHandle(
-            gpu_resource->GetSolverDnHandleCreator());
-        gpu_context->SetSparseHandle(gpu_resource->GetSparseHandleCreator());
-        gpu_context->SetEigenDevice(gpu_resource->GetGpuEigenDeviceCreator());
-        gpu_context->SetComputeCapability(
-            gpu_resource->GetGpuComputeCapability());
-        gpu_context->SetMaxThreadsPerBlock(
-            gpu_resource->GetGpuMaxThreadsPerBlock());
-        gpu_context->SetMaxThreadsPerMultiProcessor(
-            gpu_resource->GetGpuMaxThreadsPerMp());
-        gpu_context->SetMaxGridDimSize(gpu_resource->GetGpuMaxGridDimSize());
-        gpu_context->SetMultiProcessors(
-            gpu_resource->GetGPUMultiProcessors());
-        gpu_context->SetDriverVersion(gpu_resource->GetGpuDriverVersion());
-        gpu_context->SetRuntimeVersion(gpu_resource->GetGpuRuntimeVersion());
-        VLOG(1) << "thread id is " << std::this_thread::get_id()
-                << ", stream id is "
-                << reinterpret_cast<void *>(gpu_resource->GetStream())
-                << ", allotor ptr is "
-                << reinterpret_cast<void *>(
-                       memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(place_, gpu_resource->GetStream())
-                           .get());
+        UpdatePrivateDeviceContext(gpu_context, gpu_resource, place_);
         return std::unique_ptr<phi::DeviceContext>(gpu_context);
       }));
 }
@@ -2083,17 +2085,27 @@ bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) {
 #else
     cudaStreamSynchronize(static_cast<cudaStream_t>(predictor_stream_));
 #endif
-    ResourceManager::Instance().GpuResourceReBindStream(predictor_stream_,
+    ResourceManager::Instance().GpuResourceSwitchStream(predictor_stream_,
                                                         stream);
     predictor_stream_ = stream;
-    auto *dev_ctxs = reinterpret_cast<
-        const std::map<phi::Place,
-                       std::shared_future<
-                           std::unique_ptr<phi::DeviceContext>>> *>(
-        this->GetDeviceContexts());
-    auto *dev_ctx =
-        static_cast<InferGPUContext *>(dev_ctxs->at(place_).get().get());
-    dev_ctx->SetStream(stream);
+    auto *dev_ctxs = const_cast<
+        std::map<phi::Place,
+                 std::shared_future<std::unique_ptr<phi::DeviceContext>>> *>(
+        reinterpret_cast<
+            const std::map<phi::Place,
+                           std::shared_future<
+                               std::unique_ptr<phi::DeviceContext>>> *>(
+            this->GetDeviceContexts()));
+
+    dev_ctxs->erase(place_);
+    dev_ctxs->emplace(
+        place_, std::async(std::launch::deferred, [=] {
+          auto *gpu_resource =
+              ResourceManager::Instance().GetGPUResource(predictor_stream_);
+          auto *gpu_context = new InferGPUContext(place_);
+          UpdatePrivateDeviceContext(gpu_context, gpu_resource, place_);
+          return std::unique_ptr<phi::DeviceContext>(gpu_context);
+        }));
   }
 
   return ZeroCopyRun();
diff --git a/paddle/fluid/inference/api/resource_manager.cc b/paddle/fluid/inference/api/resource_manager.cc
index 0f3612a5e976c173305ecd00714e69674d3987b1..7968b8d7d9d039a36c2d471ac74c7b4b9a3db41e 100644
--- a/paddle/fluid/inference/api/resource_manager.cc
+++ b/paddle/fluid/inference/api/resource_manager.cc
@@ -154,6 +154,7 @@ void GPUContextResource::InitGPUResource(void* stream) {
   }
 
   InitGpuProperties();
+  InitGpuEigenDevice();
 }
 
 void GPUContextResource::DestroyGPUResource() {
@@ -361,90 +362,6 @@ std::array<unsigned int, 3> GPUContextResource::GetGpuMaxGridDimSize() const {
   return max_grid_dim_size_;
 }
 
-void GPUContextResource::ReBindStream(gpuStream_t stream) {
-  owned_stream_ = false;
-  stream_ = stream;
-}
-
-void GPUContextResource::ReBindDnnHandle(gpuStream_t stream) const {
-  if (dnn_handle_) {
-#ifdef PADDLE_WITH_HIP
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        phi::dynload::miopenSetStream(dnn_handle_, stream));
-#else
-    PADDLE_RETRY_CUDA_SUCCESS(
-        phi::dynload::cudnnSetStream(dnn_handle_, stream));
-#endif
-  }
-}
-
-void GPUContextResource::ReBindBlasHandle(gpuStream_t stream) const {
-  if (blas_handle_) {
-#ifdef PADDLE_WITH_HIP
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        phi::dynload::rocblas_set_stream(blas_handle_, stream));
-#else
-    PADDLE_RETRY_CUDA_SUCCESS(
-        phi::dynload::cublasSetStream(blas_handle_, stream));
-#endif
-  }
-}
-
-void GPUContextResource::ReBindBlasTensorCoreHandle(gpuStream_t stream) const {
-  if (blas_tensor_core_handle_) {
-#ifdef PADDLE_WITH_HIP
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        phi::dynload::rocblas_set_stream(blas_tensor_core_handle_, stream));
-#else
-    PADDLE_RETRY_CUDA_SUCCESS(
-        phi::dynload::cublasSetStream(blas_tensor_core_handle_, stream));
-#endif
-  }
-}
-
-void GPUContextResource::ReBindBlasTF32Handle(gpuStream_t stream) const {
-  if (blas_tf32_tensor_core_handle_) {
-#ifdef PADDLE_WITH_HIP
-    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_set_stream(
-        blas_tf32_tensor_core_handle_, stream));
-#else
-    PADDLE_RETRY_CUDA_SUCCESS(
-        phi::dynload::cublasSetStream(blas_tf32_tensor_core_handle_, stream));
-#endif
-  }
-}
-
-void GPUContextResource::ReBindSolverDnHandle(gpuStream_t stream) const {
-  if (solver_handle_) {
-#ifndef PADDLE_WITH_HIP
-    PADDLE_RETRY_CUDA_SUCCESS(
-        phi::dynload::cusolverDnSetStream(solver_handle_, stream));
-#endif
-  }
-}
-
-void GPUContextResource::ReBindSparseHandle(gpuStream_t stream) const {
-  if (sparse_handle_) {
-#if defined(PADDLE_WITH_CUDA)
-// The generic APIs is supported from CUDA10.1
-#if CUDA_VERSION >= 11000
-    PADDLE_RETRY_CUDA_SUCCESS(
-        phi::dynload::cusparseSetStream(sparse_handle_, stream));
-#endif
-#endif
-  }
-}
-
-void GPUContextResource::ReBindEigenDevice(gpuStream_t stream,
-                                           GPUPlace place) const {
-  if (eigen_stream_) {
-    auto* allocator = paddle::memory::allocation::AllocatorFacade::Instance()
-                          .GetAllocator(place_)
-                          .get();
-    eigen_stream_->Reinitialize(stream, allocator, place);
-  }
-}
-
 #endif
 
 void ResourceManager::InitCPUResource() {
@@ -486,24 +403,16 @@ void ResourceManager::DestroyGPUResource(void* stream) {
 }
 
 void ResourceManager::Decrease(void* stream) {
-  PADDLE_ENFORCE_EQ(ref_count_.count(stream),
-                    true,
-                    platform::errors::InvalidArgument(
-                        "The stream[%p] not found in ref_count.", stream));
+  if (ref_count_.count(stream) == 0) return;
   --ref_count_[stream];
+
   if (ref_count_[stream] == 0) {
     ref_count_.erase(stream);
-    gpu_resources_.erase(stream);
+    if (gpu_resources_.count(stream) > 0) gpu_resources_.erase(stream);
   }
 }
 
-void ResourceManager::Increase(void* stream) {
-  PADDLE_ENFORCE_EQ(ref_count_.count(stream),
-                    true,
-                    platform::errors::InvalidArgument(
-                        "The stream[%p] not found in ref_count.", stream));
-  ++ref_count_[stream];
-}
+void ResourceManager::Increase(void* stream) { ++ref_count_[stream]; }
 
 GPUContextResource* ResourceManager::GetGPUResource(void* stream) const {
   PADDLE_ENFORCE_EQ(gpu_resources_.count(stream),
@@ -513,33 +422,29 @@ GPUContextResource* ResourceManager::GetGPUResource(void* stream) const {
   return gpu_resources_.at(stream).get();
 }
 
-void ResourceManager::GpuResourceReBindStream(void* old_stream,
+void ResourceManager::GpuResourceSwitchStream(void* old_stream,
                                               void* new_stream) {
+  // NOTE: Add a lock to support rebinding the stream from multiple threads.
+  std::lock_guard<std::mutex> lock_guard(gpu_mutex_);
+  if (old_stream == new_stream) return;
   PADDLE_ENFORCE_EQ(
       gpu_resources_.count(old_stream),
       true,
       platform::errors::InvalidArgument(
           "The stream[%p] not found in gpu_resources.", old_stream));
-  auto gpu_resource = std::move(gpu_resources_.at(old_stream));
-  DestroyGPUResource(old_stream);
-  PADDLE_ENFORCE_EQ(
-      ref_count_.count(old_stream),
-      0,
-      platform::errors::Fatal("gpu resources rebind stream failed."));
-
-  gpu_resource->ReBindStream(static_cast<gpuStream_t>(new_stream));
-  gpu_resource->ReBindDnnHandle(static_cast<gpuStream_t>(new_stream));
-  gpu_resource->ReBindBlasHandle(static_cast<gpuStream_t>(new_stream));
-  gpu_resource->ReBindBlasTensorCoreHandle(
-      static_cast<gpuStream_t>(new_stream));
-  gpu_resource->ReBindBlasTF32Handle(static_cast<gpuStream_t>(new_stream));
-  gpu_resource->ReBindSolverDnHandle(static_cast<gpuStream_t>(new_stream));
-  gpu_resource->ReBindSparseHandle(static_cast<gpuStream_t>(new_stream));
-  gpu_resource->ReBindEigenDevice(static_cast<gpuStream_t>(new_stream),
-                                  gpu_resource->Place());
-  ref_count_[new_stream]++;
-  gpu_resources_.emplace(new_stream, std::move(gpu_resource));
+  // NOTE: A stream may be used by multiple predictors; skip the resource
+  // creation if the resource for new_stream already exists.
+  bool new_stream_existed = gpu_resources_.count(new_stream) > 0;
+  if (!new_stream_existed) {
+    auto place = gpu_resources_.at(old_stream)->Place();
+    std::unique_ptr<GPUContextResource> resource{
+        new GPUContextResource(place, new_stream)};
+    gpu_resources_.emplace(new_stream, std::move(resource));
+  }
+
+  Decrease(old_stream);
+  Increase(new_stream);
 }
 
 int ResourceManager::RefCount(void* stream) const {
diff --git a/paddle/fluid/inference/api/resource_manager.h b/paddle/fluid/inference/api/resource_manager.h
index 119b5d94dc56cd65784d65699121ff762d0f3416..3b4c66d3190c9681426c1b62d0c2a38efc075dfd 100644
--- a/paddle/fluid/inference/api/resource_manager.h
+++ b/paddle/fluid/inference/api/resource_manager.h
@@ -82,16 +82,6 @@ class GPUContextResource {
   int GetGpuMaxThreadsPerBlock() const;
   std::array<unsigned int, 3> GetGpuMaxGridDimSize() const;
 
-  // If stream changes, we need to rebind all handle to new stream.
-  void ReBindStream(gpuStream_t stream);
-  void ReBindDnnHandle(gpuStream_t stream) const;
-  void ReBindBlasHandle(gpuStream_t stream) const;
-  void ReBindBlasTensorCoreHandle(gpuStream_t stream) const;
-  void ReBindBlasTF32Handle(gpuStream_t stream) const;
-  void ReBindSolverDnHandle(gpuStream_t stream) const;
-  void ReBindSparseHandle(gpuStream_t stream) const;
-  void ReBindEigenDevice(gpuStream_t stream, GPUPlace place) const;
-
  private:
   void InitGPUResource(void* stream);
   void DestroyGPUResource();
@@ -186,7 +176,7 @@ class ResourceManager {
   void DestroyGPUResource(void* stream);
   GPUContextResource* GetGPUResource(void* stream) const;
   int RefCount(void* stream) const;
-  void GpuResourceReBindStream(void* old_stream, void* new_stream);
+  void GpuResourceSwitchStream(void* old_stream, void* new_stream);
 
  private:
   void Decrease(void* stream);
diff --git a/test/cpp/inference/api/CMakeLists.txt b/test/cpp/inference/api/CMakeLists.txt
index c4045e9ca603abb27b148d20eaeb91ad50e6d028..3c310e68065f020db405d802713d53259e3cda11 100644
--- a/test/cpp/inference/api/CMakeLists.txt
+++ b/test/cpp/inference/api/CMakeLists.txt
@@ -1028,6 +1028,18 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST)
     target_link_libraries(test_analyzer_capi_exp_xpu paddle_inference_c)
   endif()
 
+  # TODO(inference): This test encounters a SEH error on Windows; we need to fix it.
+  if(NOT WIN32)
+    inference_analysis_test(
+      trt_rebind_stream_test
+      SRCS
+      trt_rebind_stream_test.cc
+      EXTRA_DEPS
+      paddle_inference_shared
+      ARGS
+      --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
+  endif()
+
   set(TRT_MODEL_QUANT_RESNET_DIR
       "${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model")
   if(NOT EXISTS ${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model.tgz)
@@ -1378,6 +1390,10 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST)
   endif()
   if(WITH_GPU AND TENSORRT_FOUND)
     set_tests_properties(trt_mobilenet_test PROPERTIES TIMEOUT 240)
+    if(NOT WIN32)
+      set_tests_properties(trt_rebind_stream_test
+                           PROPERTIES TIMEOUT 360 LABELS "RUN_TYPE=EXCLUSIVE")
+    endif()
     if(WITH_MKLDNN)
       set_tests_properties(test_analyzer_bfloat16_resnet50
                            PROPERTIES TIMEOUT 120)
diff --git a/test/cpp/inference/api/trt_rebind_stream_test.cc b/test/cpp/inference/api/trt_rebind_stream_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3a42af93427c505c680f3e6e0a42b951522db7a1
--- /dev/null
+++ b/test/cpp/inference/api/trt_rebind_stream_test.cc
@@ -0,0 +1,183 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <cuda_runtime.h>
+#include <gtest/gtest.h>
+#include <thread>
+
+#include "gflags/gflags.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "test/cpp/inference/api/tester_helper.h"
+
+namespace paddle {
+namespace inference {
+
+// TODO(inference): This case fails on Windows with a SEH error; we need to
+// fix it.
+TEST(ReBindStream_single, use_gpu) {
+  std::string model_dir = FLAGS_infer_model + "/mobilenet";
+  AnalysisConfig config;
+  config.EnableUseGpu(100, 0);
+  config.SetModel(model_dir);
+  config.EnableTensorRtEngine();
+
+  cudaStream_t stream1, stream2, stream3;
+  cudaStreamCreate(&stream1);
+  cudaStreamCreate(&stream2);
+  cudaStreamCreate(&stream3);
+
+  config.SetExecStream(stream1);
+  auto predictor = paddle_infer::CreatePredictor(config);
+  auto x_t = predictor->GetInputHandle("x");
+  x_t->Reshape({1, 3, 224, 224});
+  float x_data[3 * 224 * 224] = {0};
+  x_t->CopyFromCpu(x_data);
+  ASSERT_TRUE(predictor->Run());
+  cudaDeviceSynchronize();
+  ASSERT_TRUE(paddle_infer::experimental::InternalUtils::RunWithExternalStream(
+      predictor.get(), stream2));
+  cudaDeviceSynchronize();
+  ASSERT_TRUE(paddle_infer::experimental::InternalUtils::RunWithExternalStream(
+      predictor.get(), stream3));
+  cudaDeviceSynchronize();
+}
+
+TEST(ReBindStream_multi, use_gpu) {
+  std::string model_dir = FLAGS_infer_model + "/mobilenet";
+  AnalysisConfig config1;
+  config1.EnableUseGpu(100, 0);
+  config1.SetModel(model_dir);
+  config1.EnableTensorRtEngine();
+  AnalysisConfig config2;
+  config2.EnableUseGpu(100, 0);
+  config2.EnableTensorRtEngine();
+  config2.SetModel(model_dir);
+
+  cudaStream_t stream1, stream2, stream3;
+  cudaStreamCreate(&stream1);
+  cudaStreamCreate(&stream2);
+  cudaStreamCreate(&stream3);
+
+  config1.SetExecStream(stream1);
+  config2.SetExecStream(stream1);
+  auto predictor1 = paddle_infer::CreatePredictor(config1);
+  auto predictor2 = paddle_infer::CreatePredictor(config2);
+
+  std::vector<float> x1(3 * 224 * 224, 1.0);
+  auto x_t1 = predictor1->GetInputHandle("x");
+  x_t1->Reshape({1, 3, 224, 224});
+  x_t1->CopyFromCpu(x1.data());
+  std::vector<float> x2(3 * 224 * 224, 2.0);
+  auto x_t2 = predictor2->GetInputHandle("x");
+  x_t2->Reshape({1, 3, 224, 224});
+  x_t2->CopyFromCpu(x2.data());
+
+  ASSERT_TRUE(predictor1->Run());
+  cudaStreamSynchronize(stream1);
+  ASSERT_TRUE(predictor2->Run());
+  cudaStreamSynchronize(stream1);
+
+  ASSERT_TRUE(paddle_infer::experimental::InternalUtils::RunWithExternalStream(
+      predictor1.get(), stream2));
+  cudaDeviceSynchronize();
+  ASSERT_TRUE(paddle_infer::experimental::InternalUtils::RunWithExternalStream(
+      predictor2.get(), stream2));
+  cudaDeviceSynchronize();
+
+  ASSERT_TRUE(paddle_infer::experimental::InternalUtils::RunWithExternalStream(
+      predictor1.get(), stream3));
+  cudaStreamSynchronize(stream3);
+  ASSERT_TRUE(paddle_infer::experimental::InternalUtils::RunWithExternalStream(
+      predictor2.get(), stream3));
+  cudaStreamSynchronize(stream3);
+}
+
+TEST(SwitchStream_multi, use_gpu) {
+  std::string model_dir = FLAGS_infer_model + "/mobilenet";
+  AnalysisConfig config1;
+  config1.EnableUseGpu(100, 0);
+  config1.SetModel(model_dir);
+  AnalysisConfig config2;
+  config2.EnableUseGpu(100, 0);
+  config2.SetModel(model_dir);
+  AnalysisConfig config3;
+  config3.EnableUseGpu(100, 0);
+  config3.SetModel(model_dir);
+
+  // config1.EnableTensorRtEngine();
+  // config2.EnableTensorRtEngine();
+  // config3.EnableTensorRtEngine();
+
+  cudaStream_t stream1, stream2, stream3;
+  cudaStreamCreate(&stream1);
+  cudaStreamCreate(&stream2);
+  cudaStreamCreate(&stream3);
+
+  config1.SetExecStream(stream1);
+  config2.SetExecStream(stream1);
+  config3.SetExecStream(stream1);
+  auto predictor1 = paddle_infer::CreatePredictor(config1);
+  auto predictor2 = paddle_infer::CreatePredictor(config2);
+  auto predictor3 = paddle_infer::CreatePredictor(config3);
+
+  std::vector<float> x1(3 * 224 * 224, 1.0);
+  auto x_t1 = predictor1->GetInputHandle("x");
+  x_t1->Reshape({1, 3, 224, 224});
+  x_t1->CopyFromCpu(x1.data());
+  std::vector<float> x2(3 * 224 * 224, 2.0);
+  auto x_t2 = predictor2->GetInputHandle("x");
+  x_t2->Reshape({1, 3, 224, 224});
+  x_t2->CopyFromCpu(x2.data());
+  std::vector<float> x3(3 * 224 * 224, 2.5);
+  auto x_t3 = predictor3->GetInputHandle("x");
+  x_t3->Reshape({1, 3, 224, 224});
+  x_t3->CopyFromCpu(x3.data());
+
+  // TODO(wilber): fix.
+  // NOTE: Must run once on the master thread, but why?
+  // If this code is removed, the unit test fails.
+  ASSERT_TRUE(predictor1->Run());
+  cudaStreamSynchronize(stream1);
+  ASSERT_TRUE(predictor2->Run());
+  cudaStreamSynchronize(stream1);
+  ASSERT_TRUE(predictor3->Run());
+  cudaStreamSynchronize(stream1);
+
+  auto Run = [&](paddle_infer::Predictor* p,
+                 std::vector<cudaStream_t> streams) {
+    for (auto s : streams) {
+      paddle_infer::experimental::InternalUtils::RunWithExternalStream(p, s);
+    }
+  };
+
+  std::thread p1(Run,
+                 predictor1.get(),
+                 std::vector<cudaStream_t>{
+                     stream1, stream2, stream3, stream3, stream2, stream2});
+  std::thread p2(Run,
+                 predictor2.get(),
+                 std::vector<cudaStream_t>{
+                     stream1, stream3, stream1, stream2, stream1, stream3});
+  std::thread p3(Run,
+                 predictor3.get(),
+                 std::vector<cudaStream_t>{
+                     stream1, stream1, stream2, stream3, stream3, stream2});
+  p1.join();
+  p2.join();
+  p3.join();
+  cudaDeviceSynchronize();
+}
+
+}  // namespace inference
+}  // namespace paddle