From 4f86092b95943a3e8ee140a276179389332b8a20 Mon Sep 17 00:00:00 2001 From: Wilber Date: Fri, 22 Jul 2022 10:01:23 +0800 Subject: [PATCH] add batch stream (#44524) --- .../fluid/inference/api/analysis_predictor.cc | 29 ++++- .../fluid/inference/api/resource_manager.cc | 112 ++++++++++++++++++ paddle/fluid/inference/api/resource_manager.h | 13 ++ 3 files changed, 152 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index d1c40f0291..cc169c9307 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -43,6 +43,7 @@ #include "paddle/fluid/inference/api/paddle_analysis_config.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" +#include "paddle/fluid/inference/api/resource_manager.h" #include "paddle/fluid/inference/utils/io_utils.h" #include "paddle/fluid/inference/utils/model_utils.h" #include "paddle/fluid/inference/utils/singleton.h" @@ -56,6 +57,7 @@ #include "paddle/phi/common/backend.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/place.h" +#include "paddle/phi/core/enforce.h" #include "paddle/utils/string/split.h" #if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) @@ -1618,8 +1620,31 @@ bool AnalysisPredictor::ZeroCopyRun() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) { - LOG_FIRST_N(WARNING, 1) << "We will remove this interface in the future. " - "Please use config.SetExecStream instead."; + if (!private_context_) { + PADDLE_THROW(platform::errors::Fatal( + "Please use config.SetExecStream to init gpu resources, and then we " + "will bind gpu resources to execution stream.")); + } + + if (stream != predictor_stream_) { +#ifdef PADDLE_WITH_HIP + hipStreamSynchronize(static_cast(predictor_stream_)); +#else + cudaStreamSynchronize(static_cast(predictor_stream_)); +#endif + ResourceManager::Instance().GpuResourceReBindStream(predictor_stream_, + stream); + predictor_stream_ = stream; + + auto *dev_ctxs = reinterpret_cast>> *>( + this->GetDeviceContexts()); + auto *dev_ctx = + static_cast(dev_ctxs->at(place_).get().get()); + dev_ctx->SetStream(stream); + } + return ZeroCopyRun(); } #endif diff --git a/paddle/fluid/inference/api/resource_manager.cc b/paddle/fluid/inference/api/resource_manager.cc index 514f665509..6b3be72749 100644 --- a/paddle/fluid/inference/api/resource_manager.cc +++ b/paddle/fluid/inference/api/resource_manager.cc @@ -17,17 +17,29 @@ #include #include #include +#include #include "paddle/fluid/memory/allocation/allocator_facade.h" +#include "paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/phi/backends/gpu/forwards.h" #include "paddle/phi/backends/gpu/gpu_decls.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_resources.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/allocator.h" +#include "paddle/phi/core/errors.h" #include "paddle/phi/core/generator.h" #include "unsupported/Eigen/CXX11/Tensor" +#include "paddle/fluid/platform/enforce.h" + +#ifdef PADDLE_WITH_CUDA +#include "paddle/phi/backends/dynload/cublas.h" +#include "paddle/phi/backends/dynload/cudnn.h" +#include "paddle/phi/backends/dynload/cusolver.h" +#include "paddle/phi/backends/dynload/cusparse.h" +#endif // PADDLE_WITH_CUDA + namespace paddle { namespace internal { @@ -237,6 +249,8 @@ void GPUContextResource::DestroySparseHandle() { phi::DestroySparseHandle(sparse_handle_); } +phi::Place GPUContextResource::Place() const { return place_; } + gpuStream_t GPUContextResource::GetStream() const { return stream_; } dnnHandle_t GPUContextResource::GetDnnHandle() const { return dnn_handle_; } @@ -291,6 +305,75 @@ std::array GPUContextResource::GetGpuMaxGridDimSize() const { return max_grid_dim_size_; } +void GPUContextResource::ReBindStream(gpuStream_t stream) { + owned_stream_ = false; + stream_ = stream; +} + +void GPUContextResource::ReBindDnnHandle(gpuStream_t stream) const { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenSetStream(dnn_handle_, stream)); +#else + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cudnnSetStream(dnn_handle_, stream)); +#endif +} + +void GPUContextResource::ReBindBlasHandle(gpuStream_t stream) const { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::rocblas_set_stream(blas_handle_, stream)); +#else + PADDLE_RETRY_CUDA_SUCCESS( + phi::dynload::cublasSetStream(blas_handle_, stream)); +#endif +} + +void GPUContextResource::ReBindBlasTensorCoreHandle(gpuStream_t stream) const { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::rocblas_set_stream(blas_tensor_core_handle_, stream)); +#else + PADDLE_RETRY_CUDA_SUCCESS( + phi::dynload::cublasSetStream(blas_tensor_core_handle_, stream)); +#endif +} + +void GPUContextResource::ReBindBlasTF32Handle(gpuStream_t stream) const { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::rocblas_set_stream(blas_tf32_tensor_core_handle_, stream)); +#else + PADDLE_RETRY_CUDA_SUCCESS( + phi::dynload::cublasSetStream(blas_tf32_tensor_core_handle_, stream)); +#endif +} + +void GPUContextResource::ReBindSolverDnHandle(gpuStream_t stream) const { +#ifndef PADDLE_WITH_HIP + PADDLE_RETRY_CUDA_SUCCESS( + phi::dynload::cusolverDnSetStream(solver_handle_, stream)); +#endif +} + +void GPUContextResource::ReBindSparseHandle(gpuStream_t stream) const { +#if defined(PADDLE_WITH_CUDA) +// The generic APIs is supported from CUDA10.1 +#if CUDA_VERSION >= 11000 + PADDLE_RETRY_CUDA_SUCCESS( + phi::dynload::cusparseSetStream(sparse_handle_, stream)); +#endif +#endif +} + +void GPUContextResource::ReBindEigenDevice(gpuStream_t stream, + GPUPlace place) const { + auto* allocator = paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place_) + .get(); + eigen_stream_->Reinitialize(stream, allocator, place); +} + #endif void ResourceManager::InitCPUResource() { @@ -359,6 +442,35 @@ GPUContextResource* ResourceManager::GetGPUResource(void* stream) const { return gpu_resources_.at(stream).get(); } +void ResourceManager::GpuResourceReBindStream(void* old_stream, + void* new_stream) { + PADDLE_ENFORCE_EQ( + gpu_resources_.count(old_stream), + true, + platform::errors::InvalidArgument( + "The stream[%p] not found in gpu_resources.", old_stream)); + auto gpu_resource = std::move(gpu_resources_.at(old_stream)); + DestroyGPUResource(old_stream); + PADDLE_ENFORCE_EQ( + ref_count_.count(old_stream), + 0, + platform::errors::Fatal("gpu resources rebind stream failed.")); + + gpu_resource->ReBindStream(static_cast(new_stream)); + gpu_resource->ReBindDnnHandle(static_cast(new_stream)); + gpu_resource->ReBindBlasHandle(static_cast(new_stream)); + gpu_resource->ReBindBlasTensorCoreHandle( + static_cast(new_stream)); + gpu_resource->ReBindBlasTF32Handle(static_cast(new_stream)); + gpu_resource->ReBindSolverDnHandle(static_cast(new_stream)); + gpu_resource->ReBindSparseHandle(static_cast(new_stream)); + gpu_resource->ReBindEigenDevice(static_cast(new_stream), + gpu_resource->Place()); + + ref_count_[new_stream]++; + gpu_resources_.emplace(new_stream, std::move(gpu_resource)); +} + int ResourceManager::RefCount(void* stream) const { if (ref_count_.count(stream) == 0) return 0; return ref_count_.at(stream); diff --git a/paddle/fluid/inference/api/resource_manager.h b/paddle/fluid/inference/api/resource_manager.h index 0334540315..359b8f8973 100644 --- a/paddle/fluid/inference/api/resource_manager.h +++ b/paddle/fluid/inference/api/resource_manager.h @@ -22,6 +22,7 @@ #include "paddle/fluid/platform/macros.h" #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/backends/cpu/forwards.h" +#include "paddle/phi/common/place.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/device/gpu/gpu_types.h" @@ -52,6 +53,7 @@ class GPUContextResource { public: explicit GPUContextResource(const phi::Place& place, void* stream); ~GPUContextResource(); + phi::Place Place() const; gpuStream_t GetStream() const; dnnHandle_t GetDnnHandle() const; @@ -70,6 +72,16 @@ class GPUContextResource { int GetGpuMaxThreadsPerBlock() const; std::array GetGpuMaxGridDimSize() const; + // If stream changes, we need to rebind all handle to new stream. + void ReBindStream(gpuStream_t stream); + void ReBindDnnHandle(gpuStream_t stream) const; + void ReBindBlasHandle(gpuStream_t stream) const; + void ReBindBlasTensorCoreHandle(gpuStream_t stream) const; + void ReBindBlasTF32Handle(gpuStream_t stream) const; + void ReBindSolverDnHandle(gpuStream_t stream) const; + void ReBindSparseHandle(gpuStream_t stream) const; + void ReBindEigenDevice(gpuStream_t stream, GPUPlace place) const; + private: void InitGPUResource(void* stream); void DestroyGPUResource(); @@ -138,6 +150,7 @@ class ResourceManager { void DestroyGPUResource(void* stream); GPUContextResource* GetGPUResource(void* stream) const; int RefCount(void* stream) const; + void GpuResourceReBindStream(void* old_stream, void* new_stream); private: void Decrease(void* stream); -- GitLab