add batch stream (#44524)

4f86092b · Wilber · GitHub · 3e1280ea · 4f86092b · 4f86092b
3 changed file
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -43,6 +43,7 @@
 #include "paddle/fluid/inference/api/paddle_analysis_config.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/api/paddle_inference_pass.h"
+#include "paddle/fluid/inference/api/resource_manager.h"
 #include "paddle/fluid/inference/utils/io_utils.h"
 #include "paddle/fluid/inference/utils/model_utils.h"
 #include "paddle/fluid/inference/utils/singleton.h"
@@ -56,6 +57,7 @@
 #include "paddle/phi/common/backend.h"
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/common/place.h"
+#include "paddle/phi/core/enforce.h"
 #include "paddle/utils/string/split.h"
 #if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
@@ -1618,8 +1620,31 @@ bool AnalysisPredictor::ZeroCopyRun() {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) {
-  LOG_FIRST_N(WARNING, 1) << "We will remove this interface in the future. "
+  if (!private_context_) {
-                             "Please use config.SetExecStream instead.";
+    PADDLE_THROW(platform::errors::Fatal(
+        "Please use config.SetExecStream to init gpu resources, and then we "
+        "will bind gpu resources to execution stream."));
+  }
+  if (stream != predictor_stream_) {
+#ifdef PADDLE_WITH_HIP
+    hipStreamSynchronize(static_cast<gpuStream_t>(predictor_stream_));
+#else
+    cudaStreamSynchronize(static_cast<gpuStream_t>(predictor_stream_));
+#endif
+    ResourceManager::Instance().GpuResourceReBindStream(predictor_stream_,
+                                                        stream);
+    predictor_stream_ = stream;
+    auto *dev_ctxs = reinterpret_cast<const std::map<
+        phi::Place,
+        std::shared_future<std::unique_ptr<phi::DeviceContext>>> *>(
+        this->GetDeviceContexts());
+    auto *dev_ctx =
+        static_cast<InferGPUContext *>(dev_ctxs->at(place_).get().get());
+    dev_ctx->SetStream(stream);
+  }
  return ZeroCopyRun();
 }
 #endif

--- a/paddle/fluid/inference/api/resource_manager.cc
+++ b/paddle/fluid/inference/api/resource_manager.cc
@@ -17,17 +17,29 @@
 #include <memory>
 #include <mutex>
 #include <unordered_map>
+#include <utility>
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
+#include "paddle/fluid/platform/device/gpu/gpu_types.h"
 #include "paddle/phi/backends/gpu/forwards.h"
 #include "paddle/phi/backends/gpu/gpu_decls.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
 #include "paddle/phi/backends/gpu/gpu_resources.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/allocator.h"
+#include "paddle/phi/core/errors.h"
 #include "paddle/phi/core/generator.h"
 #include "unsupported/Eigen/CXX11/Tensor"
+#include "paddle/fluid/platform/enforce.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/phi/backends/dynload/cublas.h"
+#include "paddle/phi/backends/dynload/cudnn.h"
+#include "paddle/phi/backends/dynload/cusolver.h"
+#include "paddle/phi/backends/dynload/cusparse.h"
+#endif  // PADDLE_WITH_CUDA
 namespace paddle {
 namespace internal {
@@ -237,6 +249,8 @@ void GPUContextResource::DestroySparseHandle() {
  phi::DestroySparseHandle(sparse_handle_);
 }
+phi::Place GPUContextResource::Place() const { return place_; }
 gpuStream_t GPUContextResource::GetStream() const { return stream_; }
 dnnHandle_t GPUContextResource::GetDnnHandle() const { return dnn_handle_; }
@@ -291,6 +305,75 @@ std::array<int, 3> GPUContextResource::GetGpuMaxGridDimSize() const {
  return max_grid_dim_size_;
 }
+void GPUContextResource::ReBindStream(gpuStream_t stream) {
+  owned_stream_ = false;
+  stream_ = stream;
+}
+void GPUContextResource::ReBindDnnHandle(gpuStream_t stream) const {
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::miopenSetStream(dnn_handle_, stream));
+#else
+  PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cudnnSetStream(dnn_handle_, stream));
+#endif
+}
+void GPUContextResource::ReBindBlasHandle(gpuStream_t stream) const {
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::rocblas_set_stream(blas_handle_, stream));
+#else
+  PADDLE_RETRY_CUDA_SUCCESS(
+      phi::dynload::cublasSetStream(blas_handle_, stream));
+#endif
+}
+void GPUContextResource::ReBindBlasTensorCoreHandle(gpuStream_t stream) const {
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::rocblas_set_stream(blas_tensor_core_handle_, stream));
+#else
+  PADDLE_RETRY_CUDA_SUCCESS(
+      phi::dynload::cublasSetStream(blas_tensor_core_handle_, stream));
+#endif
+}
+void GPUContextResource::ReBindBlasTF32Handle(gpuStream_t stream) const {
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::rocblas_set_stream(blas_tf32_tensor_core_handle_, stream));
+#else
+  PADDLE_RETRY_CUDA_SUCCESS(
+      phi::dynload::cublasSetStream(blas_tf32_tensor_core_handle_, stream));
+#endif
+}
+void GPUContextResource::ReBindSolverDnHandle(gpuStream_t stream) const {
+#ifndef PADDLE_WITH_HIP
+  PADDLE_RETRY_CUDA_SUCCESS(
+      phi::dynload::cusolverDnSetStream(solver_handle_, stream));
+#endif
+}
+void GPUContextResource::ReBindSparseHandle(gpuStream_t stream) const {
+#if defined(PADDLE_WITH_CUDA)
+// The generic APIs is supported from CUDA10.1
+#if CUDA_VERSION >= 11000
+  PADDLE_RETRY_CUDA_SUCCESS(
+      phi::dynload::cusparseSetStream(sparse_handle_, stream));
+#endif
+#endif
+}
+void GPUContextResource::ReBindEigenDevice(gpuStream_t stream,
+                                           GPUPlace place) const {
+  auto* allocator = paddle::memory::allocation::AllocatorFacade::Instance()
+                        .GetAllocator(place_)
+                        .get();
+  eigen_stream_->Reinitialize(stream, allocator, place);
+}
 #endif
 void ResourceManager::InitCPUResource() {
@@ -359,6 +442,35 @@ GPUContextResource* ResourceManager::GetGPUResource(void* stream) const {
  return gpu_resources_.at(stream).get();
 }
+void ResourceManager::GpuResourceReBindStream(void* old_stream,
+                                              void* new_stream) {
+  PADDLE_ENFORCE_EQ(
+      gpu_resources_.count(old_stream),
+      true,
+      platform::errors::InvalidArgument(
+          "The stream[%p] not found in gpu_resources.", old_stream));
+  auto gpu_resource = std::move(gpu_resources_.at(old_stream));
+  DestroyGPUResource(old_stream);
+  PADDLE_ENFORCE_EQ(
+      ref_count_.count(old_stream),
+      0,
+      platform::errors::Fatal("gpu resources rebind stream failed."));
+  gpu_resource->ReBindStream(static_cast<gpuStream_t>(new_stream));
+  gpu_resource->ReBindDnnHandle(static_cast<gpuStream_t>(new_stream));
+  gpu_resource->ReBindBlasHandle(static_cast<gpuStream_t>(new_stream));
+  gpu_resource->ReBindBlasTensorCoreHandle(
+      static_cast<gpuStream_t>(new_stream));
+  gpu_resource->ReBindBlasTF32Handle(static_cast<gpuStream_t>(new_stream));
+  gpu_resource->ReBindSolverDnHandle(static_cast<gpuStream_t>(new_stream));
+  gpu_resource->ReBindSparseHandle(static_cast<gpuStream_t>(new_stream));
+  gpu_resource->ReBindEigenDevice(static_cast<gpuStream_t>(new_stream),
+                                  gpu_resource->Place());
+  ref_count_[new_stream]++;
+  gpu_resources_.emplace(new_stream, std::move(gpu_resource));
+}
 int ResourceManager::RefCount(void* stream) const {
  if (ref_count_.count(stream) == 0) return 0;
  return ref_count_.at(stream);

--- a/paddle/fluid/inference/api/resource_manager.h
+++ b/paddle/fluid/inference/api/resource_manager.h
@@ -22,6 +22,7 @@
 #include "paddle/fluid/platform/macros.h"
 #include "paddle/phi/api/include/tensor.h"
 #include "paddle/phi/backends/cpu/forwards.h"
+#include "paddle/phi/common/place.h"
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/device/gpu/gpu_types.h"
@@ -52,6 +53,7 @@ class GPUContextResource {
 public:
  explicit GPUContextResource(const phi::Place& place, void* stream);
  ~GPUContextResource();
+  phi::Place Place() const;
  gpuStream_t GetStream() const;
  dnnHandle_t GetDnnHandle() const;
@@ -70,6 +72,16 @@ class GPUContextResource {
  int GetGpuMaxThreadsPerBlock() const;
  std::array<int, 3> GetGpuMaxGridDimSize() const;
+  // If stream changes, we need to rebind all handle to new stream.
+  void ReBindStream(gpuStream_t stream);
+  void ReBindDnnHandle(gpuStream_t stream) const;
+  void ReBindBlasHandle(gpuStream_t stream) const;
+  void ReBindBlasTensorCoreHandle(gpuStream_t stream) const;
+  void ReBindBlasTF32Handle(gpuStream_t stream) const;
+  void ReBindSolverDnHandle(gpuStream_t stream) const;
+  void ReBindSparseHandle(gpuStream_t stream) const;
+  void ReBindEigenDevice(gpuStream_t stream, GPUPlace place) const;
 private:
  void InitGPUResource(void* stream);
  void DestroyGPUResource();
@@ -138,6 +150,7 @@ class ResourceManager {
  void DestroyGPUResource(void* stream);
  GPUContextResource* GetGPUResource(void* stream) const;
  int RefCount(void* stream) const;
+  void GpuResourceReBindStream(void* old_stream, void* new_stream);
 private:
  void Decrease(void* stream);