Unverified commit 1892a441, authored by Wilber, committed by GitHub

Inference: support lazy initialization of handles for multi-stream execution (#44563)

* Support lazy initialization of handles in multi-stream mode.

* Support lazy initialization of the Eigen device.

* Update.

* Fix CI issues.
Parent 65ad58b6
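The core idea of the change: instead of eagerly creating every GPU library handle when a context is built, the resource manager hands the context a creator callback, and the context invokes it at most once, on first use. A minimal, self-contained sketch of that pattern (hypothetical `Handle`/`Context` names, not the actual Paddle types):

```cpp
#include <functional>
#include <iostream>
#include <mutex>

using Handle = int;

class Context {
 public:
  // The resource owner registers a creator instead of a ready-made handle.
  void SetHandle(std::function<Handle*()>&& creator) {
    handle_creator_ = std::move(creator);
  }
  // The first caller pays the construction cost; later calls are free.
  Handle* GetHandle() {
    std::call_once(flag_, [&]() {
      if (!handle_ && handle_creator_) handle_ = handle_creator_();
    });
    return handle_;
  }

 private:
  Handle* handle_{nullptr};
  std::function<Handle*()> handle_creator_{nullptr};
  std::once_flag flag_;
};

int main() {
  Context ctx;
  ctx.SetHandle([]() {
    static Handle h = 42;  // stands in for an expensive cuBLAS-style init
    std::cout << "handle created\n";
    return &h;
  });
  ctx.GetHandle();  // prints "handle created"
  ctx.GetHandle();  // creator is not invoked again
  return 0;
}
```

This is why a predictor that never touches, say, the sparse library no longer pays for a cuSPARSE handle per stream.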
......@@ -415,14 +415,16 @@ void AnalysisPredictor::InitDeviceContexts() {
gpu_context->SetHostGenerator(framework::DefaultCPUGenerator().get());
gpu_context->SetStream(gpu_resource->GetStream());
gpu_context->SetBlasHandle(gpu_resource->GetBlasHandleCreator());
gpu_context->SetBlasTensorCoreHandle(
    gpu_resource->GetBlasTensorCoreHandleCreator());
gpu_context->SetBlasTF32Handle(
    gpu_resource->GetBlasTF32TensorCoreHandleCreator());
gpu_context->SetDnnHandle(gpu_resource->GetDnnHandleCreator());
gpu_context->SetSolverHandle(
    gpu_resource->GetSolverDnHandleCreator());
gpu_context->SetSparseHandle(gpu_resource->GetSparseHandleCreator());
gpu_context->SetEigenDevice(gpu_resource->GetGpuEigenDeviceCreator());
gpu_context->SetComputeCapability(
gpu_resource->GetGpuComputeCapability());
gpu_context->SetMaxThreadsPerBlock(
......
......@@ -14,6 +14,7 @@
#include "paddle/fluid/inference/api/resource_manager.h"
#include <functional>
#include <memory>
#include <mutex>
#include <unordered_map>
......@@ -150,12 +151,6 @@ void GPUContextResource::InitGPUResource(void* stream) {
}
InitGpuProperties();
InitGpuEigenDevice();
InitDnnHandle();
InitBlasHandle();
InitBlasLtHandle();
InitSolverHandle();
InitSparseHandle();
}
void GPUContextResource::DestroyGPUResource() {
......@@ -203,22 +198,6 @@ void GPUContextResource::DestroyDnnHandle() {
phi::DestroyDnnHandle(dnn_handle_);
}
void GPUContextResource::InitBlasHandle() {
phi::InitBlasHandle(&blas_handle_, stream_);
#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 9000
phi::InitBlasHandle(&blas_tensor_core_handle_, stream_);
PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH));
#endif
#if CUDA_VERSION >= 11000
phi::InitBlasHandle(&blas_tf32_tensor_core_handle_, stream_);
PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH));
#endif
#endif
}
void GPUContextResource::DestroyBlasHandle() {
phi::DestroyBlasHandle(blas_handle_);
phi::DestroyBlasHandle(blas_tensor_core_handle_);
......@@ -255,32 +234,106 @@ gpuStream_t GPUContextResource::GetStream() const { return stream_; }
dnnHandle_t GPUContextResource::GetDnnHandle() const { return dnn_handle_; }
std::function<phi::dnnHandle_t()> GPUContextResource::GetDnnHandleCreator() {
return [&]() -> phi::dnnHandle_t {
InitDnnHandle();
return dnn_handle_;
};
}
blasHandle_t GPUContextResource::GetBlasHandle() const { return blas_handle_; }
std::function<phi::blasHandle_t()> GPUContextResource::GetBlasHandleCreator() {
return [&]() -> phi::blasHandle_t {
phi::InitBlasHandle(&blas_handle_, stream_);
return blas_handle_;
};
}
blasHandle_t GPUContextResource::GetBlasTensorCoreHandle() const {
return blas_tensor_core_handle_;
}
std::function<phi::blasHandle_t()>
GPUContextResource::GetBlasTensorCoreHandleCreator() {
return [&]() -> phi::blasHandle_t {
#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 9000
phi::InitBlasHandle(&blas_tensor_core_handle_, stream_);
PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH));
#endif
#endif
return blas_tensor_core_handle_;
};
}
blasHandle_t GPUContextResource::GetBlasTF32Handle() const {
return blas_tf32_tensor_core_handle_;
}
std::function<phi::blasHandle_t()>
GPUContextResource::GetBlasTF32TensorCoreHandleCreator() {
return [&]() -> phi::blasHandle_t {
#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 11000
phi::InitBlasHandle(&blas_tf32_tensor_core_handle_, stream_);
PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH));
#endif
#endif
return blas_tf32_tensor_core_handle_;
};
}
blasLtHandle_t GPUContextResource::GetBlasLtHandle() const {
return blaslt_handle_;
}
std::function<phi::blasLtHandle_t()>
GPUContextResource::GetBlasLtHandleCreator() {
return [&]() {
InitBlasLtHandle();
return blaslt_handle_;
};
}
phi::solverHandle_t GPUContextResource::GetSolverDnHandle() const {
return solver_handle_;
}
std::function<phi::solverHandle_t()>
GPUContextResource::GetSolverDnHandleCreator() {
return [&]() {
InitSolverHandle();
return solver_handle_;
};
}
phi::sparseHandle_t GPUContextResource::GetSparseHandle() const {
return sparse_handle_;
}
std::function<phi::sparseHandle_t()>
GPUContextResource::GetSparseHandleCreator() {
return [&]() {
InitSparseHandle();
return sparse_handle_;
};
}
Eigen::GpuDevice* GPUContextResource::GetGpuEigenDevice() const {
return gpu_eigen_device_.get();
}
std::function<Eigen::GpuDevice*()>
GPUContextResource::GetGpuEigenDeviceCreator() {
return [&]() {
InitGpuEigenDevice();
return gpu_eigen_device_.get();
};
}
int GPUContextResource::GetGpuComputeCapability() const {
return compute_capability_;
}
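Note that each Get*HandleCreator() above returns a lambda that captures the resource object by reference (`[&]` implicitly captures `this`), so the GPUContextResource must outlive any context that stores its creators. A small sketch of that capture behavior, under simplified types:

```cpp
#include <functional>
#include <iostream>

// Simplified stand-in for GPUContextResource: the returned creator captures
// `this`, so it lazily initializes and returns the live member on first call.
class Resource {
 public:
  std::function<int*()> GetHandleCreator() {
    return [&]() -> int* {
      if (handle_ == 0) handle_ = 7;  // first-use initialization
      return &handle_;
    };
  }

 private:
  int handle_{0};
};

int main() {
  Resource res;  // must outlive the creator handed to the context
  std::function<int*()> creator = res.GetHandleCreator();
  std::cout << *creator() << "\n";  // 7, created on first invocation
  return 0;
}
```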
......@@ -311,67 +364,82 @@ void GPUContextResource::ReBindStream(gpuStream_t stream) {
}
void GPUContextResource::ReBindDnnHandle(gpuStream_t stream) const {
if (dnn_handle_) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
    phi::dynload::miopenSetStream(dnn_handle_, stream));
#else
PADDLE_RETRY_CUDA_SUCCESS(
    phi::dynload::cudnnSetStream(dnn_handle_, stream));
#endif
}
}
void GPUContextResource::ReBindBlasHandle(gpuStream_t stream) const {
if (blas_handle_) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
    phi::dynload::rocblas_set_stream(blas_handle_, stream));
#else
PADDLE_RETRY_CUDA_SUCCESS(
    phi::dynload::cublasSetStream(blas_handle_, stream));
#endif
}
}
void GPUContextResource::ReBindBlasTensorCoreHandle(gpuStream_t stream) const {
if (blas_tensor_core_handle_) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
    phi::dynload::rocblas_set_stream(blas_tensor_core_handle_, stream));
#else
PADDLE_RETRY_CUDA_SUCCESS(
    phi::dynload::cublasSetStream(blas_tensor_core_handle_, stream));
#endif
}
}
void GPUContextResource::ReBindBlasTF32Handle(gpuStream_t stream) const {
if (blas_tf32_tensor_core_handle_) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_set_stream(
    blas_tf32_tensor_core_handle_, stream));
#else
PADDLE_RETRY_CUDA_SUCCESS(
    phi::dynload::cublasSetStream(blas_tf32_tensor_core_handle_, stream));
#endif
}
}
void GPUContextResource::ReBindSolverDnHandle(gpuStream_t stream) const {
if (solver_handle_) {
#ifndef PADDLE_WITH_HIP
PADDLE_RETRY_CUDA_SUCCESS(
    phi::dynload::cusolverDnSetStream(solver_handle_, stream));
#endif
}
}
void GPUContextResource::ReBindSparseHandle(gpuStream_t stream) const {
if (sparse_handle_) {
#if defined(PADDLE_WITH_CUDA)
// The generic APIs are supported from CUDA 10.1.
#if CUDA_VERSION >= 11000
PADDLE_RETRY_CUDA_SUCCESS(
    phi::dynload::cusparseSetStream(sparse_handle_, stream));
#endif
#endif
}
}
void GPUContextResource::ReBindEigenDevice(gpuStream_t stream,
GPUPlace place) const {
if (eigen_stream_) {
  auto* allocator = paddle::memory::allocation::AllocatorFacade::Instance()
                        .GetAllocator(place_)
                        .get();
  eigen_stream_->Reinitialize(stream, allocator, place);
}
}
#endif
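With initialization now deferred, the ReBind* methods must tolerate members that were never created; the `if (eigen_stream_)` guard above is one instance. A reduced sketch of the idea (hypothetical `EigenStream`/`Resource` types):

```cpp
#include <iostream>
#include <memory>

// Reduced sketch: with lazy init, rebinding a stream must be a no-op for
// members that have not been created yet (hypothetical types).
struct EigenStream {
  void Reinitialize(int stream) {
    std::cout << "rebound to stream " << stream << "\n";
  }
};

struct Resource {
  void ReBindEigenDevice(int stream) const {
    if (eigen_stream_) {  // guard: may still be uninitialized
      eigen_stream_->Reinitialize(stream);
    }
  }
  std::unique_ptr<EigenStream> eigen_stream_;
};

int main() {
  Resource res;
  res.ReBindEigenDevice(3);  // safe no-op: eigen stream not created yet
  return 0;
}
```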
......
......@@ -55,6 +55,15 @@ class GPUContextResource {
~GPUContextResource();
phi::Place Place() const;
std::function<phi::dnnHandle_t()> GetDnnHandleCreator();
std::function<phi::blasHandle_t()> GetBlasHandleCreator();
std::function<phi::blasHandle_t()> GetBlasTensorCoreHandleCreator();
std::function<phi::blasHandle_t()> GetBlasTF32TensorCoreHandleCreator();
std::function<phi::blasLtHandle_t()> GetBlasLtHandleCreator();
std::function<phi::solverHandle_t()> GetSolverDnHandleCreator();
std::function<phi::sparseHandle_t()> GetSparseHandleCreator();
std::function<Eigen::GpuDevice*()> GetGpuEigenDeviceCreator();
gpuStream_t GetStream() const;
dnnHandle_t GetDnnHandle() const;
blasHandle_t GetBlasHandle() const;
......@@ -89,7 +98,6 @@ class GPUContextResource {
void InitGpuEigenDevice();
void InitDnnHandle();
void DestroyDnnHandle();
void InitBlasHandle();
void DestroyBlasHandle();
void InitBlasLtHandle();
void DestroyBlasLtHandle();
......
......@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/api/all.h"
#include "paddle/phi/core/ddim.h"
namespace paddle {
namespace operators {
......
......@@ -213,7 +213,6 @@ struct GPUContext::Impl {
&max_threads_per_block_,
&max_grid_dim_size_);
phi::InitStream(&stream_);
InitEigenDevice();
InitDnnWorkspace();
}
......@@ -234,7 +233,6 @@ struct GPUContext::Impl {
void PartialInitWithAllocator() {
owned_ = true;
backends::gpu::GPUDeviceGuard guard(place_.device);
InitEigenDevice();
InitDnnWorkspace();
}
......@@ -317,27 +315,49 @@ struct GPUContext::Impl {
void SetEigenDevice(Eigen::GpuDevice* device) { eigen_device_ = device; }
void SetEigenDevice(std::function<Eigen::GpuDevice*()>&& creator) {
  eigen_device_creator_ = std::move(creator);
}

Eigen::GpuDevice* eigen_device() {
std::call_once(flag_eigen_device_, [&]() {
if (!eigen_device_) {
if (!eigen_device_creator_)
InitEigenDevice();
else
eigen_device_ = eigen_device_creator_();
}
});
PD_CHECK(eigen_device_ != nullptr, "the gpu eigen_device is nullptr.");
return eigen_device_;
}
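The getter keeps the eager path as a fallback: if no creator was registered, it initializes the member itself; otherwise it defers to the creator. A compact sketch of that creator-or-default shape (simplified: `int*` stands in for `Eigen::GpuDevice*`):

```cpp
#include <functional>
#include <mutex>

// Sketch of the creator-or-default lazy getter used throughout this Impl.
struct LazySlot {
  int* Get() {
    std::call_once(flag_, [&]() {
      if (!value_) {
        if (!creator_)
          value_ = Default();   // built-in eager path, kept as fallback
        else
          value_ = creator_();  // externally supplied lazy path
      }
    });
    return value_;
  }
  static int* Default() {
    static int d = 1;
    return &d;
  }
  int* value_{nullptr};
  std::function<int*()> creator_{nullptr};
  std::once_flag flag_;
};

int main() {
  LazySlot slot;
  return *slot.Get() == 1 ? 0 : 1;  // no creator registered: Default() runs
}
```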
blasHandle_t GetBlasHandle() {
std::call_once(flag_blas_, [&]() {
  if (!blas_handle_) {
    if (!blas_handle_creator_)
      phi::InitBlasHandle(&blas_handle_, stream_);
    else
      blas_handle_ = blas_handle_creator_();
  }
#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 9000
if (!blas_tensor_core_handle_) {
if (!blas_tensor_core_handle_creator_)
  phi::InitBlasHandle(&blas_tensor_core_handle_, stream_);
else
  blas_tensor_core_handle_ = blas_tensor_core_handle_creator_();
PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH));
}
#endif
#if CUDA_VERSION >= 11000
if (!blas_tf32_tensor_core_handle_) {
if (!blas_tf32_tensor_core_handle_creator_)
  phi::InitBlasHandle(&blas_tf32_tensor_core_handle_, stream_);
else
  blas_tf32_tensor_core_handle_ =
      blas_tf32_tensor_core_handle_creator_();
PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH));
}
......@@ -350,27 +370,53 @@ struct GPUContext::Impl {
void SetBlasHandle(blasHandle_t blas) { blas_handle_ = blas; }
void SetBlasHandle(std::function<blasHandle_t()>&& handle_creator) {
blas_handle_creator_ = std::move(handle_creator);
}
void SetBlasTensorCoreHandle(blasHandle_t handle) {
blas_tensor_core_handle_ = handle;
}
void SetBlasTensorCoreHandle(std::function<blasHandle_t()>&& handle_creator) {
blas_tensor_core_handle_creator_ = std::move(handle_creator);
}
void SetBlasTF32Handle(blasHandle_t handle) {
blas_tf32_tensor_core_handle_ = handle;
}
void SetBlasTF32Handle(std::function<blasHandle_t()>&& handle_creator) {
blas_tf32_tensor_core_handle_creator_ = std::move(handle_creator);
}
void SetBlasLtHandle(blasLtHandle_t blaslt) { blaslt_handle_ = blaslt; }
void SetBlasLtHandle(std::function<blasLtHandle_t()>&& handle_creator) {
blaslt_handle_creator_ = std::move(handle_creator);
}
blasLtHandle_t GetBlasLtHandle() {
std::call_once(flag_blaslt_, [&]() {
  if (!blaslt_handle_) {
    if (!blaslt_handle_creator_)
      phi::InitBlasLtHandle(&blaslt_handle_);
    else
      blaslt_handle_ = blaslt_handle_creator_();
  }
});
PD_CHECK(blaslt_handle_ != nullptr, "the gpu blasLt handle is nullptr.");
return blaslt_handle_;
}
dnnHandle_t GetDnnHandle() {
std::call_once(flag_dnn_, [&]() {
  if (!dnn_handle_) {
    if (!dnn_handle_creator_)
      phi::InitDnnHandle(&dnn_handle_, stream_, place_);
    else
      dnn_handle_ = dnn_handle_creator_();
  }
});
PD_CHECK(dnn_handle_ != nullptr, "the gpu dnn handle is nullptr.");
return dnn_handle_;
......@@ -392,9 +438,18 @@ struct GPUContext::Impl {
void SetDnnHandle(dnnHandle_t handle) { dnn_handle_ = handle; }
void SetDnnHandle(std::function<dnnHandle_t()>&& handle_creator) {
dnn_handle_creator_ = std::move(handle_creator);
}
solverHandle_t GetSolverHandle() {
std::call_once(flag_solver_, [&]() {
  if (!solver_handle_) {
    if (!solver_handle_creator_)
      phi::InitSolverHandle(&solver_handle_, stream_);
    else
      solver_handle_ = solver_handle_creator_();
  }
});
PD_CHECK(solver_handle_ != nullptr, "the gpu solver handle is nullptr.");
return solver_handle_;
......@@ -402,9 +457,18 @@ struct GPUContext::Impl {
void SetSolverHandle(solverHandle_t handle) { solver_handle_ = handle; }
void SetSolverHandle(std::function<solverHandle_t()>&& handle_creator) {
solver_handle_creator_ = std::move(handle_creator);
}
sparseHandle_t GetSparseHandle() {
std::call_once(flag_sparse_, [&]() {
  if (!sparse_handle_) {
    if (!sparse_handle_creator_)
      phi::InitSparseHandle(&sparse_handle_, stream_);
    else
      sparse_handle_ = sparse_handle_creator_();
  }
});
PD_CHECK(sparse_handle_ != nullptr, "the gpu sparse handle is nullptr.");
return sparse_handle_;
......@@ -412,6 +476,10 @@ struct GPUContext::Impl {
void SetSparseHandle(sparseHandle_t handle) { sparse_handle_ = handle; }
void SetSparseHandle(std::function<sparseHandle_t()>&& handle_creator) {
sparse_handle_creator_ = std::move(handle_creator);
}
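Each handle now has two Set* overloads on the context: one taking a ready handle, one taking a deferred creator, and ordinary overload resolution picks between them. A toy illustration (hypothetical `Ctx` type, not the real GPUContext):

```cpp
#include <functional>
#include <iostream>

// Toy illustration of the dual Set* overloads: a raw handle selects the
// eager overload, a callable selects the lazy one.
struct Ctx {
  void SetHandle(int* h) {
    std::cout << "eager overload\n";
    handle_ = h;
  }
  void SetHandle(std::function<int*()>&& creator) {
    std::cout << "lazy overload\n";
    creator_ = std::move(creator);
  }
  int* handle_{nullptr};
  std::function<int*()> creator_{nullptr};
};

int main() {
  static int h = 0;
  Ctx ctx;
  ctx.SetHandle(&h);                           // picks the eager overload
  ctx.SetHandle([]() -> int* { return &h; });  // picks the lazy overload
  return 0;
}
```

Keeping both overloads lets existing callers that already own a handle continue to work unchanged.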
void Wait() const {
#ifdef PADDLE_WITH_HIP
hipError_t e_sync = hipSuccess;
......@@ -461,21 +529,31 @@ struct GPUContext::Impl {
}
inline void CublasCall(const std::function<void(blasHandle_t)>& callback) {
std::call_once(flag_cublas_, [&]() {
  if (!blas_handle_) {
    if (!blas_handle_creator_)
      phi::InitBlasHandle(&blas_handle_, stream_);
    else
      blas_handle_ = blas_handle_creator_();
}
#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 9000
if (!blas_tensor_core_handle_) {
if (!blas_tensor_core_handle_creator_)
  phi::InitBlasHandle(&blas_tensor_core_handle_, stream_);
else
  blas_tensor_core_handle_ = blas_tensor_core_handle_creator_();
PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH));
}
#endif
#if CUDA_VERSION >= 11000
if (!blas_tf32_tensor_core_handle_) {
if (!blas_tf32_tensor_core_handle_creator_)
  phi::InitBlasHandle(&blas_tf32_tensor_core_handle_, stream_);
else
  blas_tf32_tensor_core_handle_ =
      blas_tf32_tensor_core_handle_creator_();
PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH));
}
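For reference, the math-mode setup applied to the lazily created cuBLAS handles above corresponds to the standard cuBLAS calls. A standalone CUDA sketch (build with nvcc and link -lcublas):

```cpp
#include <cstdio>
#include <cublas_v2.h>
#include <cuda.h>

int main() {
  cublasHandle_t handle = nullptr;
  if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) return 1;
#if CUDA_VERSION >= 11000
  // Opt this handle into TF32 tensor-core math for FP32 GEMMs (Ampere+),
  // mirroring CUBLAS_TF32_TENSOR_OP_MATH in the diff above.
  cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH);
#elif CUDA_VERSION >= 9000
  // Older toolkits: enable FP16 tensor-core math instead.
  cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH);
#endif
  std::printf("cuBLAS handle configured\n");
  cublasDestroy(handle);
  return 0;
}
```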
......@@ -493,19 +571,31 @@ struct GPUContext::Impl {
inline void TensorCoreCublasCallIfAvailable(
const std::function<void(blasHandle_t)>& callback) {
std::call_once(flag_tensorcore_cublas_, [&]() {
  if (!blas_handle_) {
    if (!blas_handle_creator_)
      phi::InitBlasHandle(&blas_handle_, stream_);
    else
      blas_handle_ = blas_handle_creator_();
  }
#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 9000
if (!blas_tensor_core_handle_) {
if (!blas_tensor_core_handle_creator_)
  phi::InitBlasHandle(&blas_tensor_core_handle_, stream_);
else
  blas_tensor_core_handle_ = blas_tensor_core_handle_creator_();
PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH));
}
#endif
#if CUDA_VERSION >= 11000
if (!blas_tf32_tensor_core_handle_) {
if (!blas_tf32_tensor_core_handle_creator_)
  phi::InitBlasHandle(&blas_tf32_tensor_core_handle_, stream_);
else
  blas_tf32_tensor_core_handle_ =
      blas_tf32_tensor_core_handle_creator_();
PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH));
}
......@@ -523,9 +613,12 @@ struct GPUContext::Impl {
inline void CusparseCall(
const std::function<void(sparseHandle_t)>& callback) {
std::call_once(flag_sparse_, [&]() {
  if (!sparse_handle_) {
    if (!sparse_handle_creator_)
      phi::InitSparseHandle(&sparse_handle_, stream_);
    else
      sparse_handle_ = sparse_handle_creator_();
}
});
std::lock_guard<std::mutex> guard(sparse_mtx_);
......@@ -597,13 +690,21 @@ struct GPUContext::Impl {
gpuStream_t stream_{nullptr};
Eigen::GpuDevice* eigen_device_{nullptr};
std::function<Eigen::GpuDevice*()> eigen_device_creator_{nullptr};
blasHandle_t blas_handle_{nullptr};
std::function<blasHandle_t()> blas_handle_creator_{nullptr};
blasHandle_t blas_tensor_core_handle_{nullptr};
std::function<blasHandle_t()> blas_tensor_core_handle_creator_{nullptr};
blasHandle_t blas_tf32_tensor_core_handle_{nullptr};
std::function<blasHandle_t()> blas_tf32_tensor_core_handle_creator_{nullptr};
blasLtHandle_t blaslt_handle_{nullptr};
std::function<blasLtHandle_t()> blaslt_handle_creator_{nullptr};
dnnHandle_t dnn_handle_{nullptr};
std::function<dnnHandle_t()> dnn_handle_creator_{nullptr};
solverHandle_t solver_handle_{nullptr};
std::function<solverHandle_t()> solver_handle_creator_{nullptr};
sparseHandle_t sparse_handle_{nullptr};
std::function<sparseHandle_t()> sparse_handle_creator_{nullptr};
DnnWorkspaceHandle* workspace_{nullptr};
std::once_flag flag_sparse_;
......@@ -613,6 +714,7 @@ struct GPUContext::Impl {
std::once_flag flag_solver_;
std::once_flag flag_cublas_;
std::once_flag flag_tensorcore_cublas_;
std::once_flag flag_eigen_device_;
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
// NCCL communicator (single process version) for NCCL collective operations.
......@@ -752,34 +854,66 @@ void GPUContext::SetEigenDevice(Eigen::GpuDevice* device) {
impl_->SetEigenDevice(device);
}
void GPUContext::SetEigenDevice(std::function<Eigen::GpuDevice*()>&& creator) {
impl_->SetEigenDevice(std::move(creator));
}
void GPUContext::SetBlasHandle(blasHandle_t blas) {
impl_->SetBlasHandle(blas);
}
void GPUContext::SetBlasHandle(std::function<blasHandle_t()>&& func) {
impl_->SetBlasHandle(std::move(func));
}
void GPUContext::SetBlasTensorCoreHandle(blasHandle_t handle) {
impl_->SetBlasTensorCoreHandle(handle);
}
void GPUContext::SetBlasTensorCoreHandle(std::function<blasHandle_t()>&& func) {
impl_->SetBlasTensorCoreHandle(std::move(func));
}
void GPUContext::SetBlasTF32Handle(blasHandle_t handle) {
impl_->SetBlasTF32Handle(handle);
}
void GPUContext::SetBlasTF32Handle(std::function<blasHandle_t()>&& func) {
impl_->SetBlasTF32Handle(std::move(func));
}
void GPUContext::SetBlasLtHandle(blasLtHandle_t blaslt) {
impl_->SetBlasLtHandle(blaslt);
}
void GPUContext::SetBlasLtHandle(std::function<blasLtHandle_t()>&& func) {
impl_->SetBlasLtHandle(std::move(func));
}
void GPUContext::SetDnnHandle(dnnHandle_t handle) {
impl_->SetDnnHandle(handle);
}
void GPUContext::SetDnnHandle(std::function<dnnHandle_t()>&& func) {
impl_->SetDnnHandle(std::move(func));
}
void GPUContext::SetSolverHandle(solverHandle_t handle) {
impl_->SetSolverHandle(handle);
}
void GPUContext::SetSolverHandle(std::function<solverHandle_t()>&& func) {
impl_->SetSolverHandle(std::move(func));
}
void GPUContext::SetSparseHandle(sparseHandle_t handle) {
impl_->SetSparseHandle(handle);
}
void GPUContext::SetSparseHandle(std::function<sparseHandle_t()>&& func) {
impl_->SetSparseHandle(std::move(func));
}
void GPUContext::SetDnnWorkspaceHandle(DnnWorkspaceHandle* handle) {
impl_->workspace_ = handle;
}
......
......@@ -197,20 +197,28 @@ class PADDLE_API GPUContext : public DeviceContext {
void SetStream(gpuStream_t);
void SetEigenDevice(Eigen::GpuDevice*);
void SetEigenDevice(std::function<Eigen::GpuDevice*()>&&);
void SetBlasHandle(blasHandle_t);
void SetBlasHandle(std::function<blasHandle_t()>&&);
void SetBlasTensorCoreHandle(blasHandle_t);
void SetBlasTensorCoreHandle(std::function<blasHandle_t()>&&);
void SetBlasTF32Handle(blasHandle_t);
void SetBlasTF32Handle(std::function<blasHandle_t()>&&);
void SetBlasLtHandle(blasLtHandle_t);
void SetBlasLtHandle(std::function<blasLtHandle_t()>&&);
void SetDnnHandle(dnnHandle_t);
void SetDnnHandle(std::function<dnnHandle_t()>&&);
void SetSolverHandle(solverHandle_t);
void SetSolverHandle(std::function<solverHandle_t()>&&);
void SetSparseHandle(sparseHandle_t);
void SetSparseHandle(std::function<sparseHandle_t()>&&);
void SetDnnWorkspaceHandle(DnnWorkspaceHandle*);
......