diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc
index f68e451039092373985f1292512d80d5fcfdabb1..ead53d648109de7695702682228e4017cda98fb2 100644
--- a/paddle/phi/backends/gpu/gpu_context.cc
+++ b/paddle/phi/backends/gpu/gpu_context.cc
@@ -214,23 +214,6 @@ struct GPUContext::Impl {
                            &max_grid_dim_size_);
     phi::InitStream(&stream_);
     InitEigenDevice();
-    phi::InitBlasHandle(&blas_handle_, stream_);
-#ifdef PADDLE_WITH_CUDA
-#if CUDA_VERSION >= 9000
-    phi::InitBlasHandle(&blas_tensor_core_handle_, stream_);
-    PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
-        blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH));
-#endif
-#if CUDA_VERSION >= 11000
-    phi::InitBlasHandle(&blas_tf32_tensor_core_handle_, stream_);
-    PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
-        blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH));
-#endif
-#endif
-    phi::InitBlasLtHandle(&blaslt_handle_);
-    phi::InitDnnHandle(&dnn_handle_, stream_, place_);
-    phi::InitSolverHandle(&solver_handle_, stream_);
-    phi::InitSparseHandle(&sparse_handle_, stream_);
     InitDnnWorkspace();
   }
 
@@ -246,23 +229,6 @@ struct GPUContext::Impl {
                            &max_threads_per_block_,
                            &max_grid_dim_size_);
     phi::InitStream(&stream_);
-    phi::InitBlasHandle(&blas_handle_, stream_);
-#ifdef PADDLE_WITH_CUDA
-#if CUDA_VERSION >= 9000
-    phi::InitBlasHandle(&blas_tensor_core_handle_, stream_);
-    PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
-        blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH));
-#endif
-#if CUDA_VERSION >= 11000
-    phi::InitBlasHandle(&blas_tf32_tensor_core_handle_, stream_);
-    PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
-        blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH));
-#endif
-#endif
-    phi::InitBlasLtHandle(&blaslt_handle_);
-    phi::InitDnnHandle(&dnn_handle_, stream_, place_);
-    phi::InitSolverHandle(&solver_handle_, stream_);
-    phi::InitSparseHandle(&sparse_handle_, stream_);
   }
 
   void PartialInitWithAllocator() {
@@ -356,7 +322,28 @@ struct GPUContext::Impl {
     return eigen_device_;
   }
 
-  blasHandle_t GetBlasHandle() const {
+  blasHandle_t GetBlasHandle() {
+    std::call_once(flag_blas_, [=]() {
+      if (!blas_handle_) {
+        phi::InitBlasHandle(&blas_handle_, stream_);
+      }
+#ifdef PADDLE_WITH_CUDA
+#if CUDA_VERSION >= 9000
+      if (!blas_tensor_core_handle_) {
+        phi::InitBlasHandle(&blas_tensor_core_handle_, stream_);
+        PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
+            blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH));
+      }
+#endif
+#if CUDA_VERSION >= 11000
+      if (!blas_tf32_tensor_core_handle_) {
+        phi::InitBlasHandle(&blas_tf32_tensor_core_handle_, stream_);
+        PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
+            blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH));
+      }
+#endif
+#endif
+    });
     PD_CHECK(blas_handle_ != nullptr, "the gpu blas handle is nullptr.");
     return blas_handle_;
   }
@@ -373,12 +360,18 @@ struct GPUContext::Impl {
 
   void SetBlasLtHandle(blasLtHandle_t blaslt) { blaslt_handle_ = blaslt; }
 
-  blasLtHandle_t GetBlasLtHandle() const {
+  blasLtHandle_t GetBlasLtHandle() {
+    std::call_once(flag_blaslt_, [=]() {
+      if (!blaslt_handle_) phi::InitBlasLtHandle(&blaslt_handle_);
+    });
     PD_CHECK(blaslt_handle_ != nullptr, "the gpu blasLt handle is nullptr.");
     return blaslt_handle_;
   }
 
   dnnHandle_t GetDnnHandle() {
+    std::call_once(flag_dnn_, [=]() {
+      if (!dnn_handle_) phi::InitDnnHandle(&dnn_handle_, stream_, place_);
+    });
     PD_CHECK(dnn_handle_ != nullptr, "the gpu dnn handle is nullptr.");
     return dnn_handle_;
   }
@@ -399,7 +392,10 @@ struct GPUContext::Impl {
 
   void SetDnnHandle(dnnHandle_t handle) { dnn_handle_ = handle; }
 
-  solverHandle_t GetSolverHandle() const {
+  solverHandle_t GetSolverHandle() {
+    std::call_once(flag_slover_, [=]() {
+      if (!solver_handle_) phi::InitSolverHandle(&solver_handle_, stream_);
+    });
     PD_CHECK(solver_handle_ != nullptr, "the gpu solver handle is nullptr.");
     return solver_handle_;
   }
@@ -461,8 +457,28 @@ struct GPUContext::Impl {
 #endif
   }
 
-  inline void CublasCall(
-      const std::function<void(blasHandle_t)>& callback) const {
+  inline void CublasCall(const std::function<void(blasHandle_t)>& callback) {
+    std::call_once(flag_cublas_, [=]() {
+      if (!blas_handle_) {
+        phi::InitBlasHandle(&blas_handle_, stream_);
+      }
+#ifdef PADDLE_WITH_CUDA
+#if CUDA_VERSION >= 9000
+      if (!blas_tensor_core_handle_) {
+        phi::InitBlasHandle(&blas_tensor_core_handle_, stream_);
+        PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
+            blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH));
+      }
+#endif
+#if CUDA_VERSION >= 11000
+      if (!blas_tf32_tensor_core_handle_) {
+        phi::InitBlasHandle(&blas_tf32_tensor_core_handle_, stream_);
+        PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
+            blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH));
+      }
+#endif
+#endif
+    });
     if (blas_tf32_tensor_core_handle_ != nullptr) {
       std::lock_guard<std::mutex> guard(blas_tf32_mtx_);
       callback(blas_tf32_tensor_core_handle_);
@@ -473,7 +489,26 @@ struct GPUContext::Impl {
   }
 
   inline void TensorCoreCublasCallIfAvailable(
-      const std::function<void(blasHandle_t)>& callback) const {
+      const std::function<void(blasHandle_t)>& callback) {
+    std::call_once(flag_tensorcore_cublas_, [=]() {
+      if (!blas_handle_) phi::InitBlasHandle(&blas_handle_, stream_);
+#ifdef PADDLE_WITH_CUDA
+#if CUDA_VERSION >= 9000
+      if (!blas_tensor_core_handle_) {
+        phi::InitBlasHandle(&blas_tensor_core_handle_, stream_);
+        PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
+            blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH));
+      }
+#endif
+#if CUDA_VERSION >= 11000
+      if (!blas_tf32_tensor_core_handle_) {
+        phi::InitBlasHandle(&blas_tf32_tensor_core_handle_, stream_);
+        PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
+            blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH));
+      }
+#endif
+#endif
+    });
     if (blas_tensor_core_handle_ != nullptr) {
       std::lock_guard<std::mutex> guard(blas_tensor_core_mtx_);
       callback(blas_tensor_core_handle_);
@@ -563,6 +598,13 @@ struct GPUContext::Impl {
   sparseHandle_t sparse_handle_{nullptr};
   DnnWorkspaceHandle* workspace_{nullptr};
 
+  std::once_flag flag_blas_;
+  std::once_flag flag_blaslt_;
+  std::once_flag flag_dnn_;
+  std::once_flag flag_slover_;
+  std::once_flag flag_cublas_;
+  std::once_flag flag_tensorcore_cublas_;
+
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   // NCCL communicator (single process version) for NCCL collective operations.
   // NCCL collective operations provides fast collectives over multiple GPUs
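
Note: the patch moves every handle created eagerly in Init()/PartialInitWithoutAllocator() into its getter, guarded by a per-handle std::once_flag, so a handle is only created the first time it is actually requested; the getters also drop const because they now mutate members, and the [=] capture inside a member function captures this, which is how the lambdas reach blas_handle_ and friends. The following is a minimal standalone sketch of that lazy-initialization pattern, not part of the patch: GpuContextImplSketch, FakeHandle, and the plain new are hypothetical stand-ins for GPUContext::Impl, the real handle types, and the phi::Init*Handle calls.

// Sketch only: illustrates std::call_once-based lazy handle creation under the
// assumptions named above. Build with -pthread.
#include <iostream>
#include <mutex>
#include <thread>

struct FakeHandle {  // hypothetical stand-in for blasHandle_t
  int id;
};

class GpuContextImplSketch {  // hypothetical stand-in for GPUContext::Impl
 public:
  FakeHandle* GetBlasHandle() {
    // The lambda runs at most once; concurrent callers block until it finishes
    // and then all observe the fully initialized handle.
    std::call_once(flag_blas_, [this]() {
      if (!blas_handle_) {
        blas_handle_ = new FakeHandle{42};  // real code: phi::InitBlasHandle(&blas_handle_, stream_);
        std::cout << "blas handle created lazily\n";  // printed exactly once
      }
    });
    return blas_handle_;
  }

  ~GpuContextImplSketch() { delete blas_handle_; }

 private:
  FakeHandle* blas_handle_{nullptr};
  std::once_flag flag_blas_;  // one flag per handle, mirroring flag_blas_ etc. in the diff
};

int main() {
  GpuContextImplSketch ctx;
  // Two threads race to fetch the handle; the init lambda still runs only once.
  std::thread t1([&] { ctx.GetBlasHandle(); });
  std::thread t2([&] { ctx.GetBlasHandle(); });
  t1.join();
  t2.join();
  std::cout << (ctx.GetBlasHandle() == ctx.GetBlasHandle()) << "\n";  // prints 1
}

A context that never calls GetBlasHandle() in this sketch never constructs FakeHandle at all, which is the saving the diff is after: contexts that never touch cuBLAS/cuDNN/cuSOLVER/cuSPARSE no longer pay for creating those handles at construction time.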