diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc
index f68e451039092373985f1292512d80d5fcfdabb1..ead53d648109de7695702682228e4017cda98fb2 100644
--- a/paddle/phi/backends/gpu/gpu_context.cc
+++ b/paddle/phi/backends/gpu/gpu_context.cc
@@ -214,23 +214,6 @@ struct GPUContext::Impl {
                            &max_grid_dim_size_);
     phi::InitStream(&stream_);
     InitEigenDevice();
-    phi::InitBlasHandle(&blas_handle_, stream_);
-#ifdef PADDLE_WITH_CUDA
-#if CUDA_VERSION >= 9000
-    phi::InitBlasHandle(&blas_tensor_core_handle_, stream_);
-    PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
-        blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH));
-#endif
-#if CUDA_VERSION >= 11000
-    phi::InitBlasHandle(&blas_tf32_tensor_core_handle_, stream_);
-    PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
-        blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH));
-#endif
-#endif
-    phi::InitBlasLtHandle(&blaslt_handle_);
-    phi::InitDnnHandle(&dnn_handle_, stream_, place_);
-    phi::InitSolverHandle(&solver_handle_, stream_);
-    phi::InitSparseHandle(&sparse_handle_, stream_);
     InitDnnWorkspace();
   }
 
@@ -246,23 +229,6 @@ struct GPUContext::Impl {
                            &max_threads_per_block_,
                            &max_grid_dim_size_);
     phi::InitStream(&stream_);
-    phi::InitBlasHandle(&blas_handle_, stream_);
-#ifdef PADDLE_WITH_CUDA
-#if CUDA_VERSION >= 9000
-    phi::InitBlasHandle(&blas_tensor_core_handle_, stream_);
-    PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
-        blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH));
-#endif
-#if CUDA_VERSION >= 11000
-    phi::InitBlasHandle(&blas_tf32_tensor_core_handle_, stream_);
-    PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
-        blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH));
-#endif
-#endif
-    phi::InitBlasLtHandle(&blaslt_handle_);
-    phi::InitDnnHandle(&dnn_handle_, stream_, place_);
-    phi::InitSolverHandle(&solver_handle_, stream_);
-    phi::InitSparseHandle(&sparse_handle_, stream_);
   }
 
   void PartialInitWithAllocator() {
@@ -356,7 +322,28 @@ struct GPUContext::Impl {
     return eigen_device_;
   }
 
-  blasHandle_t GetBlasHandle() const {
+  blasHandle_t GetBlasHandle() {
+    std::call_once(flag_blas_, [=]() {
+      if (!blas_handle_) {
+        phi::InitBlasHandle(&blas_handle_, stream_);
+      }
+#ifdef PADDLE_WITH_CUDA
+#if CUDA_VERSION >= 9000
+      if (!blas_tensor_core_handle_) {
+        phi::InitBlasHandle(&blas_tensor_core_handle_, stream_);
+        PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
+            blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH));
+      }
+#endif
+#if CUDA_VERSION >= 11000
+      if (!blas_tf32_tensor_core_handle_) {
+        phi::InitBlasHandle(&blas_tf32_tensor_core_handle_, stream_);
+        PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
+            blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH));
+      }
+#endif
+#endif
+    });
     PD_CHECK(blas_handle_ != nullptr, "the gpu blas handle is nullptr.");
     return blas_handle_;
   }
@@ -373,12 +360,18 @@ struct GPUContext::Impl {
 
   void SetBlasLtHandle(blasLtHandle_t blaslt) { blaslt_handle_ = blaslt; }
 
-  blasLtHandle_t GetBlasLtHandle() const {
+  blasLtHandle_t GetBlasLtHandle() {
+    std::call_once(flag_blaslt_, [=]() {
+      if (!blaslt_handle_) phi::InitBlasLtHandle(&blaslt_handle_);
+    });
     PD_CHECK(blaslt_handle_ != nullptr, "the gpu blasLt handle is nullptr.");
     return blaslt_handle_;
   }
 
   dnnHandle_t GetDnnHandle() {
+    std::call_once(flag_dnn_, [=]() {
+      if (!dnn_handle_) phi::InitDnnHandle(&dnn_handle_, stream_, place_);
+    });
     PD_CHECK(dnn_handle_ != nullptr, "the gpu dnn handle is nullptr.");
     return dnn_handle_;
   }
@@ -399,7 +392,10 @@ struct GPUContext::Impl {
 
   void SetDnnHandle(dnnHandle_t handle) { dnn_handle_ = handle; }
 
-  solverHandle_t GetSolverHandle() const {
+  solverHandle_t GetSolverHandle() {
+    std::call_once(flag_slover_, [=]() {
+      if (!solver_handle_) phi::InitSolverHandle(&solver_handle_, stream_);
+    });
     PD_CHECK(solver_handle_ != nullptr, "the gpu solver handle is nullptr.");
     return solver_handle_;
   }
@@ -461,8 +457,28 @@ struct GPUContext::Impl {
 #endif
   }
 
-  inline void CublasCall(
-      const std::function<void(blasHandle_t)>& callback) const {
+  inline void CublasCall(const std::function<void(blasHandle_t)>& callback) {
+    std::call_once(flag_cublas_, [=]() {
+      if (!blas_handle_) {
+        phi::InitBlasHandle(&blas_handle_, stream_);
+      }
+#ifdef PADDLE_WITH_CUDA
+#if CUDA_VERSION >= 9000
+      if (!blas_tensor_core_handle_) {
+        phi::InitBlasHandle(&blas_tensor_core_handle_, stream_);
+        PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
+            blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH));
+      }
+#endif
+#if CUDA_VERSION >= 11000
+      if (!blas_tf32_tensor_core_handle_) {
+        phi::InitBlasHandle(&blas_tf32_tensor_core_handle_, stream_);
+        PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
+            blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH));
+      }
+#endif
+#endif
+    });
     if (blas_tf32_tensor_core_handle_ != nullptr) {
       std::lock_guard<std::mutex> guard(blas_tf32_mtx_);
       callback(blas_tf32_tensor_core_handle_);
@@ -473,7 +489,26 @@ struct GPUContext::Impl {
   }
 
   inline void TensorCoreCublasCallIfAvailable(
-      const std::function<void(blasHandle_t)>& callback) const {
+      const std::function<void(blasHandle_t)>& callback) {
+    std::call_once(flag_tensorcore_cublas_, [=]() {
+      if (!blas_handle_) phi::InitBlasHandle(&blas_handle_, stream_);
+#ifdef PADDLE_WITH_CUDA
+#if CUDA_VERSION >= 9000
+      if (!blas_tensor_core_handle_) {
+        phi::InitBlasHandle(&blas_tensor_core_handle_, stream_);
+        PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
+            blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH));
+      }
+#endif
+#if CUDA_VERSION >= 11000
+      if (!blas_tf32_tensor_core_handle_) {
+        phi::InitBlasHandle(&blas_tf32_tensor_core_handle_, stream_);
+        PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
+            blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH));
+      }
+#endif
+#endif
+    });
     if (blas_tensor_core_handle_ != nullptr) {
       std::lock_guard<std::mutex> guard(blas_tensor_core_mtx_);
       callback(blas_tensor_core_handle_);
@@ -563,6 +598,13 @@ struct GPUContext::Impl {
   sparseHandle_t sparse_handle_{nullptr};
   DnnWorkspaceHandle* workspace_{nullptr};
 
+  std::once_flag flag_blas_;
+  std::once_flag flag_blaslt_;
+  std::once_flag flag_dnn_;
+  std::once_flag flag_slover_;
+  std::once_flag flag_cublas_;
+  std::once_flag flag_tensorcore_cublas_;
+
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   // NCCL communicator (single process version) for NCCL collective operations.
   // NCCL collective operations provides fast collectives over multiple GPUs
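
Note: the patch moves every handle created eagerly in Init()/PartialInitWithoutAllocator() into its getter, guarded by a per-handle std::once_flag, so a handle is only created the first time it is actually requested; the getters also drop const because they now mutate members, and the [=] capture inside a member function captures this, which is how the lambdas reach blas_handle_ and friends. The following is a minimal standalone sketch of that lazy-initialization pattern, not part of the patch: GpuContextImplSketch, FakeHandle, and the plain new are hypothetical stand-ins for GPUContext::Impl, the real handle types, and the phi::Init*Handle calls.

// Sketch only: illustrates std::call_once-based lazy handle creation under the
// assumptions named above. Build with -pthread.
#include <iostream>
#include <mutex>
#include <thread>

struct FakeHandle {  // hypothetical stand-in for blasHandle_t
  int id;
};

class GpuContextImplSketch {  // hypothetical stand-in for GPUContext::Impl
 public:
  FakeHandle* GetBlasHandle() {
    // The lambda runs at most once; concurrent callers block until it finishes
    // and then all observe the fully initialized handle.
    std::call_once(flag_blas_, [this]() {
      if (!blas_handle_) {
        blas_handle_ = new FakeHandle{42};  // real code: phi::InitBlasHandle(&blas_handle_, stream_);
        std::cout << "blas handle created lazily\n";  // printed exactly once
      }
    });
    return blas_handle_;
  }

  ~GpuContextImplSketch() { delete blas_handle_; }

 private:
  FakeHandle* blas_handle_{nullptr};
  std::once_flag flag_blas_;  // one flag per handle, mirroring flag_blas_ etc. in the diff
};

int main() {
  GpuContextImplSketch ctx;
  // Two threads race to fetch the handle; the init lambda still runs only once.
  std::thread t1([&] { ctx.GetBlasHandle(); });
  std::thread t2([&] { ctx.GetBlasHandle(); });
  t1.join();
  t2.join();
  std::cout << (ctx.GetBlasHandle() == ctx.GetBlasHandle()) << "\n";  // prints 1
}

A context that never calls GetBlasHandle() in this sketch never constructs FakeHandle at all, which is the saving the diff is after: contexts that never touch cuBLAS/cuDNN/cuSOLVER/cuSPARSE no longer pay for creating those handles at construction time.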