From 5c59d2139eeef38343095a495a6caf186bdfe184 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?= <39303645+Shixiaowei02@users.noreply.github.com> Date: Wed, 1 Apr 2020 17:19:35 +0800 Subject: [PATCH] reverts the commit 23177, test=develop (#23363) --- paddle/fluid/platform/CMakeLists.txt | 3 +- paddle/fluid/platform/device_context.cc | 108 +++++++---- paddle/fluid/platform/device_context.h | 204 +++----------------- paddle/fluid/platform/stream/CMakeLists.txt | 3 - paddle/fluid/platform/stream/cuda_stream.cc | 60 ------ paddle/fluid/platform/stream/cuda_stream.h | 61 ------ 6 files changed, 95 insertions(+), 344 deletions(-) delete mode 100644 paddle/fluid/platform/stream/CMakeLists.txt delete mode 100644 paddle/fluid/platform/stream/cuda_stream.cc delete mode 100644 paddle/fluid/platform/stream/cuda_stream.h diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index ddf3035a927..b954d1b7658 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -44,7 +44,6 @@ cc_library(place SRCS place.cc DEPS enforce boost) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) add_subdirectory(dynload) -add_subdirectory(stream) cc_library(cpu_helper SRCS cpu_helper.cc DEPS cblas enforce) cc_test(cpu_helper_test SRCS cpu_helper_test.cc DEPS cpu_helper) @@ -55,7 +54,7 @@ IF(WITH_DGC) ENDIF() IF(WITH_GPU) - set(GPU_CTX_DEPS dynload_cuda dynamic_loader cuda_stream) + set(GPU_CTX_DEPS dynload_cuda dynamic_loader) ENDIF() IF(WITH_MKLDNN) diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 39be9f716d2..322a3279678 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -211,34 +211,6 @@ void CudnnWorkspaceHandle::ReallocWorkspace(size_t required_workspace_bytes) { allocation_ = memory::Alloc(device_context_, required_workspace_bytes); } -thread_local std::unordered_map> - CUDADeviceContext::thread_ctx_; -thread_local std::mutex CUDADeviceContext::ctx_mtx_; - -void CUDAContext::InitEigenContext(const stream::CUDAStream& stream) { - eigen_stream_.reset(new EigenCudaStreamDevice()); - eigen_stream_->Reinitialize(&stream.stream(), place_); - eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get())); -} - -CUDAContext::CUDAContext(const CUDAPlace& place, - const enum stream::Priority& priority) { - place_ = place; - CUDADeviceGuard guard(place_.device); - stream_.Init(place, priority); - InitEigenContext(stream_); - InitCuBlasContext(stream_); - InitCuDNNContext(stream_); - InitCallbackManager(stream_); -} - -CUDAContext::~CUDAContext() { - CUDADeviceGuard guard(place_.device); - DestoryCuDNNContext(); - DestoryCuBlasContext(); -} - CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) { CUDADeviceGuard guard(place_.device); compute_capability_ = GetCUDAComputeCapability(place_.device); @@ -246,6 +218,18 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) { max_threads_per_mp_ = GetCUDAMaxThreadsPerMultiProcessor(place_.device); max_grid_dim_size_ = GetGpuMaxGridDimSize(place_.device); max_threads_per_block_ = GetCUDAMaxThreadsPerBlock(place_.device); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream_)); + eigen_stream_.reset(new EigenCudaStreamDevice()); + eigen_stream_->Reinitialize(&stream_, place); + eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get())); + cublas_handle_.reset(new CublasHandleHolder(stream_, CUBLAS_DEFAULT_MATH)); + + if (TensorCoreAvailable()) { +#if CUDA_VERSION >= 9000 + cublas_tensor_core_handle_.reset( + new CublasHandleHolder(stream_, CUBLAS_TENSOR_OP_MATH)); +#endif + } driver_version_ = GetCUDADriverVersion(place_.device); runtime_version_ = GetCUDARuntimeVersion(place_.device); @@ -279,19 +263,73 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) { << "Please recompile or reinstall Paddle with compatible CUDA " "version."; } + + if (dynload::HasCUDNN()) { + auto local_cudnn_version = cudnn_dso_ver / 100; + auto compile_cudnn_version = CUDNN_VERSION / 100; + if (local_cudnn_version < static_cast(compile_cudnn_version)) { + LOG_FIRST_N(WARNING, 1) + << "WARNING: device: " << place_.device + << ". The installed Paddle is compiled with CUDNN " + << compile_cudnn_version / 10 << "." << compile_cudnn_version % 10 + << ", but CUDNN version in your machine is " + << local_cudnn_version / 10 << "." << local_cudnn_version % 10 + << ", which may cause serious incompatible bug. " + << "Please recompile or reinstall Paddle with compatible CUDNN " + "version."; + } + PADDLE_ENFORCE_CUDA_SUCCESS( + dynload::cudnnCreate(&cudnn_handle_), + "Failed to create Cudnn handle in DeviceContext"); + PADDLE_ENFORCE_CUDA_SUCCESS( + dynload::cudnnSetStream(cudnn_handle_, stream_), + "Failed to set stream for Cudnn handle in DeviceContext"); + } else { + cudnn_handle_ = nullptr; + } } - default_ctx_.reset(new CUDAContext(place_)); + + callback_manager_.reset(new StreamCallbackManager(stream_)); } CUDADeviceContext::~CUDADeviceContext() { SetDeviceId(place_.device); Wait(); WaitStreamCallback(); + cublas_handle_.reset(); + cublas_tensor_core_handle_.reset(); + eigen_stream_.reset(); + eigen_device_.reset(); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream_)); + if (cudnn_handle_) { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroy(cudnn_handle_), + "Failed to destory Cudnn handle"); + } +#if defined(PADDLE_WITH_NCCL) + if (nccl_comm_) { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclCommDestroy(nccl_comm_)); + } +#endif } Place CUDADeviceContext::GetPlace() const { return place_; } -void CUDADeviceContext::Wait() const { context()->Wait(); } +void CUDADeviceContext::Wait() const { + cudaError_t e_sync = cudaSuccess; +#if !defined(_WIN32) + e_sync = cudaStreamSynchronize(stream_); +#else + while (e_sync = cudaStreamQuery(stream_)) { + if (e_sync == cudaErrorNotReady) continue; + break; + } +#endif + + PADDLE_ENFORCE_CUDA_SUCCESS( + e_sync, platform::errors::Fatal( + "cudaStreamSynchronize raises error: %s, errono: %d", + cudaGetErrorString(e_sync), static_cast(e_sync))); +} int CUDADeviceContext::GetComputeCapability() const { return compute_capability_; @@ -308,26 +346,24 @@ int CUDADeviceContext::GetMaxThreadsPerBlock() const { } Eigen::GpuDevice* CUDADeviceContext::eigen_device() const { - return context()->EigenDevice().get(); + return eigen_device_.get(); } bool CUDADeviceContext::tensor_core_available() const { - return context()->CublasTensorCoreHandle() != nullptr; + return cublas_tensor_core_handle_ != nullptr; } dim3 CUDADeviceContext::GetCUDAMaxGridDimSize() const { return max_grid_dim_size_; } -cudnnHandle_t CUDADeviceContext::cudnn_handle() const { - return context()->CudnnHandle(); -} +cudnnHandle_t CUDADeviceContext::cudnn_handle() const { return cudnn_handle_; } CudnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const { return CudnnWorkspaceHandle(*this, &cudnn_handle_mtx_); } -cudaStream_t CUDADeviceContext::stream() const { return context()->Stream(); } +cudaStream_t CUDADeviceContext::stream() const { return stream_; } CUDAPinnedDeviceContext::CUDAPinnedDeviceContext() { eigen_device_.reset(new Eigen::DefaultDevice()); diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index d296b8760b6..d6b8cda94f2 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -38,7 +38,6 @@ limitations under the License. */ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" #ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/stream/cuda_stream.h" #include "paddle/fluid/platform/stream_callback_manager.h" #endif #include "unsupported/Eigen/CXX11/Tensor" @@ -81,160 +80,6 @@ struct DefaultDeviceContextType { class EigenCudaStreamDevice; class CudnnWorkspaceHandle; -class CUDAContext { - public: - CUDAContext() = default; - explicit CUDAContext( - const CUDAPlace& place, - const enum stream::Priority& priority = stream::Priority::NORMAL); - - ~CUDAContext(); - - const CUDAPlace& Place() const { return place_; } - - const std::unique_ptr& EigenDevice() const { - return eigen_device_; - } - - const std::unique_ptr& EigenStream() const { - return eigen_stream_; - } - - const cudaStream_t& Stream() const { return stream_.stream(); } - - const cudnnHandle_t& CudnnHandle() const { return cudnn_handle_; } - - const std::unique_ptr& CublasHandle() const { - return cublas_handle_; - } - - const std::unique_ptr& CublasTensorCoreHandle() const { - return cublas_tensor_core_handle_; - } - - /*! \brief Call cublas function safely. */ - template - inline void CublasCall(Callback&& callback) const { - cublas_handle_->Call(std::forward(callback)); - } - - /*! \brief Check whether tensor core is supported */ - bool tensor_core_available() const; - - /*! \brief Call cublas function with Tensor Core safely. If - Tensor Core is not available, use DEFAULT_MATH instead. */ - template - inline void TensorCoreCublasCallIfAvailable(Callback&& callback) const { - if (cublas_tensor_core_handle_) { - cublas_tensor_core_handle_->Call(std::forward(callback)); - } else { - cublas_handle_->Call(std::forward(callback)); - } - } - - template - void RecordEvent(cudaEvent_t ev, Callback callback) { - callback(); - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaEventRecord(ev, stream_.stream()), - platform::errors::Fatal("CUDA event recording failed.")); - } - - template - void AddStreamCallback(Callback&& callback) const { - callback_manager_->AddCallback(callback); - } - - void WaitStreamCallback() const { callback_manager_->Wait(); } - - void Wait() const { - cudaError_t e_sync = cudaSuccess; -#if !defined(_WIN32) - e_sync = cudaStreamSynchronize(stream_.stream()); -#else - while (e_sync = cudaStreamQuery(stream_.stream())) { - if (e_sync == cudaErrorNotReady) continue; - break; - } -#endif - - PADDLE_ENFORCE_CUDA_SUCCESS( - e_sync, platform::errors::Fatal( - "cudaStreamSynchronize raises error: %s, errono: %d", - cudaGetErrorString(e_sync), static_cast(e_sync))); - } - - private: - void InitEigenContext(const stream::CUDAStream& stream); - - void InitCuBlasContext(const stream::CUDAStream& stream) { - cublas_handle_.reset( - new CublasHandleHolder(stream.stream(), CUBLAS_DEFAULT_MATH)); - if (TensorCoreAvailable()) { -#if CUDA_VERSION >= 9000 - cublas_tensor_core_handle_.reset( - new CublasHandleHolder(stream.stream(), CUBLAS_TENSOR_OP_MATH)); -#endif - } - } - - void InitCallbackManager(const stream::CUDAStream& stream) { - callback_manager_.reset(new StreamCallbackManager(stream.stream())); - } - - void InitCuDNNContext(const stream::CUDAStream& stream) { - if (dynload::HasCUDNN()) { - auto local_cudnn_version = dynload::cudnnGetVersion() / 100; - auto compile_cudnn_version = CUDNN_VERSION / 100; - if (local_cudnn_version < static_cast(compile_cudnn_version)) { - LOG_FIRST_N(WARNING, 1) - << "WARNING: device: " << place_.device - << ". The installed Paddle is compiled with CUDNN " - << compile_cudnn_version / 10 << "." << compile_cudnn_version % 10 - << ", but CUDNN version in your machine is " - << local_cudnn_version / 10 << "." << local_cudnn_version % 10 - << ", which may cause serious incompatible bug. " - << "Please recompile or reinstall Paddle with compatible CUDNN " - "version."; - } - PADDLE_ENFORCE_CUDA_SUCCESS( - dynload::cudnnCreate(&cudnn_handle_), - platform::errors::Fatal( - "Failed to create Cudnn handle in DeviceContext")); - PADDLE_ENFORCE_CUDA_SUCCESS( - dynload::cudnnSetStream(cudnn_handle_, stream.stream()), - platform::errors::Fatal( - "Failed to set stream for Cudnn handle in DeviceContext")); - } else { - cudnn_handle_ = nullptr; - } - } - - void DestoryCuDNNContext() { - if (cudnn_handle_) { - PADDLE_ENFORCE_CUDA_SUCCESS( - dynload::cudnnDestroy(cudnn_handle_), - platform::errors::Fatal("Failed to destory Cudnn handle")); - } - cudnn_handle_ = nullptr; - } - - void DestoryCuBlasContext() { - cublas_handle_.reset(); - cublas_tensor_core_handle_.reset(); - } - - CUDAPlace place_; - std::unique_ptr eigen_device_; - std::unique_ptr eigen_stream_; - stream::CUDAStream stream_; - cudnnHandle_t cudnn_handle_; - std::unique_ptr cublas_handle_; - std::unique_ptr cublas_tensor_core_handle_; - std::unique_ptr callback_manager_; - DISABLE_COPY_AND_ASSIGN(CUDAContext); -}; - class CUDADeviceContext : public DeviceContext { public: explicit CUDADeviceContext(CUDAPlace place); @@ -267,7 +112,7 @@ class CUDADeviceContext : public DeviceContext { /*! \brief Call cublas function safely. */ template inline void CublasCall(Callback&& callback) const { - return context()->CublasCall(std::forward(callback)); + cublas_handle_->Call(std::forward(callback)); } /*! \brief Check whether tensor core is supported */ @@ -277,8 +122,11 @@ class CUDADeviceContext : public DeviceContext { Tensor Core is not available, use DEFAULT_MATH instead. */ template inline void TensorCoreCublasCallIfAvailable(Callback&& callback) const { - return context()->TensorCoreCublasCallIfAvailable( - std::forward(callback)); + if (cublas_tensor_core_handle_) { + cublas_tensor_core_handle_->Call(std::forward(callback)); + } else { + cublas_handle_->Call(std::forward(callback)); + } } /*! \brief Return cudnn handle in the device context. */ @@ -306,43 +154,32 @@ class CUDADeviceContext : public DeviceContext { template void RecordEvent(cudaEvent_t ev, Callback callback) { - return context()->RecordEvent(ev, callback); + callback(); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(ev, stream_)); } template void AddStreamCallback(Callback&& callback) const { - return context()->AddStreamCallback(callback); - } - - void WaitStreamCallback() const { return context()->WaitStreamCallback(); } - - void ResetDefaultContext(const enum stream::Priority& priority) { - default_ctx_.reset(new CUDAContext(place_, priority)); - } - - void ResetThreadContext(const enum stream::Priority& priority) { - std::lock_guard guard(ctx_mtx_); - thread_ctx_[this].reset(new CUDAContext(place_, priority)); + callback_manager_->AddCallback(callback); } - const std::unique_ptr& context() const { - if (!thread_ctx_.count(this)) { - return default_ctx_; - } - return thread_ctx_.at(this); - } + void WaitStreamCallback() const { callback_manager_->Wait(); } private: CUDAPlace place_; - std::unique_ptr default_ctx_; - static thread_local std::unordered_map> - thread_ctx_; - static thread_local std::mutex ctx_mtx_; + mutable std::once_flag init_cudnn_; + + std::unique_ptr eigen_device_; + std::unique_ptr eigen_stream_; + cudaStream_t stream_; + cudnnHandle_t cudnn_handle_; mutable std::mutex cudnn_handle_mtx_; + std::unique_ptr cublas_handle_; + std::unique_ptr cublas_tensor_core_handle_; + #if defined(PADDLE_WITH_NCCL) // NCCL communicator (single process version) for NCCL collective operations. // NCCL collective operations provides fast collectives over multiple GPUs @@ -360,6 +197,9 @@ class CUDADeviceContext : public DeviceContext { int max_threads_per_block_; dim3 max_grid_dim_size_; + // StreamCallbackManager is thread-safe + std::unique_ptr callback_manager_; + DISABLE_COPY_AND_ASSIGN(CUDADeviceContext); }; diff --git a/paddle/fluid/platform/stream/CMakeLists.txt b/paddle/fluid/platform/stream/CMakeLists.txt deleted file mode 100644 index 78eb776b544..00000000000 --- a/paddle/fluid/platform/stream/CMakeLists.txt +++ /dev/null @@ -1,3 +0,0 @@ -IF(WITH_GPU) -cc_library(cuda_stream SRCS cuda_stream.cc DEPS enforce) -ENDIF() diff --git a/paddle/fluid/platform/stream/cuda_stream.cc b/paddle/fluid/platform/stream/cuda_stream.cc deleted file mode 100644 index 8880212b48c..00000000000 --- a/paddle/fluid/platform/stream/cuda_stream.cc +++ /dev/null @@ -1,60 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/platform/stream/cuda_stream.h" -#include "paddle/fluid/platform/cuda_device_guard.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace platform { -namespace stream { - -constexpr int64_t kHighPriority = -1; -constexpr int64_t kNormalPriority = 0; -constexpr unsigned int kDefaultFlag = cudaStreamDefault; - -bool CUDAStream::Init(const Place& place, const enum Priority& priority) { - PADDLE_ENFORCE_EQ(is_gpu_place(place), true, - platform::errors::InvalidArgument( - "Cuda stream must be created using cuda place.")); - place_ = place; - CUDADeviceGuard guard(boost::get(place_).device); - if (priority == Priority::HIGH) { - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaStreamCreateWithPriority(&stream_, kDefaultFlag, kHighPriority), - platform::errors::Fatal("High priority cuda stream creation failed.")); - } else if (priority == Priority::NORMAL) { - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaStreamCreateWithPriority(&stream_, kDefaultFlag, kNormalPriority), - platform::errors::Fatal( - "Normal priority cuda stream creation failed.")); - } - VLOG(3) << "CUDAStream Init stream: " << stream_ - << ", priority: " << static_cast(priority); - return true; -} - -void CUDAStream::Destroy() { - CUDADeviceGuard guard(boost::get(place_).device); - if (stream_) { - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaStreamDestroy(stream_), - platform::errors::Fatal("Cuda stream destruction failed.")); - } - stream_ = nullptr; -} - -} // namespace stream -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/stream/cuda_stream.h b/paddle/fluid/platform/stream/cuda_stream.h deleted file mode 100644 index 8c3317645d3..00000000000 --- a/paddle/fluid/platform/stream/cuda_stream.h +++ /dev/null @@ -1,61 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "paddle/fluid/platform/gpu_info.h" -#include "paddle/fluid/platform/macros.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace platform { -namespace stream { - -#ifdef PADDLE_WITH_CUDA - -enum class Priority : uint8_t { - NIL = 0x0, - HIGH = 0x1, - NORMAL = 0x2, -}; - -class CUDAStream final { - public: - CUDAStream() = default; - CUDAStream(const Place& place, - const enum Priority& priority = Priority::NORMAL) { - Init(place, priority); - } - virtual ~CUDAStream() { Destroy(); } - - bool Init(const Place& place, - const enum Priority& priority = Priority::NORMAL); - - const cudaStream_t& stream() const { return stream_; } - void Destroy(); - - private: - Place place_; - cudaStream_t stream_{nullptr}; - Priority priority_{Priority::NORMAL}; - - DISABLE_COPY_AND_ASSIGN(CUDAStream); -}; - -#endif - -} // namespace stream -} // namespace platform -} // namespace paddle -- GitLab