From b0aca8824d99895c1082b9ac700bf46a6531d5c8 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Fri, 31 Aug 2018 15:10:39 +0800 Subject: [PATCH] make CudnnHolder thread safe --- paddle/fluid/operators/conv_cudnn_op.cu.cc | 49 +++++++++-------- .../operators/conv_transpose_cudnn_op.cu.cc | 50 +++++++++--------- paddle/fluid/platform/device_context.cc | 52 +++++++++++++------ paddle/fluid/platform/device_context.h | 7 +-- 4 files changed, 92 insertions(+), 66 deletions(-) diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc index 92435d7c4..4a7a6bcf7 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc @@ -118,7 +118,6 @@ class CUDNNConvOpKernel : public framework::OpKernel { output_channels / groups * output_height * output_width * output_depth; int group_offset_filter = filter->numel() / groups; // ------------------- cudnn conv workspace --------------------- - void* cudnn_workspace = nullptr; size_t workspace_size_in_bytes; // final workspace to allocate. size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES; if (user_workspace_size > 0) { @@ -159,16 +158,17 @@ class CUDNNConvOpKernel : public framework::OpKernel { PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit, "workspace_size to be allocated exceeds the limit"); - // Get cudnn workspace - cudnn_workspace = dev_ctx.cudnn_workspace(workspace_size_in_bytes); // ------------------- cudnn conv forward --------------------- ScalingParamType alpha = 1.0f, beta = 0.0f; for (int i = 0; i < groups; i++) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( - handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, - cudnn_filter_desc, filter_data + i * group_offset_filter, - cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes, - &beta, cudnn_output_desc, output_data + i * group_offset_out)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( + handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, + cudnn_filter_desc, filter_data + i * group_offset_filter, + cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes, + &beta, cudnn_output_desc, output_data + i * group_offset_out)); + }; + dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes); } } }; @@ -311,8 +311,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { cudnn_filter_desc, filter_algo, &tmp_size)); workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size); } - // ------------------- cudnn conv workspace --------------------- - void* cudnn_workspace = dev_ctx.cudnn_workspace(workspace_size_in_bytes); + // ------------------- cudnn conv backward data --------------------- ScalingParamType alpha = 1.0f, beta = 0.0f; if (input_grad) { @@ -320,12 +319,15 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { // Because beta is zero, it is unnecessary to reset input_grad. for (int i = 0; i < groups; i++) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( - handle, &alpha, cudnn_filter_desc, - filter_data + i * group_offset_filter, cudnn_output_grad_desc, - output_grad_data + i * group_offset_out, cudnn_conv_desc, data_algo, - cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, - input_grad_data + i * group_offset_in)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( + handle, &alpha, cudnn_filter_desc, + filter_data + i * group_offset_filter, cudnn_output_grad_desc, + output_grad_data + i * group_offset_out, cudnn_conv_desc, + data_algo, cudnn_workspace, workspace_size_in_bytes, &beta, + cudnn_input_desc, input_grad_data + i * group_offset_in)); + }; + dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes); } } // ------------------- cudnn conv backward filter --------------------- @@ -333,12 +335,15 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { T* filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); // Because beta is zero, it is unnecessary to reset filter_grad. for (int i = 0; i < groups; i++) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( - handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, - cudnn_output_grad_desc, output_grad_data + i * group_offset_out, - cudnn_conv_desc, filter_algo, cudnn_workspace, - workspace_size_in_bytes, &beta, cudnn_filter_desc, - filter_grad_data + i * group_offset_filter)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( + handle, &alpha, cudnn_input_desc, + input_data + i * group_offset_in, cudnn_output_grad_desc, + output_grad_data + i * group_offset_out, cudnn_conv_desc, + filter_algo, cudnn_workspace, workspace_size_in_bytes, &beta, + cudnn_filter_desc, filter_grad_data + i * group_offset_filter)); + }; + dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes); } } } diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc index c24cb14a6..2376212f5 100644 --- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc @@ -76,7 +76,6 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { conv_desc.descriptor(paddings, strides, dilations); // ------------------- cudnn conv workspace --------------------- - void* cudnn_workspace = nullptr; size_t workspace_size_in_bytes; // final workspace to allocate. size_t workspace_size_limit = kConvCUDNNWorkspaceLimitBytes; if (user_workspace_size > 0) { @@ -100,20 +99,20 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc, cudnn_output_desc, algo, &workspace_size_in_bytes)); - // Get cudnn workspace - cudnn_workspace = dev_ctx.cudnn_workspace(workspace_size_in_bytes); - // ------------------- cudnn conv transpose forward --------------------- int input_offset = input->numel() / input->dims()[0] / groups; int output_offset = output->numel() / output->dims()[0] / groups; int filter_offset = filter->numel() / groups; T alpha = 1.0f, beta = 0.0f; for (int g = 0; g < groups; g++) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( - handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g, - cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc, - algo, cudnn_workspace, workspace_size_in_bytes, &beta, - cudnn_output_desc, output_data + output_offset * g)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( + handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g, + cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc, + algo, cudnn_workspace, workspace_size_in_bytes, &beta, + cudnn_output_desc, output_data + output_offset * g)); + }; + dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes); } } }; @@ -202,9 +201,6 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { std::max(workspace_size_in_bytes, bwd_filter_ws_size); } - // ------------------- cudnn conv workspace --------------------- - // Get cudnn workspace - void* cudnn_workspace = dev_ctx.cudnn_workspace(workspace_size_in_bytes); // ------------------- cudnn conv backward data --------------------- // FIXME(typhoonzero): template type T may not be the same as cudnn call. int input_offset = input->numel() / input->dims()[0] / groups; @@ -216,12 +212,15 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); // Because beta is zero, it is unnecessary to reset input_grad. for (int g = 0; g < groups; g++) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( - handle, &alpha, cudnn_output_desc, - output_grad_data + output_grad_offset * g, cudnn_filter_desc, - filter_data + filter_offset * g, cudnn_conv_desc, data_algo, - cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, - input_grad_data + input_offset * g)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( + handle, &alpha, cudnn_output_desc, + output_grad_data + output_grad_offset * g, cudnn_filter_desc, + filter_data + filter_offset * g, cudnn_conv_desc, data_algo, + cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, + input_grad_data + input_offset * g)); + }; + dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes); } } @@ -231,12 +230,15 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { // Because beta is zero, it is unnecessary to reset filter_grad. // Gradient with respect to the filter for (int g = 0; g < groups; g++) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( - handle, &alpha, cudnn_output_desc, - output_grad_data + output_grad_offset * g, cudnn_input_desc, - input_data + input_offset * g, cudnn_conv_desc, filter_algo, - cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_filter_desc, - filter_grad_data + filter_offset * g)); + auto cudnn_func = [&](void* cudnn_func) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( + handle, &alpha, cudnn_output_desc, + output_grad_data + output_grad_offset * g, cudnn_input_desc, + input_data + input_offset * g, cudnn_conv_desc, filter_algo, + cudnn_workspace, workspace_size_in_bytes, &beta, + cudnn_filter_desc, filter_grad_data + filter_offset * g)); + }; + dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes); } } } diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 5c0dcdad3..1e46e5de9 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -15,6 +15,10 @@ limitations under the License. */ #include #include +#ifdef PADDLE_WITH_CUDA +#include +#endif + #include "paddle/fluid/memory/memory.h" namespace paddle { @@ -150,32 +154,45 @@ class CudnnHolder { PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, *stream_)); } - cudnnHandle_t get_cudnn_handle() const { return cudnn_handle_; } - - void* get_workspace(size_t required_len) { - if (required_len > workspace_len_) { - void* new_workspace = paddle::memory::Alloc(place_, required_len); - if (workspace_ != nullptr) { - // Maybe someone is using the current workspace - PADDLE_ENFORCE(cudaStreamSynchronize(*stream_)); - PADDLE_ENFORCE(cudaGetLastError()); - paddle::memory::Free(place_, workspace_); - } - workspace_ = new_workspace; - workspace_len_ = required_len; + cudnnHandle_t cudnn_handle() const { return cudnn_handle_; } + + void RunFunc(const std::function& cudnn_func, + size_t required_workspace_len) { + boost::upgrade_lock shared_lock(mtx_); + if (required_workspace_len > workspace_len_) { + ReallocateWorkspace(required_workspace_len, &shared_lock); } - return workspace_; + cudnn_func(workspace_); } ~CudnnHolder() { PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_)); } private: + void ReallocateWorkspace(size_t required_workspace_len, + boost::upgrade_lock* lock) { + boost::upgrade_to_unique_lock unique_lock(*lock); + if (required_workspace_len <= workspace_len_) { + return; + } + void* new_workspace = paddle::memory::Alloc(place_, required_len); + if (workspace_ != nullptr) { + // Maybe someone is using the current workspace + PADDLE_ENFORCE(cudaStreamSynchronize(*stream_)); + PADDLE_ENFORCE(cudaGetLastError()); + paddle::memory::Free(place_, workspace_); + } + workspace_ = new_workspace; + workspace_len_ = required_len; + } + cudnnHandle_t cudnn_handle_; void* workspace_; size_t workspace_len_; const cudaStream_t* stream_; // not owned; const CUDAPlace place_; + + boost::shared_mutex mtx_; }; CUDADeviceContext::CUDADeviceContext(CUDAPlace place) @@ -228,11 +245,12 @@ cublasHandle_t CUDADeviceContext::cublas_handle() const { } cudnnHandle_t CUDADeviceContext::cudnn_handle() const { - return cudnn_holder_->get_cudnn_handle(); + return cudnn_holder_->cudnn_handle(); } -void* CUDADeviceContext::cudnn_workspace(size_t required_len) const { - return cudnn_holder_->get_workspace(required_len); +void CUDADeviceContext::RunCudnnFuncWithWorkspace( + const std::function& cudnn_func, size_t workspace_len) const { + cudnn_holder_->RunFunc(cudnn_func, workspace_len); } cudaStream_t CUDADeviceContext::stream() const { return stream_; } diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 5bcd04fa0..35fb4a92d 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -97,9 +97,10 @@ class CUDADeviceContext : public DeviceContext { /*! \brief Return cudnn handle in the device context. */ cudnnHandle_t cudnn_handle() const; - /*! \brief Return a cudnn workspace whose length is greater than the - * 'required_len'. */ - void* cudnn_workspace(size_t required_len) const; + /*! \brief Run a cudnn function with the workspace provided by + * CUDADeviceContext */ + void RunCudnnFuncWithWorkspace(const std::function& cudnn_func, + size_t workspace_len) const; /*! \brief Return cuda stream in the device context. */ cudaStream_t stream() const; -- GitLab