Commit b0aca882 authored by fengjiayi

make CudnnHolder thread safe

Parent d5f74b73
@@ -118,7 +118,6 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
         output_channels / groups * output_height * output_width * output_depth;
     int group_offset_filter = filter->numel() / groups;
     // ------------------- cudnn conv workspace ---------------------
-    void* cudnn_workspace = nullptr;
     size_t workspace_size_in_bytes;  // final workspace to allocate.
     size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES;
     if (user_workspace_size > 0) {
@@ -159,16 +158,17 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
                       "workspace_size to be allocated exceeds the limit");
-    // Get cudnn workspace
-    cudnn_workspace = dev_ctx.cudnn_workspace(workspace_size_in_bytes);
     // ------------------- cudnn conv forward ---------------------
     ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
     for (int i = 0; i < groups; i++) {
+      auto cudnn_func = [&](void* cudnn_workspace) {
       CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
           handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
           cudnn_filter_desc, filter_data + i * group_offset_filter,
           cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes,
           &beta, cudnn_output_desc, output_data + i * group_offset_out));
+      };
+      dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes);
     }
   }
 };
@@ -311,8 +311,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
           cudnn_filter_desc, filter_algo, &tmp_size));
       workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);
     }
-    // ------------------- cudnn conv workspace ---------------------
-    void* cudnn_workspace = dev_ctx.cudnn_workspace(workspace_size_in_bytes);
     // ------------------- cudnn conv backward data ---------------------
     ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
     if (input_grad) {
@@ -320,12 +319,15 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
       // Because beta is zero, it is unnecessary to reset input_grad.
       for (int i = 0; i < groups; i++) {
-        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
-            handle, &alpha, cudnn_filter_desc,
-            filter_data + i * group_offset_filter, cudnn_output_grad_desc,
-            output_grad_data + i * group_offset_out, cudnn_conv_desc, data_algo,
-            cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
-            input_grad_data + i * group_offset_in));
+        auto cudnn_func = [&](void* cudnn_workspace) {
+          CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
+              handle, &alpha, cudnn_filter_desc,
+              filter_data + i * group_offset_filter, cudnn_output_grad_desc,
+              output_grad_data + i * group_offset_out, cudnn_conv_desc,
+              data_algo, cudnn_workspace, workspace_size_in_bytes, &beta,
+              cudnn_input_desc, input_grad_data + i * group_offset_in));
+        };
+        dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes);
       }
     }
     // ------------------- cudnn conv backward filter ---------------------
@@ -333,12 +335,15 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
       T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
       // Because beta is zero, it is unnecessary to reset filter_grad.
       for (int i = 0; i < groups; i++) {
-        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
-            handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
-            cudnn_output_grad_desc, output_grad_data + i * group_offset_out,
-            cudnn_conv_desc, filter_algo, cudnn_workspace,
-            workspace_size_in_bytes, &beta, cudnn_filter_desc,
-            filter_grad_data + i * group_offset_filter));
+        auto cudnn_func = [&](void* cudnn_workspace) {
+          CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
+              handle, &alpha, cudnn_input_desc,
+              input_data + i * group_offset_in, cudnn_output_grad_desc,
+              output_grad_data + i * group_offset_out, cudnn_conv_desc,
+              filter_algo, cudnn_workspace, workspace_size_in_bytes, &beta,
+              cudnn_filter_desc, filter_grad_data + i * group_offset_filter));
+        };
+        dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes);
       }
     }
   }
......
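Every kernel touched by this diff now follows the same shape: instead of borrowing a raw workspace pointer from the device context and passing it to cuDNN itself, the kernel wraps its cuDNN call in a lambda and hands that lambda to the context, which supplies the workspace only for the duration of the call. Below is a minimal, host-only sketch of this inversion of control; the Context class, its std::mutex, and the vector buffer are illustrative stand-ins for CudnnHolder and its boost::shared_mutex, not the real implementation.

#include <cstddef>
#include <cstdio>
#include <functional>
#include <mutex>
#include <vector>

// Hypothetical stand-in for CUDADeviceContext/CudnnHolder: it owns the
// workspace and exposes it only for the duration of one callback.
class Context {
 public:
  void RunFuncWithWorkspace(const std::function<void(void*)>& func,
                            size_t required_len) {
    std::lock_guard<std::mutex> guard(mtx_);  // pointer never escapes the lock
    if (workspace_.size() < required_len) {
      workspace_.resize(required_len);  // grow lazily, like ReallocateWorkspace
    }
    func(workspace_.data());
  }

 private:
  std::mutex mtx_;
  std::vector<char> workspace_;
};

int main() {
  Context ctx;
  const size_t len = 1024;
  // The caller only sees the pointer inside the callback, so the context is
  // free to reallocate or free the buffer between calls.
  ctx.RunFuncWithWorkspace(
      [&](void* ws) { std::printf("got %zu bytes at %p\n", len, ws); }, len);
  return 0;
}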
@@ -76,7 +76,6 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
       conv_desc.descriptor<T>(paddings, strides, dilations);
 
     // ------------------- cudnn conv workspace ---------------------
-    void* cudnn_workspace = nullptr;
     size_t workspace_size_in_bytes;  // final workspace to allocate.
     size_t workspace_size_limit = kConvCUDNNWorkspaceLimitBytes;
     if (user_workspace_size > 0) {
@@ -100,20 +99,20 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
         handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc,
         cudnn_output_desc, algo, &workspace_size_in_bytes));
-    // Get cudnn workspace
-    cudnn_workspace = dev_ctx.cudnn_workspace(workspace_size_in_bytes);
     // ------------------- cudnn conv transpose forward ---------------------
     int input_offset = input->numel() / input->dims()[0] / groups;
     int output_offset = output->numel() / output->dims()[0] / groups;
     int filter_offset = filter->numel() / groups;
     T alpha = 1.0f, beta = 0.0f;
     for (int g = 0; g < groups; g++) {
+      auto cudnn_func = [&](void* cudnn_workspace) {
       CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
           handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g,
           cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc,
           algo, cudnn_workspace, workspace_size_in_bytes, &beta,
           cudnn_output_desc, output_data + output_offset * g));
+      };
+      dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes);
     }
   }
 };
@@ -202,9 +201,6 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
           std::max(workspace_size_in_bytes, bwd_filter_ws_size);
     }
 
-    // ------------------- cudnn conv workspace ---------------------
-    // Get cudnn workspace
-    void* cudnn_workspace = dev_ctx.cudnn_workspace(workspace_size_in_bytes);
     // ------------------- cudnn conv backward data ---------------------
     // FIXME(typhoonzero): template type T may not be the same as cudnn call.
     int input_offset = input->numel() / input->dims()[0] / groups;
@@ -216,12 +212,15 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
       T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
       // Because beta is zero, it is unnecessary to reset input_grad.
       for (int g = 0; g < groups; g++) {
+        auto cudnn_func = [&](void* cudnn_workspace) {
         CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
             handle, &alpha, cudnn_output_desc,
             output_grad_data + output_grad_offset * g, cudnn_filter_desc,
             filter_data + filter_offset * g, cudnn_conv_desc, data_algo,
             cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
             input_grad_data + input_offset * g));
+        };
+        dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes);
       }
     }
@@ -231,12 +230,15 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
       // Because beta is zero, it is unnecessary to reset filter_grad.
       // Gradient with respect to the filter
      for (int g = 0; g < groups; g++) {
-        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
-            handle, &alpha, cudnn_output_desc,
-            output_grad_data + output_grad_offset * g, cudnn_input_desc,
-            input_data + input_offset * g, cudnn_conv_desc, filter_algo,
-            cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_filter_desc,
-            filter_grad_data + filter_offset * g));
+        auto cudnn_func = [&](void* cudnn_workspace) {
+          CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
+              handle, &alpha, cudnn_output_desc,
+              output_grad_data + output_grad_offset * g, cudnn_input_desc,
+              input_data + input_offset * g, cudnn_conv_desc, filter_algo,
+              cudnn_workspace, workspace_size_in_bytes, &beta,
+              cudnn_filter_desc, filter_grad_data + filter_offset * g));
+        };
+        dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes);
       }
     }
   }
......
@@ -15,6 +15,10 @@ limitations under the License. */
 #include <unordered_set>
 #include <vector>
 
+#ifdef PADDLE_WITH_CUDA
+#include <boost/thread/thread.hpp>
+#endif
+
 #include "paddle/fluid/memory/memory.h"
 
 namespace paddle {
@@ -150,10 +154,26 @@ class CudnnHolder {
     PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, *stream_));
   }
 
-  cudnnHandle_t get_cudnn_handle() const { return cudnn_handle_; }
+  cudnnHandle_t cudnn_handle() const { return cudnn_handle_; }
+
+  void RunFunc(const std::function<void(void*)>& cudnn_func,
+               size_t required_workspace_len) {
+    boost::upgrade_lock<boost::shared_mutex> shared_lock(mtx_);
+    if (required_workspace_len > workspace_len_) {
+      ReallocateWorkspace(required_workspace_len, &shared_lock);
+    }
+    cudnn_func(workspace_);
+  }
+
+  ~CudnnHolder() { PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_)); }
 
-  void* get_workspace(size_t required_len) {
-    if (required_len > workspace_len_) {
-      void* new_workspace = paddle::memory::Alloc(place_, required_len);
+ private:
+  void ReallocateWorkspace(size_t required_workspace_len,
+                           boost::upgrade_lock<boost::shared_mutex>* lock) {
+    boost::upgrade_to_unique_lock<boost::shared_mutex> unique_lock(*lock);
+    if (required_workspace_len <= workspace_len_) {
+      return;
+    }
+    void* new_workspace = paddle::memory::Alloc(place_, required_workspace_len);
     if (workspace_ != nullptr) {
       // Maybe someone is using the current workspace
@@ -164,18 +184,15 @@ class CudnnHolder {
     workspace_ = new_workspace;
-      workspace_len_ = required_len;
-    }
-    return workspace_;
-  }
-
-  ~CudnnHolder() { PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_)); }
-
- private:
+    workspace_len_ = required_workspace_len;
+  }
+
   cudnnHandle_t cudnn_handle_;
   void* workspace_;
   size_t workspace_len_;
 
   const cudaStream_t* stream_;  // not owned;
   const CUDAPlace place_;
+  boost::shared_mutex mtx_;
 };
 
 CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
@@ -228,11 +245,12 @@ cublasHandle_t CUDADeviceContext::cublas_handle() const {
 }
 
 cudnnHandle_t CUDADeviceContext::cudnn_handle() const {
-  return cudnn_holder_->get_cudnn_handle();
+  return cudnn_holder_->cudnn_handle();
 }
 
-void* CUDADeviceContext::cudnn_workspace(size_t required_len) const {
-  return cudnn_holder_->get_workspace(required_len);
+void CUDADeviceContext::RunCudnnFuncWithWorkspace(
+    const std::function<void(void*)>& cudnn_func, size_t workspace_len) const {
+  cudnn_holder_->RunFunc(cudnn_func, workspace_len);
 }
 
 cudaStream_t CUDADeviceContext::stream() const { return stream_; }
......
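The core of the fix is in CudnnHolder. Previously get_workspace() returned the raw workspace pointer, so one thread could still be feeding it to a cuDNN call while another thread's reallocation freed it. Now the pointer never leaves RunFunc: every caller takes upgrade ownership of a boost::shared_mutex, and only the path that actually has to grow the buffer promotes that ownership to a unique (exclusive) lock. A condensed, host-only sketch of the locking idiom follows, assuming Boost.Thread is available; Holder and its vector buffer are illustrative stand-ins for the real class, whose grow path also synchronizes the CUDA stream before freeing the old buffer.

#include <boost/thread/locks.hpp>
#include <boost/thread/shared_mutex.hpp>
#include <cstddef>
#include <cstdio>
#include <functional>
#include <vector>

class Holder {
 public:
  void RunFunc(const std::function<void(void*)>& func, size_t required_len) {
    // Upgrade ownership coexists with shared owners but excludes other
    // upgrade/unique owners; promotion happens only on the grow path.
    boost::upgrade_lock<boost::shared_mutex> lock(mtx_);
    if (required_len > buf_.size()) {
      Reallocate(required_len, &lock);
    }
    func(buf_.data());
  }

 private:
  void Reallocate(size_t required_len,
                  boost::upgrade_lock<boost::shared_mutex>* lock) {
    // Blocks until all shared owners release, then holds exclusively.
    boost::upgrade_to_unique_lock<boost::shared_mutex> unique(*lock);
    if (required_len <= buf_.size()) {  // defensive re-check, as in the diff
      return;
    }
    buf_.resize(required_len);  // stands in for the GPU sync/free/alloc steps
  }

  boost::shared_mutex mtx_;
  std::vector<char> buf_;
};

int main() {
  Holder h;
  h.RunFunc([](void* ws) { std::printf("workspace at %p\n", ws); }, 4096);
  return 0;
}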
@@ -97,9 +97,10 @@ class CUDADeviceContext : public DeviceContext {
   /*! \brief  Return cudnn handle in the device context. */
   cudnnHandle_t cudnn_handle() const;
 
-  /*! \brief  Return a cudnn workspace whose length is greater than the
-   *  'required_len'. */
-  void* cudnn_workspace(size_t required_len) const;
+  /*! \brief  Run a cudnn function with the workspace provided by
+   *  CUDADeviceContext */
+  void RunCudnnFuncWithWorkspace(const std::function<void(void*)>& cudnn_func,
+                                 size_t workspace_len) const;
 
   /*! \brief  Return cuda stream in the device context. */
   cudaStream_t stream() const;
......
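Caller-side, the header above implies a simple contract: the void* handed to cudnn_func is only guaranteed to be valid for the duration of the callback, so stashing it for later use would reintroduce the original race. A hedged sketch of the intended usage; Example is a hypothetical helper written against this contract, not part of the diff.

#include <cstddef>
#include <functional>

namespace example {

// Hypothetical caller. In the kernels above, dev_ctx is the
// platform::CUDADeviceContext fetched from the execution context.
template <typename DevCtx>
void Example(const DevCtx& dev_ctx, size_t workspace_len) {
  dev_ctx.RunCudnnFuncWithWorkspace(
      [&](void* cudnn_workspace) {
        // Issue the cuDNN calls that consume `cudnn_workspace` here; do not
        // store the pointer anywhere that outlives this lambda.
      },
      workspace_len);
}

}  // namespace example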