Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
b0aca882
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
b0aca882
编写于
8月 31, 2018
作者:
F
fengjiayi
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
make CudnnHolder thread safe
上级
d5f74b73
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
92 addition
and
66 deletion
+92
-66
paddle/fluid/operators/conv_cudnn_op.cu.cc
paddle/fluid/operators/conv_cudnn_op.cu.cc
+27
-22
paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
+26
-24
paddle/fluid/platform/device_context.cc
paddle/fluid/platform/device_context.cc
+35
-17
paddle/fluid/platform/device_context.h
paddle/fluid/platform/device_context.h
+4
-3
未找到文件。
paddle/fluid/operators/conv_cudnn_op.cu.cc
浏览文件 @
b0aca882
...
...
@@ -118,7 +118,6 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
output_channels
/
groups
*
output_height
*
output_width
*
output_depth
;
int
group_offset_filter
=
filter
->
numel
()
/
groups
;
// ------------------- cudnn conv workspace ---------------------
void
*
cudnn_workspace
=
nullptr
;
size_t
workspace_size_in_bytes
;
// final workspace to allocate.
size_t
workspace_size_limit
=
kCONV_CUDNN_WORKSPACE_LIMIT_BYTES
;
if
(
user_workspace_size
>
0
)
{
...
...
@@ -159,16 +158,17 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_LE
(
workspace_size_in_bytes
,
workspace_size_limit
,
"workspace_size to be allocated exceeds the limit"
);
// Get cudnn workspace
cudnn_workspace
=
dev_ctx
.
cudnn_workspace
(
workspace_size_in_bytes
);
// ------------------- cudnn conv forward ---------------------
ScalingParamType
<
T
>
alpha
=
1.0
f
,
beta
=
0.0
f
;
for
(
int
i
=
0
;
i
<
groups
;
i
++
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionForward
(
handle
,
&
alpha
,
cudnn_input_desc
,
input_data
+
i
*
group_offset_in
,
cudnn_filter_desc
,
filter_data
+
i
*
group_offset_filter
,
cudnn_conv_desc
,
algo
,
cudnn_workspace
,
workspace_size_in_bytes
,
&
beta
,
cudnn_output_desc
,
output_data
+
i
*
group_offset_out
));
auto
cudnn_func
=
[
&
](
void
*
cudnn_workspace
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionForward
(
handle
,
&
alpha
,
cudnn_input_desc
,
input_data
+
i
*
group_offset_in
,
cudnn_filter_desc
,
filter_data
+
i
*
group_offset_filter
,
cudnn_conv_desc
,
algo
,
cudnn_workspace
,
workspace_size_in_bytes
,
&
beta
,
cudnn_output_desc
,
output_data
+
i
*
group_offset_out
));
};
dev_ctx
.
RunCudnnFuncWithWorkspace
(
cudnn_func
,
workspace_size_in_bytes
);
}
}
};
...
...
@@ -311,8 +311,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
cudnn_filter_desc
,
filter_algo
,
&
tmp_size
));
workspace_size_in_bytes
=
std
::
max
(
workspace_size_in_bytes
,
tmp_size
);
}
// ------------------- cudnn conv workspace ---------------------
void
*
cudnn_workspace
=
dev_ctx
.
cudnn_workspace
(
workspace_size_in_bytes
);
// ------------------- cudnn conv backward data ---------------------
ScalingParamType
<
T
>
alpha
=
1.0
f
,
beta
=
0.0
f
;
if
(
input_grad
)
{
...
...
@@ -320,12 +319,15 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
// Because beta is zero, it is unnecessary to reset input_grad.
for
(
int
i
=
0
;
i
<
groups
;
i
++
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionBackwardData
(
handle
,
&
alpha
,
cudnn_filter_desc
,
filter_data
+
i
*
group_offset_filter
,
cudnn_output_grad_desc
,
output_grad_data
+
i
*
group_offset_out
,
cudnn_conv_desc
,
data_algo
,
cudnn_workspace
,
workspace_size_in_bytes
,
&
beta
,
cudnn_input_desc
,
input_grad_data
+
i
*
group_offset_in
));
auto
cudnn_func
=
[
&
](
void
*
cudnn_workspace
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionBackwardData
(
handle
,
&
alpha
,
cudnn_filter_desc
,
filter_data
+
i
*
group_offset_filter
,
cudnn_output_grad_desc
,
output_grad_data
+
i
*
group_offset_out
,
cudnn_conv_desc
,
data_algo
,
cudnn_workspace
,
workspace_size_in_bytes
,
&
beta
,
cudnn_input_desc
,
input_grad_data
+
i
*
group_offset_in
));
};
dev_ctx
.
RunCudnnFuncWithWorkspace
(
cudnn_func
,
workspace_size_in_bytes
);
}
}
// ------------------- cudnn conv backward filter ---------------------
...
...
@@ -333,12 +335,15 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
T
*
filter_grad_data
=
filter_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
// Because beta is zero, it is unnecessary to reset filter_grad.
for
(
int
i
=
0
;
i
<
groups
;
i
++
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionBackwardFilter
(
handle
,
&
alpha
,
cudnn_input_desc
,
input_data
+
i
*
group_offset_in
,
cudnn_output_grad_desc
,
output_grad_data
+
i
*
group_offset_out
,
cudnn_conv_desc
,
filter_algo
,
cudnn_workspace
,
workspace_size_in_bytes
,
&
beta
,
cudnn_filter_desc
,
filter_grad_data
+
i
*
group_offset_filter
));
auto
cudnn_func
=
[
&
](
void
*
cudnn_workspace
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionBackwardFilter
(
handle
,
&
alpha
,
cudnn_input_desc
,
input_data
+
i
*
group_offset_in
,
cudnn_output_grad_desc
,
output_grad_data
+
i
*
group_offset_out
,
cudnn_conv_desc
,
filter_algo
,
cudnn_workspace
,
workspace_size_in_bytes
,
&
beta
,
cudnn_filter_desc
,
filter_grad_data
+
i
*
group_offset_filter
));
};
dev_ctx
.
RunCudnnFuncWithWorkspace
(
cudnn_func
,
workspace_size_in_bytes
);
}
}
}
...
...
paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
浏览文件 @
b0aca882
...
...
@@ -76,7 +76,6 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
conv_desc
.
descriptor
<
T
>
(
paddings
,
strides
,
dilations
);
// ------------------- cudnn conv workspace ---------------------
void
*
cudnn_workspace
=
nullptr
;
size_t
workspace_size_in_bytes
;
// final workspace to allocate.
size_t
workspace_size_limit
=
kConvCUDNNWorkspaceLimitBytes
;
if
(
user_workspace_size
>
0
)
{
...
...
@@ -100,20 +99,20 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
handle
,
cudnn_filter_desc
,
cudnn_input_desc
,
cudnn_conv_desc
,
cudnn_output_desc
,
algo
,
&
workspace_size_in_bytes
));
// Get cudnn workspace
cudnn_workspace
=
dev_ctx
.
cudnn_workspace
(
workspace_size_in_bytes
);
// ------------------- cudnn conv transpose forward ---------------------
int
input_offset
=
input
->
numel
()
/
input
->
dims
()[
0
]
/
groups
;
int
output_offset
=
output
->
numel
()
/
output
->
dims
()[
0
]
/
groups
;
int
filter_offset
=
filter
->
numel
()
/
groups
;
T
alpha
=
1.0
f
,
beta
=
0.0
f
;
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionBackwardData
(
handle
,
&
alpha
,
cudnn_filter_desc
,
filter_data
+
filter_offset
*
g
,
cudnn_input_desc
,
input_data
+
input_offset
*
g
,
cudnn_conv_desc
,
algo
,
cudnn_workspace
,
workspace_size_in_bytes
,
&
beta
,
cudnn_output_desc
,
output_data
+
output_offset
*
g
));
auto
cudnn_func
=
[
&
](
void
*
cudnn_workspace
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionBackwardData
(
handle
,
&
alpha
,
cudnn_filter_desc
,
filter_data
+
filter_offset
*
g
,
cudnn_input_desc
,
input_data
+
input_offset
*
g
,
cudnn_conv_desc
,
algo
,
cudnn_workspace
,
workspace_size_in_bytes
,
&
beta
,
cudnn_output_desc
,
output_data
+
output_offset
*
g
));
};
dev_ctx
.
RunCudnnFuncWithWorkspace
(
cudnn_func
,
workspace_size_in_bytes
);
}
}
};
...
...
@@ -202,9 +201,6 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
std
::
max
(
workspace_size_in_bytes
,
bwd_filter_ws_size
);
}
// ------------------- cudnn conv workspace ---------------------
// Get cudnn workspace
void
*
cudnn_workspace
=
dev_ctx
.
cudnn_workspace
(
workspace_size_in_bytes
);
// ------------------- cudnn conv backward data ---------------------
// FIXME(typhoonzero): template type T may not be the same as cudnn call.
int
input_offset
=
input
->
numel
()
/
input
->
dims
()[
0
]
/
groups
;
...
...
@@ -216,12 +212,15 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
T
*
input_grad_data
=
input_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
// Because beta is zero, it is unnecessary to reset input_grad.
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionForward
(
handle
,
&
alpha
,
cudnn_output_desc
,
output_grad_data
+
output_grad_offset
*
g
,
cudnn_filter_desc
,
filter_data
+
filter_offset
*
g
,
cudnn_conv_desc
,
data_algo
,
cudnn_workspace
,
workspace_size_in_bytes
,
&
beta
,
cudnn_input_desc
,
input_grad_data
+
input_offset
*
g
));
auto
cudnn_func
=
[
&
](
void
*
cudnn_workspace
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionForward
(
handle
,
&
alpha
,
cudnn_output_desc
,
output_grad_data
+
output_grad_offset
*
g
,
cudnn_filter_desc
,
filter_data
+
filter_offset
*
g
,
cudnn_conv_desc
,
data_algo
,
cudnn_workspace
,
workspace_size_in_bytes
,
&
beta
,
cudnn_input_desc
,
input_grad_data
+
input_offset
*
g
));
};
dev_ctx
.
RunCudnnFuncWithWorkspace
(
cudnn_func
,
workspace_size_in_bytes
);
}
}
...
...
@@ -231,12 +230,15 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
// Because beta is zero, it is unnecessary to reset filter_grad.
// Gradient with respect to the filter
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionBackwardFilter
(
handle
,
&
alpha
,
cudnn_output_desc
,
output_grad_data
+
output_grad_offset
*
g
,
cudnn_input_desc
,
input_data
+
input_offset
*
g
,
cudnn_conv_desc
,
filter_algo
,
cudnn_workspace
,
workspace_size_in_bytes
,
&
beta
,
cudnn_filter_desc
,
filter_grad_data
+
filter_offset
*
g
));
// Backward-filter step for group g, packaged as a callback so that the
// device context can supply the workspace pointer when it invokes it.
// The parameter must be named cudnn_workspace: that is the name the body
// forwards to cuDNN (same convention as the other cudnn_func lambdas in
// the conv / conv-transpose kernels).
auto cudnn_func = [&](void* cudnn_workspace) {
  CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
      handle, &alpha, cudnn_output_desc,
      output_grad_data + output_grad_offset * g, cudnn_input_desc,
      input_data + input_offset * g, cudnn_conv_desc, filter_algo,
      cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_filter_desc,
      filter_grad_data + filter_offset * g));
};
dev_ctx
.
RunCudnnFuncWithWorkspace
(
cudnn_func
,
workspace_size_in_bytes
);
}
}
}
...
...
paddle/fluid/platform/device_context.cc
浏览文件 @
b0aca882
...
...
@@ -15,6 +15,10 @@ limitations under the License. */
#include <unordered_set>
#include <vector>
#ifdef PADDLE_WITH_CUDA
// Boost.Thread header for the CUDA build (CudnnHolder uses
// boost::shared_mutex and boost::upgrade_lock). Header names must use
// forward slashes; backslashes are not portable across toolchains.
#include <boost/thread/thread.hpp>
#endif
#include "paddle/fluid/memory/memory.h"
namespace
paddle
{
...
...
@@ -150,32 +154,45 @@ class CudnnHolder {
PADDLE_ENFORCE
(
dynload
::
cudnnSetStream
(
cudnn_handle_
,
*
stream_
));
}
cudnnHandle_t
get_cudnn_handle
()
const
{
return
cudnn_handle_
;
}
void
*
get_workspace
(
size_t
required_len
)
{
if
(
required_len
>
workspace_len_
)
{
void
*
new_workspace
=
paddle
::
memory
::
Alloc
(
place_
,
required_len
);
if
(
workspace_
!=
nullptr
)
{
// Maybe someone is using the current workspace
PADDLE_ENFORCE
(
cudaStreamSynchronize
(
*
stream_
));
PADDLE_ENFORCE
(
cudaGetLastError
());
paddle
::
memory
::
Free
(
place_
,
workspace_
);
}
workspace_
=
new_workspace
;
workspace_len_
=
required_len
;
// Returns the cuDNN handle stored in this holder (cudnn_handle_).
cudnnHandle_t
cudnn_handle
()
const
{
return
cudnn_handle_
;
}
// Runs cudnn_func with this holder's workspace pointer.
//
// cudnn_func: callback that receives workspace_ as its only argument.
// required_workspace_len: minimum workspace size the callback needs; if
//   the current workspace_len_ is smaller, the workspace is reallocated
//   before the callback runs.
//
// An upgrade_lock on mtx_ is held for the whole call, including the
// callback, and is handed to ReallocateWorkspace when growth is needed.
void RunFunc(const std::function<void(void*)>& cudnn_func,
             size_t required_workspace_len) {
  boost::upgrade_lock<boost::shared_mutex> shared_lock(mtx_);
  if (required_workspace_len > workspace_len_) {
    ReallocateWorkspace(required_workspace_len, &shared_lock);
  }
  // No value is returned: this function is void and its job is to invoke
  // the callback with the (possibly just reallocated) workspace.
  cudnn_func(workspace_);
}
// Destructor: destroys cudnn_handle_ via dynload::cudnnDestroy; the call's
// result is checked with PADDLE_ENFORCE.
~
CudnnHolder
()
{
PADDLE_ENFORCE
(
dynload
::
cudnnDestroy
(
cudnn_handle_
));
}
private:
// Replaces the workspace with a new allocation of required_workspace_len
// bytes when the current one is smaller.
//
// required_workspace_len: requested workspace size.
// lock: the caller's upgrade_lock on the holder's shared_mutex; it is
//   converted to an upgrade_to_unique_lock for the duration of this call.
void ReallocateWorkspace(size_t required_workspace_len,
                         boost::upgrade_lock<boost::shared_mutex>* lock) {
  boost::upgrade_to_unique_lock<boost::shared_mutex> unique_lock(*lock);
  // Re-check the size after taking the unique lock; return early if the
  // workspace is already large enough.
  if (required_workspace_len <= workspace_len_) {
    return;
  }
  // Allocate the replacement first; the old buffer is released only after
  // the stream has been synchronized.
  void* new_workspace = paddle::memory::Alloc(place_, required_workspace_len);
  if (workspace_ != nullptr) {
    // Maybe someone is using the current workspace
    PADDLE_ENFORCE(cudaStreamSynchronize(*stream_));
    PADDLE_ENFORCE(cudaGetLastError());
    paddle::memory::Free(place_, workspace_);
  }
  workspace_ = new_workspace;
  workspace_len_ = required_workspace_len;
}
cudnnHandle_t
cudnn_handle_
;
void
*
workspace_
;
size_t
workspace_len_
;
const
cudaStream_t
*
stream_
;
// not owned;
const
CUDAPlace
place_
;
boost
::
shared_mutex
mtx_
;
};
CUDADeviceContext
::
CUDADeviceContext
(
CUDAPlace
place
)
...
...
@@ -228,11 +245,12 @@ cublasHandle_t CUDADeviceContext::cublas_handle() const {
}
cudnnHandle_t
CUDADeviceContext
::
cudnn_handle
()
const
{
return
cudnn_holder_
->
get_
cudnn_handle
();
return
cudnn_holder_
->
cudnn_handle
();
}
void
*
CUDADeviceContext
::
cudnn_workspace
(
size_t
required_len
)
const
{
return
cudnn_holder_
->
get_workspace
(
required_len
);
// Forwards cudnn_func and workspace_len to cudnn_holder_->RunFunc, which
// invokes the callback with the holder's workspace pointer.
void
CUDADeviceContext
::
RunCudnnFuncWithWorkspace
(
const
std
::
function
<
void
(
void
*
)
>&
cudnn_func
,
size_t
workspace_len
)
const
{
cudnn_holder_
->
RunFunc
(
cudnn_func
,
workspace_len
);
}
// Returns the CUDA stream held by this device context (stream_).
cudaStream_t
CUDADeviceContext
::
stream
()
const
{
return
stream_
;
}
...
...
paddle/fluid/platform/device_context.h
浏览文件 @
b0aca882
...
...
@@ -97,9 +97,10 @@ class CUDADeviceContext : public DeviceContext {
/*! \brief Return cudnn handle in the device context. */
cudnnHandle_t
cudnn_handle
()
const
;
/*! \brief Return a cudnn workspace whose length is greater than the
* 'required_len'. */
void
*
cudnn_workspace
(
size_t
required_len
)
const
;
/*! \brief Run a cudnn function with the workspace provided by
* CUDADeviceContext */
void
RunCudnnFuncWithWorkspace
(
const
std
::
function
<
void
(
void
*
)
>&
cudnn_func
,
size_t
workspace_len
)
const
;
/*! \brief Return cuda stream in the device context. */
cudaStream_t
stream
()
const
;
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录