Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
b4aca8ed
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
b4aca8ed
编写于
1月 16, 2019
作者:
C
chengduozh
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
remove workspace_handle
test=develop
上级
01dc15ce
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
90 addition
and
75 deletion
+90
-75
paddle/fluid/operators/conv_fusion_op.cu.cc
paddle/fluid/operators/conv_fusion_op.cu.cc
+39
-26
paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
+29
-28
paddle/fluid/operators/fused/fusion_conv_inception_op.cu
paddle/fluid/operators/fused/fusion_conv_inception_op.cu
+12
-11
paddle/fluid/operators/warpctc_cudnn_op.cu.cc
paddle/fluid/operators/warpctc_cudnn_op.cu.cc
+10
-10
未找到文件。
paddle/fluid/operators/conv_fusion_op.cu.cc
浏览文件 @
b4aca8ed
...
@@ -104,7 +104,9 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
...
@@ -104,7 +104,9 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
// ------------------- cudnn conv algorithm ---------------------
// ------------------- cudnn conv algorithm ---------------------
cudnnConvolutionFwdAlgo_t
algo
;
cudnnConvolutionFwdAlgo_t
algo
;
auto
handle
=
dev_ctx
.
cudnn_handle
();
auto
handle
=
dev_ctx
.
cudnn_handle
();
auto
workspace_handle
=
dev_ctx
.
cudnn_workspace_handle
();
Tensor
cudnn_workspace
;
void
*
cudnn_workspace_ptr
=
nullptr
;
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetConvolutionMathType
(
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetConvolutionMathType
(
cudnn_conv_desc
,
CUDNN_DEFAULT_MATH
));
cudnn_conv_desc
,
CUDNN_DEFAULT_MATH
));
...
@@ -118,19 +120,24 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
...
@@ -118,19 +120,24 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
workspace_size_limit
,
&
algo
));
workspace_size_limit
,
&
algo
));
VLOG
(
3
)
<<
"cuDNN forward algo "
<<
algo
;
VLOG
(
3
)
<<
"cuDNN forward algo "
<<
algo
;
}
else
{
}
else
{
cudnn_workspace
=
ctx
.
AllocateTmpTensor
<
int8_t
,
platform
::
CUDADeviceContext
>
(
framework
::
make_ddim
(
{
static_cast
<
int64_t
>
(
workspace_size_limit
)}),
dev_ctx
);
cudnn_workspace_ptr
=
static_cast
<
void
*>
(
cudnn_workspace
.
data
<
int8_t
>
());
auto
search_func
=
[
&
]()
{
auto
search_func
=
[
&
]()
{
int
returned_algo_count
;
int
returned_algo_count
;
std
::
array
<
cudnnConvolutionFwdAlgoPerf_t
,
kNUM_CUDNN_FWD_ALGS
>
std
::
array
<
cudnnConvolutionFwdAlgoPerf_t
,
kNUM_CUDNN_FWD_ALGS
>
fwd_perf_stat
;
fwd_perf_stat
;
auto
cudnn_find_func
=
[
&
](
void
*
cudnn_workspace
)
{
CUDNN_ENFORCE
(
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnFindConvolutionForwardAlgorithmEx
(
platform
::
dynload
::
cudnnFindConvolutionForwardAlgorithmEx
(
handle
,
cudnn_input_desc
,
input_data
,
cudnn_filter_desc
,
handle
,
cudnn_input_desc
,
input_data
,
cudnn_filter_desc
,
filter_data
,
cudnn_conv_desc
,
cudnn_output_desc
,
output_data
,
filter_data
,
cudnn_conv_desc
,
cudnn_output_desc
,
output_data
,
kNUM_CUDNN_FWD_ALGS
,
&
returned_algo_count
,
fwd_perf_stat
.
data
(),
kNUM_CUDNN_FWD_ALGS
,
&
returned_algo_count
,
cudnn_workspace_ptr
,
workspace_size_limit
));
fwd_perf_stat
.
data
(),
cudnn_workspace
,
workspace_size_limit
));
};
workspace_handle
.
RunFunc
(
cudnn_find_func
,
workspace_size_limit
);
VLOG
(
3
)
<<
"Perf result: (algo: stat, time, memory)"
;
VLOG
(
3
)
<<
"Perf result: (algo: stat, time, memory)"
;
for
(
int
i
=
0
;
i
<
returned_algo_count
;
++
i
)
{
for
(
int
i
=
0
;
i
<
returned_algo_count
;
++
i
)
{
const
auto
&
stat
=
fwd_perf_stat
[
i
];
const
auto
&
stat
=
fwd_perf_stat
[
i
];
...
@@ -181,6 +188,15 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
...
@@ -181,6 +188,15 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_LE
(
workspace_size_in_bytes
,
workspace_size_limit
,
PADDLE_ENFORCE_LE
(
workspace_size_in_bytes
,
workspace_size_limit
,
"workspace_size to be allocated exceeds the limit"
);
"workspace_size to be allocated exceeds the limit"
);
if
(
!
cudnn_workspace_ptr
)
{
cudnn_workspace
=
ctx
.
AllocateTmpTensor
<
int8_t
,
platform
::
CUDADeviceContext
>
(
framework
::
make_ddim
(
{
static_cast
<
int64_t
>
(
workspace_size_in_bytes
)}),
dev_ctx
);
cudnn_workspace_ptr
=
static_cast
<
void
*>
(
cudnn_workspace
.
data
<
int8_t
>
());
}
if
((
activation
==
"identity"
)
&&
(
!
residual
))
{
if
((
activation
==
"identity"
)
&&
(
!
residual
))
{
// Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is
// Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is
// enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib.
// enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib.
...
@@ -188,13 +204,12 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
...
@@ -188,13 +204,12 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
// cudnnConvolutionForward and cudnnAddTensor
// cudnnConvolutionForward and cudnnAddTensor
// ------------- cudnn conv forward and bias add ---------------------
// ------------- cudnn conv forward and bias add ---------------------
ScalingParamType
<
T
>
alpha
=
1.0
f
,
beta
=
0.0
f
;
ScalingParamType
<
T
>
alpha
=
1.0
f
,
beta
=
0.0
f
;
auto
cudnn_func
=
[
&
](
void
*
cudnn_workspace
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionForward
(
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionForward
(
handle
,
&
alpha
,
cudnn_input_desc
,
input_data
,
cudnn_filter_desc
,
handle
,
&
alpha
,
cudnn_input_desc
,
input_data
,
cudnn_filter_desc
,
filter_data
,
cudnn_conv_desc
,
algo
,
cudnn_workspace
,
filter_data
,
cudnn_conv_desc
,
algo
,
cudnn_workspace_ptr
,
workspace_size_in_bytes
,
&
beta
,
cudnn_output_desc
,
output_data
));
workspace_size_in_bytes
,
&
beta
,
cudnn_output_desc
,
output_data
));
};
workspace_handle
.
RunFunc
(
cudnn_func
,
workspace_size_in_bytes
);
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnAddTensor
(
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnAddTensor
(
handle
,
&
alpha
,
cudnn_bias_desc
,
bias_data
,
&
alpha
,
cudnn_output_desc
,
handle
,
&
alpha
,
cudnn_bias_desc
,
bias_data
,
&
alpha
,
cudnn_output_desc
,
output_data
));
output_data
));
...
@@ -205,15 +220,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
...
@@ -205,15 +220,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
// ------------------- cudnn conv+bias+act forward --------------------
// ------------------- cudnn conv+bias+act forward --------------------
ScalingParamType
<
T
>
alpha1
=
1.0
f
;
ScalingParamType
<
T
>
alpha1
=
1.0
f
;
ScalingParamType
<
T
>
alpha2
=
residual
?
1.0
f
:
0.0
f
;
ScalingParamType
<
T
>
alpha2
=
residual
?
1.0
f
:
0.0
f
;
auto
cudnn_func
=
[
&
](
void
*
cudnn_workspace
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionBiasActivationForward
(
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionBiasActivationForward
(
handle
,
&
alpha1
,
cudnn_input_desc
,
input_data
,
cudnn_filter_desc
,
handle
,
&
alpha1
,
cudnn_input_desc
,
input_data
,
cudnn_filter_desc
,
filter_data
,
cudnn_conv_desc
,
algo
,
cudnn_workspace
,
filter_data
,
cudnn_conv_desc
,
algo
,
cudnn_workspace_ptr
,
workspace_size_in_bytes
,
&
alpha2
,
cudnn_output_desc
,
residual_data
,
workspace_size_in_bytes
,
&
alpha2
,
cudnn_output_desc
,
residual_data
,
cudnn_bias_desc
,
bias_data
,
cudnn_act_desc
,
cudnn_output_desc
,
cudnn_bias_desc
,
bias_data
,
cudnn_act_desc
,
cudnn_output_desc
,
output_data
));
output_data
));
};
workspace_handle
.
RunFunc
(
cudnn_func
,
workspace_size_in_bytes
);
}
}
std
::
vector
<
int
>
channels
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"split_channels"
);
std
::
vector
<
int
>
channels
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"split_channels"
);
if
(
channels
.
size
())
{
if
(
channels
.
size
())
{
...
...
paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
浏览文件 @
b4aca8ed
...
@@ -104,16 +104,18 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
...
@@ -104,16 +104,18 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
int
output_offset
=
output
->
numel
()
/
output
->
dims
()[
0
]
/
groups
;
int
output_offset
=
output
->
numel
()
/
output
->
dims
()[
0
]
/
groups
;
int
filter_offset
=
filter
->
numel
()
/
groups
;
int
filter_offset
=
filter
->
numel
()
/
groups
;
T
alpha
=
1.0
f
,
beta
=
0.0
f
;
T
alpha
=
1.0
f
,
beta
=
0.0
f
;
auto
workspace_handle
=
dev_ctx
.
cudnn_workspace_handle
();
auto
temp_allocation
=
platform
::
DeviceTemporaryAllocator
::
Instance
().
Get
(
dev_ctx
).
Allocate
(
workspace_size_in_bytes
);
void
*
cudnn_workspace
=
temp_allocation
->
ptr
();
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
auto
cudnn_func
=
[
&
](
void
*
cudnn_workspace
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionBackwardData
(
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionBackwardData
(
handle
,
&
alpha
,
cudnn_filter_desc
,
filter_data
+
filter_offset
*
g
,
handle
,
&
alpha
,
cudnn_filter_desc
,
filter_data
+
filter_offset
*
g
,
cudnn_input_desc
,
input_data
+
input_offset
*
g
,
cudnn_conv_desc
,
cudnn_input_desc
,
input_data
+
input_offset
*
g
,
cudnn_conv_desc
,
algo
,
cudnn_workspace
,
workspace_size_in_bytes
,
&
beta
,
algo
,
cudnn_workspace
,
workspace_size_in_bytes
,
&
beta
,
cudnn_output_desc
,
output_data
+
output_offset
*
g
));
cudnn_output_desc
,
output_data
+
output_offset
*
g
));
};
workspace_handle
.
RunFunc
(
cudnn_func
,
workspace_size_in_bytes
);
}
}
}
}
};
};
...
@@ -209,20 +211,22 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
...
@@ -209,20 +211,22 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
output_grad
->
numel
()
/
output_grad
->
dims
()[
0
]
/
groups
;
output_grad
->
numel
()
/
output_grad
->
dims
()[
0
]
/
groups
;
int
filter_offset
=
filter
->
numel
()
/
groups
;
int
filter_offset
=
filter
->
numel
()
/
groups
;
T
alpha
=
1.0
f
,
beta
=
0.0
f
;
T
alpha
=
1.0
f
,
beta
=
0.0
f
;
auto
workspace_handle
=
dev_ctx
.
cudnn_workspace_handle
();
auto
temp_allocation
=
platform
::
DeviceTemporaryAllocator
::
Instance
().
Get
(
dev_ctx
).
Allocate
(
workspace_size_in_bytes
);
void
*
cudnn_workspace
=
temp_allocation
->
ptr
();
if
(
input_grad
)
{
if
(
input_grad
)
{
T
*
input_grad_data
=
input_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
T
*
input_grad_data
=
input_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
// Because beta is zero, it is unnecessary to reset input_grad.
// Because beta is zero, it is unnecessary to reset input_grad.
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
auto
cudnn_func
=
[
&
](
void
*
cudnn_workspace
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionForward
(
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionForward
(
handle
,
&
alpha
,
cudnn_output_desc
,
handle
,
&
alpha
,
cudnn_output_desc
,
output_grad_data
+
output_grad_offset
*
g
,
cudnn_filter_desc
,
output_grad_data
+
output_grad_offset
*
g
,
cudnn_filter_desc
,
filter_data
+
filter_offset
*
g
,
cudnn_conv_desc
,
data_algo
,
filter_data
+
filter_offset
*
g
,
cudnn_conv_desc
,
data_algo
,
cudnn_workspace
,
workspace_size_in_bytes
,
&
beta
,
cudnn_input_desc
,
cudnn_workspace
,
workspace_size_in_bytes
,
&
beta
,
cudnn_input_desc
,
input_grad_data
+
input_offset
*
g
));
input_grad_data
+
input_offset
*
g
));
};
workspace_handle
.
RunFunc
(
cudnn_func
,
workspace_size_in_bytes
);
}
}
}
}
...
@@ -232,15 +236,12 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
...
@@ -232,15 +236,12 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
// Because beta is zero, it is unnecessary to reset filter_grad.
// Because beta is zero, it is unnecessary to reset filter_grad.
// Gradient with respect to the filter
// Gradient with respect to the filter
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
auto
cudnn_func
=
[
&
](
void
*
cudnn_workspace
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionBackwardFilter
(
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionBackwardFilter
(
handle
,
&
alpha
,
cudnn_output_desc
,
handle
,
&
alpha
,
cudnn_output_desc
,
output_grad_data
+
output_grad_offset
*
g
,
cudnn_input_desc
,
output_grad_data
+
output_grad_offset
*
g
,
cudnn_input_desc
,
input_data
+
input_offset
*
g
,
cudnn_conv_desc
,
filter_algo
,
input_data
+
input_offset
*
g
,
cudnn_conv_desc
,
filter_algo
,
cudnn_workspace
,
workspace_size_in_bytes
,
&
beta
,
cudnn_filter_desc
,
cudnn_workspace
,
workspace_size_in_bytes
,
&
beta
,
filter_grad_data
+
filter_offset
*
g
));
cudnn_filter_desc
,
filter_grad_data
+
filter_offset
*
g
));
};
workspace_handle
.
RunFunc
(
cudnn_func
,
workspace_size_in_bytes
);
}
}
}
}
}
}
...
...
paddle/fluid/operators/fused/fusion_conv_inception_op.cu
浏览文件 @
b4aca8ed
...
@@ -216,18 +216,19 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
...
@@ -216,18 +216,19 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
out_datas
.
push_back
(
out_datas
.
push_back
(
static_cast
<
void
*>
(
output_data
+
(
oc0
+
oc1
+
oc2
)
*
h
*
w
));
static_cast
<
void
*>
(
output_data
+
(
oc0
+
oc1
+
oc2
)
*
h
*
w
));
auto
temp_allocation
=
platform
::
DeviceTemporaryAllocator
::
Instance
().
Get
(
dev_ctx
).
Allocate
(
workspace_size_in_bytes
);
void
*
cudnn_workspace
=
temp_allocation
->
ptr
();
for
(
int
i
=
0
;
i
<
4
;
++
i
)
{
for
(
int
i
=
0
;
i
<
4
;
++
i
)
{
auto
func
=
[
&
](
void
*
cudnn_workspace
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionBiasActivationForward
(
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionBiasActivationForward
(
handle
,
&
alpha
,
in_desc
[
i
],
in_datas
[
i
],
filter_desc
[
i
],
handle
,
&
alpha
,
in_desc
[
i
],
in_datas
[
i
],
filter_desc
[
i
],
static_cast
<
const
void
*>
(
filters
[
i
]
->
data
<
T
>
()),
conv_desc
[
i
],
static_cast
<
const
void
*>
(
filters
[
i
]
->
data
<
T
>
()),
conv_desc
[
i
],
algo
[
i
],
cudnn_workspace
,
workspace_size_in_bytes
,
&
beta
,
out_desc
[
i
],
algo
[
i
],
cudnn_workspace
,
workspace_size_in_bytes
,
&
beta
,
out_datas
[
i
],
bias_desc
[
i
],
out_desc
[
i
],
out_datas
[
i
],
bias_desc
[
i
],
static_cast
<
const
void
*>
(
bias
[
i
]
->
data
<
T
>
()),
cudnn_act_desc
,
static_cast
<
const
void
*>
(
bias
[
i
]
->
data
<
T
>
()),
cudnn_act_desc
,
out_desc
[
i
],
out_datas
[
i
]));
out_desc
[
i
],
out_datas
[
i
]));
};
auto
workspace_handle
=
dev_ctx
.
cudnn_workspace_handle
();
workspace_handle
.
RunFunc
(
func
,
workspace_size_in_bytes
);
}
}
cudnnTensorDescriptor_t
x_desc
;
cudnnTensorDescriptor_t
x_desc
;
...
...
paddle/fluid/operators/warpctc_cudnn_op.cu.cc
浏览文件 @
b4aca8ed
...
@@ -145,16 +145,16 @@ class CudnnCTCKernel : public framework::OpKernel<T> {
...
@@ -145,16 +145,16 @@ class CudnnCTCKernel : public framework::OpKernel<T> {
T
*
loss_data
=
loss
->
mutable_data
<
T
>
(
loss_dims
,
ctx
.
GetPlace
());
T
*
loss_data
=
loss
->
mutable_data
<
T
>
(
loss_dims
,
ctx
.
GetPlace
());
auto
workspace_handle
=
dev_ctx
.
cudnn_workspace_handle
();
auto
temp_allocation
=
auto
cudnn_func
=
[
&
](
void
*
cudnn_workspace
)
{
platform
::
DeviceTemporaryAllocator
::
Instance
().
Get
(
dev_ctx
).
Allocate
(
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnCTCLoss
(
workspace_size
);
handle
,
cu_logits_desc
,
warpctc_logits_data
,
warpctc_label_data
,
void
*
cudnn_workspace
=
temp_allocation
->
ptr
();
warpctc_label_lengths
.
data
(),
warpctc_logits_lengths
.
data
(),
loss_data
,
cu_grad_desc
,
warpctc_grad_data
,
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnCTCLoss
(
CUDNN_CTC_LOSS_ALGO_DETERMINISTIC
,
cu_ctcloss_desc
,
cudnn_workspace
,
handle
,
cu_logits_desc
,
warpctc_logits_data
,
warpctc_label_data
,
workspace_size
));
warpctc_label_lengths
.
data
(),
warpctc_logits_lengths
.
data
(),
loss_data
,
};
cu_grad_desc
,
warpctc_grad_data
,
CUDNN_CTC_LOSS_ALGO_DETERMINISTIC
,
workspace_handle
.
RunFunc
(
cudnn_func
,
workspace_size
);
cu_ctcloss_desc
,
cudnn_workspace
,
workspace_size
)
);
}
}
};
};
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录