Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
3e2dec5b
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
3e2dec5b
编写于
10月 12, 2021
作者:
Z
Zhang Zheng
提交者:
GitHub
10月 12, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Change the input param of fusion op interface from pointer to tensor (#36349)
上级
fba355fb
变更
5
隐藏空白更改
内联
并排
Showing
5 changed file
with
161 addition
and
122 deletion
+161
-122
paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc
paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc
+17
-47
paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h
paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h
+18
-6
paddle/fluid/operators/fused/cudnn_norm_conv.cu.h
paddle/fluid/operators/fused/cudnn_norm_conv.cu.h
+74
-20
paddle/fluid/operators/fused/cudnn_norm_conv_test.cc
paddle/fluid/operators/fused/cudnn_norm_conv_test.cc
+20
-41
paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h
paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h
+32
-8
未找到文件。
paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc
浏览文件 @
3e2dec5b
...
...
@@ -536,32 +536,20 @@ class CudnnBNAddReluTester {
bn_bias
->
Resize
({
1
,
1
,
1
,
channels_
});
// input
float
*
sum_ptr
=
sum
->
data
<
float
>
();
float
*
sum_of_square_ptr
=
sum_of_square
->
data
<
float
>
();
float
*
bn_scale_ptr
=
bn_scale
->
data
<
float
>
();
float
*
bn_bias_ptr
=
bn_bias
->
data
<
float
>
();
mean
->
Resize
({
1
,
1
,
1
,
channels_
});
var
->
Resize
({
1
,
1
,
1
,
channels_
});
// output
float
*
mean_ptr
=
mean
->
data
<
float
>
();
float
*
var_ptr
=
var
->
data
<
float
>
();
float
*
saved_mean_ptr
=
saved_mean
->
mutable_data
<
float
>
({
1
,
1
,
1
,
channels_
},
place
);
float
*
saved_var_ptr
=
saved_var
->
mutable_data
<
float
>
({
1
,
1
,
1
,
channels_
},
place
);
T
*
equiv_scale_ptr
=
equiv_scale
->
mutable_data
<
T
>
({
1
,
1
,
1
,
channels_
},
place
);
T
*
equiv_bias_ptr
=
equiv_bias
->
mutable_data
<
T
>
({
1
,
1
,
1
,
channels_
},
place
);
equiv_scale
->
Resize
({
1
,
1
,
1
,
channels_
});
equiv_bias
->
Resize
({
1
,
1
,
1
,
channels_
});
saved_mean
->
Resize
({
1
,
1
,
1
,
channels_
});
saved_var
->
Resize
({
1
,
1
,
1
,
channels_
});
auto
param_shape
=
framework
::
vectorize
<
int
>
(
bn_scale
->
dims
());
op
::
CudnnBNStatsFinalize
<
T
>
bn_op
(
ctx
,
param_shape
);
bn_op
.
Forward
(
ctx
,
sum_ptr
,
sum_of_square_ptr
,
bn_scale_ptr
,
bn_bias_ptr
,
saved_mean_ptr
,
saved_var_ptr
,
mean_ptr
,
var_ptr
,
equiv_scale_ptr
,
equiv_bias_ptr
,
eps_
,
momentum_
,
ele_count_
,
true
);
bn_op
.
Forward
(
ctx
,
*
sum
,
*
sum_of_square
,
*
bn_scale
,
*
bn_bias
,
saved_mean
,
saved_var
,
mean
,
var
,
equiv_scale
,
equiv_bias
,
eps_
,
momentum_
,
ele_count_
,
true
);
}
// Get forward results of CudnnBNStatsFinalize + CudnnScaleBiasAddRelu
...
...
@@ -627,21 +615,13 @@ class CudnnBNAddReluTester {
&
saved_var_z
,
&
equiv_scale_z
,
&
equiv_bias_z
);
}
T
*
x_ptr
=
x
.
data
<
T
>
();
T
*
z_ptr
=
(
fuse_add_
||
has_shortcut_
)
?
z
.
data
<
T
>
()
:
nullptr
;
T
*
equiv_scale_x_ptr
=
equiv_scale_x
.
data
<
T
>
();
T
*
equiv_bias_x_ptr
=
equiv_bias_x
.
data
<
T
>
();
T
*
equiv_scale_z_ptr
=
has_shortcut_
?
equiv_scale_z
.
data
<
T
>
()
:
nullptr
;
T
*
equiv_bias_z_ptr
=
has_shortcut_
?
equiv_bias_z
.
data
<
T
>
()
:
nullptr
;
T
*
y_ptr
=
y
.
mutable_data
<
T
>
({
batch_size_
,
height_
,
width_
,
channels_
},
place
);
y
.
Resize
(
framework
::
make_ddim
({
batch_size_
,
height_
,
width_
,
channels_
}));
int
c
=
channels_
;
int64_t
nhw
=
ele_count_
;
int32_t
c_int32_elems
=
((
c
+
63
)
&
~
63
)
/
32
;
int32_t
nhw_int32_elems
=
(
nhw
+
31
)
&
~
31
;
int32_t
*
bitmask_ptr
=
bitmask
.
mutable_data
<
int32_t
>
(
{
nhw_int32_elems
,
c_int32_elems
,
1
},
place
);
bitmask
.
Resize
(
framework
::
make_ddim
({
nhw_int32_elems
,
c_int32_elems
,
1
}));
auto
data_shape
=
framework
::
vectorize
<
int
>
(
x
.
dims
());
auto
param_shape
=
framework
::
vectorize
<
int
>
(
bn_scale_x
.
dims
());
...
...
@@ -651,8 +631,8 @@ class CudnnBNAddReluTester {
op
::
CudnnScaleBiasAddRelu
<
T
>
sbar_op
(
ctx
,
act_type_
,
fuse_add_
,
has_shortcut_
,
data_shape
,
param_shape
,
bitmask_shape
);
sbar_op
.
Forward
(
ctx
,
x
_ptr
,
equiv_scale_x_ptr
,
equiv_bias_x_ptr
,
y_ptr
,
bitmask_ptr
,
z_ptr
,
equiv_scale_z_ptr
,
equiv_bias_z_ptr
);
sbar_op
.
Forward
(
ctx
,
x
,
equiv_scale_x
,
equiv_bias_x
,
z
,
equiv_scale_z
,
equiv_bias_z
,
&
y
,
&
bitmask
);
TensorCopySync
(
mean_x
,
platform
::
CPUPlace
(),
cpu_mean_x
);
TensorCopySync
(
var_x
,
platform
::
CPUPlace
(),
cpu_var_x
);
...
...
@@ -697,19 +677,10 @@ class CudnnBNAddReluTester {
saved_mean
.
Resize
({
1
,
1
,
1
,
channels_
});
saved_var
.
Resize
({
1
,
1
,
1
,
channels_
});
T
*
dy_ptr
=
dy
.
data
<
T
>
();
T
*
x_ptr
=
x
.
data
<
T
>
();
float
*
bn_scale_ptr
=
bn_scale
.
data
<
float
>
();
float
*
bn_bias_ptr
=
bn_bias
.
data
<
float
>
();
float
*
saved_mean_ptr
=
saved_mean
.
data
<
float
>
();
float
*
saved_var_ptr
=
saved_var
.
data
<
float
>
();
int32_t
*
bitmask_ptr
=
bitmask
.
data
<
int32_t
>
();
T
*
dx_ptr
=
dx
.
mutable_data
<
T
>
({
batch_size_
,
height_
,
width_
,
channels_
},
place
);
T
*
dz_ptr
=
dz
.
mutable_data
<
T
>
({
batch_size_
,
height_
,
width_
,
channels_
},
place
);
float
*
dscale_ptr
=
dscale
.
mutable_data
<
float
>
({
1
,
1
,
1
,
channels_
},
place
);
float
*
dbias_ptr
=
dbias
.
mutable_data
<
float
>
({
1
,
1
,
1
,
channels_
},
place
);
dx
.
Resize
(
framework
::
make_ddim
({
batch_size_
,
height_
,
width_
,
channels_
}));
dz
.
Resize
(
framework
::
make_ddim
({
batch_size_
,
height_
,
width_
,
channels_
}));
dscale
.
Resize
(
framework
::
make_ddim
({
1
,
1
,
1
,
channels_
}));
dbias
.
Resize
(
framework
::
make_ddim
({
1
,
1
,
1
,
channels_
}));
auto
data_shape
=
framework
::
vectorize
<
int
>
(
x
.
dims
());
auto
param_shape
=
framework
::
vectorize
<
int
>
(
bn_scale
.
dims
());
...
...
@@ -718,9 +689,8 @@ class CudnnBNAddReluTester {
std
::
string
act_type
=
"relu"
;
op
::
CudnnScaleBiasAddRelu
<
T
>
sbar_op
(
ctx
,
act_type
,
true
,
false
,
data_shape
,
param_shape
,
bitmask_shape
);
sbar_op
.
Backward
(
ctx
,
dy_ptr
,
x_ptr
,
bn_scale_ptr
,
bn_bias_ptr
,
saved_mean_ptr
,
saved_var_ptr
,
bitmask_ptr
,
dx_ptr
,
dz_ptr
,
dscale_ptr
,
dbias_ptr
,
eps_
);
sbar_op
.
Backward
(
ctx
,
dy
,
x
,
bn_scale
,
bn_bias
,
saved_mean
,
saved_var
,
bitmask
,
&
dx
,
&
dz
,
&
dscale
,
&
dbias
,
eps_
);
TensorCopySync
(
dx
,
platform
::
CPUPlace
(),
cpu_dx
);
TensorCopySync
(
dz
,
platform
::
CPUPlace
(),
cpu_dz
);
...
...
paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h
浏览文件 @
3e2dec5b
...
...
@@ -68,12 +68,13 @@ class CudnnBNStatsFinalize {
}
~
CudnnBNStatsFinalize
()
{}
void
Forward
(
const
platform
::
CUDADeviceContext
&
ctx
,
float
*
sum_ptr
,
float
*
sum_of_squares_ptr
,
float
*
scale_ptr
,
float
*
bias_ptr
,
float
*
saved_mean_ptr
,
float
*
saved_invstd_ptr
,
float
*
running_mean_ptr
,
float
*
running_var_ptr
,
T
*
equiv_scale_ptr
,
T
*
equiv_bias_ptr
,
double
eps
,
float
momentum
,
int64_t
ele_count
,
bool
is_train
)
{
void
Forward
(
const
platform
::
CUDADeviceContext
&
ctx
,
const
Tensor
&
sum
,
const
Tensor
&
sum_of_squares
,
const
Tensor
&
scale
,
const
Tensor
&
bias
,
Tensor
*
saved_mean
,
Tensor
*
saved_invstd
,
Tensor
*
running_mean
,
Tensor
*
running_var
,
Tensor
*
equiv_scale
,
Tensor
*
equiv_bias
,
double
eps
,
float
momentum
,
int64_t
ele_count
,
bool
is_train
)
{
auto
place
=
ctx
.
GetPlace
();
if
(
is_train
)
{
TrainInit
(
ctx
);
}
else
{
...
...
@@ -82,6 +83,17 @@ class CudnnBNStatsFinalize {
auto
&
op
=
is_train
?
train_op_
:
inference_op_
;
// Set variant_param for both inference_op_ and train_op_
float
*
sum_ptr
=
const_cast
<
float
*>
(
sum
.
data
<
float
>
());
float
*
sum_of_squares_ptr
=
const_cast
<
float
*>
(
sum_of_squares
.
data
<
float
>
());
float
*
scale_ptr
=
const_cast
<
float
*>
(
scale
.
data
<
float
>
());
float
*
bias_ptr
=
const_cast
<
float
*>
(
bias
.
data
<
float
>
());
float
*
saved_mean_ptr
=
saved_mean
->
mutable_data
<
float
>
(
place
);
float
*
saved_invstd_ptr
=
saved_invstd
->
mutable_data
<
float
>
(
place
);
float
*
running_mean_ptr
=
running_mean
->
mutable_data
<
float
>
(
place
);
float
*
running_var_ptr
=
running_var
->
mutable_data
<
float
>
(
place
);
T
*
equiv_scale_ptr
=
equiv_scale
->
mutable_data
<
T
>
(
place
);
T
*
equiv_bias_ptr
=
equiv_bias
->
mutable_data
<
T
>
(
place
);
op
.
SetOpVariantParamAttrPtr
(
CUDNN_PTR_BN_SCALE
,
scale_ptr
);
op
.
SetOpVariantParamAttrPtr
(
CUDNN_PTR_BN_BIAS
,
bias_ptr
);
op
.
SetOpVariantParamAttrPtr
(
CUDNN_PTR_BN_RUNNING_MEAN
,
running_mean_ptr
);
...
...
paddle/fluid/operators/fused/cudnn_norm_conv.cu.h
浏览文件 @
3e2dec5b
...
...
@@ -38,7 +38,8 @@ struct NormConvolutionArgs {
compute_type
=
platform
::
CudnnDataType
<
float
>::
type
;
}
void
Set
(
const
std
::
vector
<
int
>
&
input_shape
,
void
Set
(
const
platform
::
CUDADeviceContext
&
ctx
,
const
std
::
vector
<
int
>
&
input_shape
,
const
std
::
vector
<
int
>
&
filter_shape
,
const
std
::
vector
<
int
>
&
output_shape
,
int
padding
,
int
stride
,
int
dilation
,
int
group
)
{
...
...
@@ -61,12 +62,33 @@ struct NormConvolutionArgs {
"The filter_shape is expected to store as nhwc, and "
"h = w = 1 or 3. But recieved filter_shape is [%s]."
,
framework
::
make_ddim
(
filter_shape
)));
PADDLE_ENFORCE_EQ
((
filter_shape
[
0
]
%
32
==
0
&&
filter_shape
[
3
]
%
8
==
0
),
true
,
platform
::
errors
::
InvalidArgument
(
"The input channel is expected to be multiple of 8, "
"and the output channel is expected to be multiple "
"of 32. But recieved input channel is %d, output "
"channel is %d."
,
filter_shape
[
3
],
filter_shape
[
0
]));
PADDLE_ENFORCE_EQ
(
output_shape
.
size
(),
4U
,
platform
::
errors
::
InvalidArgument
(
"The size of output_shape is expected to 4. But recieved "
"filter_shape's size is %d, filter_shape is [%s]."
,
output_shape
.
size
(),
framework
::
make_ddim
(
output_shape
)));
is_support
=
IsSupport
(
ctx
,
filter_shape
,
stride
,
dilation
,
group
);
PADDLE_ENFORCE_EQ
(
is_support
,
true
,
platform
::
errors
::
InvalidArgument
(
"Current test is only supported in the platforms with "
"compatiblity greater than or equal to 70 and the kernel size "
"must be equal to 1 or 3. When the kernel size is 1, "
"the stride must be 1 if the compatiblity is equal to 70. "
"Besides, the dilation and group must be equal to 1. But recieved "
"compatiblity is %d, kernel size is %d, stride is %d, "
"dilation is %d, group is %d"
,
ctx
.
GetComputeCapability
(),
filter_shape
[
1
],
stride
,
dilation
,
group
));
for
(
size_t
i
=
0
;
i
<
input_shape
.
size
();
++
i
)
{
in_dims
.
push_back
(
input_shape
[
i
]);
...
...
@@ -89,6 +111,25 @@ struct NormConvolutionArgs {
conv_desc
.
set
(
dtype
,
paddings
,
strides
,
dilations
,
false
,
group
);
}
bool
IsSupport
(
const
platform
::
CUDADeviceContext
&
ctx
,
const
std
::
vector
<
int
>
&
filter_shape
,
int
stride
,
int
dilation
,
int
group
)
{
int
kernel_size
=
filter_shape
[
1
];
if
(
dilation
!=
1
||
group
!=
1
)
{
return
false
;
}
if
(
ctx
.
GetComputeCapability
()
==
70
)
{
if
((
kernel_size
==
3
)
||
((
kernel_size
==
1
)
&&
(
stride
==
1
)))
{
return
true
;
}
}
else
if
(
ctx
.
GetComputeCapability
()
>
70
)
{
if
((
kernel_size
==
3
)
||
(
kernel_size
==
1
))
{
return
true
;
}
}
return
false
;
}
cudnnDataType_t
dtype
;
cudnnTensorFormat_t
format
;
cudnnDataType_t
compute_type
;
...
...
@@ -104,6 +145,8 @@ struct NormConvolutionArgs {
platform
::
TensorDescriptor
out_desc
;
platform
::
TensorDescriptor
out_stats_desc
;
platform
::
ConvolutionDescriptor
conv_desc
;
bool
is_support
;
};
template
<
typename
T
>
...
...
@@ -115,15 +158,16 @@ class CudnnNormConvolution {
const
std
::
vector
<
int
>
&
output_shape
,
const
int
&
padding
,
const
int
&
stride
,
const
int
&
dilation
,
const
int
&
group
)
{
args_
.
Set
(
input_shape
,
filter_shape
,
output_shape
,
padding
,
stride
,
args_
.
Set
(
ctx
,
input_shape
,
filter_shape
,
output_shape
,
padding
,
stride
,
dilation
,
group
);
}
~
CudnnNormConvolution
()
{}
void
Forward
(
const
platform
::
CUDADeviceContext
&
ctx
,
T
*
input_ptr
,
T
*
filter_ptr
,
T
*
output_ptr
,
float
*
sum_ptr
,
float
*
sum_of_squares_ptr
)
{
void
Forward
(
const
platform
::
CUDADeviceContext
&
ctx
,
const
Tensor
&
input
,
const
Tensor
&
filter
,
Tensor
*
output
,
Tensor
*
sum
,
Tensor
*
sum_of_squares
)
{
auto
cudnn_handle
=
ctx
.
cudnn_handle
();
auto
place
=
ctx
.
GetPlace
();
CudnnFusionOp
*
fwd_op
=
GetForwardOp
(
ctx
);
size_t
workspace_size
=
RoundUp
(
...
...
@@ -132,12 +176,17 @@ class CudnnNormConvolution {
// Set variant_param
// input ptr
T
*
input_ptr
=
const_cast
<
T
*>
(
input
.
data
<
T
>
());
T
*
filter_ptr
=
const_cast
<
T
*>
(
filter
.
data
<
T
>
());
fwd_op
->
SetOpVariantParamAttrPtr
(
CUDNN_PTR_XDATA
,
input_ptr
);
fwd_op
->
SetOpVariantParamAttrPtr
(
CUDNN_PTR_WDATA
,
filter_ptr
);
fwd_op
->
SetOpVariantParamAttrPtr
(
CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES
,
&
workspace_size
);
// output ptr
T
*
output_ptr
=
output
->
mutable_data
<
T
>
(
place
);
float
*
sum_ptr
=
sum
->
mutable_data
<
float
>
(
place
);
float
*
sum_of_squares_ptr
=
sum_of_squares
->
mutable_data
<
float
>
(
place
);
fwd_op
->
SetOpVariantParamAttrPtr
(
CUDNN_PTR_YDATA
,
output_ptr
);
fwd_op
->
SetOpVariantParamAttrPtr
(
CUDNN_PTR_YSUM
,
sum_ptr
);
fwd_op
->
SetOpVariantParamAttrPtr
(
CUDNN_PTR_YSQSUM
,
sum_of_squares_ptr
);
...
...
@@ -209,28 +258,34 @@ class CudnnNormConvolutionGrad {
const
std
::
vector
<
int
>
&
output_shape
,
const
int
&
padding
,
const
int
&
stride
,
const
int
&
dilation
,
const
int
&
group
)
{
args_
.
Set
(
input_shape
,
filter_shape
,
output_shape
,
padding
,
stride
,
args_
.
Set
(
ctx
,
input_shape
,
filter_shape
,
output_shape
,
padding
,
stride
,
dilation
,
group
);
dgrad_algo_
=
CUDNN_CONVOLUTION_BWD_DATA_ALGO_1
;
}
~
CudnnNormConvolutionGrad
()
{}
void
Backward
(
const
platform
::
CUDADeviceContext
&
ctx
,
T
*
input_ptr
,
T
*
output_grad_ptr
,
T
*
filter_ptr
,
T
*
input_grad_ptr
,
T
*
filter_grad_ptr
,
bool
use_addto
=
false
)
{
if
(
filter_grad_ptr
)
{
BackwardFilter
(
ctx
,
input_ptr
,
output_grad_ptr
,
filter_ptr
,
filter_grad_ptr
);
void
Backward
(
const
platform
::
CUDADeviceContext
&
ctx
,
const
Tensor
&
input
,
const
Tensor
&
filter
,
const
Tensor
&
output_grad
,
Tensor
*
input_grad
,
Tensor
*
filter_grad
,
bool
use_addto
=
false
)
{
auto
place
=
ctx
.
GetPlace
();
T
*
input_ptr
=
const_cast
<
T
*>
(
input
.
data
<
T
>
());
T
*
filter_ptr
=
const_cast
<
T
*>
(
filter
.
data
<
T
>
());
T
*
output_grad_ptr
=
const_cast
<
T
*>
(
output_grad
.
data
<
T
>
());
if
(
filter_grad
)
{
T
*
filter_grad_ptr
=
filter_grad
->
mutable_data
<
T
>
(
place
);
BackwardFilter
(
ctx
,
output_grad_ptr
,
input_ptr
,
filter_grad_ptr
);
}
if
(
input_grad
_ptr
)
{
BackwardData
(
ctx
,
input_ptr
,
output_grad_ptr
,
filter_ptr
,
input_grad_ptr
,
use_addto
);
if
(
input_grad
)
{
T
*
input_grad_ptr
=
input_grad
->
mutable_data
<
T
>
(
place
);
BackwardData
(
ctx
,
output_grad_ptr
,
filter_ptr
,
input_grad_ptr
,
use_addto
);
}
}
private:
void
BackwardFilter
(
const
platform
::
CUDADeviceContext
&
ctx
,
T
*
input_ptr
,
T
*
output_grad_ptr
,
T
*
filter
_ptr
,
T
*
filter_grad_ptr
)
{
void
BackwardFilter
(
const
platform
::
CUDADeviceContext
&
ctx
,
T
*
output_grad_ptr
,
T
*
input
_ptr
,
T
*
filter_grad_ptr
)
{
auto
cudnn_handle
=
ctx
.
cudnn_handle
();
CudnnFusionOp
*
wgrad_op
=
GetBackwardFilterOp
(
ctx
);
...
...
@@ -255,9 +310,8 @@ class CudnnNormConvolutionGrad {
workspace_size
);
}
void
BackwardData
(
const
platform
::
CUDADeviceContext
&
ctx
,
T
*
input_ptr
,
T
*
output_grad_ptr
,
T
*
filter_ptr
,
T
*
input_grad_ptr
,
bool
use_addto
=
false
)
{
void
BackwardData
(
const
platform
::
CUDADeviceContext
&
ctx
,
T
*
output_grad_ptr
,
T
*
filter_ptr
,
T
*
input_grad_ptr
,
bool
use_addto
=
false
)
{
auto
cudnn_handle
=
ctx
.
cudnn_handle
();
size_t
workspace_size
=
GetWorkspaceSizeBwdData
(
ctx
);
...
...
paddle/fluid/operators/fused/cudnn_norm_conv_test.cc
浏览文件 @
3e2dec5b
...
...
@@ -229,15 +229,6 @@ class CudnnNormConvolutionTester {
platform
::
DeviceContextPool
::
Instance
().
Get
(
platform
::
CUDAPlace
(
0
)));
if
(
!
Support
(
*
ctx
))
{
LOG
(
INFO
)
<<
"Current test is only supported in the platforms with "
<<
"compatiblity greater than or equal to 70 and the kernel size "
<<
"must be equal to 1 or 3. Besides, when the kernel size is 1, "
<<
"the stride must be 1 if the compatiblity is equal to 70."
;
return
;
}
framework
::
Tensor
cpu_output_base
;
framework
::
Tensor
cpu_sum_base
;
framework
::
Tensor
cpu_sum_of_square_base
;
...
...
@@ -325,14 +316,10 @@ class CudnnNormConvolutionTester {
TensorCopySync
(
cpu_input_
,
place
,
&
input
);
TensorCopySync
(
cpu_filter_nhwc_
,
place
,
&
filter_nhwc
);
T
*
input_ptr
=
input
.
data
<
T
>
();
T
*
filter_ptr
=
filter_nhwc
.
data
<
T
>
();
T
*
output_ptr
=
output
.
mutable_data
<
T
>
(
{
batch_size_
,
out_height_
,
out_width_
,
output_channels_
},
place
);
float
*
sum_ptr
=
sum
.
mutable_data
<
float
>
({
1
,
1
,
1
,
output_channels_
},
place
);
float
*
sum_of_square_ptr
=
sum_of_square
.
mutable_data
<
float
>
({
1
,
1
,
1
,
output_channels_
},
place
);
output
.
Resize
(
framework
::
make_ddim
(
{
batch_size_
,
out_height_
,
out_width_
,
output_channels_
}));
sum
.
Resize
(
framework
::
make_ddim
({
1
,
1
,
1
,
output_channels_
}));
sum_of_square
.
Resize
(
framework
::
make_ddim
({
1
,
1
,
1
,
output_channels_
}));
auto
input_shape
=
framework
::
vectorize
<
int
>
(
input
.
dims
());
auto
filter_shape
=
framework
::
vectorize
<
int
>
(
filter_nhwc
.
dims
());
...
...
@@ -340,8 +327,7 @@ class CudnnNormConvolutionTester {
op
::
CudnnNormConvolution
<
T
>
conv_op
(
ctx
,
input_shape
,
filter_shape
,
output_shape
,
padding_
,
stride_
,
dilation_
,
group_
);
conv_op
.
Forward
(
ctx
,
input_ptr
,
filter_ptr
,
output_ptr
,
sum_ptr
,
sum_of_square_ptr
);
conv_op
.
Forward
(
ctx
,
input
,
filter_nhwc
,
&
output
,
&
sum
,
&
sum_of_square
);
TensorCopySync
(
output
,
platform
::
CPUPlace
(),
cpu_output
);
TensorCopySync
(
sum
,
platform
::
CPUPlace
(),
cpu_sum
);
...
...
@@ -362,11 +348,8 @@ class CudnnNormConvolutionTester {
TensorCopySync
(
cpu_filter_nhwc_
,
place
,
&
filter_nhwc
);
TensorCopySync
(
cpu_output_grad_
,
place
,
&
output_grad
);
T
*
input_ptr
=
input
.
data
<
T
>
();
T
*
filter_ptr
=
filter_nhwc
.
data
<
T
>
();
T
*
output_grad_ptr
=
output_grad
.
data
<
T
>
();
T
*
input_grad_ptr
=
input_grad
.
mutable_data
<
T
>
(
input
.
dims
(),
place
);
T
*
filter_grad_ptr
=
filter_grad
.
mutable_data
<
T
>
(
filter_nhwc
.
dims
(),
place
);
input_grad
.
Resize
(
input
.
dims
());
filter_grad
.
Resize
(
filter_nhwc
.
dims
());
auto
input_shape
=
framework
::
vectorize
<
int
>
(
input
.
dims
());
auto
filter_shape
=
framework
::
vectorize
<
int
>
(
filter_nhwc
.
dims
());
...
...
@@ -374,26 +357,13 @@ class CudnnNormConvolutionTester {
op
::
CudnnNormConvolutionGrad
<
T
>
conv_grad_op
(
ctx
,
input_shape
,
filter_shape
,
output_shape
,
padding_
,
stride_
,
dilation_
,
group_
);
conv_grad_op
.
Backward
(
ctx
,
input
_ptr
,
output_grad_ptr
,
filter_ptr
,
input_grad_ptr
,
filter_grad_ptr
);
conv_grad_op
.
Backward
(
ctx
,
input
,
filter_nhwc
,
output_grad
,
&
input_grad
,
&
filter_grad
);
TensorCopySync
(
input_grad
,
platform
::
CPUPlace
(),
cpu_input_grad
);
TensorCopySync
(
filter_grad
,
platform
::
CPUPlace
(),
cpu_filter_grad
);
}
bool
Support
(
const
platform
::
CUDADeviceContext
&
ctx
)
{
if
(
ctx
.
GetComputeCapability
()
==
70
)
{
if
((
kernel_size_
==
3
)
||
((
kernel_size_
==
1
)
&&
(
stride_
==
1
)))
{
return
true
;
}
}
else
if
(
ctx
.
GetComputeCapability
()
>
70
)
{
if
((
kernel_size_
==
3
)
||
(
kernel_size_
==
1
))
{
return
true
;
}
}
return
false
;
}
private:
int
batch_size_
;
int
height_
;
...
...
@@ -477,6 +447,15 @@ TEST(CudnnNormConvFp16, K1S2O4) {
CudnnNormConvolutionTester
<
paddle
::
platform
::
float16
>
test
(
batch_size
,
height
,
width
,
input_channels
,
output_channels
,
kernel_size
,
stride
);
test
.
CheckForward
(
1e-3
,
true
);
test
.
CheckBackward
(
1e-3
);
platform
::
CUDADeviceContext
*
ctx
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
platform
::
CUDAPlace
(
0
)));
if
(
ctx
->
GetComputeCapability
()
<=
70
)
{
ASSERT_THROW
(
test
.
CheckForward
(
1e-3
,
true
),
paddle
::
platform
::
EnforceNotMet
);
ASSERT_THROW
(
test
.
CheckBackward
(
1e-3
),
paddle
::
platform
::
EnforceNotMet
);
}
else
{
ASSERT_NO_THROW
(
test
.
CheckForward
(
1e-3
,
true
));
ASSERT_NO_THROW
(
test
.
CheckBackward
(
1e-3
));
}
}
paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h
浏览文件 @
3e2dec5b
...
...
@@ -107,25 +107,33 @@ class CudnnScaleBiasAddRelu {
~
CudnnScaleBiasAddRelu
()
{}
void
Forward
(
const
platform
::
CUDADeviceContext
&
ctx
,
T
*
x_ptr
,
T
*
x_scale_ptr
,
T
*
x_bias_ptr
,
T
*
out_ptr
,
int32_t
*
bitmask_ptr
,
T
*
z_ptr
=
nullptr
,
T
*
z_scale_ptr
=
nullptr
,
T
*
z_bias_ptr
=
nullptr
)
{
void
Forward
(
const
platform
::
CUDADeviceContext
&
ctx
,
const
Tensor
&
x
,
const
Tensor
&
x_scale
,
const
Tensor
&
x_bias
,
const
Tensor
&
z
,
const
Tensor
&
z_scale
,
const
Tensor
&
z_bias
,
Tensor
*
out
,
T
ensor
*
bitmask
)
{
ForwardInit
(
ctx
);
auto
handle
=
ctx
.
cudnn_handle
();
auto
place
=
ctx
.
GetPlace
();
auto
workspace_handle
=
ctx
.
cudnn_workspace_handle
();
fwd_workspace_byte_
=
fwd_op_
.
GetWorkspaceSizeInBytes
(
handle
);
// Set variant_param
// input ptr
T
*
x_ptr
=
const_cast
<
T
*>
(
x
.
data
<
T
>
());
T
*
x_scale_ptr
=
const_cast
<
T
*>
(
x_scale
.
data
<
T
>
());
T
*
x_bias_ptr
=
const_cast
<
T
*>
(
x_bias
.
data
<
T
>
());
fwd_op_
.
SetOpVariantParamAttrPtr
(
CUDNN_PTR_XDATA
,
x_ptr
);
fwd_op_
.
SetOpVariantParamAttrPtr
(
CUDNN_PTR_BN_EQSCALE
,
x_scale_ptr
);
fwd_op_
.
SetOpVariantParamAttrPtr
(
CUDNN_PTR_BN_EQBIAS
,
x_bias_ptr
);
if
(
has_shortcut_
)
{
T
*
z_ptr
=
const_cast
<
T
*>
(
z
.
data
<
T
>
());
T
*
z_scale_ptr
=
const_cast
<
T
*>
(
z_scale
.
data
<
T
>
());
T
*
z_bias_ptr
=
const_cast
<
T
*>
(
z_bias
.
data
<
T
>
());
fwd_op_
.
SetOpVariantParamAttrPtr
(
CUDNN_PTR_ZDATA
,
z_ptr
);
fwd_op_
.
SetOpVariantParamAttrPtr
(
CUDNN_PTR_BN_Z_EQSCALE
,
z_scale_ptr
);
fwd_op_
.
SetOpVariantParamAttrPtr
(
CUDNN_PTR_BN_Z_EQBIAS
,
z_bias_ptr
);
}
else
{
if
(
fused_add_
)
{
T
*
z_ptr
=
const_cast
<
T
*>
(
z
.
data
<
T
>
());
fwd_op_
.
SetOpVariantParamAttrPtr
(
CUDNN_PTR_ZDATA
,
z_ptr
);
}
}
...
...
@@ -134,6 +142,8 @@ class CudnnScaleBiasAddRelu {
CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES
,
&
fwd_workspace_byte_
);
// output ptr
T
*
out_ptr
=
out
->
mutable_data
<
T
>
(
place
);
int32_t
*
bitmask_ptr
=
bitmask
->
mutable_data
<
int32_t
>
(
place
);
fwd_op_
.
SetOpVariantParamAttrPtr
(
CUDNN_PTR_YDATA
,
out_ptr
);
fwd_op_
.
SetOpVariantParamAttrPtr
(
CUDNN_PTR_ACTIVATION_BITMASK
,
bitmask_ptr
);
...
...
@@ -147,16 +157,30 @@ class CudnnScaleBiasAddRelu {
fwd_workspace_byte_
);
}
void
Backward
(
const
platform
::
CUDADeviceContext
&
ctx
,
T
*
dy_ptr
,
T
*
x_ptr
,
float
*
scale_ptr
,
float
*
bias_ptr
,
float
*
saved_mean_ptr
,
float
*
saved_invstd_ptr
,
int32_t
*
bitmask_ptr
,
T
*
dx_ptr
,
T
*
dz_ptr
,
float
*
dscale_ptr
,
float
*
dbias_ptr
,
double
eps
)
{
void
Backward
(
const
platform
::
CUDADeviceContext
&
ctx
,
const
Tensor
&
dy
,
const
Tensor
&
x
,
const
Tensor
&
scale
,
const
Tensor
&
bias
,
const
Tensor
&
saved_mean
,
const
Tensor
&
saved_invstd
,
const
Tensor
&
bitmask
,
Tensor
*
dx
,
Tensor
*
dz
,
Tensor
*
dscale
,
Tensor
*
dbias
,
double
eps
)
{
BackwardInit
(
ctx
);
auto
handle
=
ctx
.
cudnn_handle
();
auto
place
=
ctx
.
GetPlace
();
auto
workspace_handle
=
ctx
.
cudnn_workspace_handle
();
bwd_workspace_byte_
=
bwd_op_
.
GetWorkspaceSizeInBytes
(
handle
);
// Set variant_param
// input ptr
T
*
dy_ptr
=
const_cast
<
T
*>
(
dy
.
data
<
T
>
());
T
*
x_ptr
=
const_cast
<
T
*>
(
x
.
data
<
T
>
());
float
*
scale_ptr
=
const_cast
<
float
*>
(
scale
.
data
<
float
>
());
float
*
bias_ptr
=
const_cast
<
float
*>
(
bias
.
data
<
float
>
());
float
*
saved_mean_ptr
=
const_cast
<
float
*>
(
saved_mean
.
data
<
float
>
());
float
*
saved_invstd_ptr
=
const_cast
<
float
*>
(
saved_invstd
.
data
<
float
>
());
int32_t
*
bitmask_ptr
=
const_cast
<
int32_t
*>
(
bitmask
.
data
<
int32_t
>
());
T
*
dx_ptr
=
dx
->
mutable_data
<
T
>
(
place
);
T
*
dz_ptr
=
dz
?
dz
->
mutable_data
<
T
>
(
place
)
:
nullptr
;
float
*
dscale_ptr
=
dscale
?
dscale
->
mutable_data
<
float
>
(
place
)
:
nullptr
;
float
*
dbias_ptr
=
dbias
?
dbias
->
mutable_data
<
float
>
(
place
)
:
nullptr
;
bwd_op_
.
SetOpVariantParamAttrPtr
(
CUDNN_PTR_XDATA
,
x_ptr
);
bwd_op_
.
SetOpVariantParamAttrPtr
(
CUDNN_PTR_DYDATA
,
dy_ptr
);
bwd_op_
.
SetOpVariantParamAttrPtr
(
CUDNN_PTR_BN_SCALE
,
scale_ptr
);
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录