Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
a679fcbb
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
a679fcbb
编写于
10月 11, 2021
作者:
Z
Zhang Zheng
提交者:
GitHub
10月 11, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Add more tests and fix bugs for cudnn_norm_conv_test and cudnn_bn_and_relu_test (#36314)
上级
830debc2
变更
2
显示空白变更内容
内联
并排
Showing
2 changed file
with
599 addition
and
122 deletion
+599
-122
paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc
paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc
+542
-108
paddle/fluid/operators/fused/cudnn_norm_conv_test.cc
paddle/fluid/operators/fused/cudnn_norm_conv_test.cc
+57
-14
未找到文件。
paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc
浏览文件 @
a679fcbb
...
@@ -33,6 +33,8 @@ namespace op = paddle::operators;
...
@@ -33,6 +33,8 @@ namespace op = paddle::operators;
using
Tensor
=
paddle
::
framework
::
Tensor
;
using
Tensor
=
paddle
::
framework
::
Tensor
;
USE_OP
(
batch_norm
);
USE_OP
(
batch_norm
);
USE_CUDA_ONLY_OP
(
fused_bn_add_activation
);
USE_CUDA_ONLY_OP
(
fused_bn_add_activation_grad
);
template
<
typename
T
>
template
<
typename
T
>
void
InitRandomTensor
(
const
std
::
vector
<
int64_t
>
&
dims
,
void
InitRandomTensor
(
const
std
::
vector
<
int64_t
>
&
dims
,
...
@@ -40,7 +42,7 @@ void InitRandomTensor(const std::vector<int64_t> &dims,
...
@@ -40,7 +42,7 @@ void InitRandomTensor(const std::vector<int64_t> &dims,
T
*
cpu_out_ptr
=
cpu_out
->
mutable_data
<
T
>
(
framework
::
make_ddim
(
dims
),
T
*
cpu_out_ptr
=
cpu_out
->
mutable_data
<
T
>
(
framework
::
make_ddim
(
dims
),
platform
::
CPUPlace
());
platform
::
CPUPlace
());
std
::
default_random_engine
random
(
0
);
std
::
default_random_engine
random
(
0
);
std
::
uniform_real_distribution
<
float
>
dis
(
0
.0
,
1.0
);
std
::
uniform_real_distribution
<
float
>
dis
(
-
1
.0
,
1.0
);
for
(
int
i
=
0
;
i
<
cpu_out
->
numel
();
++
i
)
{
for
(
int
i
=
0
;
i
<
cpu_out
->
numel
();
++
i
)
{
cpu_out_ptr
[
i
]
=
static_cast
<
T
>
(
dis
(
random
));
cpu_out_ptr
[
i
]
=
static_cast
<
T
>
(
dis
(
random
));
}
}
...
@@ -89,7 +91,7 @@ void CheckOutput(std::string name, const framework::Tensor &cpu_res,
...
@@ -89,7 +91,7 @@ void CheckOutput(std::string name, const framework::Tensor &cpu_res,
}
}
}
}
std
::
string
error_type
=
is_relative_atol
?
"relative"
:
"absolute"
;
std
::
string
error_type
=
is_relative_atol
?
"relative"
:
"absolute"
;
LOG
(
INFO
)
<<
"["
<<
name
<<
"]
,
The dims is ["
<<
cpu_res
.
dims
()
LOG
(
INFO
)
<<
"["
<<
name
<<
"] The dims is ["
<<
cpu_res
.
dims
()
<<
"], maximum "
<<
error_type
<<
" error is "
<<
max_diff
<<
": "
<<
"], maximum "
<<
error_type
<<
" error is "
<<
max_diff
<<
": "
<<
cpu_res_ptr
[
index
]
<<
" vs "
<<
cpu_base_ptr
[
index
];
<<
cpu_res_ptr
[
index
]
<<
" vs "
<<
cpu_base_ptr
[
index
];
}
}
...
@@ -121,13 +123,33 @@ void ComputeSumAndSquareSum(const framework::Tensor &cpu_x,
...
@@ -121,13 +123,33 @@ void ComputeSumAndSquareSum(const framework::Tensor &cpu_x,
}
}
}
}
// get paddle batchnorm op results as baseline
template
<
typename
T
>
void
ComputeInplaceAdd
(
const
framework
::
Tensor
&
cpu_x
,
framework
::
Tensor
*
cpu_y
)
{
EXPECT_EQ
(
cpu_x
.
dims
(),
cpu_y
->
dims
());
const
T
*
cpu_x_ptr
=
cpu_x
.
data
<
T
>
();
T
*
cpu_y_ptr
=
cpu_y
->
data
<
T
>
();
for
(
int64_t
i
=
0
;
i
<
cpu_x
.
numel
();
++
i
)
{
cpu_y_ptr
[
i
]
+=
cpu_x_ptr
[
i
];
}
}
template
<
typename
T
>
void
ComputeInplaceRelu
(
framework
::
Tensor
*
cpu_x
)
{
T
*
cpu_x_ptr
=
cpu_x
->
data
<
T
>
();
for
(
int64_t
i
=
0
;
i
<
cpu_x
->
numel
();
++
i
)
{
cpu_x_ptr
[
i
]
=
cpu_x_ptr
[
i
]
>
static_cast
<
T
>
(
0
)
?
cpu_x_ptr
[
i
]
:
static_cast
<
T
>
(
0
);
}
}
void
ComputeBatchNormForward
(
const
platform
::
CUDADeviceContext
&
ctx
,
void
ComputeBatchNormForward
(
const
platform
::
CUDADeviceContext
&
ctx
,
const
Tensor
&
cpu_x
,
const
Tensor
&
cpu_scale
,
const
Tensor
&
cpu_x
,
const
Tensor
&
cpu_scale
,
const
Tensor
&
cpu_bias
,
Tensor
*
cpu_mean
,
const
Tensor
&
cpu_bias
,
Tensor
*
cpu_mean
,
Tensor
*
cpu_var
,
Tensor
*
cpu_saved_mean
,
Tensor
*
cpu_var
,
Tensor
*
cpu_saved_mean
,
Tensor
*
cpu_saved_var
,
Tensor
*
cpu_y
,
Tensor
*
cpu_saved_var
,
Tensor
*
cpu_y
,
Tensor
*
cpu
_reserve_space
)
{
Tensor
*
saved
_reserve_space
)
{
framework
::
Scope
scope
;
framework
::
Scope
scope
;
auto
*
x
=
scope
.
Var
(
"X"
)
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
*
x
=
scope
.
Var
(
"X"
)
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
*
scale
=
scope
.
Var
(
"Scale"
)
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
*
scale
=
scope
.
Var
(
"Scale"
)
->
GetMutable
<
framework
::
LoDTensor
>
();
...
@@ -178,68 +200,258 @@ void ComputeBatchNormForward(const platform::CUDADeviceContext &ctx,
...
@@ -178,68 +200,258 @@ void ComputeBatchNormForward(const platform::CUDADeviceContext &ctx,
TensorCopySync
(
*
var
,
platform
::
CPUPlace
(),
cpu_var
);
TensorCopySync
(
*
var
,
platform
::
CPUPlace
(),
cpu_var
);
TensorCopySync
(
*
saved_mean
,
platform
::
CPUPlace
(),
cpu_saved_mean
);
TensorCopySync
(
*
saved_mean
,
platform
::
CPUPlace
(),
cpu_saved_mean
);
TensorCopySync
(
*
saved_var
,
platform
::
CPUPlace
(),
cpu_saved_var
);
TensorCopySync
(
*
saved_var
,
platform
::
CPUPlace
(),
cpu_saved_var
);
TensorCopySync
(
*
reserve_space
,
platform
::
CPUPlace
(),
cpu_reserve_space
);
// reserved_space will stay on GPU and used in grad op.
saved_reserve_space
->
ShareDataWith
(
*
reserve_space
);
}
void
ComputeFusedBNAddReluForward
(
const
platform
::
CUDADeviceContext
&
ctx
,
const
Tensor
&
cpu_x
,
const
Tensor
&
cpu_z
,
const
Tensor
&
cpu_scale
,
const
Tensor
&
cpu_bias
,
Tensor
*
cpu_mean
,
Tensor
*
cpu_var
,
Tensor
*
cpu_saved_mean
,
Tensor
*
cpu_saved_var
,
Tensor
*
cpu_y
,
Tensor
*
saved_reserve_space
)
{
framework
::
Scope
scope
;
auto
*
x
=
scope
.
Var
(
"X"
)
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
*
z
=
scope
.
Var
(
"Z"
)
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
*
scale
=
scope
.
Var
(
"Scale"
)
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
*
bias
=
scope
.
Var
(
"Bias"
)
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
*
mean
=
scope
.
Var
(
"Mean"
)
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
*
var
=
scope
.
Var
(
"Variance"
)
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
*
y
=
scope
.
Var
(
"Y"
)
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
*
saved_mean
=
scope
.
Var
(
"SavedMean"
)
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
*
saved_var
=
scope
.
Var
(
"SavedVariance"
)
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
*
reserve_space
=
scope
.
Var
(
"ReserveSpace"
)
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
place
=
ctx
.
GetPlace
();
TensorCopySync
(
cpu_x
,
place
,
x
);
TensorCopySync
(
cpu_z
,
place
,
z
);
TensorCopySync
(
cpu_scale
,
place
,
scale
);
TensorCopySync
(
cpu_bias
,
place
,
bias
);
TensorCopySync
(
*
cpu_mean
,
place
,
mean
);
TensorCopySync
(
*
cpu_var
,
place
,
var
);
int64_t
channels
=
x
->
dims
()[
3
];
scale
->
Resize
({
channels
});
bias
->
Resize
({
channels
});
mean
->
Resize
({
channels
});
var
->
Resize
({
channels
});
framework
::
AttributeMap
attrs
;
auto
op
=
framework
::
OpRegistry
::
CreateOp
(
"fused_bn_add_activation"
,
{{
"X"
,
{
"X"
}},
{
"Z"
,
{
"Z"
}},
{
"Scale"
,
{
"Scale"
}},
{
"Bias"
,
{
"Bias"
}}},
{{
"Y"
,
{
"Y"
}},
{
"MeanOut"
,
{
"Mean"
}},
{
"VarianceOut"
,
{
"Variance"
}},
{
"SavedMean"
,
{
"SavedMean"
}},
{
"SavedVariance"
,
{
"SavedVariance"
}},
{
"ReserveSpace"
,
{
"ReserveSpace"
}}},
attrs
);
op
->
Run
(
scope
,
ctx
.
GetPlace
());
TensorCopySync
(
*
y
,
platform
::
CPUPlace
(),
cpu_y
);
TensorCopySync
(
*
mean
,
platform
::
CPUPlace
(),
cpu_mean
);
TensorCopySync
(
*
var
,
platform
::
CPUPlace
(),
cpu_var
);
TensorCopySync
(
*
saved_mean
,
platform
::
CPUPlace
(),
cpu_saved_mean
);
TensorCopySync
(
*
saved_var
,
platform
::
CPUPlace
(),
cpu_saved_var
);
// reserved_space will stay on GPU and used in grad op.
saved_reserve_space
->
ShareDataWith
(
*
reserve_space
);
}
void
ComputeFusedBNAddReluBackward
(
const
platform
::
CUDADeviceContext
&
ctx
,
const
Tensor
&
cpu_dy
,
const
Tensor
&
cpu_x
,
const
Tensor
&
cpu_scale
,
const
Tensor
&
cpu_bias
,
const
Tensor
&
cpu_saved_mean
,
const
Tensor
&
cpu_saved_var
,
const
Tensor
&
cpu_y
,
const
Tensor
&
saved_reserve_space
,
Tensor
*
cpu_dx
,
Tensor
*
cpu_dz
,
Tensor
*
cpu_dscale
,
Tensor
*
cpu_dbias
)
{
framework
::
Scope
scope
;
auto
*
x
=
scope
.
Var
(
"X"
)
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
*
y
=
scope
.
Var
(
"Y"
)
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
*
dy
=
scope
.
Var
(
"Y@GRAD"
)
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
*
scale
=
scope
.
Var
(
"Scale"
)
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
*
bias
=
scope
.
Var
(
"Bias"
)
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
*
saved_mean
=
scope
.
Var
(
"SavedMean"
)
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
*
saved_var
=
scope
.
Var
(
"SavedVariance"
)
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
*
reserve_space
=
scope
.
Var
(
"ReserveSpace"
)
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
*
dx
=
scope
.
Var
(
"X@GRAD"
)
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
*
dz
=
scope
.
Var
(
"Z@GRAD"
)
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
*
dscale
=
scope
.
Var
(
"Scale@GRAD"
)
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
*
dbias
=
scope
.
Var
(
"Bias@GRAD"
)
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
place
=
ctx
.
GetPlace
();
TensorCopySync
(
cpu_x
,
place
,
x
);
TensorCopySync
(
cpu_y
,
place
,
y
);
TensorCopySync
(
cpu_dy
,
place
,
dy
);
TensorCopySync
(
cpu_scale
,
place
,
scale
);
TensorCopySync
(
cpu_bias
,
place
,
bias
);
TensorCopySync
(
cpu_saved_mean
,
place
,
saved_mean
);
TensorCopySync
(
cpu_saved_var
,
place
,
saved_var
);
reserve_space
->
ShareDataWith
(
saved_reserve_space
);
int64_t
channels
=
x
->
dims
()[
3
];
scale
->
Resize
({
channels
});
bias
->
Resize
({
channels
});
saved_mean
->
Resize
({
channels
});
saved_var
->
Resize
({
channels
});
framework
::
AttributeMap
attrs
;
float
momentum
=
0.9
;
float
epsilon
=
1e-5
;
std
::
string
act_type
=
"relu"
;
attrs
.
insert
({
"momentum"
,
momentum
});
attrs
.
insert
({
"epsilon"
,
epsilon
});
attrs
.
insert
({
"act_type"
,
act_type
});
auto
op
=
framework
::
OpRegistry
::
CreateOp
(
"fused_bn_add_activation_grad"
,
{{
"X"
,
{
"X"
}},
{
"Y"
,
{
"Y"
}},
{
"Y@GRAD"
,
{
"Y@GRAD"
}},
{
"Scale"
,
{
"Scale"
}},
{
"Bias"
,
{
"Bias"
}},
{
"SavedMean"
,
{
"SavedMean"
}},
{
"SavedVariance"
,
{
"SavedVariance"
}},
{
"ReserveSpace"
,
{
"ReserveSpace"
}}},
{{
"X@GRAD"
,
{
"X@GRAD"
}},
{
"Z@GRAD"
,
{
"Z@GRAD"
}},
{
"Scale@GRAD"
,
{
"Scale@GRAD"
}},
{
"Bias@GRAD"
,
{
"Bias@GRAD"
}}},
attrs
);
op
->
Run
(
scope
,
ctx
.
GetPlace
());
TensorCopySync
(
*
dx
,
platform
::
CPUPlace
(),
cpu_dx
);
TensorCopySync
(
*
dz
,
platform
::
CPUPlace
(),
cpu_dz
);
TensorCopySync
(
*
dscale
,
platform
::
CPUPlace
(),
cpu_dscale
);
TensorCopySync
(
*
dbias
,
platform
::
CPUPlace
(),
cpu_dbias
);
}
}
template
<
typename
T
>
template
<
typename
T
>
class
CudnnBNAddReluTester
{
class
CudnnBNAddReluTester
{
public:
public:
CudnnBNAddReluTester
(
int
batch_size
,
int
height
,
int
width
,
int
channels
)
{
CudnnBNAddReluTester
(
int
batch_size
,
int
height
,
int
width
,
int
channels
,
std
::
string
act_type
,
bool
fuse_add
,
bool
has_shortcut
)
{
batch_size_
=
batch_size
;
batch_size_
=
batch_size
;
height_
=
height
;
height_
=
height
;
width_
=
width
;
width_
=
width
;
channels_
=
channels
;
channels_
=
channels
;
ele_count_
=
batch_size_
*
height_
*
width_
;
ele_count_
=
batch_size_
*
height_
*
width_
;
act_type_
=
act_type
;
fuse_add_
=
fuse_add
;
has_shortcut_
=
has_shortcut
;
SetUp
();
SetUp
();
}
}
~
CudnnBNAddReluTester
()
{}
~
CudnnBNAddReluTester
()
{}
void
CheckForward
(
float
diff
,
bool
is_relative_atol
=
false
)
{
void
CheckForward
(
float
diff
,
bool
is_relative_atol
=
false
)
{
LOG
(
INFO
)
<<
"[CheckForward, diff="
<<
diff
<<
", is_relative_atol="
<<
is_relative_atol
<<
"] act_type="
<<
act_type_
<<
", fuse_add="
<<
fuse_add_
<<
", has_shortcut="
<<
has_shortcut_
;
platform
::
CUDADeviceContext
*
ctx
=
platform
::
CUDADeviceContext
*
ctx
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
static_cast
<
platform
::
CUDADeviceContext
*>
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
platform
::
CUDAPlace
(
0
)));
platform
::
CUDAPlace
(
0
)));
framework
::
Tensor
cpu_mean_base
;
auto
select
=
[
&
](
Tensor
*
in
)
{
return
has_shortcut_
?
in
:
nullptr
;
};
framework
::
Tensor
cpu_var_base
;
framework
::
Tensor
cpu_saved_mean_base
;
framework
::
Tensor
cpu_mean_base_x
;
framework
::
Tensor
cpu_saved_var_base
;
framework
::
Tensor
cpu_var_base_x
;
framework
::
Tensor
cpu_y_base
;
framework
::
Tensor
cpu_mean_base_z
;
framework
::
Tensor
cpu_reserve_space_base
;
framework
::
Tensor
cpu_var_base_z
;
BaselineForward
(
*
ctx
,
&
cpu_mean_base
,
&
cpu_var_base
,
&
cpu_saved_mean_base
,
if
(
!
has_shortcut_
&&
fuse_add_
&&
(
act_type_
==
"relu"
))
{
&
cpu_saved_var_base
,
&
cpu_y_base
,
&
cpu_reserve_space_base
);
BaselineForwardFusedBNAddRelu
(
*
ctx
,
&
cpu_mean_base_x
,
&
cpu_var_base_x
,
&
cpu_saved_mean_base_x_
,
framework
::
Tensor
cpu_mean
;
&
cpu_saved_var_base_x_
,
&
cpu_y_base_
,
&
saved_reserve_space_x_
);
framework
::
Tensor
cpu_var
;
}
else
{
framework
::
Tensor
cpu_saved_mean
;
BaselineForward
(
framework
::
Tensor
cpu_saved_var
;
*
ctx
,
&
cpu_mean_base_x
,
&
cpu_var_base_x
,
&
cpu_saved_mean_base_x_
,
framework
::
Tensor
cpu_y
;
&
cpu_saved_var_base_x_
,
&
cpu_y_base_
,
&
saved_reserve_space_x_
,
framework
::
Tensor
cpu_bitmask
;
select
(
&
cpu_mean_base_z
),
select
(
&
cpu_var_base_z
),
FusedForward
(
*
ctx
,
&
cpu_mean
,
&
cpu_var
,
&
cpu_saved_mean
,
&
cpu_saved_var
,
select
(
&
cpu_saved_mean_base_z_
),
select
(
&
cpu_saved_var_base_z_
),
&
cpu_y
,
&
cpu_bitmask
);
select
(
&
saved_reserve_space_z_
));
}
CheckOutput
<
float
>
(
"Mean"
,
cpu_mean
,
cpu_mean_base
,
diff
,
is_relative_atol
);
framework
::
Tensor
cpu_mean_x
;
CheckOutput
<
float
>
(
"Variance"
,
cpu_var
,
cpu_var_base
,
diff
,
framework
::
Tensor
cpu_var_x
;
framework
::
Tensor
cpu_y
;
framework
::
Tensor
cpu_mean_z
;
framework
::
Tensor
cpu_var_z
;
FusedForward
(
*
ctx
,
&
cpu_mean_x
,
&
cpu_var_x
,
&
cpu_saved_mean_x_
,
&
cpu_saved_var_x_
,
&
cpu_y
,
&
cpu_bitmask_
,
select
(
&
cpu_mean_z
),
select
(
&
cpu_var_z
),
select
(
&
cpu_saved_mean_z_
),
select
(
&
cpu_saved_var_z_
));
CheckOutput
<
float
>
(
"Mean"
,
cpu_mean_x
,
cpu_mean_base_x
,
diff
,
is_relative_atol
);
CheckOutput
<
float
>
(
"Variance"
,
cpu_var_x
,
cpu_var_base_x
,
diff
,
is_relative_atol
);
CheckOutput
<
float
>
(
"SavedMean"
,
cpu_saved_mean_x_
,
cpu_saved_mean_base_x_
,
diff
,
is_relative_atol
);
CheckOutput
<
float
>
(
"SavedVariance"
,
cpu_saved_var_x_
,
cpu_saved_var_base_x_
,
diff
,
is_relative_atol
);
if
(
has_shortcut_
)
{
CheckOutput
<
float
>
(
"MeanZ"
,
cpu_mean_z
,
cpu_mean_base_z
,
diff
,
is_relative_atol
);
is_relative_atol
);
CheckOutput
<
float
>
(
"SavedMean"
,
cpu_saved_mean
,
cpu_saved_mean_base
,
diff
,
CheckOutput
<
float
>
(
"VarianceZ"
,
cpu_var_z
,
cpu_var_base_z
,
diff
,
is_relative_atol
);
CheckOutput
<
float
>
(
"SavedMeanZ"
,
cpu_saved_mean_z_
,
cpu_saved_mean_base_z_
,
diff
,
is_relative_atol
);
CheckOutput
<
float
>
(
"SavedVarianceZ"
,
cpu_saved_var_z_
,
cpu_saved_var_base_z_
,
diff
,
is_relative_atol
);
}
CheckOutput
<
T
>
(
"Y"
,
cpu_y
,
cpu_y_base_
,
diff
,
is_relative_atol
);
}
void
CheckBackward
(
float
diff
,
bool
is_relative_atol
=
false
)
{
platform
::
CUDADeviceContext
*
ctx
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
platform
::
CUDAPlace
(
0
)));
framework
::
Tensor
cpu_dx_base
;
framework
::
Tensor
cpu_dz_base
;
framework
::
Tensor
cpu_dscale_base
;
framework
::
Tensor
cpu_dbias_base
;
BaselineBackwardFusedBNAddRelu
(
*
ctx
,
&
cpu_dx_base
,
&
cpu_dz_base
,
&
cpu_dscale_base
,
&
cpu_dbias_base
);
framework
::
Tensor
cpu_dx
;
framework
::
Tensor
cpu_dz
;
framework
::
Tensor
cpu_dscale
;
framework
::
Tensor
cpu_dbias
;
FusedBackward
(
*
ctx
,
&
cpu_dx
,
&
cpu_dz
,
&
cpu_dscale
,
&
cpu_dbias
);
CheckOutput
<
T
>
(
"DX"
,
cpu_dx
,
cpu_dx_base
,
diff
,
is_relative_atol
);
CheckOutput
<
T
>
(
"DZ"
,
cpu_dz
,
cpu_dz_base
,
diff
,
is_relative_atol
);
CheckOutput
<
float
>
(
"DScale"
,
cpu_dscale
,
cpu_dscale_base
,
diff
,
is_relative_atol
);
is_relative_atol
);
CheckOutput
<
float
>
(
"
SavedVariance"
,
cpu_saved_var
,
cpu_saved_var
_base
,
diff
,
CheckOutput
<
float
>
(
"
DBias"
,
cpu_dbias
,
cpu_dbias
_base
,
diff
,
is_relative_atol
);
is_relative_atol
);
CheckOutput
<
T
>
(
"Y"
,
cpu_y
,
cpu_y_base
,
diff
,
is_relative_atol
);
}
}
private:
private:
void
SetUp
()
{
void
SetUp
()
{
// Initialize input data
InitRandomTensor
<
T
>
({
batch_size_
,
height_
,
width_
,
channels_
},
&
cpu_x_
);
InitRandomTensor
<
T
>
({
batch_size_
,
height_
,
width_
,
channels_
},
&
cpu_x_
);
ComputeSumAndSquareSum
<
T
>
(
cpu_x_
,
&
cpu_sum_
,
&
cpu_sum_of_square_
);
InitRandomTensor
<
float
>
({
channels_
},
&
cpu_bn_scale_x_
);
InitRandomTensor
<
float
>
({
channels_
},
&
cpu_bn_bias_x_
);
// scale and bias should be initialized randomly.
if
(
has_shortcut_
)
{
InitConstantTensor
<
float
>
({
channels_
},
static_cast
<
float
>
(
1.0
f
),
InitRandomTensor
<
T
>
({
batch_size_
,
height_
,
width_
,
channels_
},
&
cpu_z_
);
&
cpu_bn_scale_
);
InitRandomTensor
<
float
>
({
channels_
},
&
cpu_bn_scale_z_
);
InitConstantTensor
<
float
>
({
channels_
},
static_cast
<
float
>
(
0.0
f
),
InitRandomTensor
<
float
>
({
channels_
},
&
cpu_bn_bias_z_
);
&
cpu_bn_bias_
);
}
else
{
if
(
fuse_add_
)
{
InitRandomTensor
<
T
>
({
batch_size_
,
height_
,
width_
,
channels_
},
&
cpu_z_
);
}
}
InitRandomTensor
<
T
>
({
batch_size_
,
height_
,
width_
,
channels_
},
&
cpu_dy_
);
}
}
void
InitMeanVar
(
Tensor
*
cpu_mean
,
Tensor
*
cpu_var
,
Tensor
*
cpu_saved_mean
,
void
InitMeanVar
(
Tensor
*
cpu_mean
,
Tensor
*
cpu_var
,
Tensor
*
cpu_saved_mean
,
...
@@ -252,71 +464,178 @@ class CudnnBNAddReluTester {
...
@@ -252,71 +464,178 @@ class CudnnBNAddReluTester {
cpu_saved_var
);
cpu_saved_var
);
}
}
void
BaselineForward
(
const
platform
::
CUDADeviceContext
&
ctx
,
Tensor
*
cpu_mean
,
void
BaselineForward
(
const
platform
::
CUDADeviceContext
&
ctx
,
Tensor
*
cpu_var
,
Tensor
*
cpu_saved_mean
,
Tensor
*
cpu_mean_x
,
Tensor
*
cpu_var_x
,
Tensor
*
cpu_saved_mean_x
,
Tensor
*
cpu_saved_var_x
,
Tensor
*
cpu_y
,
Tensor
*
saved_reserve_space_x
,
Tensor
*
cpu_mean_z
=
nullptr
,
Tensor
*
cpu_var_z
=
nullptr
,
Tensor
*
cpu_saved_mean_z
=
nullptr
,
Tensor
*
cpu_saved_var_z
=
nullptr
,
Tensor
*
saved_reserve_space_z
=
nullptr
)
{
InitMeanVar
(
cpu_mean_x
,
cpu_var_x
,
cpu_saved_mean_x
,
cpu_saved_var_x
);
ComputeBatchNormForward
(
ctx
,
cpu_x_
,
cpu_bn_scale_x_
,
cpu_bn_bias_x_
,
cpu_mean_x
,
cpu_var_x
,
cpu_saved_mean_x
,
cpu_saved_var_x
,
cpu_y
,
saved_reserve_space_x
);
if
(
has_shortcut_
)
{
framework
::
Tensor
cpu_z_out
;
InitMeanVar
(
cpu_mean_z
,
cpu_var_z
,
cpu_saved_mean_z
,
cpu_saved_var_z
);
ComputeBatchNormForward
(
ctx
,
cpu_z_
,
cpu_bn_scale_z_
,
cpu_bn_bias_z_
,
cpu_mean_z
,
cpu_var_z
,
cpu_saved_mean_z
,
cpu_saved_var_z
,
&
cpu_z_out
,
saved_reserve_space_z
);
ComputeInplaceAdd
<
T
>
(
cpu_z_out
,
cpu_y
);
}
else
{
if
(
fuse_add_
)
{
ComputeInplaceAdd
<
T
>
(
cpu_z_
,
cpu_y
);
}
}
if
(
act_type_
==
"relu"
)
{
ComputeInplaceRelu
<
T
>
(
cpu_y
);
}
}
void
BaselineForwardFusedBNAddRelu
(
const
platform
::
CUDADeviceContext
&
ctx
,
Tensor
*
cpu_mean
,
Tensor
*
cpu_var
,
Tensor
*
cpu_saved_mean
,
Tensor
*
cpu_saved_var
,
Tensor
*
cpu_y
,
Tensor
*
cpu_saved_var
,
Tensor
*
cpu_y
,
Tensor
*
cpu
_reserve_space
)
{
Tensor
*
saved
_reserve_space
)
{
InitMeanVar
(
cpu_mean
,
cpu_var
,
cpu_saved_mean
,
cpu_saved_var
);
InitMeanVar
(
cpu_mean
,
cpu_var
,
cpu_saved_mean
,
cpu_saved_var
);
ComputeBatchNormForward
(
ctx
,
cpu_x_
,
cpu_bn_scale_
,
cpu_bn_bias_
,
cpu_mean
,
ComputeFusedBNAddReluForward
(
cpu_var
,
cpu_saved_mean
,
cpu_saved_var
,
cpu_y
,
ctx
,
cpu_x_
,
cpu_z_
,
cpu_bn_scale_x_
,
cpu_bn_bias_x_
,
cpu_mean
,
cpu_var
,
cpu_reserve_space
);
cpu_saved_mean
,
cpu_saved_var
,
cpu_y
,
saved_reserve_space
);
}
void
BaselineBackwardFusedBNAddRelu
(
const
platform
::
CUDADeviceContext
&
ctx
,
Tensor
*
cpu_dx
,
Tensor
*
cpu_dz
,
Tensor
*
cpu_dscale
,
Tensor
*
cpu_dbias
)
{
ComputeFusedBNAddReluBackward
(
ctx
,
cpu_dy_
,
cpu_x_
,
cpu_bn_scale_x_
,
cpu_bn_bias_x_
,
cpu_saved_mean_base_x_
,
cpu_saved_var_base_x_
,
cpu_y_base_
,
saved_reserve_space_x_
,
cpu_dx
,
cpu_dz
,
cpu_dscale
,
cpu_dbias
);
}
void
ComputeFusedBNStatsFinalize
(
const
platform
::
CUDADeviceContext
&
ctx
,
const
Tensor
&
cpu_x
,
const
Tensor
&
cpu_bn_scale
,
const
Tensor
&
cpu_bn_bias
,
Tensor
*
sum
,
Tensor
*
sum_of_square
,
Tensor
*
bn_scale
,
Tensor
*
bn_bias
,
Tensor
*
mean
,
Tensor
*
var
,
Tensor
*
saved_mean
,
Tensor
*
saved_var
,
Tensor
*
equiv_scale
,
Tensor
*
equiv_bias
)
{
framework
::
Tensor
cpu_sum
;
framework
::
Tensor
cpu_sum_of_square
;
ComputeSumAndSquareSum
<
T
>
(
cpu_x
,
&
cpu_sum
,
&
cpu_sum_of_square
);
auto
place
=
ctx
.
GetPlace
();
TensorCopySync
(
cpu_sum
,
place
,
sum
);
TensorCopySync
(
cpu_sum_of_square
,
place
,
sum_of_square
);
TensorCopySync
(
cpu_bn_scale
,
place
,
bn_scale
);
TensorCopySync
(
cpu_bn_bias
,
place
,
bn_bias
);
bn_scale
->
Resize
({
1
,
1
,
1
,
channels_
});
bn_bias
->
Resize
({
1
,
1
,
1
,
channels_
});
// input
float
*
sum_ptr
=
sum
->
data
<
float
>
();
float
*
sum_of_square_ptr
=
sum_of_square
->
data
<
float
>
();
float
*
bn_scale_ptr
=
bn_scale
->
data
<
float
>
();
float
*
bn_bias_ptr
=
bn_bias
->
data
<
float
>
();
mean
->
Resize
({
1
,
1
,
1
,
channels_
});
var
->
Resize
({
1
,
1
,
1
,
channels_
});
// output
float
*
mean_ptr
=
mean
->
data
<
float
>
();
float
*
var_ptr
=
var
->
data
<
float
>
();
float
*
saved_mean_ptr
=
saved_mean
->
mutable_data
<
float
>
({
1
,
1
,
1
,
channels_
},
place
);
float
*
saved_var_ptr
=
saved_var
->
mutable_data
<
float
>
({
1
,
1
,
1
,
channels_
},
place
);
T
*
equiv_scale_ptr
=
equiv_scale
->
mutable_data
<
T
>
({
1
,
1
,
1
,
channels_
},
place
);
T
*
equiv_bias_ptr
=
equiv_bias
->
mutable_data
<
T
>
({
1
,
1
,
1
,
channels_
},
place
);
auto
param_shape
=
framework
::
vectorize
<
int
>
(
bn_scale
->
dims
());
op
::
CudnnBNStatsFinalize
<
T
>
bn_op
(
ctx
,
param_shape
);
bn_op
.
Forward
(
ctx
,
sum_ptr
,
sum_of_square_ptr
,
bn_scale_ptr
,
bn_bias_ptr
,
saved_mean_ptr
,
saved_var_ptr
,
mean_ptr
,
var_ptr
,
equiv_scale_ptr
,
equiv_bias_ptr
,
eps_
,
momentum_
,
ele_count_
,
true
);
}
}
// Get forward results of CudnnBNStatsFinalize + CudnnScaleBiasAddRelu
// Get forward results of CudnnBNStatsFinalize + CudnnScaleBiasAddRelu
void
FusedForward
(
const
platform
::
CUDADeviceContext
&
ctx
,
Tensor
*
cpu_mean
,
void
FusedForward
(
const
platform
::
CUDADeviceContext
&
ctx
,
Tensor
*
cpu_mean_x
,
Tensor
*
cpu_var
,
Tensor
*
cpu_saved_mean
,
Tensor
*
cpu_var_x
,
Tensor
*
cpu_saved_mean_x
,
Tensor
*
cpu_saved_var
,
Tensor
*
cpu_y
,
Tensor
*
cpu_bitmask
)
{
Tensor
*
cpu_saved_var_x
,
Tensor
*
cpu_y
,
Tensor
*
cpu_bitmask
,
Tensor
*
cpu_mean_z
=
nullptr
,
Tensor
*
cpu_var_z
=
nullptr
,
Tensor
*
cpu_saved_mean_z
=
nullptr
,
Tensor
*
cpu_saved_var_z
=
nullptr
)
{
framework
::
Tensor
x
;
framework
::
Tensor
x
;
framework
::
Tensor
sum
;
framework
::
Tensor
sum_x
;
framework
::
Tensor
sum_of_square
;
framework
::
Tensor
sum_of_square_x
;
framework
::
Tensor
bn_scale
;
framework
::
Tensor
bn_scale_x
;
framework
::
Tensor
bn_bias
;
framework
::
Tensor
bn_bias_x
;
framework
::
Tensor
z
;
framework
::
Tensor
sum_z
;
framework
::
Tensor
sum_of_square_z
;
framework
::
Tensor
bn_scale_z
;
framework
::
Tensor
bn_bias_z
;
auto
place
=
ctx
.
GetPlace
();
auto
place
=
ctx
.
GetPlace
();
TensorCopySync
(
cpu_x_
,
place
,
&
x
);
TensorCopySync
(
cpu_x_
,
place
,
&
x
);
TensorCopySync
(
cpu_sum_
,
place
,
&
sum
);
if
(
fuse_add_
||
has_shortcut_
)
{
TensorCopySync
(
cpu_sum_of_square_
,
place
,
&
sum_of_square
);
TensorCopySync
(
cpu_z_
,
place
,
&
z
);
TensorCopySync
(
cpu_bn_scale_
,
place
,
&
bn_scale
);
}
TensorCopySync
(
cpu_bn_bias_
,
place
,
&
bn_bias
);
bn_scale
.
Resize
({
1
,
1
,
1
,
channels_
});
framework
::
Tensor
mean_x
;
bn_bias
.
Resize
({
1
,
1
,
1
,
channels_
});
framework
::
Tensor
var_x
;
framework
::
Tensor
saved_mean_x
;
framework
::
Tensor
saved_var_x
;
framework
::
Tensor
equiv_scale_x
;
framework
::
Tensor
equiv_bias_x
;
T
*
x_ptr
=
x
.
data
<
T
>
();
framework
::
Tensor
mean_z
;
float
*
sum_ptr
=
sum
.
data
<
float
>
();
framework
::
Tensor
var_z
;
float
*
sum_of_square_ptr
=
sum_of_square
.
data
<
float
>
();
framework
::
Tensor
saved_mean_z
;
float
*
bn_scale_ptr
=
bn_scale
.
data
<
float
>
();
framework
::
Tensor
saved_var_z
;
float
*
bn_bias_ptr
=
bn_bias
.
data
<
float
>
();
framework
::
Tensor
equiv_scale_z
;
framework
::
Tensor
equiv_bias_z
;
framework
::
Tensor
mean
;
framework
::
Tensor
var
;
framework
::
Tensor
saved_mean
;
framework
::
Tensor
saved_var
;
framework
::
Tensor
equiv_scale
;
framework
::
Tensor
equiv_bias
;
framework
::
Tensor
y
;
framework
::
Tensor
y
;
framework
::
Tensor
bitmask
;
framework
::
Tensor
bitmask
;
InitMeanVar
(
cpu_mean
,
cpu_var
,
cpu_saved_mean
,
cpu_saved_var
);
InitMeanVar
(
cpu_mean_x
,
cpu_var_x
,
cpu_saved_mean_x
,
cpu_saved_var_x
);
TensorCopySync
(
*
cpu_mean
,
place
,
&
mean
);
TensorCopySync
(
*
cpu_mean_x
,
place
,
&
mean_x
);
TensorCopySync
(
*
cpu_var
,
place
,
&
var
);
TensorCopySync
(
*
cpu_var_x
,
place
,
&
var_x
);
if
(
has_shortcut_
)
{
InitMeanVar
(
cpu_mean_z
,
cpu_var_z
,
cpu_saved_mean_z
,
cpu_saved_var_z
);
TensorCopySync
(
*
cpu_mean_z
,
place
,
&
mean_z
);
TensorCopySync
(
*
cpu_var_z
,
place
,
&
var_z
);
}
mean
.
Resize
({
1
,
1
,
1
,
channels_
});
// 1. BN Stats Finalize
var
.
Resize
({
1
,
1
,
1
,
channels_
});
ComputeFusedBNStatsFinalize
(
ctx
,
cpu_x_
,
cpu_bn_scale_x_
,
cpu_bn_bias_x_
,
&
sum_x
,
&
sum_of_square_x
,
&
bn_scale_x
,
&
bn_bias_x
,
&
mean_x
,
&
var_x
,
&
saved_mean_x
,
&
saved_var_x
,
&
equiv_scale_x
,
&
equiv_bias_x
);
if
(
has_shortcut_
)
{
ComputeFusedBNStatsFinalize
(
ctx
,
cpu_z_
,
cpu_bn_scale_z_
,
cpu_bn_bias_z_
,
&
sum_z
,
&
sum_of_square_z
,
&
bn_scale_z
,
&
bn_bias_z
,
&
mean_z
,
&
var_z
,
&
saved_mean_z
,
&
saved_var_z
,
&
equiv_scale_z
,
&
equiv_bias_z
);
}
float
*
mean_ptr
=
mean
.
data
<
float
>
();
T
*
x_ptr
=
x
.
data
<
T
>
();
float
*
var_ptr
=
var
.
data
<
float
>
();
T
*
z_ptr
=
(
fuse_add_
||
has_shortcut_
)
?
z
.
data
<
T
>
()
:
nullptr
;
float
*
saved_mean_ptr
=
T
*
equiv_scale_x_ptr
=
equiv_scale_x
.
data
<
T
>
();
saved_mean
.
mutable_data
<
float
>
({
1
,
1
,
1
,
channels_
},
place
);
T
*
equiv_bias_x_ptr
=
equiv_bias_x
.
data
<
T
>
();
float
*
saved_var_ptr
=
T
*
equiv_scale_z_ptr
=
has_shortcut_
?
equiv_scale_z
.
data
<
T
>
()
:
nullptr
;
saved_var
.
mutable_data
<
float
>
({
1
,
1
,
1
,
channels_
},
place
);
T
*
equiv_bias_z_ptr
=
has_shortcut_
?
equiv_bias_z
.
data
<
T
>
()
:
nullptr
;
T
*
equiv_scale_ptr
=
equiv_scale
.
mutable_data
<
T
>
({
1
,
1
,
1
,
channels_
},
place
);
T
*
equiv_bias_ptr
=
equiv_bias
.
mutable_data
<
T
>
({
1
,
1
,
1
,
channels_
},
place
);
T
*
y_ptr
=
T
*
y_ptr
=
y
.
mutable_data
<
T
>
({
batch_size_
,
height_
,
width_
,
channels_
},
place
);
y
.
mutable_data
<
T
>
({
batch_size_
,
height_
,
width_
,
channels_
},
place
);
// bitmask
int
c
=
channels_
;
int
c
=
channels_
;
int64_t
nhw
=
ele_count_
;
int64_t
nhw
=
ele_count_
;
int32_t
c_int32_elems
=
((
c
+
63
)
&
~
63
)
/
32
;
int32_t
c_int32_elems
=
((
c
+
63
)
&
~
63
)
/
32
;
...
@@ -325,31 +644,90 @@ class CudnnBNAddReluTester {
...
@@ -325,31 +644,90 @@ class CudnnBNAddReluTester {
{
nhw_int32_elems
,
c_int32_elems
,
1
},
place
);
{
nhw_int32_elems
,
c_int32_elems
,
1
},
place
);
auto
data_shape
=
framework
::
vectorize
<
int
>
(
x
.
dims
());
auto
data_shape
=
framework
::
vectorize
<
int
>
(
x
.
dims
());
auto
param_shape
=
framework
::
vectorize
<
int
>
(
bn_scale
.
dims
());
auto
param_shape
=
framework
::
vectorize
<
int
>
(
bn_scale
_x
.
dims
());
auto
bitmask_shape
=
framework
::
vectorize
<
int
>
(
bitmask
.
dims
());
auto
bitmask_shape
=
framework
::
vectorize
<
int
>
(
bitmask
.
dims
());
// 1. BN Stats Finalize
// 2. Scale Bias + Relu
op
::
CudnnBNStatsFinalize
<
T
>
bn_op
(
ctx
,
param_shape
);
op
::
CudnnScaleBiasAddRelu
<
T
>
sbar_op
(
ctx
,
act_type_
,
fuse_add_
,
bn_op
.
Forward
(
ctx
,
sum_ptr
,
sum_of_square_ptr
,
bn_scale_ptr
,
bn_bias_ptr
,
has_shortcut_
,
data_shape
,
param_shape
,
saved_mean_ptr
,
saved_var_ptr
,
mean_ptr
,
var_ptr
,
bitmask_shape
);
equiv_scale_ptr
,
equiv_bias_ptr
,
eps_
,
momentum_
,
ele_count_
,
sbar_op
.
Forward
(
ctx
,
x_ptr
,
equiv_scale_x_ptr
,
equiv_bias_x_ptr
,
y_ptr
,
true
);
bitmask_ptr
,
z_ptr
,
equiv_scale_z_ptr
,
equiv_bias_z_ptr
);
// 2. Scale Bias + Relu (not fused add)
TensorCopySync
(
mean_x
,
platform
::
CPUPlace
(),
cpu_mean_x
);
std
::
string
act_type
=
""
;
TensorCopySync
(
var_x
,
platform
::
CPUPlace
(),
cpu_var_x
);
op
::
CudnnScaleBiasAddRelu
<
T
>
sbar_op
(
TensorCopySync
(
saved_mean_x
,
platform
::
CPUPlace
(),
cpu_saved_mean_x
);
ctx
,
act_type
,
false
,
false
,
data_shape
,
param_shape
,
bitmask_shape
);
TensorCopySync
(
saved_var_x
,
platform
::
CPUPlace
(),
cpu_saved_var_x
);
sbar_op
.
Forward
(
ctx
,
x_ptr
,
equiv_scale_ptr
,
equiv_bias_ptr
,
y_ptr
,
if
(
has_shortcut_
)
{
bitmask_ptr
);
TensorCopySync
(
mean_z
,
platform
::
CPUPlace
(),
cpu_mean_z
);
TensorCopySync
(
var_z
,
platform
::
CPUPlace
(),
cpu_var_z
);
TensorCopySync
(
mean
,
platform
::
CPUPlace
(),
cpu_mean
);
TensorCopySync
(
saved_mean_z
,
platform
::
CPUPlace
(),
cpu_saved_mean_z
);
TensorCopySync
(
var
,
platform
::
CPUPlace
(),
cpu_var
);
TensorCopySync
(
saved_var_z
,
platform
::
CPUPlace
(),
cpu_saved_var_z
);
TensorCopySync
(
saved_mean
,
platform
::
CPUPlace
(),
cpu_saved_mean
);
}
TensorCopySync
(
saved_var
,
platform
::
CPUPlace
(),
cpu_saved_var
);
TensorCopySync
(
y
,
platform
::
CPUPlace
(),
cpu_y
);
TensorCopySync
(
y
,
platform
::
CPUPlace
(),
cpu_y
);
TensorCopySync
(
bitmask
,
platform
::
CPUPlace
(),
cpu_bitmask
);
TensorCopySync
(
bitmask
,
platform
::
CPUPlace
(),
cpu_bitmask
);
}
}
// Get backward results of CudnnBNStatsFinalize + CudnnScaleBiasAddRelu
void
FusedBackward
(
const
platform
::
CUDADeviceContext
&
ctx
,
Tensor
*
cpu_dx
,
Tensor
*
cpu_dz
,
Tensor
*
cpu_dscale
,
Tensor
*
cpu_dbias
)
{
framework
::
Tensor
dy
;
framework
::
Tensor
x
;
framework
::
Tensor
bn_scale
;
framework
::
Tensor
bn_bias
;
framework
::
Tensor
saved_mean
;
framework
::
Tensor
saved_var
;
framework
::
Tensor
bitmask
;
framework
::
Tensor
dx
;
framework
::
Tensor
dz
;
framework
::
Tensor
dscale
;
framework
::
Tensor
dbias
;
auto
place
=
ctx
.
GetPlace
();
TensorCopySync
(
cpu_dy_
,
place
,
&
dy
);
TensorCopySync
(
cpu_x_
,
place
,
&
x
);
TensorCopySync
(
cpu_bn_scale_x_
,
place
,
&
bn_scale
);
TensorCopySync
(
cpu_bn_bias_x_
,
place
,
&
bn_bias
);
TensorCopySync
(
cpu_saved_mean_x_
,
place
,
&
saved_mean
);
TensorCopySync
(
cpu_saved_var_x_
,
place
,
&
saved_var
);
TensorCopySync
(
cpu_bitmask_
,
place
,
&
bitmask
);
bn_scale
.
Resize
({
1
,
1
,
1
,
channels_
});
bn_bias
.
Resize
({
1
,
1
,
1
,
channels_
});
saved_mean
.
Resize
({
1
,
1
,
1
,
channels_
});
saved_var
.
Resize
({
1
,
1
,
1
,
channels_
});
T
*
dy_ptr
=
dy
.
data
<
T
>
();
T
*
x_ptr
=
x
.
data
<
T
>
();
float
*
bn_scale_ptr
=
bn_scale
.
data
<
float
>
();
float
*
bn_bias_ptr
=
bn_bias
.
data
<
float
>
();
float
*
saved_mean_ptr
=
saved_mean
.
data
<
float
>
();
float
*
saved_var_ptr
=
saved_var
.
data
<
float
>
();
int32_t
*
bitmask_ptr
=
bitmask
.
data
<
int32_t
>
();
T
*
dx_ptr
=
dx
.
mutable_data
<
T
>
({
batch_size_
,
height_
,
width_
,
channels_
},
place
);
T
*
dz_ptr
=
dz
.
mutable_data
<
T
>
({
batch_size_
,
height_
,
width_
,
channels_
},
place
);
float
*
dscale_ptr
=
dscale
.
mutable_data
<
float
>
({
1
,
1
,
1
,
channels_
},
place
);
float
*
dbias_ptr
=
dbias
.
mutable_data
<
float
>
({
1
,
1
,
1
,
channels_
},
place
);
auto
data_shape
=
framework
::
vectorize
<
int
>
(
x
.
dims
());
auto
param_shape
=
framework
::
vectorize
<
int
>
(
bn_scale
.
dims
());
auto
bitmask_shape
=
framework
::
vectorize
<
int
>
(
bitmask
.
dims
());
std
::
string
act_type
=
"relu"
;
op
::
CudnnScaleBiasAddRelu
<
T
>
sbar_op
(
ctx
,
act_type
,
true
,
false
,
data_shape
,
param_shape
,
bitmask_shape
);
sbar_op
.
Backward
(
ctx
,
dy_ptr
,
x_ptr
,
bn_scale_ptr
,
bn_bias_ptr
,
saved_mean_ptr
,
saved_var_ptr
,
bitmask_ptr
,
dx_ptr
,
dz_ptr
,
dscale_ptr
,
dbias_ptr
,
eps_
);
TensorCopySync
(
dx
,
platform
::
CPUPlace
(),
cpu_dx
);
TensorCopySync
(
dz
,
platform
::
CPUPlace
(),
cpu_dz
);
TensorCopySync
(
dscale
,
platform
::
CPUPlace
(),
cpu_dscale
);
TensorCopySync
(
dbias
,
platform
::
CPUPlace
(),
cpu_dbias
);
}
private:
private:
int
batch_size_
;
int
batch_size_
;
int
height_
;
int
height_
;
...
@@ -357,24 +735,80 @@ class CudnnBNAddReluTester {
...
@@ -357,24 +735,80 @@ class CudnnBNAddReluTester {
int
channels_
;
int
channels_
;
int
ele_count_
;
int
ele_count_
;
std
::
string
act_type_
;
bool
fuse_add_
;
bool
has_shortcut_
;
// Forward input
// Forward input
framework
::
Tensor
cpu_x_
;
framework
::
Tensor
cpu_x_
;
framework
::
Tensor
cpu_sum_
;
framework
::
Tensor
cpu_bn_scale_x_
;
framework
::
Tensor
cpu_sum_of_square_
;
framework
::
Tensor
cpu_bn_bias_x_
;
framework
::
Tensor
cpu_bn_scale_
;
framework
::
Tensor
cpu_z_
;
framework
::
Tensor
cpu_bn_bias_
;
framework
::
Tensor
cpu_bn_scale_z_
;
framework
::
Tensor
cpu_bn_bias_z_
;
// Backward input
framework
::
Tensor
cpu_dy_
;
framework
::
Tensor
cpu_bitmask_
;
framework
::
Tensor
cpu_saved_mean_x_
;
framework
::
Tensor
cpu_saved_var_x_
;
framework
::
Tensor
cpu_saved_mean_z_
;
framework
::
Tensor
cpu_saved_var_z_
;
framework
::
Tensor
cpu_saved_mean_base_x_
;
framework
::
Tensor
cpu_saved_var_base_x_
;
framework
::
Tensor
saved_reserve_space_x_
;
framework
::
Tensor
cpu_saved_mean_base_z_
;
framework
::
Tensor
cpu_saved_var_base_z_
;
framework
::
Tensor
saved_reserve_space_z_
;
framework
::
Tensor
cpu_y_base_
;
double
eps_
=
1e-5
;
double
eps_
=
1e-5
;
float
momentum_
=
0.9
;
float
momentum_
=
0.9
;
};
};
TEST
(
CudnnBNAddReluForward
,
GPUCudnnBNAddReluForwardFp16
)
{
// fp16 forward check with an empty activation type and has_shortcut = false.
// The tester is built and checked twice: once with fuse_add = false and once
// with fuse_add = true, both against a tolerance of 2e-3.
TEST(CudnnBNAddReluFp16, BNAdd) {
  FLAGS_cudnn_batchnorm_spatial_persistent = true;

  int num_batches = 4;
  int in_height = 8;
  int in_width = 8;
  int num_channels = 64;
  std::string activation = "";
  bool with_shortcut = false;

  for (bool fuse_add : {false, true}) {
    CudnnBNAddReluTester<paddle::platform::float16> tester(
        num_batches, in_height, in_width, num_channels, activation, fuse_add,
        with_shortcut);
    tester.CheckForward(2e-3);
  }
}
TEST
(
CudnnBNAddReluFp16
,
BNAddRelu
)
{
int
batch_size
=
4
;
int
batch_size
=
4
;
int
height
=
8
;
int
height
=
8
;
int
width
=
8
;
int
width
=
8
;
int
channels
=
64
;
int
channels
=
64
;
std
::
string
act_type
=
"relu"
;
bool
has_shortcut
=
false
;
FLAGS_cudnn_batchnorm_spatial_persistent
=
true
;
FLAGS_cudnn_batchnorm_spatial_persistent
=
true
;
CudnnBNAddReluTester
<
paddle
::
platform
::
float16
>
test
(
batch_size
,
height
,
for
(
auto
fuse_add
:
{
false
,
true
})
{
width
,
channels
);
CudnnBNAddReluTester
<
paddle
::
platform
::
float16
>
test
(
batch_size
,
height
,
width
,
channels
,
act_type
,
fuse_add
,
has_shortcut
);
test
.
CheckForward
(
2e-3
);
test
.
CheckForward
(
2e-3
);
if
(
fuse_add
)
{
test
.
CheckBackward
(
2e-4
);
}
}
}
// fp16 forward check with has_shortcut = true, fuse_add = false and an empty
// activation type, verified against a tolerance of 5e-3.
TEST(CudnnBNAddReluFp16, HasShortcut) {
  FLAGS_cudnn_batchnorm_spatial_persistent = true;

  int num_batches = 4;
  int in_height = 8;
  int in_width = 8;
  int num_channels = 64;
  std::string activation = "";
  bool with_fused_add = false;
  bool with_shortcut = true;

  CudnnBNAddReluTester<paddle::platform::float16> tester(
      num_batches, in_height, in_width, num_channels, activation,
      with_fused_add, with_shortcut);
  tester.CheckForward(5e-3);
}
}
paddle/fluid/operators/fused/cudnn_norm_conv_test.cc
浏览文件 @
a679fcbb
...
@@ -92,10 +92,9 @@ void CheckOutput(const framework::Tensor &cpu_res,
...
@@ -92,10 +92,9 @@ void CheckOutput(const framework::Tensor &cpu_res,
}
}
// Use Paddle conv2d op results as baseline
// Use Paddle conv2d op results as baseline
template
<
typename
T
>
void
ComputeConv2DForward
(
const
platform
::
CUDADeviceContext
&
ctx
,
void
ComputeConv2DForward
(
const
platform
::
CUDADeviceContext
&
ctx
,
const
Tensor
&
cpu_input
,
const
Tensor
&
cpu_filter
,
const
Tensor
&
cpu_input
,
const
Tensor
&
cpu_filter
,
Tensor
*
cpu_output
)
{
Tensor
*
cpu_output
,
int
stride
,
int
padding
)
{
framework
::
Scope
scope
;
framework
::
Scope
scope
;
auto
*
input
=
scope
.
Var
(
"Input"
)
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
*
input
=
scope
.
Var
(
"Input"
)
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
*
filter
=
scope
.
Var
(
"Filter"
)
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
*
filter
=
scope
.
Var
(
"Filter"
)
->
GetMutable
<
framework
::
LoDTensor
>
();
...
@@ -108,10 +107,12 @@ void ComputeConv2DForward(const platform::CUDADeviceContext &ctx,
...
@@ -108,10 +107,12 @@ void ComputeConv2DForward(const platform::CUDADeviceContext &ctx,
framework
::
AttributeMap
attrs
;
framework
::
AttributeMap
attrs
;
bool
use_cudnn
=
true
;
bool
use_cudnn
=
true
;
std
::
string
data_format
=
"NHWC"
;
std
::
string
data_format
=
"NHWC"
;
std
::
string
padding_algorithm
=
"SAME"
;
std
::
vector
<
int
>
strides
=
{
stride
,
stride
};
std
::
vector
<
int
>
paddings
=
{
padding
,
padding
};
attrs
.
insert
({
"strides"
,
strides
});
attrs
.
insert
({
"paddings"
,
paddings
});
attrs
.
insert
({
"use_cudnn"
,
use_cudnn
});
attrs
.
insert
({
"use_cudnn"
,
use_cudnn
});
attrs
.
insert
({
"data_format"
,
data_format
});
attrs
.
insert
({
"data_format"
,
data_format
});
attrs
.
insert
({
"padding_algorithm"
,
padding_algorithm
});
auto
op
=
framework
::
OpRegistry
::
CreateOp
(
auto
op
=
framework
::
OpRegistry
::
CreateOp
(
"conv2d"
,
{{
"Input"
,
{
"Input"
}},
{
"Filter"
,
{
"Filter"
}}},
"conv2d"
,
{{
"Input"
,
{
"Input"
}},
{
"Filter"
,
{
"Filter"
}}},
...
@@ -122,7 +123,6 @@ void ComputeConv2DForward(const platform::CUDADeviceContext &ctx,
...
@@ -122,7 +123,6 @@ void ComputeConv2DForward(const platform::CUDADeviceContext &ctx,
}
}
// Use Paddle conv2d_grad op results as baseline
// Use Paddle conv2d_grad op results as baseline
template
<
typename
T
>
void
ComputeConv2DBackward
(
const
platform
::
CUDADeviceContext
&
ctx
,
void
ComputeConv2DBackward
(
const
platform
::
CUDADeviceContext
&
ctx
,
const
Tensor
&
cpu_input
,
const
Tensor
&
cpu_filter
,
const
Tensor
&
cpu_input
,
const
Tensor
&
cpu_filter
,
const
Tensor
&
cpu_output_grad
,
const
Tensor
&
cpu_output_grad
,
...
@@ -147,7 +147,7 @@ void ComputeConv2DBackward(const platform::CUDADeviceContext &ctx,
...
@@ -147,7 +147,7 @@ void ComputeConv2DBackward(const platform::CUDADeviceContext &ctx,
framework
::
AttributeMap
attrs
;
framework
::
AttributeMap
attrs
;
bool
use_cudnn
=
true
;
bool
use_cudnn
=
true
;
std
::
string
data_format
=
"NHWC"
;
std
::
string
data_format
=
"NHWC"
;
std
::
string
padding_algorithm
=
"
SAME
"
;
std
::
string
padding_algorithm
=
"
EXPLICIT
"
;
std
::
vector
<
int
>
strides
=
{
stride
,
stride
};
std
::
vector
<
int
>
strides
=
{
stride
,
stride
};
std
::
vector
<
int
>
paddings
=
{
padding
,
padding
};
std
::
vector
<
int
>
paddings
=
{
padding
,
padding
};
std
::
vector
<
int
>
dilations
=
{
dilation
,
dilation
};
std
::
vector
<
int
>
dilations
=
{
dilation
,
dilation
};
...
@@ -216,6 +216,8 @@ class CudnnNormConvolutionTester {
...
@@ -216,6 +216,8 @@ class CudnnNormConvolutionTester {
kernel_size_
=
kernel_size
;
kernel_size_
=
kernel_size
;
stride_
=
stride
;
stride_
=
stride
;
padding_
=
(
kernel_size_
-
1
)
/
2
;
padding_
=
(
kernel_size_
-
1
)
/
2
;
out_height_
=
(
height_
+
2
*
padding_
-
kernel_size_
)
/
stride_
+
1
;
out_width_
=
(
width_
+
2
*
padding_
-
kernel_size_
)
/
stride_
+
1
;
SetUp
();
SetUp
();
}
}
...
@@ -227,6 +229,15 @@ class CudnnNormConvolutionTester {
...
@@ -227,6 +229,15 @@ class CudnnNormConvolutionTester {
platform
::
DeviceContextPool
::
Instance
().
Get
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
platform
::
CUDAPlace
(
0
)));
platform
::
CUDAPlace
(
0
)));
if
(
!
Support
(
*
ctx
))
{
LOG
(
INFO
)
<<
"Current test is only supported in the platforms with "
<<
"compatiblity greater than or equal to 70 and the kernel size "
<<
"must be equal to 1 or 3. Besides, when the kernel size is 1, "
<<
"the stride must be 1 if the compatiblity is equal to 70."
;
return
;
}
framework
::
Tensor
cpu_output_base
;
framework
::
Tensor
cpu_output_base
;
framework
::
Tensor
cpu_sum_base
;
framework
::
Tensor
cpu_sum_base
;
framework
::
Tensor
cpu_sum_of_square_base
;
framework
::
Tensor
cpu_sum_of_square_base
;
...
@@ -277,7 +288,8 @@ class CudnnNormConvolutionTester {
...
@@ -277,7 +288,8 @@ class CudnnNormConvolutionTester {
&
cpu_filter_nchw_
);
&
cpu_filter_nchw_
);
// transpose for filter, NCHW -> NHWC
// transpose for filter, NCHW -> NHWC
TransposeNchwToNhwc
<
T
>
(
cpu_filter_nchw_
,
&
cpu_filter_nhwc_
);
TransposeNchwToNhwc
<
T
>
(
cpu_filter_nchw_
,
&
cpu_filter_nhwc_
);
InitRandomTensor
<
T
>
({
batch_size_
,
height_
,
width_
,
output_channels_
},
InitRandomTensor
<
T
>
(
{
batch_size_
,
out_height_
,
out_width_
,
output_channels_
},
&
cpu_output_grad_
);
&
cpu_output_grad_
);
}
}
...
@@ -285,7 +297,8 @@ class CudnnNormConvolutionTester {
...
@@ -285,7 +297,8 @@ class CudnnNormConvolutionTester {
framework
::
Tensor
*
cpu_output_base
,
framework
::
Tensor
*
cpu_output_base
,
framework
::
Tensor
*
cpu_sum_base
,
framework
::
Tensor
*
cpu_sum_base
,
framework
::
Tensor
*
cpu_sum_of_square_base
)
{
framework
::
Tensor
*
cpu_sum_of_square_base
)
{
ComputeConv2DForward
<
T
>
(
ctx
,
cpu_input_
,
cpu_filter_nchw_
,
cpu_output_base
);
ComputeConv2DForward
(
ctx
,
cpu_input_
,
cpu_filter_nchw_
,
cpu_output_base
,
stride_
,
padding_
);
ComputeSumAndSquareSum
<
T
>
(
*
cpu_output_base
,
cpu_sum_base
,
ComputeSumAndSquareSum
<
T
>
(
*
cpu_output_base
,
cpu_sum_base
,
cpu_sum_of_square_base
);
cpu_sum_of_square_base
);
}
}
...
@@ -293,10 +306,9 @@ class CudnnNormConvolutionTester {
...
@@ -293,10 +306,9 @@ class CudnnNormConvolutionTester {
void
BaselineBackward
(
const
platform
::
CUDADeviceContext
&
ctx
,
void
BaselineBackward
(
const
platform
::
CUDADeviceContext
&
ctx
,
framework
::
Tensor
*
cpu_input_grad_base
,
framework
::
Tensor
*
cpu_input_grad_base
,
framework
::
Tensor
*
cpu_filter_grad_base
)
{
framework
::
Tensor
*
cpu_filter_grad_base
)
{
ComputeConv2DBackward
<
T
>
(
ctx
,
cpu_input_
,
cpu_filter_nchw_
,
ComputeConv2DBackward
(
ctx
,
cpu_input_
,
cpu_filter_nchw_
,
cpu_output_grad_
,
cpu_output_grad_
,
cpu_input_grad_base
,
cpu_input_grad_base
,
cpu_filter_grad_base
,
stride_
,
cpu_filter_grad_base
,
stride_
,
padding_
,
padding_
,
dilation_
);
dilation_
);
}
}
// get forward results of cudnn_norm_conv
// get forward results of cudnn_norm_conv
...
@@ -316,7 +328,7 @@ class CudnnNormConvolutionTester {
...
@@ -316,7 +328,7 @@ class CudnnNormConvolutionTester {
T
*
input_ptr
=
input
.
data
<
T
>
();
T
*
input_ptr
=
input
.
data
<
T
>
();
T
*
filter_ptr
=
filter_nhwc
.
data
<
T
>
();
T
*
filter_ptr
=
filter_nhwc
.
data
<
T
>
();
T
*
output_ptr
=
output
.
mutable_data
<
T
>
(
T
*
output_ptr
=
output
.
mutable_data
<
T
>
(
{
batch_size_
,
height_
,
width_
,
output_channels_
},
place
);
{
batch_size_
,
out_height_
,
out_
width_
,
output_channels_
},
place
);
float
*
sum_ptr
=
float
*
sum_ptr
=
sum
.
mutable_data
<
float
>
({
1
,
1
,
1
,
output_channels_
},
place
);
sum
.
mutable_data
<
float
>
({
1
,
1
,
1
,
output_channels_
},
place
);
float
*
sum_of_square_ptr
=
float
*
sum_of_square_ptr
=
...
@@ -369,10 +381,25 @@ class CudnnNormConvolutionTester {
...
@@ -369,10 +381,25 @@ class CudnnNormConvolutionTester {
TensorCopySync
(
filter_grad
,
platform
::
CPUPlace
(),
cpu_filter_grad
);
TensorCopySync
(
filter_grad
,
platform
::
CPUPlace
(),
cpu_filter_grad
);
}
}
// Returns whether this tester's configuration is supported on the device
// behind `ctx`:
//   - compute capability == 70: kernel size 3, or kernel size 1 with stride 1;
//   - compute capability  > 70: kernel size 3 or kernel size 1;
//   - otherwise: not supported.
bool Support(const platform::CUDADeviceContext &ctx) {
  const bool kernel_is_3 = (kernel_size_ == 3);
  const bool kernel_is_1 = (kernel_size_ == 1);

  if (ctx.GetComputeCapability() == 70) {
    // On capability 70 the 1-sized kernel is only accepted with unit stride.
    return kernel_is_3 || (kernel_is_1 && (stride_ == 1));
  }
  if (ctx.GetComputeCapability() > 70) {
    return kernel_is_3 || kernel_is_1;
  }
  return false;
}
private:
private:
int
batch_size_
;
int
batch_size_
;
int
height_
;
int
height_
;
int
width_
;
int
width_
;
int
out_height_
;
int
out_width_
;
int
input_channels_
;
int
input_channels_
;
int
output_channels_
;
int
output_channels_
;
int
kernel_size_
;
int
kernel_size_
;
...
@@ -437,3 +464,19 @@ TEST(CudnnNormConvFp16, K1S1O4) {
...
@@ -437,3 +464,19 @@ TEST(CudnnNormConvFp16, K1S1O4) {
test
.
CheckForward
(
1e-3
,
true
);
test
.
CheckForward
(
1e-3
,
true
);
test
.
CheckBackward
(
1e-3
,
true
);
test
.
CheckBackward
(
1e-3
,
true
);
}
}
// fp16 case: kernel size 1, stride 2, and four times as many output channels
// as input channels (32 -> 128) on a 4 x 8 x 8 input.
TEST(CudnnNormConvFp16, K1S2O4) {
  int num_batches = 4;
  int in_height = 8;
  int in_width = 8;
  int in_channels = 32;
  int out_channels = 128;
  int kernel = 1;
  int conv_stride = 2;

  CudnnNormConvolutionTester<paddle::platform::float16> tester(
      num_batches, in_height, in_width, in_channels, out_channels, kernel,
      conv_stride);
  tester.CheckForward(1e-3, true);
  tester.CheckBackward(1e-3);
}
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录