Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
c63a63d5
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 2 年 前同步成功
通知
2325
Star
20933
Fork
5424
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
c63a63d5
编写于
1月 14, 2020
作者:
Z
Zhen Wang
提交者:
Zeng Jinle
1月 13, 2020
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
support the fusion of batch_norm and relu for AMP. test=release/1.7 (#22210)
上级
fedb609d
变更
17
显示空白变更内容
内联
并排
Showing
17 changed file
with
1587 addition
and
3 deletion
+1587
-3
cmake/operators.cmake
cmake/operators.cmake
+1
-1
paddle/fluid/framework/details/CMakeLists.txt
paddle/fluid/framework/details/CMakeLists.txt
+1
-1
paddle/fluid/framework/details/build_strategy.cc
paddle/fluid/framework/details/build_strategy.cc
+8
-0
paddle/fluid/framework/details/build_strategy.h
paddle/fluid/framework/details/build_strategy.h
+1
-0
paddle/fluid/framework/ir/CMakeLists.txt
paddle/fluid/framework/ir/CMakeLists.txt
+1
-0
paddle/fluid/framework/ir/fuse_bn_act_pass.cc
paddle/fluid/framework/ir/fuse_bn_act_pass.cc
+332
-0
paddle/fluid/framework/ir/fuse_bn_act_pass.h
paddle/fluid/framework/ir/fuse_bn_act_pass.h
+64
-0
paddle/fluid/framework/ir/graph_pattern_detector.cc
paddle/fluid/framework/ir/graph_pattern_detector.cc
+129
-0
paddle/fluid/framework/ir/graph_pattern_detector.h
paddle/fluid/framework/ir/graph_pattern_detector.h
+61
-0
paddle/fluid/framework/unused_var_check.cc
paddle/fluid/framework/unused_var_check.cc
+2
-0
paddle/fluid/operators/fused/CMakeLists.txt
paddle/fluid/operators/fused/CMakeLists.txt
+6
-0
paddle/fluid/operators/fused/fused_bn_activation_op.cc
paddle/fluid/operators/fused/fused_bn_activation_op.cc
+296
-0
paddle/fluid/operators/fused/fused_bn_activation_op.cu
paddle/fluid/operators/fused/fused_bn_activation_op.cu
+432
-0
paddle/fluid/operators/fused/fused_bn_activation_op.h
paddle/fluid/operators/fused/fused_bn_activation_op.h
+109
-0
paddle/fluid/pybind/pybind.cc
paddle/fluid/pybind/pybind.cc
+20
-0
python/paddle/fluid/tests/unittests/CMakeLists.txt
python/paddle/fluid/tests/unittests/CMakeLists.txt
+3
-1
python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py
python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py
+121
-0
未找到文件。
cmake/operators.cmake
浏览文件 @
c63a63d5
...
...
@@ -118,7 +118,7 @@ function(op_library TARGET)
"tensor_array_read_write_op"
"tensorrt_engine_op"
"conv_fusion_op"
"fusion_transpose_flatten_concat_op"
"fusion_conv_inception_op"
"sync_batch_norm_op"
"dgc_op"
"fused_fc_elementwise_layernorm_op"
"multihead_matmul_op"
"fusion_group_op"
)
"multihead_matmul_op"
"fusion_group_op"
"fused_bn_activation_op"
)
if
(
"
${
TARGET
}
"
STREQUAL
"
${
manual_pybind_op
}
"
)
set
(
pybind_flag 1
)
endif
()
...
...
paddle/fluid/framework/details/CMakeLists.txt
浏览文件 @
c63a63d5
...
...
@@ -100,7 +100,7 @@ endif()
cc_library
(
build_strategy SRCS build_strategy.cc DEPS
graph_viz_pass multi_devices_graph_pass
multi_devices_graph_print_pass multi_devices_graph_check_pass
fuse_elewise_add_act_pass
multi_batch_merge_pass
fuse_elewise_add_act_pass
fuse_bn_act_pass multi_batch_merge_pass
fuse_relu_depthwise_conv_pass
lock_free_optimize_pass
coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass
...
...
paddle/fluid/framework/details/build_strategy.cc
浏览文件 @
c63a63d5
...
...
@@ -167,6 +167,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
"fuse_relu_depthwise_conv_pass"
);
AppendPassWithCheck
(
strategy_
.
fuse_elewise_add_act_ops_
,
"fuse_elewise_add_act_pass"
);
AppendPassWithCheck
(
strategy_
.
fuse_bn_act_ops_
,
"fuse_bn_act_pass"
);
// for single card training, fuse_all_reduce_ops is unnecessary.
// coalesce_grad_tensor_pass should be before of MultiDevPass.
AppendPassWithCheck
(
strategy_
.
fuse_all_reduce_ops_
,
...
...
@@ -369,6 +370,12 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
"GPU, skipped."
;
continue
;
}
}
else
if
(
pass
->
Type
()
==
"fuse_bn_act_pass"
)
{
if
(
!
use_cuda
)
{
LOG
(
WARNING
)
<<
"fuse_bn_act_pass is only supported on "
"GPU, skipped."
;
continue
;
}
}
else
if
(
pass
->
Type
()
==
"mkldnn_placement_pass"
)
{
pass
->
Set
(
"mkldnn_enabled_op_types"
,
new
std
::
unordered_set
<
std
::
string
>
(
mkldnn_enabled_op_types_
));
...
...
@@ -394,6 +401,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
USE_PASS
(
sync_batch_norm_pass
);
USE_PASS
(
fuse_relu_depthwise_conv_pass
);
USE_PASS
(
fuse_elewise_add_act_pass
);
USE_PASS
(
fuse_bn_act_pass
);
USE_PASS
(
graph_viz_pass
);
USE_PASS
(
multi_batch_merge_pass
);
USE_PASS
(
reduce_mode_multi_devices_pass
);
...
...
paddle/fluid/framework/details/build_strategy.h
浏览文件 @
c63a63d5
...
...
@@ -87,6 +87,7 @@ struct BuildStrategy {
// TODO(dev-paddle): fuse_elewise_add_act_ops may cause some models have
// cycle.
bool
fuse_elewise_add_act_ops_
{
false
};
bool
fuse_bn_act_ops_
{
false
};
// Fuse_all_optimizer_ops and fuse_all_reduce_ops require that gradients
// should not be sparse types
boost
::
optional
<
bool
>
fuse_all_optimizer_ops_
{
false
};
...
...
paddle/fluid/framework/ir/CMakeLists.txt
浏览文件 @
c63a63d5
...
...
@@ -106,6 +106,7 @@ if(WITH_NGRAPH)
set
(
INFER_IR_PASSES
${
INFER_IR_PASSES
}
ngraph_subgraph_pass CACHE INTERNAL
""
)
endif
()
cc_library
(
fuse_bn_act_pass SRCS fuse_bn_act_pass.cc DEPS pass graph_pattern_detector
)
cc_library
(
fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector
)
cc_library
(
fuse_relu_depthwise_conv_pass SRCS fuse_relu_depthwise_conv_pass.cc DEPS pass graph_pattern_detector
)
...
...
paddle/fluid/framework/ir/fuse_bn_act_pass.cc
0 → 100644
浏览文件 @
c63a63d5
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/fuse_bn_act_pass.h"
#include <algorithm>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/enforce.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
namespace
paddle
{
namespace
framework
{
namespace
ir
{
void
FuseBatchNormActPass
::
ApplyImpl
(
ir
::
Graph
*
graph
)
const
{
#ifdef PADDLE_WITH_CUDA
#if CUDNN_VERSION_MIN(7, 4, 1)
// forward
std
::
unordered_set
<
std
::
string
>
act_types
=
{
"relu"
};
graph
=
FuseBatchNormAct
(
graph
,
act_types
);
// backward
std
::
unordered_set
<
std
::
string
>
act_grad_types
=
{
"relu_grad"
};
graph
=
FuseBatchNormActGrad
(
graph
,
act_grad_types
);
#endif
#endif
}
// act(bn(x))
ir
::
Graph
*
FuseBatchNormActPass
::
FuseBatchNormAct
(
ir
::
Graph
*
graph
,
const
std
::
unordered_set
<
std
::
string
>
&
act_types
)
const
{
PADDLE_ENFORCE_NOT_NULL
(
graph
,
platform
::
errors
::
InvalidArgument
(
"The input graph of FuseBatchNormAct should not be nullptr."
));
FusePassBase
::
Init
(
"bn_act"
,
graph
);
GraphPatternDetector
gpd
;
auto
*
x
=
gpd
.
mutable_pattern
()
->
NewNode
(
"bn_act/x"
)
->
AsInput
()
->
assert_is_op_input
(
"batch_norm"
,
"X"
)
->
assert_var_dtype
(
proto
::
VarType
::
FP16
);
patterns
::
BatchNormAct
bn_act_pattern
(
gpd
.
mutable_pattern
(),
"bn_act"
);
bn_act_pattern
(
x
,
act_types
);
int
found_bn_act_count
=
0
;
auto
handler
=
[
&
](
const
GraphPatternDetector
::
subgraph_t
&
subgraph
,
Graph
*
g
)
{
VLOG
(
4
)
<<
"handle FuseBatchNormAct fuse"
;
// BN inputs
GET_IR_NODE_FROM_SUBGRAPH
(
bn_scale
,
bn_scale
,
bn_act_pattern
);
GET_IR_NODE_FROM_SUBGRAPH
(
bn_bias
,
bn_bias
,
bn_act_pattern
);
GET_IR_NODE_FROM_SUBGRAPH
(
bn_variance
,
bn_variance
,
bn_act_pattern
);
GET_IR_NODE_FROM_SUBGRAPH
(
bn_mean
,
bn_mean
,
bn_act_pattern
);
// BN outputs
GET_IR_NODE_FROM_SUBGRAPH
(
bn_mean_out
,
bn_mean_out
,
bn_act_pattern
);
GET_IR_NODE_FROM_SUBGRAPH
(
bn_variance_out
,
bn_variance_out
,
bn_act_pattern
);
GET_IR_NODE_FROM_SUBGRAPH
(
bn_saved_variance
,
bn_saved_variance
,
bn_act_pattern
);
GET_IR_NODE_FROM_SUBGRAPH
(
bn_saved_mean
,
bn_saved_mean
,
bn_act_pattern
);
GET_IR_NODE_FROM_SUBGRAPH
(
bn_reserve_space
,
bn_reserve_space
,
bn_act_pattern
);
GET_IR_NODE_FROM_SUBGRAPH
(
bn_out
,
bn_out
,
bn_act_pattern
);
// ACT output
GET_IR_NODE_FROM_SUBGRAPH
(
act_out
,
act_out
,
bn_act_pattern
);
// ops
GET_IR_NODE_FROM_SUBGRAPH
(
batch_norm
,
batch_norm
,
bn_act_pattern
);
GET_IR_NODE_FROM_SUBGRAPH
(
act
,
act
,
bn_act_pattern
);
std
::
string
bn_x_n
=
subgraph
.
at
(
x
)
->
Name
();
std
::
string
bn_scale_n
=
bn_scale
->
Name
();
std
::
string
bn_bias_n
=
bn_bias
->
Name
();
std
::
string
bn_variance_n
=
bn_variance
->
Name
();
std
::
string
bn_mean_n
=
bn_mean
->
Name
();
std
::
string
bn_mean_out_n
=
bn_mean_out
->
Name
();
std
::
string
bn_variance_out_n
=
bn_variance_out
->
Name
();
std
::
string
bn_saved_variance_n
=
bn_saved_variance
->
Name
();
std
::
string
bn_saved_mean_n
=
bn_saved_mean
->
Name
();
std
::
string
bn_reserve_space_n
=
bn_reserve_space
->
Name
();
std
::
string
bn_out_n
=
bn_out
->
Name
();
std
::
string
act_out_n
=
act_out
->
Name
();
Node
*
fused_bn_act_node
=
CreateFusedBatchNormActNode
(
g
,
act
,
batch_norm
,
bn_x_n
,
bn_scale_n
,
bn_bias_n
,
bn_variance_n
,
bn_mean_n
,
bn_mean_out_n
,
bn_variance_out_n
,
bn_saved_variance_n
,
bn_saved_mean_n
,
bn_reserve_space_n
,
act_out_n
);
VLOG
(
4
)
<<
"
\n\t
"
<<
bn_x_n
<<
", "
<<
bn_scale_n
<<
", "
<<
bn_bias_n
<<
", "
<<
bn_variance_n
<<
" and "
<<
bn_mean_n
<<
" -> "
<<
batch_norm
->
Name
()
<<
" -> "
<<
bn_mean_out_n
<<
", "
<<
bn_variance_out_n
<<
", "
<<
bn_saved_variance_n
<<
", "
<<
bn_saved_mean_n
<<
", "
<<
bn_reserve_space_n
<<
" and "
<<
bn_out_n
<<
"
\n
"
<<
"
\t
"
<<
bn_out_n
<<
" -> "
<<
act
->
Name
()
<<
" -> "
<<
act_out_n
;
ReLinkNodes
(
g
,
bn_out
,
batch_norm
,
act
,
fused_bn_act_node
);
found_bn_act_count
++
;
};
gpd
(
graph
,
handler
);
AddStatis
(
found_bn_act_count
);
return
graph
;
}
Node
*
FuseBatchNormActPass
::
CreateFusedBatchNormActNode
(
Graph
*
g
,
const
Node
*
act
,
const
Node
*
bn
,
const
std
::
string
&
bn_x_n
,
const
std
::
string
&
bn_scale_n
,
const
std
::
string
&
bn_bias_n
,
const
std
::
string
&
bn_variance_n
,
const
std
::
string
&
bn_mean_n
,
const
std
::
string
&
bn_mean_out_n
,
const
std
::
string
&
bn_variance_out_n
,
const
std
::
string
&
bn_saved_variance_n
,
const
std
::
string
&
bn_saved_mean_n
,
const
std
::
string
&
bn_reserve_space_n
,
const
std
::
string
&
act_out_n
)
const
{
OpDesc
desc
;
desc
.
SetInput
(
"X"
,
std
::
vector
<
std
::
string
>
({
bn_x_n
}));
desc
.
SetInput
(
"Scale"
,
std
::
vector
<
std
::
string
>
({
bn_scale_n
}));
desc
.
SetInput
(
"Bias"
,
std
::
vector
<
std
::
string
>
({
bn_bias_n
}));
desc
.
SetInput
(
"Mean"
,
std
::
vector
<
std
::
string
>
({
bn_mean_n
}));
desc
.
SetInput
(
"Variance"
,
std
::
vector
<
std
::
string
>
({
bn_variance_n
}));
desc
.
SetOutput
(
"Y"
,
std
::
vector
<
std
::
string
>
({
act_out_n
}));
desc
.
SetOutput
(
"MeanOut"
,
std
::
vector
<
std
::
string
>
({
bn_mean_out_n
}));
desc
.
SetOutput
(
"VarianceOut"
,
std
::
vector
<
std
::
string
>
({
bn_variance_out_n
}));
desc
.
SetOutput
(
"SavedMean"
,
std
::
vector
<
std
::
string
>
({
bn_saved_mean_n
}));
desc
.
SetOutput
(
"SavedVariance"
,
std
::
vector
<
std
::
string
>
({
bn_saved_variance_n
}));
desc
.
SetOutput
(
"ReserveSpace"
,
std
::
vector
<
std
::
string
>
({
bn_reserve_space_n
}));
desc
.
SetType
(
"fused_batch_norm_act"
);
desc
.
SetAttr
(
"act_type"
,
act
->
Name
());
// Set attrs
for
(
auto
&
n
:
{
act
->
Op
(),
bn
->
Op
()})
{
for
(
auto
&
m
:
n
->
GetAttrMap
())
{
desc
.
SetAttr
(
m
.
first
,
m
.
second
);
}
}
auto
fused_bn_act_node
=
g
->
CreateOpNode
(
&
desc
);
return
fused_bn_act_node
;
}
// the backward of act(bn(x))
// act_grad: in["Out", "Out@GRAD"], out["X@GRAD"]
// bn_grad: in["X", "Y@GRAD", "Scale", "Bias", "SavedMean", "SavedVariance",
// "ReserveSpace"],
// out["X@GRAD", "Scale@GRAD", "Bias@GRAD"]
ir
::
Graph
*
FuseBatchNormActPass
::
FuseBatchNormActGrad
(
ir
::
Graph
*
graph
,
const
std
::
unordered_set
<
std
::
string
>
&
act_grad_types
)
const
{
PADDLE_ENFORCE_NOT_NULL
(
graph
,
platform
::
errors
::
InvalidArgument
(
"The input graph of FuseBatchNormActGrad should not be nullptr."
));
FusePassBase
::
Init
(
"bn_act_grad"
,
graph
);
GraphPatternDetector
gpd
;
auto
*
d_act_out
=
gpd
.
mutable_pattern
()
->
NewNode
(
"bn_act_grad/x"
)
->
AsInput
()
->
assert_is_ops_input
(
act_grad_types
,
GradVarName
(
"Out"
));
patterns
::
BatchNormActGrad
bn_act_grad_pattern
(
gpd
.
mutable_pattern
(),
"bn_act_grad"
);
bn_act_grad_pattern
(
d_act_out
,
act_grad_types
);
int
found_bn_act_count
=
0
;
auto
handler
=
[
&
](
const
GraphPatternDetector
::
subgraph_t
&
subgraph
,
Graph
*
g
)
{
VLOG
(
4
)
<<
"handle FuseBatchNormActGrad fuse"
;
GET_IR_NODE_FROM_SUBGRAPH
(
act_grad
,
act_grad
,
bn_act_grad_pattern
);
GET_IR_NODE_FROM_SUBGRAPH
(
batch_norm_grad
,
batch_norm_grad
,
bn_act_grad_pattern
);
GET_IR_NODE_FROM_SUBGRAPH
(
act_out
,
act_out
,
bn_act_grad_pattern
);
GET_IR_NODE_FROM_SUBGRAPH
(
d_itermediate_out
,
d_itermediate_out
,
bn_act_grad_pattern
);
GET_IR_NODE_FROM_SUBGRAPH
(
bn_x
,
bn_x
,
bn_act_grad_pattern
);
GET_IR_NODE_FROM_SUBGRAPH
(
bn_scale
,
bn_scale
,
bn_act_grad_pattern
);
GET_IR_NODE_FROM_SUBGRAPH
(
bn_bias
,
bn_bias
,
bn_act_grad_pattern
);
GET_IR_NODE_FROM_SUBGRAPH
(
bn_saved_mean
,
bn_saved_mean
,
bn_act_grad_pattern
);
GET_IR_NODE_FROM_SUBGRAPH
(
bn_saved_variance
,
bn_saved_variance
,
bn_act_grad_pattern
);
GET_IR_NODE_FROM_SUBGRAPH
(
bn_reserve_space
,
bn_reserve_space
,
bn_act_grad_pattern
);
GET_IR_NODE_FROM_SUBGRAPH
(
d_bn_x
,
d_bn_x
,
bn_act_grad_pattern
);
GET_IR_NODE_FROM_SUBGRAPH
(
d_bn_scale
,
d_bn_scale
,
bn_act_grad_pattern
);
GET_IR_NODE_FROM_SUBGRAPH
(
d_bn_bias
,
d_bn_bias
,
bn_act_grad_pattern
);
std
::
string
d_act_out_n
=
subgraph
.
at
(
d_act_out
)
->
Name
();
// Y@GRAD
std
::
string
act_out_n
=
act_out
->
Name
();
// Y
std
::
string
d_itermediate_out_n
=
d_itermediate_out
->
Name
();
std
::
string
bn_x_n
=
bn_x
->
Name
();
std
::
string
bn_scale_n
=
bn_scale
->
Name
();
std
::
string
bn_bias_n
=
bn_bias
->
Name
();
std
::
string
bn_saved_mean_n
=
bn_saved_mean
->
Name
();
std
::
string
bn_saved_variance_n
=
bn_saved_variance
->
Name
();
std
::
string
bn_reserve_space_n
=
bn_reserve_space
->
Name
();
std
::
string
d_bn_x_n
=
d_bn_x
->
Name
();
std
::
string
d_bn_scale_n
=
d_bn_scale
->
Name
();
std
::
string
d_bn_bias_n
=
d_bn_bias
->
Name
();
OpDesc
desc
;
desc
.
SetType
(
"fused_batch_norm_act_grad"
);
desc
.
SetInput
(
"X"
,
{
bn_x_n
});
desc
.
SetInput
(
"Y"
,
std
::
vector
<
std
::
string
>
({
act_out_n
}));
desc
.
SetInput
(
GradVarName
(
"Y"
),
std
::
vector
<
std
::
string
>
({
d_act_out_n
}));
desc
.
SetInput
(
"Scale"
,
std
::
vector
<
std
::
string
>
({
bn_scale_n
}));
desc
.
SetInput
(
"Bias"
,
std
::
vector
<
std
::
string
>
({
bn_bias_n
}));
desc
.
SetInput
(
"SavedMean"
,
std
::
vector
<
std
::
string
>
({
bn_saved_mean_n
}));
desc
.
SetInput
(
"SavedVariance"
,
std
::
vector
<
std
::
string
>
({
bn_saved_variance_n
}));
desc
.
SetInput
(
"ReserveSpace"
,
std
::
vector
<
std
::
string
>
({
bn_reserve_space_n
}));
desc
.
SetOutput
(
GradVarName
(
"X"
),
std
::
vector
<
std
::
string
>
({
d_bn_x_n
}));
desc
.
SetOutput
(
GradVarName
(
"Scale"
),
std
::
vector
<
std
::
string
>
({
d_bn_scale_n
}));
desc
.
SetOutput
(
GradVarName
(
"Bias"
),
std
::
vector
<
std
::
string
>
({
d_bn_bias_n
}));
std
::
string
act
=
act_grad
->
Name
();
act
=
act
.
substr
(
0
,
act
.
length
()
-
5
);
// remove "_grad"
desc
.
SetAttr
(
"act_type"
,
act
);
for
(
auto
&
n
:
{
act_grad
->
Op
(),
batch_norm_grad
->
Op
()})
{
for
(
auto
&
m
:
n
->
GetAttrMap
())
{
desc
.
SetAttr
(
m
.
first
,
m
.
second
);
}
}
auto
fused_node
=
g
->
CreateOpNode
(
&
desc
);
VLOG
(
4
)
<<
"
\n\t
"
<<
d_act_out_n
<<
" and "
<<
act_out_n
<<
" -> "
<<
act_grad
->
Name
()
<<
" -> "
<<
d_itermediate_out_n
<<
"
\n\t
"
<<
bn_x_n
<<
", "
<<
d_itermediate_out_n
<<
", "
<<
bn_scale_n
<<
", "
<<
bn_bias_n
<<
", "
<<
bn_saved_mean_n
<<
", "
<<
bn_saved_variance_n
<<
" and "
<<
bn_reserve_space_n
<<
" -> "
<<
batch_norm_grad
->
Name
()
<<
" -> "
<<
d_bn_x_n
<<
", "
<<
d_bn_scale_n
<<
" and "
<<
d_bn_bias_n
;
ReLinkNodes
(
g
,
d_itermediate_out
,
act_grad
,
batch_norm_grad
,
fused_node
);
found_bn_act_count
++
;
};
gpd
(
graph
,
handler
);
AddStatis
(
found_bn_act_count
);
return
graph
;
}
void
FuseBatchNormActPass
::
ReLinkNodes
(
Graph
*
graph
,
const
Node
*
intermediate_out
,
Node
*
op_1
,
Node
*
op_2
,
Node
*
fused_op
)
const
{
// delete act
for
(
auto
&
in
:
op_1
->
inputs
)
{
fused_op
->
inputs
.
emplace_back
(
in
);
in
->
outputs
=
this
->
ReplaceNode
(
op_1
,
fused_op
,
in
->
outputs
);
}
std
::
unordered_set
<
const
Node
*>
nodes2delete
;
for
(
auto
&
out
:
op_1
->
outputs
)
{
// intermediate_out or ctr_var
auto
result_iter
=
std
::
find_if
(
op_2
->
inputs
.
begin
(),
op_2
->
inputs
.
end
(),
[
&
out
](
const
Node
*
node
)
->
bool
{
return
node
==
out
;
});
if
(
result_iter
==
op_2
->
inputs
.
end
())
{
IR_OP_VAR_LINK
(
fused_op
,
out
);
}
else
{
nodes2delete
.
emplace
(
out
);
}
}
for
(
auto
&
in
:
op_2
->
inputs
)
{
if
(
in
==
intermediate_out
||
nodes2delete
.
count
(
in
))
{
continue
;
}
fused_op
->
inputs
.
emplace_back
(
in
);
in
->
outputs
=
this
->
ReplaceNode
(
op_2
,
fused_op
,
in
->
outputs
);
}
for
(
auto
&
out
:
op_2
->
outputs
)
{
IR_OP_VAR_LINK
(
fused_op
,
out
);
}
nodes2delete
.
insert
(
std
::
move
(
op_1
));
nodes2delete
.
insert
(
std
::
move
(
op_2
));
GraphSafeRemoveNodes
(
graph
,
nodes2delete
);
}
std
::
vector
<
Node
*>
FuseBatchNormActPass
::
ReplaceNode
(
Node
*
cur_node
,
Node
*
new_node
,
const
std
::
vector
<
Node
*>
&
nodes
)
const
{
std
::
vector
<
Node
*>
new_list
(
nodes
.
size
());
bool
has_replaced
=
false
;
std
::
transform
(
nodes
.
begin
(),
nodes
.
end
(),
new_list
.
begin
(),
[
&
](
Node
*
node
)
->
Node
*
{
if
(
node
==
cur_node
)
{
has_replaced
=
true
;
return
new_node
;
}
return
node
;
});
PADDLE_ENFORCE_EQ
(
has_replaced
,
true
,
platform
::
errors
::
NotFound
(
"Not find %s in the node list."
,
cur_node
->
Name
()));
return
new_list
;
}
}
// namespace ir
}
// namespace framework
}
// namespace paddle
REGISTER_PASS
(
fuse_bn_act_pass
,
paddle
::
framework
::
ir
::
FuseBatchNormActPass
);
paddle/fluid/framework/ir/fuse_bn_act_pass.h
0 → 100644
浏览文件 @
c63a63d5
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace
paddle
{
namespace
framework
{
namespace
ir
{
/*
* Fuse the BatchNorm and activation.
*/
class
FuseBatchNormActPass
:
public
FusePassBase
{
public:
virtual
~
FuseBatchNormActPass
()
{}
protected:
void
ApplyImpl
(
ir
::
Graph
*
graph
)
const
override
;
ir
::
Graph
*
FuseBatchNormAct
(
ir
::
Graph
*
graph
,
const
std
::
unordered_set
<
std
::
string
>
&
act_types
)
const
;
ir
::
Graph
*
FuseBatchNormActGrad
(
ir
::
Graph
*
graph
,
const
std
::
unordered_set
<
std
::
string
>
&
act_grad_types
)
const
;
std
::
vector
<
Node
*>
ReplaceNode
(
Node
*
cur_node
,
Node
*
new_node
,
const
std
::
vector
<
Node
*>
&
nodes
)
const
;
void
ReLinkNodes
(
Graph
*
graph
,
const
Node
*
intermediate_out
,
Node
*
op_1
,
Node
*
op_2
,
Node
*
fused_op
)
const
;
Node
*
CreateFusedBatchNormActNode
(
Graph
*
g
,
const
Node
*
act
,
const
Node
*
bn
,
const
std
::
string
&
bn_x_n
,
const
std
::
string
&
bn_scale_n
,
const
std
::
string
&
bn_bias_n
,
const
std
::
string
&
bn_variance_n
,
const
std
::
string
&
bn_mean_n
,
const
std
::
string
&
bn_mean_out_n
,
const
std
::
string
&
bn_variance_out_n
,
const
std
::
string
&
bn_saved_variance_n
,
const
std
::
string
&
bn_saved_mean_n
,
const
std
::
string
&
bn_reserve_space_n
,
const
std
::
string
&
act_out_n
)
const
;
};
}
// namespace ir
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/ir/graph_pattern_detector.cc
浏览文件 @
c63a63d5
...
...
@@ -383,6 +383,13 @@ PDNode *PDNode::assert_is_var() {
return
this
;
}
PDNode
*
PDNode
::
assert_var_dtype
(
proto
::
VarType
::
Type
dtype
)
{
assert_is_var
();
asserts_
.
emplace_back
(
[
dtype
](
Node
*
x
)
{
return
x
->
Var
()
->
GetDataType
()
==
dtype
;
});
return
this
;
}
PDNode
*
PDNode
::
assert_is_not_ctrl_var
()
{
asserts_
.
emplace_back
([](
Node
*
x
)
{
return
x
&&
!
x
->
IsCtrlVar
();
});
return
this
;
...
...
@@ -476,6 +483,7 @@ PDNode *PDNode::assert_is_op_output(const std::string &op_type,
assert_is_op_nth_output
(
op_type
,
argument
,
0
);
return
this
;
}
PDNode
*
PDNode
::
assert_is_op_input
(
const
std
::
string
&
op_type
)
{
assert_is_var
();
asserts_
.
emplace_back
([
=
](
Node
*
x
)
{
...
...
@@ -489,6 +497,16 @@ PDNode *PDNode::assert_is_op_input(const std::string &op_type) {
return
this
;
}
PDNode
*
PDNode
::
assert_is_not_op_input
(
const
std
::
string
&
argument
)
{
assert_is_op
();
asserts_
.
emplace_back
([
=
](
Node
*
x
)
{
auto
&
ins
=
x
->
Op
()
->
Inputs
();
auto
iter
=
ins
.
find
(
argument
);
return
iter
==
ins
.
end
()
||
iter
->
second
.
empty
();
});
return
this
;
}
PDNode
*
PDNode
::
assert_is_op_input
(
const
std
::
string
&
op_type
,
const
std
::
string
&
argument
)
{
assert_is_var
();
...
...
@@ -1048,6 +1066,117 @@ PDNode *patterns::ActElewiseAdd::operator()(
return
elewise_add_out
;
}
PDNode
*
patterns
::
BatchNormAct
::
operator
()(
paddle
::
framework
::
ir
::
PDNode
*
bn_x_var
,
std
::
unordered_set
<
std
::
string
>
act_types
)
{
auto
*
bn_scale_var
=
pattern
->
NewNode
(
bn_scale_repr
())
->
assert_is_op_input
(
"batch_norm"
,
"Scale"
);
auto
*
bn_bias_var
=
pattern
->
NewNode
(
bn_bias_repr
())
->
assert_is_op_input
(
"batch_norm"
,
"Bias"
);
auto
*
bn_variance_var
=
pattern
->
NewNode
(
bn_variance_repr
())
->
assert_is_op_input
(
"batch_norm"
,
"Variance"
);
auto
*
bn_mean_var
=
pattern
->
NewNode
(
bn_mean_repr
())
->
assert_is_op_input
(
"batch_norm"
,
"Mean"
);
auto
*
bn
=
pattern
->
NewNode
(
batch_norm_repr
())
->
assert_is_op
(
"batch_norm"
)
->
assert_is_not_op_input
(
"MomentumTensor"
)
->
assert_op_attr
<
bool
>
(
"is_test"
,
false
)
->
assert_op_attr
<
bool
>
(
"use_global_stats"
,
false
)
->
assert_op_attr
<
std
::
string
>
(
"data_layout"
,
"NHWC"
);
auto
*
bn_mean_out_var
=
pattern
->
NewNode
(
bn_mean_out_repr
())
->
assert_is_op_output
(
"batch_norm"
,
"MeanOut"
);
auto
*
bn_variance_out_var
=
pattern
->
NewNode
(
bn_variance_out_repr
())
->
assert_is_op_output
(
"batch_norm"
,
"VarianceOut"
);
auto
*
bn_saved_variance_var
=
pattern
->
NewNode
(
bn_saved_variance_repr
())
->
assert_is_op_output
(
"batch_norm"
,
"SavedVariance"
);
auto
*
bn_saved_mean_var
=
pattern
->
NewNode
(
bn_saved_mean_repr
())
->
assert_is_op_output
(
"batch_norm"
,
"SavedMean"
);
auto
*
bn_reserve_space
=
pattern
->
NewNode
(
bn_reserve_space_repr
())
->
assert_is_op_output
(
"batch_norm"
,
"ReserveSpace"
);
auto
*
bn_out_var
=
pattern
->
NewNode
(
bn_out_repr
())
->
assert_is_op_output
(
"batch_norm"
,
"Y"
)
->
assert_has_n_outputs
(
1
);
bn_out_var
->
AsIntermediate
()
->
assert_is_ops_input
(
act_types
);
auto
*
act
=
pattern
->
NewNode
(
act_repr
())
->
assert_is_ops
(
act_types
);
auto
*
act_out_var
=
pattern
->
NewNode
(
act_out_repr
())
->
assert_is_ops_output
(
act_types
,
"Out"
);
bn
->
LinksFrom
(
{
bn_x_var
,
bn_scale_var
,
bn_bias_var
,
bn_variance_var
,
bn_mean_var
})
.
LinksTo
({
bn_mean_out_var
,
bn_variance_out_var
,
bn_saved_variance_var
,
bn_saved_mean_var
,
bn_reserve_space
,
bn_out_var
});
act
->
LinksFrom
({
bn_out_var
}).
LinksTo
({
act_out_var
});
return
act_out_var
;
}
PDNode
*
patterns
::
BatchNormActGrad
::
operator
()(
paddle
::
framework
::
ir
::
PDNode
*
d_act_out_var
,
std
::
unordered_set
<
std
::
string
>
act_grad_types
)
{
auto
*
act_grad
=
pattern
->
NewNode
(
act_grad_repr
())
->
assert_is_ops
(
act_grad_types
);
auto
*
bn_grad
=
pattern
->
NewNode
(
batch_norm_grad_repr
())
->
assert_is_op
(
"batch_norm_grad"
)
->
assert_op_attr
<
bool
>
(
"use_global_stats"
,
false
)
->
assert_op_attr
<
std
::
string
>
(
"data_layout"
,
"NHWC"
);
auto
*
act_out_var
=
pattern
->
NewNode
(
act_out_repr
())
->
assert_is_ops_input
(
act_grad_types
,
"Out"
);
auto
*
d_intermediate_var
=
pattern
->
NewNode
(
d_itermediate_out_repr
())
->
assert_is_ops_output
(
act_grad_types
,
GradVarName
(
"X"
))
->
assert_has_n_outputs
(
1
);
auto
*
bn_x_var
=
pattern
->
NewNode
(
bn_x_repr
())
->
assert_is_op_input
(
"batch_norm_grad"
,
"X"
)
->
assert_var_dtype
(
proto
::
VarType
::
FP16
);
auto
*
bn_scale_var
=
pattern
->
NewNode
(
bn_scale_repr
())
->
assert_is_op_input
(
"batch_norm_grad"
,
"Scale"
);
auto
*
bn_bias_var
=
pattern
->
NewNode
(
bn_bias_repr
())
->
assert_is_op_input
(
"batch_norm_grad"
,
"Bias"
);
auto
*
bn_saved_mean_var
=
pattern
->
NewNode
(
bn_saved_mean_repr
())
->
assert_is_op_input
(
"batch_norm_grad"
,
"SavedMean"
);
auto
*
bn_saved_variance_var
=
pattern
->
NewNode
(
bn_saved_variance_repr
())
->
assert_is_op_input
(
"batch_norm_grad"
,
"SavedVariance"
);
// ReserveSpace as the output is equal to:
// data_layout == 'NHWC' && FLAGS_cudnn_batchnorm_spatial_persistent == true
auto
*
bn_reserve_space
=
pattern
->
NewNode
(
bn_reserve_space_repr
())
->
assert_is_op_input
(
"batch_norm_grad"
,
"ReserveSpace"
);
auto
*
d_bn_x_var
=
pattern
->
NewNode
(
d_bn_x_repr
())
->
assert_is_not_ctrl_var
()
->
assert_is_op_output
(
"batch_norm_grad"
,
GradVarName
(
"X"
));
auto
*
d_bn_scale_var
=
pattern
->
NewNode
(
d_bn_scale_repr
())
->
assert_is_not_ctrl_var
()
->
assert_is_op_output
(
"batch_norm_grad"
,
GradVarName
(
"Scale"
));
auto
*
d_bn_bias_var
=
pattern
->
NewNode
(
d_bn_bias_repr
())
->
assert_is_not_ctrl_var
()
->
assert_is_op_output
(
"batch_norm_grad"
,
GradVarName
(
"Bias"
));
act_grad
->
LinksFrom
({
d_act_out_var
,
act_out_var
})
.
LinksTo
({
d_intermediate_var
});
bn_grad
->
LinksFrom
({
bn_x_var
,
d_intermediate_var
,
bn_scale_var
,
bn_bias_var
,
bn_saved_mean_var
,
bn_saved_variance_var
,
bn_reserve_space
})
.
LinksTo
({
d_bn_x_var
,
d_bn_scale_var
,
d_bn_bias_var
});
return
bn_grad
;
}
PDNode
*
patterns
::
ElewiseAddAct
::
operator
()(
paddle
::
framework
::
ir
::
PDNode
*
ele_x_var
,
std
::
unordered_set
<
std
::
string
>
act_types
)
{
...
...
paddle/fluid/framework/ir/graph_pattern_detector.h
浏览文件 @
c63a63d5
...
...
@@ -27,6 +27,7 @@
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/inference/analysis/dot.h"
...
...
@@ -100,6 +101,7 @@ struct PDNode {
PDNode
*
assert_is_op
();
PDNode
*
assert_is_op
(
const
std
::
string
&
op_type
);
PDNode
*
assert_is_var
();
PDNode
*
assert_var_dtype
(
proto
::
VarType
::
Type
dtype
);
PDNode
*
assert_is_not_ctrl_var
();
PDNode
*
assert_var_not_persistable
();
PDNode
*
assert_is_persistable_var
();
...
...
@@ -111,6 +113,7 @@ struct PDNode {
const
std
::
string
&
argument
);
PDNode
*
assert_is_op_nth_input
(
const
std
::
string
&
op_type
,
const
std
::
string
&
argument
,
int
nth
);
PDNode
*
assert_is_not_op_input
(
const
std
::
string
&
argument
);
PDNode
*
assert_is_op_nth_output
(
const
std
::
string
&
op_type
,
const
std
::
string
&
argument
,
int
nth
);
PDNode
*
assert_is_only_input_of_op
(
const
std
::
string
&
op_type
);
...
...
@@ -590,6 +593,64 @@ struct GRU : public PatternBase {
PATTERN_DECL_NODE
(
Hidden
);
};
// The following pattern is used to fuse batch_norm and act
// formula: act(bn(x))
// op: batch_norm + act
struct
BatchNormAct
:
public
PatternBase
{
BatchNormAct
(
PDPattern
*
pattern
,
const
std
::
string
&
name_scope
)
:
PatternBase
(
pattern
,
name_scope
,
"bn_act"
)
{}
PDNode
*
operator
()(
PDNode
*
x
,
std
::
unordered_set
<
std
::
string
>
acts
);
// declare operator node's name
PATTERN_DECL_NODE
(
batch_norm
);
PATTERN_DECL_NODE
(
act
);
// declare variable node's name
// BN inputs
PATTERN_DECL_NODE
(
bn_scale
);
PATTERN_DECL_NODE
(
bn_bias
);
PATTERN_DECL_NODE
(
bn_variance
);
PATTERN_DECL_NODE
(
bn_mean
);
// BN outputs
PATTERN_DECL_NODE
(
bn_mean_out
);
PATTERN_DECL_NODE
(
bn_variance_out
);
PATTERN_DECL_NODE
(
bn_saved_variance
);
PATTERN_DECL_NODE
(
bn_saved_mean
);
PATTERN_DECL_NODE
(
bn_reserve_space
);
PATTERN_DECL_NODE
(
bn_out
);
// ACT output
PATTERN_DECL_NODE
(
act_out
);
};
// the backward of act(bn(x))
// op: batch_norm_grad + act_grad
struct
BatchNormActGrad
:
public
PatternBase
{
BatchNormActGrad
(
PDPattern
*
pattern
,
const
std
::
string
&
name_scope
)
:
PatternBase
(
pattern
,
name_scope
,
"bn_act_grad"
)
{}
// act_grad: in["Out", "Out@GRAD"], out["X@GRAD"]
// bn_grad: in["X", "Y@GRAD", "Scale", "Bias", "SavedMean", "SavedVariance",
// "ReserveSpace"],
// out["X@GRAD", "Scale@GRAD", "Bias@GRAD"]
PDNode
*
operator
()(
PDNode
*
x
,
std
::
unordered_set
<
std
::
string
>
act_grad_types
);
// declare operator node's name
PATTERN_DECL_NODE
(
act_grad
);
PATTERN_DECL_NODE
(
batch_norm_grad
);
// declare variable node's name
PATTERN_DECL_NODE
(
act_out
);
PATTERN_DECL_NODE
(
d_itermediate_out
);
PATTERN_DECL_NODE
(
bn_x
);
PATTERN_DECL_NODE
(
bn_scale
);
PATTERN_DECL_NODE
(
bn_bias
);
PATTERN_DECL_NODE
(
bn_saved_mean
);
PATTERN_DECL_NODE
(
bn_saved_variance
);
PATTERN_DECL_NODE
(
bn_reserve_space
);
PATTERN_DECL_NODE
(
d_bn_x
);
PATTERN_DECL_NODE
(
d_bn_scale
);
PATTERN_DECL_NODE
(
d_bn_bias
);
};
// The following patterns are used to fuse elewise_add and act
// formula: act(ele_add(x, y))
// op: elementwise_add + act
...
...
paddle/fluid/framework/unused_var_check.cc
浏览文件 @
c63a63d5
...
...
@@ -30,6 +30,8 @@ const std::unordered_set<std::string> op_has_unsed_vars_white_list = {
"auc"
,
"batch_norm"
,
"batch_norm_grad"
,
"fused_batch_norm_act"
,
"fused_batch_norm_act_grad"
,
"sync_batch_norm_grad"
,
"center_loss_grad"
,
"crop"
,
...
...
paddle/fluid/operators/fused/CMakeLists.txt
浏览文件 @
c63a63d5
include
(
operators
)
register_operators
(
EXCLUDES
fused_bn_activation_op
conv_fusion_op
fusion_transpose_flatten_concat_op
fusion_conv_inception_op
...
...
@@ -8,6 +9,11 @@ register_operators(EXCLUDES
fusion_group_op
)
if
(
WITH_GPU
)
# fused_bn_activation_op needs cudnn 7.4.1 above
if
(
NOT
${
CUDNN_VERSION
}
VERSION_LESS 7401
)
op_library
(
fused_bn_activation_op
)
file
(
APPEND
${
pybind_file
}
"USE_CUDA_ONLY_OP(fused_batch_norm_act);
\n
"
)
endif
()
# conv_fusion_op needs cudnn 7 above
if
(
NOT
${
CUDNN_VERSION
}
VERSION_LESS 7100
)
op_library
(
conv_fusion_op
)
...
...
paddle/fluid/operators/fused/fused_bn_activation_op.cc
0 → 100644
浏览文件 @
c63a63d5
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fused/fused_bn_activation_op.h"
#include <memory>
#include <string>
#include <unordered_map>
#include "paddle/fluid/framework/op_registry.h"
namespace
paddle
{
namespace
operators
{
using
LoDTensor
=
framework
::
LoDTensor
;
void
FusedBatchNormActOp
::
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
{
PADDLE_ENFORCE_EQ
(
ctx
->
HasInput
(
"X"
),
true
,
platform
::
errors
::
InvalidArgument
(
"Input(X) of BatchNormOp should not be null."
));
PADDLE_ENFORCE_EQ
(
ctx
->
HasInput
(
"Scale"
),
true
,
platform
::
errors
::
InvalidArgument
(
"Input(Scale) of BatchNormOp should not be null."
));
PADDLE_ENFORCE_EQ
(
ctx
->
HasInput
(
"Bias"
),
true
,
platform
::
errors
::
InvalidArgument
(
"Input(Bias) of BatchNormOp should not be null."
));
PADDLE_ENFORCE_EQ
(
ctx
->
HasInput
(
"Mean"
),
true
,
platform
::
errors
::
InvalidArgument
(
"Input(Mean) of BatchNormOp should not be null."
));
PADDLE_ENFORCE_EQ
(
ctx
->
HasInput
(
"Variance"
),
true
,
platform
::
errors
::
InvalidArgument
(
"Input(Variance) of BatchNormOp should not be null."
));
PADDLE_ENFORCE_EQ
(
ctx
->
HasOutput
(
"Y"
),
true
,
platform
::
errors
::
InvalidArgument
(
"Output(Y) of BatchNormOp should not be null."
));
PADDLE_ENFORCE_EQ
(
ctx
->
HasOutput
(
"MeanOut"
),
true
,
platform
::
errors
::
InvalidArgument
(
"Output(MeanOut) of BatchNormOp should not be null."
));
PADDLE_ENFORCE_EQ
(
ctx
->
HasOutput
(
"VarianceOut"
),
true
,
platform
::
errors
::
InvalidArgument
(
"Output(VarianceOut) of BatchNormOp should not be null."
));
PADDLE_ENFORCE_EQ
(
ctx
->
HasOutput
(
"SavedMean"
),
true
,
platform
::
errors
::
InvalidArgument
(
"Output(SavedMean) of BatchNormOp should not be null."
));
PADDLE_ENFORCE_EQ
(
ctx
->
HasOutput
(
"SavedVariance"
),
true
,
platform
::
errors
::
InvalidArgument
(
"Output(SavedVariance) of BatchNormOp should not be null."
));
// make sure Mean/MeanOut and Variance/VarianceOut share memory in Python
PADDLE_ENFORCE_EQ
(
ctx
->
Inputs
(
"Mean"
)[
0
],
ctx
->
Outputs
(
"MeanOut"
)[
0
],
platform
::
errors
::
PreconditionNotMet
(
"Mean and MeanOut should share the same memory"
));
PADDLE_ENFORCE_EQ
(
ctx
->
Inputs
(
"Variance"
)[
0
],
ctx
->
Outputs
(
"VarianceOut"
)[
0
],
platform
::
errors
::
PreconditionNotMet
(
"Variance and VarianceOut should share the same memory"
));
const
auto
x_dims
=
ctx
->
GetInputDim
(
"X"
);
PADDLE_ENFORCE_GE
(
x_dims
.
size
(),
2
,
platform
::
errors
::
PreconditionNotMet
(
"ShapeError: the dimension of input "
"X must greater than or equal to 2."
"But received: the shape of input X "
"= [%s], the dimension of input X ="
"[%d]"
,
x_dims
,
x_dims
.
size
()));
PADDLE_ENFORCE_LE
(
x_dims
.
size
(),
5
,
platform
::
errors
::
PreconditionNotMet
(
"ShapeError: the dimension of input "
"X must smaller than or equal to 5."
"But received: the shape of input X "
"= [%s], the dimension of input X ="
"[%d]"
,
x_dims
,
x_dims
.
size
()));
const
int64_t
C
=
x_dims
[
x_dims
.
size
()
-
1
];
auto
scale_dim
=
ctx
->
GetInputDim
(
"Scale"
);
auto
bias_dim
=
ctx
->
GetInputDim
(
"Bias"
);
PADDLE_ENFORCE_EQ
(
scale_dim
.
size
(),
1UL
,
platform
::
errors
::
PreconditionNotMet
(
"ShapeError: the dimension of scale must equal to 1."
"But received: the shape of scale is [%s], the dimension "
"of scale is [%d]"
,
scale_dim
,
scale_dim
.
size
()));
PADDLE_ENFORCE_EQ
(
bias_dim
.
size
(),
1UL
,
platform
::
errors
::
PreconditionNotMet
(
"ShapeError: the dimension of bias must equal to 1."
"But received: the shape of bias is [%s],the dimension "
"of bias is [%d]"
,
bias_dim
,
bias_dim
.
size
()));
bool
check
=
true
;
if
((
!
ctx
->
IsRuntime
())
&&
(
framework
::
product
(
scale_dim
)
<=
0
||
framework
::
product
(
bias_dim
)
<=
0
))
{
check
=
false
;
}
if
(
check
)
{
PADDLE_ENFORCE_EQ
(
scale_dim
[
0
],
C
,
platform
::
errors
::
PreconditionNotMet
(
"ShapeError: the shape of scale must equal to [%d]"
"But received: the shape of scale is [%d]"
,
C
,
scale_dim
[
0
]));
PADDLE_ENFORCE_EQ
(
bias_dim
[
0
],
C
,
platform
::
errors
::
PreconditionNotMet
(
"ShapeError: the shape of bias must equal to [%d]"
"But received: the shape of bias is [%d]"
,
C
,
bias_dim
[
0
]));
}
ctx
->
SetOutputDim
(
"Y"
,
x_dims
);
ctx
->
SetOutputDim
(
"MeanOut"
,
{
C
});
ctx
->
SetOutputDim
(
"VarianceOut"
,
{
C
});
ctx
->
SetOutputDim
(
"SavedMean"
,
{
C
});
ctx
->
SetOutputDim
(
"SavedVariance"
,
{
C
});
ctx
->
ShareLoD
(
"X"
,
"Y"
);
}
framework
::
OpKernelType
FusedBatchNormActOp
::
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
{
auto
input_data_type
=
OperatorWithKernel
::
IndicateVarDataType
(
ctx
,
"X"
);
// By default, the type of the scale, bias, mean,
// and var tensors should both be float. (For float or float16 input tensor)
// or double (For double input tensor).
auto
bn_param_type
=
framework
::
proto
::
VarType
::
FP32
;
if
(
input_data_type
==
framework
::
proto
::
VarType
::
FP64
)
{
bn_param_type
=
framework
::
proto
::
VarType
::
FP64
;
}
PADDLE_ENFORCE_EQ
(
bn_param_type
,
ctx
.
Input
<
Tensor
>
(
"Scale"
)
->
type
(),
platform
::
errors
::
PreconditionNotMet
(
"Scale input should be of float type"
));
PADDLE_ENFORCE_EQ
(
bn_param_type
,
ctx
.
Input
<
Tensor
>
(
"Bias"
)
->
type
(),
platform
::
errors
::
PreconditionNotMet
(
"Bias input should be of float type"
));
PADDLE_ENFORCE_EQ
(
bn_param_type
,
ctx
.
Input
<
Tensor
>
(
"Mean"
)
->
type
(),
platform
::
errors
::
PreconditionNotMet
(
"Mean input should be of float type"
));
PADDLE_ENFORCE_EQ
(
bn_param_type
,
ctx
.
Input
<
Tensor
>
(
"Variance"
)
->
type
(),
platform
::
errors
::
PreconditionNotMet
(
"Variance input should be of float type"
));
framework
::
LibraryType
library
=
framework
::
LibraryType
::
kPlain
;
framework
::
DataLayout
layout
=
framework
::
DataLayout
::
kAnyLayout
;
return
framework
::
OpKernelType
(
input_data_type
,
ctx
.
GetPlace
(),
layout
,
library
);
}
framework
::
OpKernelType
FusedBatchNormActOp
::
GetKernelTypeForVar
(
const
std
::
string
&
var_name
,
const
Tensor
&
tensor
,
const
framework
::
OpKernelType
&
expected_kernel_type
)
const
{
return
framework
::
OpKernelType
(
expected_kernel_type
.
data_type_
,
tensor
.
place
(),
tensor
.
layout
());
}
void
FusedBatchNormActOpMaker
::
Make
()
{
AddAttr
<
float
>
(
"momentum"
,
""
).
SetDefault
(
0.9
);
AddAttr
<
float
>
(
"epsilon"
,
""
)
.
SetDefault
(
1e-5
)
.
AddCustomChecker
([](
const
float
&
epsilon
)
{
PADDLE_ENFORCE_EQ
(
epsilon
>=
0.0
f
&&
epsilon
<=
0.001
f
,
true
,
platform
::
errors
::
InvalidArgument
(
"'epsilon' should be between 0.0 and 0.001."
));
});
AddAttr
<
std
::
string
>
(
"act_type"
,
"The activation type to be fused."
)
.
SetDefault
(
"relu"
);
AddInput
(
"X"
,
"The input tensor"
);
AddInput
(
"Scale"
,
"Scale is a 1-dimensional tensor of size C "
"that is applied to the output"
);
AddInput
(
"Bias"
,
"Bias is a 1-dimensional tensor of size C "
"that is applied to the output"
);
AddInput
(
"Mean"
,
"The global mean (for training) or "
"estimated mean (for testing)"
);
AddInput
(
"Variance"
,
"The global variance (for training) "
"or estimated Variance (for testing)"
);
AddOutput
(
"Y"
,
"result after normalization"
);
AddOutput
(
"MeanOut"
,
"Share memory with Mean. "
"Store the global mean when training"
);
AddOutput
(
"VarianceOut"
,
"Share memory with Variance. "
"Store the global Variance when training"
);
AddOutput
(
"SavedMean"
,
"Mean of the current mini batch, "
"will apply to output when training"
)
.
AsIntermediate
();
AddOutput
(
"SavedVariance"
,
"Variance of the current mini batch, "
"will apply to output when training"
)
.
AsIntermediate
();
AddOutput
(
"ReserveSpace"
,
"Reserve GPU space for triggering the new semi-persistent "
"NHWC kernel"
);
AddComment
(
R"DOC(
Fused Batch Normalization with activation.
Batch Norm has been implemented as discussed in the paper:
https://arxiv.org/pdf/1502.03167.pdf
Batch Norm can be used as a normalizer function for conv2d and fully_connected operations.
Now, the required data format for FusedBatchNormActOp is NHWC `[batch, in_height, in_width, in_channels]`.
)DOC"
);
}
void
FusedBatchNormActGradOp
::
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
{
// check input
PADDLE_ENFORCE_EQ
(
ctx
->
HasInput
(
"X"
),
true
,
platform
::
errors
::
InvalidArgument
(
"Input(X) should not be null."
));
PADDLE_ENFORCE_EQ
(
ctx
->
HasInput
(
"Scale"
),
true
,
platform
::
errors
::
InvalidArgument
(
"Input(Scale) should not be null."
));
PADDLE_ENFORCE_EQ
(
ctx
->
HasInput
(
framework
::
GradVarName
(
"Y"
)),
true
,
platform
::
errors
::
InvalidArgument
(
"Input(Y@GRAD) should not be null."
));
PADDLE_ENFORCE_EQ
(
ctx
->
HasInput
(
"SavedMean"
),
true
,
platform
::
errors
::
InvalidArgument
(
"Input(SavedMean) should not be null."
));
PADDLE_ENFORCE_EQ
(
ctx
->
HasInput
(
"SavedVariance"
),
true
,
platform
::
errors
::
InvalidArgument
(
"Input(SavedVariance) should not be null"
));
// check output
PADDLE_ENFORCE_EQ
(
ctx
->
HasOutput
(
framework
::
GradVarName
(
"X"
)),
true
,
platform
::
errors
::
InvalidArgument
(
"Output(X@GRAD) should not be null."
));
PADDLE_ENFORCE_EQ
(
ctx
->
HasOutput
(
framework
::
GradVarName
(
"Scale"
)),
true
,
platform
::
errors
::
InvalidArgument
(
"Output(Scale@GRAD) should not be null."
));
PADDLE_ENFORCE_EQ
(
ctx
->
HasOutput
(
framework
::
GradVarName
(
"Bias"
)),
true
,
platform
::
errors
::
InvalidArgument
(
"Output(Bias@GRAD) should not be null."
));
const
auto
x_dims
=
ctx
->
GetInputDim
(
"X"
);
const
int
C
=
x_dims
[
x_dims
.
size
()
-
1
];
ctx
->
SetOutputDim
(
framework
::
GradVarName
(
"X"
),
x_dims
);
// has_scale_grad == has_bias_grad, judge has_scale_grad is enough
ctx
->
SetOutputDim
(
framework
::
GradVarName
(
"Scale"
),
{
C
});
ctx
->
SetOutputDim
(
framework
::
GradVarName
(
"Bias"
),
{
C
});
}
framework
::
OpKernelType
FusedBatchNormActGradOp
::
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
{
const
auto
*
var
=
ctx
.
InputVar
(
framework
::
GradVarName
(
"Y"
));
if
(
var
==
nullptr
)
{
PADDLE_THROW
(
platform
::
errors
::
NotFound
(
"Can not find Y@GRAD in the execution context."
));
}
const
Tensor
*
t
=
nullptr
;
if
(
var
->
IsType
<
Tensor
>
())
{
t
=
&
var
->
Get
<
Tensor
>
();
}
else
if
(
var
->
IsType
<
LoDTensor
>
())
{
t
=
&
var
->
Get
<
LoDTensor
>
();
}
if
(
t
==
nullptr
)
{
PADDLE_THROW
(
platform
::
errors
::
NotFound
(
"Can not get the tensor value of Y@GRAD."
));
}
framework
::
LibraryType
library
=
framework
::
LibraryType
::
kPlain
;
framework
::
DataLayout
layout
=
framework
::
DataLayout
::
kAnyLayout
;
return
framework
::
OpKernelType
(
OperatorWithKernel
::
IndicateVarDataType
(
ctx
,
"X"
),
ctx
.
GetPlace
(),
layout
,
library
);
}
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OPERATOR
(
fused_batch_norm_act
,
ops
::
FusedBatchNormActOp
,
ops
::
FusedBatchNormActOpMaker
,
ops
::
FusedBatchNormActOpInferVarType
,
ops
::
FusedBatchNormActGradOpMaker
<
paddle
::
framework
::
OpDesc
>
,
ops
::
FusedBatchNormActGradOpMaker
<
paddle
::
imperative
::
OpBase
>
);
REGISTER_OPERATOR
(
fused_batch_norm_act_grad
,
ops
::
FusedBatchNormActGradOp
);
paddle/fluid/operators/fused/fused_bn_activation_op.cu
0 → 100644
浏览文件 @
c63a63d5
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <cfloat>
#include <string>
#include <vector>
#include "cub/cub.cuh"
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/fused/fused_bn_activation_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/norm_utils.h"
#include "paddle/fluid/platform/cudnn_helper.h"
#include "paddle/fluid/platform/float16.h"
DECLARE_bool
(
cudnn_batchnorm_spatial_persistent
);
namespace
paddle
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
template
<
typename
T
>
using
CudnnDataType
=
platform
::
CudnnDataType
<
T
>
;
template
<
typename
T
>
using
BatchNormParamType
=
typename
CudnnDataType
<
T
>::
BatchNormParamType
;
template
<
typename
T
>
class
FusedBatchNormActKernel
<
platform
::
CUDADeviceContext
,
T
>
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
PADDLE_ENFORCE_EQ
(
platform
::
is_gpu_place
(
ctx
.
GetPlace
()),
true
,
platform
::
errors
::
PreconditionNotMet
(
"It must use CUDAPlace."
));
double
epsilon
=
static_cast
<
double
>
(
ctx
.
Attr
<
float
>
(
"epsilon"
));
float
momentum
=
ctx
.
Attr
<
float
>
(
"momentum"
);
std
::
string
act_type
=
ctx
.
Attr
<
std
::
string
>
(
"act_type"
);
if
(
epsilon
<=
CUDNN_BN_MIN_EPSILON
-
FLT_EPSILON
)
{
LOG
(
ERROR
)
<<
"Provided epsilon is smaller than "
<<
"CUDNN_BN_MIN_EPSILON. Setting it to "
<<
"CUDNN_BN_MIN_EPSILON instead."
;
}
epsilon
=
std
::
max
(
epsilon
,
CUDNN_BN_MIN_EPSILON
);
// Get the size for each dimension.
// NHWC [batch_size, in_height, in_width, in_channels]
const
auto
*
x
=
ctx
.
Input
<
Tensor
>
(
"X"
);
const
auto
&
x_dims
=
x
->
dims
();
PADDLE_ENFORCE_EQ
(
x_dims
.
size
()
>=
2
&&
x_dims
.
size
()
<=
5
,
true
,
platform
::
errors
::
PreconditionNotMet
(
"The Input dim size should be between 2 and 5"
));
const
auto
*
scale
=
ctx
.
Input
<
Tensor
>
(
"Scale"
);
const
auto
*
bias
=
ctx
.
Input
<
Tensor
>
(
"Bias"
);
// Run training mode.
// obtain running mean and running inv var, and see if we need to
// initialize them.
auto
*
mean_out
=
ctx
.
Output
<
Tensor
>
(
"MeanOut"
);
auto
*
variance_out
=
ctx
.
Output
<
Tensor
>
(
"VarianceOut"
);
mean_out
->
mutable_data
<
BatchNormParamType
<
T
>>
(
ctx
.
GetPlace
());
variance_out
->
mutable_data
<
BatchNormParamType
<
T
>>
(
ctx
.
GetPlace
());
auto
*
saved_mean
=
ctx
.
Output
<
Tensor
>
(
"SavedMean"
);
auto
*
saved_variance
=
ctx
.
Output
<
Tensor
>
(
"SavedVariance"
);
saved_mean
->
mutable_data
<
BatchNormParamType
<
T
>>
(
ctx
.
GetPlace
());
saved_variance
->
mutable_data
<
BatchNormParamType
<
T
>>
(
ctx
.
GetPlace
());
auto
*
y
=
ctx
.
Output
<
Tensor
>
(
"Y"
);
y
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
int
N
,
C
,
H
,
W
,
D
;
const
DataLayout
data_layout
=
DataLayout
::
kNHWC
;
ExtractNCWHD
(
x_dims
,
data_layout
,
&
N
,
&
C
,
&
H
,
&
W
,
&
D
);
auto
&
dev_ctx
=
ctx
.
template
device_context
<
platform
::
CUDADeviceContext
>();
if
((
N
*
H
*
W
*
D
)
==
1
)
{
// Only 1 element in normalization dimension,
// skip the batch norm calculation, let y = act(x).
auto
x_v
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
x
);
auto
y_v
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
y
);
auto
&
dev
=
*
dev_ctx
.
eigen_device
();
if
(
act_type
==
"relu"
)
{
ReluFunctor
<
T
>
()(
dev
,
x_v
,
y_v
);
}
else
{
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"Unsupported activation type"
));
}
return
;
}
// ------------------- cudnn descriptors ---------------------
auto
handle
=
dev_ctx
.
cudnn_handle
();
cudnnTensorDescriptor_t
data_desc_
;
cudnnTensorDescriptor_t
bn_param_desc_
;
cudnnBatchNormMode_t
mode_
=
CUDNN_BATCHNORM_SPATIAL_PERSISTENT
;
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnCreateTensorDescriptor
(
&
data_desc_
),
platform
::
errors
::
External
(
"The error has happened when calling "
"cudnnCreateTensorDescriptor(&data_desc_)."
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnCreateTensorDescriptor
(
&
bn_param_desc_
),
platform
::
errors
::
External
(
"The error has happened when calling "
"cudnnCreateTensorDescriptor(&bn_param_desc_)."
));
VLOG
(
3
)
<<
"Setting descriptors."
;
std
::
vector
<
int
>
dims
=
{
N
,
C
,
H
,
W
,
D
};
std
::
vector
<
int
>
strides
=
{
H
*
W
*
D
*
C
,
1
,
W
*
D
*
C
,
D
*
C
,
C
};
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnSetTensorNdDescriptor
(
data_desc_
,
CudnnDataType
<
T
>::
type
,
x_dims
.
size
()
>
3
?
x_dims
.
size
()
:
4
,
dims
.
data
(),
strides
.
data
()),
platform
::
errors
::
External
(
"The error has happened when calling cudnnSetTensorNdDescriptor."
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnDeriveBNTensorDescriptor
(
bn_param_desc_
,
data_desc_
,
mode_
),
platform
::
errors
::
External
(
"The error has happened when calling "
"cudnnDeriveBNTensorDescriptor."
));
double
this_factor
=
1.
-
momentum
;
cudnnBatchNormOps_t
bnOps_
=
CUDNN_BATCHNORM_OPS_BN_ACTIVATION
;
platform
::
ScopedActivationDescriptor
scope_act_desc
;
cudnnActivationDescriptor_t
activation_desc_
=
scope_act_desc
.
descriptor
<
T
>
(
act_type
);
size_t
workspace_size
=
0
;
size_t
reserve_space_size
=
0
;
void
*
reserve_space_ptr
=
nullptr
;
void
*
workspace_ptr
=
nullptr
;
Tensor
workspace_tensor
;
// Create reserve space and workspace for batch norm.
// Create tensor for each batchnorm op, it will be used in the
// backward. Thus this tensor shouldn't be temp.
auto
*
reserve_space
=
ctx
.
Output
<
Tensor
>
(
"ReserveSpace"
);
PADDLE_ENFORCE_NOT_NULL
(
reserve_space
,
platform
::
errors
::
NotFound
(
"The argument ReserveSpace of batch_norm op is not found."
));
// --------------- cudnn batchnorm workspace ---------------
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize
(
/*handle=*/
handle
,
/*mode=*/
mode_
,
/*bnOps=*/
bnOps_
,
/*xDesc=*/
data_desc_
,
/*zDesc=*/
nullptr
,
/*yDesc=*/
data_desc_
,
/*bnScaleBiasMeanVarDesc=*/
bn_param_desc_
,
/*activationDesc=*/
activation_desc_
,
/*sizeInBytes=*/
&
workspace_size
),
platform
::
errors
::
External
(
"The error has happened when calling "
"cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize."
));
// -------------- cudnn batchnorm reserve space --------------
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnGetBatchNormalizationTrainingExReserveSpaceSize
(
/*handle=*/
handle
,
/*mode=*/
mode_
,
/*bnOps=*/
bnOps_
,
/*activationDesc=*/
activation_desc_
,
/*xDesc=*/
data_desc_
,
/*sizeInBytes=*/
&
reserve_space_size
),
platform
::
errors
::
External
(
"The error has happened when calling "
"cudnnGetBatchNormalizationTrainingExReserveSpaceSize."
));
reserve_space_ptr
=
reserve_space
->
mutable_data
(
ctx
.
GetPlace
(),
x
->
type
(),
reserve_space_size
);
workspace_ptr
=
workspace_tensor
.
mutable_data
(
ctx
.
GetPlace
(),
x
->
type
(),
workspace_size
);
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnBatchNormalizationForwardTrainingEx
(
handle
,
mode_
,
bnOps_
,
CudnnDataType
<
T
>::
kOne
(),
CudnnDataType
<
T
>::
kZero
(),
data_desc_
,
x
->
template
data
<
T
>(),
nullptr
,
nullptr
,
data_desc_
,
y
->
template
data
<
T
>(),
bn_param_desc_
,
scale
->
template
data
<
BatchNormParamType
<
T
>
>
(),
bias
->
template
data
<
BatchNormParamType
<
T
>
>
(),
this_factor
,
mean_out
->
template
mutable_data
<
BatchNormParamType
<
T
>
>
(
ctx
.
GetPlace
()),
variance_out
->
template
mutable_data
<
BatchNormParamType
<
T
>
>
(
ctx
.
GetPlace
()),
epsilon
,
saved_mean
->
template
mutable_data
<
BatchNormParamType
<
T
>
>
(
ctx
.
GetPlace
()),
saved_variance
->
template
mutable_data
<
BatchNormParamType
<
T
>
>
(
ctx
.
GetPlace
()),
activation_desc_
,
workspace_ptr
,
workspace_size
,
reserve_space_ptr
,
reserve_space_size
),
platform
::
errors
::
External
(
"The error has happened when calling "
"cudnnBatchNormalizationForwardTrainingEx."
));
// clean when exit.
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnDestroyTensorDescriptor
(
data_desc_
),
platform
::
errors
::
External
(
"The error has happened when calling "
"cudnnDestroyTensorDescriptor(data_desc_)."
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnDestroyTensorDescriptor
(
bn_param_desc_
),
platform
::
errors
::
External
(
"The error has happened when calling "
"cudnnDestroyTensorDescriptor(bn_param_desc_)."
));
}
};
template
<
typename
T
>
class
FusedBatchNormActGradKernel
<
platform
::
CUDADeviceContext
,
T
>
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
PADDLE_ENFORCE_EQ
(
platform
::
is_gpu_place
(
ctx
.
GetPlace
()),
true
,
platform
::
errors
::
PreconditionNotMet
(
"It must use CUDAPlace."
));
double
epsilon
=
static_cast
<
double
>
(
ctx
.
Attr
<
float
>
(
"epsilon"
));
std
::
string
act_type
=
ctx
.
Attr
<
std
::
string
>
(
"act_type"
);
const
auto
*
x
=
ctx
.
Input
<
Tensor
>
(
"X"
);
const
auto
*
y
=
ctx
.
Input
<
Tensor
>
(
"Y"
);
const
auto
*
d_y
=
ctx
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Y"
));
const
auto
*
scale
=
ctx
.
Input
<
Tensor
>
(
"Scale"
);
const
auto
*
bias
=
ctx
.
Input
<
Tensor
>
(
"Bias"
);
const
auto
*
reserve_space
=
ctx
.
Input
<
Tensor
>
(
"ReserveSpace"
);
const
auto
&
x_dims
=
x
->
dims
();
PADDLE_ENFORCE_EQ
(
x_dims
.
size
()
>=
2
&&
x_dims
.
size
()
<=
5
,
true
,
platform
::
errors
::
PreconditionNotMet
(
"The Input dim size should be between 2 and 5"
));
int
N
,
C
,
H
,
W
,
D
;
const
DataLayout
data_layout
=
DataLayout
::
kNHWC
;
ExtractNCWHD
(
x_dims
,
data_layout
,
&
N
,
&
C
,
&
H
,
&
W
,
&
D
);
// init output
auto
*
d_x
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"X"
));
auto
*
d_scale
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Scale"
));
auto
*
d_bias
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Bias"
));
d_x
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
PADDLE_ENFORCE_EQ
(
d_scale
&&
d_bias
,
true
,
platform
::
errors
::
PreconditionNotMet
(
"Both the scale grad and the bias grad must not be null."
));
d_scale
->
mutable_data
<
BatchNormParamType
<
T
>>
(
ctx
.
GetPlace
());
d_bias
->
mutable_data
<
BatchNormParamType
<
T
>>
(
ctx
.
GetPlace
());
PADDLE_ENFORCE_EQ
(
scale
->
dims
().
size
(),
1UL
,
platform
::
errors
::
PreconditionNotMet
(
"The scale only has one dimension."
));
PADDLE_ENFORCE_EQ
(
scale
->
dims
()[
0
],
C
,
platform
::
errors
::
PreconditionNotMet
(
"The size of scale is equal to the channel of Input(X)."
));
auto
&
dev_ctx
=
ctx
.
template
device_context
<
platform
::
CUDADeviceContext
>();
if
((
N
*
H
*
W
*
D
)
==
1
)
{
if
(
act_type
==
"relu"
)
{
auto
x_v
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
x
);
auto
y_v
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
y
);
auto
dx_v
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
d_x
);
auto
dy_v
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
d_y
);
auto
&
dev
=
*
dev_ctx
.
eigen_device
();
ReluGradFunctor
<
T
>
()(
dev
,
x_v
,
y_v
,
dy_v
,
dx_v
);
}
else
{
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"Unsupported activation type"
));
}
math
::
SetConstant
<
platform
::
CUDADeviceContext
,
BatchNormParamType
<
T
>>
functor
;
functor
(
dev_ctx
,
d_scale
,
static_cast
<
BatchNormParamType
<
T
>>
(
0
));
functor
(
dev_ctx
,
d_bias
,
static_cast
<
BatchNormParamType
<
T
>>
(
0
));
return
;
}
std
::
vector
<
int
>
dims
=
{
N
,
C
,
H
,
W
,
D
};
std
::
vector
<
int
>
strides
=
{
H
*
W
*
C
*
D
,
1
,
W
*
D
*
C
,
D
*
C
,
C
};
// ------------------- cudnn descriptors ---------------------
cudnnTensorDescriptor_t
data_desc_
;
cudnnTensorDescriptor_t
bn_param_desc_
;
cudnnBatchNormMode_t
mode_
=
CUDNN_BATCHNORM_SPATIAL_PERSISTENT
;
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnCreateTensorDescriptor
(
&
data_desc_
),
platform
::
errors
::
External
(
"The error has happened when calling "
"cudnnCreateTensorDescriptor(&data_desc_)."
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnCreateTensorDescriptor
(
&
bn_param_desc_
),
platform
::
errors
::
External
(
"The error has happened when calling "
"cudnnCreateTensorDescriptor(&bn_param_desc_)."
));
if
(
epsilon
<=
CUDNN_BN_MIN_EPSILON
-
FLT_EPSILON
)
{
LOG
(
ERROR
)
<<
"Provided epsilon is smaller than "
<<
"CUDNN_BN_MIN_EPSILON. Setting it to "
<<
"CUDNN_BN_MIN_EPSILON instead."
;
}
epsilon
=
std
::
max
(
epsilon
,
CUDNN_BN_MIN_EPSILON
);
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnSetTensorNdDescriptor
(
data_desc_
,
CudnnDataType
<
T
>::
type
,
x_dims
.
size
()
>
3
?
x_dims
.
size
()
:
4
,
dims
.
data
(),
strides
.
data
()),
platform
::
errors
::
External
(
"The error has happened when calling cudnnSetTensorNdDescriptor."
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnDeriveBNTensorDescriptor
(
bn_param_desc_
,
data_desc_
,
mode_
),
platform
::
errors
::
External
(
"The error has happened when calling "
"cudnnDeriveBNTensorDescriptor."
));
const
auto
*
saved_mean
=
ctx
.
Input
<
Tensor
>
(
"SavedMean"
);
const
auto
*
saved_var
=
ctx
.
Input
<
Tensor
>
(
"SavedVariance"
);
const
auto
*
saved_mean_data
=
saved_mean
->
template
data
<
BatchNormParamType
<
T
>
>
();
const
auto
*
saved_var_data
=
saved_var
->
template
data
<
BatchNormParamType
<
T
>
>
();
size_t
workspace_size
=
0
;
void
*
workspace_ptr
=
nullptr
;
Tensor
workspace_tensor
;
auto
reserve_space_size
=
reserve_space
->
memory_size
();
cudnnBatchNormOps_t
bnOps_
=
CUDNN_BATCHNORM_OPS_BN_ACTIVATION
;
platform
::
ScopedActivationDescriptor
scope_act_desc
;
cudnnActivationDescriptor_t
activation_desc_
=
scope_act_desc
.
descriptor
<
T
>
(
act_type
);
// --------------- cudnn batchnorm workspace ---------------
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnGetBatchNormalizationBackwardExWorkspaceSize
(
/*handle=*/
dev_ctx
.
cudnn_handle
(),
/*mode=*/
mode_
,
/*bnOps=*/
bnOps_
,
/*xDesc=*/
data_desc_
,
/*yDesc=*/
data_desc_
,
/*dyDesc=*/
data_desc_
,
/*dzDesc=*/
nullptr
,
/*dxDesc=*/
data_desc_
,
/*bnScaleBiasMeanVarDesc=*/
bn_param_desc_
,
/*activationDesc=*/
activation_desc_
,
/*sizeInBytes=*/
&
workspace_size
),
platform
::
errors
::
External
(
"The error has happened when calling "
"cudnnGetBatchNormalizationBackwardExWorkspaceSize."
));
workspace_ptr
=
workspace_tensor
.
mutable_data
(
ctx
.
GetPlace
(),
x
->
type
(),
workspace_size
);
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnBatchNormalizationBackwardEx
(
/*handle=*/
dev_ctx
.
cudnn_handle
(),
/*mode=*/
mode_
,
/*bnOps=*/
bnOps_
,
/*alphaDataDiff=*/
CudnnDataType
<
T
>::
kOne
(),
/*betaDataDiff=*/
CudnnDataType
<
T
>::
kZero
(),
/*alphaParamDiff=*/
CudnnDataType
<
T
>::
kOne
(),
/*betaParamDiff=*/
CudnnDataType
<
T
>::
kZero
(),
/*xDesc=*/
data_desc_
,
/*xData=*/
x
->
template
data
<
T
>(),
/*yDesc=*/
data_desc_
,
/*yData=*/
y
->
template
data
<
T
>(),
/*dyDesc=*/
data_desc_
,
/*dyData=*/
d_y
->
template
data
<
T
>(),
/*dzDesc=*/
nullptr
,
/*dzData=*/
nullptr
,
/*dxDesc=*/
data_desc_
,
/*dxData=*/
d_x
->
template
mutable_data
<
T
>(
ctx
.
GetPlace
()),
/*dBnScaleBiasDesc=*/
bn_param_desc_
,
/*bnScaleData=*/
scale
->
template
data
<
BatchNormParamType
<
T
>
>
(),
/*bnBiasData=*/
bias
->
template
data
<
BatchNormParamType
<
T
>
>
(),
/*dBnScaleData=*/
d_scale
->
template
mutable_data
<
BatchNormParamType
<
T
>
>
(
ctx
.
GetPlace
()),
/*dBnBiasData=*/
d_bias
->
template
mutable_data
<
BatchNormParamType
<
T
>
>
(
ctx
.
GetPlace
()),
/*epsilon=*/
epsilon
,
/*savedMean=*/
saved_mean_data
,
/*savedInvVariance=*/
saved_var_data
,
/*activationDesc=*/
activation_desc_
,
/*workspace=*/
workspace_ptr
,
/*workSpaceSizeInBytes=*/
workspace_size
,
/*reserveSpace=*/
const_cast
<
T
*>
(
reserve_space
->
template
data
<
T
>()),
/*reserveSpaceSizeInBytes=*/
reserve_space_size
),
platform
::
errors
::
External
(
"The error has happened when calling "
"cudnnBatchNormalizationBackwardEx."
));
// clean when exit.
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnDestroyTensorDescriptor
(
data_desc_
),
platform
::
errors
::
External
(
"The error has happened when calling "
"cudnnDestroyTensorDescriptor(data_desc_)."
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnDestroyTensorDescriptor
(
bn_param_desc_
),
platform
::
errors
::
External
(
"The error has happened when calling "
"cudnnDestroyTensorDescriptor(bn_param_desc_)."
));
}
};
}
// namespace operators
}
// namespace paddle
#if CUDNN_VERSION >= 7401
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
fused_batch_norm_act
,
ops
::
FusedBatchNormActKernel
<
plat
::
CUDADeviceContext
,
float
>
,
ops
::
FusedBatchNormActKernel
<
plat
::
CUDADeviceContext
,
double
>
,
ops
::
FusedBatchNormActKernel
<
plat
::
CUDADeviceContext
,
plat
::
float16
>
);
REGISTER_OP_CUDA_KERNEL
(
fused_batch_norm_act_grad
,
ops
::
FusedBatchNormActGradKernel
<
plat
::
CUDADeviceContext
,
float
>
,
ops
::
FusedBatchNormActGradKernel
<
plat
::
CUDADeviceContext
,
double
>
,
ops
::
FusedBatchNormActGradKernel
<
plat
::
CUDADeviceContext
,
plat
::
float16
>
);
#endif
paddle/fluid/operators/fused/fused_bn_activation_op.h
0 → 100644
浏览文件 @
c63a63d5
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include "paddle/fluid/framework/grad_op_desc_maker.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"
namespace
paddle
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
class
FusedBatchNormActOp
:
public
framework
::
OperatorWithKernel
{
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
;
protected:
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
;
framework
::
OpKernelType
GetKernelTypeForVar
(
const
std
::
string
&
var_name
,
const
Tensor
&
tensor
,
const
framework
::
OpKernelType
&
expected_kernel_type
)
const
override
;
};
class
FusedBatchNormActGradOp
:
public
framework
::
OperatorWithKernel
{
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
;
protected:
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
;
};
class
FusedBatchNormActOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
override
;
};
template
<
typename
T
>
class
FusedBatchNormActGradOpMaker
:
public
framework
::
SingleGradOpMaker
<
T
>
{
public:
using
framework
::
SingleGradOpMaker
<
T
>::
SingleGradOpMaker
;
protected:
std
::
unique_ptr
<
T
>
Apply
()
const
override
{
std
::
unique_ptr
<
T
>
op
(
new
T
());
op
->
SetType
(
this
->
ForwardOpType
()
+
"_grad"
);
op
->
SetInput
(
"X"
,
this
->
Input
(
"X"
));
op
->
SetInput
(
"Y"
,
this
->
Output
(
"Y"
));
op
->
SetInput
(
framework
::
GradVarName
(
"Y"
),
this
->
OutputGrad
(
"Y"
));
op
->
SetInput
(
"Scale"
,
this
->
Input
(
"Scale"
));
op
->
SetInput
(
"Bias"
,
this
->
Input
(
"Bias"
));
op
->
SetInput
(
"SavedMean"
,
this
->
Output
(
"SavedMean"
));
op
->
SetInput
(
"SavedVariance"
,
this
->
Output
(
"SavedVariance"
));
op
->
SetInput
(
"ReserveSpace"
,
this
->
Output
(
"ReserveSpace"
));
op
->
SetAttrMap
(
this
->
Attrs
());
op
->
SetOutput
(
framework
::
GradVarName
(
"X"
),
this
->
InputGrad
(
"X"
));
op
->
SetOutput
(
framework
::
GradVarName
(
"Scale"
),
this
->
InputGrad
(
"Scale"
));
op
->
SetOutput
(
framework
::
GradVarName
(
"Bias"
),
this
->
InputGrad
(
"Bias"
));
return
op
;
}
};
class
FusedBatchNormActOpInferVarType
:
public
framework
::
PassInDtypeAndVarTypeToOutput
{
protected:
std
::
unordered_map
<
std
::
string
,
std
::
string
>
GetInputOutputWithSameType
()
const
override
{
return
std
::
unordered_map
<
std
::
string
,
std
::
string
>
{{
"X"
,
/*->*/
"Y"
}};
}
};
template
<
typename
DeviceContext
,
typename
T
>
class
FusedBatchNormActKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
;
};
template
<
typename
DeviceContext
,
typename
T
>
class
FusedBatchNormActGradKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
;
};
}
// namespace operators
}
// namespace paddle
paddle/fluid/pybind/pybind.cc
浏览文件 @
c63a63d5
...
...
@@ -1994,6 +1994,26 @@ All parameter, weight, gradient are variables in Paddle.
build_strategy = fluid.BuildStrategy()
build_strategy.fuse_elewise_add_act_ops = True
)DOC"
)
.
def_property
(
"fuse_bn_act_ops"
,
[](
const
BuildStrategy
&
self
)
{
return
self
.
fuse_bn_act_ops_
;
},
[](
BuildStrategy
&
self
,
bool
b
)
{
PADDLE_ENFORCE_EQ
(
!
self
.
IsFinalized
(),
true
,
platform
::
errors
::
PreconditionNotMet
(
"BuildStrategy is finlaized."
));
self
.
fuse_bn_act_ops_
=
b
;
},
R"DOC((bool, optional): fuse_bn_act_ops indicate whether
to fuse batch_norm and activation_op,
it may make the execution faster. Default is False.
Examples:
.. code-block:: python
import paddle.fluid as fluid
build_strategy = fluid.BuildStrategy()
build_strategy.fuse_bn_act_ops = True
)DOC"
)
.
def_property
(
"fuse_relu_depthwise_conv"
,
[](
const
BuildStrategy
&
self
)
{
...
...
python/paddle/fluid/tests/unittests/CMakeLists.txt
浏览文件 @
c63a63d5
...
...
@@ -182,6 +182,7 @@ list(REMOVE_ITEM TEST_OPS test_basic_gru_unit_op)
list
(
REMOVE_ITEM TEST_OPS test_basic_lstm_api
)
list
(
REMOVE_ITEM TEST_OPS test_basic_lstm_unit_op
)
list
(
REMOVE_ITEM TEST_OPS test_imperative_debug_string
)
list
(
REMOVE_ITEM TEST_OPS test_fuse_bn_act_pass
)
if
(
APPLE OR WIN32
)
list
(
REMOVE_ITEM TEST_OPS test_dataset
)
...
...
@@ -301,6 +302,7 @@ py_test_modules(test_parallel_executor_seresnext_base_cpu MODULES test_parallel_
py_test_modules
(
test_parallel_executor_seresnext_with_reduce_cpu MODULES test_parallel_executor_seresnext_with_reduce_cpu
)
py_test_modules
(
test_parallel_executor_seresnext_with_fuse_all_reduce_cpu MODULES test_parallel_executor_seresnext_with_fuse_all_reduce_cpu
)
py_test_modules
(
test_data_norm_op MODULES test_data_norm_op
)
py_test_modules
(
test_fuse_bn_act_pass MODULES test_fuse_bn_act_pass ENVS FLAGS_cudnn_deterministic=1 FLAGS_cudnn_batchnorm_spatial_persistent=1 FLAGS_conv_workspace_size_limit=1000
)
if
(
NOT WIN32
)
py_test_modules
(
test_ir_memory_optimize_transformer MODULES test_ir_memory_optimize_transformer
)
...
...
@@ -330,6 +332,6 @@ set_tests_properties(test_parallel_executor_test_while_train test_parallel_execu
test_parallel_executor_crf test_sync_batch_norm_op
test_parallel_executor_feed_persistable_var
test_parallel_executor_crf_auto_growth test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass
test_data_norm_op test_imperative_using_non_zero_gpu
test_data_norm_op test_imperative_using_non_zero_gpu
test_fuse_bn_act_pass
test_optimizer_in_control_flow
test_buffer_shared_memory_reuse_pass PROPERTIES LABELS
"RUN_TYPE=DIST"
)
python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py
0 → 100644
浏览文件 @
c63a63d5
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
paddle
import
paddle.fluid
as
fluid
import
unittest
class
TestFuseBatchNormActPass
(
unittest
.
TestCase
):
def
build_program
(
self
,
main_program
,
startup_program
,
use_cuda
,
seed
=
1
):
main_program
.
random_seed
=
seed
startup_program
.
random_seed
=
seed
with
fluid
.
program_guard
(
main_program
,
startup_program
):
x
=
fluid
.
layers
.
data
(
name
=
'x'
,
shape
=
[
1
,
28
,
28
],
dtype
=
'float32'
)
y
=
fluid
.
layers
.
data
(
name
=
"y"
,
shape
=
[
1
],
dtype
=
'int64'
)
hidden1
=
fluid
.
layers
.
conv2d
(
input
=
x
,
filter_size
=
3
,
num_filters
=
32
,
stride
=
1
,
padding
=
1
,
act
=
None
,
bias_attr
=
False
,
data_format
=
'NHWC'
)
param_attr
=
fluid
.
ParamAttr
(
name
=
'batch_norm_w'
,
initializer
=
fluid
.
initializer
.
Constant
(
value
=
1.0
))
bias_attr
=
fluid
.
ParamAttr
(
name
=
'batch_norm_b'
,
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.0
))
hidden2
=
fluid
.
layers
.
batch_norm
(
input
=
hidden1
,
param_attr
=
param_attr
,
bias_attr
=
bias_attr
,
act
=
'relu'
,
data_layout
=
'NHWC'
)
hidden3
=
fluid
.
layers
.
fc
(
input
=
hidden2
,
size
=
128
,
act
=
'relu'
)
hidden4
=
fluid
.
layers
.
batch_norm
(
input
=
hidden3
,
act
=
'relu'
,
data_layout
=
'NHWC'
)
prediction
=
fluid
.
layers
.
fc
(
input
=
hidden4
,
size
=
10
,
act
=
'softmax'
)
loss
=
fluid
.
layers
.
cross_entropy
(
input
=
prediction
,
label
=
y
)
loss
=
fluid
.
layers
.
mean
(
loss
)
sgd
=
fluid
.
optimizer
.
SGD
(
learning_rate
=
0.001
)
if
use_cuda
:
sgd
=
fluid
.
contrib
.
mixed_precision
.
decorate
(
sgd
,
use_dynamic_loss_scaling
=
True
,
init_loss_scaling
=
128.0
)
sgd
.
minimize
(
loss
)
return
x
,
y
,
loss
def
check
(
self
,
place
,
use_cuda
):
main_program
=
fluid
.
Program
()
startup_program
=
fluid
.
Program
()
x
,
y
,
loss
=
self
.
build_program
(
main_program
,
startup_program
,
use_cuda
)
exe
=
fluid
.
Executor
(
place
)
iters
=
10
batch_size
=
16
feeder
=
fluid
.
DataFeeder
(
feed_list
=
[
x
,
y
],
place
=
place
)
# close fused_bn_act_ops
build_strategy
=
fluid
.
BuildStrategy
()
build_strategy
.
fuse_bn_act_ops
=
False
binary
=
fluid
.
CompiledProgram
(
main_program
).
with_data_parallel
(
loss_name
=
loss
.
name
,
build_strategy
=
build_strategy
)
train_reader
=
paddle
.
batch
(
paddle
.
dataset
.
mnist
.
train
(),
batch_size
=
batch_size
)
loss_vals
=
[]
scope
=
fluid
.
Scope
()
with
fluid
.
scope_guard
(
scope
):
exe
.
run
(
startup_program
)
for
_
in
range
(
iters
):
data
=
next
(
train_reader
())
loss_v
=
exe
.
run
(
binary
,
feed
=
feeder
.
feed
(
data
),
fetch_list
=
[
loss
])
loss_vals
.
append
(
loss_v
[
0
][
0
])
# open fused_bn_act_ops
build_strategy_fused
=
fluid
.
BuildStrategy
()
build_strategy_fused
.
fuse_bn_act_ops
=
True
binary_fused
=
fluid
.
CompiledProgram
(
main_program
).
with_data_parallel
(
loss_name
=
loss
.
name
,
build_strategy
=
build_strategy_fused
)
train_reader_fused
=
paddle
.
batch
(
paddle
.
dataset
.
mnist
.
train
(),
batch_size
=
batch_size
)
loss_vals_fused
=
[]
scope_fused
=
fluid
.
Scope
()
with
fluid
.
scope_guard
(
scope_fused
):
exe
.
run
(
startup_program
)
for
_
in
range
(
iters
):
data
=
next
(
train_reader_fused
())
loss_v
=
exe
.
run
(
binary_fused
,
feed
=
feeder
.
feed
(
data
),
fetch_list
=
[
loss
])
loss_vals_fused
.
append
(
loss_v
[
0
][
0
])
# check loss
for
i
in
range
(
iters
):
self
.
assertAlmostEqual
(
loss_vals
[
i
],
loss_vals_fused
[
i
],
delta
=
1e-5
)
def
test_fuse_bn_act_pass_cpu
(
self
):
place
=
fluid
.
CPUPlace
()
self
.
check
(
place
,
use_cuda
=
False
)
def
test_fuse_bn_act_pass_cuda
(
self
):
if
fluid
.
core
.
is_compiled_with_cuda
():
place
=
fluid
.
CUDAPlace
(
0
)
self
.
check
(
place
,
use_cuda
=
True
)
if
__name__
==
'__main__'
:
unittest
.
main
()
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录