From 46189b166d8e9e9ef4a73f06ba3775a11cf27bd4 Mon Sep 17 00:00:00 2001
From: Zhen Wang
Date: Fri, 10 Jan 2020 16:14:50 +0800
Subject: [PATCH] Add bn and relu fuse pass (#22048)

* add bn and relu fuse pass

* add op attr assert and dtype assert

* fix some inputs && outputs bugs for the fused op and pattern.

* add the unittest for fuse_bn_act_pass. test=develop

* use normative enforce statements. test=develop

* add the cpu test. test=develop

* add the support of batch_size=1 for the bn with relu op. test=develop

* add the error type for paddle throws. test=develop

* add fused_batch_norm_act and fused_batch_norm_act_grad to
  op_has_unsed_vars_white_list. test=develop
---
 cmake/operators.cmake                         |   2 +-
 paddle/fluid/framework/details/CMakeLists.txt |   2 +-
 .../fluid/framework/details/build_strategy.cc |   8 +
 .../fluid/framework/details/build_strategy.h  |   1 +
 paddle/fluid/framework/ir/CMakeLists.txt      |   1 +
 paddle/fluid/framework/ir/fuse_bn_act_pass.cc | 332 ++++++++++++++
 paddle/fluid/framework/ir/fuse_bn_act_pass.h  |  64 +++
 .../framework/ir/graph_pattern_detector.cc    | 129 ++++++
 .../framework/ir/graph_pattern_detector.h     |  61 +++
 paddle/fluid/framework/unused_var_check.cc    |   2 +
 paddle/fluid/operators/fused/CMakeLists.txt   |   6 +
 .../operators/fused/fused_bn_activation_op.cc | 296 ++++++++++++
 .../operators/fused/fused_bn_activation_op.cu | 432 ++++++++++++++++++
 .../operators/fused/fused_bn_activation_op.h  | 109 +++++
 paddle/fluid/pybind/pybind.cc                 |  20 +
 .../fluid/tests/unittests/CMakeLists.txt      |   4 +-
 .../tests/unittests/test_fuse_bn_act_pass.py  | 121 +++++
 17 files changed, 1587 insertions(+), 3 deletions(-)
 create mode 100644 paddle/fluid/framework/ir/fuse_bn_act_pass.cc
 create mode 100644 paddle/fluid/framework/ir/fuse_bn_act_pass.h
 create mode 100644 paddle/fluid/operators/fused/fused_bn_activation_op.cc
 create mode 100644 paddle/fluid/operators/fused/fused_bn_activation_op.cu
 create mode 100644 paddle/fluid/operators/fused/fused_bn_activation_op.h
 create mode 100644 python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py

diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index afad22ca2b..0534da57a5 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -118,7 +118,7 @@ function(op_library TARGET)
       "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
       "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op"
       "sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op"
-      "multihead_matmul_op" "fusion_group_op")
+      "multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op")
     if ("${TARGET}" STREQUAL "${manual_pybind_op}")
       set(pybind_flag 1)
     endif()
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index 6366a2b3e5..5f4f31abea 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -100,7 +100,7 @@ endif()
 cc_library(build_strategy SRCS build_strategy.cc DEPS
         graph_viz_pass multi_devices_graph_pass
         multi_devices_graph_print_pass multi_devices_graph_check_pass
-        fuse_elewise_add_act_pass multi_batch_merge_pass
+        fuse_elewise_add_act_pass fuse_bn_act_pass multi_batch_merge_pass
         fuse_relu_depthwise_conv_pass
         lock_free_optimize_pass
         coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index c621905507..ca6871cc3e 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -167,6 +167,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
                         "fuse_relu_depthwise_conv_pass");
     AppendPassWithCheck(strategy_.fuse_elewise_add_act_ops_,
                         "fuse_elewise_add_act_pass");
+    AppendPassWithCheck(strategy_.fuse_bn_act_ops_, "fuse_bn_act_pass");
     // for single card training, fuse_all_reduce_ops is unnecessary.
     // coalesce_grad_tensor_pass should be before of MultiDevPass.
     AppendPassWithCheck(strategy_.fuse_all_reduce_ops_,
@@ -369,6 +370,12 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
                         "GPU, skipped.";
         continue;
       }
+    } else if (pass->Type() == "fuse_bn_act_pass") {
+      if (!use_cuda) {
+        LOG(WARNING) << "fuse_bn_act_pass is only supported on "
+                        "GPU, skipped.";
+        continue;
+      }
     } else if (pass->Type() == "mkldnn_placement_pass") {
       pass->Set("mkldnn_enabled_op_types",
                 new std::unordered_set<std::string>(mkldnn_enabled_op_types_));
@@ -394,6 +401,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
 USE_PASS(sync_batch_norm_pass);
 USE_PASS(fuse_relu_depthwise_conv_pass);
 USE_PASS(fuse_elewise_add_act_pass);
+USE_PASS(fuse_bn_act_pass);
 USE_PASS(graph_viz_pass);
 USE_PASS(multi_batch_merge_pass);
 USE_PASS(reduce_mode_multi_devices_pass);
diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
index 8ba6039220..0b12f16727 100644
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -87,6 +87,7 @@ struct BuildStrategy {
   // TODO(dev-paddle): fuse_elewise_add_act_ops may cause some models have
   // cycle.
   bool fuse_elewise_add_act_ops_{false};
+  bool fuse_bn_act_ops_{false};
   // Fuse_all_optimizer_ops and fuse_all_reduce_ops require that gradients
   // should not be sparse types
   boost::optional<bool> fuse_all_optimizer_ops_{false};
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index 403c3dc4e9..73264140f9 100644
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -106,6 +106,7 @@ if(WITH_NGRAPH)
   set(INFER_IR_PASSES ${INFER_IR_PASSES} ngraph_subgraph_pass CACHE INTERNAL "")
 endif()
 
+cc_library(fuse_bn_act_pass SRCS fuse_bn_act_pass.cc DEPS pass graph_pattern_detector )
 cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector )
 cc_library(fuse_relu_depthwise_conv_pass SRCS fuse_relu_depthwise_conv_pass.cc DEPS pass graph_pattern_detector )
diff --git a/paddle/fluid/framework/ir/fuse_bn_act_pass.cc b/paddle/fluid/framework/ir/fuse_bn_act_pass.cc
new file mode 100644
index 0000000000..7d6ef5b902
--- /dev/null
+++ b/paddle/fluid/framework/ir/fuse_bn_act_pass.cc
@@ -0,0 +1,332 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
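// Reviewer overview (annotation, not part of the original sources): this
// pass rewrites every matched forward subgraph
//
//   X (FP16, NHWC) -> batch_norm -> Y -> relu -> Out
//
// into a single fused_batch_norm_act op, and FuseBatchNormActGrad applies
// the symmetric rewrite to relu_grad + batch_norm_grad in the backward
// graph. The FP16/NHWC/training-mode restrictions come from the asserts
// that the BatchNormAct pattern in graph_pattern_detector.cc (later in this
// patch) places on the matched nodes.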
+ +#include "paddle/fluid/framework/ir/fuse_bn_act_pass.h" +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cudnn_helper.h" +#endif + +namespace paddle { +namespace framework { +namespace ir { + +void FuseBatchNormActPass::ApplyImpl(ir::Graph *graph) const { +#ifdef PADDLE_WITH_CUDA +#if CUDNN_VERSION_MIN(7, 4, 1) + // forward + std::unordered_set act_types = {"relu"}; + graph = FuseBatchNormAct(graph, act_types); + // backward + std::unordered_set act_grad_types = {"relu_grad"}; + graph = FuseBatchNormActGrad(graph, act_grad_types); +#endif +#endif +} + +// act(bn(x)) +ir::Graph *FuseBatchNormActPass::FuseBatchNormAct( + ir::Graph *graph, const std::unordered_set &act_types) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument( + "The input graph of FuseBatchNormAct should not be nullptr.")); + FusePassBase::Init("bn_act", graph); + + GraphPatternDetector gpd; + auto *x = gpd.mutable_pattern() + ->NewNode("bn_act/x") + ->AsInput() + ->assert_is_op_input("batch_norm", "X") + ->assert_var_dtype(proto::VarType::FP16); + patterns::BatchNormAct bn_act_pattern(gpd.mutable_pattern(), "bn_act"); + + bn_act_pattern(x, act_types); + + int found_bn_act_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "handle FuseBatchNormAct fuse"; + // BN inputs + GET_IR_NODE_FROM_SUBGRAPH(bn_scale, bn_scale, bn_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(bn_bias, bn_bias, bn_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(bn_variance, bn_variance, bn_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(bn_mean, bn_mean, bn_act_pattern); + // BN outputs + GET_IR_NODE_FROM_SUBGRAPH(bn_mean_out, bn_mean_out, bn_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(bn_variance_out, bn_variance_out, bn_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(bn_saved_variance, bn_saved_variance, + bn_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(bn_saved_mean, bn_saved_mean, bn_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(bn_reserve_space, bn_reserve_space, + bn_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(bn_out, bn_out, bn_act_pattern); + // ACT output + GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, bn_act_pattern); + // ops + GET_IR_NODE_FROM_SUBGRAPH(batch_norm, batch_norm, bn_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act, act, bn_act_pattern); + + std::string bn_x_n = subgraph.at(x)->Name(); + std::string bn_scale_n = bn_scale->Name(); + std::string bn_bias_n = bn_bias->Name(); + std::string bn_variance_n = bn_variance->Name(); + std::string bn_mean_n = bn_mean->Name(); + std::string bn_mean_out_n = bn_mean_out->Name(); + std::string bn_variance_out_n = bn_variance_out->Name(); + std::string bn_saved_variance_n = bn_saved_variance->Name(); + std::string bn_saved_mean_n = bn_saved_mean->Name(); + std::string bn_reserve_space_n = bn_reserve_space->Name(); + std::string bn_out_n = bn_out->Name(); + std::string act_out_n = act_out->Name(); + + Node *fused_bn_act_node = CreateFusedBatchNormActNode( + g, act, batch_norm, bn_x_n, bn_scale_n, bn_bias_n, bn_variance_n, + bn_mean_n, bn_mean_out_n, bn_variance_out_n, bn_saved_variance_n, + bn_saved_mean_n, bn_reserve_space_n, act_out_n); + + VLOG(4) << "\n\t " << bn_x_n << ", " << bn_scale_n << ", " << bn_bias_n + << ", " << bn_variance_n << " and " << bn_mean_n << " -> " + << batch_norm->Name() << " -> " << bn_mean_out_n << ", " + << 
bn_variance_out_n << ", " << bn_saved_variance_n << ", " + << bn_saved_mean_n << ", " << bn_reserve_space_n << " and " + << bn_out_n << "\n" + << "\t " << bn_out_n << " -> " << act->Name() << " -> " + << act_out_n; + + ReLinkNodes(g, bn_out, batch_norm, act, fused_bn_act_node); + found_bn_act_count++; + }; + + gpd(graph, handler); + + AddStatis(found_bn_act_count); + return graph; +} + +Node *FuseBatchNormActPass::CreateFusedBatchNormActNode( + Graph *g, const Node *act, const Node *bn, const std::string &bn_x_n, + const std::string &bn_scale_n, const std::string &bn_bias_n, + const std::string &bn_variance_n, const std::string &bn_mean_n, + const std::string &bn_mean_out_n, const std::string &bn_variance_out_n, + const std::string &bn_saved_variance_n, const std::string &bn_saved_mean_n, + const std::string &bn_reserve_space_n, const std::string &act_out_n) const { + OpDesc desc; + desc.SetInput("X", std::vector({bn_x_n})); + desc.SetInput("Scale", std::vector({bn_scale_n})); + desc.SetInput("Bias", std::vector({bn_bias_n})); + desc.SetInput("Mean", std::vector({bn_mean_n})); + desc.SetInput("Variance", std::vector({bn_variance_n})); + + desc.SetOutput("Y", std::vector({act_out_n})); + desc.SetOutput("MeanOut", std::vector({bn_mean_out_n})); + desc.SetOutput("VarianceOut", std::vector({bn_variance_out_n})); + desc.SetOutput("SavedMean", std::vector({bn_saved_mean_n})); + desc.SetOutput("SavedVariance", + std::vector({bn_saved_variance_n})); + desc.SetOutput("ReserveSpace", + std::vector({bn_reserve_space_n})); + desc.SetType("fused_batch_norm_act"); + + desc.SetAttr("act_type", act->Name()); + // Set attrs + for (auto &n : {act->Op(), bn->Op()}) { + for (auto &m : n->GetAttrMap()) { + desc.SetAttr(m.first, m.second); + } + } + + auto fused_bn_act_node = g->CreateOpNode(&desc); + return fused_bn_act_node; +} + +// the backward of act(bn(x)) +// act_grad: in["Out", "Out@GRAD"], out["X@GRAD"] +// bn_grad: in["X", "Y@GRAD", "Scale", "Bias", "SavedMean", "SavedVariance", +// "ReserveSpace"], +// out["X@GRAD", "Scale@GRAD", "Bias@GRAD"] +ir::Graph *FuseBatchNormActPass::FuseBatchNormActGrad( + ir::Graph *graph, + const std::unordered_set &act_grad_types) const { + PADDLE_ENFORCE_NOT_NULL( + graph, + platform::errors::InvalidArgument( + "The input graph of FuseBatchNormActGrad should not be nullptr.")); + FusePassBase::Init("bn_act_grad", graph); + + GraphPatternDetector gpd; + auto *d_act_out = + gpd.mutable_pattern() + ->NewNode("bn_act_grad/x") + ->AsInput() + ->assert_is_ops_input(act_grad_types, GradVarName("Out")); + patterns::BatchNormActGrad bn_act_grad_pattern(gpd.mutable_pattern(), + "bn_act_grad"); + bn_act_grad_pattern(d_act_out, act_grad_types); + + int found_bn_act_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "handle FuseBatchNormActGrad fuse"; + GET_IR_NODE_FROM_SUBGRAPH(act_grad, act_grad, bn_act_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(batch_norm_grad, batch_norm_grad, + bn_act_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, bn_act_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(d_itermediate_out, d_itermediate_out, + bn_act_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(bn_x, bn_x, bn_act_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(bn_scale, bn_scale, bn_act_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(bn_bias, bn_bias, bn_act_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(bn_saved_mean, bn_saved_mean, + bn_act_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(bn_saved_variance, bn_saved_variance, + bn_act_grad_pattern); 
+ GET_IR_NODE_FROM_SUBGRAPH(bn_reserve_space, bn_reserve_space, + bn_act_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(d_bn_x, d_bn_x, bn_act_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(d_bn_scale, d_bn_scale, bn_act_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(d_bn_bias, d_bn_bias, bn_act_grad_pattern); + + std::string d_act_out_n = subgraph.at(d_act_out)->Name(); // Y@GRAD + std::string act_out_n = act_out->Name(); // Y + std::string d_itermediate_out_n = d_itermediate_out->Name(); + std::string bn_x_n = bn_x->Name(); + std::string bn_scale_n = bn_scale->Name(); + std::string bn_bias_n = bn_bias->Name(); + std::string bn_saved_mean_n = bn_saved_mean->Name(); + std::string bn_saved_variance_n = bn_saved_variance->Name(); + std::string bn_reserve_space_n = bn_reserve_space->Name(); + std::string d_bn_x_n = d_bn_x->Name(); + std::string d_bn_scale_n = d_bn_scale->Name(); + std::string d_bn_bias_n = d_bn_bias->Name(); + + OpDesc desc; + desc.SetType("fused_batch_norm_act_grad"); + desc.SetInput("X", {bn_x_n}); + desc.SetInput("Y", std::vector({act_out_n})); + desc.SetInput(GradVarName("Y"), std::vector({d_act_out_n})); + desc.SetInput("Scale", std::vector({bn_scale_n})); + desc.SetInput("Bias", std::vector({bn_bias_n})); + desc.SetInput("SavedMean", std::vector({bn_saved_mean_n})); + desc.SetInput("SavedVariance", + std::vector({bn_saved_variance_n})); + desc.SetInput("ReserveSpace", + std::vector({bn_reserve_space_n})); + desc.SetOutput(GradVarName("X"), std::vector({d_bn_x_n})); + desc.SetOutput(GradVarName("Scale"), + std::vector({d_bn_scale_n})); + desc.SetOutput(GradVarName("Bias"), + std::vector({d_bn_bias_n})); + std::string act = act_grad->Name(); + act = act.substr(0, act.length() - 5); // remove "_grad" + desc.SetAttr("act_type", act); + + for (auto &n : {act_grad->Op(), batch_norm_grad->Op()}) { + for (auto &m : n->GetAttrMap()) { + desc.SetAttr(m.first, m.second); + } + } + + auto fused_node = g->CreateOpNode(&desc); + + VLOG(4) << "\n\t " << d_act_out_n << " and " << act_out_n << " -> " + << act_grad->Name() << " -> " << d_itermediate_out_n << "\n\t " + << bn_x_n << ", " << d_itermediate_out_n << ", " << bn_scale_n + << ", " << bn_bias_n << ", " << bn_saved_mean_n << ", " + << bn_saved_variance_n << " and " << bn_reserve_space_n << " -> " + << batch_norm_grad->Name() << " -> " << d_bn_x_n << ", " + << d_bn_scale_n << " and " << d_bn_bias_n; + + ReLinkNodes(g, d_itermediate_out, act_grad, batch_norm_grad, fused_node); + found_bn_act_count++; + }; + + gpd(graph, handler); + + AddStatis(found_bn_act_count); + return graph; +} + +void FuseBatchNormActPass::ReLinkNodes(Graph *graph, + const Node *intermediate_out, Node *op_1, + Node *op_2, + Node *fused_op) const { // delete act + for (auto &in : op_1->inputs) { + fused_op->inputs.emplace_back(in); + in->outputs = this->ReplaceNode(op_1, fused_op, in->outputs); + } + + std::unordered_set nodes2delete; + for (auto &out : op_1->outputs) { + // intermediate_out or ctr_var + auto result_iter = + std::find_if(op_2->inputs.begin(), op_2->inputs.end(), + [&out](const Node *node) -> bool { return node == out; }); + + if (result_iter == op_2->inputs.end()) { + IR_OP_VAR_LINK(fused_op, out); + } else { + nodes2delete.emplace(out); + } + } + + for (auto &in : op_2->inputs) { + if (in == intermediate_out || nodes2delete.count(in)) { + continue; + } + fused_op->inputs.emplace_back(in); + in->outputs = this->ReplaceNode(op_2, fused_op, in->outputs); + } + + for (auto &out : op_2->outputs) { + IR_OP_VAR_LINK(fused_op, out); + } + + 
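// Reviewer note: at this point every external in/out edge of the two
// original ops has been transferred to fused_op; what remains is to drop
// the two op nodes themselves (queued just below) together with the
// intermediate output already collected in nodes2delete.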
nodes2delete.insert(std::move(op_1));
+  nodes2delete.insert(std::move(op_2));
+
+  GraphSafeRemoveNodes(graph, nodes2delete);
+}
+
+std::vector<Node *> FuseBatchNormActPass::ReplaceNode(
+    Node *cur_node, Node *new_node, const std::vector<Node *> &nodes) const {
+  std::vector<Node *> new_list(nodes.size());
+  bool has_replaced = false;
+  std::transform(nodes.begin(), nodes.end(), new_list.begin(),
+                 [&](Node *node) -> Node * {
+                   if (node == cur_node) {
+                     has_replaced = true;
+                     return new_node;
+                   }
+                   return node;
+                 });
+  PADDLE_ENFORCE_EQ(has_replaced, true,
+                    platform::errors::NotFound(
+                        "Cannot find %s in the node list.", cur_node->Name()));
+  return new_list;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(fuse_bn_act_pass, paddle::framework::ir::FuseBatchNormActPass);
diff --git a/paddle/fluid/framework/ir/fuse_bn_act_pass.h b/paddle/fluid/framework/ir/fuse_bn_act_pass.h
new file mode 100644
index 0000000000..427ff03a80
--- /dev/null
+++ b/paddle/fluid/framework/ir/fuse_bn_act_pass.h
@@ -0,0 +1,64 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include <vector>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+/*
+ * Fuse the BatchNorm and activation.
+ */ +class FuseBatchNormActPass : public FusePassBase { + public: + virtual ~FuseBatchNormActPass() {} + + protected: + void ApplyImpl(ir::Graph *graph) const override; + + ir::Graph *FuseBatchNormAct( + ir::Graph *graph, const std::unordered_set &act_types) const; + + ir::Graph *FuseBatchNormActGrad( + ir::Graph *graph, + const std::unordered_set &act_grad_types) const; + + std::vector ReplaceNode(Node *cur_node, Node *new_node, + const std::vector &nodes) const; + + void ReLinkNodes(Graph *graph, const Node *intermediate_out, Node *op_1, + Node *op_2, Node *fused_op) const; + Node *CreateFusedBatchNormActNode( + Graph *g, const Node *act, const Node *bn, const std::string &bn_x_n, + const std::string &bn_scale_n, const std::string &bn_bias_n, + const std::string &bn_variance_n, const std::string &bn_mean_n, + const std::string &bn_mean_out_n, const std::string &bn_variance_out_n, + const std::string &bn_saved_variance_n, + const std::string &bn_saved_mean_n, const std::string &bn_reserve_space_n, + const std::string &act_out_n) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index d407016325..96a37977ee 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -383,6 +383,13 @@ PDNode *PDNode::assert_is_var() { return this; } +PDNode *PDNode::assert_var_dtype(proto::VarType::Type dtype) { + assert_is_var(); + asserts_.emplace_back( + [dtype](Node *x) { return x->Var()->GetDataType() == dtype; }); + return this; +} + PDNode *PDNode::assert_is_not_ctrl_var() { asserts_.emplace_back([](Node *x) { return x && !x->IsCtrlVar(); }); return this; @@ -476,6 +483,7 @@ PDNode *PDNode::assert_is_op_output(const std::string &op_type, assert_is_op_nth_output(op_type, argument, 0); return this; } + PDNode *PDNode::assert_is_op_input(const std::string &op_type) { assert_is_var(); asserts_.emplace_back([=](Node *x) { @@ -489,6 +497,16 @@ PDNode *PDNode::assert_is_op_input(const std::string &op_type) { return this; } +PDNode *PDNode::assert_is_not_op_input(const std::string &argument) { + assert_is_op(); + asserts_.emplace_back([=](Node *x) { + auto &ins = x->Op()->Inputs(); + auto iter = ins.find(argument); + return iter == ins.end() || iter->second.empty(); + }); + return this; +} + PDNode *PDNode::assert_is_op_input(const std::string &op_type, const std::string &argument) { assert_is_var(); @@ -1048,6 +1066,117 @@ PDNode *patterns::ActElewiseAdd::operator()( return elewise_add_out; } +PDNode *patterns::BatchNormAct::operator()( + paddle::framework::ir::PDNode *bn_x_var, + std::unordered_set act_types) { + auto *bn_scale_var = pattern->NewNode(bn_scale_repr()) + ->assert_is_op_input("batch_norm", "Scale"); + auto *bn_bias_var = pattern->NewNode(bn_bias_repr()) + ->assert_is_op_input("batch_norm", "Bias"); + auto *bn_variance_var = pattern->NewNode(bn_variance_repr()) + ->assert_is_op_input("batch_norm", "Variance"); + auto *bn_mean_var = pattern->NewNode(bn_mean_repr()) + ->assert_is_op_input("batch_norm", "Mean"); + + auto *bn = pattern->NewNode(batch_norm_repr()) + ->assert_is_op("batch_norm") + ->assert_is_not_op_input("MomentumTensor") + ->assert_op_attr("is_test", false) + ->assert_op_attr("use_global_stats", false) + ->assert_op_attr("data_layout", "NHWC"); + + auto *bn_mean_out_var = pattern->NewNode(bn_mean_out_repr()) + ->assert_is_op_output("batch_norm", "MeanOut"); + auto 
*bn_variance_out_var = + pattern->NewNode(bn_variance_out_repr()) + ->assert_is_op_output("batch_norm", "VarianceOut"); + auto *bn_saved_variance_var = + pattern->NewNode(bn_saved_variance_repr()) + ->assert_is_op_output("batch_norm", "SavedVariance"); + auto *bn_saved_mean_var = + pattern->NewNode(bn_saved_mean_repr()) + ->assert_is_op_output("batch_norm", "SavedMean"); + auto *bn_reserve_space = + pattern->NewNode(bn_reserve_space_repr()) + ->assert_is_op_output("batch_norm", "ReserveSpace"); + auto *bn_out_var = pattern->NewNode(bn_out_repr()) + ->assert_is_op_output("batch_norm", "Y") + ->assert_has_n_outputs(1); + + bn_out_var->AsIntermediate()->assert_is_ops_input(act_types); + + auto *act = pattern->NewNode(act_repr())->assert_is_ops(act_types); + + auto *act_out_var = + pattern->NewNode(act_out_repr())->assert_is_ops_output(act_types, "Out"); + + bn->LinksFrom( + {bn_x_var, bn_scale_var, bn_bias_var, bn_variance_var, bn_mean_var}) + .LinksTo({bn_mean_out_var, bn_variance_out_var, bn_saved_variance_var, + bn_saved_mean_var, bn_reserve_space, bn_out_var}); + act->LinksFrom({bn_out_var}).LinksTo({act_out_var}); + + return act_out_var; +} + +PDNode *patterns::BatchNormActGrad::operator()( + paddle::framework::ir::PDNode *d_act_out_var, + std::unordered_set act_grad_types) { + auto *act_grad = + pattern->NewNode(act_grad_repr())->assert_is_ops(act_grad_types); + auto *bn_grad = pattern->NewNode(batch_norm_grad_repr()) + ->assert_is_op("batch_norm_grad") + ->assert_op_attr("use_global_stats", false) + ->assert_op_attr("data_layout", "NHWC"); + + auto *act_out_var = pattern->NewNode(act_out_repr()) + ->assert_is_ops_input(act_grad_types, "Out"); + auto *d_intermediate_var = + pattern->NewNode(d_itermediate_out_repr()) + ->assert_is_ops_output(act_grad_types, GradVarName("X")) + ->assert_has_n_outputs(1); + auto *bn_x_var = pattern->NewNode(bn_x_repr()) + ->assert_is_op_input("batch_norm_grad", "X") + ->assert_var_dtype(proto::VarType::FP16); + auto *bn_scale_var = pattern->NewNode(bn_scale_repr()) + ->assert_is_op_input("batch_norm_grad", "Scale"); + auto *bn_bias_var = pattern->NewNode(bn_bias_repr()) + ->assert_is_op_input("batch_norm_grad", "Bias"); + auto *bn_saved_mean_var = + pattern->NewNode(bn_saved_mean_repr()) + ->assert_is_op_input("batch_norm_grad", "SavedMean"); + auto *bn_saved_variance_var = + pattern->NewNode(bn_saved_variance_repr()) + ->assert_is_op_input("batch_norm_grad", "SavedVariance"); + // ReserveSpace as the output is equal to: + // data_layout == 'NHWC' && FLAGS_cudnn_batchnorm_spatial_persistent == true + auto *bn_reserve_space = + pattern->NewNode(bn_reserve_space_repr()) + ->assert_is_op_input("batch_norm_grad", "ReserveSpace"); + auto *d_bn_x_var = + pattern->NewNode(d_bn_x_repr()) + ->assert_is_not_ctrl_var() + ->assert_is_op_output("batch_norm_grad", GradVarName("X")); + auto *d_bn_scale_var = + pattern->NewNode(d_bn_scale_repr()) + ->assert_is_not_ctrl_var() + ->assert_is_op_output("batch_norm_grad", GradVarName("Scale")); + auto *d_bn_bias_var = + pattern->NewNode(d_bn_bias_repr()) + ->assert_is_not_ctrl_var() + ->assert_is_op_output("batch_norm_grad", GradVarName("Bias")); + + act_grad->LinksFrom({d_act_out_var, act_out_var}) + .LinksTo({d_intermediate_var}); + + bn_grad + ->LinksFrom({bn_x_var, d_intermediate_var, bn_scale_var, bn_bias_var, + bn_saved_mean_var, bn_saved_variance_var, bn_reserve_space}) + .LinksTo({d_bn_x_var, d_bn_scale_var, d_bn_bias_var}); + + return bn_grad; +} + PDNode *patterns::ElewiseAddAct::operator()( 
paddle::framework::ir::PDNode *ele_x_var, std::unordered_set act_types) { diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index ec1db60cc4..d0f65a88b7 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -27,6 +27,7 @@ #include #include #include +#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/inference/analysis/dot.h" @@ -100,6 +101,7 @@ struct PDNode { PDNode* assert_is_op(); PDNode* assert_is_op(const std::string& op_type); PDNode* assert_is_var(); + PDNode* assert_var_dtype(proto::VarType::Type dtype); PDNode* assert_is_not_ctrl_var(); PDNode* assert_var_not_persistable(); PDNode* assert_is_persistable_var(); @@ -111,6 +113,7 @@ struct PDNode { const std::string& argument); PDNode* assert_is_op_nth_input(const std::string& op_type, const std::string& argument, int nth); + PDNode* assert_is_not_op_input(const std::string& argument); PDNode* assert_is_op_nth_output(const std::string& op_type, const std::string& argument, int nth); PDNode* assert_is_only_input_of_op(const std::string& op_type); @@ -590,6 +593,64 @@ struct GRU : public PatternBase { PATTERN_DECL_NODE(Hidden); }; +// The following pattern is used to fuse batch_norm and act +// formula: act(bn(x)) +// op: batch_norm + act +struct BatchNormAct : public PatternBase { + BatchNormAct(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "bn_act") {} + + PDNode* operator()(PDNode* x, std::unordered_set acts); + + // declare operator node's name + PATTERN_DECL_NODE(batch_norm); + PATTERN_DECL_NODE(act); + // declare variable node's name + // BN inputs + PATTERN_DECL_NODE(bn_scale); + PATTERN_DECL_NODE(bn_bias); + PATTERN_DECL_NODE(bn_variance); + PATTERN_DECL_NODE(bn_mean); + // BN outputs + PATTERN_DECL_NODE(bn_mean_out); + PATTERN_DECL_NODE(bn_variance_out); + PATTERN_DECL_NODE(bn_saved_variance); + PATTERN_DECL_NODE(bn_saved_mean); + PATTERN_DECL_NODE(bn_reserve_space); + PATTERN_DECL_NODE(bn_out); + // ACT output + PATTERN_DECL_NODE(act_out); +}; + +// the backward of act(bn(x)) +// op: batch_norm_grad + act_grad +struct BatchNormActGrad : public PatternBase { + BatchNormActGrad(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "bn_act_grad") {} + + // act_grad: in["Out", "Out@GRAD"], out["X@GRAD"] + // bn_grad: in["X", "Y@GRAD", "Scale", "Bias", "SavedMean", "SavedVariance", + // "ReserveSpace"], + // out["X@GRAD", "Scale@GRAD", "Bias@GRAD"] + PDNode* operator()(PDNode* x, std::unordered_set act_grad_types); + + // declare operator node's name + PATTERN_DECL_NODE(act_grad); + PATTERN_DECL_NODE(batch_norm_grad); + // declare variable node's name + PATTERN_DECL_NODE(act_out); + PATTERN_DECL_NODE(d_itermediate_out); + PATTERN_DECL_NODE(bn_x); + PATTERN_DECL_NODE(bn_scale); + PATTERN_DECL_NODE(bn_bias); + PATTERN_DECL_NODE(bn_saved_mean); + PATTERN_DECL_NODE(bn_saved_variance); + PATTERN_DECL_NODE(bn_reserve_space); + PATTERN_DECL_NODE(d_bn_x); + PATTERN_DECL_NODE(d_bn_scale); + PATTERN_DECL_NODE(d_bn_bias); +}; + // The following patterns are used to fuse elewise_add and act // formula: act(ele_add(x, y)) // op: elementwise_add + act diff --git a/paddle/fluid/framework/unused_var_check.cc b/paddle/fluid/framework/unused_var_check.cc index d33e9d8a76..e1df5b4ee9 100644 --- 
a/paddle/fluid/framework/unused_var_check.cc +++ b/paddle/fluid/framework/unused_var_check.cc @@ -30,6 +30,8 @@ const std::unordered_set op_has_unsed_vars_white_list = { "auc", "batch_norm", "batch_norm_grad", + "fused_batch_norm_act", + "fused_batch_norm_act_grad", "sync_batch_norm_grad", "center_loss_grad", "crop", diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index db3ff0883f..001760d3e6 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -1,5 +1,6 @@ include(operators) register_operators(EXCLUDES + fused_bn_activation_op conv_fusion_op fusion_transpose_flatten_concat_op fusion_conv_inception_op @@ -8,6 +9,11 @@ register_operators(EXCLUDES fusion_group_op) if (WITH_GPU) + # fused_bn_activation_op needs cudnn 7.4.1 above + if (NOT ${CUDNN_VERSION} VERSION_LESS 7401) + op_library(fused_bn_activation_op) + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fused_batch_norm_act);\n") + endif() # conv_fusion_op needs cudnn 7 above if (NOT ${CUDNN_VERSION} VERSION_LESS 7100) op_library(conv_fusion_op) diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.cc b/paddle/fluid/operators/fused/fused_bn_activation_op.cc new file mode 100644 index 0000000000..97cd4d90be --- /dev/null +++ b/paddle/fluid/operators/fused/fused_bn_activation_op.cc @@ -0,0 +1,296 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/fused/fused_bn_activation_op.h" +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; + +void FusedBatchNormActOp::InferShape(framework::InferShapeContext *ctx) const { + PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, + platform::errors::InvalidArgument( + "Input(X) of BatchNormOp should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasInput("Scale"), true, + platform::errors::InvalidArgument( + "Input(Scale) of BatchNormOp should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasInput("Bias"), true, + platform::errors::InvalidArgument( + "Input(Bias) of BatchNormOp should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasInput("Mean"), true, + platform::errors::InvalidArgument( + "Input(Mean) of BatchNormOp should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasInput("Variance"), true, + platform::errors::InvalidArgument( + "Input(Variance) of BatchNormOp should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasOutput("Y"), true, + platform::errors::InvalidArgument( + "Output(Y) of BatchNormOp should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasOutput("MeanOut"), true, + platform::errors::InvalidArgument( + "Output(MeanOut) of BatchNormOp should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasOutput("VarianceOut"), true, + platform::errors::InvalidArgument( + "Output(VarianceOut) of BatchNormOp should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasOutput("SavedMean"), true, + platform::errors::InvalidArgument( + "Output(SavedMean) of BatchNormOp should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasOutput("SavedVariance"), true, + platform::errors::InvalidArgument( + "Output(SavedVariance) of BatchNormOp should not be null.")); + + // make sure Mean/MeanOut and Variance/VarianceOut share memory in Python + PADDLE_ENFORCE_EQ(ctx->Inputs("Mean")[0], ctx->Outputs("MeanOut")[0], + platform::errors::PreconditionNotMet( + "Mean and MeanOut should share the same memory")); + PADDLE_ENFORCE_EQ( + ctx->Inputs("Variance")[0], ctx->Outputs("VarianceOut")[0], + platform::errors::PreconditionNotMet( + "Variance and VarianceOut should share the same memory")); + + const auto x_dims = ctx->GetInputDim("X"); + + PADDLE_ENFORCE_GE(x_dims.size(), 2, platform::errors::PreconditionNotMet( + "ShapeError: the dimension of input " + "X must greater than or equal to 2." + "But received: the shape of input X " + "= [%s], the dimension of input X =" + "[%d]", + x_dims, x_dims.size())); + PADDLE_ENFORCE_LE(x_dims.size(), 5, platform::errors::PreconditionNotMet( + "ShapeError: the dimension of input " + "X must smaller than or equal to 5." + "But received: the shape of input X " + "= [%s], the dimension of input X =" + "[%d]", + x_dims, x_dims.size())); + + const int64_t C = x_dims[x_dims.size() - 1]; + + auto scale_dim = ctx->GetInputDim("Scale"); + auto bias_dim = ctx->GetInputDim("Bias"); + + PADDLE_ENFORCE_EQ( + scale_dim.size(), 1UL, + platform::errors::PreconditionNotMet( + "ShapeError: the dimension of scale must equal to 1." + "But received: the shape of scale is [%s], the dimension " + "of scale is [%d]", + scale_dim, scale_dim.size())); + PADDLE_ENFORCE_EQ(bias_dim.size(), 1UL, + platform::errors::PreconditionNotMet( + "ShapeError: the dimension of bias must equal to 1." 
+ "But received: the shape of bias is [%s],the dimension " + "of bias is [%d]", + bias_dim, bias_dim.size())); + + bool check = true; + if ((!ctx->IsRuntime()) && (framework::product(scale_dim) <= 0 || + framework::product(bias_dim) <= 0)) { + check = false; + } + + if (check) { + PADDLE_ENFORCE_EQ(scale_dim[0], C, + platform::errors::PreconditionNotMet( + "ShapeError: the shape of scale must equal to [%d]" + "But received: the shape of scale is [%d]", + C, scale_dim[0])); + PADDLE_ENFORCE_EQ(bias_dim[0], C, + platform::errors::PreconditionNotMet( + "ShapeError: the shape of bias must equal to [%d]" + "But received: the shape of bias is [%d]", + C, bias_dim[0])); + } + ctx->SetOutputDim("Y", x_dims); + ctx->SetOutputDim("MeanOut", {C}); + ctx->SetOutputDim("VarianceOut", {C}); + ctx->SetOutputDim("SavedMean", {C}); + ctx->SetOutputDim("SavedVariance", {C}); + ctx->ShareLoD("X", "Y"); +} + +framework::OpKernelType FusedBatchNormActOp::GetExpectedKernelType( + const framework::ExecutionContext &ctx) const { + auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + // By default, the type of the scale, bias, mean, + // and var tensors should both be float. (For float or float16 input tensor) + // or double (For double input tensor). + auto bn_param_type = framework::proto::VarType::FP32; + if (input_data_type == framework::proto::VarType::FP64) { + bn_param_type = framework::proto::VarType::FP64; + } + PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input("Scale")->type(), + platform::errors::PreconditionNotMet( + "Scale input should be of float type")); + PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input("Bias")->type(), + platform::errors::PreconditionNotMet( + "Bias input should be of float type")); + PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input("Mean")->type(), + platform::errors::PreconditionNotMet( + "Mean input should be of float type")); + PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input("Variance")->type(), + platform::errors::PreconditionNotMet( + "Variance input should be of float type")); + + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + + return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, + library); +} + +framework::OpKernelType FusedBatchNormActOp::GetKernelTypeForVar( + const std::string &var_name, const Tensor &tensor, + const framework::OpKernelType &expected_kernel_type) const { + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); +} + +void FusedBatchNormActOpMaker::Make() { + AddAttr("momentum", "").SetDefault(0.9); + AddAttr("epsilon", "") + .SetDefault(1e-5) + .AddCustomChecker([](const float &epsilon) { + PADDLE_ENFORCE_EQ(epsilon >= 0.0f && epsilon <= 0.001f, true, + platform::errors::InvalidArgument( + "'epsilon' should be between 0.0 and 0.001.")); + }); + AddAttr("act_type", "The activation type to be fused.") + .SetDefault("relu"); + AddInput("X", "The input tensor"); + AddInput("Scale", + "Scale is a 1-dimensional tensor of size C " + "that is applied to the output"); + AddInput("Bias", + "Bias is a 1-dimensional tensor of size C " + "that is applied to the output"); + AddInput("Mean", + "The global mean (for training) or " + "estimated mean (for testing)"); + AddInput("Variance", + "The global variance (for training) " + "or estimated Variance (for testing)"); + AddOutput("Y", "result after normalization"); + AddOutput("MeanOut", + "Share memory with Mean. 
" + "Store the global mean when training"); + AddOutput("VarianceOut", + "Share memory with Variance. " + "Store the global Variance when training"); + AddOutput("SavedMean", + "Mean of the current mini batch, " + "will apply to output when training") + .AsIntermediate(); + AddOutput("SavedVariance", + "Variance of the current mini batch, " + "will apply to output when training") + .AsIntermediate(); + AddOutput("ReserveSpace", + "Reserve GPU space for triggering the new semi-persistent " + "NHWC kernel"); + AddComment(R"DOC( +Fused Batch Normalization with activation. + +Batch Norm has been implemented as discussed in the paper: +https://arxiv.org/pdf/1502.03167.pdf +Batch Norm can be used as a normalizer function for conv2d and fully_connected operations. +Now, the required data format for FusedBatchNormActOp is NHWC `[batch, in_height, in_width, in_channels]`. + +)DOC"); +} + +void FusedBatchNormActGradOp::InferShape( + framework::InferShapeContext *ctx) const { + // check input + PADDLE_ENFORCE_EQ( + ctx->HasInput("X"), true, + platform::errors::InvalidArgument("Input(X) should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("Scale"), true, + platform::errors::InvalidArgument("Input(Scale) should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput(framework::GradVarName("Y")), true, + platform::errors::InvalidArgument("Input(Y@GRAD) should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasInput("SavedMean"), true, + platform::errors::InvalidArgument( + "Input(SavedMean) should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasInput("SavedVariance"), true, + platform::errors::InvalidArgument( + "Input(SavedVariance) should not be null")); + + // check output + PADDLE_ENFORCE_EQ( + ctx->HasOutput(framework::GradVarName("X")), true, + platform::errors::InvalidArgument("Output(X@GRAD) should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("Scale")), true, + platform::errors::InvalidArgument( + "Output(Scale@GRAD) should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("Bias")), true, + platform::errors::InvalidArgument( + "Output(Bias@GRAD) should not be null.")); + + const auto x_dims = ctx->GetInputDim("X"); + const int C = x_dims[x_dims.size() - 1]; + + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + // has_scale_grad == has_bias_grad, judge has_scale_grad is enough + ctx->SetOutputDim(framework::GradVarName("Scale"), {C}); + ctx->SetOutputDim(framework::GradVarName("Bias"), {C}); +} + +framework::OpKernelType FusedBatchNormActGradOp::GetExpectedKernelType( + const framework::ExecutionContext &ctx) const { + const auto *var = ctx.InputVar(framework::GradVarName("Y")); + if (var == nullptr) { + PADDLE_THROW(platform::errors::NotFound( + "Can not find Y@GRAD in the execution context.")); + } + const Tensor *t = nullptr; + if (var->IsType()) { + t = &var->Get(); + } else if (var->IsType()) { + t = &var->Get(); + } + if (t == nullptr) { + PADDLE_THROW( + platform::errors::NotFound("Can not get the tensor value of Y@GRAD.")); + } + + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace(), layout, + library); +} + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR( + fused_batch_norm_act, ops::FusedBatchNormActOp, + ops::FusedBatchNormActOpMaker, ops::FusedBatchNormActOpInferVarType, + 
ops::FusedBatchNormActGradOpMaker, + ops::FusedBatchNormActGradOpMaker); +REGISTER_OPERATOR(fused_batch_norm_act_grad, ops::FusedBatchNormActGradOp); diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_activation_op.cu new file mode 100644 index 0000000000..2e30865793 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_bn_activation_op.cu @@ -0,0 +1,432 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include "cub/cub.cuh" +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/fused/fused_bn_activation_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/norm_utils.h" +#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/float16.h" + +DECLARE_bool(cudnn_batchnorm_spatial_persistent); + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; +template +using CudnnDataType = platform::CudnnDataType; +template +using BatchNormParamType = typename CudnnDataType::BatchNormParamType; + +template +class FusedBatchNormActKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()), true, + platform::errors::PreconditionNotMet("It must use CUDAPlace.")); + double epsilon = static_cast(ctx.Attr("epsilon")); + float momentum = ctx.Attr("momentum"); + std::string act_type = ctx.Attr("act_type"); + + if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); + + // Get the size for each dimension. + // NHWC [batch_size, in_height, in_width, in_channels] + const auto *x = ctx.Input("X"); + const auto &x_dims = x->dims(); + PADDLE_ENFORCE_EQ(x_dims.size() >= 2 && x_dims.size() <= 5, true, + platform::errors::PreconditionNotMet( + "The Input dim size should be between 2 and 5")); + + const auto *scale = ctx.Input("Scale"); + const auto *bias = ctx.Input("Bias"); + + // Run training mode. + // obtain running mean and running inv var, and see if we need to + // initialize them. 
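// A reviewer clarification on the statistics update below (based on
// cuDNN's documented exponentialAverageFactor semantics, not stated in
// this patch): the running buffers are updated as
//   mean_out = (1 - factor) * mean + factor * batch_mean
// and the kernel later passes factor = this_factor = 1.0 - momentum, so
//   mean_out = momentum * mean + (1 - momentum) * batch_mean,
// which matches the convention of the plain batch_norm op.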
+ auto *mean_out = ctx.Output("MeanOut"); + auto *variance_out = ctx.Output("VarianceOut"); + mean_out->mutable_data>(ctx.GetPlace()); + variance_out->mutable_data>(ctx.GetPlace()); + + auto *saved_mean = ctx.Output("SavedMean"); + auto *saved_variance = ctx.Output("SavedVariance"); + saved_mean->mutable_data>(ctx.GetPlace()); + saved_variance->mutable_data>(ctx.GetPlace()); + + auto *y = ctx.Output("Y"); + y->mutable_data(ctx.GetPlace()); + + int N, C, H, W, D; + const DataLayout data_layout = DataLayout::kNHWC; + ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); + + auto &dev_ctx = ctx.template device_context(); + if ((N * H * W * D) == 1) { + // Only 1 element in normalization dimension, + // skip the batch norm calculation, let y = act(x). + auto x_v = framework::EigenVector::Flatten(*x); + auto y_v = framework::EigenVector::Flatten(*y); + auto &dev = *dev_ctx.eigen_device(); + if (act_type == "relu") { + ReluFunctor()(dev, x_v, y_v); + } else { + PADDLE_THROW( + platform::errors::Unimplemented("Unsupported activation type")); + } + return; + } + + // ------------------- cudnn descriptors --------------------- + auto handle = dev_ctx.cudnn_handle(); + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t bn_param_desc_; + cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; + + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnCreateTensorDescriptor(&data_desc_), + platform::errors::External( + "The error has happened when calling " + "cudnnCreateTensorDescriptor(&data_desc_).")); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_), + platform::errors::External( + "The error has happened when calling " + "cudnnCreateTensorDescriptor(&bn_param_desc_).")); + + VLOG(3) << "Setting descriptors."; + std::vector dims = {N, C, H, W, D}; + std::vector strides = {H * W * D * C, 1, W * D * C, D * C, C}; + + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnSetTensorNdDescriptor( + data_desc_, CudnnDataType::type, + x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()), + platform::errors::External( + "The error has happened when calling cudnnSetTensorNdDescriptor.")); + + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_, + data_desc_, mode_), + platform::errors::External("The error has happened when calling " + "cudnnDeriveBNTensorDescriptor.")); + + double this_factor = 1. - momentum; + cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ACTIVATION; + platform::ScopedActivationDescriptor scope_act_desc; + cudnnActivationDescriptor_t activation_desc_ = + scope_act_desc.descriptor(act_type); + size_t workspace_size = 0; + size_t reserve_space_size = 0; + void *reserve_space_ptr = nullptr; + void *workspace_ptr = nullptr; + Tensor workspace_tensor; + // Create reserve space and workspace for batch norm. + // Create tensor for each batchnorm op, it will be used in the + // backward. Thus this tensor shouldn't be temp. 
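// Reviewer summary: ReserveSpace is the handle that lets the forward and
// backward fused kernels share cuDNN's semi-persistent BN+activation state.
// Its size is queried below with
// cudnnGetBatchNormalizationTrainingExReserveSpaceSize, and the grad kernel
// hands the same buffer back to cudnnBatchNormalizationBackwardEx.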
+ auto *reserve_space = ctx.Output("ReserveSpace"); + PADDLE_ENFORCE_NOT_NULL( + reserve_space, + platform::errors::NotFound( + "The argument ReserveSpace of batch_norm op is not found.")); + + // --------------- cudnn batchnorm workspace --------------- + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload:: + cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( + /*handle=*/handle, + /*mode=*/mode_, + /*bnOps=*/bnOps_, + /*xDesc=*/data_desc_, + /*zDesc=*/nullptr, + /*yDesc=*/data_desc_, + /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, + /*activationDesc=*/activation_desc_, + /*sizeInBytes=*/&workspace_size), + platform::errors::External( + "The error has happened when calling " + "cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize.")); + + // -------------- cudnn batchnorm reserve space -------------- + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnGetBatchNormalizationTrainingExReserveSpaceSize( + /*handle=*/handle, + /*mode=*/mode_, + /*bnOps=*/bnOps_, + /*activationDesc=*/activation_desc_, + /*xDesc=*/data_desc_, + /*sizeInBytes=*/&reserve_space_size), + platform::errors::External( + "The error has happened when calling " + "cudnnGetBatchNormalizationTrainingExReserveSpaceSize.")); + + reserve_space_ptr = reserve_space->mutable_data(ctx.GetPlace(), x->type(), + reserve_space_size); + workspace_ptr = workspace_tensor.mutable_data(ctx.GetPlace(), x->type(), + workspace_size); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnBatchNormalizationForwardTrainingEx( + handle, mode_, bnOps_, CudnnDataType::kOne(), + CudnnDataType::kZero(), data_desc_, x->template data(), + nullptr, nullptr, data_desc_, y->template data(), bn_param_desc_, + scale->template data>(), + bias->template data>(), this_factor, + mean_out->template mutable_data>( + ctx.GetPlace()), + variance_out->template mutable_data>( + ctx.GetPlace()), + epsilon, saved_mean->template mutable_data>( + ctx.GetPlace()), + saved_variance->template mutable_data>( + ctx.GetPlace()), + activation_desc_, workspace_ptr, workspace_size, reserve_space_ptr, + reserve_space_size), + platform::errors::External( + "The error has happened when calling " + "cudnnBatchNormalizationForwardTrainingEx.")); + + // clean when exit. 
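// Reviewer note: descriptor creation and destruction in this kernel are
// manual, so if one of the PADDLE_ENFORCE_CUDA_SUCCESS checks above throws,
// the two destroy calls below are skipped; the non-fused batch_norm CUDA
// kernel uses the same manual pattern.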
+ PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnDestroyTensorDescriptor(data_desc_), + platform::errors::External( + "The error has happened when calling " + "cudnnDestroyTensorDescriptor(data_desc_).")); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_), + platform::errors::External( + "The error has happened when calling " + "cudnnDestroyTensorDescriptor(bn_param_desc_).")); + } +}; + +template +class FusedBatchNormActGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()), true, + platform::errors::PreconditionNotMet("It must use CUDAPlace.")); + double epsilon = static_cast(ctx.Attr("epsilon")); + std::string act_type = ctx.Attr("act_type"); + + const auto *x = ctx.Input("X"); + const auto *y = ctx.Input("Y"); + const auto *d_y = ctx.Input(framework::GradVarName("Y")); + const auto *scale = ctx.Input("Scale"); + const auto *bias = ctx.Input("Bias"); + const auto *reserve_space = ctx.Input("ReserveSpace"); + + const auto &x_dims = x->dims(); + + PADDLE_ENFORCE_EQ(x_dims.size() >= 2 && x_dims.size() <= 5, true, + platform::errors::PreconditionNotMet( + "The Input dim size should be between 2 and 5")); + int N, C, H, W, D; + const DataLayout data_layout = DataLayout::kNHWC; + ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); + + // init output + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_scale = ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + + d_x->mutable_data(ctx.GetPlace()); + PADDLE_ENFORCE_EQ( + d_scale && d_bias, true, + platform::errors::PreconditionNotMet( + "Both the scale grad and the bias grad must not be null.")); + d_scale->mutable_data>(ctx.GetPlace()); + d_bias->mutable_data>(ctx.GetPlace()); + PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL, + platform::errors::PreconditionNotMet( + "The scale only has one dimension.")); + PADDLE_ENFORCE_EQ( + scale->dims()[0], C, + platform::errors::PreconditionNotMet( + "The size of scale is equal to the channel of Input(X).")); + + auto &dev_ctx = ctx.template device_context(); + if ((N * H * W * D) == 1) { + if (act_type == "relu") { + auto x_v = framework::EigenVector::Flatten(*x); + auto y_v = framework::EigenVector::Flatten(*y); + auto dx_v = framework::EigenVector::Flatten(*d_x); + auto dy_v = framework::EigenVector::Flatten(*d_y); + auto &dev = *dev_ctx.eigen_device(); + ReluGradFunctor()(dev, x_v, y_v, dy_v, dx_v); + } else { + PADDLE_THROW( + platform::errors::Unimplemented("Unsupported activation type")); + } + math::SetConstant> + functor; + functor(dev_ctx, d_scale, static_cast>(0)); + functor(dev_ctx, d_bias, static_cast>(0)); + return; + } + + std::vector dims = {N, C, H, W, D}; + std::vector strides = {H * W * C * D, 1, W * D * C, D * C, C}; + // ------------------- cudnn descriptors --------------------- + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t bn_param_desc_; + cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; + + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnCreateTensorDescriptor(&data_desc_), + platform::errors::External( + "The error has happened when calling " + "cudnnCreateTensorDescriptor(&data_desc_).")); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_), + platform::errors::External( + "The error has happened when calling " + 
"cudnnCreateTensorDescriptor(&bn_param_desc_).")); + if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); + + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnSetTensorNdDescriptor( + data_desc_, CudnnDataType::type, + x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()), + platform::errors::External( + "The error has happened when calling cudnnSetTensorNdDescriptor.")); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_, + data_desc_, mode_), + platform::errors::External("The error has happened when calling " + "cudnnDeriveBNTensorDescriptor.")); + + const auto *saved_mean = ctx.Input("SavedMean"); + const auto *saved_var = ctx.Input("SavedVariance"); + const auto *saved_mean_data = + saved_mean->template data>(); + const auto *saved_var_data = + saved_var->template data>(); + + size_t workspace_size = 0; + void *workspace_ptr = nullptr; + Tensor workspace_tensor; + auto reserve_space_size = reserve_space->memory_size(); + cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ACTIVATION; + platform::ScopedActivationDescriptor scope_act_desc; + cudnnActivationDescriptor_t activation_desc_ = + scope_act_desc.descriptor(act_type); + // --------------- cudnn batchnorm workspace --------------- + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnGetBatchNormalizationBackwardExWorkspaceSize( + /*handle=*/dev_ctx.cudnn_handle(), + /*mode=*/mode_, + /*bnOps=*/bnOps_, + /*xDesc=*/data_desc_, + /*yDesc=*/data_desc_, + /*dyDesc=*/data_desc_, + /*dzDesc=*/nullptr, + /*dxDesc=*/data_desc_, + /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, + /*activationDesc=*/activation_desc_, + /*sizeInBytes=*/&workspace_size), + platform::errors::External( + "The error has happened when calling " + "cudnnGetBatchNormalizationBackwardExWorkspaceSize.")); + + workspace_ptr = workspace_tensor.mutable_data(ctx.GetPlace(), x->type(), + workspace_size); + + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnBatchNormalizationBackwardEx( + /*handle=*/dev_ctx.cudnn_handle(), + /*mode=*/mode_, + /*bnOps=*/bnOps_, + /*alphaDataDiff=*/CudnnDataType::kOne(), + /*betaDataDiff=*/CudnnDataType::kZero(), + /*alphaParamDiff=*/CudnnDataType::kOne(), + /*betaParamDiff=*/CudnnDataType::kZero(), + /*xDesc=*/data_desc_, + /*xData=*/x->template data(), + /*yDesc=*/data_desc_, + /*yData=*/y->template data(), + /*dyDesc=*/data_desc_, + /*dyData=*/d_y->template data(), + /*dzDesc=*/nullptr, + /*dzData=*/nullptr, + /*dxDesc=*/data_desc_, + /*dxData=*/d_x->template mutable_data(ctx.GetPlace()), + /*dBnScaleBiasDesc=*/bn_param_desc_, + /*bnScaleData=*/scale->template data>(), + /*bnBiasData=*/bias->template data>(), + /*dBnScaleData=*/d_scale + ->template mutable_data>(ctx.GetPlace()), + /*dBnBiasData=*/d_bias + ->template mutable_data>(ctx.GetPlace()), + /*epsilon=*/epsilon, + /*savedMean=*/saved_mean_data, + /*savedInvVariance=*/saved_var_data, + /*activationDesc=*/activation_desc_, + /*workspace=*/workspace_ptr, + /*workSpaceSizeInBytes=*/workspace_size, + /*reserveSpace=*/const_cast(reserve_space->template data()), + /*reserveSpaceSizeInBytes=*/reserve_space_size), + platform::errors::External("The error has happened when calling " + "cudnnBatchNormalizationBackwardEx.")); + + // clean when exit. 
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyTensorDescriptor(data_desc_),
+        platform::errors::External(
+            "The error has happened when calling "
+            "cudnnDestroyTensorDescriptor(data_desc_)."));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_),
+        platform::errors::External(
+            "The error has happened when calling "
+            "cudnnDestroyTensorDescriptor(bn_param_desc_)."));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+#if CUDNN_VERSION >= 7401
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+REGISTER_OP_CUDA_KERNEL(
+    fused_batch_norm_act,
+    ops::FusedBatchNormActKernel<plat::CUDADeviceContext, float>,
+    ops::FusedBatchNormActKernel<plat::CUDADeviceContext, double>,
+    ops::FusedBatchNormActKernel<plat::CUDADeviceContext, plat::float16>);
+REGISTER_OP_CUDA_KERNEL(
+    fused_batch_norm_act_grad,
+    ops::FusedBatchNormActGradKernel<plat::CUDADeviceContext, float>,
+    ops::FusedBatchNormActGradKernel<plat::CUDADeviceContext, double>,
+    ops::FusedBatchNormActGradKernel<plat::CUDADeviceContext, plat::float16>);
+#endif
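For reference, the forward semantics that fused_batch_norm_act implements for
act_type="relu" in training mode reduce to a few lines of NumPy. This sketch is
not part of the patch; the function name is illustrative, and it assumes NHWC
layout with per-channel statistics, matching the kernels above:

    import numpy as np

    def fused_bn_relu_reference(x, scale, bias, epsilon=1e-5):
        # x: NHWC input; scale, bias: per-channel (C,) parameters.
        # Training-mode batch norm statistics over the N, H, W axes.
        mean = x.mean(axis=(0, 1, 2))
        var = x.var(axis=(0, 1, 2))
        x_hat = (x - mean) / np.sqrt(var + epsilon)
        return np.maximum(scale * x_hat + bias, 0.0)  # ReLU(BN(x))
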
diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.h b/paddle/fluid/operators/fused/fused_bn_activation_op.h
new file mode 100644
index 0000000000..5cfa8def62
--- /dev/null
+++ b/paddle/fluid/operators/fused/fused_bn_activation_op.h
@@ -0,0 +1,109 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include "paddle/fluid/framework/grad_op_desc_maker.h"
+#include "paddle/fluid/framework/op_proto_maker.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/tensor.h"
+
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+
+class FusedBatchNormActOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override;
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override;
+
+  framework::OpKernelType GetKernelTypeForVar(
+      const std::string& var_name, const Tensor& tensor,
+      const framework::OpKernelType& expected_kernel_type) const override;
+};
+
+class FusedBatchNormActGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override;
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override;
+};
+
+class FusedBatchNormActOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override;
+};
+
+template <typename T>
+class FusedBatchNormActGradOpMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  std::unique_ptr<T> Apply() const override {
+    std::unique_ptr<T> op(new T());
+    op->SetType(this->ForwardOpType() + "_grad");
+    op->SetInput("X", this->Input("X"));
+    op->SetInput("Y", this->Output("Y"));
+    op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y"));
+
+    op->SetInput("Scale", this->Input("Scale"));
+    op->SetInput("Bias", this->Input("Bias"));
+    op->SetInput("SavedMean", this->Output("SavedMean"));
+    op->SetInput("SavedVariance", this->Output("SavedVariance"));
+    op->SetInput("ReserveSpace", this->Output("ReserveSpace"));
+
+    op->SetAttrMap(this->Attrs());
+
+    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
+    op->SetOutput(framework::GradVarName("Scale"), this->InputGrad("Scale"));
+    op->SetOutput(framework::GradVarName("Bias"), this->InputGrad("Bias"));
+
+    return op;
+  }
+};
+
+class FusedBatchNormActOpInferVarType
+    : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:
+  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
+      const override {
+    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Y"}};
+  }
+};
+
+template <typename DeviceContext, typename T>
+class FusedBatchNormActKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override;
+};
+
+template <typename DeviceContext, typename T>
+class FusedBatchNormActGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override;
+};
+
+}  // namespace operators
+}  // namespace paddle
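The grad kernel declared above is implemented by the single
cudnnBatchNormalizationBackwardEx call in the .cu file. A rough NumPy sketch of
the same gradients for act_type="relu" (not part of the patch; the names and
the saved inverse-standard-deviation convention are assumptions) is:

    import numpy as np

    def fused_bn_relu_grad_reference(x, y, d_y, scale, saved_mean,
                                     saved_inv_std):
        d_y = d_y * (y > 0)  # ReLU backward: gradients flow where y > 0.
        x_hat = (x - saved_mean) * saved_inv_std
        axes = (0, 1, 2)  # reduce over N, H, W for NHWC
        d_bias = d_y.sum(axis=axes)
        d_scale = (d_y * x_hat).sum(axis=axes)
        # Standard training-mode batch norm backward for the input.
        d_x_hat = d_y * scale
        d_x = saved_inv_std * (d_x_hat - d_x_hat.mean(axis=axes) -
                               x_hat * (d_x_hat * x_hat).mean(axis=axes))
        return d_x, d_scale, d_bias
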
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 77c35466ed..3762b5a419 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -1994,6 +1994,26 @@ All parameter, weight, gradient are variables in Paddle.
                           build_strategy = fluid.BuildStrategy()
                           build_strategy.fuse_elewise_add_act_ops = True
                      )DOC")
+      .def_property(
+          "fuse_bn_act_ops",
+          [](const BuildStrategy &self) { return self.fuse_bn_act_ops_; },
+          [](BuildStrategy &self, bool b) {
+            PADDLE_ENFORCE_EQ(!self.IsFinalized(), true,
+                              platform::errors::PreconditionNotMet(
+                                  "BuildStrategy is finalized."));
+            self.fuse_bn_act_ops_ = b;
+          },
+          R"DOC((bool, optional): fuse_bn_act_ops indicates whether
+                      to fuse batch_norm and activation ops,
+                      which may make the execution faster. Default is False.
+
+                      Examples:
+                          .. code-block:: python
+
+                              import paddle.fluid as fluid
+                              build_strategy = fluid.BuildStrategy()
+                              build_strategy.fuse_bn_act_ops = True
+                     )DOC")
       .def_property(
           "fuse_relu_depthwise_conv",
           [](const BuildStrategy &self) {
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 0c1388b2e6..7e7beb494b 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -182,6 +182,7 @@ list(REMOVE_ITEM TEST_OPS test_basic_gru_unit_op)
 list(REMOVE_ITEM TEST_OPS test_basic_lstm_api)
 list(REMOVE_ITEM TEST_OPS test_basic_lstm_unit_op)
 list(REMOVE_ITEM TEST_OPS test_imperative_debug_string)
+list(REMOVE_ITEM TEST_OPS test_fuse_bn_act_pass)
 
 if (APPLE OR WIN32)
   list(REMOVE_ITEM TEST_OPS test_dataset)
@@ -301,6 +302,7 @@ py_test_modules(test_parallel_executor_seresnext_base_cpu MODULES test_parallel_
 py_test_modules(test_parallel_executor_seresnext_with_reduce_cpu MODULES test_parallel_executor_seresnext_with_reduce_cpu)
 py_test_modules(test_parallel_executor_seresnext_with_fuse_all_reduce_cpu MODULES test_parallel_executor_seresnext_with_fuse_all_reduce_cpu)
 py_test_modules(test_data_norm_op MODULES test_data_norm_op)
+py_test_modules(test_fuse_bn_act_pass MODULES test_fuse_bn_act_pass ENVS FLAGS_cudnn_deterministic=1 FLAGS_cudnn_batchnorm_spatial_persistent=1 FLAGS_conv_workspace_size_limit=1000)
 
 if(NOT WIN32)
     py_test_modules(test_ir_memory_optimize_transformer MODULES test_ir_memory_optimize_transformer)
@@ -330,5 +332,5 @@ set_tests_properties(test_parallel_executor_test_while_train test_parallel_execu
     test_parallel_executor_crf test_sync_batch_norm_op
     test_parallel_executor_feed_persistable_var
     test_parallel_executor_crf_auto_growth test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass
-    test_data_norm_op test_imperative_using_non_zero_gpu
+    test_data_norm_op test_imperative_using_non_zero_gpu test_fuse_bn_act_pass
     test_buffer_shared_memory_reuse_pass PROPERTIES LABELS "RUN_TYPE=DIST")
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py
new file mode 100644
index 0000000000..62eef67a56
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py
@@ -0,0 +1,121 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.fluid as fluid
+import unittest
+
+
+class TestFuseBatchNormActPass(unittest.TestCase):
+    def build_program(self, main_program, startup_program, use_cuda, seed=1):
+        main_program.random_seed = seed
+        startup_program.random_seed = seed
+        with fluid.program_guard(main_program, startup_program):
+            x = fluid.layers.data(name='x', shape=[1, 28, 28], dtype='float32')
+            y = fluid.layers.data(name="y", shape=[1], dtype='int64')
+            hidden1 = fluid.layers.conv2d(
+                input=x,
+                filter_size=3,
+                num_filters=32,
+                stride=1,
+                padding=1,
+                act=None,
+                bias_attr=False,
+                data_format='NHWC')
+            param_attr = fluid.ParamAttr(
+                name='batch_norm_w',
+                initializer=fluid.initializer.Constant(value=1.0))
+            bias_attr = fluid.ParamAttr(
+                name='batch_norm_b',
+                initializer=fluid.initializer.Constant(value=0.0))
+            hidden2 = fluid.layers.batch_norm(
+                input=hidden1,
+                param_attr=param_attr,
+                bias_attr=bias_attr,
+                act='relu',
+                data_layout='NHWC')
+            hidden3 = fluid.layers.fc(input=hidden2, size=128, act='relu')
+            hidden4 = fluid.layers.batch_norm(
+                input=hidden3, act='relu', data_layout='NHWC')
+            prediction = fluid.layers.fc(input=hidden4, size=10, act='softmax')
+            loss = fluid.layers.cross_entropy(input=prediction, label=y)
+            loss = fluid.layers.mean(loss)
+            sgd = fluid.optimizer.SGD(learning_rate=0.001)
+            if use_cuda:
+                sgd = fluid.contrib.mixed_precision.decorate(
+                    sgd, use_dynamic_loss_scaling=True,
+                    init_loss_scaling=128.0)
+            sgd.minimize(loss)
+        return x, y, loss
+
+    def check(self, place, use_cuda):
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        x, y, loss = self.build_program(main_program, startup_program,
+                                        use_cuda)
+        exe = fluid.Executor(place)
+        iters = 10
+        batch_size = 16
+        feeder = fluid.DataFeeder(feed_list=[x, y], place=place)
+
+        # close fused_bn_act_ops
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.fuse_bn_act_ops = False
+        binary = fluid.CompiledProgram(main_program).with_data_parallel(
+            loss_name=loss.name, build_strategy=build_strategy)
+        train_reader = paddle.batch(
+            paddle.dataset.mnist.train(), batch_size=batch_size)
+        loss_vals = []
+        scope = fluid.Scope()
+        with fluid.scope_guard(scope):
+            exe.run(startup_program)
+            for _ in range(iters):
+                data = next(train_reader())
+                loss_v = exe.run(binary,
+                                 feed=feeder.feed(data),
+                                 fetch_list=[loss])
+                loss_vals.append(loss_v[0][0])
+
+        # open fused_bn_act_ops
+        build_strategy_fused = fluid.BuildStrategy()
+        build_strategy_fused.fuse_bn_act_ops = True
+        binary_fused = fluid.CompiledProgram(main_program).with_data_parallel(
+            loss_name=loss.name, build_strategy=build_strategy_fused)
+        train_reader_fused = paddle.batch(
+            paddle.dataset.mnist.train(), batch_size=batch_size)
+        loss_vals_fused = []
+        scope_fused = fluid.Scope()
+        with fluid.scope_guard(scope_fused):
+            exe.run(startup_program)
+            for _ in range(iters):
+                data = next(train_reader_fused())
+                loss_v = exe.run(binary_fused,
+                                 feed=feeder.feed(data),
+                                 fetch_list=[loss])
+                loss_vals_fused.append(loss_v[0][0])
+
+        # check loss
+        for i in range(iters):
+            self.assertAlmostEqual(
+                loss_vals[i], loss_vals_fused[i], delta=1e-5)
+
+    def test_fuse_bn_act_pass_cpu(self):
+        place = fluid.CPUPlace()
+        self.check(place, use_cuda=False)
+
+    def test_fuse_bn_act_pass_cuda(self):
+        if fluid.core.is_compiled_with_cuda():
+            place = fluid.CUDAPlace(0)
+            self.check(place, use_cuda=True)
+
+
+if __name__ == '__main__':
+    unittest.main()
-- 
GitLab