From a2be4b4d91461d9037bc0fdb9b63a8cd5fc14b1e Mon Sep 17 00:00:00 2001 From: chengduo Date: Tue, 23 Apr 2019 22:32:40 +0800 Subject: [PATCH] Add fuse momentum ops (#16745) * Add fuse momentum ops --- paddle/fluid/framework/details/CMakeLists.txt | 4 +- .../fluid/framework/details/build_strategy.cc | 34 +- .../framework/details/fuse_adam_op_pass.cc | 337 +++++++++--------- .../framework/details/fuse_adam_op_pass.h | 55 --- .../details/fuse_momentum_op_pass.cc | 94 +++++ .../details/fuse_optimizer_op_pass.cc | 20 +- .../framework/details/fuse_sgd_op_pass.cc | 79 ++-- .../framework/details/fuse_sgd_op_pass.h | 50 --- .../unittests/test_fuse_optimizer_pass.py | 27 +- 9 files changed, 363 insertions(+), 337 deletions(-) delete mode 100644 paddle/fluid/framework/details/fuse_adam_op_pass.h create mode 100644 paddle/fluid/framework/details/fuse_momentum_op_pass.cc delete mode 100644 paddle/fluid/framework/details/fuse_sgd_op_pass.h diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index ae89f03186..2f6a816cbf 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -14,6 +14,7 @@ cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc cc_library(alloc_continuous_space_for_grad_pass SRCS alloc_continuous_space_for_grad_pass.cc DEPS graph graph_helper) cc_library(fuse_adam_op_pass SRCS fuse_adam_op_pass.cc fuse_optimizer_op_pass.cc DEPS graph graph_helper) cc_library(fuse_sgd_op_pass SRCS fuse_sgd_op_pass.cc fuse_optimizer_op_pass.cc DEPS graph graph_helper) +cc_library(fuse_momentum_op_pass SRCS fuse_momentum_op_pass.cc fuse_optimizer_op_pass.cc DEPS graph graph_helper) cc_library(record_skip_memory_opt_vars_pass SRCS record_skip_memory_opt_vars_pass.cc DEPS graph graph_helper) @@ -126,4 +127,5 @@ cc_library(build_strategy SRCS build_strategy.cc DEPS fuse_relu_depthwise_conv_pass memory_optimize_pass lock_free_optimize_pass 
alloc_continuous_space_for_grad_pass fuse_all_reduce_op_pass - fuse_adam_op_pass fuse_sgd_op_pass record_skip_memory_opt_vars_pass) + fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass + record_skip_memory_opt_vars_pass) diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index e5dc89ee69..26680eeb29 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -57,7 +57,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { AppendPass("record_skip_memory_opt_vars_pass"); if (strategy_.enable_sequential_execution_) { - VLOG(10) << "Add sequential_execution_pass"; + VLOG(5) << "Add sequential_execution_pass"; AppendPass("sequential_execution_pass"); } @@ -68,7 +68,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // Add op fusion. if (strategy.fuse_relu_depthwise_conv_) { - VLOG(10) << "Add fuse_relu_depthwise_conv_pass"; + VLOG(5) << "Add fuse_relu_depthwise_conv_pass"; AppendPass("fuse_relu_depthwise_conv_pass"); } @@ -80,19 +80,19 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // Add automatically inplace. if (strategy_.enable_inplace_) { - VLOG(10) << "Add inplace_pass"; + VLOG(5) << "Add inplace_pass"; AppendPass("inplace_pass"); } if (strategy_.fuse_elewise_add_act_ops_) { - VLOG(10) << "Add fuse_elewise_add_act_pass"; + VLOG(5) << "Add fuse_elewise_add_act_pass"; AppendPass("fuse_elewise_add_act_pass"); } // for single card training, fuse_all_reduce_ops is unnecessary. // alloc_continuous_space_for_grad_pass should be before of MultiDevPass. 
if (strategy_.fuse_all_reduce_ops_) { - VLOG(10) << "Add alloc_continuous_space_for_grad_pass"; + VLOG(5) << "Add alloc_continuous_space_for_grad_pass"; AppendPass("alloc_continuous_space_for_grad_pass"); } @@ -107,10 +107,12 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // NOTE: fuse_all_xx_ops will count the number of xx operator first, // if the number is zero, fuse_all_reduce_ops will do nothing. // Currently, only one type of optimization algorithm can be fused. - VLOG(10) << "Add fuse_adam_op_pass"; + VLOG(5) << "Add fuse_adam_op_pass"; AppendPass("fuse_adam_op_pass"); - VLOG(10) << "Add fuse_sgd_op_pass"; + VLOG(5) << "Add fuse_sgd_op_pass"; AppendPass("fuse_sgd_op_pass"); + VLOG(5) << "Add fuse_momentum_op_pass"; + AppendPass("fuse_momentum_op_pass"); } } @@ -139,7 +141,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // A side-effect of that, memory optimize cannot forsee the fetched vars // , so fetchlist should be set persistable before call the Run interface. if (strategy_.memory_optimize_) { - VLOG(10) << "Add memory_optimize_pass"; + VLOG(5) << "Add memory_optimize_pass"; AppendPass("memory_optimize_pass"); } @@ -147,7 +149,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // all original and fused operators. But no operators can be enabled this // attr if putting it after MultiDevPass. if (strategy_.cache_runtime_context_) { - VLOG(10) << "Add runtime_context_cache_pass"; + VLOG(5) << "Add runtime_context_cache_pass"; AppendPass("runtime_context_cache_pass"); } @@ -161,7 +163,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { if (strategy_.fuse_all_reduce_ops_) { // NOTE: fuse_all_reduce_ops will count the number of all_reduce operator // first, if the number is zero, fuse_all_reduce_ops will do nothing. 
- VLOG(10) << "Add fuse_all_reduce_op_pass"; + VLOG(5) << "Add fuse_all_reduce_op_pass"; AppendPass("fuse_all_reduce_op_pass"); } @@ -182,12 +184,12 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { if (!strategy_.enable_parallel_graph_ && (SeqOnlyAllReduceOps(strategy_) || strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce)) { - VLOG(10) << "Add all_reduce_deps_pass"; + VLOG(5) << "Add all_reduce_deps_pass"; AppendPass("all_reduce_deps_pass"); } if (strategy_.remove_unnecessary_lock_) { - VLOG(10) << "Add modify_op_lock_and_record_event_pass"; + VLOG(5) << "Add modify_op_lock_and_record_event_pass"; AppendPass("modify_op_lock_and_record_event_pass"); } @@ -202,16 +204,16 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { if (strategy_.async_mode_) { multi_devices_pass = AppendPass("async_multi_devices_pass").get(); } else if (strategy_.is_distribution_) { - VLOG(10) + VLOG(5) << "Add dist_multi_devices_pass, multi device parameter server mode"; multi_devices_pass = AppendPass("dist_multi_devices_pass").get(); } else { if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { - VLOG(10) << "Add all_reduce_mode_multi_devices_pass"; + VLOG(5) << "Add all_reduce_mode_multi_devices_pass"; multi_devices_pass = AppendPass("all_reduce_mode_multi_devices_pass").get(); } else if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kReduce) { - VLOG(10) << "Add reduce_mode_multi_devices_pass"; + VLOG(5) << "Add reduce_mode_multi_devices_pass"; multi_devices_pass = AppendPass("reduce_mode_multi_devices_pass").get(); } else { PADDLE_THROW("Unknown reduce strategy."); @@ -277,6 +279,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, } else if (pass->Type() == "alloc_continuous_space_for_grad_pass" || pass->Type() == "fuse_adam_op_pass" || pass->Type() == "fuse_sgd_op_pass" || + pass->Type() == "fuse_momentum_op_pass" || pass->Type() == "fuse_all_reduce_op_pass") { pass->Erase(kPlaces); pass->SetNotOwned>(kPlaces, &places); @@ 
-341,6 +344,7 @@ USE_PASS(alloc_continuous_space_for_grad_pass); USE_PASS(graph_to_program_pass); USE_PASS(fuse_adam_op_pass); USE_PASS(fuse_sgd_op_pass); +USE_PASS(fuse_momentum_op_pass); USE_PASS(fuse_all_reduce_op_pass); USE_PASS(runtime_context_cache_pass); USE_PASS(expected_kernel_cache_pass); diff --git a/paddle/fluid/framework/details/fuse_adam_op_pass.cc b/paddle/fluid/framework/details/fuse_adam_op_pass.cc index f95d93fd55..26315009f8 100644 --- a/paddle/fluid/framework/details/fuse_adam_op_pass.cc +++ b/paddle/fluid/framework/details/fuse_adam_op_pass.cc @@ -11,9 +11,15 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - -#include "paddle/fluid/framework/details/fuse_adam_op_pass.h" #include +#include +#include +#include +#include +#include "paddle/fluid/framework/details/build_strategy.h" +#include "paddle/fluid/framework/details/fuse_optimizer_op_pass.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/op_registry.h" @@ -21,175 +27,182 @@ namespace paddle { namespace framework { namespace details { -const std::string FuseAdamOpPass::GetOpType() const { return "adam"; } - -const std::vector FuseAdamOpPass::GetAuxiliaryVarNames() const { - return {"Moment1", "Moment2", "Beta1Pow", "Beta2Pow"}; -} - -void FuseAdamOpPass::FuseOptimizerOps( - const std::unordered_map> - &aux_var_set, - const std::unordered_map &fused_vars_name, - const std::vector &adam_ops, ir::Graph *graph) const { - FuseAdamOps(aux_var_set, fused_vars_name, adam_ops, graph); - FuseScaleOps(aux_var_set.at("Beta1Pow"), fused_vars_name.at("Beta1Pow"), - adam_ops, graph); - FuseScaleOps(aux_var_set.at("Beta2Pow"), fused_vars_name.at("Beta2Pow"), - adam_ops, graph); -} - -void FuseAdamOpPass::FuseAdamOps( - const 
std::unordered_map> &vars_set, - const std::unordered_map &fused_vars_name, - const std::vector &adam_ops, ir::Graph *graph) const { - PADDLE_ENFORCE_GT(adam_ops.size(), static_cast(0)); - - // Check attributions - // NOTE: If new attribution is added, the following code maybe need change. - int op_role = boost::get( - adam_ops[0]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); - float beta1 = boost::get(adam_ops[0]->Op()->GetAttr("beta1")); - float beta2 = boost::get(adam_ops[0]->Op()->GetAttr("beta2")); - float epsilon = boost::get(adam_ops[0]->Op()->GetAttr("epsilon")); - bool lazy_mode = boost::get(adam_ops[0]->Op()->GetAttr("lazy_mode")); - int64_t min_row_size_to_use_multithread = boost::get( - adam_ops[0]->Op()->GetAttr("min_row_size_to_use_multithread")); - for (auto &adam_op : adam_ops) { - PADDLE_ENFORCE_EQ(beta1, - boost::get(adam_op->Op()->GetAttr("beta1"))); - PADDLE_ENFORCE_EQ(beta2, - boost::get(adam_op->Op()->GetAttr("beta2"))); - PADDLE_ENFORCE_EQ(epsilon, - boost::get(adam_op->Op()->GetAttr("epsilon"))); - PADDLE_ENFORCE_EQ(lazy_mode, - boost::get(adam_op->Op()->GetAttr("lazy_mode"))); - PADDLE_ENFORCE_EQ(min_row_size_to_use_multithread, - boost::get(adam_op->Op()->GetAttr( - "min_row_size_to_use_multithread"))); - PADDLE_ENFORCE_EQ(op_role, boost::get(adam_op->Op()->GetAttr( - OpProtoAndCheckerMaker::OpRoleAttrName()))); +class FuseAdamOpPass : public FuseOptimizerOpPass { + private: + const std::string GetOpType() const { return "adam"; } + + const std::vector GetAuxiliaryVarNames() const { + return {"Moment1", "Moment2", "Beta1Pow", "Beta2Pow"}; } - // NOTE: fused_var is only exist in scope, so the graph doesn't have fused_var - // node. 
- - VLOG(10) << "Insert adam to graph "; - OpDesc adam_desc(adam_ops[0]->Op()->Block()); - adam_desc.SetType("adam"); - adam_desc.SetInput(kParam, {fused_vars_name.at(kParam)}); - adam_desc.SetInput(kGrad, {fused_vars_name.at(kGrad)}); - adam_desc.SetInput("Moment1", {fused_vars_name.at("Moment1")}); - adam_desc.SetInput("Moment2", {fused_vars_name.at("Moment2")}); - // TODO(zcd): The LearningRate, Beta1Pow, Beta2Pow should be equal. - adam_desc.SetInput(kLearningRate, adam_ops[0]->Op()->Input(kLearningRate)); - adam_desc.SetInput("Beta1Pow", adam_ops[0]->Op()->Input("Beta1Pow")); - adam_desc.SetInput("Beta2Pow", adam_ops[0]->Op()->Input("Beta2Pow")); - - adam_desc.SetOutput("ParamOut", {fused_vars_name.at(kParam)}); - adam_desc.SetOutput("Moment1Out", {fused_vars_name.at("Moment1")}); - adam_desc.SetOutput("Moment2Out", {fused_vars_name.at("Moment2")}); - adam_desc.SetAttr("beta1", beta1); - adam_desc.SetAttr("beta2", beta2); - adam_desc.SetAttr("epsilon", epsilon); - adam_desc.SetAttr("lazy_mode", lazy_mode); - adam_desc.SetAttr("min_row_size_to_use_multithread", - min_row_size_to_use_multithread); - adam_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role); - - auto adam_node = graph->CreateOpNode(&adam_desc); - - InserInputAndOutputForOptOps(adam_ops, adam_node); -} - -void FuseAdamOpPass::FuseScaleOps(const std::vector &beta_name, - const std::string &fused_var_name, - const std::vector &adam_ops, - ir::Graph *graph) const { - PADDLE_ENFORCE_EQ(beta_name.size(), adam_ops.size()); - const std::string scale_op_name = "scale"; - - // Get the scale_ops of dealing the adam's beta var. 
- std::vector scale_ops; - scale_ops.reserve(beta_name.size()); - for (size_t i = 0; i < adam_ops.size(); ++i) { - auto &beta_1_pow_name = beta_name[i]; - auto beta_pow_iter = std::find_if( - adam_ops[i]->inputs.begin(), adam_ops[i]->inputs.end(), - [&beta_name, &beta_1_pow_name](ir::Node *var_node) -> bool { - return var_node->Var() && var_node->Var()->Name() == beta_1_pow_name; - }); - PADDLE_ENFORCE(beta_pow_iter != adam_ops[i]->inputs.end()); - - auto beta_pow_node = *beta_pow_iter; - auto scale_op_iter = std::find_if( - beta_pow_node->outputs.begin(), beta_pow_node->outputs.end(), - [&scale_op_name](ir::Node *op_node) -> bool { - return op_node->Op() && op_node->Op()->Type() == scale_op_name; - }); - PADDLE_ENFORCE(scale_op_iter != beta_pow_node->outputs.end()); - - scale_ops.emplace_back(*scale_op_iter); + void FuseOptimizerOps( + const std::unordered_map> + &aux_var_set, + const std::unordered_map &fused_vars_name, + const std::vector &adam_ops, ir::Graph *graph) const { + FuseAdamOps(aux_var_set, fused_vars_name, adam_ops, graph); + FuseScaleOps(aux_var_set.at("Beta1Pow"), fused_vars_name.at("Beta1Pow"), + adam_ops, graph); + FuseScaleOps(aux_var_set.at("Beta2Pow"), fused_vars_name.at("Beta2Pow"), + adam_ops, graph); } - PADDLE_ENFORCE_EQ(scale_ops.size(), beta_name.size()); - - // Check attributions - // NOTE: If new attribution is added, the following code maybe need change. 
- int op_role = boost::get( - scale_ops[0]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); - float scale = boost::get(scale_ops[0]->Op()->GetAttr("scale")); - float bias = boost::get(scale_ops[0]->Op()->GetAttr("bias")); - bool bias_after_scale = - boost::get(scale_ops[0]->Op()->GetAttr("bias_after_scale")); - for (auto &scale_op : scale_ops) { - PADDLE_ENFORCE_EQ(scale, - boost::get(scale_op->Op()->GetAttr("scale"))); - PADDLE_ENFORCE_EQ(bias, boost::get(scale_op->Op()->GetAttr("bias"))); - PADDLE_ENFORCE_EQ( - bias_after_scale, - boost::get(scale_op->Op()->GetAttr("bias_after_scale"))); - PADDLE_ENFORCE_EQ(op_role, boost::get(scale_op->Op()->GetAttr( - OpProtoAndCheckerMaker::OpRoleAttrName()))); + + void FuseAdamOps( + const std::unordered_map> &vars_set, + const std::unordered_map &fused_vars_name, + const std::vector &adam_ops, ir::Graph *graph) const { + PADDLE_ENFORCE_GT(adam_ops.size(), static_cast(0)); + + // Check attributions + // NOTE: If new attribution is added, the following code maybe need change. 
+ int op_role = boost::get( + adam_ops[0]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); + float beta1 = boost::get(adam_ops[0]->Op()->GetAttr("beta1")); + float beta2 = boost::get(adam_ops[0]->Op()->GetAttr("beta2")); + float epsilon = boost::get(adam_ops[0]->Op()->GetAttr("epsilon")); + bool lazy_mode = boost::get(adam_ops[0]->Op()->GetAttr("lazy_mode")); + int64_t min_row_size_to_use_multithread = boost::get( + adam_ops[0]->Op()->GetAttr("min_row_size_to_use_multithread")); + for (auto &adam_op : adam_ops) { + PADDLE_ENFORCE_EQ(beta1, + boost::get(adam_op->Op()->GetAttr("beta1"))); + PADDLE_ENFORCE_EQ(beta2, + boost::get(adam_op->Op()->GetAttr("beta2"))); + PADDLE_ENFORCE_EQ(epsilon, + boost::get(adam_op->Op()->GetAttr("epsilon"))); + PADDLE_ENFORCE_EQ(lazy_mode, + boost::get(adam_op->Op()->GetAttr("lazy_mode"))); + PADDLE_ENFORCE_EQ(min_row_size_to_use_multithread, + boost::get(adam_op->Op()->GetAttr( + "min_row_size_to_use_multithread"))); + PADDLE_ENFORCE_EQ(op_role, + boost::get(adam_op->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName()))); + } + + // NOTE: fused_var is only exist in scope, so the graph doesn't have + // fused_var node. + + VLOG(7) << "Insert adam to graph "; + OpDesc adam_desc(adam_ops[0]->Op()->Block()); + adam_desc.SetType("adam"); + adam_desc.SetInput(kParam, {fused_vars_name.at(kParam)}); + adam_desc.SetInput(kGrad, {fused_vars_name.at(kGrad)}); + adam_desc.SetInput("Moment1", {fused_vars_name.at("Moment1")}); + adam_desc.SetInput("Moment2", {fused_vars_name.at("Moment2")}); + // TODO(zcd): The LearningRate, Beta1Pow, Beta2Pow should be equal. 
+ adam_desc.SetInput(kLearningRate, adam_ops[0]->Op()->Input(kLearningRate)); + adam_desc.SetInput("Beta1Pow", adam_ops[0]->Op()->Input("Beta1Pow")); + adam_desc.SetInput("Beta2Pow", adam_ops[0]->Op()->Input("Beta2Pow")); + + adam_desc.SetOutput("ParamOut", {fused_vars_name.at(kParam)}); + adam_desc.SetOutput("Moment1Out", {fused_vars_name.at("Moment1")}); + adam_desc.SetOutput("Moment2Out", {fused_vars_name.at("Moment2")}); + adam_desc.SetAttr("beta1", beta1); + adam_desc.SetAttr("beta2", beta2); + adam_desc.SetAttr("epsilon", epsilon); + adam_desc.SetAttr("lazy_mode", lazy_mode); + adam_desc.SetAttr("min_row_size_to_use_multithread", + min_row_size_to_use_multithread); + adam_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role); + + auto adam_node = graph->CreateOpNode(&adam_desc); + + InserInputAndOutputForOptOps(adam_ops, adam_node); } - // NOTE: fused_var is only exist in scope, so the graph doesn't have fused_var - // node. - - VLOG(10) << "Insert fused scale to graph."; - OpDesc scale_desc(scale_ops[0]->Op()->Block()); - scale_desc.SetType("scale"); - scale_desc.SetInput("X", {fused_var_name}); - scale_desc.SetOutput("Out", {fused_var_name}); - scale_desc.SetAttr("scale", scale); - scale_desc.SetAttr("bias", bias); - scale_desc.SetAttr("bias_after_scale", bias_after_scale); - scale_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role); - auto scale_node = graph->CreateOpNode(&scale_desc); - - for (auto scale_op : scale_ops) { - // set inputs - scale_node->inputs.insert(scale_node->inputs.begin(), - scale_op->inputs.begin(), scale_op->inputs.end()); - for (auto &input : scale_op->inputs) { - std::replace(input->outputs.begin(), input->outputs.end(), scale_op, - scale_node); + void FuseScaleOps(const std::vector &beta_name, + const std::string &fused_var_name, + const std::vector &adam_ops, + ir::Graph *graph) const { + PADDLE_ENFORCE_EQ(beta_name.size(), adam_ops.size()); + const std::string scale_op_name = "scale"; + + // Get the 
scale_ops of dealing the adam's beta var. + std::vector scale_ops; + scale_ops.reserve(beta_name.size()); + for (size_t i = 0; i < adam_ops.size(); ++i) { + auto &beta_1_pow_name = beta_name[i]; + auto beta_pow_iter = std::find_if( + adam_ops[i]->inputs.begin(), adam_ops[i]->inputs.end(), + [&beta_name, &beta_1_pow_name](ir::Node *var_node) -> bool { + return var_node->Var() && + var_node->Var()->Name() == beta_1_pow_name; + }); + PADDLE_ENFORCE(beta_pow_iter != adam_ops[i]->inputs.end()); + + auto beta_pow_node = *beta_pow_iter; + auto scale_op_iter = std::find_if( + beta_pow_node->outputs.begin(), beta_pow_node->outputs.end(), + [&scale_op_name](ir::Node *op_node) -> bool { + return op_node->Op() && op_node->Op()->Type() == scale_op_name; + }); + PADDLE_ENFORCE(scale_op_iter != beta_pow_node->outputs.end()); + + scale_ops.emplace_back(*scale_op_iter); } - // set outputs - scale_node->outputs.insert(scale_node->outputs.begin(), - scale_op->outputs.begin(), - scale_op->outputs.end()); - for (auto &output : scale_op->outputs) { - std::replace(output->inputs.begin(), output->inputs.end(), scale_op, - scale_node); + PADDLE_ENFORCE_EQ(scale_ops.size(), beta_name.size()); + + // Check attributions + // NOTE: If new attribution is added, the following code maybe need change. 
+ int op_role = boost::get( + scale_ops[0]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); + float scale = boost::get(scale_ops[0]->Op()->GetAttr("scale")); + float bias = boost::get(scale_ops[0]->Op()->GetAttr("bias")); + bool bias_after_scale = + boost::get(scale_ops[0]->Op()->GetAttr("bias_after_scale")); + for (auto &scale_op : scale_ops) { + PADDLE_ENFORCE_EQ(scale, + boost::get(scale_op->Op()->GetAttr("scale"))); + PADDLE_ENFORCE_EQ(bias, + boost::get(scale_op->Op()->GetAttr("bias"))); + PADDLE_ENFORCE_EQ( + bias_after_scale, + boost::get(scale_op->Op()->GetAttr("bias_after_scale"))); + PADDLE_ENFORCE_EQ(op_role, + boost::get(scale_op->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName()))); } - } - // Delete scale_ops - for (auto &scale_op : scale_ops) { - graph->RemoveNode(scale_op); - } -} + // NOTE: fused_var is only exist in scope, so the graph doesn't have + // fused_var node. + + VLOG(7) << "Insert fused scale to graph."; + OpDesc scale_desc(scale_ops[0]->Op()->Block()); + scale_desc.SetType("scale"); + scale_desc.SetInput("X", {fused_var_name}); + scale_desc.SetOutput("Out", {fused_var_name}); + scale_desc.SetAttr("scale", scale); + scale_desc.SetAttr("bias", bias); + scale_desc.SetAttr("bias_after_scale", bias_after_scale); + scale_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role); + auto scale_node = graph->CreateOpNode(&scale_desc); + + for (auto scale_op : scale_ops) { + // set inputs + scale_node->inputs.insert(scale_node->inputs.begin(), + scale_op->inputs.begin(), + scale_op->inputs.end()); + for (auto &input : scale_op->inputs) { + std::replace(input->outputs.begin(), input->outputs.end(), scale_op, + scale_node); + } + // set outputs + scale_node->outputs.insert(scale_node->outputs.begin(), + scale_op->outputs.begin(), + scale_op->outputs.end()); + for (auto &output : scale_op->outputs) { + std::replace(output->inputs.begin(), output->inputs.end(), scale_op, + scale_node); + } + } + // Delete scale_ops + for 
(auto &scale_op : scale_ops) { + graph->RemoveNode(scale_op); + } + } +}; } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/fuse_adam_op_pass.h b/paddle/fluid/framework/details/fuse_adam_op_pass.h deleted file mode 100644 index 5866c37552..0000000000 --- a/paddle/fluid/framework/details/fuse_adam_op_pass.h +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include -#include -#include "paddle/fluid/framework/details/build_strategy.h" -#include "paddle/fluid/framework/details/fuse_optimizer_op_pass.h" -#include "paddle/fluid/framework/details/multi_devices_helper.h" -#include "paddle/fluid/framework/ir/graph.h" - -namespace paddle { -namespace framework { -namespace details { - -class FuseAdamOpPass : public FuseOptimizerOpPass { - private: - virtual const std::string GetOpType() const; - - virtual const std::vector GetAuxiliaryVarNames() const; - - // Fuse Adam Ops and Scale Ops which are used to update "Beta1Pow", "Beta2Pow" - virtual void FuseOptimizerOps( - const std::unordered_map> &vars_set, - const std::unordered_map &fused_vars_name, - const std::vector &adam_ops, ir::Graph *graph) const; - - void FuseAdamOps( - const std::unordered_map> &vars_set, - const std::unordered_map &fused_vars_name, - const std::vector &adam_ops, ir::Graph *graph) const; - - void FuseScaleOps(const std::vector &aux_var_set, - const std::string &fused_var_name, - const std::vector &adam_ops, - ir::Graph *graph) const; -}; - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/fuse_momentum_op_pass.cc b/paddle/fluid/framework/details/fuse_momentum_op_pass.cc new file mode 100644 index 0000000000..c287cdfd09 --- /dev/null +++ b/paddle/fluid/framework/details/fuse_momentum_op_pass.cc @@ -0,0 +1,94 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include "paddle/fluid/framework/details/build_strategy.h" +#include "paddle/fluid/framework/details/fuse_optimizer_op_pass.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace framework { +namespace details { + +class FuseMomentumOpPass : public FuseOptimizerOpPass { + private: + virtual const std::string GetOpType() const { return "momentum"; } + + virtual const std::vector GetAuxiliaryVarNames() const { + return {"Velocity"}; + } + + // Fuse Momentum Ops + virtual void FuseOptimizerOps( + const std::unordered_map> &vars_set, + const std::unordered_map &fused_vars_name, + const std::vector &momentum_ops, ir::Graph *graph) const { + PADDLE_ENFORCE_GT(momentum_ops.size(), static_cast(0)); + + // Check attributions + // NOTE: If new attribution is added, the following code maybe need change. + int op_role = boost::get(momentum_ops[0]->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())); + float mu = boost::get(momentum_ops[0]->Op()->GetAttr("mu")); + bool use_nesterov = + boost::get(momentum_ops[0]->Op()->GetAttr("use_nesterov")); + + for (auto &momentum_op : momentum_ops) { + PADDLE_ENFORCE_EQ(mu, + boost::get(momentum_op->Op()->GetAttr("mu"))); + PADDLE_ENFORCE_EQ( + use_nesterov, + boost::get(momentum_op->Op()->GetAttr("use_nesterov"))); + PADDLE_ENFORCE_EQ(op_role, + boost::get(momentum_op->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName()))); + } + + // NOTE: fused_var is only exist in scope, so the graph doesn't have + // fused_var node. 
+ + VLOG(7) << "Insert momentum to graph "; + OpDesc momentum_desc(momentum_ops[0]->Op()->Block()); + momentum_desc.SetType("momentum"); + momentum_desc.SetInput(kParam, {fused_vars_name.at(kParam)}); + momentum_desc.SetInput(kGrad, {fused_vars_name.at(kGrad)}); + momentum_desc.SetInput("Velocity", {fused_vars_name.at("Velocity")}); + // TODO(zcd): The LearningRate should be equal. + momentum_desc.SetInput(kLearningRate, + momentum_ops[0]->Op()->Input(kLearningRate)); + + momentum_desc.SetOutput("ParamOut", {fused_vars_name.at(kParam)}); + momentum_desc.SetOutput("VelocityOut", {fused_vars_name.at("Velocity")}); + momentum_desc.SetAttr("mu", mu); + momentum_desc.SetAttr("use_nesterov", use_nesterov); + momentum_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role); + + auto momentum_node = graph->CreateOpNode(&momentum_desc); + + InserInputAndOutputForOptOps(momentum_ops, momentum_node); + } +}; + +} // namespace details +} // namespace framework +} // namespace paddle + +REGISTER_PASS(fuse_momentum_op_pass, + paddle::framework::details::FuseMomentumOpPass) + .RequirePassAttr(paddle::framework::details::kPlaces) + .RequirePassAttr(paddle::framework::details::kLocalScopes); diff --git a/paddle/fluid/framework/details/fuse_optimizer_op_pass.cc b/paddle/fluid/framework/details/fuse_optimizer_op_pass.cc index 25aa3019d1..262d968c3b 100644 --- a/paddle/fluid/framework/details/fuse_optimizer_op_pass.cc +++ b/paddle/fluid/framework/details/fuse_optimizer_op_pass.cc @@ -42,15 +42,14 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { &aux_var_set); } - VLOG(10) << "Find " << fuse_op_type << " operators: " << opt_ops.size(); + VLOG(6) << "Find " << fuse_op_type << " operators: " << opt_ops.size(); if (opt_ops.size() == 0) { return; } if (result.Has(kFusedOptType)) { - VLOG(10) - << "Currently only support fusing one type optimizer op. Has fused " - << result.Get(kFusedOptType); + VLOG(6) << "Currently only support fusing one type optimizer op. 
Has fused " + << result.Get(kFusedOptType); return; } else { result.Set(kFusedOptType, new FusedOptType); @@ -70,7 +69,7 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { for (auto &var_name : aux_var_names) { auto fused_var_name = prefix + "_" + fuse_op_type + "_" + var_name + "_" + aux_var_set[var_name][0]; - VLOG(10) << fused_var_name; + VLOG(6) << var_name << ": " << fused_var_name; fused_vars_name.emplace(var_name, fused_var_name); PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0); fused_var_set.insert(fused_var_name); @@ -151,7 +150,7 @@ void FuseOptimizerOpPass::InitFusedGradsAndAllocSpaceForGrads( // Init Grads for (auto it = local_scopes.rbegin(); it != local_scopes.rend(); ++it) { auto &scope = *it; - VLOG(10) << "Init " << fused_grad_name; + VLOG(6) << "Init: " << fused_grad_name; PADDLE_ENFORCE(scope->FindVar(fused_grad_name) == nullptr, "%s has existed in scope.", fused_grad_name); scope->Var(fused_grad_name)->GetMutable(); @@ -211,13 +210,12 @@ void FuseOptimizerOpPass::RunInitOps(const std::vector &places, void FuseOptimizerOpPass::InitVars(const std::vector &local_scopes, const std::string &fused_var_name) const { - VLOG(10) << "Init FusedVars."; // Alloc parameters and auxiliary vars in the respective scope. 
size_t idx = local_scopes.size(); for (auto iter = local_scopes.rbegin(); iter != local_scopes.rend(); ++iter, --idx) { auto &scope = *iter; - VLOG(10) << "Init " << fused_var_name; + VLOG(6) << "Init: " << fused_var_name; PADDLE_ENFORCE(scope->FindVar(fused_var_name) == nullptr, "%s has exist in scope[%d]", fused_var_name, idx); scope->Var(fused_var_name)->GetMutable(); @@ -253,7 +251,7 @@ void FuseOptimizerOpPass::SortParametersAndAuxVars( for (auto &var_name : aux_vars.second) { out << var_name << " "; } - VLOG(10) << aux_vars.first << ": " << out.str(); + VLOG(6) << aux_vars.first << ": " << out.str(); } std::vector sorted_ops; @@ -271,12 +269,14 @@ void FuseOptimizerOpPass::GetSpecifiedOpsAndVars( const { if (node->Op()->Type() != op_type) return; + std::stringstream out; for (auto &var_n : aux_vars_name) { auto arg_names = node->Op()->Input(var_n); PADDLE_ENFORCE_EQ(arg_names.size(), static_cast(1)); (*aux_args_name)[var_n].emplace_back(arg_names[0]); - VLOG(10) << var_n << ", " << arg_names[0]; + out << var_n << ", " << arg_names[0] << "; "; } + VLOG(7) << out.str(); ops->emplace_back(node); } diff --git a/paddle/fluid/framework/details/fuse_sgd_op_pass.cc b/paddle/fluid/framework/details/fuse_sgd_op_pass.cc index 2219f3209f..4dd1860e25 100644 --- a/paddle/fluid/framework/details/fuse_sgd_op_pass.cc +++ b/paddle/fluid/framework/details/fuse_sgd_op_pass.cc @@ -11,60 +11,61 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
- -#include "paddle/fluid/framework/details/fuse_sgd_op_pass.h" #include +#include +#include +#include +#include + +#include "paddle/fluid/framework/details/build_strategy.h" +#include "paddle/fluid/framework/details/fuse_optimizer_op_pass.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/op_registry.h" - namespace paddle { namespace framework { namespace details { -const std::string FuseSgdOpPass::GetOpType() const { return "sgd"; } - -const std::vector FuseSgdOpPass::GetAuxiliaryVarNames() const { - return {}; -} - -void FuseSgdOpPass::FuseOptimizerOps( - const std::unordered_map> - &aux_var_set, - const std::unordered_map &fused_vars_name, - const std::vector &sgd_ops, ir::Graph *graph) const { - FuseSgdOps(aux_var_set, fused_vars_name, sgd_ops, graph); -} +class FuseSgdOpPass : public FuseOptimizerOpPass { + private: + virtual const std::string GetOpType() const { return "sgd"; } -void FuseSgdOpPass::FuseSgdOps( - const std::unordered_map> &vars_set, - const std::unordered_map &fused_vars_name, - const std::vector &sgd_ops, ir::Graph *graph) const { - PADDLE_ENFORCE_GT(sgd_ops.size(), static_cast(0)); + virtual const std::vector GetAuxiliaryVarNames() const { + return {}; + } - // NOTE: fused_var is only exist in scope, so the graph doesn't have fused_var - // node. 
+ // Fuse Sgd Ops + virtual void FuseOptimizerOps( + const std::unordered_map> &vars_set, + const std::unordered_map &fused_vars_name, + const std::vector &sgd_ops, ir::Graph *graph) const { + PADDLE_ENFORCE_GT(sgd_ops.size(), static_cast(0)); - int op_role = boost::get( - sgd_ops[0]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); - VLOG(10) << "Insert sgd to graph "; - // Add fused scale - OpDesc Sgd_desc(sgd_ops[0]->Op()->Block()); - Sgd_desc.SetType("sgd"); - Sgd_desc.SetInput(kParam, {fused_vars_name.at(kParam)}); - Sgd_desc.SetInput(kGrad, {fused_vars_name.at(kGrad)}); - Sgd_desc.SetOutput("ParamOut", {fused_vars_name.at(kParam)}); + // NOTE: fused_var is only exist in scope, so the graph doesn't have + // fused_var node. - // TODO(zcd): The LearningRate, Beta1Pow, Beta2Pow should be equal. - Sgd_desc.SetInput(kLearningRate, sgd_ops[0]->Op()->Input(kLearningRate)); + int op_role = boost::get( + sgd_ops[0]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); + VLOG(7) << "Insert sgd to graph "; + // Add fused scale + OpDesc Sgd_desc(sgd_ops[0]->Op()->Block()); + Sgd_desc.SetType("sgd"); + Sgd_desc.SetInput(kParam, {fused_vars_name.at(kParam)}); + Sgd_desc.SetInput(kGrad, {fused_vars_name.at(kGrad)}); + Sgd_desc.SetOutput("ParamOut", {fused_vars_name.at(kParam)}); - // NOTE: multi_devices_pass requires that every op should have a role. - Sgd_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role); + // TODO(zcd): The LearningRate should be equal. + Sgd_desc.SetInput(kLearningRate, sgd_ops[0]->Op()->Input(kLearningRate)); - auto sgd_node = graph->CreateOpNode(&Sgd_desc); + // NOTE: multi_devices_pass requires that every op should have a role. 
+ Sgd_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role); - InserInputAndOutputForOptOps(sgd_ops, sgd_node); -} + auto sgd_node = graph->CreateOpNode(&Sgd_desc); + InserInputAndOutputForOptOps(sgd_ops, sgd_node); + } +}; } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/fuse_sgd_op_pass.h b/paddle/fluid/framework/details/fuse_sgd_op_pass.h deleted file mode 100644 index b3aa6a203b..0000000000 --- a/paddle/fluid/framework/details/fuse_sgd_op_pass.h +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include -#include -#include "paddle/fluid/framework/details/build_strategy.h" -#include "paddle/fluid/framework/details/fuse_optimizer_op_pass.h" -#include "paddle/fluid/framework/details/multi_devices_helper.h" -#include "paddle/fluid/framework/ir/graph.h" - -namespace paddle { -namespace framework { -namespace details { - -class FuseSgdOpPass : public FuseOptimizerOpPass { - private: - virtual const std::string GetOpType() const; - - virtual const std::vector GetAuxiliaryVarNames() const; - - // Fuse Sgd Ops - virtual void FuseOptimizerOps( - const std::unordered_map> &vars_set, - const std::unordered_map &fused_vars_name, - const std::vector &sgd_ops, ir::Graph *graph) const; - - void FuseSgdOps( - const std::unordered_map> &vars_set, - const std::unordered_map &fused_vars_name, - const std::vector &sgd_ops, ir::Graph *graph) const; -}; - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py index 510be19af4..b92324d8a7 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py @@ -31,18 +31,17 @@ class TestFuseAdamOps(TestParallelExecutorBase): if use_cuda and not core.is_compiled_with_cuda(): return img, label = init_data() + feed_dict = {"image": img, "label": label} not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence( model, - feed_dict={"image": img, - "label": label}, + feed_dict=feed_dict, use_cuda=use_cuda, fuse_all_optimizer_ops=False, memory_opt=False, # avoid the gradient's name changed in Python side. 
optimizer=optimizer) fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence( model, - feed_dict={"image": img, - "label": label}, + feed_dict=feed_dict, use_cuda=use_cuda, fuse_all_optimizer_ops=True, memory_opt=False, # avoid the gradient's name changed in Python side. @@ -63,7 +62,7 @@ class TestFuseAdamOps(TestParallelExecutorBase): class TestFuseSGDOps(TestFuseAdamOps): - def sgd_optimizer(self, learning_rate=1e-4): + def sgd_optimizer(self, learning_rate=1e-3): return fluid.optimizer.SGD(learning_rate=learning_rate) def test_simple_fc_with_fuse_op(self): @@ -79,5 +78,23 @@ class TestFuseSGDOps(TestFuseAdamOps): fc_with_batchnorm, False, optimizer=self.sgd_optimizer) +class TestFuseMomentumOps(TestFuseAdamOps): + def momentum_optimizer(self, learning_rate=1e-3): + return fluid.optimizer.Momentum( + learning_rate=learning_rate, momentum=0.1) + + def test_simple_fc_with_fuse_op(self): + self._compare_fused_optimizer_ops( + simple_fc_net, True, optimizer=self.momentum_optimizer) + self._compare_fused_optimizer_ops( + simple_fc_net, False, optimizer=self.momentum_optimizer) + + def test_batchnorm_fc_with_fuse_op(self): + self._compare_fused_optimizer_ops( + fc_with_batchnorm, True, optimizer=self.momentum_optimizer) + self._compare_fused_optimizer_ops( + fc_with_batchnorm, False, optimizer=self.momentum_optimizer) + + if __name__ == '__main__': unittest.main() -- GitLab