diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index 20cfa75292cf52a01bf794a2714deaac1e821f50..196603bbff1db79e46ebbe8b18f1092fcbaac7f9 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -101,8 +101,6 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
                  "mode.";
       strategy_.fuse_all_optimizer_ops_ = false;
     } else {
-      VLOG(10) << "Add alloc_continuous_space_for_grad_pass";
-      AppendPass("alloc_continuous_space_for_grad_pass");
       // NOTE: fuse_all_xx_ops will count the number of xx operator first,
       // if the number is zero, fuse_all_reduce_ops will do nothing.
      // Currently, only one type of optimization algorithm can be fused.
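Note: the hunk above removes the unconditional registration of alloc_continuous_space_for_grad_pass when optimizer fusion is enabled; as the fuse_optimizer_op_pass.cc changes below show, the fused passes now check at apply time whether a fused gradient already exists and allocate one themselves if it does not. A minimal, std-only C++ sketch of that inversion of responsibility (ToyGraph and the attribute key are hypothetical stand-ins, not Paddle's API):

```cpp
#include <functional>
#include <iostream>
#include <string>
#include <unordered_set>
#include <vector>

// Hypothetical stand-in for an ir::Graph carrying pass-produced attributes.
struct ToyGraph {
  std::unordered_set<std::string> attrs;  // e.g. "kFusedGrads"
};

int main() {
  std::vector<std::function<void(ToyGraph *)>> pipeline;

  // Before this patch the builder always appended the grad-fusing pass first;
  // now it may be absent, so a later pass must not assume its output exists.
  pipeline.push_back([](ToyGraph *g) {
    if (g->attrs.count("kFusedGrads")) {
      std::cout << "reuse the fused gradient buffer\n";
    } else {
      std::cout << "allocate the fused gradient buffer ourselves\n";
    }
  });

  ToyGraph g;  // the grad pass did not run, so no kFusedGrads attribute
  for (auto &pass : pipeline) pass(&g);
}
```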
diff --git a/paddle/fluid/framework/details/fuse_adam_op_pass.cc b/paddle/fluid/framework/details/fuse_adam_op_pass.cc
index 0ef75e319244e2ccc63dfa3f93f0cd764cf67633..f95d93fd5575ae538274c4c0322cf661c631849a 100644
--- a/paddle/fluid/framework/details/fuse_adam_op_pass.cc
+++ b/paddle/fluid/framework/details/fuse_adam_op_pass.cc
@@ -24,7 +24,7 @@ namespace details {
 const std::string FuseAdamOpPass::GetOpType() const { return "adam"; }
 
 const std::vector<std::string> FuseAdamOpPass::GetAuxiliaryVarNames() const {
-  return {"Param", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow"};
+  return {"Moment1", "Moment2", "Beta1Pow", "Beta2Pow"};
 }
 
 void FuseAdamOpPass::FuseOptimizerOps(
@@ -77,16 +77,16 @@ void FuseAdamOpPass::FuseAdamOps(
   VLOG(10) << "Insert adam to graph ";
   OpDesc adam_desc(adam_ops[0]->Op()->Block());
   adam_desc.SetType("adam");
-  adam_desc.SetInput("Param", {fused_vars_name.at("Param")});
-  adam_desc.SetInput("Grad", {fused_vars_name.at("Grad")});
+  adam_desc.SetInput(kParam, {fused_vars_name.at(kParam)});
+  adam_desc.SetInput(kGrad, {fused_vars_name.at(kGrad)});
   adam_desc.SetInput("Moment1", {fused_vars_name.at("Moment1")});
   adam_desc.SetInput("Moment2", {fused_vars_name.at("Moment2")});
 
   // TODO(zcd): The LearningRate, Beta1Pow, Beta2Pow should be equal.
-  adam_desc.SetInput("LearningRate", adam_ops[0]->Op()->Input("LearningRate"));
+  adam_desc.SetInput(kLearningRate, adam_ops[0]->Op()->Input(kLearningRate));
   adam_desc.SetInput("Beta1Pow", adam_ops[0]->Op()->Input("Beta1Pow"));
   adam_desc.SetInput("Beta2Pow", adam_ops[0]->Op()->Input("Beta2Pow"));
 
-  adam_desc.SetOutput("ParamOut", {fused_vars_name.at("Param")});
+  adam_desc.SetOutput("ParamOut", {fused_vars_name.at(kParam)});
   adam_desc.SetOutput("Moment1Out", {fused_vars_name.at("Moment1")});
   adam_desc.SetOutput("Moment2Out", {fused_vars_name.at("Moment2")});
   adam_desc.SetAttr("beta1", beta1);
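Note: since the base FuseOptimizerOpPass now appends kParam and kGrad itself (see fuse_optimizer_op_pass.cc below), each concrete pass lists only the state that is genuinely optimizer-specific, which is why "Param" disappears from the list above. A compilable, std-only sketch of that contract, with hypothetical toy class names:

```cpp
#include <iostream>
#include <string>
#include <vector>

// Toy mirror of the GetAuxiliaryVarNames() contract: the base pass appends
// Param/Grad exactly once, so derived passes return only optimizer state.
struct ToyFusePass {
  virtual ~ToyFusePass() = default;
  virtual std::vector<std::string> AuxVarNames() const = 0;
  std::vector<std::string> AllVarNames() const {
    std::vector<std::string> names = AuxVarNames();
    names.push_back("Param");  // appended by the base class
    names.push_back("Grad");
    return names;
  }
};

struct ToyAdamPass : ToyFusePass {
  std::vector<std::string> AuxVarNames() const override {
    return {"Moment1", "Moment2", "Beta1Pow", "Beta2Pow"};
  }
};

int main() {
  ToyAdamPass adam;
  for (const auto &n : adam.AllVarNames()) std::cout << n << "\n";
}
```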
diff --git a/paddle/fluid/framework/details/fuse_optimizer_op_pass.cc b/paddle/fluid/framework/details/fuse_optimizer_op_pass.cc
index b49f095d428a017dd1a3bed2788a048af9afa6bb..25aa3019d102293725d836cf1f8e9fce8462408b 100644
--- a/paddle/fluid/framework/details/fuse_optimizer_op_pass.cc
+++ b/paddle/fluid/framework/details/fuse_optimizer_op_pass.cc
@@ -29,7 +29,9 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
   auto &local_scopes = Get<const std::vector<Scope *>>(kLocalScopes);
 
   const std::string fuse_op_type = GetOpType();
-  const std::vector<std::string> aux_var_names = GetAuxiliaryVarNames();
+  std::vector<std::string> aux_var_names = GetAuxiliaryVarNames();
+  aux_var_names.emplace_back(kParam);
+  aux_var_names.emplace_back(kGrad);
 
   // Step 1: Get the specified op and auxiliary variables.
   std::vector<ir::Node *> topo_nodes = ir::TopologySortOperations(result);
@@ -61,7 +63,7 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
     result.Set(kFusedVars, new FusedVars);
   }
   std::unordered_map<std::string, std::string> fused_vars_name;
-  fused_vars_name.reserve(aux_var_names.size() + 1);
+  fused_vars_name.reserve(aux_var_names.size());
   auto &fused_var_set = result.Get<FusedVars>(kFusedVars);
   const std::string prefix(kFusedVarNamePrefix);
   // NOTE: the fused_var_name should be unique.
@@ -75,39 +77,103 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
   }
 
   // Step 3: Get the fused Gradient's name
-  auto &params_grads = result.Get<ParamsAndGrads>(kParamsAndGrads);
-  if (!result.Has(kFusedGrads)) {
-    PADDLE_THROW(
-        "The alloc_continuous_space_for_grad_pass should be called before this "
-        "pass.");
-  }
-  auto &fused_grad = result.Get<FusedGrads>(kFusedGrads);
-  auto &fused_vars = result.Get<FusedVars>(kFusedVars);
-  auto iter = std::find(fused_vars.begin(), fused_vars.end(), fused_grad);
-  PADDLE_ENFORCE(iter != fused_vars.end(), "Not find the fused_grad.");
-  fused_vars_name.emplace("Grad", fused_grad);
-
-  // Step 4: Sort the parameters and auxiliary variables according
-  // to parameters' name to make variables' name correspond correctly.
-  PADDLE_ENFORCE(result.Has(kParamsAndGrads), "Does't find kParamsAndGrads.");
-  PADDLE_ENFORCE_EQ(params_grads.size(), aux_var_set.begin()->second.size(),
-                    "The size of params_grads and aux_var_set are not equal.");
-  SortParametersAndAuxVars(params_grads, &aux_var_set, &opt_ops);
-
-  // Step 5: Alloc continuous space for Parameters and AuxiliaryVar(e.g.
+  bool grad_fused = false;
+  if (result.Has(kParamsAndGrads)) {
+    auto &params_grads = result.Get<ParamsAndGrads>(kParamsAndGrads);
+    PADDLE_ENFORCE_EQ(
+        params_grads.size(), aux_var_set.at(kGrad).size(),
+        "The number of gradients and optimizer ops is not equal.");
+    std::unordered_set<std::string> opt_grad_set(aux_var_set.at(kGrad).begin(),
+                                                 aux_var_set.at(kGrad).end());
+    size_t same_grad_num = 0;
+    for (auto &p_g : params_grads) {
+      if (opt_grad_set.count(p_g.second)) {
+        ++same_grad_num;
+      }
+    }
+
+    // NOTE(zcd): the gradients of kParamsAndGrads may differ from kGrad.
+    if (same_grad_num == aux_var_set.at(kGrad).size()) {
+      if (!result.Has(kFusedGrads)) {
+        PADDLE_THROW(
+            "The alloc_continuous_space_for_grad_pass should be called before "
+            "this pass.");
+      }
+      auto &fused_grad = result.Get<FusedGrads>(kFusedGrads);
+      auto &fused_vars = result.Get<FusedVars>(kFusedVars);
+      auto iter = std::find(fused_vars.begin(), fused_vars.end(), fused_grad);
+      PADDLE_ENFORCE(iter != fused_vars.end(), "Cannot find the fused_grad.");
+      fused_vars_name[kGrad] = fused_grad;
+
+      // Sort the parameters and auxiliary variables according
+      // to parameters' name to make variables' name correspond correctly.
+      SortParametersAndAuxVars(params_grads, &aux_var_set, &opt_ops);
+      grad_fused = true;
+    }
+  }
+
+  // Step 4: Alloc continuous space for Parameters and AuxiliaryVar(e.g.
   // Moment1, Moment2, Beta1Pow, Beta2Pow) of all the optimizer ops separately.
+  aux_var_names.pop_back();
+  if (!grad_fused) {
+    InitFusedGradsAndAllocSpaceForGrads(
+        places, local_scopes, aux_var_set.at(kParam), aux_var_set.at(kGrad),
+        fused_vars_name.at(kGrad), &result);
+  }
   InitFusedVarsAndAllocSpaceForVars(places, local_scopes, aux_var_names,
                                     aux_var_set, fused_vars_name);
 
-  // Step 6: Fuse optimizer Ops and Scale Ops
+  // Step 5: Fuse optimizer Ops and Scale Ops
   FuseOptimizerOps(aux_var_set, fused_vars_name, opt_ops, &result);
 
-  // Step 7: Remove optimizer Ops
+  // Step 6: Remove optimizer Ops
   for (auto &opt_op : opt_ops) {
     graph->RemoveNode(opt_op);
   }
 }
 
+void FuseOptimizerOpPass::InitFusedGradsAndAllocSpaceForGrads(
+    const std::vector<platform::Place> &places,
+    const std::vector<Scope *> &local_scopes,
+    const std::vector<std::string> &params,
+    const std::vector<std::string> &grads, const std::string &fused_grad_name,
+    ir::Graph *result) const {
+  // Get Var Nodes
+  std::unordered_map<std::string, ir::Node *> vars;
+  for (ir::Node *node : result->Nodes()) {
+    if (node->IsVar() && node->Var()) {
+      // Note: The graph may contain nodes with the same name. For example, a
+      // parameter is the input of an operator and also the output of the
+      // optimizer;
+      vars.emplace(node->Var()->Name(), node);
+    }
+  }
+  // Init Grads
+  for (auto it = local_scopes.rbegin(); it != local_scopes.rend(); ++it) {
+    auto &scope = *it;
+    VLOG(10) << "Init " << fused_grad_name;
+    PADDLE_ENFORCE(scope->FindVar(fused_grad_name) == nullptr,
+                   "%s has existed in scope.", fused_grad_name);
+    scope->Var(fused_grad_name)->GetMutable<LoDTensor>();
+
+    for (auto &grad_var_name : grads) {
+      auto iter = vars.find(grad_var_name);
+      PADDLE_ENFORCE(iter != vars.end());
+      PADDLE_ENFORCE_NOT_NULL(iter->second->Var());
+      PADDLE_ENFORCE_EQ(iter->second->Var()->GetType(),
+                        proto::VarType::LOD_TENSOR);
+      scope->Var(grad_var_name)->GetMutable<LoDTensor>();
+    }
+  }
+  // Define Ops
+  ProgramDesc program_desc;
+  auto *global_block = program_desc.MutableBlock(0);
+  AppendAllocContinuousSpace(params, grads, fused_grad_name, global_block,
+                             false, false);
+  // Run Ops
+  RunInitOps(places, local_scopes, *global_block);
+}
+
 void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars(
     const std::vector<platform::Place> &places,
     const std::vector<Scope *> &local_scopes,
@@ -115,37 +181,49 @@ void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars(
     const std::unordered_map<std::string, std::vector<std::string>>
        &aux_var_set,
     const std::unordered_map<std::string, std::string> &fused_vars_name)
     const {
-  VLOG(10) << "Init FusedVars.";
-  // Alloc parameters and auxiliary vars in the respective scope.
-  size_t idx = local_scopes.size();
-  for (auto iter = local_scopes.rbegin(); iter != local_scopes.rend();
-       ++iter, --idx) {
-    auto &scope = *iter;
-    for (auto &var_name : aux_var_names) {
-      auto fused_var_name = fused_vars_name.at(var_name);
-      VLOG(10) << "Init " << fused_var_name;
-      PADDLE_ENFORCE(scope->FindVar(fused_var_name) == nullptr,
-                     "%s has exist in scope[%d]", fused_var_name, idx);
-      scope->Var(fused_var_name)->GetMutable<LoDTensor>();
-    }
+  // Init Vars
+  for (auto &var_name : aux_var_names) {
+    auto &fused_var_name = fused_vars_name.at(var_name);
+    InitVars(local_scopes, fused_var_name);
   }
-
+  // Define Ops
   ProgramDesc program_desc;
   auto *global_block = program_desc.MutableBlock(0);
   for (auto &var_name : aux_var_names) {
-    AppendAllocContinuousSpace(aux_var_set.at(var_name),
-                               fused_vars_name.at(var_name), true,
-                               global_block);
+    AppendAllocContinuousSpace(
+        aux_var_set.at(var_name), aux_var_set.at(var_name),
+        fused_vars_name.at(var_name), global_block, true);
   }
+  // Run Ops
+  RunInitOps(places, local_scopes, *global_block);
+}
 
+void FuseOptimizerOpPass::RunInitOps(const std::vector<platform::Place> &places,
+                                     const std::vector<Scope *> &local_scopes,
+                                     const BlockDesc &global_block) const {
   for (size_t i = 0; i < local_scopes.size(); ++i) {
-    for (auto &op_desc : global_block->AllOps()) {
+    for (auto &op_desc : global_block.AllOps()) {
       auto op = OpRegistry::CreateOp(*op_desc);
       op->Run(*local_scopes[i], places[i]);
     }
   }
 }
 
+void FuseOptimizerOpPass::InitVars(const std::vector<Scope *> &local_scopes,
+                                   const std::string &fused_var_name) const {
+  VLOG(10) << "Init FusedVars.";
+  // Alloc parameters and auxiliary vars in the respective scope.
+  size_t idx = local_scopes.size();
+  for (auto iter = local_scopes.rbegin(); iter != local_scopes.rend();
+       ++iter, --idx) {
+    auto &scope = *iter;
+    VLOG(10) << "Init " << fused_var_name;
+    PADDLE_ENFORCE(scope->FindVar(fused_var_name) == nullptr,
+                   "%s has existed in scope[%d]", fused_var_name, idx);
+    scope->Var(fused_var_name)->GetMutable<LoDTensor>();
+  }
+}
+
 void FuseOptimizerOpPass::SortParametersAndAuxVars(
     const std::vector<std::pair<std::string, std::string>> &params_grads,
     std::unordered_map<std::string, std::vector<std::string>> *aux_vars_set,
@@ -203,15 +281,16 @@ void FuseOptimizerOpPass::GetSpecifiedOpsAndVars(
 }
 
 void FuseOptimizerOpPass::AppendAllocContinuousSpace(
-    const std::vector<std::string> &args, const std::string &out_arg,
-    bool copy_data, BlockDesc *global_block) const {
+    const std::vector<std::string> &in_args,
+    const std::vector<std::string> &out_args, const std::string &fused_out_arg,
+    BlockDesc *global_block, bool copy_data, bool check_name) const {
   auto op_desc = global_block->AppendOp();
   op_desc->SetType("alloc_continuous_space");
-  op_desc->SetInput("Input", args);
-  op_desc->SetOutput("Output", args);
-  op_desc->SetOutput("FusedOutput", {out_arg});
+  op_desc->SetInput("Input", in_args);
+  op_desc->SetOutput("Output", out_args);
+  op_desc->SetOutput("FusedOutput", {fused_out_arg});
   op_desc->SetAttr("copy_data", copy_data);
-  op_desc->SetAttr("check_name", true);
+  op_desc->SetAttr("check_name", check_name);
 }
 
 void FuseOptimizerOpPass::InserInputAndOutputForOptOps(
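Note: the reshaped Step 3 reuses the buffer produced by alloc_continuous_space_for_grad_pass only when every gradient consumed by the optimizer ops appears in kParamsAndGrads; otherwise the new InitFusedGradsAndAllocSpaceForGrads allocates one itself. A self-contained sketch of that membership check, simplifying the same_grad_num logic above (names, data, and the duplicate handling are illustrative):

```cpp
#include <iostream>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>

// Reuse the previously fused gradient buffer only if every gradient the
// optimizer ops read shows up among the (param, grad) pairs of the graph.
bool CanReuseFusedGrad(
    const std::vector<std::pair<std::string, std::string>> &params_grads,
    const std::vector<std::string> &opt_grads) {
  std::unordered_set<std::string> opt_grad_set(opt_grads.begin(),
                                               opt_grads.end());
  size_t same_grad_num = 0;
  for (const auto &p_g : params_grads) {
    if (opt_grad_set.count(p_g.second)) ++same_grad_num;
  }
  return same_grad_num == opt_grad_set.size();
}

int main() {
  std::vector<std::pair<std::string, std::string>> pg = {
      {"w0", "w0@GRAD"}, {"w1", "w1@GRAD"}};
  std::cout << CanReuseFusedGrad(pg, {"w0@GRAD", "w1@GRAD"}) << "\n";  // 1
  std::cout << CanReuseFusedGrad(pg, {"w0@GRAD", "other"}) << "\n";    // 0
}
```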
diff --git a/paddle/fluid/framework/details/fuse_optimizer_op_pass.h b/paddle/fluid/framework/details/fuse_optimizer_op_pass.h
index 0240f1594d7ef9d855eb6e96e8e8a32ee1d957ba..47efc1693dd31ca88787da3a9d6d06aa7ef65786 100644
--- a/paddle/fluid/framework/details/fuse_optimizer_op_pass.h
+++ b/paddle/fluid/framework/details/fuse_optimizer_op_pass.h
@@ -27,6 +27,10 @@ namespace paddle {
 namespace framework {
 namespace details {
 
+constexpr char kGrad[] = "Grad";
+constexpr char kParam[] = "Param";
+constexpr char kLearningRate[] = "LearningRate";
+
 class FuseOptimizerOpPass : public ir::Pass {
  protected:
   void ApplyImpl(ir::Graph *graph) const override;
@@ -56,9 +60,18 @@ class FuseOptimizerOpPass : public ir::Pass {
       std::unordered_map<std::string, std::vector<std::string>>
          *aux_args_name) const;
 
-  void AppendAllocContinuousSpace(const std::vector<std::string> &args,
-                                  const std::string &out_arg, bool copy_data,
-                                  BlockDesc *global_block) const;
+  void AppendAllocContinuousSpace(const std::vector<std::string> &in_args,
+                                  const std::vector<std::string> &out_args,
+                                  const std::string &fused_out_arg,
+                                  BlockDesc *global_block, bool copy_data,
+                                  bool check_name = true) const;
+
+  void InitFusedGradsAndAllocSpaceForGrads(
+      const std::vector<platform::Place> &places,
+      const std::vector<Scope *> &local_scopes,
+      const std::vector<std::string> &params,
+      const std::vector<std::string> &grads, const std::string &fused_grad_name,
+      ir::Graph *result) const;
 
   void InitFusedVarsAndAllocSpaceForVars(
       const std::vector<platform::Place> &places,
@@ -68,6 +81,13 @@ class FuseOptimizerOpPass : public ir::Pass {
          &aux_var_set,
       const std::unordered_map<std::string, std::string> &fused_vars_name)
       const;
+
+  void RunInitOps(const std::vector<platform::Place> &places,
+                  const std::vector<Scope *> &local_scopes,
+                  const BlockDesc &global_block) const;
+
+  void InitVars(const std::vector<Scope *> &local_scopes,
+                const std::string &fused_var_name) const;
 };
 
 }  // namespace details
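Note: the three constexpr arrays convert implicitly to std::string at the SetInput/SetOutput and unordered_map call sites, so each slot name is now spelled in exactly one place instead of being repeated as a literal. A small compilable illustration (the fused variable names in the map are invented for the example):

```cpp
#include <iostream>
#include <string>
#include <unordered_map>

// Centralized slot names, as in fuse_optimizer_op_pass.h; a constexpr char
// array converts to std::string wherever one is expected, so "Param" can no
// longer be misspelled at one call site and not another.
constexpr char kGrad[] = "Grad";
constexpr char kParam[] = "Param";
constexpr char kLearningRate[] = "LearningRate";

int main() {
  std::unordered_map<std::string, std::string> fused_vars_name = {
      {kParam, "fused_param_buffer"},  // values invented for the example
      {kGrad, "fused_grad_buffer"}};
  std::cout << fused_vars_name.at(kParam) << "\n";
  std::cout << kLearningRate << "\n";
}
```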
diff --git a/paddle/fluid/framework/details/fuse_sgd_op_pass.cc b/paddle/fluid/framework/details/fuse_sgd_op_pass.cc
index f91c21e3cc869de1a6d67146eb99f27a2ca5497c..2219f3209f77de5cb34abfb9edb8bdea6a8eebb0 100644
--- a/paddle/fluid/framework/details/fuse_sgd_op_pass.cc
+++ b/paddle/fluid/framework/details/fuse_sgd_op_pass.cc
@@ -24,7 +24,7 @@ namespace details {
 const std::string FuseSgdOpPass::GetOpType() const { return "sgd"; }
 
 const std::vector<std::string> FuseSgdOpPass::GetAuxiliaryVarNames() const {
-  return {"Param"};
+  return {};
 }
 
 void FuseSgdOpPass::FuseOptimizerOps(
@@ -50,12 +50,12 @@ void FuseSgdOpPass::FuseSgdOps(
   // Add fused scale
   OpDesc Sgd_desc(sgd_ops[0]->Op()->Block());
   Sgd_desc.SetType("sgd");
-  Sgd_desc.SetInput("Param", {fused_vars_name.at("Param")});
-  Sgd_desc.SetInput("Grad", {fused_vars_name.at("Grad")});
-  Sgd_desc.SetOutput("ParamOut", {fused_vars_name.at("Param")});
+  Sgd_desc.SetInput(kParam, {fused_vars_name.at(kParam)});
+  Sgd_desc.SetInput(kGrad, {fused_vars_name.at(kGrad)});
+  Sgd_desc.SetOutput("ParamOut", {fused_vars_name.at(kParam)});
 
   // TODO(zcd): The LearningRate, Beta1Pow, Beta2Pow should be equal.
-  Sgd_desc.SetInput("LearningRate", sgd_ops[0]->Op()->Input("LearningRate"));
+  Sgd_desc.SetInput(kLearningRate, sgd_ops[0]->Op()->Input(kLearningRate));
 
   // NOTE: multi_devices_pass requires that every op should have a role.
   Sgd_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role);
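Note on what the fused sgd op ultimately buys: once parameters and gradients each occupy one contiguous buffer, the per-parameter updates collapse into a single loop over the flat buffer, i.e. one kernel launch per device instead of one per parameter. A std-only illustration of the arithmetic, not Paddle's actual kernel:

```cpp
#include <iostream>
#include <vector>

// One pass over the flat buffer replaces N small axpy-style updates:
// param[i] -= lr * grad[i] for every element of every fused tensor.
void FusedSgd(std::vector<float> *params, const std::vector<float> &grads,
              float lr) {
  for (size_t i = 0; i < params->size(); ++i) {
    (*params)[i] -= lr * grads[i];
  }
}

int main() {
  // Two "tensors" of sizes 3 and 2, already laid out back to back.
  std::vector<float> params = {1.f, 1.f, 1.f, 2.f, 2.f};
  std::vector<float> grads = {0.5f, 0.5f, 0.5f, 1.f, 1.f};
  FusedSgd(&params, grads, /*lr=*/0.1f);
  for (float p : params) std::cout << p << " ";  // 0.95 0.95 0.95 1.9 1.9
  std::cout << "\n";
}
```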