未验证 提交 9f004548 编写于 作者: C Chen Weihang 提交者: GitHub

Add pre-condition check for fuse optimizer op pass (#21005) (#21305)

* add pre condition check for fuse optimizer op pass, test=develop

* add log & set init to zero, test=develop

* fix test_fuse_all_reduce_pass failed, test=develop

* polish details, test=develop

* refine PADDLE_ENFORCE & remove needless VLOG, test=develop

* refactor op check method, test=develop
上级 9110c896
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include <unordered_set> #include <unordered_set>
#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -40,7 +41,10 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { ...@@ -40,7 +41,10 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
for (auto &node : topo_nodes) { for (auto &node : topo_nodes) {
if (node->Op()->Type() == fuse_op_type) { if (node->Op()->Type() == fuse_op_type) {
auto grad_name = node->Op()->Input(kGrad); auto grad_name = node->Op()->Input(kGrad);
PADDLE_ENFORCE_EQ(grad_name.size(), static_cast<size_t>(1)); PADDLE_ENFORCE_EQ(grad_name.size(), static_cast<size_t>(1),
"The %s operator has multiple gradient input. Expected "
"it to only have one gradient input.",
fuse_op_type);
if (IsLoDTensorType(GetTypeOfVar(vars_info, grad_name[0]))) { if (IsLoDTensorType(GetTypeOfVar(vars_info, grad_name[0]))) {
opt_nodes.emplace_back(node); opt_nodes.emplace_back(node);
} }
...@@ -50,30 +54,24 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { ...@@ -50,30 +54,24 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
VLOG(6) << "Find " << fuse_op_type << " operators : " << opt_ops_num VLOG(6) << "Find " << fuse_op_type << " operators : " << opt_ops_num
<< ", and " << opt_nodes.size() << " for dense gradients."; << ", and " << opt_nodes.size() << " for dense gradients.";
if (opt_nodes.size() == 0 || result.Has(details::kFusedOptType)) {
if (result.Has(details::kFusedOptType)) { if (opt_nodes.size() == 0) return;
auto &opt_type = if (result.Has(details::kFusedOptType)) {
result.Get<details::FusedOptType>(details::kFusedOptType); auto &opt_type = result.Get<details::FusedOptType>(details::kFusedOptType);
VLOG(6) << "Currently only support fusing one type optimizer op. " VLOG(6) << "Currently only support fusing one type of optimizer op, "
"Has fused " << opt_type << " has been fused.";
<< opt_type;
}
return; return;
} }
// There should not have no-ctr-var between the op_nodes that link the op_node // There should not have no-ctr-var between the opt_nodes that link the
// of op_nodes. // op_node
// of opt_nodes.
if (HasVarDepsBetweenOps(topo_nodes, opt_nodes)) { if (HasVarDepsBetweenOps(topo_nodes, opt_nodes)) {
VLOG(6) << "There are interdependent variables among these optimization " VLOG(6) << "There are interdependent variables among these optimization "
"operators, which can not be handled well at present."; "operators, which can not be handled well at present.";
return; return;
} }
LOG(WARNING) << "Find " << fuse_op_type << " operators : " << opt_ops_num
<< ", and " << opt_nodes.size() << " for dense gradients. "
<< "To make the speed faster, those optimization are fused "
"during training.";
result.Set(details::kFusedOptType, new details::FusedOptType); result.Set(details::kFusedOptType, new details::FusedOptType);
result.Get<details::FusedOptType>(details::kFusedOptType) = fuse_op_type; result.Get<details::FusedOptType>(details::kFusedOptType) = fuse_op_type;
if (!result.Has(details::kProgramDescs)) { if (!result.Has(details::kProgramDescs)) {
...@@ -85,8 +83,8 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { ...@@ -85,8 +83,8 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
if (!result.Has(details::kFusedVars)) { if (!result.Has(details::kFusedVars)) {
result.Set(details::kFusedVars, new details::FusedVars); result.Set(details::kFusedVars, new details::FusedVars);
} }
std::unordered_map<std::string, std::vector<std::string>> aux_var_set; std::unordered_map<std::string, std::vector<std::string>> aux_var_map;
GetSpecifiedOpsAndVars(aux_var_names, opt_nodes, &aux_var_set); GetFusingVarNamesMap(aux_var_names, opt_nodes, &aux_var_map);
std::unordered_map<std::string, std::string> fused_vars_name; std::unordered_map<std::string, std::string> fused_vars_name;
fused_vars_name.reserve(aux_var_names.size()); fused_vars_name.reserve(aux_var_names.size());
auto &fused_var_set = result.Get<details::FusedVars>(details::kFusedVars); auto &fused_var_set = result.Get<details::FusedVars>(details::kFusedVars);
...@@ -94,9 +92,10 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { ...@@ -94,9 +92,10 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
for (auto &var_name : aux_var_names) { for (auto &var_name : aux_var_names) {
// NOTE: the fused_var_name should be unique. // NOTE: the fused_var_name should be unique.
auto fused_var_name = prefix + "_" + fuse_op_type + "_" + var_name + "_" + auto fused_var_name = prefix + "_" + fuse_op_type + "_" + var_name + "_" +
aux_var_set[var_name][0]; aux_var_map[var_name][0];
VLOG(6) << var_name << ": " << fused_var_name; VLOG(6) << var_name << ": " << fused_var_name;
PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0); PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0,
"The fused variable already existed.");
fused_var_set.insert(fused_var_name); fused_var_set.insert(fused_var_name);
fused_vars_name.emplace(var_name, fused_var_name); fused_vars_name.emplace(var_name, fused_var_name);
} }
...@@ -109,16 +108,16 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { ...@@ -109,16 +108,16 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
auto &params_and_dense_grads = auto &params_and_dense_grads =
result.Get<details::ParamsAndGrads>(details::kParamsAndDenseGrads); result.Get<details::ParamsAndGrads>(details::kParamsAndDenseGrads);
PADDLE_ENFORCE_LE( PADDLE_ENFORCE_LE(
params_and_dense_grads.size(), aux_var_set.at(kGrad).size(), params_and_dense_grads.size(), aux_var_map.at(kGrad).size(),
"The number of dense gradients should be little than optimizer ops."); "The number of dense gradients should be little than optimizer ops.");
std::unordered_set<std::string> opt_grad_set(aux_var_set.at(kGrad).size()); std::unordered_set<std::string> opt_grad_set(aux_var_map.at(kGrad).size());
for (auto &p_g : params_and_dense_grads) { for (auto &p_g : params_and_dense_grads) {
opt_grad_set.insert(p_g.second); opt_grad_set.insert(p_g.second);
} }
std::vector<size_t> new_grad_idx; std::vector<size_t> new_grad_idx;
for (size_t idx = 0; idx < aux_var_set.at(kGrad).size(); ++idx) { for (size_t idx = 0; idx < aux_var_map.at(kGrad).size(); ++idx) {
auto &grad = aux_var_set.at(kGrad).at(idx); auto &grad = aux_var_map.at(kGrad).at(idx);
if (!opt_grad_set.count(grad)) { if (!opt_grad_set.count(grad)) {
new_grad_idx.emplace_back(idx); new_grad_idx.emplace_back(idx);
} }
...@@ -137,20 +136,22 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { ...@@ -137,20 +136,22 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
auto &fused_grad = result.Get<details::FusedGrads>(details::kFusedGrads); auto &fused_grad = result.Get<details::FusedGrads>(details::kFusedGrads);
PADDLE_ENFORCE_NE(fused_grad.size(), 0, PADDLE_ENFORCE_NE(fused_grad.size(), 0,
"The fused gradient should not be empty."); "The fused gradient should not be empty.");
PADDLE_ENFORCE_EQ(fused_grad.size(), 1, if (fused_grad.size() > 1) {
"Because the dtype of those gradients " // Note(chenweihang): Because the dtype of those gradients is not
"is not unified, so the number of fused gradients is " // unified,so the number of fused gradients is more than one,
"more than one, but it is not supported currently."); // but it is not supported currently.
return;
}
auto &fused_vars = result.Get<details::FusedVars>(details::kFusedVars); auto &fused_vars = result.Get<details::FusedVars>(details::kFusedVars);
auto iter = auto iter =
std::find(fused_vars.begin(), fused_vars.end(), fused_grad.front()); std::find(fused_vars.begin(), fused_vars.end(), fused_grad.front());
PADDLE_ENFORCE_EQ(iter != fused_vars.end(), true, PADDLE_ENFORCE_EQ(iter != fused_vars.end(), true,
"Not find the fused_grad."); "Not found the fused gradient variable.");
fused_vars_name[kGrad] = fused_grad.front(); fused_vars_name[kGrad] = fused_grad.front();
// Sort the parameters and auxiliary variables according // Sort the parameters and auxiliary variables according
// to parameters' name to make variables' name correspond correctly. // to parameters' name to make variables' name correspond correctly.
SortParametersAndAuxVars(params_and_dense_grads, &aux_var_set, SortParametersAndAuxVars(params_and_dense_grads, &aux_var_map,
&opt_nodes); &opt_nodes);
grad_fused = true; grad_fused = true;
} else { } else {
...@@ -158,33 +159,54 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { ...@@ -158,33 +159,54 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
if (new_grad_idx.size() == 1) return; if (new_grad_idx.size() == 1) return;
// NOTE(zcd): If the gradients of backward stage and optimization stage // NOTE(zcd): If the gradients of backward stage and optimization stage
// have diff, Only take care of the the gradient of optimization stage. // have diff, Only take care of the the gradient of optimization stage.
GradientsFilter(new_grad_idx, &opt_nodes, &aux_var_set); GradientsFilter(new_grad_idx, &opt_nodes, &aux_var_map);
} }
} }
// Check dtype // Pass pre-condition check: check dtype of fusing vars
auto dtype = GetDtypeOfVar(vars_info, aux_var_set.at(kParam).front()); auto fusing_var_dtype =
for (auto vars : aux_var_set) { GetDtypeOfVar(vars_info, aux_var_map.at(kParam).front());
for (auto vars : aux_var_map) {
for (auto &var_name : vars.second) { for (auto &var_name : vars.second) {
PADDLE_ENFORCE_EQ(dtype, GetDtypeOfVar(vars_info, var_name)); if (fusing_var_dtype != GetDtypeOfVar(vars_info, var_name)) {
// Note(chenweihang): Currently the fuse_optimizer_ops strategy
// in mixed precision scenarios is not yet supported.
return;
}
}
}
// Pass pre-condition check: gradients generated op kernel
auto fusing_grad_var_names = aux_var_map.at(kGrad);
for (auto grad_var_name : fusing_grad_var_names) {
if (!GradGeneratedOpKernelCheck(vars_info, grad_var_name)) {
// Note(chenweihang): Currently the fuse_optimizer_ops strategy is risky
// when an operator that generates a gradient does not have kernels for
// both CPU and GPU devices, so fusion is skipped in that case.
return;
} }
} }
LOG(WARNING) << "Find " << fuse_op_type << " operators : " << opt_ops_num
<< ", and " << opt_nodes.size() << " for dense gradients. "
<< "To make the speed faster, those optimization are fused "
"during training.";
// Step 4: Alloc continuous space for Parameters and AuxiliaryVar(e.g. // Step 4: Alloc continuous space for Parameters and AuxiliaryVar(e.g.
// Moment1, Moment2, Beta1Pow, Beta2Pow) of all the optimizer ops // Moment1, Moment2, Beta1Pow, Beta2Pow) of all the optimizer ops
// separately. // separately.
if (!grad_fused) { if (!grad_fused) {
InitFusedGradsAndAllocSpaceForGrads( FuseGradientsToContinuousSpace(
aux_var_set.at(kParam), aux_var_set.at(kGrad), aux_var_map.at(kParam), aux_var_map.at(kGrad),
fused_vars_name.at(kGrad), dtype, &result); fused_vars_name.at(kGrad), fusing_var_dtype, &result);
} }
aux_var_names.pop_back(); aux_var_names.pop_back();
InitFusedVarsAndAllocSpaceForVars(aux_var_names, aux_var_set, fused_vars_name, FuseVarsToContinuousSpace(aux_var_names, aux_var_map, fused_vars_name,
dtype, &result); fusing_var_dtype, &result);
// Step 5: Fuse optimizer Ops and Scale Ops // Step 5: Fuse optimizer Ops and Scale Ops
auto *fused_opt_node = auto *fused_opt_node =
FuseOptimizerOps(aux_var_set, fused_vars_name, opt_nodes, &result); FuseOptimizerOps(aux_var_map, fused_vars_name, opt_nodes, &result);
InsertInputAndOutputForFusedOpNode(opt_nodes, graph, fused_opt_node); InsertInputAndOutputForFusedOpNode(opt_nodes, graph, fused_opt_node);
// Step 6: Remove optimizer Ops // Step 6: Remove optimizer Ops
...@@ -231,11 +253,54 @@ bool FuseOptimizerOpPass::HasVarDepsBetweenOps( ...@@ -231,11 +253,54 @@ bool FuseOptimizerOpPass::HasVarDepsBetweenOps(
return false; return false;
} }
// Reports whether the kernels registered for `op_type` cover both a CPU place
// and a GPU place.
//
// op_type: operator type name used as the key into the kernel registry
//   returned by OperatorWithKernel::AllOpKernels().
// Returns true when at least one registered kernel is on a CPU place and at
// least one is on a GPU place. An op type that has no entry in the registry
// is not checked and also yields true.
bool FuseOptimizerOpPass::OpWithKernelSupportCPUAndGPU(
    const std::string &op_type) const {
  auto &kernel_registry = OperatorWithKernel::AllOpKernels();
  auto registered = kernel_registry.find(op_type);
  // Ops without any registered kernel are skipped: treat them as passing.
  if (registered == kernel_registry.end()) {
    return true;
  }

  bool support_cpu = false;
  bool support_gpu = false;
  // Scan every registered kernel and remember which places were seen.
  for (auto &kernel_entry : registered->second) {
    auto &kernel_place = kernel_entry.first.place_;
    if (platform::is_cpu_place(kernel_place)) {
      support_cpu = true;
    }
    if (platform::is_gpu_place(kernel_place)) {
      support_gpu = true;
    }
  }
  VLOG(6) << "Op check: " << op_type << ", support CPU: " << support_cpu
          << ", support GPU: " << support_gpu;
  return support_cpu && support_gpu;
}
// Checks that every operator producing the given gradient variable passes
// OpWithKernelSupportCPUAndGPU.
//
// vars_info: maps a variable name to the graph nodes that carry that name.
// grad_var_name: name of the gradient variable to inspect. It must be a key
//   of vars_info; the lookup uses at(), which throws std::out_of_range for a
//   missing key.
// Returns false as soon as one producing op type fails the kernel check,
// true otherwise (including when the variable has no producing op).
bool FuseOptimizerOpPass::GradGeneratedOpKernelCheck(
    const std::unordered_map<std::string, std::vector<ir::Node *>> &vars_info,
    const std::string &grad_var_name) const {
  // Bind by reference: the node list is only read, so copying the vector
  // would be wasted work.
  const auto &grad_var_nodes = vars_info.at(grad_var_name);

  // Collect the distinct op types first so each type is checked only once.
  std::unordered_set<std::string> check_op_set;
  for (auto *var_node : grad_var_nodes) {
    for (auto *in_node : var_node->inputs) {
      // Only inputs that are op nodes with a non-null Op() are considered.
      if (in_node->IsOp() && in_node->Op()) {
        check_op_set.emplace(in_node->Op()->Type());
      }
    }
  }

  // Iterate by const reference to avoid copying each op type string.
  for (const auto &op_type : check_op_set) {
    if (!OpWithKernelSupportCPUAndGPU(op_type)) {
      return false;
    }
  }
  return true;
}
void FuseOptimizerOpPass::GradientsFilter( void FuseOptimizerOpPass::GradientsFilter(
const std::vector<size_t> &new_grad_idx, std::vector<Node *> *opt_nodes, const std::vector<size_t> &new_grad_idx, std::vector<Node *> *opt_nodes,
std::unordered_map<std::string, std::vector<std::string>> *aux_var_set) std::unordered_map<std::string, std::vector<std::string>> *aux_var_map)
const { const {
for (auto &aux_vars : *aux_var_set) { for (auto &aux_vars : *aux_var_map) {
std::vector<std::string> sorted_vars; std::vector<std::string> sorted_vars;
sorted_vars.reserve(aux_vars.second.size()); sorted_vars.reserve(aux_vars.second.size());
for (size_t i : new_grad_idx) { for (size_t i : new_grad_idx) {
...@@ -257,7 +322,7 @@ void FuseOptimizerOpPass::GradientsFilter( ...@@ -257,7 +322,7 @@ void FuseOptimizerOpPass::GradientsFilter(
std::swap(*opt_nodes, sorted_ops); std::swap(*opt_nodes, sorted_ops);
} }
void FuseOptimizerOpPass::InitFusedGradsAndAllocSpaceForGrads( void FuseOptimizerOpPass::FuseGradientsToContinuousSpace(
const std::vector<std::string> &params, const std::vector<std::string> &params,
const std::vector<std::string> &grads, const std::string &fused_grad_name, const std::vector<std::string> &grads, const std::string &fused_grad_name,
const proto::VarType::Type &dtype, ir::Graph *result) const { const proto::VarType::Type &dtype, ir::Graph *result) const {
...@@ -268,11 +333,12 @@ void FuseOptimizerOpPass::InitFusedGradsAndAllocSpaceForGrads( ...@@ -268,11 +333,12 @@ void FuseOptimizerOpPass::InitFusedGradsAndAllocSpaceForGrads(
// The Gradients should not be reused during memory optimization. // The Gradients should not be reused during memory optimization.
for (auto &grad_var_name : grads) { for (auto &grad_var_name : grads) {
auto iter = vars_info.find(grad_var_name); auto iter = vars_info.find(grad_var_name);
PADDLE_ENFORCE_EQ(iter != vars_info.end(), true, "%s is not found.", PADDLE_ENFORCE_EQ(iter != vars_info.end(), true,
grad_var_name); "The gradient variable %s is not found.", grad_var_name);
PADDLE_ENFORCE_EQ(!iter->second.empty(), true, "%s is not found.", PADDLE_ENFORCE_EQ(!iter->second.empty(), true,
grad_var_name); "The gradient var node %s is not found.", grad_var_name);
PADDLE_ENFORCE_NOT_NULL(iter->second.front()->Var()); PADDLE_ENFORCE_NOT_NULL(iter->second.front()->Var(),
"The gradient var node is null.");
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
IsLoDTensorType(iter->second.front()->Var()->GetType()), true, IsLoDTensorType(iter->second.front()->Var()->GetType()), true,
"Currently the gradient type only should be LoDTensor when " "Currently the gradient type only should be LoDTensor when "
...@@ -287,8 +353,8 @@ void FuseOptimizerOpPass::InitFusedGradsAndAllocSpaceForGrads( ...@@ -287,8 +353,8 @@ void FuseOptimizerOpPass::InitFusedGradsAndAllocSpaceForGrads(
ProgramDesc &program_desc = ProgramDesc &program_desc =
result->Get<details::ProgramDescs>(details::kProgramDescs).back(); result->Get<details::ProgramDescs>(details::kProgramDescs).back();
auto *global_block = program_desc.MutableBlock(0); auto *global_block = program_desc.MutableBlock(0);
AppendAllocContinuousSpace(params, grads, fused_grad_name, dtype, AppendCoalesceTensorOp(params, grads, fused_grad_name, dtype, global_block,
global_block, false, false); false, false);
} }
std::unordered_map<std::string, std::vector<Node *>> std::unordered_map<std::string, std::vector<Node *>>
...@@ -297,7 +363,7 @@ FuseOptimizerOpPass::GetVarInfo(const Graph &result) const { ...@@ -297,7 +363,7 @@ FuseOptimizerOpPass::GetVarInfo(const Graph &result) const {
for (Node *node : result.Nodes()) { for (Node *node : result.Nodes()) {
if (node->IsVar() && node->Var()) { if (node->IsVar() && node->Var()) {
// Note: The graph may have the same name node. For example, parameter // Note: The graph may have the same name node. For example, parameter
// is the input of operator and it also is the output of optimizer; // is the input of optimizer and it also is the output of optimizer;
vars[node->Var()->Name()].emplace_back(node); vars[node->Var()->Name()].emplace_back(node);
} }
} }
...@@ -314,11 +380,12 @@ const VarDesc *FuseOptimizerOpPass::GetVarDescFromVarsInfo( ...@@ -314,11 +380,12 @@ const VarDesc *FuseOptimizerOpPass::GetVarDescFromVarsInfo(
const std::unordered_map<std::string, std::vector<Node *>> &vars_info, const std::unordered_map<std::string, std::vector<Node *>> &vars_info,
const std::string &var_name) const { const std::string &var_name) const {
auto grad_iter = vars_info.find(var_name); auto grad_iter = vars_info.find(var_name);
PADDLE_ENFORCE_EQ(grad_iter != vars_info.end(), true, "%s is not found.", PADDLE_ENFORCE_EQ(grad_iter != vars_info.end(), true,
var_name); "The gradient varibale %s is not found.", var_name);
PADDLE_ENFORCE_EQ(!grad_iter->second.empty(), true, "%s is not found.", PADDLE_ENFORCE_EQ(!grad_iter->second.empty(), true,
var_name); "The gradient var node %s is not found.", var_name);
PADDLE_ENFORCE_NOT_NULL(grad_iter->second.front()->Var()); PADDLE_ENFORCE_NOT_NULL(grad_iter->second.front()->Var(),
"The gradient var node is null.");
return grad_iter->second.front()->Var(); return grad_iter->second.front()->Var();
} }
...@@ -336,10 +403,10 @@ proto::VarType::Type FuseOptimizerOpPass::GetTypeOfVar( ...@@ -336,10 +403,10 @@ proto::VarType::Type FuseOptimizerOpPass::GetTypeOfVar(
return var_desc->GetType(); return var_desc->GetType();
} }
void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars( void FuseOptimizerOpPass::FuseVarsToContinuousSpace(
const std::vector<std::string> &aux_var_names, const std::vector<std::string> &aux_var_names,
const std::unordered_map<std::string, std::vector<std::string>> const std::unordered_map<std::string, std::vector<std::string>>
&aux_var_set, &aux_var_map,
const std::unordered_map<std::string, std::string> &fused_vars_name, const std::unordered_map<std::string, std::string> &fused_vars_name,
const proto::VarType::Type &dtype, ir::Graph *result) const { const proto::VarType::Type &dtype, ir::Graph *result) const {
// Define Ops // Define Ops
...@@ -348,18 +415,19 @@ void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars( ...@@ -348,18 +415,19 @@ void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars(
result->Get<details::ProgramDescs>(details::kProgramDescs).back(); result->Get<details::ProgramDescs>(details::kProgramDescs).back();
auto *global_block = program_desc.MutableBlock(0); auto *global_block = program_desc.MutableBlock(0);
for (auto &var_name : aux_var_names) { for (auto &var_name : aux_var_names) {
AppendAllocContinuousSpace( AppendCoalesceTensorOp(aux_var_map.at(var_name), aux_var_map.at(var_name),
aux_var_set.at(var_name), aux_var_set.at(var_name), fused_vars_name.at(var_name), dtype, global_block,
fused_vars_name.at(var_name), dtype, global_block, true); true);
} }
} }
void FuseOptimizerOpPass::SortParametersAndAuxVars( void FuseOptimizerOpPass::SortParametersAndAuxVars(
const std::vector<std::pair<std::string, std::string>> &params_grads, const std::vector<std::pair<std::string, std::string>> &params_grads,
std::unordered_map<std::string, std::vector<std::string>> *aux_vars_set, std::unordered_map<std::string, std::vector<std::string>> *aux_var_map,
std::vector<ir::Node *> *ops) const { std::vector<ir::Node *> *ops) const {
PADDLE_ENFORCE_NE(aux_vars_set->count(kGrad), static_cast<size_t>(0)); PADDLE_ENFORCE_NE(aux_var_map->count(kGrad), static_cast<size_t>(0),
auto &grad_vec = aux_vars_set->at(kGrad); "The gradient variable doesn‘t exist.");
auto &grad_vec = aux_var_map->at(kGrad);
std::vector<size_t> grad_sort_idx; std::vector<size_t> grad_sort_idx;
grad_sort_idx.reserve(grad_vec.size()); grad_sort_idx.reserve(grad_vec.size());
...@@ -367,12 +435,12 @@ void FuseOptimizerOpPass::SortParametersAndAuxVars( ...@@ -367,12 +435,12 @@ void FuseOptimizerOpPass::SortParametersAndAuxVars(
for (auto &p_g : params_grads) { for (auto &p_g : params_grads) {
auto iter = std::find(grad_vec.begin(), grad_vec.end(), p_g.second); auto iter = std::find(grad_vec.begin(), grad_vec.end(), p_g.second);
PADDLE_ENFORCE_EQ(iter != grad_vec.end(), true, PADDLE_ENFORCE_EQ(iter != grad_vec.end(), true,
"%s is not found in grad_vec", p_g.second); "%s is not found in gradient vector", p_g.second);
auto idx = std::distance(grad_vec.begin(), iter); auto idx = std::distance(grad_vec.begin(), iter);
grad_sort_idx.emplace_back(idx); grad_sort_idx.emplace_back(idx);
} }
for (auto &aux_vars : *aux_vars_set) { for (auto &aux_vars : *aux_var_map) {
std::vector<std::string> sorted_vars; std::vector<std::string> sorted_vars;
sorted_vars.reserve(aux_vars.second.size()); sorted_vars.reserve(aux_vars.second.size());
for (size_t i = 0; i < aux_vars.second.size(); ++i) { for (size_t i = 0; i < aux_vars.second.size(); ++i) {
...@@ -397,23 +465,24 @@ void FuseOptimizerOpPass::SortParametersAndAuxVars( ...@@ -397,23 +465,24 @@ void FuseOptimizerOpPass::SortParametersAndAuxVars(
std::swap(*ops, sorted_ops); std::swap(*ops, sorted_ops);
} }
void FuseOptimizerOpPass::GetSpecifiedOpsAndVars( void FuseOptimizerOpPass::GetFusingVarNamesMap(
const std::vector<std::string> &aux_vars_name, const std::vector<std::string> &aux_vars_name,
const std::vector<ir::Node *> &opt_nodes, const std::vector<ir::Node *> &opt_nodes,
std::unordered_map<std::string, std::vector<std::string>> *aux_args_name) std::unordered_map<std::string, std::vector<std::string>> *aux_args_name)
const { const {
for (auto &node : opt_nodes) { for (auto &node : opt_nodes) {
std::stringstream out;
for (auto &var_n : aux_vars_name) { for (auto &var_n : aux_vars_name) {
auto arg_names = node->Op()->Input(var_n); auto arg_names = node->Op()->Input(var_n);
PADDLE_ENFORCE_EQ(arg_names.size(), static_cast<size_t>(1)); PADDLE_ENFORCE_EQ(arg_names.size(), static_cast<size_t>(1),
"The input variable of optimizer to be fused is "
"invalid. Excepted %s only has one %s input.",
node->Op()->Type(), var_n);
(*aux_args_name)[var_n].emplace_back(arg_names[0]); (*aux_args_name)[var_n].emplace_back(arg_names[0]);
out << var_n << ", " << arg_names[0] << "; ";
} }
} }
} }
void FuseOptimizerOpPass::AppendAllocContinuousSpace( void FuseOptimizerOpPass::AppendCoalesceTensorOp(
const std::vector<std::string> &in_args, const std::vector<std::string> &in_args,
const std::vector<std::string> &out_args, const std::string &fused_out_arg, const std::vector<std::string> &out_args, const std::string &fused_out_arg,
const proto::VarType::Type &dtype, BlockDesc *global_block, bool copy_data, const proto::VarType::Type &dtype, BlockDesc *global_block, bool copy_data,
...@@ -452,9 +521,11 @@ void FuseOptimizerOpPass::InsertInputAndOutputForFusedOpNode( ...@@ -452,9 +521,11 @@ void FuseOptimizerOpPass::InsertInputAndOutputForFusedOpNode(
auto deal_with_ctrl_vars = [&out_dep_vars, &not_useful_vars, auto deal_with_ctrl_vars = [&out_dep_vars, &not_useful_vars,
&fused_opt_node](ir::Node *ctr_var_node) { &fused_opt_node](ir::Node *ctr_var_node) {
PADDLE_ENFORCE_EQ(ctr_var_node->inputs.size(), 1); PADDLE_ENFORCE_EQ(ctr_var_node->inputs.size(), 1,
"The control var node has nultiple inputs.");
if (ctr_var_node->inputs.front() == fused_opt_node) { if (ctr_var_node->inputs.front() == fused_opt_node) {
PADDLE_ENFORCE_GT(ctr_var_node->outputs.size(), 0); PADDLE_ENFORCE_GT(ctr_var_node->outputs.size(), 0,
"The control var node has no output.");
auto output_ops = ctr_var_node->outputs; auto output_ops = ctr_var_node->outputs;
output_ops.erase(std::remove_if(output_ops.begin(), output_ops.end(), output_ops.erase(std::remove_if(output_ops.begin(), output_ops.end(),
[&fused_opt_node](const ir::Node *node) { [&fused_opt_node](const ir::Node *node) {
......
...@@ -55,25 +55,26 @@ class FuseOptimizerOpPass : public ir::Pass { ...@@ -55,25 +55,26 @@ class FuseOptimizerOpPass : public ir::Pass {
const std::unordered_map<std::string, std::string> &fused_vars_name, const std::unordered_map<std::string, std::string> &fused_vars_name,
const std::vector<ir::Node *> &adam_ops, ir::Graph *graph) const = 0; const std::vector<ir::Node *> &adam_ops, ir::Graph *graph) const = 0;
void GetSpecifiedOpsAndVars( void GetFusingVarNamesMap(
const std::vector<std::string> &aux_vars_name, const std::vector<std::string> &aux_vars_name,
const std::vector<ir::Node *> &opt_nodes, const std::vector<ir::Node *> &opt_nodes,
std::unordered_map<std::string, std::vector<std::string>> *aux_args_name) std::unordered_map<std::string, std::vector<std::string>> *aux_args_name)
const; const;
void AppendAllocContinuousSpace(const std::vector<std::string> &in_args, void AppendCoalesceTensorOp(const std::vector<std::string> &in_args,
const std::vector<std::string> &out_args, const std::vector<std::string> &out_args,
const std::string &fused_out_arg, const std::string &fused_out_arg,
const proto::VarType::Type &dtype, const proto::VarType::Type &dtype,
BlockDesc *global_block, bool copy_data, BlockDesc *global_block, bool copy_data,
bool check_name = true) const; bool check_name = true) const;
void InitFusedGradsAndAllocSpaceForGrads( void FuseGradientsToContinuousSpace(const std::vector<std::string> &params,
const std::vector<std::string> &params, const std::vector<std::string> &grads,
const std::vector<std::string> &grads, const std::string &fused_grad_name, const std::string &fused_grad_name,
const proto::VarType::Type &dtype, ir::Graph *result) const; const proto::VarType::Type &dtype,
ir::Graph *result) const;
void InitFusedVarsAndAllocSpaceForVars( void FuseVarsToContinuousSpace(
const std::vector<std::string> &aux_var_names, const std::vector<std::string> &aux_var_names,
const std::unordered_map<std::string, std::vector<std::string>> const std::unordered_map<std::string, std::vector<std::string>>
&aux_var_set, &aux_var_set,
...@@ -83,6 +84,12 @@ class FuseOptimizerOpPass : public ir::Pass { ...@@ -83,6 +84,12 @@ class FuseOptimizerOpPass : public ir::Pass {
std::unordered_map<std::string, std::vector<Node *>> GetVarInfo( std::unordered_map<std::string, std::vector<Node *>> GetVarInfo(
const Graph &result) const; const Graph &result) const;
bool OpWithKernelSupportCPUAndGPU(const std::string &op_type) const;
bool GradGeneratedOpKernelCheck(
const std::unordered_map<std::string, std::vector<ir::Node *>> &vars_info,
const std::string &grad_var_name) const;
proto::VarType::Type GetDtypeOfVar( proto::VarType::Type GetDtypeOfVar(
const std::unordered_map<std::string, std::vector<ir::Node *>> &vars_info, const std::unordered_map<std::string, std::vector<ir::Node *>> &vars_info,
const std::string &name) const; const std::string &name) const;
......
...@@ -24,7 +24,7 @@ namespace paddle { ...@@ -24,7 +24,7 @@ namespace paddle {
namespace operators { namespace operators {
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class CoalesceTensorOp : public framework::OpKernel<T> { class CoalesceTensorOpKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext &context) const override { void Compute(const framework::ExecutionContext &context) const override {
auto &in_var_names = context.Inputs("Input"); auto &in_var_names = context.Inputs("Input");
...@@ -32,24 +32,39 @@ class CoalesceTensorOp : public framework::OpKernel<T> { ...@@ -32,24 +32,39 @@ class CoalesceTensorOp : public framework::OpKernel<T> {
auto &in_vars = context.MultiInputVar("Input"); auto &in_vars = context.MultiInputVar("Input");
auto out_vars = context.MultiOutputVar("Output"); auto out_vars = context.MultiOutputVar("Output");
PADDLE_ENFORCE_GT(in_var_names.size(), static_cast<size_t>(0)); PADDLE_ENFORCE_GT(in_var_names.size(), static_cast<size_t>(0),
PADDLE_ENFORCE_EQ(in_var_names.size(), out_var_names.size()); "The CoalesceTensorOp has no input.");
PADDLE_ENFORCE_EQ(
in_var_names.size(), out_var_names.size(),
"The number of CoalesceTensorOp's input and output is not match.");
// Input & Output check: only support LoDTensor
for (size_t i = 0; i < in_var_names.size(); ++i) { for (size_t i = 0; i < in_var_names.size(); ++i) {
// Only support LoDTensor PADDLE_ENFORCE_NOT_NULL(
PADDLE_ENFORCE_NOT_NULL(in_vars[i], "%s should not be nullptr,", in_vars[i],
in_var_names[i]); "The input variable %s of CoalesceTensorOp does not exist.",
PADDLE_ENFORCE_NOT_NULL(out_vars[i], "%s should not be nullptr,", in_var_names[i]);
out_var_names[i]); PADDLE_ENFORCE_NOT_NULL(
PADDLE_ENFORCE(in_vars[i]->IsType<framework::LoDTensor>()); out_vars[i],
PADDLE_ENFORCE(out_vars[i]->IsType<framework::LoDTensor>()); "The output variable %s of CoalesceTensorOp does not exist.",
out_var_names[i]);
PADDLE_ENFORCE_EQ(
in_vars[i]->IsType<framework::LoDTensor>(), true,
"The input variable %s of CoalesceTensorOp is not LoDTensor.",
in_var_names[i]);
PADDLE_ENFORCE_EQ(
out_vars[i]->IsType<framework::LoDTensor>(), true,
"The output variable %s of CoalesceTensorOp is not LoDTensor.",
in_var_names[i]);
} }
auto in_tensors = context.MultiInput<framework::LoDTensor>("Input"); auto in_tensors = context.MultiInput<framework::LoDTensor>("Input");
if (context.Attr<bool>("check_name")) { if (context.Attr<bool>("check_name")) {
for (size_t i = 0; i < in_var_names.size(); ++i) { for (size_t i = 0; i < in_var_names.size(); ++i) {
PADDLE_ENFORCE_EQ(in_var_names[i], out_var_names[i]); PADDLE_ENFORCE_EQ(
in_var_names[i], out_var_names[i],
"The input and output variable of CoalesceTensorOp is different.");
} }
} else { } else {
// Init the output as input // Init the output as input
...@@ -124,8 +139,8 @@ class CoalesceTensorOp : public framework::OpKernel<T> { ...@@ -124,8 +139,8 @@ class CoalesceTensorOp : public framework::OpKernel<T> {
std::stringstream ss; std::stringstream ss;
ss << "alloc_space_for_vars: "; ss << "alloc_space_for_vars: ";
for (size_t i = 0; i < var_names.size(); ++i) { for (size_t i = 0; i < var_names.size(); ++i) {
PADDLE_ENFORCE(lod_tensors[i]->IsInitialized(), "%s is not initialized.", PADDLE_ENFORCE_EQ(lod_tensors[i]->IsInitialized(), true,
var_names[i]); "%s is not initialized.", var_names[i]);
auto size = lod_tensors[i]->numel(); auto size = lod_tensors[i]->numel();
PADDLE_ENFORCE_GT(size, 0); PADDLE_ENFORCE_GT(size, 0);
...@@ -140,14 +155,14 @@ class CoalesceTensorOp : public framework::OpKernel<T> { ...@@ -140,14 +155,14 @@ class CoalesceTensorOp : public framework::OpKernel<T> {
} }
}; };
class AllocContinuousSpaceOp : public framework::OperatorWithKernel { class CoalesceTensorOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {} void InferShape(framework::InferShapeContext *ctx) const override {}
}; };
class AllocContinuousSpaceOpMaker : public framework::OpProtoAndCheckerMaker { class CoalesceTensorOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
AddInput("Input", AddInput("Input",
...@@ -179,7 +194,7 @@ class AllocContinuousSpaceOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -179,7 +194,7 @@ class AllocContinuousSpaceOpMaker : public framework::OpProtoAndCheckerMaker {
"they are the same separately.") "they are the same separately.")
.SetDefault(false); .SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
AllocContinuousSpace Operator. CoalesceTensor Operator.
coalesce_tensor is used to make the address of Output coalesce_tensor is used to make the address of Output
continuous according to the Input. This Op will alloc a big tensor continuous according to the Input. This Op will alloc a big tensor
...@@ -200,22 +215,22 @@ setting the Output with a constant value. ...@@ -200,22 +215,22 @@ setting the Output with a constant value.
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
REGISTER_OPERATOR(coalesce_tensor, paddle::operators::AllocContinuousSpaceOp, REGISTER_OPERATOR(coalesce_tensor, paddle::operators::CoalesceTensorOp,
paddle::operators::AllocContinuousSpaceOpMaker); paddle::operators::CoalesceTensorOpMaker);
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform; namespace plat = paddle::platform;
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
coalesce_tensor, coalesce_tensor,
ops::CoalesceTensorOp<paddle::platform::CPUDeviceContext, plat::float16>, ops::CoalesceTensorOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::CoalesceTensorOp<paddle::platform::CPUDeviceContext, int>, ops::CoalesceTensorOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::CoalesceTensorOp<paddle::platform::CPUDeviceContext, float>, ops::CoalesceTensorOpKernel<paddle::platform::CPUDeviceContext, double>);
ops::CoalesceTensorOp<paddle::platform::CPUDeviceContext, double>);
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
coalesce_tensor, coalesce_tensor,
ops::CoalesceTensorOp<paddle::platform::CUDADeviceContext, plat::float16>, ops::CoalesceTensorOpKernel<paddle::platform::CUDADeviceContext,
ops::CoalesceTensorOp<paddle::platform::CUDADeviceContext, int>, plat::float16>,
ops::CoalesceTensorOp<paddle::platform::CUDADeviceContext, float>, ops::CoalesceTensorOpKernel<paddle::platform::CUDADeviceContext, int>,
ops::CoalesceTensorOp<paddle::platform::CUDADeviceContext, double>); ops::CoalesceTensorOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::CoalesceTensorOpKernel<paddle::platform::CUDADeviceContext, double>);
#endif #endif
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册