diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc index d5d4c03d9b4368243adfcdf5640330b8b99a6c1d..f75ba947e902bb219f3081d8e41b9b9ad81a837f 100644 --- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc +++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc @@ -18,6 +18,7 @@ #include #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" namespace paddle { namespace framework { @@ -40,7 +41,10 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { for (auto &node : topo_nodes) { if (node->Op()->Type() == fuse_op_type) { auto grad_name = node->Op()->Input(kGrad); - PADDLE_ENFORCE_EQ(grad_name.size(), static_cast(1)); + PADDLE_ENFORCE_EQ(grad_name.size(), static_cast(1), + "The %s operator has multiple gradient input. Expected " + "it to only have one gradient input.", + fuse_op_type); if (IsLoDTensorType(GetTypeOfVar(vars_info, grad_name[0]))) { opt_nodes.emplace_back(node); } @@ -50,30 +54,24 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { VLOG(6) << "Find " << fuse_op_type << " operators : " << opt_ops_num << ", and " << opt_nodes.size() << " for dense gradients."; - if (opt_nodes.size() == 0 || result.Has(details::kFusedOptType)) { - if (result.Has(details::kFusedOptType)) { - auto &opt_type = - result.Get(details::kFusedOptType); - VLOG(6) << "Currently only support fusing one type optimizer op. " - "Has fused " - << opt_type; - } + + if (opt_nodes.size() == 0) return; + if (result.Has(details::kFusedOptType)) { + auto &opt_type = result.Get(details::kFusedOptType); + VLOG(6) << "Currently only support fusing one type of optimizer op, " + << opt_type << " has been fused."; return; } - // There should not have no-ctr-var between the op_nodes that link the op_node - // of op_nodes. + // There should not have no-ctr-var between the opt_nodes that link the + // op_node + // of opt_nodes. if (HasVarDepsBetweenOps(topo_nodes, opt_nodes)) { VLOG(6) << "There are interdependent variables among these optimization " "operators, which can not be handled well at present."; return; } - LOG(WARNING) << "Find " << fuse_op_type << " operators : " << opt_ops_num - << ", and " << opt_nodes.size() << " for dense gradients. " - << "To make the speed faster, those optimization are fused " - "during training."; - result.Set(details::kFusedOptType, new details::FusedOptType); result.Get(details::kFusedOptType) = fuse_op_type; if (!result.Has(details::kProgramDescs)) { @@ -85,8 +83,8 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { if (!result.Has(details::kFusedVars)) { result.Set(details::kFusedVars, new details::FusedVars); } - std::unordered_map> aux_var_set; - GetSpecifiedOpsAndVars(aux_var_names, opt_nodes, &aux_var_set); + std::unordered_map> aux_var_map; + GetFusingVarNamesMap(aux_var_names, opt_nodes, &aux_var_map); std::unordered_map fused_vars_name; fused_vars_name.reserve(aux_var_names.size()); auto &fused_var_set = result.Get(details::kFusedVars); @@ -94,9 +92,10 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { for (auto &var_name : aux_var_names) { // NOTE: the fused_var_name should be unique. auto fused_var_name = prefix + "_" + fuse_op_type + "_" + var_name + "_" + - aux_var_set[var_name][0]; + aux_var_map[var_name][0]; VLOG(6) << var_name << ": " << fused_var_name; - PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0); + PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0, + "The fused variable already existed."); fused_var_set.insert(fused_var_name); fused_vars_name.emplace(var_name, fused_var_name); } @@ -109,16 +108,16 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { auto ¶ms_and_dense_grads = result.Get(details::kParamsAndDenseGrads); PADDLE_ENFORCE_LE( - params_and_dense_grads.size(), aux_var_set.at(kGrad).size(), + params_and_dense_grads.size(), aux_var_map.at(kGrad).size(), "The number of dense gradients should be little than optimizer ops."); - std::unordered_set opt_grad_set(aux_var_set.at(kGrad).size()); + std::unordered_set opt_grad_set(aux_var_map.at(kGrad).size()); for (auto &p_g : params_and_dense_grads) { opt_grad_set.insert(p_g.second); } std::vector new_grad_idx; - for (size_t idx = 0; idx < aux_var_set.at(kGrad).size(); ++idx) { - auto &grad = aux_var_set.at(kGrad).at(idx); + for (size_t idx = 0; idx < aux_var_map.at(kGrad).size(); ++idx) { + auto &grad = aux_var_map.at(kGrad).at(idx); if (!opt_grad_set.count(grad)) { new_grad_idx.emplace_back(idx); } @@ -137,20 +136,22 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { auto &fused_grad = result.Get(details::kFusedGrads); PADDLE_ENFORCE_NE(fused_grad.size(), 0, "The fused gradient should not be empty."); - PADDLE_ENFORCE_EQ(fused_grad.size(), 1, - "Because the dtype of those gradients " - "is not unified, so the number of fused gradients is " - "more than one, but it is not supported currently."); + if (fused_grad.size() > 1) { + // Note(chenweihang): Because the dtype of those gradients is not + // unified,so the number of fused gradients is more than one, + // but it is not supported currently. + return; + } auto &fused_vars = result.Get(details::kFusedVars); auto iter = std::find(fused_vars.begin(), fused_vars.end(), fused_grad.front()); PADDLE_ENFORCE_EQ(iter != fused_vars.end(), true, - "Not find the fused_grad."); + "Not found the fused gradient variable."); fused_vars_name[kGrad] = fused_grad.front(); // Sort the parameters and auxiliary variables according // to parameters' name to make variables' name correspond correctly. - SortParametersAndAuxVars(params_and_dense_grads, &aux_var_set, + SortParametersAndAuxVars(params_and_dense_grads, &aux_var_map, &opt_nodes); grad_fused = true; } else { @@ -158,33 +159,54 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { if (new_grad_idx.size() == 1) return; // NOTE(zcd): If the gradients of backward stage and optimization stage // have diff, Only take care of the the gradient of optimization stage. - GradientsFilter(new_grad_idx, &opt_nodes, &aux_var_set); + GradientsFilter(new_grad_idx, &opt_nodes, &aux_var_map); } } - // Check dtype - auto dtype = GetDtypeOfVar(vars_info, aux_var_set.at(kParam).front()); - for (auto vars : aux_var_set) { + // Pass pre-condition check: check dtype of fusing vars + auto fusing_var_dtype = + GetDtypeOfVar(vars_info, aux_var_map.at(kParam).front()); + for (auto vars : aux_var_map) { for (auto &var_name : vars.second) { - PADDLE_ENFORCE_EQ(dtype, GetDtypeOfVar(vars_info, var_name)); + if (fusing_var_dtype != GetDtypeOfVar(vars_info, var_name)) { + // Note(chenweihang): Currently the fuse_optimizer_ops strategy + // in mixed precision scenarios is not yet supported. + return; + } + } + } + + // Pass pre-condition check: gradients generated op kernel + auto fusing_grad_var_names = aux_var_map.at(kGrad); + for (auto grad_var_name : fusing_grad_var_names) { + if (!GradGeneratedOpKernelCheck(vars_info, grad_var_name)) { + // Note(chenweihang): Currently the fuse_optimizer_ops strategy is risky + // when gradient generated operator with kernel just support CPU or + // GPU device, so close it. + return; } } + LOG(WARNING) << "Find " << fuse_op_type << " operators : " << opt_ops_num + << ", and " << opt_nodes.size() << " for dense gradients. " + << "To make the speed faster, those optimization are fused " + "during training."; + // Step 4: Alloc continuous space for Parameters and AuxiliaryVar(e.g. // Moment1, Moment2, Beta1Pow, Beta2Pow) of all the optimizer ops // separately. if (!grad_fused) { - InitFusedGradsAndAllocSpaceForGrads( - aux_var_set.at(kParam), aux_var_set.at(kGrad), - fused_vars_name.at(kGrad), dtype, &result); + FuseGradientsToContinuousSpace( + aux_var_map.at(kParam), aux_var_map.at(kGrad), + fused_vars_name.at(kGrad), fusing_var_dtype, &result); } aux_var_names.pop_back(); - InitFusedVarsAndAllocSpaceForVars(aux_var_names, aux_var_set, fused_vars_name, - dtype, &result); + FuseVarsToContinuousSpace(aux_var_names, aux_var_map, fused_vars_name, + fusing_var_dtype, &result); // Step 5: Fuse optimizer Ops and Scale Ops auto *fused_opt_node = - FuseOptimizerOps(aux_var_set, fused_vars_name, opt_nodes, &result); + FuseOptimizerOps(aux_var_map, fused_vars_name, opt_nodes, &result); InsertInputAndOutputForFusedOpNode(opt_nodes, graph, fused_opt_node); // Step 6: Remove optimizer Ops @@ -231,11 +253,54 @@ bool FuseOptimizerOpPass::HasVarDepsBetweenOps( return false; } +bool FuseOptimizerOpPass::OpWithKernelSupportCPUAndGPU( + const std::string &op_type) const { + auto &all_kernels = OperatorWithKernel::AllOpKernels(); + auto it = all_kernels.find(op_type); + // skip op not has kernel + if (it != all_kernels.end()) { + bool support_cpu = false; + bool support_gpu = false; + for (auto &kernel_pair : it->second) { + if (platform::is_cpu_place(kernel_pair.first.place_)) { + support_cpu = true; + } + if (platform::is_gpu_place(kernel_pair.first.place_)) { + support_gpu = true; + } + } + VLOG(6) << "Op check: " << op_type << ", support CPU: " << support_cpu + << ", support GPU: " << support_gpu; + return support_cpu && support_gpu; + } + return true; +} + +bool FuseOptimizerOpPass::GradGeneratedOpKernelCheck( + const std::unordered_map> &vars_info, + const std::string &grad_var_name) const { + auto grad_var_nodes = vars_info.at(grad_var_name); + std::unordered_set check_op_set; + for (auto var_node : grad_var_nodes) { + for (auto in_node : var_node->inputs) { + if (in_node->IsOp() && in_node->Op()) { + check_op_set.emplace(in_node->Op()->Type()); + } + } + } + for (auto op_type : check_op_set) { + if (!OpWithKernelSupportCPUAndGPU(op_type)) { + return false; + } + } + return true; +} + void FuseOptimizerOpPass::GradientsFilter( const std::vector &new_grad_idx, std::vector *opt_nodes, - std::unordered_map> *aux_var_set) + std::unordered_map> *aux_var_map) const { - for (auto &aux_vars : *aux_var_set) { + for (auto &aux_vars : *aux_var_map) { std::vector sorted_vars; sorted_vars.reserve(aux_vars.second.size()); for (size_t i : new_grad_idx) { @@ -257,7 +322,7 @@ void FuseOptimizerOpPass::GradientsFilter( std::swap(*opt_nodes, sorted_ops); } -void FuseOptimizerOpPass::InitFusedGradsAndAllocSpaceForGrads( +void FuseOptimizerOpPass::FuseGradientsToContinuousSpace( const std::vector ¶ms, const std::vector &grads, const std::string &fused_grad_name, const proto::VarType::Type &dtype, ir::Graph *result) const { @@ -268,11 +333,12 @@ void FuseOptimizerOpPass::InitFusedGradsAndAllocSpaceForGrads( // The Gradients should not be reused during memory optimization. for (auto &grad_var_name : grads) { auto iter = vars_info.find(grad_var_name); - PADDLE_ENFORCE_EQ(iter != vars_info.end(), true, "%s is not found.", - grad_var_name); - PADDLE_ENFORCE_EQ(!iter->second.empty(), true, "%s is not found.", - grad_var_name); - PADDLE_ENFORCE_NOT_NULL(iter->second.front()->Var()); + PADDLE_ENFORCE_EQ(iter != vars_info.end(), true, + "The gradient variable %s is not found.", grad_var_name); + PADDLE_ENFORCE_EQ(!iter->second.empty(), true, + "The gradient var node %s is not found.", grad_var_name); + PADDLE_ENFORCE_NOT_NULL(iter->second.front()->Var(), + "The gradient var node is null."); PADDLE_ENFORCE_EQ( IsLoDTensorType(iter->second.front()->Var()->GetType()), true, "Currently the gradient type only should be LoDTensor when " @@ -287,8 +353,8 @@ void FuseOptimizerOpPass::InitFusedGradsAndAllocSpaceForGrads( ProgramDesc &program_desc = result->Get(details::kProgramDescs).back(); auto *global_block = program_desc.MutableBlock(0); - AppendAllocContinuousSpace(params, grads, fused_grad_name, dtype, - global_block, false, false); + AppendCoalesceTensorOp(params, grads, fused_grad_name, dtype, global_block, + false, false); } std::unordered_map> @@ -297,7 +363,7 @@ FuseOptimizerOpPass::GetVarInfo(const Graph &result) const { for (Node *node : result.Nodes()) { if (node->IsVar() && node->Var()) { // Note: The graph may have the same name node. For example, parameter - // is the input of operator and it also is the output of optimizer; + // is the input of optimizer and it also is the output of optimizer; vars[node->Var()->Name()].emplace_back(node); } } @@ -314,11 +380,12 @@ const VarDesc *FuseOptimizerOpPass::GetVarDescFromVarsInfo( const std::unordered_map> &vars_info, const std::string &var_name) const { auto grad_iter = vars_info.find(var_name); - PADDLE_ENFORCE_EQ(grad_iter != vars_info.end(), true, "%s is not found.", - var_name); - PADDLE_ENFORCE_EQ(!grad_iter->second.empty(), true, "%s is not found.", - var_name); - PADDLE_ENFORCE_NOT_NULL(grad_iter->second.front()->Var()); + PADDLE_ENFORCE_EQ(grad_iter != vars_info.end(), true, + "The gradient varibale %s is not found.", var_name); + PADDLE_ENFORCE_EQ(!grad_iter->second.empty(), true, + "The gradient var node %s is not found.", var_name); + PADDLE_ENFORCE_NOT_NULL(grad_iter->second.front()->Var(), + "The gradient var node is null."); return grad_iter->second.front()->Var(); } @@ -336,10 +403,10 @@ proto::VarType::Type FuseOptimizerOpPass::GetTypeOfVar( return var_desc->GetType(); } -void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars( +void FuseOptimizerOpPass::FuseVarsToContinuousSpace( const std::vector &aux_var_names, const std::unordered_map> - &aux_var_set, + &aux_var_map, const std::unordered_map &fused_vars_name, const proto::VarType::Type &dtype, ir::Graph *result) const { // Define Ops @@ -348,18 +415,19 @@ void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars( result->Get(details::kProgramDescs).back(); auto *global_block = program_desc.MutableBlock(0); for (auto &var_name : aux_var_names) { - AppendAllocContinuousSpace( - aux_var_set.at(var_name), aux_var_set.at(var_name), - fused_vars_name.at(var_name), dtype, global_block, true); + AppendCoalesceTensorOp(aux_var_map.at(var_name), aux_var_map.at(var_name), + fused_vars_name.at(var_name), dtype, global_block, + true); } } void FuseOptimizerOpPass::SortParametersAndAuxVars( const std::vector> ¶ms_grads, - std::unordered_map> *aux_vars_set, + std::unordered_map> *aux_var_map, std::vector *ops) const { - PADDLE_ENFORCE_NE(aux_vars_set->count(kGrad), static_cast(0)); - auto &grad_vec = aux_vars_set->at(kGrad); + PADDLE_ENFORCE_NE(aux_var_map->count(kGrad), static_cast(0), + "The gradient variable doesn‘t exist."); + auto &grad_vec = aux_var_map->at(kGrad); std::vector grad_sort_idx; grad_sort_idx.reserve(grad_vec.size()); @@ -367,12 +435,12 @@ void FuseOptimizerOpPass::SortParametersAndAuxVars( for (auto &p_g : params_grads) { auto iter = std::find(grad_vec.begin(), grad_vec.end(), p_g.second); PADDLE_ENFORCE_EQ(iter != grad_vec.end(), true, - "%s is not found in grad_vec", p_g.second); + "%s is not found in gradient vector", p_g.second); auto idx = std::distance(grad_vec.begin(), iter); grad_sort_idx.emplace_back(idx); } - for (auto &aux_vars : *aux_vars_set) { + for (auto &aux_vars : *aux_var_map) { std::vector sorted_vars; sorted_vars.reserve(aux_vars.second.size()); for (size_t i = 0; i < aux_vars.second.size(); ++i) { @@ -397,23 +465,24 @@ void FuseOptimizerOpPass::SortParametersAndAuxVars( std::swap(*ops, sorted_ops); } -void FuseOptimizerOpPass::GetSpecifiedOpsAndVars( +void FuseOptimizerOpPass::GetFusingVarNamesMap( const std::vector &aux_vars_name, const std::vector &opt_nodes, std::unordered_map> *aux_args_name) const { for (auto &node : opt_nodes) { - std::stringstream out; for (auto &var_n : aux_vars_name) { auto arg_names = node->Op()->Input(var_n); - PADDLE_ENFORCE_EQ(arg_names.size(), static_cast(1)); + PADDLE_ENFORCE_EQ(arg_names.size(), static_cast(1), + "The input variable of optimizer to be fused is " + "invalid. Excepted %s only has one %s input.", + node->Op()->Type(), var_n); (*aux_args_name)[var_n].emplace_back(arg_names[0]); - out << var_n << ", " << arg_names[0] << "; "; } } } -void FuseOptimizerOpPass::AppendAllocContinuousSpace( +void FuseOptimizerOpPass::AppendCoalesceTensorOp( const std::vector &in_args, const std::vector &out_args, const std::string &fused_out_arg, const proto::VarType::Type &dtype, BlockDesc *global_block, bool copy_data, @@ -452,9 +521,11 @@ void FuseOptimizerOpPass::InsertInputAndOutputForFusedOpNode( auto deal_with_ctrl_vars = [&out_dep_vars, ¬_useful_vars, &fused_opt_node](ir::Node *ctr_var_node) { - PADDLE_ENFORCE_EQ(ctr_var_node->inputs.size(), 1); + PADDLE_ENFORCE_EQ(ctr_var_node->inputs.size(), 1, + "The control var node has nultiple inputs."); if (ctr_var_node->inputs.front() == fused_opt_node) { - PADDLE_ENFORCE_GT(ctr_var_node->outputs.size(), 0); + PADDLE_ENFORCE_GT(ctr_var_node->outputs.size(), 0, + "The control var node has no output."); auto output_ops = ctr_var_node->outputs; output_ops.erase(std::remove_if(output_ops.begin(), output_ops.end(), [&fused_opt_node](const ir::Node *node) { diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h index 2c381ebcf48fe0495e0932b82ac474ed4396e3e5..0b5bf8a3a4ba5d2d819529718cf018530d67d017 100644 --- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h +++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h @@ -55,25 +55,26 @@ class FuseOptimizerOpPass : public ir::Pass { const std::unordered_map &fused_vars_name, const std::vector &adam_ops, ir::Graph *graph) const = 0; - void GetSpecifiedOpsAndVars( + void GetFusingVarNamesMap( const std::vector &aux_vars_name, const std::vector &opt_nodes, std::unordered_map> *aux_args_name) const; - void AppendAllocContinuousSpace(const std::vector &in_args, - const std::vector &out_args, - const std::string &fused_out_arg, - const proto::VarType::Type &dtype, - BlockDesc *global_block, bool copy_data, - bool check_name = true) const; + void AppendCoalesceTensorOp(const std::vector &in_args, + const std::vector &out_args, + const std::string &fused_out_arg, + const proto::VarType::Type &dtype, + BlockDesc *global_block, bool copy_data, + bool check_name = true) const; - void InitFusedGradsAndAllocSpaceForGrads( - const std::vector ¶ms, - const std::vector &grads, const std::string &fused_grad_name, - const proto::VarType::Type &dtype, ir::Graph *result) const; + void FuseGradientsToContinuousSpace(const std::vector ¶ms, + const std::vector &grads, + const std::string &fused_grad_name, + const proto::VarType::Type &dtype, + ir::Graph *result) const; - void InitFusedVarsAndAllocSpaceForVars( + void FuseVarsToContinuousSpace( const std::vector &aux_var_names, const std::unordered_map> &aux_var_set, @@ -83,6 +84,12 @@ class FuseOptimizerOpPass : public ir::Pass { std::unordered_map> GetVarInfo( const Graph &result) const; + bool OpWithKernelSupportCPUAndGPU(const std::string &op_type) const; + + bool GradGeneratedOpKernelCheck( + const std::unordered_map> &vars_info, + const std::string &grad_var_name) const; + proto::VarType::Type GetDtypeOfVar( const std::unordered_map> &vars_info, const std::string &name) const; diff --git a/paddle/fluid/operators/coalesce_tensor_op.cc b/paddle/fluid/operators/coalesce_tensor_op.cc index 7ed4f7cc9e7c2c6ae84ea1129b2cdf4a618435c3..886345919bc2d1a859e068001fd1586029a720f6 100644 --- a/paddle/fluid/operators/coalesce_tensor_op.cc +++ b/paddle/fluid/operators/coalesce_tensor_op.cc @@ -24,7 +24,7 @@ namespace paddle { namespace operators { template -class CoalesceTensorOp : public framework::OpKernel { +class CoalesceTensorOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto &in_var_names = context.Inputs("Input"); @@ -32,24 +32,39 @@ class CoalesceTensorOp : public framework::OpKernel { auto &in_vars = context.MultiInputVar("Input"); auto out_vars = context.MultiOutputVar("Output"); - PADDLE_ENFORCE_GT(in_var_names.size(), static_cast(0)); - PADDLE_ENFORCE_EQ(in_var_names.size(), out_var_names.size()); + PADDLE_ENFORCE_GT(in_var_names.size(), static_cast(0), + "The CoalesceTensorOp has no input."); + PADDLE_ENFORCE_EQ( + in_var_names.size(), out_var_names.size(), + "The number of CoalesceTensorOp's input and output is not match."); + // Input & Output check: only support LoDTensor for (size_t i = 0; i < in_var_names.size(); ++i) { - // Only support LoDTensor - PADDLE_ENFORCE_NOT_NULL(in_vars[i], "%s should not be nullptr,", - in_var_names[i]); - PADDLE_ENFORCE_NOT_NULL(out_vars[i], "%s should not be nullptr,", - out_var_names[i]); - PADDLE_ENFORCE(in_vars[i]->IsType()); - PADDLE_ENFORCE(out_vars[i]->IsType()); + PADDLE_ENFORCE_NOT_NULL( + in_vars[i], + "The input variable %s of CoalesceTensorOp does not exist.", + in_var_names[i]); + PADDLE_ENFORCE_NOT_NULL( + out_vars[i], + "The output variable %s of CoalesceTensorOp does not exist.", + out_var_names[i]); + PADDLE_ENFORCE_EQ( + in_vars[i]->IsType(), true, + "The input variable %s of CoalesceTensorOp is not LoDTensor.", + in_var_names[i]); + PADDLE_ENFORCE_EQ( + out_vars[i]->IsType(), true, + "The output variable %s of CoalesceTensorOp is not LoDTensor.", + in_var_names[i]); } auto in_tensors = context.MultiInput("Input"); if (context.Attr("check_name")) { for (size_t i = 0; i < in_var_names.size(); ++i) { - PADDLE_ENFORCE_EQ(in_var_names[i], out_var_names[i]); + PADDLE_ENFORCE_EQ( + in_var_names[i], out_var_names[i], + "The input and output variable of CoalesceTensorOp is different."); } } else { // Init the output as input @@ -124,8 +139,8 @@ class CoalesceTensorOp : public framework::OpKernel { std::stringstream ss; ss << "alloc_space_for_vars: "; for (size_t i = 0; i < var_names.size(); ++i) { - PADDLE_ENFORCE(lod_tensors[i]->IsInitialized(), "%s is not initialized.", - var_names[i]); + PADDLE_ENFORCE_EQ(lod_tensors[i]->IsInitialized(), true, + "%s is not initialized.", var_names[i]); auto size = lod_tensors[i]->numel(); PADDLE_ENFORCE_GT(size, 0); @@ -140,14 +155,14 @@ class CoalesceTensorOp : public framework::OpKernel { } }; -class AllocContinuousSpaceOp : public framework::OperatorWithKernel { +class CoalesceTensorOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override {} }; -class AllocContinuousSpaceOpMaker : public framework::OpProtoAndCheckerMaker { +class CoalesceTensorOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("Input", @@ -179,7 +194,7 @@ class AllocContinuousSpaceOpMaker : public framework::OpProtoAndCheckerMaker { "they are the same separately.") .SetDefault(false); AddComment(R"DOC( -AllocContinuousSpace Operator. +CoalesceTensor Operator. coalesce_tensor is used to make the address of Output continuous according to the Input. This Op will alloc a big tensor @@ -200,22 +215,22 @@ setting the Output with a constant value. } // namespace operators } // namespace paddle -REGISTER_OPERATOR(coalesce_tensor, paddle::operators::AllocContinuousSpaceOp, - paddle::operators::AllocContinuousSpaceOpMaker); +REGISTER_OPERATOR(coalesce_tensor, paddle::operators::CoalesceTensorOp, + paddle::operators::CoalesceTensorOpMaker); namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_CPU_KERNEL( coalesce_tensor, - ops::CoalesceTensorOp, - ops::CoalesceTensorOp, - ops::CoalesceTensorOp, - ops::CoalesceTensorOp); + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel); #ifdef PADDLE_WITH_CUDA REGISTER_OP_CUDA_KERNEL( coalesce_tensor, - ops::CoalesceTensorOp, - ops::CoalesceTensorOp, - ops::CoalesceTensorOp, - ops::CoalesceTensorOp); + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel); #endif