未验证 提交 826254f6 编写于 作者: C Chen Weihang 提交者: GitHub

Add pre-condition check for fuse optimizer op pass (#21005)

* add pre condition check for fuse optimizer op pass, test=develop

* add log & set init to zero, test=develop

* fix test_fuse_all_reduce_pass failed, test=develop

* polish details, test=develop

* refine PADDLE_ENFORCE & remove needless VLOG, test=develop

* refactor op check method, test=develop
上级 6cc544aa
......@@ -18,6 +18,7 @@
#include <unordered_set>
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
namespace paddle {
namespace framework {
......@@ -40,7 +41,10 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
for (auto &node : topo_nodes) {
if (node->Op()->Type() == fuse_op_type) {
auto grad_name = node->Op()->Input(kGrad);
PADDLE_ENFORCE_EQ(grad_name.size(), static_cast<size_t>(1));
PADDLE_ENFORCE_EQ(grad_name.size(), static_cast<size_t>(1),
"The %s operator has multiple gradient input. Expected "
"it to only have one gradient input.",
fuse_op_type);
if (IsLoDTensorType(GetTypeOfVar(vars_info, grad_name[0]))) {
opt_nodes.emplace_back(node);
}
......@@ -50,30 +54,24 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
VLOG(6) << "Find " << fuse_op_type << " operators : " << opt_ops_num
<< ", and " << opt_nodes.size() << " for dense gradients.";
if (opt_nodes.size() == 0 || result.Has(details::kFusedOptType)) {
if (result.Has(details::kFusedOptType)) {
auto &opt_type =
result.Get<details::FusedOptType>(details::kFusedOptType);
VLOG(6) << "Currently only support fusing one type optimizer op. "
"Has fused "
<< opt_type;
}
if (opt_nodes.size() == 0) return;
if (result.Has(details::kFusedOptType)) {
auto &opt_type = result.Get<details::FusedOptType>(details::kFusedOptType);
VLOG(6) << "Currently only support fusing one type of optimizer op, "
<< opt_type << " has been fused.";
return;
}
// There should not have no-ctr-var between the op_nodes that link the op_node
// of op_nodes.
// There should not have no-ctr-var between the opt_nodes that link the
// op_node
// of opt_nodes.
if (HasVarDepsBetweenOps(topo_nodes, opt_nodes)) {
VLOG(6) << "There are interdependent variables among these optimization "
"operators, which can not be handled well at present.";
return;
}
LOG(WARNING) << "Find " << fuse_op_type << " operators : " << opt_ops_num
<< ", and " << opt_nodes.size() << " for dense gradients. "
<< "To make the speed faster, those optimization are fused "
"during training.";
result.Set(details::kFusedOptType, new details::FusedOptType);
result.Get<details::FusedOptType>(details::kFusedOptType) = fuse_op_type;
if (!result.Has(details::kProgramDescs)) {
......@@ -85,8 +83,8 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
if (!result.Has(details::kFusedVars)) {
result.Set(details::kFusedVars, new details::FusedVars);
}
std::unordered_map<std::string, std::vector<std::string>> aux_var_set;
GetSpecifiedOpsAndVars(aux_var_names, opt_nodes, &aux_var_set);
std::unordered_map<std::string, std::vector<std::string>> aux_var_map;
GetFusingVarNamesMap(aux_var_names, opt_nodes, &aux_var_map);
std::unordered_map<std::string, std::string> fused_vars_name;
fused_vars_name.reserve(aux_var_names.size());
auto &fused_var_set = result.Get<details::FusedVars>(details::kFusedVars);
......@@ -94,9 +92,10 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
for (auto &var_name : aux_var_names) {
// NOTE: the fused_var_name should be unique.
auto fused_var_name = prefix + "_" + fuse_op_type + "_" + var_name + "_" +
aux_var_set[var_name][0];
aux_var_map[var_name][0];
VLOG(6) << var_name << ": " << fused_var_name;
PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0);
PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0,
"The fused variable already existed.");
fused_var_set.insert(fused_var_name);
fused_vars_name.emplace(var_name, fused_var_name);
}
......@@ -109,16 +108,16 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
auto &params_and_dense_grads =
result.Get<details::ParamsAndGrads>(details::kParamsAndDenseGrads);
PADDLE_ENFORCE_LE(
params_and_dense_grads.size(), aux_var_set.at(kGrad).size(),
params_and_dense_grads.size(), aux_var_map.at(kGrad).size(),
"The number of dense gradients should be little than optimizer ops.");
std::unordered_set<std::string> opt_grad_set(aux_var_set.at(kGrad).size());
std::unordered_set<std::string> opt_grad_set(aux_var_map.at(kGrad).size());
for (auto &p_g : params_and_dense_grads) {
opt_grad_set.insert(p_g.second);
}
std::vector<size_t> new_grad_idx;
for (size_t idx = 0; idx < aux_var_set.at(kGrad).size(); ++idx) {
auto &grad = aux_var_set.at(kGrad).at(idx);
for (size_t idx = 0; idx < aux_var_map.at(kGrad).size(); ++idx) {
auto &grad = aux_var_map.at(kGrad).at(idx);
if (!opt_grad_set.count(grad)) {
new_grad_idx.emplace_back(idx);
}
......@@ -137,20 +136,22 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
auto &fused_grad = result.Get<details::FusedGrads>(details::kFusedGrads);
PADDLE_ENFORCE_NE(fused_grad.size(), 0,
"The fused gradient should not be empty.");
PADDLE_ENFORCE_EQ(fused_grad.size(), 1,
"Because the dtype of those gradients "
"is not unified, so the number of fused gradients is "
"more than one, but it is not supported currently.");
if (fused_grad.size() > 1) {
// Note(chenweihang): Because the dtypes of those gradients are not
// unified, the number of fused gradients is more than one,
// which is not supported currently.
return;
}
auto &fused_vars = result.Get<details::FusedVars>(details::kFusedVars);
auto iter =
std::find(fused_vars.begin(), fused_vars.end(), fused_grad.front());
PADDLE_ENFORCE_EQ(iter != fused_vars.end(), true,
"Not find the fused_grad.");
"Not found the fused gradient variable.");
fused_vars_name[kGrad] = fused_grad.front();
// Sort the parameters and auxiliary variables according
// to parameters' name to make variables' name correspond correctly.
SortParametersAndAuxVars(params_and_dense_grads, &aux_var_set,
SortParametersAndAuxVars(params_and_dense_grads, &aux_var_map,
&opt_nodes);
grad_fused = true;
} else {
......@@ -158,33 +159,54 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
if (new_grad_idx.size() == 1) return;
// NOTE(zcd): If the gradients of backward stage and optimization stage
// have diff, Only take care of the the gradient of optimization stage.
GradientsFilter(new_grad_idx, &opt_nodes, &aux_var_set);
GradientsFilter(new_grad_idx, &opt_nodes, &aux_var_map);
}
}
// Check dtype
auto dtype = GetDtypeOfVar(vars_info, aux_var_set.at(kParam).front());
for (auto vars : aux_var_set) {
// Pass pre-condition check: check dtype of fusing vars
auto fusing_var_dtype =
GetDtypeOfVar(vars_info, aux_var_map.at(kParam).front());
for (auto vars : aux_var_map) {
for (auto &var_name : vars.second) {
PADDLE_ENFORCE_EQ(dtype, GetDtypeOfVar(vars_info, var_name));
if (fusing_var_dtype != GetDtypeOfVar(vars_info, var_name)) {
// Note(chenweihang): Currently the fuse_optimizer_ops strategy
// in mixed precision scenarios is not yet supported.
return;
}
}
}
// Pass pre-condition check: gradients generated op kernel
auto fusing_grad_var_names = aux_var_map.at(kGrad);
for (auto grad_var_name : fusing_grad_var_names) {
if (!GradGeneratedOpKernelCheck(vars_info, grad_var_name)) {
// Note(chenweihang): Currently the fuse_optimizer_ops strategy is risky
// when the operator that generates a gradient has a kernel for only the
// CPU or only the GPU device, so fusion is skipped in that case.
return;
}
}
LOG(WARNING) << "Find " << fuse_op_type << " operators : " << opt_ops_num
<< ", and " << opt_nodes.size() << " for dense gradients. "
<< "To make the speed faster, those optimization are fused "
"during training.";
// Step 4: Alloc continuous space for Parameters and AuxiliaryVar(e.g.
// Moment1, Moment2, Beta1Pow, Beta2Pow) of all the optimizer ops
// separately.
if (!grad_fused) {
InitFusedGradsAndAllocSpaceForGrads(
aux_var_set.at(kParam), aux_var_set.at(kGrad),
fused_vars_name.at(kGrad), dtype, &result);
FuseGradientsToContinuousSpace(
aux_var_map.at(kParam), aux_var_map.at(kGrad),
fused_vars_name.at(kGrad), fusing_var_dtype, &result);
}
aux_var_names.pop_back();
InitFusedVarsAndAllocSpaceForVars(aux_var_names, aux_var_set, fused_vars_name,
dtype, &result);
FuseVarsToContinuousSpace(aux_var_names, aux_var_map, fused_vars_name,
fusing_var_dtype, &result);
// Step 5: Fuse optimizer Ops and Scale Ops
auto *fused_opt_node =
FuseOptimizerOps(aux_var_set, fused_vars_name, opt_nodes, &result);
FuseOptimizerOps(aux_var_map, fused_vars_name, opt_nodes, &result);
InsertInputAndOutputForFusedOpNode(opt_nodes, graph, fused_opt_node);
// Step 6: Remove optimizer Ops
......@@ -231,11 +253,54 @@ bool FuseOptimizerOpPass::HasVarDepsBetweenOps(
return false;
}
// Reports whether the kernels registered for `op_type` include at least one
// kernel on a CPU place and at least one on a GPU place.
//
// An op type that has no entry in OperatorWithKernel::AllOpKernels() is not
// rejected: the function returns true for it.
bool FuseOptimizerOpPass::OpWithKernelSupportCPUAndGPU(
    const std::string &op_type) const {
  const auto &kernel_registry = OperatorWithKernel::AllOpKernels();
  const auto found = kernel_registry.find(op_type);
  // Ops without any registered kernel are skipped (treated as acceptable).
  if (found == kernel_registry.end()) {
    return true;
  }

  bool has_cpu_kernel = false;
  bool has_gpu_kernel = false;
  for (const auto &kernel_entry : found->second) {
    const auto &kernel_place = kernel_entry.first.place_;
    has_cpu_kernel |= platform::is_cpu_place(kernel_place);
    has_gpu_kernel |= platform::is_gpu_place(kernel_place);
  }
  VLOG(6) << "Op check: " << op_type << ", support CPU: " << has_cpu_kernel
          << ", support GPU: " << has_gpu_kernel;
  return has_cpu_kernel && has_gpu_kernel;
}
// Checks every operator that writes the gradient variable `grad_var_name`.
//
// `vars_info` maps a variable name to the graph nodes carrying that name.
// The op types of all op nodes found among the inputs of those variable nodes
// are collected (de-duplicated), and each type is passed to
// OpWithKernelSupportCPUAndGPU.
//
// Returns false as soon as one collected op type fails that check, true
// otherwise (including when no producing op is found).
// Note: `vars_info.at()` throws std::out_of_range when `grad_var_name` is not
// a key of `vars_info`.
bool FuseOptimizerOpPass::GradGeneratedOpKernelCheck(
    const std::unordered_map<std::string, std::vector<ir::Node *>> &vars_info,
    const std::string &grad_var_name) const {
  // Bind by reference: copying the node vector is unnecessary for a read-only
  // traversal.
  const auto &grad_var_nodes = vars_info.at(grad_var_name);

  // Several var nodes may share one producer type; check each type only once.
  std::unordered_set<std::string> check_op_set;
  for (auto *var_node : grad_var_nodes) {
    for (auto *in_node : var_node->inputs) {
      if (in_node->IsOp() && in_node->Op()) {
        check_op_set.emplace(in_node->Op()->Type());
      }
    }
  }

  for (const auto &op_type : check_op_set) {
    if (!OpWithKernelSupportCPUAndGPU(op_type)) {
      return false;
    }
  }
  return true;
}
void FuseOptimizerOpPass::GradientsFilter(
const std::vector<size_t> &new_grad_idx, std::vector<Node *> *opt_nodes,
std::unordered_map<std::string, std::vector<std::string>> *aux_var_set)
std::unordered_map<std::string, std::vector<std::string>> *aux_var_map)
const {
for (auto &aux_vars : *aux_var_set) {
for (auto &aux_vars : *aux_var_map) {
std::vector<std::string> sorted_vars;
sorted_vars.reserve(aux_vars.second.size());
for (size_t i : new_grad_idx) {
......@@ -257,7 +322,7 @@ void FuseOptimizerOpPass::GradientsFilter(
std::swap(*opt_nodes, sorted_ops);
}
void FuseOptimizerOpPass::InitFusedGradsAndAllocSpaceForGrads(
void FuseOptimizerOpPass::FuseGradientsToContinuousSpace(
const std::vector<std::string> &params,
const std::vector<std::string> &grads, const std::string &fused_grad_name,
const proto::VarType::Type &dtype, ir::Graph *result) const {
......@@ -268,11 +333,12 @@ void FuseOptimizerOpPass::InitFusedGradsAndAllocSpaceForGrads(
// The Gradients should not be reused during memory optimization.
for (auto &grad_var_name : grads) {
auto iter = vars_info.find(grad_var_name);
PADDLE_ENFORCE_EQ(iter != vars_info.end(), true, "%s is not found.",
grad_var_name);
PADDLE_ENFORCE_EQ(!iter->second.empty(), true, "%s is not found.",
grad_var_name);
PADDLE_ENFORCE_NOT_NULL(iter->second.front()->Var());
PADDLE_ENFORCE_EQ(iter != vars_info.end(), true,
"The gradient variable %s is not found.", grad_var_name);
PADDLE_ENFORCE_EQ(!iter->second.empty(), true,
"The gradient var node %s is not found.", grad_var_name);
PADDLE_ENFORCE_NOT_NULL(iter->second.front()->Var(),
"The gradient var node is null.");
PADDLE_ENFORCE_EQ(
IsLoDTensorType(iter->second.front()->Var()->GetType()), true,
"Currently the gradient type only should be LoDTensor when "
......@@ -287,8 +353,8 @@ void FuseOptimizerOpPass::InitFusedGradsAndAllocSpaceForGrads(
ProgramDesc &program_desc =
result->Get<details::ProgramDescs>(details::kProgramDescs).back();
auto *global_block = program_desc.MutableBlock(0);
AppendAllocContinuousSpace(params, grads, fused_grad_name, dtype,
global_block, false, false);
AppendCoalesceTensorOp(params, grads, fused_grad_name, dtype, global_block,
false, false);
}
std::unordered_map<std::string, std::vector<Node *>>
......@@ -297,7 +363,7 @@ FuseOptimizerOpPass::GetVarInfo(const Graph &result) const {
for (Node *node : result.Nodes()) {
if (node->IsVar() && node->Var()) {
// Note: The graph may have the same name node. For example, parameter
// is the input of operator and it also is the output of optimizer;
// is the input of optimizer and it also is the output of optimizer;
vars[node->Var()->Name()].emplace_back(node);
}
}
......@@ -314,11 +380,12 @@ const VarDesc *FuseOptimizerOpPass::GetVarDescFromVarsInfo(
const std::unordered_map<std::string, std::vector<Node *>> &vars_info,
const std::string &var_name) const {
auto grad_iter = vars_info.find(var_name);
PADDLE_ENFORCE_EQ(grad_iter != vars_info.end(), true, "%s is not found.",
var_name);
PADDLE_ENFORCE_EQ(!grad_iter->second.empty(), true, "%s is not found.",
var_name);
PADDLE_ENFORCE_NOT_NULL(grad_iter->second.front()->Var());
PADDLE_ENFORCE_EQ(grad_iter != vars_info.end(), true,
"The gradient varibale %s is not found.", var_name);
PADDLE_ENFORCE_EQ(!grad_iter->second.empty(), true,
"The gradient var node %s is not found.", var_name);
PADDLE_ENFORCE_NOT_NULL(grad_iter->second.front()->Var(),
"The gradient var node is null.");
return grad_iter->second.front()->Var();
}
......@@ -336,10 +403,10 @@ proto::VarType::Type FuseOptimizerOpPass::GetTypeOfVar(
return var_desc->GetType();
}
void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars(
void FuseOptimizerOpPass::FuseVarsToContinuousSpace(
const std::vector<std::string> &aux_var_names,
const std::unordered_map<std::string, std::vector<std::string>>
&aux_var_set,
&aux_var_map,
const std::unordered_map<std::string, std::string> &fused_vars_name,
const proto::VarType::Type &dtype, ir::Graph *result) const {
// Define Ops
......@@ -348,18 +415,19 @@ void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars(
result->Get<details::ProgramDescs>(details::kProgramDescs).back();
auto *global_block = program_desc.MutableBlock(0);
for (auto &var_name : aux_var_names) {
AppendAllocContinuousSpace(
aux_var_set.at(var_name), aux_var_set.at(var_name),
fused_vars_name.at(var_name), dtype, global_block, true);
AppendCoalesceTensorOp(aux_var_map.at(var_name), aux_var_map.at(var_name),
fused_vars_name.at(var_name), dtype, global_block,
true);
}
}
void FuseOptimizerOpPass::SortParametersAndAuxVars(
const std::vector<std::pair<std::string, std::string>> &params_grads,
std::unordered_map<std::string, std::vector<std::string>> *aux_vars_set,
std::unordered_map<std::string, std::vector<std::string>> *aux_var_map,
std::vector<ir::Node *> *ops) const {
PADDLE_ENFORCE_NE(aux_vars_set->count(kGrad), static_cast<size_t>(0));
auto &grad_vec = aux_vars_set->at(kGrad);
PADDLE_ENFORCE_NE(aux_var_map->count(kGrad), static_cast<size_t>(0),
"The gradient variable doesn‘t exist.");
auto &grad_vec = aux_var_map->at(kGrad);
std::vector<size_t> grad_sort_idx;
grad_sort_idx.reserve(grad_vec.size());
......@@ -367,12 +435,12 @@ void FuseOptimizerOpPass::SortParametersAndAuxVars(
for (auto &p_g : params_grads) {
auto iter = std::find(grad_vec.begin(), grad_vec.end(), p_g.second);
PADDLE_ENFORCE_EQ(iter != grad_vec.end(), true,
"%s is not found in grad_vec", p_g.second);
"%s is not found in gradient vector", p_g.second);
auto idx = std::distance(grad_vec.begin(), iter);
grad_sort_idx.emplace_back(idx);
}
for (auto &aux_vars : *aux_vars_set) {
for (auto &aux_vars : *aux_var_map) {
std::vector<std::string> sorted_vars;
sorted_vars.reserve(aux_vars.second.size());
for (size_t i = 0; i < aux_vars.second.size(); ++i) {
......@@ -397,23 +465,24 @@ void FuseOptimizerOpPass::SortParametersAndAuxVars(
std::swap(*ops, sorted_ops);
}
void FuseOptimizerOpPass::GetSpecifiedOpsAndVars(
// Collects, for each input slot listed in `aux_vars_name`, the argument name
// bound to that slot in every optimizer op of `opt_nodes`.
//
// For each node (in `opt_nodes` order) and each slot name, the single argument
// of that input slot is appended to `(*aux_args_name)[slot]`, so position i of
// every vector corresponds to `opt_nodes[i]`.
//
// Enforces that each op has exactly one argument for each listed slot.
void FuseOptimizerOpPass::GetFusingVarNamesMap(
    const std::vector<std::string> &aux_vars_name,
    const std::vector<ir::Node *> &opt_nodes,
    std::unordered_map<std::string, std::vector<std::string>> *aux_args_name)
    const {
  for (auto &node : opt_nodes) {
    for (auto &var_n : aux_vars_name) {
      const auto &arg_names = node->Op()->Input(var_n);
      PADDLE_ENFORCE_EQ(arg_names.size(), static_cast<size_t>(1),
                        "The input variable of optimizer to be fused is "
                        "invalid. Expected %s to have exactly one %s input.",
                        node->Op()->Type(), var_n);
      (*aux_args_name)[var_n].emplace_back(arg_names[0]);
    }
  }
}
void FuseOptimizerOpPass::AppendAllocContinuousSpace(
void FuseOptimizerOpPass::AppendCoalesceTensorOp(
const std::vector<std::string> &in_args,
const std::vector<std::string> &out_args, const std::string &fused_out_arg,
const proto::VarType::Type &dtype, BlockDesc *global_block, bool copy_data,
......@@ -452,9 +521,11 @@ void FuseOptimizerOpPass::InsertInputAndOutputForFusedOpNode(
auto deal_with_ctrl_vars = [&out_dep_vars, &not_useful_vars,
&fused_opt_node](ir::Node *ctr_var_node) {
PADDLE_ENFORCE_EQ(ctr_var_node->inputs.size(), 1);
PADDLE_ENFORCE_EQ(ctr_var_node->inputs.size(), 1,
"The control var node has nultiple inputs.");
if (ctr_var_node->inputs.front() == fused_opt_node) {
PADDLE_ENFORCE_GT(ctr_var_node->outputs.size(), 0);
PADDLE_ENFORCE_GT(ctr_var_node->outputs.size(), 0,
"The control var node has no output.");
auto output_ops = ctr_var_node->outputs;
output_ops.erase(std::remove_if(output_ops.begin(), output_ops.end(),
[&fused_opt_node](const ir::Node *node) {
......
......@@ -55,25 +55,26 @@ class FuseOptimizerOpPass : public ir::Pass {
const std::unordered_map<std::string, std::string> &fused_vars_name,
const std::vector<ir::Node *> &adam_ops, ir::Graph *graph) const = 0;
void GetSpecifiedOpsAndVars(
void GetFusingVarNamesMap(
const std::vector<std::string> &aux_vars_name,
const std::vector<ir::Node *> &opt_nodes,
std::unordered_map<std::string, std::vector<std::string>> *aux_args_name)
const;
void AppendAllocContinuousSpace(const std::vector<std::string> &in_args,
const std::vector<std::string> &out_args,
const std::string &fused_out_arg,
const proto::VarType::Type &dtype,
BlockDesc *global_block, bool copy_data,
bool check_name = true) const;
void AppendCoalesceTensorOp(const std::vector<std::string> &in_args,
const std::vector<std::string> &out_args,
const std::string &fused_out_arg,
const proto::VarType::Type &dtype,
BlockDesc *global_block, bool copy_data,
bool check_name = true) const;
void InitFusedGradsAndAllocSpaceForGrads(
const std::vector<std::string> &params,
const std::vector<std::string> &grads, const std::string &fused_grad_name,
const proto::VarType::Type &dtype, ir::Graph *result) const;
void FuseGradientsToContinuousSpace(const std::vector<std::string> &params,
const std::vector<std::string> &grads,
const std::string &fused_grad_name,
const proto::VarType::Type &dtype,
ir::Graph *result) const;
void InitFusedVarsAndAllocSpaceForVars(
void FuseVarsToContinuousSpace(
const std::vector<std::string> &aux_var_names,
const std::unordered_map<std::string, std::vector<std::string>>
&aux_var_set,
......@@ -83,6 +84,12 @@ class FuseOptimizerOpPass : public ir::Pass {
std::unordered_map<std::string, std::vector<Node *>> GetVarInfo(
const Graph &result) const;
// Returns true when the kernels registered for `op_type` cover both a CPU
// place and a GPU place, or when `op_type` has no registered kernels at all.
bool OpWithKernelSupportCPUAndGPU(const std::string &op_type) const;
// Returns true when every op that produces the variable `grad_var_name`
// (found through the input op nodes of its entries in `vars_info`) passes
// OpWithKernelSupportCPUAndGPU; returns false otherwise.
bool GradGeneratedOpKernelCheck(
const std::unordered_map<std::string, std::vector<ir::Node *>> &vars_info,
const std::string &grad_var_name) const;
proto::VarType::Type GetDtypeOfVar(
const std::unordered_map<std::string, std::vector<ir::Node *>> &vars_info,
const std::string &name) const;
......
......@@ -24,7 +24,7 @@ namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class CoalesceTensorOp : public framework::OpKernel<T> {
class CoalesceTensorOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto &in_var_names = context.Inputs("Input");
......@@ -32,24 +32,39 @@ class CoalesceTensorOp : public framework::OpKernel<T> {
auto &in_vars = context.MultiInputVar("Input");
auto out_vars = context.MultiOutputVar("Output");
PADDLE_ENFORCE_GT(in_var_names.size(), static_cast<size_t>(0));
PADDLE_ENFORCE_EQ(in_var_names.size(), out_var_names.size());
PADDLE_ENFORCE_GT(in_var_names.size(), static_cast<size_t>(0),
"The CoalesceTensorOp has no input.");
PADDLE_ENFORCE_EQ(
in_var_names.size(), out_var_names.size(),
"The number of CoalesceTensorOp's input and output is not match.");
// Input & Output check: only support LoDTensor
for (size_t i = 0; i < in_var_names.size(); ++i) {
// Only support LoDTensor
PADDLE_ENFORCE_NOT_NULL(in_vars[i], "%s should not be nullptr,",
in_var_names[i]);
PADDLE_ENFORCE_NOT_NULL(out_vars[i], "%s should not be nullptr,",
out_var_names[i]);
PADDLE_ENFORCE(in_vars[i]->IsType<framework::LoDTensor>());
PADDLE_ENFORCE(out_vars[i]->IsType<framework::LoDTensor>());
PADDLE_ENFORCE_NOT_NULL(
in_vars[i],
"The input variable %s of CoalesceTensorOp does not exist.",
in_var_names[i]);
PADDLE_ENFORCE_NOT_NULL(
out_vars[i],
"The output variable %s of CoalesceTensorOp does not exist.",
out_var_names[i]);
PADDLE_ENFORCE_EQ(
in_vars[i]->IsType<framework::LoDTensor>(), true,
"The input variable %s of CoalesceTensorOp is not LoDTensor.",
in_var_names[i]);
PADDLE_ENFORCE_EQ(
out_vars[i]->IsType<framework::LoDTensor>(), true,
"The output variable %s of CoalesceTensorOp is not LoDTensor.",
in_var_names[i]);
}
auto in_tensors = context.MultiInput<framework::LoDTensor>("Input");
if (context.Attr<bool>("check_name")) {
for (size_t i = 0; i < in_var_names.size(); ++i) {
PADDLE_ENFORCE_EQ(in_var_names[i], out_var_names[i]);
PADDLE_ENFORCE_EQ(
in_var_names[i], out_var_names[i],
"The input and output variable of CoalesceTensorOp is different.");
}
} else {
// Init the output as input
......@@ -124,8 +139,8 @@ class CoalesceTensorOp : public framework::OpKernel<T> {
std::stringstream ss;
ss << "alloc_space_for_vars: ";
for (size_t i = 0; i < var_names.size(); ++i) {
PADDLE_ENFORCE(lod_tensors[i]->IsInitialized(), "%s is not initialized.",
var_names[i]);
PADDLE_ENFORCE_EQ(lod_tensors[i]->IsInitialized(), true,
"%s is not initialized.", var_names[i]);
auto size = lod_tensors[i]->numel();
PADDLE_ENFORCE_GT(size, 0);
......@@ -140,14 +155,14 @@ class CoalesceTensorOp : public framework::OpKernel<T> {
}
};
class AllocContinuousSpaceOp : public framework::OperatorWithKernel {
// Operator class for the coalesce_tensor op. It reuses the constructors of
// framework::OperatorWithKernel and adds no state of its own.
class CoalesceTensorOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
// Empty override: no shape inference is performed here.
// NOTE(review): presumably output shapes are established by the kernel at
// run time - confirm.
void InferShape(framework::InferShapeContext *ctx) const override {}
};
class AllocContinuousSpaceOpMaker : public framework::OpProtoAndCheckerMaker {
class CoalesceTensorOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("Input",
......@@ -179,7 +194,7 @@ class AllocContinuousSpaceOpMaker : public framework::OpProtoAndCheckerMaker {
"they are the same separately.")
.SetDefault(false);
AddComment(R"DOC(
AllocContinuousSpace Operator.
CoalesceTensor Operator.
coalesce_tensor is used to make the address of Output
continuous according to the Input. This Op will alloc a big tensor
......@@ -200,22 +215,22 @@ setting the Output with a constant value.
} // namespace operators
} // namespace paddle
REGISTER_OPERATOR(coalesce_tensor, paddle::operators::AllocContinuousSpaceOp,
paddle::operators::AllocContinuousSpaceOpMaker);
REGISTER_OPERATOR(coalesce_tensor, paddle::operators::CoalesceTensorOp,
paddle::operators::CoalesceTensorOpMaker);
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CPU_KERNEL(
coalesce_tensor,
ops::CoalesceTensorOp<paddle::platform::CPUDeviceContext, plat::float16>,
ops::CoalesceTensorOp<paddle::platform::CPUDeviceContext, int>,
ops::CoalesceTensorOp<paddle::platform::CPUDeviceContext, float>,
ops::CoalesceTensorOp<paddle::platform::CPUDeviceContext, double>);
ops::CoalesceTensorOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::CoalesceTensorOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::CoalesceTensorOpKernel<paddle::platform::CPUDeviceContext, double>);
#ifdef PADDLE_WITH_CUDA
REGISTER_OP_CUDA_KERNEL(
coalesce_tensor,
ops::CoalesceTensorOp<paddle::platform::CUDADeviceContext, plat::float16>,
ops::CoalesceTensorOp<paddle::platform::CUDADeviceContext, int>,
ops::CoalesceTensorOp<paddle::platform::CUDADeviceContext, float>,
ops::CoalesceTensorOp<paddle::platform::CUDADeviceContext, double>);
ops::CoalesceTensorOpKernel<paddle::platform::CUDADeviceContext,
plat::float16>,
ops::CoalesceTensorOpKernel<paddle::platform::CUDADeviceContext, int>,
ops::CoalesceTensorOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::CoalesceTensorOpKernel<paddle::platform::CUDADeviceContext, double>);
#endif
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册