提交 101a2b61 编写于 作者: C chengduo 提交者: gongweibao

Add dtype for coalesce_tensor_op (#20016)

Add dtype for coalesce_tensor_op
上级 f04f2b23
......@@ -276,7 +276,7 @@ class CoalesceGradTensorPass : public ir::Pass {
}
auto dtype =
GetDtypeOfVar(vars_info, group_params_grads->at(i).front().first);
GetDtypeOfVar(vars_info, group_params_grads->at(i).front().second);
VLOG(10) << out.str()
<< ", group size:" << group_params_grads->at(i).size()
<< ", group memory size:" << static_cast<double>(gps_size) / kMB
......@@ -465,28 +465,34 @@ class CoalesceGradTensorPass : public ir::Pass {
std::vector<std::string> params_name;
grads_name.reserve(params_grads.size());
params_name.reserve(params_grads.size());
auto dtype = GetDtypeOfVar(vars_info, params_grads.front().second);
for (auto &p_g : params_grads) {
params_name.emplace_back(p_g.first);
grads_name.emplace_back(p_g.second);
auto next_dtype = GetDtypeOfVar(vars_info, p_g.second);
PADDLE_ENFORCE_EQ(next_dtype, dtype);
}
result->Get<details::ProgramDescs>(details::kProgramDescs).emplace_back();
ProgramDesc &program_desc =
result->Get<details::ProgramDescs>(details::kProgramDescs).back();
auto *global_block = program_desc.MutableBlock(0);
AppendAllocSpaceForVarsOp(params_name, grads_name, fused_var_name,
AppendAllocSpaceForVarsOp(params_name, grads_name, fused_var_name, dtype,
global_block);
}
// Appends a `coalesce_tensor` op to `global_block` that allocates one
// contiguous buffer (`fused_var_name`) backing all gradients in
// `grads_name`, with `params_name` as the paired inputs. The common `dtype`
// is stamped onto the op so the kernel no longer has to infer it from the
// input tensors.
void AppendAllocSpaceForVarsOp(const std::vector<std::string> &params_name,
const std::vector<std::string> &grads_name,
const std::string &fused_var_name,
const proto::VarType::Type &dtype,
BlockDesc *global_block) const {
auto op_desc = global_block->AppendOp();
op_desc->SetType("coalesce_tensor");
op_desc->SetInput("Input", params_name);
op_desc->SetOutput("Output", grads_name);
// "FusedOutput" is the single contiguous buffer the op allocates.
op_desc->SetOutput("FusedOutput", {fused_var_name});
// Stored as int because the op maker declares AddAttr<int>("dtype", ...).
op_desc->SetAttr("dtype", static_cast<int>(dtype));
}
};
} // namespace ir
......
......@@ -162,17 +162,25 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
}
}
// Check dtype
auto dtype = GetDtypeOfVar(vars_info, aux_var_set.at(kParam).front());
for (auto vars : aux_var_set) {
for (auto &var_name : vars.second) {
PADDLE_ENFORCE_EQ(dtype, GetDtypeOfVar(vars_info, var_name));
}
}
// Step 4: Alloc continuous space for Parameters and AuxiliaryVar(e.g.
// Moment1, Moment2, Beta1Pow, Beta2Pow) of all the optimizer ops
// separately.
if (!grad_fused) {
InitFusedGradsAndAllocSpaceForGrads(aux_var_set.at(kParam),
aux_var_set.at(kGrad),
fused_vars_name.at(kGrad), &result);
InitFusedGradsAndAllocSpaceForGrads(
aux_var_set.at(kParam), aux_var_set.at(kGrad),
fused_vars_name.at(kGrad), dtype, &result);
}
aux_var_names.pop_back();
InitFusedVarsAndAllocSpaceForVars(aux_var_names, aux_var_set, fused_vars_name,
&result);
dtype, &result);
// Step 5: Fuse optimizer Ops and Scale Ops
auto *fused_opt_node =
......@@ -252,7 +260,7 @@ void FuseOptimizerOpPass::GradientsFilter(
void FuseOptimizerOpPass::InitFusedGradsAndAllocSpaceForGrads(
const std::vector<std::string> &params,
const std::vector<std::string> &grads, const std::string &fused_grad_name,
ir::Graph *result) const {
const proto::VarType::Type &dtype, ir::Graph *result) const {
auto &pinned_var_set =
result->GetOrInit<details::PinnedVars>(details::kPinnedVars);
......@@ -279,8 +287,8 @@ void FuseOptimizerOpPass::InitFusedGradsAndAllocSpaceForGrads(
ProgramDesc &program_desc =
result->Get<details::ProgramDescs>(details::kProgramDescs).back();
auto *global_block = program_desc.MutableBlock(0);
AppendAllocContinuousSpace(params, grads, fused_grad_name, global_block,
false, false);
AppendAllocContinuousSpace(params, grads, fused_grad_name, dtype,
global_block, false, false);
}
std::unordered_map<std::string, std::vector<Node *>>
......@@ -302,15 +310,30 @@ bool FuseOptimizerOpPass::IsLoDTensorType(
return type == proto::VarType::LOD_TENSOR;
}
// Looks up `var_name` in `vars_info` and returns the VarDesc attached to its
// first graph node.
//
// Enforces three preconditions, each with its own diagnostic:
//   1) the name exists in the map,
//   2) the name maps to at least one node,
//   3) that node actually carries a VarDesc.
const VarDesc *FuseOptimizerOpPass::GetVarDescFromVarsInfo(
    const std::unordered_map<std::string, std::vector<Node *>> &vars_info,
    const std::string &var_name) const {
  auto iter = vars_info.find(var_name);
  PADDLE_ENFORCE_EQ(iter != vars_info.end(), true, "%s is not found.",
                    var_name);
  // Distinct message from the lookup failure above: here the name was found
  // but its node list is empty, which the old duplicated "is not found"
  // message would have misreported.
  PADDLE_ENFORCE_EQ(!iter->second.empty(), true,
                    "The var nodes of %s are empty.", var_name);
  // Fetch Var() once instead of calling it twice.
  auto *var_desc = iter->second.front()->Var();
  PADDLE_ENFORCE_NOT_NULL(var_desc);
  return var_desc;
}
// Returns the data type recorded on `name`'s VarDesc; all existence checks
// are delegated to GetVarDescFromVarsInfo.
proto::VarType::Type FuseOptimizerOpPass::GetDtypeOfVar(
    const std::unordered_map<std::string, std::vector<ir::Node *>> &vars_info,
    const std::string &name) const {
  return GetVarDescFromVarsInfo(vars_info, name)->GetDataType();
}
// Returns the variable type (e.g. LOD_TENSOR) recorded on `name`'s VarDesc.
//
// NOTE(review): the diff residue in this span carried BOTH the old inline
// implementation (manual find + enforce chain) and the refactored one; only
// the refactored version, which delegates validation to
// GetVarDescFromVarsInfo, is kept here.
proto::VarType::Type FuseOptimizerOpPass::GetTypeOfVar(
    const std::unordered_map<std::string, std::vector<Node *>> &vars_info,
    const std::string &name) const {
  auto var_desc = GetVarDescFromVarsInfo(vars_info, name);
  return var_desc->GetType();
}
void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars(
......@@ -318,7 +341,7 @@ void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars(
const std::unordered_map<std::string, std::vector<std::string>>
&aux_var_set,
const std::unordered_map<std::string, std::string> &fused_vars_name,
ir::Graph *result) const {
const proto::VarType::Type &dtype, ir::Graph *result) const {
// Define Ops
result->Get<details::ProgramDescs>(details::kProgramDescs).emplace_back();
ProgramDesc &program_desc =
......@@ -327,7 +350,7 @@ void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars(
for (auto &var_name : aux_var_names) {
AppendAllocContinuousSpace(
aux_var_set.at(var_name), aux_var_set.at(var_name),
fused_vars_name.at(var_name), global_block, true);
fused_vars_name.at(var_name), dtype, global_block, true);
}
}
......@@ -393,7 +416,8 @@ void FuseOptimizerOpPass::GetSpecifiedOpsAndVars(
void FuseOptimizerOpPass::AppendAllocContinuousSpace(
const std::vector<std::string> &in_args,
const std::vector<std::string> &out_args, const std::string &fused_out_arg,
BlockDesc *global_block, bool copy_data, bool check_name) const {
const proto::VarType::Type &dtype, BlockDesc *global_block, bool copy_data,
bool check_name) const {
auto op_desc = global_block->AppendOp();
op_desc->SetType("coalesce_tensor");
op_desc->SetInput("Input", in_args);
......@@ -401,6 +425,7 @@ void FuseOptimizerOpPass::AppendAllocContinuousSpace(
op_desc->SetOutput("FusedOutput", {fused_out_arg});
op_desc->SetAttr("copy_data", copy_data);
op_desc->SetAttr("check_name", check_name);
op_desc->SetAttr("dtype", static_cast<int>(dtype));
}
void FuseOptimizerOpPass::InsertInputAndOutputForFusedOpNode(
......
......@@ -64,28 +64,37 @@ class FuseOptimizerOpPass : public ir::Pass {
void AppendAllocContinuousSpace(const std::vector<std::string> &in_args,
const std::vector<std::string> &out_args,
const std::string &fused_out_arg,
const proto::VarType::Type &dtype,
BlockDesc *global_block, bool copy_data,
bool check_name = true) const;
void InitFusedGradsAndAllocSpaceForGrads(
const std::vector<std::string> &params,
const std::vector<std::string> &grads, const std::string &fused_grad_name,
ir::Graph *result) const;
const proto::VarType::Type &dtype, ir::Graph *result) const;
void InitFusedVarsAndAllocSpaceForVars(
const std::vector<std::string> &aux_var_names,
const std::unordered_map<std::string, std::vector<std::string>>
&aux_var_set,
const std::unordered_map<std::string, std::string> &fused_vars_name,
ir::Graph *result) const;
const proto::VarType::Type &dtype, ir::Graph *result) const;
std::unordered_map<std::string, std::vector<Node *>> GetVarInfo(
const Graph &result) const;
proto::VarType::Type GetDtypeOfVar(
const std::unordered_map<std::string, std::vector<ir::Node *>> &vars_info,
const std::string &name) const;
proto::VarType::Type GetTypeOfVar(
const std::unordered_map<std::string, std::vector<Node *>> &var_nodes,
const std::string &name) const;
const VarDesc *GetVarDescFromVarsInfo(
const std::unordered_map<std::string, std::vector<Node *>> &vars_info,
const std::string &var_name) const;
void GradientsFilter(const std::vector<size_t> &new_grad_idx,
std::vector<Node *> *opt_nodes,
std::unordered_map<std::string, std::vector<std::string>>
......
......@@ -23,9 +23,6 @@
namespace paddle {
namespace operators {
static framework::proto::VarType::Type kDefaultDtype =
framework::proto::VarType::Type::VarType_Type_BOOL;
template <typename DeviceContext, typename T>
class CoalesceTensorOp : public framework::OpKernel<T> {
public:
......@@ -66,8 +63,10 @@ class CoalesceTensorOp : public framework::OpKernel<T> {
// Get numel and dtype
size_t numel = 0;
auto dtype = kDefaultDtype;
GetMemSizeAndDtype(in_tensors, in_var_names, &numel, &dtype,
auto dtype = static_cast<framework::proto::VarType::Type>(
context.Attr<int>("dtype"));
size_t size_of_dtype = framework::SizeOfType(dtype);
GetMemSizeAndDtype(in_tensors, in_var_names, &numel, size_of_dtype,
context.GetPlace());
// Alloc the continuous space
......@@ -78,7 +77,6 @@ class CoalesceTensorOp : public framework::OpKernel<T> {
// Init the continuous space
auto out_tensors = context.MultiOutput<framework::LoDTensor>("Output");
size_t offset = 0;
size_t size_of_dtype = framework::SizeOfType(dtype);
if (context.Attr<bool>("copy_data")) {
for (size_t i = 0; i < in_var_names.size(); ++i) {
size_t len = static_cast<size_t>(in_tensors[i]->numel());
......@@ -120,27 +118,15 @@ class CoalesceTensorOp : public framework::OpKernel<T> {
void GetMemSizeAndDtype(
const std::vector<const framework::LoDTensor *> &lod_tensors,
const std::vector<std::string> var_names, size_t *numel,
framework::proto::VarType::Type *dtype,
const platform::Place &place) const {
const size_t &size_of_dtype, const platform::Place &place) const {
PADDLE_ENFORCE_EQ(lod_tensors.size(), var_names.size());
*numel = 0;
size_t size_of_dtype = 0;
std::stringstream ss;
ss << "alloc_space_for_vars: ";
for (size_t i = 0; i < var_names.size(); ++i) {
PADDLE_ENFORCE(lod_tensors[i]->IsInitialized(), "%s is not initialized.",
var_names[i]);
auto p_dtype = lod_tensors[i]->type();
if (*dtype == kDefaultDtype) {
PADDLE_ENFORCE_NE(p_dtype, kDefaultDtype, "%s's type should not be %s.",
var_names[i], kDefaultDtype);
*dtype = p_dtype;
size_of_dtype = framework::SizeOfType(p_dtype);
}
PADDLE_ENFORCE_EQ(p_dtype, *dtype, "Input vars is not equal.");
auto size = lod_tensors[i]->numel();
PADDLE_ENFORCE_GT(size, 0);
ss << "input(" << var_names[i] << ") dim:(" << lod_tensors[i]->dims()
......@@ -178,6 +164,7 @@ class AllocContinuousSpaceOpMaker : public framework::OpProtoAndCheckerMaker {
"(LoDTensor) The output tensor "
"of coalesce_tensor operator. And the tensors of"
" Output is sliced from the tensor of FusedOutput.");
AddAttr<int>("dtype", "The output data type.");
AddAttr<bool>("copy_data", "Whether to copy the Input value to Output.")
.SetDefault(false);
AddAttr<bool>("set_constant",
......
......@@ -25,7 +25,7 @@ alignment = 256
class TestAllocContinuousSpace(OpTest):
def setUp(self):
self.op_type = "coalesce_tensor"
self.dtype = np.float32
self.dtype, self.fluid_dtype = self.init_dtype()
attrs = self.init_attr()
self.copy_data = attrs["copy_data"]
self.constant = attrs["constant"]
......@@ -38,7 +38,7 @@ class TestAllocContinuousSpace(OpTest):
self.outputs = {'Output': self.Outputs, 'FusedOutput': self.FusedOutput}
def init_dtype(self):
    # Returns (numpy dtype used to build the input tensors,
    #          fluid VarType enum value for the op's "dtype" attr).
    # NOTE(review): the diff residue also carried the pre-change body
    # (`self.dtype = np.float32`); only the post-change return is kept.
    return np.float32, int(core.VarDesc.VarType.FP32)
def init_input(self):
inputs = []
......@@ -51,7 +51,12 @@ class TestAllocContinuousSpace(OpTest):
return inputs
def init_attr(self):
    # Base case: copy input values into the fused buffer, no constant fill.
    # "dtype" is the new attribute required by coalesce_tensor; its value is
    # produced by init_dtype() in setUp.
    return {
        "copy_data": True,
        "set_constant": False,
        "constant": 0.0,
        "dtype": self.fluid_dtype
    }
def init_output(self, input_list, set_constant, constant):
inputs = []
......@@ -82,7 +87,12 @@ class TestAllocContinuousSpace(OpTest):
class TestAllocContinuousSpace2(TestAllocContinuousSpace):
def init_attr(self):
    # Variant: do not copy data; instead fill the fused buffer with the
    # constant 0.5. "dtype" is the new attribute required by coalesce_tensor.
    return {
        "copy_data": False,
        "set_constant": True,
        "constant": 0.5,
        "dtype": self.fluid_dtype
    }
def test_check_output(self):
if core.is_compiled_with_cuda():
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册