Commit 101a2b61 authored by chengduo, committed by gongweibao

Add dtype for coalesce_tensor_op (#20016)

Add dtype for coalesce_tensor_op
Parent f04f2b23
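As context for the change below, here is a minimal Python-side sketch of appending a coalesce_tensor op with the new required "dtype" attribute. The program, variable names, and shapes are illustrative only (not taken from this commit), and the snippet assumes a Paddle build that already includes this change.

import paddle.fluid as fluid
from paddle.fluid import core

prog = fluid.Program()
block = prog.global_block()

# Hypothetical parameter/gradient variables; only their element type matters here.
params = [block.create_var(name="p%d" % i, shape=[4], dtype="float32") for i in range(2)]
grads = [block.create_var(name="p%d@GRAD" % i, shape=[4], dtype="float32") for i in range(2)]
fused = block.create_var(name="fused_grads", dtype="float32")

block.append_op(
    type="coalesce_tensor",
    inputs={"Input": params},
    outputs={"Output": grads, "FusedOutput": [fused]},
    attrs={
        "copy_data": False,
        "check_name": False,
        # New in this commit: the fused buffer's element type comes from this
        # attribute instead of being inferred from the first input tensor.
        "dtype": int(core.VarDesc.VarType.FP32),
    })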
@@ -276,7 +276,7 @@ class CoalesceGradTensorPass : public ir::Pass {
       }
       auto dtype =
-          GetDtypeOfVar(vars_info, group_params_grads->at(i).front().first);
+          GetDtypeOfVar(vars_info, group_params_grads->at(i).front().second);
       VLOG(10) << out.str()
                << ", group size:" << group_params_grads->at(i).size()
                << ", group memory size:" << static_cast<double>(gps_size) / kMB
@@ -465,28 +465,34 @@ class CoalesceGradTensorPass : public ir::Pass {
     std::vector<std::string> params_name;
     grads_name.reserve(params_grads.size());
     params_name.reserve(params_grads.size());
+    auto dtype = GetDtypeOfVar(vars_info, params_grads.front().second);
     for (auto &p_g : params_grads) {
       params_name.emplace_back(p_g.first);
       grads_name.emplace_back(p_g.second);
+      auto next_dtype = GetDtypeOfVar(vars_info, p_g.second);
+      PADDLE_ENFORCE_EQ(next_dtype, dtype);
     }
 
     result->Get<details::ProgramDescs>(details::kProgramDescs).emplace_back();
     ProgramDesc &program_desc =
         result->Get<details::ProgramDescs>(details::kProgramDescs).back();
     auto *global_block = program_desc.MutableBlock(0);
-    AppendAllocSpaceForVarsOp(params_name, grads_name, fused_var_name,
-                              global_block);
+    AppendAllocSpaceForVarsOp(params_name, grads_name, fused_var_name, dtype,
+                              global_block);
   }
 
   void AppendAllocSpaceForVarsOp(const std::vector<std::string> &params_name,
                                  const std::vector<std::string> &grads_name,
                                  const std::string &fused_var_name,
+                                 const proto::VarType::Type &dtype,
                                  BlockDesc *global_block) const {
     auto op_desc = global_block->AppendOp();
     op_desc->SetType("coalesce_tensor");
     op_desc->SetInput("Input", params_name);
     op_desc->SetOutput("Output", grads_name);
     op_desc->SetOutput("FusedOutput", {fused_var_name});
+    op_desc->SetAttr("dtype", static_cast<int>(dtype));
   }
 };
 
 }  // namespace ir
......
@@ -162,17 +162,25 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
       }
     }
 
+  // Check dtype
+  auto dtype = GetDtypeOfVar(vars_info, aux_var_set.at(kParam).front());
+  for (auto vars : aux_var_set) {
+    for (auto &var_name : vars.second) {
+      PADDLE_ENFORCE_EQ(dtype, GetDtypeOfVar(vars_info, var_name));
+    }
+  }
+
   // Step 4: Alloc continuous space for Parameters and AuxiliaryVar(e.g.
   // Moment1, Moment2, Beta1Pow, Beta2Pow) of all the optimizer ops
   // separately.
   if (!grad_fused) {
-    InitFusedGradsAndAllocSpaceForGrads(aux_var_set.at(kParam),
-                                        aux_var_set.at(kGrad),
-                                        fused_vars_name.at(kGrad), &result);
+    InitFusedGradsAndAllocSpaceForGrads(
+        aux_var_set.at(kParam), aux_var_set.at(kGrad),
+        fused_vars_name.at(kGrad), dtype, &result);
   }
   aux_var_names.pop_back();
   InitFusedVarsAndAllocSpaceForVars(aux_var_names, aux_var_set, fused_vars_name,
-                                    &result);
+                                    dtype, &result);
 
   // Step 5: Fuse optimizer Ops and Scale Ops
   auto *fused_opt_node =
@@ -252,7 +260,7 @@ void FuseOptimizerOpPass::GradientsFilter(
 void FuseOptimizerOpPass::InitFusedGradsAndAllocSpaceForGrads(
     const std::vector<std::string> &params,
     const std::vector<std::string> &grads, const std::string &fused_grad_name,
-    ir::Graph *result) const {
+    const proto::VarType::Type &dtype, ir::Graph *result) const {
   auto &pinned_var_set =
       result->GetOrInit<details::PinnedVars>(details::kPinnedVars);
@@ -279,8 +287,8 @@ void FuseOptimizerOpPass::InitFusedGradsAndAllocSpaceForGrads(
   ProgramDesc &program_desc =
       result->Get<details::ProgramDescs>(details::kProgramDescs).back();
   auto *global_block = program_desc.MutableBlock(0);
-  AppendAllocContinuousSpace(params, grads, fused_grad_name, global_block,
-                             false, false);
+  AppendAllocContinuousSpace(params, grads, fused_grad_name, dtype,
+                             global_block, false, false);
 }
 
 std::unordered_map<std::string, std::vector<Node *>>
@@ -302,15 +310,30 @@ bool FuseOptimizerOpPass::IsLoDTensorType(
   return type == proto::VarType::LOD_TENSOR;
 }
 
+const VarDesc *FuseOptimizerOpPass::GetVarDescFromVarsInfo(
+    const std::unordered_map<std::string, std::vector<Node *>> &vars_info,
+    const std::string &var_name) const {
+  auto grad_iter = vars_info.find(var_name);
+  PADDLE_ENFORCE_EQ(grad_iter != vars_info.end(), true, "%s is not found.",
+                    var_name);
+  PADDLE_ENFORCE_EQ(!grad_iter->second.empty(), true, "%s is not found.",
+                    var_name);
+  PADDLE_ENFORCE_NOT_NULL(grad_iter->second.front()->Var());
+  return grad_iter->second.front()->Var();
+}
+
+proto::VarType::Type FuseOptimizerOpPass::GetDtypeOfVar(
+    const std::unordered_map<std::string, std::vector<ir::Node *>> &vars_info,
+    const std::string &name) const {
+  auto var_desc = GetVarDescFromVarsInfo(vars_info, name);
+  return var_desc->GetDataType();
+}
+
 proto::VarType::Type FuseOptimizerOpPass::GetTypeOfVar(
-    const std::unordered_map<std::string, std::vector<Node *>> &var_nodes,
+    const std::unordered_map<std::string, std::vector<Node *>> &vars_info,
     const std::string &name) const {
-  auto grad_iter = var_nodes.find(name);
-  PADDLE_ENFORCE_EQ(grad_iter != var_nodes.end(), true, "%s is not found.",
-                    name);
-  PADDLE_ENFORCE_GT(grad_iter->second.size(), 0);
-  PADDLE_ENFORCE_NOT_NULL(grad_iter->second.front()->Var());
-  return grad_iter->second.front()->Var()->GetType();
+  auto var_desc = GetVarDescFromVarsInfo(vars_info, name);
+  return var_desc->GetType();
 }
 
 void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars(
@@ -318,7 +341,7 @@ void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars(
     const std::unordered_map<std::string, std::vector<std::string>>
         &aux_var_set,
     const std::unordered_map<std::string, std::string> &fused_vars_name,
-    ir::Graph *result) const {
+    const proto::VarType::Type &dtype, ir::Graph *result) const {
   // Define Ops
   result->Get<details::ProgramDescs>(details::kProgramDescs).emplace_back();
   ProgramDesc &program_desc =
@@ -327,7 +350,7 @@ void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars(
   for (auto &var_name : aux_var_names) {
     AppendAllocContinuousSpace(
         aux_var_set.at(var_name), aux_var_set.at(var_name),
-        fused_vars_name.at(var_name), global_block, true);
+        fused_vars_name.at(var_name), dtype, global_block, true);
   }
 }
 
@@ -393,7 +416,8 @@ void FuseOptimizerOpPass::GetSpecifiedOpsAndVars(
 void FuseOptimizerOpPass::AppendAllocContinuousSpace(
     const std::vector<std::string> &in_args,
     const std::vector<std::string> &out_args, const std::string &fused_out_arg,
-    BlockDesc *global_block, bool copy_data, bool check_name) const {
+    const proto::VarType::Type &dtype, BlockDesc *global_block, bool copy_data,
+    bool check_name) const {
   auto op_desc = global_block->AppendOp();
   op_desc->SetType("coalesce_tensor");
   op_desc->SetInput("Input", in_args);
@@ -401,6 +425,7 @@ void FuseOptimizerOpPass::AppendAllocContinuousSpace(
   op_desc->SetOutput("FusedOutput", {fused_out_arg});
   op_desc->SetAttr("copy_data", copy_data);
   op_desc->SetAttr("check_name", check_name);
+  op_desc->SetAttr("dtype", static_cast<int>(dtype));
 }
 
 void FuseOptimizerOpPass::InsertInputAndOutputForFusedOpNode(
......
@@ -64,28 +64,37 @@ class FuseOptimizerOpPass : public ir::Pass {
   void AppendAllocContinuousSpace(const std::vector<std::string> &in_args,
                                   const std::vector<std::string> &out_args,
                                   const std::string &fused_out_arg,
+                                  const proto::VarType::Type &dtype,
                                   BlockDesc *global_block, bool copy_data,
                                   bool check_name = true) const;
 
   void InitFusedGradsAndAllocSpaceForGrads(
       const std::vector<std::string> &params,
       const std::vector<std::string> &grads, const std::string &fused_grad_name,
-      ir::Graph *result) const;
+      const proto::VarType::Type &dtype, ir::Graph *result) const;
 
   void InitFusedVarsAndAllocSpaceForVars(
       const std::vector<std::string> &aux_var_names,
       const std::unordered_map<std::string, std::vector<std::string>>
          &aux_var_set,
       const std::unordered_map<std::string, std::string> &fused_vars_name,
-      ir::Graph *result) const;
+      const proto::VarType::Type &dtype, ir::Graph *result) const;
 
   std::unordered_map<std::string, std::vector<Node *>> GetVarInfo(
       const Graph &result) const;
 
+  proto::VarType::Type GetDtypeOfVar(
+      const std::unordered_map<std::string, std::vector<ir::Node *>> &vars_info,
+      const std::string &name) const;
+
   proto::VarType::Type GetTypeOfVar(
       const std::unordered_map<std::string, std::vector<Node *>> &var_nodes,
       const std::string &name) const;
 
+  const VarDesc *GetVarDescFromVarsInfo(
+      const std::unordered_map<std::string, std::vector<Node *>> &vars_info,
+      const std::string &var_name) const;
+
   void GradientsFilter(const std::vector<size_t> &new_grad_idx,
                        std::vector<Node *> *opt_nodes,
                        std::unordered_map<std::string, std::vector<std::string>>
......
@@ -23,9 +23,6 @@
 namespace paddle {
 namespace operators {
 
-static framework::proto::VarType::Type kDefaultDtype =
-    framework::proto::VarType::Type::VarType_Type_BOOL;
-
 template <typename DeviceContext, typename T>
 class CoalesceTensorOp : public framework::OpKernel<T> {
  public:
@@ -66,8 +63,10 @@ class CoalesceTensorOp : public framework::OpKernel<T> {
     // Get numel and dtype
     size_t numel = 0;
-    auto dtype = kDefaultDtype;
-    GetMemSizeAndDtype(in_tensors, in_var_names, &numel, &dtype,
+    auto dtype = static_cast<framework::proto::VarType::Type>(
+        context.Attr<int>("dtype"));
+    size_t size_of_dtype = framework::SizeOfType(dtype);
+    GetMemSizeAndDtype(in_tensors, in_var_names, &numel, size_of_dtype,
                        context.GetPlace());
 
     // Alloc the continuous space
@@ -78,7 +77,6 @@ class CoalesceTensorOp : public framework::OpKernel<T> {
     // Init the continuous space
     auto out_tensors = context.MultiOutput<framework::LoDTensor>("Output");
     size_t offset = 0;
-    size_t size_of_dtype = framework::SizeOfType(dtype);
     if (context.Attr<bool>("copy_data")) {
       for (size_t i = 0; i < in_var_names.size(); ++i) {
         size_t len = static_cast<size_t>(in_tensors[i]->numel());
@@ -120,27 +118,15 @@ class CoalesceTensorOp : public framework::OpKernel<T> {
   void GetMemSizeAndDtype(
       const std::vector<const framework::LoDTensor *> &lod_tensors,
       const std::vector<std::string> var_names, size_t *numel,
-      framework::proto::VarType::Type *dtype,
-      const platform::Place &place) const {
+      const size_t &size_of_dtype, const platform::Place &place) const {
     PADDLE_ENFORCE_EQ(lod_tensors.size(), var_names.size());
     *numel = 0;
-    size_t size_of_dtype = 0;
     std::stringstream ss;
     ss << "alloc_space_for_vars: ";
     for (size_t i = 0; i < var_names.size(); ++i) {
       PADDLE_ENFORCE(lod_tensors[i]->IsInitialized(), "%s is not initialized.",
                      var_names[i]);
-      auto p_dtype = lod_tensors[i]->type();
-      if (*dtype == kDefaultDtype) {
-        PADDLE_ENFORCE_NE(p_dtype, kDefaultDtype, "%s's type should not be %s.",
-                          var_names[i], kDefaultDtype);
-        *dtype = p_dtype;
-        size_of_dtype = framework::SizeOfType(p_dtype);
-      }
-      PADDLE_ENFORCE_EQ(p_dtype, *dtype, "Input vars is not equal.");
-
       auto size = lod_tensors[i]->numel();
       PADDLE_ENFORCE_GT(size, 0);
       ss << "input(" << var_names[i] << ") dim:(" << lod_tensors[i]->dims()
@@ -178,6 +164,7 @@ class AllocContinuousSpaceOpMaker : public framework::OpProtoAndCheckerMaker {
               "(LoDTensor) The output tensor "
               "of coalesce_tensor operator. And the tensors of"
               " Output is sliced from the tensor of FusedOutput.");
+    AddAttr<int>("dtype", "The output data type.");
     AddAttr<bool>("copy_data", "Whether to copy the Input value to Output.")
         .SetDefault(false);
     AddAttr<bool>("set_constant",
......
@@ -25,7 +25,7 @@ alignment = 256
 class TestAllocContinuousSpace(OpTest):
     def setUp(self):
         self.op_type = "coalesce_tensor"
-        self.dtype = np.float32
+        self.dtype, self.fluid_dtype = self.init_dtype()
         attrs = self.init_attr()
         self.copy_data = attrs["copy_data"]
         self.constant = attrs["constant"]
@@ -38,7 +38,7 @@ class TestAllocContinuousSpace(OpTest):
         self.outputs = {'Output': self.Outputs, 'FusedOutput': self.FusedOutput}
 
     def init_dtype(self):
-        self.dtype = np.float32
+        return np.float32, int(core.VarDesc.VarType.FP32)
 
     def init_input(self):
         inputs = []
@@ -51,7 +51,12 @@ class TestAllocContinuousSpace(OpTest):
         return inputs
 
     def init_attr(self):
-        return {"copy_data": True, "set_constant": False, "constant": 0.0}
+        return {
+            "copy_data": True,
+            "set_constant": False,
+            "constant": 0.0,
+            "dtype": self.fluid_dtype
+        }
 
     def init_output(self, input_list, set_constant, constant):
         inputs = []
@@ -82,7 +87,12 @@ class TestAllocContinuousSpace(OpTest):
 class TestAllocContinuousSpace2(TestAllocContinuousSpace):
     def init_attr(self):
-        return {"copy_data": False, "set_constant": True, "constant": 0.5}
+        return {
+            "copy_data": False,
+            "set_constant": True,
+            "constant": 0.5,
+            "dtype": self.fluid_dtype
+        }
 
     def test_check_output(self):
         if core.is_compiled_with_cuda():
......
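A possible follow-up (not part of this commit): with init_dtype now returning both the numpy dtype and the framework dtype enum, a double-precision variant of the test above is a one-method subclass. This sketch assumes the coalesce_tensor kernel is also registered for FP64 and relies on the test file's existing imports of numpy (np) and core.

class TestAllocContinuousSpaceFP64(TestAllocContinuousSpace):
    def init_dtype(self):
        # numpy dtype used to generate the inputs, plus the framework enum
        # that is passed through the op's new "dtype" attribute.
        return np.float64, int(core.VarDesc.VarType.FP64)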