diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto index 68fcc104d48b2b39929ed2198a2dd2eabae10e94..2cf14bd371831ab682166f4256d6966b5ab278c8 100644 --- a/paddle/fluid/framework/framework.proto +++ b/paddle/fluid/framework/framework.proto @@ -27,6 +27,7 @@ enum AttrType { BOOLEANS = 7; BLOCK = 8; LONG = 9; + BLOCKS = 10; } // OpDesc describes an instance of a C++ framework::OperatorBase @@ -46,6 +47,7 @@ message OpDesc { repeated bool bools = 11; optional int32 block_idx = 12; optional int64 l = 13; + repeated int32 blocks_idx = 14; }; message Var { diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index f92769192c218eb7cdc2350ff6e4721b45005806..a190199f1cb1361f67f20c755b8e7ef52c284adc 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -211,6 +211,12 @@ void OpDesc::SetBlockAttr(const std::string &name, BlockDesc *block) { need_update_ = true; } +void OpDesc::SetBlocksAttr(const std::string &name, + std::vector<BlockDesc *> blocks) { + this->attrs_[name] = blocks; + need_update_ = true; +} + void OpDesc::SetAttrMap( const std::unordered_map<std::string, Attribute> &attr_map) { attrs_ = attr_map; @@ -305,6 +311,13 @@ struct SetAttrDescVisitor : public boost::static_visitor<void> { void operator()(const std::vector<bool> &v) const { VectorToRepeated(v, attr_->mutable_bools()); } + void operator()(const std::vector<BlockDesc *> &v) const { + std::vector<int> blocks_idx; + for (auto blk : v) { + blocks_idx.push_back(blk->ID()); + } + VectorToRepeated(blocks_idx, attr_->mutable_blocks_idx()); + } void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->ID()); } void operator()(int64_t v) const { attr_->set_l(v); } void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); } diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h index a02d3e269129596f65a2fb346e76c1af7fbead95..74dd8ec002005dd080424b48b5db1a2574a6974f 100644 --- a/paddle/fluid/framework/op_desc.h +++ b/paddle/fluid/framework/op_desc.h @@ -77,6 +77,8 @@ class OpDesc { void SetBlockAttr(const std::string &name, BlockDesc *block); + void SetBlocksAttr(const std::string &name, std::vector<BlockDesc *> blocks); + Attribute GetAttr(const std::string &name) const; Attribute GetNullableAttr(const std::string &name) const; diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index 4879209ece9fdfea91e484a4118c00a2a2a2b4f7..e099e40f121ff13657e563eb608feecbca0551be 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -35,7 +35,8 @@ using VariableNameMap = std::map<std::string, std::vector<std::string>>; using Attribute = boost::variant<boost::blank, int, float, std::string, std::vector<int>, std::vector<float>, std::vector<std::string>, bool, - std::vector<bool>, BlockDesc*, int64_t>; + std::vector<bool>, BlockDesc*, int64_t, + std::vector<BlockDesc*>>; using AttributeMap = std::unordered_map<std::string, Attribute>; diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index f840064ecaca4bc38191727da39d07676dc18ee1..d98bf807a9464c1c2294aa0601386a940ddc00f8 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -101,17 +101,16 @@ void ListenAndServOp::RunSyncLoop( framework::Scope *recv_scope, const std::vector<int> &prefetch_block_id_list) const { size_t num_blocks = program->Size(); + auto optimize_blocks = + Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks); PADDLE_ENFORCE_GE(num_blocks, 2, "server program should have at least 2 blocks"); - std::vector<int> optimize_block_id_list; - for (int blkid = 1; blkid < num_blocks; ++blkid) { - if (std::find(prefetch_block_id_list.begin(), prefetch_block_id_list.end(), - blkid) == prefetch_block_id_list.end()) { - optimize_block_id_list.push_back(blkid); - } + std::vector<int> optimize_blocks_idx; + for (auto blk : optimize_blocks) { + optimize_blocks_idx.push_back(blk->ID()); } - auto optimize_prepared = executor->Prepare(*program, optimize_block_id_list); + auto optimize_prepared = executor->Prepare(*program, optimize_blocks_idx); // Insert placeholder for block0 which holds current op itself. optimize_prepared.insert( optimize_prepared.begin(), @@ -134,14 +133,14 @@ void ListenAndServOp::RunSyncLoop( // and this will still work. // The optimize blocks which have the same parent ID would run parallel // TODO(Yancey1989): need to use ParallelExecutor for future - int32_t last_parent_blkid = program->Block(1).Parent(); + int32_t last_parent_blkid = optimize_blocks[0]->Parent(); std::vector<size_t> parallel_blkids; - parallel_blkids.push_back(1); + parallel_blkids.push_back(optimize_blocks[0]->ID()); double ts = GetTimestamp(); - for (size_t i = 1; i < optimize_block_id_list.size(); ++i) { + for (size_t i = 1; i < optimize_blocks.size(); ++i) { // skip the first optimize block because it is already in the // parallel_blkids. - int blkid = optimize_block_id_list[i]; + int blkid = optimize_blocks[i]->ID(); if (program->Block(blkid).Parent() != last_parent_blkid) { ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared, program, recv_scope); @@ -261,8 +260,11 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, rpc_service_->RegisterRPC(distributed::kRequestPrefetch, request_prefetch_handler_.get()); - auto *optimize_block = Attr<framework::BlockDesc *>(kOptimizeBlock); - auto *program = optimize_block->Program(); + auto optimize_blocks = + Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks); + PADDLE_ENFORCE(optimize_blocks.size() >= 1, + "optimize blocks should be 1 at least on the pserver side."); + auto *program = optimize_blocks[0]->Program(); framework::Executor executor(dev_place); // prepare for prefetch @@ -339,8 +341,9 @@ class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker { "a map from grad name to it's optimize block id") .SetDefault({}); AddAttr<bool>("sync_mode", "if works at sync_mode or not").SetDefault(true); - AddAttr<framework::BlockDesc *>(kOptimizeBlock, - "BlockID to run on server side."); + AddAttr<std::vector<framework::BlockDesc *>>( + kOptimizeBlocks, "Optimize blocks to run on server side.") + .SetDefault({}); AddAttr<std::vector<std::string>>(kPrefetchVarNameToBlockId, "prefetch blocks to run on server side.") .SetDefault({}); diff --git a/paddle/fluid/operators/listen_and_serv_op.h b/paddle/fluid/operators/listen_and_serv_op.h index 9aa322ad602d7a72bb90aaa4a67e7f2f7a3a54cd..634c1b4f4b541be9f4950a9ef48f944863486705 100644 --- a/paddle/fluid/operators/listen_and_serv_op.h +++ b/paddle/fluid/operators/listen_and_serv_op.h @@ -30,7 +30,7 @@ limitations under the License. */ namespace paddle { namespace operators { -constexpr char kOptimizeBlock[] = "OptimizeBlock"; +constexpr char kOptimizeBlocks[] = "optimize_blocks"; constexpr char kPrefetchVarNameToBlockId[] = "prefetch_var_name_to_block_id"; void RunServer(std::shared_ptr<distributed::RPCServer> service); diff --git a/paddle/fluid/operators/send_recv_op_test.cc b/paddle/fluid/operators/send_recv_op_test.cc index e550552b195b768d68ec64e9c3b5889b56ca719f..aee6180add5708d31f7ce927b37c4524a291fe3c 100644 --- a/paddle/fluid/operators/send_recv_op_test.cc +++ b/paddle/fluid/operators/send_recv_op_test.cc @@ -129,7 +129,10 @@ void StartServerNet(bool is_sparse, std::atomic<bool> *initialized) { // sub program run in listen_and_serv_op, for simple test we use sum f::ProgramDesc program; const auto &root_block = program.Block(0); + std::vector<framework::BlockDesc *> optimize_blocks; auto *optimize_block = program.AppendBlock(root_block); + optimize_blocks.push_back(optimize_block); + auto *prefetch_block = program.AppendBlock(root_block); // X for server side tensors, RX for received tensors, must be of same shape. AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, optimize_block, @@ -139,7 +142,7 @@ void StartServerNet(bool is_sparse, std::atomic<bool> *initialized) { attrs.insert({"Fanin", 1}); attrs.insert({"ParamList", std::vector<std::string>({"Out"})}); attrs.insert({"GradList", std::vector<std::string>({"x1"})}); - attrs.insert({"OptimizeBlock", optimize_block}); + attrs.insert({"optimize_blocks", optimize_blocks}); attrs.insert({"PrefetchBlock", prefetch_block}); attrs.insert({"grad_to_block_id", std::vector<std::string>({""})}); attrs.insert({"sync_mode", true}); diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index bcf6d4dd3087060c016e53722cde80704ef2e834..fcd3356d44ee592233c3883d439d0677714900b8 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -268,7 +268,8 @@ void BindOpDesc(pybind11::module *m) { .value("STRINGS", pd::proto::AttrType::STRINGS) .value("BOOL", pd::proto::AttrType::BOOLEAN) .value("BOOLS", pd::proto::AttrType::BOOLEANS) - .value("BLOCK", pd::proto::AttrType::BLOCK); + .value("BLOCK", pd::proto::AttrType::BLOCK) + .value("BLOCKS", pd::proto::AttrType::BLOCKS); pybind11::class_<pd::OpDesc> op_desc(*m, "OpDesc", ""); op_desc @@ -293,6 +294,7 @@ void BindOpDesc(pybind11::module *m) { .def("set_attr", &pd::OpDesc::SetAttr) .def("attr", &pd::OpDesc::GetAttr) .def("set_block_attr", &pd::OpDesc::SetBlockAttr) + .def("set_blocks_attr", &pd::OpDesc::SetBlocksAttr) .def("set_serialized_attr", [](pd::OpDesc &self, const std::string &name, const pybind11::bytes &seriralized) { diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 4494e30eed74a6e8466a6d639151445b930f8931..4c1c8443a641cde40c392f1c647bc78d6cd3c13c 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -558,15 +558,20 @@ class Operator(object): if (attr_name not in self.attrs) or ( self.attrs[attr_name] is None): continue - if isinstance(self.attrs[attr_name], Block): + attr_val = self.attrs[attr_name] + if isinstance(attr_val, Block): self.desc.set_block_attr(attr_name, self.attrs[attr_name].desc) - elif isinstance(self.attrs[attr_name], core.BlockDesc) or \ - isinstance(self.attrs[attr_name], core.ProgramDesc): + elif isinstance(attr_val, list) and attr_val and \ + all(isinstance(v, Block) for v in attr_val): + self.desc.set_blocks_attr(attr_name, + [v.desc for v in attr_val]) + elif isinstance(attr_val, core.BlockDesc) or \ + isinstance(attr_val, core.ProgramDesc): self.desc.set_serialized_attr( - attr_name, self.attrs[attr_name].serialize_to_string()) + attr_name, attr_val.serialize_to_string()) else: - self.desc.set_attr(attr_name, self.attrs[attr_name]) + self.desc.set_attr(attr_name, attr_val) self.desc.check_attrs() if self.has_kernel(type): self.desc.infer_var_type(self.block.desc) @@ -715,6 +720,9 @@ class Operator(object): self.attrs[name] = val if isinstance(val, Block): self.desc.set_block_attr(name, val.desc) + elif isinstance(val, list) and val and all( + isinstance(v, Block) for v in val): + self.desc.set_blocks_attr(name, [v.desc for v in val]) elif isinstance(val, core.BlockDesc) or \ isinstance(val, core.ProgramDesc): self.desc.set_serialized_attr(name, val.serialize_to_string()) diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 8d153b75cd49953770cfa89348914a375be82a82..f3ab47c96b1caa2facfd6d191af014b4c7380cbc 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -186,7 +186,6 @@ class ListenAndServ(object): main_program = self.helper.main_program current_block = main_program.current_block() parent_block = self.parent_block() - empty_block = Program().global_block() parent_block.append_op( type='listen_and_serv', @@ -195,8 +194,9 @@ class ListenAndServ(object): attrs={ 'endpoint': self.endpoint, 'Fanin': self.fan_in, - 'OptimizeBlock': current_block, - 'PrefetchBlock': empty_block, + 'optimize_blocks': [ + current_block + ], # did not support multiple optimize blocks in layers 'sync_mode': True, # did not support async now in layers 'grad_to_block_id': [""] }) diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index d8d6a7e9418e1c2a9f82d58b5c9650d58604d46e..bb61f82a9cf7f837f0403082165a2375d18b574e 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -396,7 +396,7 @@ class DistributeTranspiler(object): return varname return "" - def __clone_lr_op_sub_block__(op, program, new_block): + def __clone_lr_op_sub_block__(op, program, lr_block): if not op.has_attr('sub_block'): return @@ -405,36 +405,41 @@ class DistributeTranspiler(object): assert isinstance(origin_block, Block) # we put the new sub block to new block to follow the block # hierarchy of the original blocks - new_sub_block = program.create_block(new_block.idx) + new_sub_block = program.create_block(lr_block.idx) # clone vars for var in origin_block.vars: new_sub_block.clone_variable(var) # clone ops - for op in origin_block.ops: - self._clone_lr_op(program, new_sub_block, op) + for origin_op in origin_block.ops: + cloned_op = self._clone_lr_op(program, new_sub_block, origin_op) # clone sub_block of op - __clone_lr_op_sub_block__(op, program, new_sub_block) + __clone_lr_op_sub_block__(cloned_op, program, new_sub_block) # reset the block of op op.set_attr('sub_block', new_sub_block) # append lr decay ops to the child block if exists lr_ops = self._get_lr_ops() + # record optimize blocks and we can run them on pserver parallel + optimize_blocks = [] if len(lr_ops) > 0: lr_decay_block = pserver_program.create_block( pserver_program.num_blocks - 1) + optimize_blocks.append(lr_decay_block) for _, op in enumerate(lr_ops): - self._append_pserver_non_opt_ops(lr_decay_block, op) + cloned_op = self._append_pserver_non_opt_ops(lr_decay_block, op) # append sub blocks to pserver_program in lr_decay_op - __clone_lr_op_sub_block__(op, pserver_program, lr_decay_block) + __clone_lr_op_sub_block__(cloned_op, pserver_program, + lr_decay_block) # append op to the current block grad_to_block_id = [] pre_block_idx = pserver_program.num_blocks - 1 for idx, opt_op in enumerate(opt_op_on_pserver): per_opt_block = pserver_program.create_block(pre_block_idx) + optimize_blocks.append(per_opt_block) # append grad merging ops before clip and weight decay for _, op in enumerate(self.optimize_ops): # find the origin @GRAD var before clipping @@ -453,6 +458,7 @@ class DistributeTranspiler(object): if global_ops: opt_state_block = pserver_program.create_block( pserver_program.num_blocks - 1) + optimize_blocks.append(opt_state_block) for glb_op in global_ops: __append_optimize_op__(glb_op, opt_state_block, grad_to_block_id, None) @@ -474,11 +480,11 @@ class DistributeTranspiler(object): assert len(prefetch_var_name_to_block_id) == 0 attrs = { - "OptimizeBlock": pserver_program.block(1), + "optimize_blocks": optimize_blocks, "endpoint": endpoint, "Fanin": self.trainer_num, "sync_mode": self.sync_mode, - "grad_to_block_id": grad_to_block_id + "grad_to_block_id": grad_to_block_id, } if len(prefetch_var_name_to_block_id) > 0: attrs['prefetch_var_name_to_block_id'] \ @@ -1211,7 +1217,7 @@ class DistributeTranspiler(object): if var not in program.global_block().vars: block.clone_variable(var) - block.append_op( + return block.append_op( type=op.type, inputs=inputs, outputs=outputs, attrs=op.attrs) def _append_pserver_non_opt_ops(self, optimize_block, opt_op): @@ -1249,7 +1255,7 @@ class DistributeTranspiler(object): elif not program.global_block().vars.has_key(var.name): program.global_block().clone_variable(var) - optimize_block.append_op( + return optimize_block.append_op( type=opt_op.type, inputs=inputs, outputs=outputs,