diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 9796fabdb6cd3331ce90dca26e3d5115623ae74c..d5eae2be79f95c78f66ca348261a3460790dca4a 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -54,6 +54,24 @@ static void CreateTensorFromMessageType(framework::Variable *var, } } +static void ParallelExecuteBlocks(const std::vector ¶llel_blkids, + framework::Executor *executor, + framework::ProgramDesc *program, + framework::Scope *scope) { + std::vector> fs; + for (size_t idx : parallel_blkids) { + fs.push_back(framework::Async([&executor, &program, &scope, idx]() { + int run_block = idx; // thread local + try { + executor->Run(*program, scope, run_block, false, false); + } catch (std::exception &e) { + LOG(ERROR) << "run sub program error " << e.what(); + } + })); + } + for (size_t i = 0; i < fs.size(); ++i) fs[i].wait(); +} + class ListenAndServOp : public framework::OperatorBase { public: ListenAndServOp(const std::string &type, @@ -135,34 +153,27 @@ class ListenAndServOp : public framework::OperatorBase { break; } - // put optimize blocks in the thread pool to start run, the last block - // should be global ops. // NOTE: if is_gpu_place, CUDA kernels are laugched by multiple threads // and this will still work. - std::vector> fs; + // The optimize blocks which have the same parent ID would run parallel + // TODO(Yancey1989): need to use ParallelExecutor for future + size_t last_parent_blkid = program->Block(1).Parent(); + std::vector parallel_blkids; + parallel_blkids.push_back(1); double ts = detail::GetTimestamp(); - // block0 contains only listen_and_serv op, start run from block1. 
- for (int blkid = 1; blkid < num_blocks - 1; ++blkid) { - fs.push_back( - framework::Async([&executor, &program, &recv_scope, blkid]() { - int run_block = blkid; // thread local - try { - executor.Run(*program, &recv_scope, run_block, false, false); - } catch (std::exception &e) { - LOG(ERROR) << "run sub program error " << e.what(); - } - })); - } - for (int i = 0; i < num_blocks - 2; ++i) fs[i].wait(); - // Run global block at final step, or block1 if there are only 2 blocks - if (num_blocks >= 2) { - try { - executor.Run(*program, &recv_scope, num_blocks - 1, false, false); - } catch (std::exception &e) { - LOG(ERROR) << "run sub program error " << e.what(); + for (size_t blkid = 2; blkid < num_blocks; ++blkid) { + if (program->Block(blkid).Parent() != last_parent_blkid) { + for (size_t idx : parallel_blkids) VLOG(3) << idx; + ParallelExecuteBlocks(parallel_blkids, &executor, program, + &recv_scope); + parallel_blkids.clear(); + last_parent_blkid = program->Block(blkid).Parent(); } + parallel_blkids.push_back(blkid); } + ParallelExecuteBlocks(parallel_blkids, &executor, program, &recv_scope); + VLOG(2) << "run all blocks spent (ms) " << detail::GetTimestamp() - ts; // Reset the received sparse variables, the sum operator would not @@ -178,10 +189,6 @@ class ListenAndServOp : public framework::OperatorBase { rpc_service_->WaitClientGet(fan_in); sparse_vars.clear(); } // while(true) - - // for (int i = 0; i < num_blocks; ++i) { - // delete blk_ctx_list[i]; - // } } protected: diff --git a/python/paddle/fluid/distribute_transpiler.py b/python/paddle/fluid/distribute_transpiler.py index 62147d325b699a62bd39cfbaca44874b7fc19a0f..24297ffe33bc720ff7b4f2b0dbd82452dc7e0ae2 100644 --- a/python/paddle/fluid/distribute_transpiler.py +++ b/python/paddle/fluid/distribute_transpiler.py @@ -338,15 +338,24 @@ class DistributeTranspiler: else: self._append_pserver_non_opt_ops(block, op) + append_block = optimize_block + # append lr decay ops to the child block if exits + lr_ops 
def _get_lr_ops(self):
    """Collect the ops of the global block that are related to learning-rate
    variables.

    Steps:
      1. Gather the first "LearningRate" input name of every op in
         ``self.optimize_ops`` for which ``self._is_opt_op`` is true.
      2. Find the ops of ``self.program.global_block()`` that output any of
         those variables.
      3. Build a ``UnionFind`` over the block's ops, joining every pair of
         distinct, non-optimize ops that ``self._is_op_connected`` reports
         as connected.
      4. Return every op of the block that the union-find reports as
         connected to at least one op found in step 2.

    Returns:
        list: the matching ops in block order. Each op appears at most once,
        even when it is connected to several learning-rate producing ops.
    """
    lr_ops = []
    # find learning rate variables by optimize op
    lr_vars = set()
    for op in self.optimize_ops:
        if self._is_opt_op(op):
            lr_vars.add(op.input("LearningRate")[0])

    find_ops = []
    # find ops whose output is a lr var
    block = self.program.global_block()
    for op in block.ops:
        if set(op.output_arg_names) & lr_vars:
            find_ops.append(op)
    # make a union find struct from the ops of the global block
    ufind = UnionFind(block.ops)
    for op1 in block.ops:
        for op2 in block.ops:
            # NOTE: we need to skip all optimize ops, since they are
            # connected with forward/backward ops and lr ops; we only
            # need the lr ops.
            if op1 != op2 and self._is_op_connected(op1, op2) and \
                    not self._is_opt_op(op1) and not self._is_opt_op(op2):
                ufind.union(op1, op2)
    # find all ops which are related to a lr var
    for op1 in block.ops:
        for op2 in find_ops:
            if ufind.is_connected(op1, op2):
                lr_ops.append(op1)
                # One match is enough: without this break an op connected
                # to several entries of find_ops would be returned (and
                # later appended to the pserver block) more than once.
                break
    return lr_ops