Merge pull request #9377 from typhoonzero/prepare_pserver_executor

prepare pserver executor

Merge pull request #9377 from typhoonzero/prepare_pserver_executor
prepare pserver executor
fb8c1cf0 · Wu Yi · GitHub · 66e0aed7 · 1f6e0448 · fb8c1cf0
3 changed file
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -279,6 +279,21 @@ std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
  return std::unique_ptr<ExecutorPrepareContext>(ctx);
 }
+std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
+    const ProgramDesc& program, const std::vector<int>& block_ids) {
+  std::vector<std::shared_ptr<ExecutorPrepareContext>> result;
+  for (auto& bid : block_ids) {
+    auto* ctx = new ExecutorPrepareContext(program, bid);
+    PADDLE_ENFORCE_LT(static_cast<size_t>(bid), program.Size());
+    auto& block = program.Block(bid);
+    for (auto& op_desc : block.AllOps()) {
+      ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
+    }
+    result.push_back(std::shared_ptr<ExecutorPrepareContext>(ctx));
+  }
+  return result;
+}
 void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
                                  bool create_local_scope, bool create_vars) {
  auto& block = ctx->prog_.Block(ctx->block_id_);

--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -61,6 +61,9 @@ class Executor {
  static std::unique_ptr<ExecutorPrepareContext> Prepare(
      const ProgramDesc& program, int block_id);
+  static std::vector<std::shared_ptr<ExecutorPrepareContext>> Prepare(
+      const ProgramDesc& program, const std::vector<int>& block_ids);
  void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
                          bool create_local_scope = true,
                          bool create_vars = true);

--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -45,20 +45,23 @@ static void CreateTensorFromMessageType(framework::Variable *var,
  }
 }
-static void ParallelExecuteBlocks(const std::vector<size_t> &parallel_blkids,
+static void ParallelExecuteBlocks(
-                                  framework::Executor *executor,
+    const std::vector<size_t> &parallel_blkids, framework::Executor *executor,
-                                  framework::ProgramDesc *program,
+    const std::vector<std::shared_ptr<framework::ExecutorPrepareContext>>
-                                  framework::Scope *scope) {
+        &prepared,
+    framework::ProgramDesc *program, framework::Scope *scope) {
  std::vector<std::future<void>> fs;
  for (size_t idx : parallel_blkids) {
-    fs.push_back(framework::Async([&executor, &program, &scope, idx]() {
+    fs.push_back(
-      int run_block = idx;  // thread local
+        framework::Async([&executor, &prepared, &program, &scope, idx]() {
-      try {
+          int run_block = idx;  // thread local
-        executor->Run(*program, scope, run_block, false, false);
+          try {
-      } catch (std::exception &e) {
+            executor->RunPreparedContext(prepared[run_block].get(), scope,
-        LOG(ERROR) << "run sub program error " << e.what();
+                                         false, false);
-      }
+          } catch (std::exception &e) {
-    }));
+            LOG(ERROR) << "run sub program error " << e.what();
+          }
+        }));
  }
  for (size_t i = 0; i < fs.size(); ++i) fs[i].wait();
 }
@@ -101,6 +104,13 @@ class ListenAndServOp : public framework::OperatorBase {
                      "server program should have at least 2 blocks");
    framework::Executor executor(dev_place);
+    std::vector<int> block_list;
+    for (size_t blkid = 1; blkid < num_blocks; ++blkid)
+      block_list.push_back(blkid);
+    auto prepared = executor.Prepare(*program, block_list);
+    prepared.insert(
+        prepared.begin(),
+        std::shared_ptr<framework::ExecutorPrepareContext>(nullptr));
    // TODO(qiao) set proper fields for table lookup and update
    rpc_service_->SetExecutor(&executor);
@@ -160,14 +170,15 @@ class ListenAndServOp : public framework::OperatorBase {
      for (size_t blkid = 2; blkid < num_blocks; ++blkid) {
        if (program->Block(blkid).Parent() != last_parent_blkid) {
          for (size_t idx : parallel_blkids) VLOG(3) << idx;
-          ParallelExecuteBlocks(parallel_blkids, &executor, program,
+          ParallelExecuteBlocks(parallel_blkids, &executor, prepared, program,
                                &recv_scope);
          parallel_blkids.clear();
          last_parent_blkid = program->Block(blkid).Parent();
        }
        parallel_blkids.push_back(blkid);
      }
-      ParallelExecuteBlocks(parallel_blkids, &executor, program, &recv_scope);
+      ParallelExecuteBlocks(parallel_blkids, &executor, prepared, program,
+                            &recv_scope);
      VLOG(3) << "run all blocks spent " << detail::GetTimestamp() - ts
              << "(ms)";
@@ -181,7 +192,8 @@ class ListenAndServOp : public framework::OperatorBase {
        var->GetMutable<framework::SelectedRows>()->mutable_rows()->clear();
      }
      rpc_service_->SetCond(1);
-      // FIXME(typhoonzero): use another condition to sync wait clients get.
+      // NOTE: does not consider barrier request retry in here, we may use
+      // global barrier id to resolve this.
      rpc_service_->WaitClientGet(fan_in);
      sparse_vars.clear();
    }  // while(true)