From 1e549563d5b06e8ae7db1edfc34ff5dd1a72ac68 Mon Sep 17 00:00:00 2001
From: typhoonzero <typhoonzero1986@gmail.com>
Date: Mon, 18 Dec 2017 16:42:37 +0800
Subject: [PATCH] multi trainers

---
 paddle/operators/detail/recv_impl.cc     | 31 +++++++++++++++++++-----
 paddle/operators/detail/send_impl.cc     | 13 +++++-----
 paddle/operators/detail/send_recv.proto  |  4 ++-
 paddle/operators/detail/send_recv_impl.h | 22 +++++++++--------
 paddle/operators/recv_op.cc              | 16 ++++++------
 5 files changed, 56 insertions(+), 30 deletions(-)

diff --git a/paddle/operators/detail/recv_impl.cc b/paddle/operators/detail/recv_impl.cc
index bc930cbb007..47decb6d7eb 100644
--- a/paddle/operators/detail/recv_impl.cc
+++ b/paddle/operators/detail/recv_impl.cc
@@ -33,21 +33,53 @@ Status SendRecvServerImpl::SendVariable(ServerContext *context,
 }
 
+// GetVariable looks up the variable named by in_var->varname() in the scope
+// registered through SetScope(), serializes its LoDTensor and returns it in
+// out_var. An unknown name is answered with NOT_FOUND.
 Status SendRecvServerImpl::GetVariable(ServerContext *context,
-                                       const VoidMessage *in_var,
+                                       const VariableMessage *in_var,
                                        VariableMessage *out_var) {
-  // Block util the sub graph is done.
-  auto out_tensor_with_name = var_return_queue_.Pop();
+  const std::string &get_var_name = in_var->varname();
+  auto *var = scope_->FindVar(get_var_name);
+  if (var == nullptr) {
+    // Answer with an RPC error instead of dereferencing a null pointer.
+    return Status(grpc::StatusCode::NOT_FOUND,
+                  "variable not found: " + get_var_name);
+  }
+  // Bind by reference; the tensor is only read while it is serialized.
+  const auto &tensor = var->Get<framework::LoDTensor>();
   std::ostringstream oss;
-  framework::SerializeToStream(oss, out_tensor_with_name.second,
-                               platform::CPUDeviceContext());
+  framework::SerializeToStream(oss, tensor, platform::CPUDeviceContext());
 
   std::string *varname = out_var->mutable_varname();
-  *varname = out_tensor_with_name.first;
+  *varname = get_var_name;
   std::string *serialized = out_var->mutable_serialized();
   *serialized = oss.str();
   return Status::OK;
 }
 
+// Wait blocks the calling RPC until Done() reports that the current run of
+// the sub program has finished.
+Status SendRecvServerImpl::Wait(ServerContext *context,
+                                const VoidMessage *in_var,
+                                VoidMessage *out_var) {
+  std::unique_lock<std::mutex> lock(this->mutex_);
+  condition_.wait(lock, [this] { return done_; });
+  return Status::OK;
+}
+
+// Start marks a run as in progress so that Wait() blocks until Done().
+void SendRecvServerImpl::Start() {
+  std::lock_guard<std::mutex> lock(this->mutex_);
+  done_ = false;
+}
+
+// Done marks the run as finished and wakes every caller blocked in Wait().
+void SendRecvServerImpl::Done() {
+  std::lock_guard<std::mutex> lock(this->mutex_);
+  done_ = true;
+  condition_.notify_all();
+}
+
 }  // namespace detail
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/detail/send_impl.cc b/paddle/operators/detail/send_impl.cc
index bf22d3df818..7555cc63fb2 100644
--- a/paddle/operators/detail/send_impl.cc
+++ b/paddle/operators/detail/send_impl.cc
@@ -43,19 +43,20 @@ bool RPCClient::SendVariable(const framework::Scope& scope,
   return true;
 }
 
-bool RPCClient::GetVariable(const framework::Scope& scope) {
+bool RPCClient::GetVariable(const framework::Scope& scope,
+                            const std::string& outname) {
   ClientContext context;
-  VariableMessage msg;
-  VoidMessage void_msg;
+  VariableMessage call_msg, ret_msg;
+  call_msg.set_varname(outname);
   auto ctx = platform::CPUDeviceContext();
-  Status status = stub_->GetVariable(&context, void_msg, &msg);
+  Status status = stub_->GetVariable(&context, call_msg, &ret_msg);
   if (!status.ok()) {
     LOG(ERROR) << "gRPC error: " << status.error_message();
     return false;
   }
 
-  std::istringstream iss(msg.serialized());
-  auto outname = msg.varname();
+  std::istringstream iss(ret_msg.serialized());
+
   framework::LoDTensor ret_tensor;
   framework::DeserializeFromStream(iss, &ret_tensor);
   auto* outvar = scope.FindVar(outname);
diff --git a/paddle/operators/detail/send_recv.proto b/paddle/operators/detail/send_recv.proto
index d00c33fe42a..ce729908062 100644
--- a/paddle/operators/detail/send_recv.proto
+++ b/paddle/operators/detail/send_recv.proto
@@ -22,7 +22,9 @@ service SendRecvService {
   // TODO(typhoonzero): add streaming API
   rpc SendVariable(VariableMessage) returns (VoidMessage) {}
   // Argument VariableMessage for GetVariable should only contain varname.
-  rpc GetVariable(VoidMessage) returns (VariableMessage) {}
+  rpc GetVariable(VariableMessage) returns (VariableMessage) {}
+  // Wait blocks until the server finishes one run of the sub program.
+  rpc Wait(VoidMessage) returns (VoidMessage) {}
 }
 
 // VariableMessage is serialized paddle variable message.
diff --git a/paddle/operators/detail/send_recv_impl.h b/paddle/operators/detail/send_recv_impl.h
index df01345e342..6edbb2d8348 100644
--- a/paddle/operators/detail/send_recv_impl.h
+++ b/paddle/operators/detail/send_recv_impl.h
@@ -20,10 +20,6 @@
 #include "paddle/framework/selected_rows.h"
 #include "paddle/operators/detail/simple_block_queue.h"
 
-// #include <grpc++/channel.h>
-// #include <grpc++/client_context.h>
-// #include <grpc++/create_channel.h>
-// #include <grpc++/security/credentials.h>
 #include "paddle/operators/detail/send_recv.grpc.pb.h"
 #include "paddle/operators/detail/send_recv.pb.h"
 
@@ -56,18 +52,24 @@ class SendRecvServerImpl final : public SendRecvService::Service {
 
   Status SendVariable(ServerContext *context, const VariableMessage *in_var,
                       VoidMessage *out_var) override;
-  Status GetVariable(ServerContext *context, const VoidMessage *in_var,
+  Status GetVariable(ServerContext *context, const VariableMessage *in_var,
                      VariableMessage *out_var) override;
+  Status Wait(ServerContext *context, const VoidMessage *in_var,
+              VoidMessage *out_var) override;
+  void Start();
+  void Done();
+  void SetScope(framework::Scope *scope) { scope_ = scope; };
 
   const TensorWithName Get() { return this->var_recv_queue_.Pop(); }
 
-  void Push(const TensorWithName &var) { this->var_return_queue_.Push(var); }
-
  private:
   // received variable from RPC, operators fetch variable from this queue.
   SimpleBlockQueue<TensorWithName> var_recv_queue_;
-  // calculated variable should push to this queue.
-  SimpleBlockQueue<TensorWithName> var_return_queue_;
+  framework::Scope *scope_;
+  // condition of the sub program
+  std::mutex mutex_;
+  bool done_;
+  std::condition_variable condition_;
 };
 
 // RPCClient is a class to send tensors to pserver sub-network
@@ -78,7 +80,7 @@ class RPCClient {
       : stub_(SendRecvService::NewStub(channel)) {}
 
   bool SendVariable(const framework::Scope &scope, const std::string &inname);
-  bool GetVariable(const framework::Scope &scope);
+  bool GetVariable(const framework::Scope &scope, const std::string &outname);
 
  private:
   std::unique_ptr<SendRecvService::Stub> stub_;
diff --git a/paddle/operators/recv_op.cc b/paddle/operators/recv_op.cc
index 9c3e8953bb7..9af8d311d92 100644
--- a/paddle/operators/recv_op.cc
+++ b/paddle/operators/recv_op.cc
@@ -76,12 +76,14 @@ class RecvOp : public framework::OperatorBase {
            const platform::DeviceContext &dev_ctx) const override {
     // FIXME(typhoonzero): no new scopes for every run.
     framework::Scope &recv_scope = scope.NewScope();
+    rpc_service_->SetScope(&recv_scope);
     auto param_list = Attr<std::vector<std::string>>("ParamList");
     auto grad_list = Attr<std::vector<std::string>>("GradList");
     auto trainer_count = Attr<int>("Trainers");
     size_t param_count = param_list.size();
     // TODO(typhoonzero): change this to a while_op for every cluster-batch.
     while (true) {
+      rpc_service_->Start();
       // Get from multiple trainers, we don't care about order in which
       // the gradient arrives, just add suffix 0~n then average the gradient.
       for (size_t i = 0; i < param_count * trainer_count; ++i) {
@@ -125,13 +127,10 @@ class RecvOp : public framework::OperatorBase {
         LOG(ERROR) << "run sub program error " << e.what();
       }
 
-      for (size_t i = 0; i < param_count; ++i) {
-        auto *out_var = recv_scope.FindVar(param_list[i]);
-        detail::TensorWithName out;
-        out.first = param_list[i];
-        out.second = out_var->Get<framework::LoDTensor>();
-        rpc_service_->Push(out);
-      }
+      // The sub program has run: release every caller blocked in the Wait RPC.
+      // NOTE(review): Start() in the next iteration may clear the flag before
+      // a woken waiter re-checks it; confirm the client protocol allows this.
+      rpc_service_->Done();
     }  // while(true)
   }
 
-- 
GitLab