From 1e549563d5b06e8ae7db1edfc34ff5dd1a72ac68 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Mon, 18 Dec 2017 16:42:37 +0800 Subject: [PATCH] multi trainers --- paddle/operators/detail/recv_impl.cc | 31 +++++++++++++++++++----- paddle/operators/detail/send_impl.cc | 13 +++++----- paddle/operators/detail/send_recv.proto | 4 ++- paddle/operators/detail/send_recv_impl.h | 22 +++++++++-------- paddle/operators/recv_op.cc | 16 ++++++------ 5 files changed, 56 insertions(+), 30 deletions(-) diff --git a/paddle/operators/detail/recv_impl.cc b/paddle/operators/detail/recv_impl.cc index bc930cbb007..47decb6d7eb 100644 --- a/paddle/operators/detail/recv_impl.cc +++ b/paddle/operators/detail/recv_impl.cc @@ -33,21 +33,40 @@ Status SendRecvServerImpl::SendVariable(ServerContext *context, } Status SendRecvServerImpl::GetVariable(ServerContext *context, - const VoidMessage *in_var, + const VariableMessage *in_var, VariableMessage *out_var) { - // Block util the sub graph is done. - auto out_tensor_with_name = var_return_queue_.Pop(); + std::string get_var_name = in_var->varname(); + auto *var = scope_->FindVar(get_var_name); + auto tensor = var->Get(); std::ostringstream oss; - framework::SerializeToStream(oss, out_tensor_with_name.second, - platform::CPUDeviceContext()); + framework::SerializeToStream(oss, tensor, platform::CPUDeviceContext()); std::string *varname = out_var->mutable_varname(); - *varname = out_tensor_with_name.first; + *varname = get_var_name; std::string *serialized = out_var->mutable_serialized(); *serialized = oss.str(); return Status::OK; } +Status SendRecvServerImpl::Wait(ServerContext *context, + const VoidMessage *in_var, + VoidMessage *out_var) { + std::unique_lock lock(this->mutex_); + condition_.wait(lock, [=] { return this->done_ == true; }); + return Status::OK; +} + +void SendRecvServerImpl::Start() { + std::unique_lock lock(this->mutex_); + done_ = false; +} + +void SendRecvServerImpl::Done() { + std::unique_lock lock(this->mutex_); + done_ = true; + condition_.notify_all(); +} + } // namespace detail } // namespace operators } // namespace paddle diff --git a/paddle/operators/detail/send_impl.cc b/paddle/operators/detail/send_impl.cc index bf22d3df818..7555cc63fb2 100644 --- a/paddle/operators/detail/send_impl.cc +++ b/paddle/operators/detail/send_impl.cc @@ -43,19 +43,20 @@ bool RPCClient::SendVariable(const framework::Scope& scope, return true; } -bool RPCClient::GetVariable(const framework::Scope& scope) { +bool RPCClient::GetVariable(const framework::Scope& scope, + const std::string& outname) { ClientContext context; - VariableMessage msg; - VoidMessage void_msg; + VariableMessage call_msg, ret_msg; + call_msg.set_varname(outname); auto ctx = platform::CPUDeviceContext(); - Status status = stub_->GetVariable(&context, void_msg, &msg); + Status status = stub_->GetVariable(&context, call_msg, &ret_msg); if (!status.ok()) { LOG(ERROR) << "gRPC error: " << status.error_message(); return false; } - std::istringstream iss(msg.serialized()); - auto outname = msg.varname(); + std::istringstream iss(ret_msg.serialized()); + framework::LoDTensor ret_tensor; framework::DeserializeFromStream(iss, &ret_tensor); auto* outvar = scope.FindVar(outname); diff --git a/paddle/operators/detail/send_recv.proto b/paddle/operators/detail/send_recv.proto index d00c33fe42a..ce729908062 100644 --- a/paddle/operators/detail/send_recv.proto +++ b/paddle/operators/detail/send_recv.proto @@ -22,7 +22,9 @@ service SendRecvService { // TODO(typhoonzero): add streaming API rpc SendVariable(VariableMessage) returns (VoidMessage) {} // Argument VariableMessage for GetVariable should only contain varname. - rpc GetVariable(VoidMessage) returns (VariableMessage) {} + rpc GetVariable(VariableMessage) returns (VariableMessage) {} + // wait for one execution of the program + rpc Wait(VoidMessage) returns (VoidMessage) {} } // VariableMessage is serialized paddle variable message. diff --git a/paddle/operators/detail/send_recv_impl.h b/paddle/operators/detail/send_recv_impl.h index df01345e342..6edbb2d8348 100644 --- a/paddle/operators/detail/send_recv_impl.h +++ b/paddle/operators/detail/send_recv_impl.h @@ -20,10 +20,6 @@ #include "paddle/framework/selected_rows.h" #include "paddle/operators/detail/simple_block_queue.h" -// #include -// #include -// #include -// #include #include "paddle/operators/detail/send_recv.grpc.pb.h" #include "paddle/operators/detail/send_recv.pb.h" @@ -56,18 +52,24 @@ class SendRecvServerImpl final : public SendRecvService::Service { Status SendVariable(ServerContext *context, const VariableMessage *in_var, VoidMessage *out_var) override; - Status GetVariable(ServerContext *context, const VoidMessage *in_var, + Status GetVariable(ServerContext *context, const VariableMessage *in_var, VariableMessage *out_var) override; + Status Wait(ServerContext *context, const VoidMessage *in_var, + VoidMessage *out_var) override; + void Start(); + void Done(); + void SetScope(framework::Scope *scope) { scope_ = scope; }; const TensorWithName Get() { return this->var_recv_queue_.Pop(); } - void Push(const TensorWithName &var) { this->var_return_queue_.Push(var); } - private: // received variable from RPC, operators fetch variable from this queue. SimpleBlockQueue var_recv_queue_; - // calculated variable should push to this queue. - SimpleBlockQueue var_return_queue_; + framework::Scope *scope_; + // condition of the sub program + std::mutex mutex_; + bool done_; + std::condition_variable condition_; }; // RPCClient is a class to send tensors to pserver sub-network @@ -78,7 +80,7 @@ class RPCClient { : stub_(SendRecvService::NewStub(channel)) {} bool SendVariable(const framework::Scope &scope, const std::string &inname); - bool GetVariable(const framework::Scope &scope); + bool GetVariable(const framework::Scope &scope, const std::string &outname); private: std::unique_ptr stub_; diff --git a/paddle/operators/recv_op.cc b/paddle/operators/recv_op.cc index 9c3e8953bb7..9af8d311d92 100644 --- a/paddle/operators/recv_op.cc +++ b/paddle/operators/recv_op.cc @@ -76,12 +76,14 @@ class RecvOp : public framework::OperatorBase { const platform::DeviceContext &dev_ctx) const override { // FIXME(typhoonzero): no new scopes for every run. framework::Scope &recv_scope = scope.NewScope(); + rpc_service_.SetScope(&recv_scope); auto param_list = Attr>("ParamList"); auto grad_list = Attr>("GradList"); auto trainer_count = Attr("Trainers"); size_t param_count = param_list.size(); // TODO(typhoonzero): change this to a while_op for every cluster-batch. while (true) { + rpc_service_.Start(); // Get from multiple trainers, we don't care about order in which // the gradient arrives, just add suffix 0~n then average the gradient. for (size_t i = 0; i < param_count * trainer_count; ++i) { @@ -125,13 +127,13 @@ class RecvOp : public framework::OperatorBase { LOG(ERROR) << "run sub program error " << e.what(); } - for (size_t i = 0; i < param_count; ++i) { - auto *out_var = recv_scope.FindVar(param_list[i]); - detail::TensorWithName out; - out.first = param_list[i]; - out.second = out_var->Get(); - rpc_service_->Push(out); - } + // for (size_t i = 0; i < param_count; ++i) { + // auto *out_var = recv_scope.FindVar(param_list[i]); + // detail::TensorWithName out; + // out.first = param_list[i]; + // out.second = out_var->Get(); + // rpc_service_->Push(out); + // } } // while(true) } -- GitLab