From 9508c72685e1eab32eb672496ba8974e8e3e0927 Mon Sep 17 00:00:00 2001
From: typhoonzero
Date: Wed, 13 Dec 2017 15:35:38 +0800
Subject: [PATCH] wip: should fix variable recreate

---
 paddle/framework/executor.cc                 | 50 +++++++-------
 paddle/framework/executor.h                  |  3 +-
 paddle/operators/detail/recv_impl.cc         | 11 ++-
 paddle/operators/detail/send_impl.cc         | 23 +++++--
 paddle/operators/detail/send_recv.proto      |  4 +-
 paddle/operators/detail/send_recv_impl.h     |  8 ++-
 paddle/operators/recv_op.cc                  | 69 ++++++++-----------
 paddle/operators/send_op.cc                  |  9 ++-
 python/paddle/v2/fluid/executor.py           |  3 +-
 .../book/test_recognize_digits_conv_dist.py  |  1 +
 10 files changed, 103 insertions(+), 78 deletions(-)

diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index 83aa927c293..cc3916e7bb6 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -85,7 +85,7 @@ static void CreateTensor(Variable* var, VarDesc::VarType var_type) {
 }
 
 void Executor::Run(const ProgramDescBind& pdesc, Scope* scope, int block_id,
-                   bool create_local_scope) {
+                   bool create_local_scope, bool create_vars) {
   // TODO(tonyyang-svail):
   // - only runs on the first device (i.e. no interdevice communication)
   // - will change to use multiple blocks for RNN op and Cond Op
@@ -94,33 +94,35 @@ void Executor::Run(const ProgramDescBind& pdesc, Scope* scope, int block_id,
   auto& device = device_contexts_[0];
 
   Scope* local_scope = scope;
-  if (create_local_scope) {
-    local_scope = &scope->NewScope();
-    for (auto& var : block.AllVars()) {
-      if (var->Name() == framework::kEmptyVarName) {
-        continue;
+  if (create_vars) {
+    if (create_local_scope) {
+      local_scope = &scope->NewScope();
+      for (auto& var : block.AllVars()) {
+        if (var->Name() == framework::kEmptyVarName) {
+          continue;
+        }
+
+        if (var->Persistable()) {
+          auto* ptr = scope->Var(var->Name());
+          CreateTensor(ptr, var->GetType());
+          VLOG(3) << "Create Variable " << var->Name()
+                  << " global, which pointer is " << ptr;
+        } else {
+          auto* ptr = local_scope->Var(var->Name());
+          CreateTensor(ptr, var->GetType());
+          VLOG(3) << "Create Variable " << var->Name()
+                  << " locally, which pointer is " << ptr;
+        }
       }
-
-      if (var->Persistable()) {
-        auto* ptr = scope->Var(var->Name());
-        CreateTensor(ptr, var->GetType());
-        VLOG(3) << "Create Variable " << var->Name()
-                << " global, which pointer is " << ptr;
-      } else {
+    } else {
+      for (auto& var : block.AllVars()) {
         auto* ptr = local_scope->Var(var->Name());
         CreateTensor(ptr, var->GetType());
-        VLOG(3) << "Create Variable " << var->Name()
-                << " locally, which pointer is " << ptr;
+        VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
+                << ptr;
       }
-    }
-  } else {
-    for (auto& var : block.AllVars()) {
-      auto* ptr = local_scope->Var(var->Name());
-      CreateTensor(ptr, var->GetType());
-      VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
-              << ptr;
-    }
-  }
+    }  // if (create_local_scope)
+  }    // if (create_vars)
 
   for (auto& op_desc : block.AllOps()) {
     auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h
index b745f4f6474..28da0608300 100644
--- a/paddle/framework/executor.h
+++ b/paddle/framework/executor.h
@@ -35,7 +35,8 @@ class Executor {
    * ProgramDesc
    * Scope
    */
-  void Run(const ProgramDescBind&, Scope*, int, bool create_local_scope = true);
+  void Run(const ProgramDescBind&, Scope*, int, bool create_local_scope = true,
+           bool create_vars = true);
 
  private:
   std::vector<platform::DeviceContext*> device_contexts_;
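The executor change above adds a `create_vars` switch next to `create_local_scope`, so a caller can run a block against a scope whose variables were created on an earlier run instead of recreating them every time (the "variable recreate" the commit subject refers to). A minimal sketch of the two call patterns; only the new `Run` signature comes from this patch, while the surrounding `program`, `scope`, and `dev_ctx` setup is assumed:

```cpp
// Sketch: the two call patterns enabled by Executor::Run's new flags.
// Only Run(program, scope, block_id, create_local_scope, create_vars)
// is from this patch; everything around it is assumed setup.
framework::Executor executor(dev_ctx);

// Default run: build a fresh local scope and create every block variable.
executor.Run(program, &scope, 0, true /*create_local_scope*/,
             true /*create_vars*/);

// Repeated run over a persistent scope (what RecvOp does below): skip both
// scope and variable creation, reusing what an earlier run already made.
executor.Run(program, &scope, 0, false /*create_local_scope*/,
             false /*create_vars*/);
```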
diff --git a/paddle/operators/detail/recv_impl.cc b/paddle/operators/detail/recv_impl.cc
index dab3d1e14c8..bc930cbb007 100644
--- a/paddle/operators/detail/recv_impl.cc
+++ b/paddle/operators/detail/recv_impl.cc
@@ -20,7 +20,7 @@ namespace detail {
 
 Status SendRecvServerImpl::SendVariable(ServerContext *context,
                                         const VariableMessage *in_var,
-                                        VariableMessage *out_var) {
+                                        VoidMessage *out_var) {
   // TODO(typhoonzero): support different variable types.
   std::istringstream iss(in_var->serialized());
   framework::LoDTensor t;
@@ -29,6 +29,12 @@ Status SendRecvServerImpl::SendVariable(ServerContext *context,
       std::make_pair(in_var->varname(), std::move(t));
   var_recv_queue_.Push(std::move(tensor_with_name));
 
+  return Status::OK;
+}
+
+Status SendRecvServerImpl::GetVariable(ServerContext *context,
+                                       const VoidMessage *in_var,
+                                       VariableMessage *out_var) {
   // Block util the sub graph is done.
   auto out_tensor_with_name = var_return_queue_.Pop();
   std::ostringstream oss;
@@ -36,10 +42,9 @@ Status SendRecvServerImpl::SendVariable(ServerContext *context,
                          platform::CPUDeviceContext());
 
   std::string *varname = out_var->mutable_varname();
-  *varname = in_var->varname();
+  *varname = out_tensor_with_name.first;
   std::string *serialized = out_var->mutable_serialized();
   *serialized = oss.str();
-
   return Status::OK;
 }
 
diff --git a/paddle/operators/detail/send_impl.cc b/paddle/operators/detail/send_impl.cc
index 2313255dcba..bf22d3df818 100644
--- a/paddle/operators/detail/send_impl.cc
+++ b/paddle/operators/detail/send_impl.cc
@@ -19,10 +19,10 @@ namespace operators {
 namespace detail {
 
 bool RPCClient::SendVariable(const framework::Scope& scope,
-                             const std::string& inname,
-                             const std::string& outname) {
+                             const std::string& inname) {
   ClientContext context;
-  VariableMessage msg, out_msg;
+  VariableMessage msg;
+  VoidMessage out_msg;
   // FIXME(typhoonzero): pass device context to here.
   auto ctx = platform::CPUDeviceContext();
   auto* var = scope.FindVar(inname);
@@ -40,7 +40,22 @@ bool RPCClient::SendVariable(const framework::Scope& scope,
     LOG(ERROR) << "gRPC error: " << status.error_message();
     return false;
   }
-  std::istringstream iss(out_msg.serialized());
+  return true;
+}
+
+bool RPCClient::GetVariable(const framework::Scope& scope) {
+  ClientContext context;
+  VariableMessage msg;
+  VoidMessage void_msg;
+  auto ctx = platform::CPUDeviceContext();
+  Status status = stub_->GetVariable(&context, void_msg, &msg);
+  if (!status.ok()) {
+    LOG(ERROR) << "gRPC error: " << status.error_message();
+    return false;
+  }
+
+  std::istringstream iss(msg.serialized());
+  auto outname = msg.varname();
   framework::LoDTensor ret_tensor;
   framework::DeserializeFromStream(iss, &ret_tensor);
   auto* outvar = scope.FindVar(outname);
diff --git a/paddle/operators/detail/send_recv.proto b/paddle/operators/detail/send_recv.proto
index 9b4058fd617..d00c33fe42a 100644
--- a/paddle/operators/detail/send_recv.proto
+++ b/paddle/operators/detail/send_recv.proto
@@ -20,7 +20,9 @@ service SendRecvService {
   // For parameter server round-robin like hashing, do not split tensors.
   // Send and recv only one tensor
   // TODO(typhoonzero): add streaming API
-  rpc SendVariable(VariableMessage) returns (VariableMessage) {}
+  rpc SendVariable(VariableMessage) returns (VoidMessage) {}
+  // Argument VariableMessage for GetVariable should only contain varname.
+  rpc GetVariable(VoidMessage) returns (VariableMessage) {}
 }
 
 // VariableMessage is serialized paddle variable message.
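These three files split the old single round trip into two RPCs: `SendVariable` now only pushes a gradient (the server acks with a `VoidMessage` and enqueues the tensor on `var_recv_queue_`), while the new `GetVariable` blocks until the server pops an optimized parameter off `var_return_queue_`. A sketch of the resulting trainer-side sequence, using only the `RPCClient` API from this patch; the `grads` list of gradient variable names is an assumed surrounding detail:

```cpp
// Sketch: trainer-side use of the split RPC API from this patch.
// client_ is an RPCClient and scope holds the trainer's variables;
// `grads` (gradient variable names) is assumed, not from the patch.
for (const auto& grad : grads) {
  // Pushes one gradient; returns once the server has enqueued it.
  client_->SendVariable(scope, grad);
}
for (size_t i = 0; i < grads.size(); ++i) {
  // Blocks until the server returns one updated parameter; the varname
  // in the response decides which variable in `scope` gets overwritten.
  client_->GetVariable(scope);
}
```

Note that `GetVariable` names no variable in its request; routing relies entirely on the response's `varname`, which is why the server now sets it from `out_tensor_with_name.first` instead of echoing the request.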
diff --git a/paddle/operators/detail/send_recv_impl.h b/paddle/operators/detail/send_recv_impl.h
index b6b9919c609..df01345e342 100644
--- a/paddle/operators/detail/send_recv_impl.h
+++ b/paddle/operators/detail/send_recv_impl.h
@@ -55,7 +55,9 @@ class SendRecvServerImpl final : public SendRecvService::Service {
   explicit SendRecvServerImpl() {}
 
   Status SendVariable(ServerContext *context, const VariableMessage *in_var,
-                      VariableMessage *out_var) override;
+                      VoidMessage *out_var) override;
+  Status GetVariable(ServerContext *context, const VoidMessage *in_var,
+                     VariableMessage *out_var) override;
 
   const TensorWithName Get() { return this->var_recv_queue_.Pop(); }
 
@@ -75,8 +77,8 @@ class RPCClient {
   RPCClient(std::shared_ptr<Channel> channel)
       : stub_(SendRecvService::NewStub(channel)) {}
 
-  bool SendVariable(const framework::Scope &scope, const std::string &inname,
-                    const std::string &outname);
+  bool SendVariable(const framework::Scope &scope, const std::string &inname);
+  bool GetVariable(const framework::Scope &scope);
 
  private:
  std::unique_ptr<SendRecvService::Stub> stub_;
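The server hands tensors between the gRPC threads and the operators through the two queues used above (`var_recv_queue_` via `Get()`/`Push()`, and `var_return_queue_`). The queue type itself is outside this patch; below is a self-contained sketch of the blocking Push/Pop semantics the declarations rely on, as a hypothetical stand-in rather than Paddle's actual queue class:

```cpp
#include <condition_variable>
#include <deque>
#include <mutex>

// Hypothetical blocking queue with the semantics var_recv_queue_ and
// var_return_queue_ depend on: Pop blocks until an element is available.
template <typename T>
class BlockingQueue {
 public:
  void Push(T item) {
    {
      std::lock_guard<std::mutex> lock(mu_);
      queue_.push_back(std::move(item));
    }
    cv_.notify_one();  // wake one waiting Pop
  }

  T Pop() {
    std::unique_lock<std::mutex> lock(mu_);
    cv_.wait(lock, [this] { return !queue_.empty(); });  // block while empty
    T item = std::move(queue_.front());
    queue_.pop_front();
    return item;
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  std::deque<T> queue_;
};
```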
diff --git a/paddle/operators/recv_op.cc b/paddle/operators/recv_op.cc
index 94cb39391f9..754338ec6bd 100644
--- a/paddle/operators/recv_op.cc
+++ b/paddle/operators/recv_op.cc
@@ -66,37 +66,25 @@ class RecvOp : public framework::OperatorBase {
            const platform::DeviceContext &dev_ctx) const override {
     // FIXME(typhoonzero): no new scopes for every run.
     framework::Scope &recv_scope = scope.NewScope();
-    // blocking get one var from client.
-    const detail::TensorWithName &v = rpc_service_->Get();
-    auto grad_var_name = v.first;
-
     auto param_list = Attr<std::vector<std::string>>("ParamList");
     auto grad_list = Attr<std::vector<std::string>>("GradList");
-    auto it = std::find(grad_list.begin(), grad_list.end(), grad_var_name);
-    std::string param_var_name;
-    if (it != grad_list.end()) {
-      param_var_name = param_list[it - grad_list.begin()];
+    size_t param_count = param_list.size();
+    for (size_t i = 0; i < param_count; ++i) {
+      // blocking get one var from client.
+      const detail::TensorWithName &v = rpc_service_->Get();
+      auto grad_var_name = v.first;
+      auto it = std::find(grad_list.begin(), grad_list.end(), grad_var_name);
+      std::string param_var_name;
+      if (it != grad_list.end()) {
+        param_var_name = param_list[it - grad_list.begin()];
+      }
+      VLOG(10) << "recved grad: " << grad_var_name
+               << " updating param: " << param_var_name;
+      auto *var = recv_scope.Var(grad_var_name);
+      auto *tensor = var->GetMutable<framework::LoDTensor>();
+      // FIXME(typhoonzero): do not copy
+      framework::CopyFrom(v.second, dev_ctx.GetPlace(), dev_ctx, tensor);
     }
-    // find input by "grad_var_name"
-    // auto inputs = Inputs("RX");
-
-    // FIXME(typhoonzero): Find the parameter name from input grad name
-    // rename X -> Param
-    // rename RX -> Grad
-
-    LOG(ERROR) << "recved grad: " << grad_var_name
-               << " param: " << param_var_name;
-    auto *var = recv_scope.Var(grad_var_name);
-    auto *tensor = var->GetMutable<framework::LoDTensor>();
-
-    // Param is in parent scope, put it in current scope.
-    auto *param_var = recv_scope.FindVar(param_var_name);
-    auto param_scope = recv_scope.FindScope(param_var);
-    param_scope->Rename(param_var_name, "Param");
-    recv_scope.Rename(grad_var_name, "Grad");
-
-    // FIXME(typhoonzero): do not copy
-    framework::CopyFrom(v.second, dev_ctx.GetPlace(), dev_ctx, tensor);
 
     std::string program_str = Attr<std::string>("OptimizeProgram");
     framework::ProgramDesc program_desc;
@@ -104,17 +92,20 @@ class RecvOp : public framework::OperatorBase {
     framework::ProgramDescBind program(program_desc);
     framework::Executor executor(dev_ctx);
     // Run sub graph to get optimized tensor
-    executor.Run(program, &recv_scope, 0, /*global_block*/
-                 false /*create_local_scope*/);
-
-    auto *out_var = recv_scope.FindVar("ParamOut");
-    detail::TensorWithName out;
-    out.first = param_var_name;
-    out.second = out_var->Get<framework::LoDTensor>();
-    rpc_service_->Push(out);
-    // rename back the params
-    param_scope.Rename("Param", param_var_name);
-    recv_scope.Rename("Grad", grad_var_name);
+    try {
+      executor.Run(program, &recv_scope, 0, /*global_block*/
+                   false /*create_local_scope*/, false /*create_vars*/);
+    } catch (std::exception &e) {
+      LOG(ERROR) << "run sub program error " << e.what();
+    }
+
+    for (size_t i = 0; i < param_count; ++i) {
+      auto *out_var = recv_scope.FindVar(param_list[i]);
+      detail::TensorWithName out;
+      out.first = param_list[i];
+      out.second = out_var->Get<framework::LoDTensor>();
+      rpc_service_->Push(out);
+    }
   }
 
  protected:
diff --git a/paddle/operators/send_op.cc b/paddle/operators/send_op.cc
index 648905743c8..ab1ae5b31dd 100644
--- a/paddle/operators/send_op.cc
+++ b/paddle/operators/send_op.cc
@@ -48,11 +48,18 @@ class SendOp : public framework::OperatorBase {
     // should block until server responds.
     for (auto in : ins) {
       LOG(ERROR) << "sending grad: " << in;
-      bool ret = client_->SendVariable(scope, in, in);
+      bool ret = client_->SendVariable(scope, in);
       if (!ret) {
         LOG(ERROR) << "send variable error";
       }
     }
+    for (auto in : ins) {
+      LOG(ERROR) << "updating from server...";
+      bool ret = client_->GetVariable(scope);
+      if (!ret) {
+        LOG(ERROR) << "GetVariable error";
+      }
+    }
   }
 
  protected:
diff --git a/python/paddle/v2/fluid/executor.py b/python/paddle/v2/fluid/executor.py
index ba699442ce6..c8c9a4ef366 100644
--- a/python/paddle/v2/fluid/executor.py
+++ b/python/paddle/v2/fluid/executor.py
@@ -138,7 +138,6 @@ class Executor(object):
                 inputs=opt_op.inputs,
                 outputs=opt_op.outputs,
                 attrs=opt_op.attrs)
-        print("optimize program: ", optimize_sub_program)
 
         pserver_program.global_block().append_op(
             type="recv",
@@ -248,7 +247,7 @@ class Executor(object):
                 outputs={'Out': [fetch_var]},
                 attrs={'col': i})
 
-        self.executor.run(program.desc, scope, 0, True)
+        self.executor.run(program.desc, scope, 0, True, True)
         outs = [
             core.get_fetch_variable(scope, fetch_var_name, i)
             for i in xrange(len(fetch_list))
diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv_dist.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv_dist.py
index 208002c8d6c..5178131ea77 100644
--- a/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv_dist.py
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv_dist.py
@@ -44,6 +44,7 @@ exe.optimize(optimize_ops, params_grads, pservers="127.0.0.1:6174", trainers=1)
 pserver_endpoint = os.getenv("PSERVER")
 if pserver_endpoint:
     pserver_prog = exe.get_pserver_program(pserver_endpoint, optimize_ops)
+    print("pserver startup: ", fluid.default_startup_program())
     exe.run(fluid.default_startup_program())
     while True:
         exe.run(pserver_prog)
-- 
GitLab
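Taken together, the patch turns one training step into a fixed-length exchange between SendOp and RecvOp: the trainer pushes every gradient, the pserver pops `param_count` of them, runs the optimize sub-program once over its persistent scope (with `create_vars` off, so nothing is recreated), then pushes every parameter back for the trainer to drain. A compressed sketch of that sequence, using only calls that appear in the diffs (server side abbreviated in comments):

```cpp
// Sketch: one trainer/pserver step under this patch's protocol.
// Trainer (SendOp::Run): push all gradients, then pull all updates.
for (auto in : ins) client_->SendVariable(scope, in);  // gradients out
for (auto in : ins) client_->GetVariable(scope);       // parameters back

// Pserver (RecvOp::Run), abbreviated:
//   for i in [0, param_count): v = rpc_service_->Get();   // pop gradients
//   executor.Run(program, &recv_scope, 0,
//                false /*create_local_scope*/, false /*create_vars*/);
//   for i in [0, param_count): rpc_service_->Push(out);   // return params
```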