diff --git a/paddle/framework/block_desc.cc b/paddle/framework/block_desc.cc index 2d7db382a60b23eba0924519a0cea9ad3eb90142..0668b08ff7ab3c8ca4f1e989fc7af45a8ec5f63c 100644 --- a/paddle/framework/block_desc.cc +++ b/paddle/framework/block_desc.cc @@ -90,6 +90,21 @@ OpDesc *BlockDesc::PrependOp() { return ops_.front().get(); } +void BlockDesc::RemoveOp(size_t s, size_t e) { + if (ops_.begin() + s == ops_.end() || ops_.begin() + e == ops_.end()) { + return; + } + need_update_ = true; + for (auto it = ops_.begin() + s; it != ops_.begin() + e; it++) { + auto names = (*it)->InputArgumentNames(); + for (auto n : names) { + // TODO(typhoonzero): delete vars if no other op use it. + VLOG(3) << "deleting var " << n; + } + } + ops_.erase(ops_.begin() + s, ops_.begin() + e); +} + std::vector BlockDesc::AllOps() const { std::vector res; for (const auto &op : ops_) { diff --git a/paddle/framework/block_desc.h b/paddle/framework/block_desc.h index 513fc54f24c4760b27936e2ac205b2410e9861a4..6c8c81b332d99e52db41018e117aa837be6745bc 100644 --- a/paddle/framework/block_desc.h +++ b/paddle/framework/block_desc.h @@ -79,6 +79,8 @@ class BlockDesc { OpDesc *PrependOp(); + void RemoveOp(size_t s, size_t e); + std::vector AllOps() const; size_t OpSize() const { return ops_.size(); } diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index c4b76911a67fe02d74dbc1917b0832a39604a366..14ae37ec49c12203381e74b3f9174a460e41c18e 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -65,7 +65,7 @@ static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) { } void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, - bool create_local_scope) { + bool create_local_scope, bool create_vars) { // TODO(tonyyang-svail): // - only runs on the first device (i.e. no interdevice communication) // - will change to use multiple blocks for RNN op and Cond Op @@ -74,33 +74,35 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, auto& device = device_contexts_[0]; Scope* local_scope = scope; - if (create_local_scope) { - local_scope = &scope->NewScope(); - for (auto& var : block.AllVars()) { - if (var->Name() == framework::kEmptyVarName) { - continue; + if (create_vars) { + if (create_local_scope) { + local_scope = &scope->NewScope(); + for (auto& var : block.AllVars()) { + if (var->Name() == framework::kEmptyVarName) { + continue; + } + + if (var->Persistable()) { + auto* ptr = scope->Var(var->Name()); + CreateTensor(ptr, var->GetType()); + VLOG(3) << "Create Variable " << var->Name() + << " global, which pointer is " << ptr; + } else { + auto* ptr = local_scope->Var(var->Name()); + CreateTensor(ptr, var->GetType()); + VLOG(3) << "Create Variable " << var->Name() + << " locally, which pointer is " << ptr; + } } - - if (var->Persistable()) { - auto* ptr = scope->Var(var->Name()); - CreateTensor(ptr, var->GetType()); - VLOG(3) << "Create Variable " << var->Name() - << " global, which pointer is " << ptr; - } else { + } else { + for (auto& var : block.AllVars()) { auto* ptr = local_scope->Var(var->Name()); CreateTensor(ptr, var->GetType()); - VLOG(3) << "Create Variable " << var->Name() - << " locally, which pointer is " << ptr; + VLOG(3) << "Create variable " << var->Name() << ", which pointer is " + << ptr; } - } - } else { - for (auto& var : block.AllVars()) { - auto* ptr = local_scope->Var(var->Name()); - CreateTensor(ptr, var->GetType()); - VLOG(3) << "Create variable " << var->Name() << ", which pointer is " - << ptr; - } - } + } // if (create_local_scope) + } // if (create_vars) for (auto& op_desc : block.AllOps()) { auto op = paddle::framework::OpRegistry::CreateOp(*op_desc); diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h index fb861d47126c4397792c63d411b973ffd979244d..a3d1609293a0d687c33447ca7a0df95c6aac3bc5 100644 --- a/paddle/framework/executor.h +++ b/paddle/framework/executor.h @@ -124,7 +124,8 @@ class Executor { * ProgramDesc * Scope */ - void Run(const ProgramDesc&, Scope*, int, bool create_local_scope = true); + void Run(const ProgramDesc&, Scope*, int, bool create_local_scope = true, + bool create_vars = true); private: std::vector device_contexts_; diff --git a/paddle/operators/detail/recv_impl.cc b/paddle/operators/detail/recv_impl.cc index 89dc5045221156eed7aa9411bc96ad86f91136d2..517a1946a0c25081b20c320f4104c81503a4249e 100644 --- a/paddle/operators/detail/recv_impl.cc +++ b/paddle/operators/detail/recv_impl.cc @@ -20,25 +20,57 @@ namespace detail { Status SendRecvServerImpl::SendVariable(ServerContext *context, const VariableMessage *in_var, - VariableMessage *out_var) { - framework::LoDTensor t; - // TODO(typhoonzero): desirealize in_tensor and run pserver network. + VoidMessage *out_var) { + // TODO(typhoonzero): support different variable types. std::istringstream iss(in_var->serialized()); + framework::LoDTensor t; framework::DeserializeFromStream(iss, &t); - lodtensor_queue_.Push(std::move(t)); - // Block util the sub graph is done. - t = lodtensor_return_queue_.Pop(); + TensorWithName tensor_with_name = + std::make_pair(in_var->varname(), std::move(t)); + + var_recv_queue_.Push(std::move(tensor_with_name)); + return Status::OK; +} + +Status SendRecvServerImpl::GetVariable(ServerContext *context, + const VariableMessage *in_var, + VariableMessage *out_var) { + std::string get_var_name = in_var->varname(); + auto *var = scope_->FindVar(get_var_name); + auto tensor = var->Get(); std::ostringstream oss; - // FIXME(typhoonzero): get context from op. - framework::SerializeToStream(oss, t, platform::CPUDeviceContext()); + framework::SerializeToStream(oss, tensor, platform::CPUDeviceContext()); + std::string *varname = out_var->mutable_varname(); - *varname = in_var->varname(); + *varname = get_var_name; std::string *serialized = out_var->mutable_serialized(); *serialized = oss.str(); + return Status::OK; +} +Status SendRecvServerImpl::Wait(ServerContext *context, + const VoidMessage *in_var, + VoidMessage *out_var) { + { + std::unique_lock lock(this->mutex_); + condition_.wait(lock, [=] { return this->done_ == true; }); + } return Status::OK; } +void SendRecvServerImpl::Reset() { + std::lock_guard lock(this->mutex_); + done_ = false; +} + +void SendRecvServerImpl::Done() { + { + std::lock_guard lock(this->mutex_); + done_ = true; + } + condition_.notify_all(); +} + } // namespace detail } // namespace operators } // namespace paddle diff --git a/paddle/operators/detail/send_impl.cc b/paddle/operators/detail/send_impl.cc index da1ddf75d2afb85670c5ea0c9884376415f28208..d7165e13db961f7719139b1a40211d6a2ca67a9f 100644 --- a/paddle/operators/detail/send_impl.cc +++ b/paddle/operators/detail/send_impl.cc @@ -19,10 +19,10 @@ namespace operators { namespace detail { bool RPCClient::SendVariable(const framework::Scope& scope, - const std::string& inname, - const std::string& outname) { + const std::string& inname) { ClientContext context; - VariableMessage msg, out_msg; + VariableMessage msg; + VoidMessage out_msg; // FIXME(typhoonzero): pass device context to here. auto ctx = platform::CPUDeviceContext(); auto* var = scope.FindVar(inname); @@ -37,9 +37,26 @@ bool RPCClient::SendVariable(const framework::Scope& scope, msg.set_serialized(oss.str()); Status status = stub_->SendVariable(&context, msg, &out_msg); if (!status.ok()) { + LOG(ERROR) << "gRPC error: " << status.error_message(); return false; } - std::istringstream iss(out_msg.serialized()); + return true; +} + +bool RPCClient::GetVariable(const framework::Scope& scope, + const std::string& outname) { + ClientContext context; + VariableMessage call_msg, ret_msg; + call_msg.set_varname(outname); + auto ctx = platform::CPUDeviceContext(); + Status status = stub_->GetVariable(&context, call_msg, &ret_msg); + if (!status.ok()) { + LOG(ERROR) << "gRPC error: " << status.error_message(); + return false; + } + + std::istringstream iss(ret_msg.serialized()); + framework::LoDTensor ret_tensor; framework::DeserializeFromStream(iss, &ret_tensor); auto* outvar = scope.FindVar(outname); @@ -49,6 +66,12 @@ bool RPCClient::SendVariable(const framework::Scope& scope, return true; } +void RPCClient::Wait() { + ClientContext context; + VoidMessage call_msg, ret_msg; + stub_->Wait(&context, call_msg, &ret_msg); +} + } // namespace detail } // namespace operators } // namespace paddle diff --git a/paddle/operators/detail/send_recv.proto b/paddle/operators/detail/send_recv.proto index 07ff9d2c621a2dfb51792821a0d3fc398c315835..ce729908062ad442e66cc00001e14ceb6f268560 100644 --- a/paddle/operators/detail/send_recv.proto +++ b/paddle/operators/detail/send_recv.proto @@ -19,7 +19,12 @@ package sendrecv; service SendRecvService { // For parameter server round-robin like hashing, do not split tensors. // Send and recv only one tensor - rpc SendVariable(VariableMessage) returns (VariableMessage) {} + // TODO(typhoonzero): add streaming API + rpc SendVariable(VariableMessage) returns (VoidMessage) {} + // Argument VariableMessage for GetVariable should only contain varname. + rpc GetVariable(VariableMessage) returns (VariableMessage) {} + // wait for one execution of the program + rpc Wait(VoidMessage) returns (VoidMessage) {} } // VariableMessage is serialized paddle variable message. diff --git a/paddle/operators/detail/send_recv_impl.h b/paddle/operators/detail/send_recv_impl.h index b9a5340a8636db7b5d6ec7b21368632d3916b4aa..eec9dd38d188247cba4da2a377038a28c847e40e 100644 --- a/paddle/operators/detail/send_recv_impl.h +++ b/paddle/operators/detail/send_recv_impl.h @@ -20,10 +20,6 @@ #include "paddle/framework/selected_rows.h" #include "paddle/operators/detail/simple_block_queue.h" -// #include -// #include -// #include -// #include #include "paddle/operators/detail/send_recv.grpc.pb.h" #include "paddle/operators/detail/send_recv.pb.h" @@ -48,24 +44,32 @@ namespace paddle { namespace operators { namespace detail { +typedef std::pair TensorWithName; + class SendRecvServerImpl final : public SendRecvService::Service { public: explicit SendRecvServerImpl() {} Status SendVariable(ServerContext *context, const VariableMessage *in_var, - VariableMessage *out_var) override; - - const framework::LoDTensor Get() { return this->lodtensor_queue_.Pop(); } + VoidMessage *out_var) override; + Status GetVariable(ServerContext *context, const VariableMessage *in_var, + VariableMessage *out_var) override; + Status Wait(ServerContext *context, const VoidMessage *in_var, + VoidMessage *out_var) override; + void Reset(); + void Done(); + void SetScope(framework::Scope *scope) { scope_ = scope; }; - void Push(const framework::LoDTensor &tensor) { - this->lodtensor_return_queue_.Push(tensor); - } + const TensorWithName Get() { return this->var_recv_queue_.Pop(); } private: - SimpleBlockQueue lodtensor_queue_; - SimpleBlockQueue lodtensor_return_queue_; - SimpleBlockQueue selected_rows_queue_; - SimpleBlockQueue selected_rows_return_queue_; + // received variable from RPC, operators fetch variable from this queue. + SimpleBlockQueue var_recv_queue_; + framework::Scope *scope_; + // condition of the sub program + std::mutex mutex_; + bool done_; + std::condition_variable condition_; }; // RPCClient is a class to send tensors to pserver sub-network @@ -75,8 +79,9 @@ class RPCClient { RPCClient(std::shared_ptr channel) : stub_(SendRecvService::NewStub(channel)) {} - bool SendVariable(const framework::Scope &scope, const std::string &inname, - const std::string &outname); + bool SendVariable(const framework::Scope &scope, const std::string &inname); + bool GetVariable(const framework::Scope &scope, const std::string &outname); + void Wait(); private: std::unique_ptr stub_; diff --git a/paddle/operators/recv_op.cc b/paddle/operators/recv_op.cc index 2cc6cf6947b601e326ed563082800946b3ff221d..4e91d1151ebf7a0cd520f2f1fa58a0cc4a0d1bef 100644 --- a/paddle/operators/recv_op.cc +++ b/paddle/operators/recv_op.cc @@ -24,6 +24,7 @@ #include "paddle/framework/framework.pb.h" #include "paddle/framework/lod_tensor.h" #include "paddle/framework/op_registry.h" +#include "paddle/framework/proto_desc.h" #include "paddle/operators/detail/send_recv_impl.h" #include "paddle/operators/detail/simple_block_queue.h" @@ -61,29 +62,76 @@ class RecvOp : public framework::OperatorBase { server_thread_->join(); } + std::string GetGradVarNameForTrainer(const std::string &varname) const { + if (grads_counter_.find(varname) == grads_counter_.end()) { + grads_counter_[varname] = 0; + } + char ret[256]; + snprintf(ret, sizeof(ret), "%s.trainer_%d", varname.c_str(), + grads_counter_[varname]++); + return std::string(ret); + } + void Run(const framework::Scope &scope, const platform::DeviceContext &dev_ctx) const override { - // blocking get one var from client. - const framework::LoDTensor &t = rpc_service_->Get(); + // FIXME(typhoonzero): no new scopes for every run. framework::Scope &recv_scope = scope.NewScope(); - // set graph input var - auto *var = recv_scope.Var(Input("RX")); - auto *tensor = var->GetMutable(); - // FIXME(typhoonzero): do not copy - framework::CopyFrom(t, dev_ctx.GetPlace(), dev_ctx, tensor); - - std::string program_str = Attr("OptimizeProgram"); - framework::ProgramDesc program_desc; - program_desc.ParseFromString(program_str); - framework::ProgramDescBind program(program_desc); - framework::Executor executor(dev_ctx); - // Run sub graph to get optimized tensor - executor.Run(program, &recv_scope, 0, /*global_block*/ - false /*create_local_scope*/); - - auto *out_var = recv_scope.FindVar("Out"); - // push back - rpc_service_->Push(out_var->Get()); + rpc_service_->SetScope(&recv_scope); + auto param_list = Attr>("ParamList"); + auto grad_list = Attr>("GradList"); + auto trainer_count = Attr("Trainers"); + size_t param_count = param_list.size(); + rpc_service_->Reset(); + // TODO(typhoonzero): change this to a while_op for every cluster-batch. + while (true) { + // Get from multiple trainers, we don't care about order in which + // the gradient arrives, just add suffix 0~n then average the gradient. + for (size_t i = 0; i < param_count * trainer_count; ++i) { + // blocking get one var from client. + const detail::TensorWithName &v = rpc_service_->Get(); + auto grad_var_name = v.first; + auto it = std::find(grad_list.begin(), grad_list.end(), grad_var_name); + std::string param_var_name; + if (it != grad_list.end()) { + param_var_name = param_list[it - grad_list.begin()]; + } else { + LOG(ERROR) << "grad have no paired param found!"; + } + VLOG(3) << "recved grad: " << grad_var_name + << " updating param: " << param_var_name; + auto *merged_grad = recv_scope.FindVar(grad_var_name); + if (merged_grad == nullptr) { + // create output of merged var. + auto merged_var = recv_scope.Var(grad_var_name); + merged_var->GetMutable(); + } + + if (trainer_count > 1) { + grad_var_name = this->GetGradVarNameForTrainer(grad_var_name); + } + + auto *var = recv_scope.Var(grad_var_name); + auto *tensor = var->GetMutable(); + // FIXME(typhoonzero): do not copy + framework::CopyFrom(v.second, dev_ctx.GetPlace(), dev_ctx, tensor); + } + rpc_service_->Reset(); + + std::string program_str = Attr("OptimizeProgram"); + framework::proto::ProgramDesc program_desc; + program_desc.ParseFromString(program_str); + framework::ProgramDesc program(program_desc); + framework::Executor executor(dev_ctx); + // Run sub graph to get optimized tensor + try { + executor.Run(program, &recv_scope, 0, /*global_block*/ + false /*create_local_scope*/, false /*create_vars*/); + } catch (std::exception &e) { + LOG(ERROR) << "run sub program error " << e.what(); + } + rpc_service_->Done(); + grads_counter_.clear(); + } // while(true) } protected: @@ -93,13 +141,14 @@ class RecvOp : public framework::OperatorBase { // grpc send/recv service implement to register. std::shared_ptr rpc_service_; std::shared_ptr server_thread_; + mutable std::unordered_map grads_counter_; }; class RecvOpMaker : public framework::OpProtoAndCheckerMaker { public: RecvOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("RX", "(Tensor) Input tensor to be saved"); + AddInput("RX", "(Tensor) Input tensor to be optimized").AsDuplicable(); AddComment(R"DOC( Recv operator @@ -112,6 +161,17 @@ This operator will recv tensor from send_op .AddCustomChecker([](const std::string &ip) { return !ip.empty(); }); AddAttr("OptimizeProgram", "type string", "Serialized ProgramDesc string for recv to run."); + AddAttr>( + "ParamList", "type list of string", + "grad->param name mapping to find which param to optimize.") + .SetDefault({}); + AddAttr>( + "GradList", "type list of string", + "grad->param name mapping to find which param to optimize.") + .SetDefault({}); + AddAttr("Trainers", "type int", + "Number of trainers in the current cluster job") + .SetDefault(1); } }; diff --git a/paddle/operators/send_op.cc b/paddle/operators/send_op.cc index 0d121fb48dc2dc8bd0312aa2091f4e058caa4515..a5681910708bae584bdf4217b61ca63a3988e1ff 100644 --- a/paddle/operators/send_op.cc +++ b/paddle/operators/send_op.cc @@ -34,45 +34,56 @@ class SendOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) { // init client when the operator is created at runtime. - if (!client_) { - std::string endpoint = Attr("endpoint"); - client_.reset(new detail::RPCClient( - grpc::CreateChannel(endpoint, grpc::InsecureChannelCredentials()))); - // TODO(typhoonzero): how to call InitVariables + std::vector endpoints = + Attr>("endpoints"); + for (auto ep : endpoints) { + client_map_[ep].reset(new detail::RPCClient( + grpc::CreateChannel(ep, grpc::InsecureChannelCredentials()))); } } void Run(const framework::Scope &scope, const platform::DeviceContext &dev_ctx) const override { - auto iname = Input("X"); - auto oname = Output("Out"); - // TODO(typhoonzero): currently it's non-blocking, - // should block until server responds. - bool ret = client_->SendVariable(scope, iname, oname); - if (!ret) { - LOG(ERROR) << "send variable error"; + auto ins = Inputs("X"); + std::vector epmap = Attr>("epmap"); + // TODO(typhoonzero): use async calls to send multiple variable asyncly. + for (size_t i = 0; i < ins.size(); ++i) { + bool ret = client_map_[epmap[i]]->SendVariable(scope, ins[i]); + if (!ret) { + LOG(ERROR) << "send variable error: " << ins[i]; + } + } + // TODO(typhoonzero): support async optimization + client_map_[epmap[0]]->Wait(); + for (size_t i = 0; i < ins.size(); ++i) { + bool ret = client_map_[epmap[i]]->GetVariable(scope, ins[i]); + if (!ret) { + LOG(ERROR) << "GetVariable error: " << ins[i]; + } } } protected: - std::shared_ptr client_{nullptr}; + mutable std::unordered_map> + client_map_; }; class SendOpMaker : public framework::OpProtoAndCheckerMaker { public: SendOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "(Tensor) Input tensor to be saved"); - AddOutput("Out", "(Tensor) Output fetched from server"); + AddInput("X", "(Tensor) Input tensor to be send").AsDuplicable(); AddComment(R"DOC( Recv operator This operator will recv tensor from send_op )DOC"); - AddAttr("endpoint", - "(string, default 127.0.0.1:6164)" - "IP address to listen on.") - .SetDefault("127.0.0.1:6164") - .AddCustomChecker([](const std::string &ip) { return !ip.empty(); }); + AddAttr>("endpoints", + "(string vector, default 127.0.0.1:6164)" + "Server endpoints to send variables to."); + AddAttr>("epmap", + "(string vector, default 127.0.0.1:6164)" + "Server endpoints in the order of input " + "variables for mapping"); } }; diff --git a/paddle/operators/send_recv_op_test.cc b/paddle/operators/send_recv_op_test.cc index 3e2e2051afacb748877e3b0c3dec8d6662ac4e72..d899d8154cce57a88de0e1d19e3393528ce367e2 100644 --- a/paddle/operators/send_recv_op_test.cc +++ b/paddle/operators/send_recv_op_test.cc @@ -16,12 +16,14 @@ // a RemoteOptimizer. #include +#include #include #include "gtest/gtest.h" #include "paddle/framework/op_registry.h" #include "paddle/framework/operator.h" #include "paddle/framework/program_desc.h" +#include "paddle/string/printf.h" USE_NO_KERNEL_OP(send); USE_NO_KERNEL_OP(recv); @@ -33,30 +35,33 @@ std::unique_ptr recv_op; void InitTensorsInScope(paddle::framework::Scope &scope, paddle::platform::CPUPlace &place) { paddle::platform::CPUDeviceContext ctx(place); - auto var = scope.Var("X"); - auto tensor = var->GetMutable(); - tensor->Resize({10, 10}); - float *expect = tensor->mutable_data(place); - for (int64_t i = 0; i < tensor->numel(); ++i) { - expect[i] = static_cast(i); + for (int i = 0; i < 2; ++i) { + auto var_name = paddle::string::Sprintf("x%d", i); + auto var = scope.Var(var_name); + auto tensor = var->GetMutable(); + tensor->Resize({10, 10}); + float *expect = tensor->mutable_data(place); + for (int64_t i = 0; i < tensor->numel(); ++i) { + expect[i] = static_cast(i); + } } auto out_var = scope.Var("Out"); auto out_tensor = out_var->GetMutable(); out_tensor->Resize({10, 10}); - tensor->mutable_data(place); // allocate + out_tensor->mutable_data(place); // allocate } void AddOp(const std::string &type, const paddle::framework::VariableNameMap &inputs, const paddle::framework::VariableNameMap &outputs, paddle::framework::AttributeMap attrs, - paddle::framework::BlockDescBind *block) { + paddle::framework::BlockDesc *block) { // insert output for (auto kv : outputs) { for (auto v : kv.second) { auto var = block->Var(v); - var->SetDataType(paddle::framework::DataType::FP32); + var->SetDataType(paddle::framework::proto::DataType::FP32); } } @@ -78,10 +83,10 @@ void StartServerNet() { InitTensorsInScope(scope, place); // sub program run in recv_op, for simple test we use sum - paddle::framework::ProgramDescBind program; - paddle::framework::BlockDescBind *block = program.MutableBlock(0); + paddle::framework::ProgramDesc program; + paddle::framework::BlockDesc *block = program.MutableBlock(0); // X for server side tensors, RX for received tensers, must be of same shape. - AddOp("sum", {{"X", {"X", "RX"}}}, {{"Out", {"Out"}}}, {}, block); + AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, block); paddle::framework::AttributeMap attrs; attrs.insert({"endpoint", std::string("127.0.0.1:6174")}); @@ -89,8 +94,8 @@ void StartServerNet() { PADDLE_ENFORCE(program.Proto()->SerializeToString(&program_proto)); attrs.insert({"OptimizeProgram", program_proto}); - recv_op = paddle::framework::OpRegistry::CreateOp("recv", {{"RX", {"RX"}}}, - {{"Out", {"Out"}}}, attrs); + recv_op = paddle::framework::OpRegistry::CreateOp( + "recv", {{"RX", {"x0", "x1"}}}, {{"Out", {"Out"}}}, attrs); paddle::platform::CPUDeviceContext ctx(place); recv_op->Run(scope, ctx); } @@ -107,11 +112,11 @@ TEST(SendRecvOp, CPU) { attrs.insert({"endpoint", std::string("127.0.0.1:6174")}); auto send_op = paddle::framework::OpRegistry::CreateOp( - "send", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs); + "send", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, attrs); paddle::platform::CPUDeviceContext ctx(place); send_op->Run(scope, ctx); - auto in_var = scope.Var("X"); + auto in_var = scope.Var("x0"); auto tensor = in_var->GetMutable(); float *expected = tensor->data(); diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc index 88e9cdadd8650504706baf07e8d29fa0663f2905..f105370f226e2cceaac685f280d55134d4291028 100644 --- a/paddle/pybind/protobuf.cc +++ b/paddle/pybind/protobuf.cc @@ -159,6 +159,7 @@ void BindBlockDesc(py::module &m) { py::return_value_policy::reference) .def("prepend_op", &BlockDesc::PrependOp, py::return_value_policy::reference) + .def("remove_op", &BlockDesc::RemoveOp) .def("var", [](BlockDesc &self, py::bytes byte_name) { std::string name = byte_name; @@ -249,6 +250,12 @@ void BindOpDesc(py::module &m) { .def("set_attr", &OpDesc::SetAttr) .def("attr", &OpDesc::GetAttr) .def("set_block_attr", &OpDesc::SetBlockAttr) + .def("set_serialized_attr", + [](OpDesc &self, const std::string &name, + const py::bytes &seriralized) { + std::string ser(seriralized); + self.SetAttr(name, ser); + }) .def("block_attr", &OpDesc::GetBlockAttr) .def("check_attrs", &OpDesc::CheckAttrs) .def("infer_shape", &OpDesc::InferShape) diff --git a/python/paddle/v2/fluid/__init__.py b/python/paddle/v2/fluid/__init__.py index 9b3792ee9e3e4c6f319b3e2b13c4aa3a05cce8be..471255ef50a3be4739a89efbd978cdb4304d992d 100644 --- a/python/paddle/v2/fluid/__init__.py +++ b/python/paddle/v2/fluid/__init__.py @@ -16,13 +16,14 @@ import regularizer from param_attr import ParamAttr from data_feeder import DataFeeder from core import LoDTensor, CPUPlace, GPUPlace +from distribute_transpiler import DistributeTranspiler import clip Tensor = LoDTensor __all__ = framework.__all__ + executor.__all__ + [ 'io', 'initializer', 'layers', 'nets', 'optimizer', 'backward', 'regularizer', 'LoDTensor', 'CPUPlace', 'GPUPlace', 'Tensor', 'ParamAttr' - 'DataFeeder', 'clip' + 'DataFeeder', 'clip', 'DistributeTranspiler' ] diff --git a/python/paddle/v2/fluid/distribute_transpiler.py b/python/paddle/v2/fluid/distribute_transpiler.py new file mode 100644 index 0000000000000000000000000000000000000000..50364c64bec46aabc7be4f4b4370a3ad5b0eb07c --- /dev/null +++ b/python/paddle/v2/fluid/distribute_transpiler.py @@ -0,0 +1,238 @@ +import framework +from framework import Program, default_main_program, Parameter, Variable +import optimizer +from layer_helper import LayerHelper + + +def hash_name_to_server(params_grads, pserver_endpoints): + """ + :param param_grads: + :return: a map of pserver endpoint -> + params -> [param list] + grads -> [grad list] + """ + + def _hash_param(param_name, total): + return hash(param_name) % total + + param_grad_map = dict() + for param, grad in params_grads: + if param.trainable is True and grad is not None: + server_id = _hash_param(param.name, len(pserver_endpoints)) + server_for_param = pserver_endpoints[server_id] + if not param_grad_map.has_key(server_for_param): + param_grad_map[server_for_param] = {"params": [], "grads": []} + param_grad_map[server_for_param]["params"].append(param) + param_grad_map[server_for_param]["grads"].append(grad) + + return param_grad_map + + +def round_robin(params_grads, pserver_endpoints): + assert (len(params_grads) > len(pserver_endpoints)) + + param_grad_map = dict() + pserver_idx = 0 + for param, grad in params_grads: + if param.trainable is True: + server_for_param = pserver_endpoints[pserver_idx] + if not param_grad_map.has_key(server_for_param): + param_grad_map[server_for_param] = {"params": [], "grads": []} + + param_grad_map[server_for_param]["params"].append(param) + param_grad_map[server_for_param]["grads"].append(grad) + + pserver_idx += 1 + if pserver_idx >= len(pserver_endpoints): + pserver_idx = 0 + return param_grad_map + + +class DistributeTranspiler: + def transpile(self, + optimize_ops, + params_grads, + program=None, + pservers="127.0.0.1:6174", + trainers=1, + split_method=round_robin): + """ + Transpile the program to a distributed data-parallelism programs. + + The main_program will be transform to use a remote parameter server + to do parameter optimization. And the optimization graph will be put + in to a parameter server program. + + Use different methods to split trainable varialbles to different + parameter servers. + + Example to run: + + exe = fluid.Executor(place) + t = fluid.DistributeTranspiler() + t.transpile(optimize_ops, params_grads, pservers="127.0.0.1:6174", trainers=1) + + pserver_endpoint = os.getenv("PSERVER") + if pserver_endpoint: + pserver_prog = t.get_pserver_program(pserver_endpoint, optimize_ops) + exe.run(fluid.default_startup_program()) + exe.run(pserver_prog) + else: + feeder = fluid.DataFeeder(feed_list=[images, label], place=place) + exe.run(fluid.default_startup_program()) + + for pass_id in range(PASS_NUM): + ... + + :param optimize_ops: op list of optimization, should be the + return value of Optimizer.minimize + :type optimize_ops: list + :param program: program to optimize, default default_main_program + :param pservers: parameter server endpoints like "m1:6174,m2:6174" + :type pservers: string + + :return: return a list of programs + """ + if program is None: + program = default_main_program() + self.trainers = trainers + self._optimize_distributed( + optimize_ops, + program, + params_grads, + pservers=pservers, + trainers=trainers, + split_method=split_method) + + def _clone_param(self, block, v): + assert isinstance(v, Parameter) + new_p = Parameter( + block=block, + shape=v.shape, + dtype=v.dtype, + type=v.type, + lod_level=v.lod_level, + stop_gradient=v.stop_gradient, + trainable=v.trainable, + optimize_attr=v.optimize_attr, + regularizer=v.regularizer, + name=v.name) + block.vars[new_p.name] = new_p + + def _clone_var(self, block, var): + assert isinstance(var, Variable) + return block.create_var( + name=var.name, + shape=var.shape, + dtype=var.dtype, + type=var.type, + lod_level=var.lod_level, + persistable=var.persistable) + + def _optimize_distributed(self, optimize_ops, program, params_and_grads, + **kwargs): + if kwargs.has_key("split_method"): + split_method = kwargs["split_method"] + else: + split_method = round_robin + + assert (callable(split_method)) + pserver_endpoints = kwargs["pservers"].split(",") + self.param_grad_map = split_method(params_and_grads, pserver_endpoints) + + send_op_ordered_inputs = [] + epmap = [] + for ep, v in self.param_grad_map.iteritems(): + send_op_ordered_inputs.extend(v["grads"]) + for i in v["grads"]: + epmap.append(ep) + send_op = program.global_block().append_op( + type="send", + inputs={"X": send_op_ordered_inputs + }, # inputs is a list of tensors to be send + outputs={}, + attrs={"endpoints": pserver_endpoints, + "epmap": epmap}) + + def get_trainer_program(optimize_ops, program): + # remove optimize ops and add a send op to main_program + program.global_block().delete_ops(optimize_ops) + + def _create_var_for_trainers(self, block, var, trainers): + var_list = [] + for i in xrange(trainers): + var_each = block.create_var( + name="%s.trainer_%d" % (var.name, i), + psersistable=var.persistable, + dtype=var.dtype, + shape=var.shape) + var_list.append(var_each) + return var_list + + def get_pserver_program(self, endpoint, optimize_ops): + pserver_program = Program() + for v in self.param_grad_map[endpoint]["params"]: + self._clone_param(pserver_program.global_block(), v) + + optimize_sub_program = Program() + grad_var_names = [ + var.name for var in self.param_grad_map[endpoint]["grads"] + ] + for opt_op in optimize_ops: + for _, var in opt_op.inputs.iteritems(): + # NOTE: append operators to merge gradients from multiple + # trainers. If trainers == 1, this is not needed. + if self.trainers > 1 and var.name in grad_var_names: + vars2merge = self._create_var_for_trainers( + optimize_sub_program.global_block(), var, self.trainers) + merged_var = optimize_sub_program.global_block().create_var( + name=var.name, + persistable=var.persistable, + dtype=var.dtype, + shape=var.shape) + optimize_sub_program.global_block().append_op( + type="sum", + inputs={"X": vars2merge}, + outputs={"Out": merged_var}) + optimize_sub_program.global_block().append_op( + type="scale", + inputs={"X": merged_var}, + outputs={"Out": merged_var}, + attrs={"scale": 1.0 / float(self.trainers)}) + else: + optimize_sub_program.global_block().create_var( + name=var.name, + persistable=var.persistable, + dtype=var.dtype, + shape=var.shape) + + if opt_op.inputs.has_key("Grad"): + if opt_op.inputs["Grad"].name in grad_var_names: + print "appending ", opt_op.type, opt_op.inputs + optimize_sub_program.global_block().append_op( + type=opt_op.type, + inputs=opt_op.inputs, + outputs=opt_op.outputs, + attrs=opt_op.attrs) + else: + optimize_sub_program.global_block().append_op( + type=opt_op.type, + inputs=opt_op.inputs, + outputs=opt_op.outputs, + attrs=opt_op.attrs) + pserver_program.global_block().append_op( + type="recv", + inputs={"RX": + self.param_grad_map[endpoint]["grads"]}, # grads to recv + outputs={}, + attrs={ + "OptimizeProgram": optimize_sub_program.desc, + "endpoint": endpoint, + "ParamList": + [p.name for p in self.param_grad_map[endpoint]["params"]], + "GradList": + [p.name for p in self.param_grad_map[endpoint]["grads"]], + "Trainers": self.trainers + }) + pserver_program.sync_with_cpp() + return pserver_program diff --git a/python/paddle/v2/fluid/executor.py b/python/paddle/v2/fluid/executor.py index 9a99b045dc70a9e4662a6f4da141183ffc8f1846..4b4a0820abb9a85f3e9936190c835c2f186107b3 100644 --- a/python/paddle/v2/fluid/executor.py +++ b/python/paddle/v2/fluid/executor.py @@ -1,6 +1,6 @@ import numpy as np from . import core -from framework import Program, default_main_program +from framework import Program, default_main_program, Parameter, Variable __all__ = ['Executor', 'g_scope'] @@ -148,7 +148,7 @@ class Executor(object): outputs={'Out': [fetch_var]}, attrs={'col': i}) - self.executor.run(program.desc, scope, 0, True) + self.executor.run(program.desc, scope, 0, True, True) outs = [ core.get_fetch_variable(scope, fetch_var_name, i) for i in xrange(len(fetch_list)) diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py index d1b12a8f097ed674b6b6384fe8ba5db950a94da5..920a49b69c77ccb98214ab76b4ee2aa125e01f02 100644 --- a/python/paddle/v2/fluid/framework.py +++ b/python/paddle/v2/fluid/framework.py @@ -359,6 +359,10 @@ class Operator(object): """ self.block = block self.desc = desc + # for clone a new operator + self.inputs = inputs + self.outputs = outputs + self.attrs = attrs if len(self.desc.type()) != 0: return if type is None: @@ -430,13 +434,18 @@ class Operator(object): continue if isinstance(attrs[attr_name], Block): self.desc.set_block_attr(attr_name, attrs[attr_name].desc) + elif isinstance(attrs[attr_name], core.BlockDesc) or \ + isinstance(attrs[attr_name], core.ProgramDesc): + self.desc.set_serialized_attr( + attr_name, attrs[attr_name].serialize_to_string()) else: self.desc.set_attr(attr_name, attrs[attr_name]) self.desc.check_attrs() no_kernel_op_set = { 'feed', 'fetch', 'save', 'load', 'recurrent', - 'rnn_memory_helper_grad', 'conditional_block', 'while' + 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', + 'recv' } if type not in no_kernel_op_set: self.desc.infer_var_type(self.block.desc) @@ -582,6 +591,7 @@ class Block(object): self.vars = dict() # var_name --> var self.ops = collections.deque() # operator list self.program = program + self.removed_vars = dict() def __str__(self): return self.to_string(True) @@ -638,6 +648,16 @@ class Block(object): self.ops.append(op) return op + def delete_ops(self, ops): + # remove from cpp + # FIXME(typhoonzero): remove only the first occuracy. + try: + start = list(self.ops).index(ops[0]) + end = list(self.ops).index(ops[-1]) + except Exception, e: + raise e + self.desc.remove_op(start, end) + def prepend_op(self, *args, **kwargs): op_desc = self.desc.prepend_op() op = Operator(self, op_desc, *args, **kwargs) diff --git a/python/paddle/v2/fluid/optimizer.py b/python/paddle/v2/fluid/optimizer.py index 84fcbcdc2f2868a84bad5e145a934e33485b1fef..c56a531ed531cf0219e94854ba66c7399e003292 100644 --- a/python/paddle/v2/fluid/optimizer.py +++ b/python/paddle/v2/fluid/optimizer.py @@ -207,7 +207,7 @@ class Optimizer(object): optimize_ops = self.create_optimization_pass(params_grads, loss, startup_program) - return optimize_ops + return optimize_ops, params_grads class SGDOptimizer(Optimizer): diff --git a/python/paddle/v2/fluid/tests/book/notest_recognize_digits_conv_dist.py b/python/paddle/v2/fluid/tests/book/notest_recognize_digits_conv_dist.py new file mode 100644 index 0000000000000000000000000000000000000000..2680502efb91061be37a77fbe5b451960fdd15f7 --- /dev/null +++ b/python/paddle/v2/fluid/tests/book/notest_recognize_digits_conv_dist.py @@ -0,0 +1,72 @@ +from __future__ import print_function +import numpy as np +import paddle.v2 as paddle +import paddle.v2.fluid as fluid +import os + +images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype='float32') +label = fluid.layers.data(name='label', shape=[1], dtype='int64') +conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=images, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu") +conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu") + +predict = fluid.layers.fc(input=conv_pool_2, size=10, act="softmax") +cost = fluid.layers.cross_entropy(input=predict, label=label) +avg_cost = fluid.layers.mean(x=cost) +optimizer = fluid.optimizer.Adam(learning_rate=0.01) +optimize_ops, params_grads = optimizer.minimize(avg_cost) + +accuracy = fluid.evaluator.Accuracy(input=predict, label=label) + +BATCH_SIZE = 50 +PASS_NUM = 3 +train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=500), + batch_size=BATCH_SIZE) + +place = fluid.CPUPlace() +exe = fluid.Executor(place) +t = fluid.DistributeTranspiler() +pserver_endpoints = os.getenv("PSERVERS") +training_role = os.getenv("TRAINING_ROLE", + "TRAINER") # get the training role: trainer/pserver +t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=1) + +if training_role == "PSERVER": + pserver_prog = t.get_pserver_program(pserver_endpoints, optimize_ops) + exe.run(fluid.default_startup_program()) + exe.run(pserver_prog) +elif training_role == "TRAINER": + feeder = fluid.DataFeeder(feed_list=[images, label], place=place) + exe.run(fluid.default_startup_program()) + + for pass_id in range(PASS_NUM): + accuracy.reset(exe) + for data in train_reader(): + loss, acc = exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[avg_cost] + accuracy.metrics) + pass_acc = accuracy.eval(exe) + # print loss, acc + if loss < 10.0 and pass_acc > 0.9: + # if avg cost less than 10.0 and accuracy is larger than 0.9, we think our code is good. + exit(0) + + pass_acc = accuracy.eval(exe) + print("pass_id=" + str(pass_id) + " pass_acc=" + str(pass_acc)) +else: + print("environment var TRAINER_ROLE should be TRAINER os PSERVER") + +exit(1) diff --git a/python/paddle/v2/fluid/tests/test_optimizer.py b/python/paddle/v2/fluid/tests/test_optimizer.py index 2459dfd664300d405edb36c4ca906c1769b5e7d2..29694be58bce0eb41b05439da35ef07a542ef12a 100644 --- a/python/paddle/v2/fluid/tests/test_optimizer.py +++ b/python/paddle/v2/fluid/tests/test_optimizer.py @@ -27,7 +27,7 @@ class TestOptimizer(unittest.TestCase): block.append_op( type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}) sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.01) - opts = sgd_optimizer.minimize(mean_out, init_program) + opts, _ = sgd_optimizer.minimize(mean_out, init_program) self.assertEqual(len(opts), 1) sgd_op = opts[0] self.assertEqual(sgd_op.type, "sgd") @@ -57,7 +57,7 @@ class TestOptimizer(unittest.TestCase): learning_rate = 0.01 sgd_optimizer = optimizer.SGDOptimizer( learning_rate=learning_rate, global_step=global_step) - opts = sgd_optimizer.minimize(mean_out, init_program) + opts, _ = sgd_optimizer.minimize(mean_out, init_program) self.assertEqual(len(opts), 2) sgd_op = opts[0] self.assertEqual(sgd_op.type, "sgd")