From fbd186bd5d6dced8255607f9b6266cd438c564dc Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Fri, 8 Feb 2019 14:18:14 +0800
Subject: [PATCH] complete recv op

---
 .../operators/distributed_ops/recv_op.cc      | 58 ++++++++++++-------
 .../fluid/transpiler/distribute_transpiler.py | 25 +++++---
 2 files changed, 53 insertions(+), 30 deletions(-)

diff --git a/paddle/fluid/operators/distributed_ops/recv_op.cc b/paddle/fluid/operators/distributed_ops/recv_op.cc
index 5e004a7a3cb..a0185d66f0b 100644
--- a/paddle/fluid/operators/distributed_ops/recv_op.cc
+++ b/paddle/fluid/operators/distributed_ops/recv_op.cc
@@ -20,6 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/distributed/distributed.h"
+#include "paddle/fluid/operators/distributed/parameter_recv.h"
 #include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
@@ -48,32 +49,45 @@ class RecvOp : public framework::OperatorBase {
         distributed::RPCClient::GetInstance<RPCCLIENT_T>(
             Attr<int>("trainer_id"));
 
-    if (with_barrier) {
-      std::vector<distributed::VarHandlePtr> rets;
-      for (size_t i = 0; i < outs.size(); i++) {
-        std::string varname = varnames.size() == 0 ? outs[i] : varnames[i];
-        VLOG(4) << "recv " << outs[i] << " from " << epmap[i] << " with "
-                << varname << " and with AsyncGetVar";
-        rets.push_back(
-            rpc_client->AsyncGetVar(epmap[i], ctx, scope, varname, outs[i]));
-      }
-      if (sync_mode) {
+    std::vector<std::string> recv_varnames =
+        Attr<std::vector<std::string>>("recv_varnames");
+
+    if (recv_varnames.size() > 0) {
+      framework::RuntimeContext ctx(Inputs(), Outputs(), scope);
+      platform::DeviceContextPool &pool =
+          platform::DeviceContextPool::Instance();
+      auto *dev_ctx = pool.Get(place);
+      auto exe_ctx = framework::ExecutionContext(*this, scope, *dev_ctx, ctx);
+      auto recv_functor = distributed::ParameterRecv<float>();
+      recv_functor(outs[0], recv_varnames, epmap, exe_ctx, scope);
+    } else {
+      if (with_barrier) {
+        std::vector<distributed::VarHandlePtr> rets;
+        for (size_t i = 0; i < outs.size(); i++) {
+          std::string varname = varnames.size() == 0 ? outs[i] : varnames[i];
+          VLOG(4) << "recv " << outs[i] << " from " << epmap[i] << " with "
+                  << varname << " and with AsyncGetVar";
+          rets.push_back(
+              rpc_client->AsyncGetVar(epmap[i], ctx, scope, varname, outs[i]));
+        }
+        if (sync_mode) {
+          for (size_t i = 0; i < rets.size(); i++) {
+            PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
+          }
+        }
+      } else {
+        std::vector<distributed::VarHandlePtr> rets;
+        for (size_t i = 0; i < outs.size(); i++) {
+          std::string varname = varnames.size() == 0 ? outs[i] : varnames[i];
+          VLOG(4) << "recv " << outs[i] << " from " << epmap[i] << " with "
+                  << varname << " and with AsyncGetVarNoBarrier";
+          rets.push_back(rpc_client->AsyncGetVarNoBarrier(epmap[i], ctx, scope,
+                                                          varname, outs[i]));
+        }
         for (size_t i = 0; i < rets.size(); i++) {
           PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
         }
       }
-    } else {
-      std::vector<distributed::VarHandlePtr> rets;
-      for (size_t i = 0; i < outs.size(); i++) {
-        std::string varname = varnames.size() == 0 ? outs[i] : varnames[i];
-        VLOG(4) << "recv " << outs[i] << " from " << epmap[i] << " with "
-                << varname << " and with AsyncGetVarNoBarrier";
-        rets.push_back(rpc_client->AsyncGetVarNoBarrier(epmap[i], ctx, scope,
-                                                        varname, outs[i]));
-      }
-      for (size_t i = 0; i < rets.size(); i++) {
-        PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
-      }
     }
   }
 };
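Note on the recv_op.cc change: when the new recv_varnames attribute is non-empty, the op no longer issues one AsyncGetVar per output variable; it calls distributed::ParameterRecv once, which fetches every split slice from its endpoint and merges them into the single output outs[0]. A rough sketch of that merge semantics in plain Python with numpy (not the Paddle API; fetch_slice is a hypothetical stand-in for the RPC get):

    import numpy as np

    def parameter_recv_sketch(recv_varnames, epmap, fetch_slice):
        # Fetch each split slice from its endpoint, then stitch the rows
        # back into the original parameter: a concat along axis 0, the
        # same merge the transpiler's concat op used to perform.
        slices = [fetch_slice(ep, name)
                  for ep, name in zip(epmap, recv_varnames)]
        return np.concatenate(slices, axis=0)

    # Example: a 6x4 parameter split row-wise across two parameter servers.
    table = {
        ("127.0.0.1:6174", "w.block0"): np.ones((3, 4)),
        ("127.0.0.1:6175", "w.block1"): np.zeros((3, 4)),
    }
    merged = parameter_recv_sketch(["w.block0", "w.block1"],
                                   ["127.0.0.1:6174", "127.0.0.1:6175"],
                                   lambda ep, name: table[(ep, name)])
    assert merged.shape == (6, 4)
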
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 1b1b4165933..ae7deda897e 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -519,12 +519,20 @@ class DistributeTranspiler(object):
                     param_varname, height_sections, eps, table_names)
             else:
                 all_recv_outputs.extend(splited_var)
+
+                recv_varnames = []
+                if self.config.runtime_split_send_recv:
+                    orig_param = program.global_block().vars[param_varname]
+                    recv_varnames = [var.name for var in splited_var]
+                    splited_var = [orig_param]
+
                 program.global_block().append_op(
                     type="recv",
                     inputs={"X": [recv_dep_in]},
                     outputs={"Out": splited_var},
                     attrs={
                         "epmap": eps,
+                        "recv_varnames": recv_varnames,
                         "trainer_id": self.trainer_id,
                         RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE,
                         OP_ROLE_VAR_ATTR_NAME:
@@ -549,14 +557,15 @@ class DistributeTranspiler(object):
                 continue
             orig_param = program.global_block().vars[param_varname]
             if param_varname not in self.sparse_param_to_height_sections:
-                program.global_block().append_op(
-                    type="concat",
-                    inputs={"X": splited_var},
-                    outputs={"Out": [orig_param]},
-                    attrs={
-                        "axis": 0,
-                        RPC_OP_ROLE_ATTR_NAME: DIST_OP_ROLE_ATTR_VALUE
-                    })
+                if not self.config.runtime_split_send_recv:
+                    program.global_block().append_op(
+                        type="concat",
+                        inputs={"X": splited_var},
+                        outputs={"Out": [orig_param]},
+                        attrs={
+                            "axis": 0,
+                            RPC_OP_ROLE_ATTR_NAME: DIST_OP_ROLE_ATTR_VALUE
+                        })
 
         self._get_trainer_startup_program(recv_vars=recv_vars, eplist=eplist)
--
GitLab
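On the transpiler side, when config.runtime_split_send_recv is enabled the recv op is emitted with recv_varnames filled in and the original unsplit parameter as its only output, and the trailing concat op is skipped because the merge now happens inside the op at runtime. A hedged usage sketch, assuming a Paddle build that includes this patch (exact transpile arguments vary by release):

    from paddle.fluid.transpiler import (DistributeTranspiler,
                                         DistributeTranspilerConfig)

    config = DistributeTranspilerConfig()
    # With this flag on, the transpiled trainer program carries recv ops
    # that merge parameter slices themselves; no concat op is appended.
    config.runtime_split_send_recv = True

    # Assumes the trainer program (with optimizer ops) has already been
    # built as the default main program.
    t = DistributeTranspiler(config=config)
    t.transpile(trainer_id=0,
                pservers="127.0.0.1:6174,127.0.0.1:6175",
                trainers=2)
    trainer_prog = t.get_trainer_program()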