diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index bc61b0eacbf6c8a1fd4487ad5a442fed1b536345..7722c9401e0e7c071adb7bee9b35306431bb7a11 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -754,17 +754,26 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
                  node->Op()->Type());
   CreateComputationalOp(result, node, op_dev_id);
-  if (node->Op()->Type() == "concat") {
-    ConnectOp(result, result->Get<GraphOps>(kGraphOps).back().get(),
-              "fetch_barrier");
+}
+
+void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) {
+  auto *op_handle = result->Get<GraphOps>(kGraphOps).back().get();
+  for (ir::Node *input : node->inputs) {
+    VarHandle *var = nullptr;
+    for (int place_offset = 0; place_offset < num_places; ++place_offset) {
+      auto &var_holders = result->Get<GraphVars>(kGraphVars)[place_offset];
+      auto &var_holder = var_holders[input->Name()];
+      if (!var_holder.empty()) {
+        var = var_holder.rbegin()->get();
+        op_handle->AddInput(var);
+      }
+    }
   }
 }
 
 // Create RPC related op handles that connects its in ops and out ops.
 void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
                                           ir::Node *node) const {
-  // FIXME(typhoonzero): Cleanup this deps for both sync mode and async mode
-  // put them into transpiler.
   int op_dev_id = -1;
   if (node->Op()->Type() == "send") {
     // TODO(paddle-dev): getting the first var is not safe.
@@ -799,8 +808,6 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
     }
     auto recv_param_grad = boost::get<std::vector<std::string>>(
         node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName()));
-    // FIXME(typhoonzero): assume each recv op output one param
-    // Use the same place as send.
     if (recv_param_grad.size() == 2U) {
       op_dev_id = GetVarDeviceID(*result, recv_param_grad[1]);
       VLOG(10) << "recv param " << recv_param_grad[0]
@@ -814,34 +821,44 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
               .emplace(varname, op_dev_id);
     }
   } else {
-    // send_barrier and fetch_barrier op can be scheduled on device 0
+    // send_barrier, fetch_barrier will run on place 0;
    op_dev_id = 0;
   }
 
   PADDLE_ENFORCE(op_dev_id != -1, "can not find the right place for rpc op: %s",
                  node->Op()->Type());
-
   result->Get<GraphOps>(kGraphOps).emplace_back(new RPCOpHandle(
       result->CreateOpNode(node->Op()), *node->Op(), local_scopes_[op_dev_id],
       node->Op()->Type(), places_[op_dev_id]));
 
-  // TODO(panyx0718): This might not be needed anymore.
-  if (node->Op()->Type() == "send_barrier") {
-    ConnectOp(result, result->Get<GraphOps>(kGraphOps).back().get(), "send");
-  } else if (node->Op()->Type() == "recv") {
-    ConnectOp(result, result->Get<GraphOps>(kGraphOps).back().get(),
-              "send_barrier");
-  } else if (node->Op()->Type() == "fetch_barrier") {
-    ConnectOp(result, result->Get<GraphOps>(kGraphOps).back().get(), "recv");
-  } else if (node->Op()->Type() == "send") {
-    // do nothing
+  if (node->Op()->Type() == "send") {
+    CreateOpHandleIOs(result, node, op_dev_id);
   } else {
-    PADDLE_THROW(
-        "rpc op should be in ["
-        "send, send_barrier. recv, fetch_barrier]");
-  }
+    // send_barrier, recv, fetch_barrier's inputs are deps var, get them from
+    // all places
+    auto p = places_[op_dev_id];
+    auto *op_handle = result->Get<GraphOps>(kGraphOps).back().get();
+    op_handle->SetDeviceContext(p,
+                                platform::DeviceContextPool::Instance().Get(p));
 
-  CreateOpHandleIOs(result, node, op_dev_id);
+    SetOpInputsAllPlaces(result, node, places_.size());
+    for (ir::Node *output : node->outputs) {
+      int outvar_dev_id = op_dev_id;
+      if (node->Op()->Type() == "fetch_barrier") {
+        outvar_dev_id = GetVarDeviceID(*result, output->Name());
+        PADDLE_ENFORCE_NE(outvar_dev_id, -1);
+      }
+      p = places_[outvar_dev_id];
+      ir::Node *new_node = nullptr;
+      if (output->Var()) {
+        new_node = result->CreateVarNode(output->Var());
+      } else {
+        new_node =
+            result->CreateEmptyNode(output->Name(), ir::Node::Type::kVariable);
+      }
+      CreateOpOutput(result, op_handle, new_node, p, outvar_dev_id);
+    }
+  }
 }
 
 bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const {
diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc
index 2a6bf4ac230df81b38751000bf4b663f24984db3..39b0f2f038380b4631728e28031511205c4b40f2 100644
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@@ -132,63 +132,6 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
     }
   }
 
-  std::vector<ir::Node *> send_ops;
-  ir::Node *send_bar = nullptr;
-  std::vector<ir::Node *> recv_ops;
-  ir::Node *fetch_bar = nullptr;
-  for (ir::Node *node : Nodes()) {
-    if (node->Name() == "send") {
-      send_ops.push_back(node);
-    } else if (node->Name() == "send_barrier") {
-      PADDLE_ENFORCE(!send_bar, "only has one send barrier");
-      send_bar = node;
-    } else if (node->Name() == "recv") {
-      recv_ops.push_back(node);
-    } else if (node->Name() == "fetch_barrier") {
-      PADDLE_ENFORCE(!fetch_bar, "only has one fetch barrier");
-      fetch_bar = node;
-    }
-  }
-  if (send_bar) {
-    for (ir::Node *send : send_ops) {
-      ir::Node *dep_var = CreateControlDepVar();
-      send->outputs.push_back(dep_var);
-      dep_var->inputs.push_back(send);
-      send_bar->inputs.push_back(dep_var);
-      dep_var->outputs.push_back(send_bar);
-    }
-    for (ir::Node *recv : recv_ops) {
-      ir::Node *dep_var = CreateControlDepVar();
-      recv->inputs.push_back(dep_var);
-      dep_var->outputs.push_back(recv);
-      send_bar->outputs.push_back(dep_var);
-      dep_var->inputs.push_back(send_bar);
-    }
-  }
-  if (fetch_bar) {
-    for (ir::Node *recv : recv_ops) {
-      ir::Node *dep_var = CreateControlDepVar();
-      recv->outputs.push_back(dep_var);
-      dep_var->inputs.push_back(recv);
-      fetch_bar->inputs.push_back(dep_var);
-      dep_var->outputs.push_back(fetch_bar);
-    }
-  }
-
-  std::vector<std::string> send_vars = FindDistTrainSendVars(send_ops);
-  std::vector<std::string> recv_vars = FindDistTrainRecvVars(recv_ops);
-  for (ir::Node *node : Nodes()) {
-    if (IsDistTrainOp(node, send_vars, recv_vars)) {
-      if (fetch_bar && node->Name() == "concat") {
-        ir::Node *dep_var = CreateControlDepVar();
-        fetch_bar->outputs.push_back(dep_var);
-        dep_var->inputs.push_back(fetch_bar);
-        node->inputs.push_back(dep_var);
-        dep_var->outputs.push_back(node);
-      }
-    }
-  }
-
   /**
    * We should handle write after read(WAR) and write after write(WAW) here.
    * Because some of the operators of the program can be executed parallelly.
diff --git a/paddle/fluid/operators/fetch_barrier_op.cc b/paddle/fluid/operators/fetch_barrier_op.cc
index d9cd956dfdff3d009d38ee5088f5396080580483..9d7ac7ab6194593747548fac3cefc8d4ed3058d8 100644
--- a/paddle/fluid/operators/fetch_barrier_op.cc
+++ b/paddle/fluid/operators/fetch_barrier_op.cc
@@ -52,6 +52,8 @@ class FetchBarrierOp : public framework::OperatorBase {
 class FetchBarrierOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() {
+    AddOutput("Out", "(Any) Dummy outputs, used for control dependency")
+        .AsDuplicable();
     AddComment(R"DOC(
 SendBarrier operator
 
diff --git a/paddle/fluid/operators/send_barrier_op.cc b/paddle/fluid/operators/send_barrier_op.cc
index 14b07649c416ff1b671fc9b5ee4eb956b44570c5..40404295266899c6ac2f7b1e08fdf7db40958794 100644
--- a/paddle/fluid/operators/send_barrier_op.cc
+++ b/paddle/fluid/operators/send_barrier_op.cc
@@ -56,6 +56,10 @@ class SendBarrierOp : public framework::OperatorBase {
 class SendBarrierOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() {
+    AddInput("X", "(Any) Dummy inputs, used for control dependency")
+        .AsDuplicable();
+    AddOutput("Out", "(Any) Dummy outputs, used for control dependency")
+        .AsDuplicable();
     AddComment(R"DOC(
 SendBarrier operator
 
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index b03ee514f50f9a8c1425bd5b1d409b58ed62351a..0cf7aaef4ab75ca6976465d1b404004a9f2f64c5 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -246,7 +246,11 @@ def Send(endpoints, send_vars, dummy_output=None, sync=True):
             rpc_op_role_name: core.op_proto_and_checker_maker.OpRole.RPC
         })
     if sync:
-        helper.append_op(type="send_barrier", attrs={"endpoints": endpoints})
+        helper.append_op(
+            type="send_barrier",
+            inputs={"X": dummy_output},
+            outputs={"Out": []},
+            attrs={"endpoints": endpoints})
 
 
 def Recv(endpoints, get_vars, dummy_input=None, sync=True):
@@ -282,7 +286,10 @@ def Recv(endpoints, get_vars, dummy_input=None, sync=True):
         attrs={"endpoints": endpoints,
                "epmap": epmap})
     if sync:
-        helper.append_op(type="fetch_barrier", attrs={"endpoints": endpoints})
+        helper.append_op(
+            type="fetch_barrier",
+            outputs={"Out": get_vars},
+            attrs={"endpoints": endpoints})
     return get_vars
 
 
diff --git a/python/paddle/fluid/tests/unittests/dist_se_resnext.py b/python/paddle/fluid/tests/unittests/dist_se_resnext.py
index 0387e911880256ea6b8efb6f2311bbf4c4f8c0f2..a4ffe7d40c40501ebd43fec0b664159227ea34bd 100644
--- a/python/paddle/fluid/tests/unittests/dist_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py
@@ -134,7 +134,7 @@ class SE_ResNeXt():
                               size=class_dim,
                               act='softmax',
                               param_attr=fluid.ParamAttr(
-                                  initializer=fluid.initializer.Constant(value=0.2)))
+                                  initializer=fluid.initializer.Constant(value=0.05)))
         return out
 
     def shortcut(self, input, ch_out, stride):
@@ -184,7 +184,7 @@ class SE_ResNeXt():
             act=None,
             # avoid pserver CPU init differs from GPU
             param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.2)),
+                initializer=fluid.initializer.Constant(value=0.05)),
             bias_attr=False)
         return fluid.layers.batch_norm(input=conv, act=act)
 
@@ -192,13 +192,19 @@ class SE_ResNeXt():
         pool = fluid.layers.pool2d(
             input=input, pool_size=0, pool_type='avg', global_pooling=True)
         stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
-        squeeze = fluid.layers.fc(input=pool,
-                                  size=num_channels // reduction_ratio,
-                                  act='relu')
+        squeeze = fluid.layers.fc(
+            input=pool,
+            size=num_channels // reduction_ratio,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=0.05)),
+            act='relu')
         stdv = 1.0 / math.sqrt(squeeze.shape[1] * 1.0)
-        excitation = fluid.layers.fc(input=squeeze,
-                                     size=num_channels,
-                                     act='sigmoid')
+        excitation = fluid.layers.fc(
+            input=squeeze,
+            size=num_channels,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=0.05)),
+            act='sigmoid')
         scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
 
         return scale
diff --git a/python/paddle/fluid/tests/unittests/dist_word2vec.py b/python/paddle/fluid/tests/unittests/dist_word2vec.py
index 0ad994a258c04cabc807823b7d2a8ae8bb62ab2c..f3e740fc7027a4a562b836c3113b87d55062c185 100644
--- a/python/paddle/fluid/tests/unittests/dist_word2vec.py
+++ b/python/paddle/fluid/tests/unittests/dist_word2vec.py
@@ -49,28 +49,32 @@ class TestDistWord2vec2x2(TestDistRunnerBase):
                 dtype='float32',
                 is_sparse=IS_SPARSE,
                 param_attr=fluid.ParamAttr(
-                    name='shared_w', initializer=fluid.initializer.Constant()))
+                    name='shared_w',
+                    initializer=fluid.initializer.Constant(value=0.1)))
             embed_second = fluid.layers.embedding(
                 input=words[1],
                 size=[dict_size, EMBED_SIZE],
                 dtype='float32',
                 is_sparse=IS_SPARSE,
                 param_attr=fluid.ParamAttr(
-                    name='shared_w', initializer=fluid.initializer.Constant()))
+                    name='shared_w',
+                    initializer=fluid.initializer.Constant(value=0.1)))
             embed_third = fluid.layers.embedding(
                 input=words[2],
                 size=[dict_size, EMBED_SIZE],
                 dtype='float32',
                 is_sparse=IS_SPARSE,
                 param_attr=fluid.ParamAttr(
-                    name='shared_w', initializer=fluid.initializer.Constant()))
+                    name='shared_w',
+                    initializer=fluid.initializer.Constant(value=0.1)))
             embed_forth = fluid.layers.embedding(
                 input=words[3],
                 size=[dict_size, EMBED_SIZE],
                 dtype='float32',
                 is_sparse=IS_SPARSE,
                 param_attr=fluid.ParamAttr(
-                    name='shared_w', initializer=fluid.initializer.Constant()))
+                    name='shared_w',
+                    initializer=fluid.initializer.Constant(value=0.1)))
 
             concat_embed = fluid.layers.concat(
                 input=[embed_first, embed_second, embed_third, embed_forth],
@@ -80,13 +84,13 @@ class TestDistWord2vec2x2(TestDistRunnerBase):
                 size=HIDDEN_SIZE,
                 act='sigmoid',
                 param_attr=fluid.ParamAttr(
-                    initializer=fluid.initializer.Constant()))
+                    initializer=fluid.initializer.Constant(value=0.1)))
             predict_word = fluid.layers.fc(
                 input=hidden1,
                 size=dict_size,
                 act='softmax',
                 param_attr=fluid.ParamAttr(
-                    initializer=fluid.initializer.Constant()))
+                    initializer=fluid.initializer.Constant(value=0.1)))
             cost = fluid.layers.cross_entropy(
                 input=predict_word, label=words[4])
             avg_cost = fluid.layers.mean(cost)
diff --git a/python/paddle/fluid/tests/unittests/test_dist_train.py b/python/paddle/fluid/tests/unittests/test_dist_train.py
index 9581abdf394d738470d32ae609838832077ee519..083525ccf54d389b60c4aaa9f8c6223f07c773cd 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_train.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_train.py
@@ -100,7 +100,7 @@ class TestSendOp(unittest.TestCase):
         main.global_block().append_op(
             type="fetch_barrier",
             inputs={},
-            outputs={},
+            outputs={"Out": []},
             attrs={
                 "endpoints": ["127.0.0.1:{0}".format(port)],
                 RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
diff --git a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py
index 38af149ad336fcb818c3cbc9c686bcbdf00238be..9a3e92e8d775a37e0c24ee1bcc5435628d61bb91 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py
@@ -22,7 +22,7 @@ class TestDistSeResneXt2x2(TestDistBase):
         self._sync_mode = True
 
     def test_se_resnext(self):
-        self.check_with_place("dist_word2vec.py", delta=1e-7)
+        self.check_with_place("dist_word2vec.py", delta=1e-4)
 
 
 class TestDistSeResneXt2x2Async(TestDistBase):
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 28ae89acd3a5567074bafe25ec75c48c34948f2f..21c51bd139ed72322aa363f42c8ead402a501bff 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -283,10 +283,13 @@ class DistributeTranspiler(object):
             send_vars.append(var)
 
         if self.sync_mode:
+            send_barrier_out = program.global_block().create_var(
+                name=framework.generate_control_dev_var_name())
+            input_deps = grad_name_to_send_dummy_out.values()
             program.global_block().append_op(
                 type="send_barrier",
-                inputs={},
-                outputs={},
+                inputs={"X": input_deps},
+                outputs={"Out": send_barrier_out},
                 attrs={
                     "endpoints": pserver_endpoints,
                     RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
@@ -304,16 +307,22 @@ class DistributeTranspiler(object):
                 self.param_grad_ep_mapping[ep]["grads"].append(send_vars[i])
 
         # step4: Concat the parameters splits together after recv.
+        all_recv_outputs = []
         for param_varname, splited_var in six.iteritems(self.param_var_mapping):
             eps = []
             for var in splited_var:
                 index = [v.name for v in recv_vars].index(var.name)
                 eps.append(eplist[index])
-            grad_send_dummy_out = grad_name_to_send_dummy_out[
-                self.param_name_to_grad_name[param_varname]]
+            if self.sync_mode:
+                recv_dep_in = send_barrier_out
+            else:
+                # connect deps to send op in async mode
+                recv_dep_in = grad_name_to_send_dummy_out[
+                    self.param_name_to_grad_name[param_varname]]
+            all_recv_outputs.extend(splited_var)
             program.global_block().append_op(
                 type="recv",
-                inputs={"X": [grad_send_dummy_out]},
+                inputs={"X": [recv_dep_in]},
                 outputs={"Out": splited_var},
                 attrs={
                     "epmap": eps,
@@ -326,10 +335,11 @@ class DistributeTranspiler(object):
                 })
 
         if self.sync_mode:
+            # form a WAW dependency
             program.global_block().append_op(
                 type="fetch_barrier",
                 inputs={},
-                outputs={},
+                outputs={"Out": all_recv_outputs},
                 attrs={
                     "endpoints": pserver_endpoints,
                     RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
@@ -413,10 +423,12 @@ class DistributeTranspiler(object):
                     RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
                 })
 
+        fetch_barrier_out = startup_program.global_block().create_var(
+            name=framework.generate_control_dev_var_name())
         startup_program.global_block().append_op(
             type="fetch_barrier",
             inputs={},
-            outputs={},
+            outputs={"Out": fetch_barrier_out},
             attrs={
                 "endpoints": self.pserver_endpoints,
                 RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE