diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 6697102e56b89605c648e146ba78f91b5156f6a9..5c8c607026a6dd42b6de7b01e069dd9cbee76e13 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -37,8 +37,8 @@ paddle.fluid.global_scope (ArgSpec(args=[], varargs=None, keywords=None, default paddle.fluid.scope_guard (ArgSpec(args=['scope'], varargs=None, keywords=None, defaults=None), ('document', '02fcfc1eda07c03a84ed62422366239c')) paddle.fluid.DistributeTranspiler ('paddle.fluid.transpiler.distribute_transpiler.DistributeTranspiler', ('document', '5152a3ed0544d4d9600fb53a73f15c38')) paddle.fluid.DistributeTranspiler.__init__ (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.DistributeTranspiler.get_pserver_program (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', 'b1951949c6d21698290aa8ac69afee32')) -paddle.fluid.DistributeTranspiler.get_pserver_programs (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', 'c89fc350f975ef827f5448d68af388cf')) +paddle.fluid.DistributeTranspiler.get_pserver_program (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', '03ad12ad1c36dea4f3595a4e8c592f95')) +paddle.fluid.DistributeTranspiler.get_pserver_programs (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', 'a02723c79325ec4cac352b6d2eea7b52')) paddle.fluid.DistributeTranspiler.get_startup_program (ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)), ('document', '90a40b80e0106f69262cc08b861c3e39')) paddle.fluid.DistributeTranspiler.get_trainer_program (ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)), ('document', '27ed6451913b6ab341bd0d1a36b3f8c1')) paddle.fluid.DistributeTranspiler.transpile (ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174')), ('document', '13c09537f7a5660c528242eb370bb4c7')) @@ -902,8 +902,8 @@ paddle.fluid.dygraph.BackwardStrategy ('paddle.fluid.core_avx.BackwardStrategy', paddle.fluid.dygraph.BackwardStrategy.__init__ __init__(self: paddle.fluid.core_avx.BackwardStrategy) -> None paddle.fluid.transpiler.DistributeTranspiler ('paddle.fluid.transpiler.distribute_transpiler.DistributeTranspiler', ('document', '5152a3ed0544d4d9600fb53a73f15c38')) paddle.fluid.transpiler.DistributeTranspiler.__init__ (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', 'b1951949c6d21698290aa8ac69afee32')) -paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', 'c89fc350f975ef827f5448d68af388cf')) +paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', '03ad12ad1c36dea4f3595a4e8c592f95')) +paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', 'a02723c79325ec4cac352b6d2eea7b52')) paddle.fluid.transpiler.DistributeTranspiler.get_startup_program (ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)), ('document', '90a40b80e0106f69262cc08b861c3e39')) paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program (ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)), ('document', '27ed6451913b6ab341bd0d1a36b3f8c1')) paddle.fluid.transpiler.DistributeTranspiler.transpile (ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174')), ('document', '13c09537f7a5660c528242eb370bb4c7')) diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc index 1d6732dd21e5c82057ee939a95f7812fa938fdbd..58b262100e5a0511f24d1ef72f6e85d5cde45416 100644 --- a/paddle/fluid/operators/distributed/communicator.cc +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -936,6 +936,7 @@ void GeoSgdCommunicator::RpcRecv(const std::string &var_name, auto &cpu_ctx_recv = *pool.Get(platform::CPUPlace()); distributed::RPCClient *rpc_client = distributed::RPCClient::GetInstance(train_id); + pserver_scope_->Var(splited_var_name); rpc_client->AsyncGetVar(endpoint, cpu_ctx_recv, *pserver_scope_.get(), splited_var_name, splited_var_name, splited_var_name); diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 0e48814e203c7eb7f21750fbaf0113086959c2fe..73e0316c79af41fd4b35a664813acd9f7223fe7c 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -831,7 +831,15 @@ class DistributeTranspiler(object): def get_trainer_program(self, wait_port=True): """ - Get transpiled trainer side program. + Get transpiled trainer side program. The program on trainer side compared with origin program + has following difference: + + - Delete optimizer related op, because parameter updated on Pserver + - After the op which computed gradient of each parameter, add ``Send_op`` and ``Recv_op`` + + Args: + wait_port(bool): Whether to wait for the parameter server to be ready before returning to program, + default is True Returns: Program: trainer side program. @@ -965,7 +973,12 @@ class DistributeTranspiler(object): def get_pserver_program(self, endpoint): """ - Get parameter server side program. + Get parameter server side program.The program on pserver side compared with origin program + has following difference: + + - Only the following op is included: optimize-related op and communication-related op + - NO.0 block only has variable definitions and ``listen_and_serv_op`` + - Every variable which need to be updated has a unique block Args: endpoint (str): current parameter server endpoint. @@ -1224,6 +1237,8 @@ class DistributeTranspiler(object): def get_pserver_programs(self, endpoint): """ Get pserver side main program and startup program for distributed training. + The ``main_program`` returned by this function is consistent with the + return value of the function ``get_pserver_program`` . Args: endpoint (str): current pserver endpoint.