diff --git a/python/paddle/fluid/incubate/fleet/base/role_maker.py b/python/paddle/fluid/incubate/fleet/base/role_maker.py index fb674d67ebe0cafde4ee5b1fd2711d1c065a955a..07b4f9df231e3eabf7d9347cd6de72dbe25e6866 100644 --- a/python/paddle/fluid/incubate/fleet/base/role_maker.py +++ b/python/paddle/fluid/incubate/fleet/base/role_maker.py @@ -234,6 +234,7 @@ class MPISymetricRoleMaker(MPIRoleMaker): super(MPISymetricRoleMaker, self).__init__() self._node_type = None self._proc_per_node = 2 + self._pserver_rand_port = 0 def _check_role_generation(self): if not self._role_is_generated: @@ -248,6 +249,20 @@ class MPISymetricRoleMaker(MPIRoleMaker): return self.is_worker() and 0 == self.worker_index() return False + def get_pserver_endpoints(self): + if self._pserver_rand_port <= 0: + import random + random.seed(self._server_num()) + # port will be randomly generated from 60001 to 63999 + # random seed is server num so that all nodes will get + # the same port + self._pserver_rand_port = random.randint(60001, 64000) + endpoints = [ + x + ":" + str(self._pserver_rand_port) + for x in self._server_endpoints + ] + return endpoints + def worker_num(self): return self._worker_num() @@ -273,7 +288,7 @@ class MPISymetricRoleMaker(MPIRoleMaker): """ if self._check_role_generation(): if self.is_worker(): - return self._get_size() / 2 + return self._get_size() / self._proc_per_node return 0 def _server_num(self): @@ -281,9 +296,10 @@ class MPISymetricRoleMaker(MPIRoleMaker): return the current number of server """ if self._check_role_generation(): - if self.is_server(): - return self._get_size() / 2 - return 0 + return self._get_size() / self._proc_per_node + else: + self.generate_role() + return self._get_size() / self._proc_per_node def worker_index(self): """ @@ -291,7 +307,9 @@ class MPISymetricRoleMaker(MPIRoleMaker): """ if self._check_role_generation(): return self._rank / self._proc_per_node - return 0 + else: + self.generate_role() + return self._get_size() / 2 def server_index(self): """ @@ -299,7 +317,9 @@ class MPISymetricRoleMaker(MPIRoleMaker): """ if self._check_role_generation(): return self._rank / self._proc_per_node - return 0 + else: + self.generate_role() + return self._get_size() / self._proc_per_node def _barrier_worker(self): """ @@ -308,6 +328,8 @@ class MPISymetricRoleMaker(MPIRoleMaker): if self._check_role_generation(): if self.is_worker(): self._node_type_comm.barrier() + else: + raise Exception("You should check role generation first") def _barrier_server(self): """ @@ -316,6 +338,8 @@ class MPISymetricRoleMaker(MPIRoleMaker): if self._check_role_generation(): if self.is_server(): self._node_type_comm.barrier() + else: + raise Exception("You should check role generation first") def generate_role(self): """ @@ -332,6 +356,8 @@ class MPISymetricRoleMaker(MPIRoleMaker): self._node_type = 1 self._node_type_comm = self._comm.Split(self._node_type) self._role_is_generated = True + else: + raise Exception("You should check role generation first") class PaddleCloudRoleMaker(RoleMakerBase):