From 1f1cc2221fcd19ca4fbb761b4b38e9d73794cc2f Mon Sep 17 00:00:00 2001 From: guru4elephant <35550832+guru4elephant@users.noreply.github.com> Date: Mon, 8 Jul 2019 11:01:18 +0800 Subject: [PATCH] add random port (#18504) * add random port --- .../fluid/incubate/fleet/base/role_maker.py | 38 ++++++++++++++++--- 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/python/paddle/fluid/incubate/fleet/base/role_maker.py b/python/paddle/fluid/incubate/fleet/base/role_maker.py index fb674d67ebe..07b4f9df231 100644 --- a/python/paddle/fluid/incubate/fleet/base/role_maker.py +++ b/python/paddle/fluid/incubate/fleet/base/role_maker.py @@ -234,6 +234,7 @@ class MPISymetricRoleMaker(MPIRoleMaker): super(MPISymetricRoleMaker, self).__init__() self._node_type = None self._proc_per_node = 2 + self._pserver_rand_port = 0 def _check_role_generation(self): if not self._role_is_generated: @@ -248,6 +249,20 @@ class MPISymetricRoleMaker(MPIRoleMaker): return self.is_worker() and 0 == self.worker_index() return False + def get_pserver_endpoints(self): + if self._pserver_rand_port <= 0: + import random + random.seed(self._server_num()) + # port will be randomly generated from 60001 to 63999 + # random seed is server num so that all nodes will get + # the same port + self._pserver_rand_port = random.randint(60001, 64000) + endpoints = [ + x + ":" + str(self._pserver_rand_port) + for x in self._server_endpoints + ] + return endpoints + def worker_num(self): return self._worker_num() @@ -273,7 +288,7 @@ class MPISymetricRoleMaker(MPIRoleMaker): """ if self._check_role_generation(): if self.is_worker(): - return self._get_size() / 2 + return self._get_size() / self._proc_per_node return 0 def _server_num(self): @@ -281,9 +296,10 @@ class MPISymetricRoleMaker(MPIRoleMaker): return the current number of server """ if self._check_role_generation(): - if self.is_server(): - return self._get_size() / 2 - return 0 + return self._get_size() / self._proc_per_node + else: + self.generate_role() + return self._get_size() / self._proc_per_node def worker_index(self): """ @@ -291,7 +307,9 @@ class MPISymetricRoleMaker(MPIRoleMaker): """ if self._check_role_generation(): return self._rank / self._proc_per_node - return 0 + else: + self.generate_role() + return self._get_size() / 2 def server_index(self): """ @@ -299,7 +317,9 @@ class MPISymetricRoleMaker(MPIRoleMaker): """ if self._check_role_generation(): return self._rank / self._proc_per_node - return 0 + else: + self.generate_role() + return self._get_size() / self._proc_per_node def _barrier_worker(self): """ @@ -308,6 +328,8 @@ class MPISymetricRoleMaker(MPIRoleMaker): if self._check_role_generation(): if self.is_worker(): self._node_type_comm.barrier() + else: + raise Exception("You should check role generation first") def _barrier_server(self): """ @@ -316,6 +338,8 @@ class MPISymetricRoleMaker(MPIRoleMaker): if self._check_role_generation(): if self.is_server(): self._node_type_comm.barrier() + else: + raise Exception("You should check role generation first") def generate_role(self): """ @@ -332,6 +356,8 @@ class MPISymetricRoleMaker(MPIRoleMaker): self._node_type = 1 self._node_type_comm = self._comm.Split(self._node_type) self._role_is_generated = True + else: + raise Exception("You should check role generation first") class PaddleCloudRoleMaker(RoleMakerBase): -- GitLab