diff --git a/python/paddle/distributed/launch/context/node.py b/python/paddle/distributed/launch/context/node.py
index 1ece4db0fbbeed379c2cda343022dd371a9e7540..2fa8b892275a03011651d5de05f24c6fa41ef624 100644
--- a/python/paddle/distributed/launch/context/node.py
+++ b/python/paddle/distributed/launch/context/node.py
@@ -25,6 +25,7 @@ class Node(object):
         self.device = Device.parse_device()
         self.ip = self.get_host_ip()
         self.free_ports = []
+        self._allocated_ports = set()
 
     def get_host_ip(self):
         try:
@@ -42,13 +43,23 @@ class Node(object):
     def get_ports_occupied(self):
         return self.free_ports
 
-    @classmethod
     def get_free_port(self):
-        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
-            s.setsockopt(socket.SOL_SOCKET, socket.SO_LINGER,
-                         struct.pack('ii', 1, 0))
-            s.bind(('', 0))
-            return s.getsockname()[1]
+        """Return a TCP port that this node has not handed out before.
+
+        Binds to port 0 so the OS picks the port, and retries up to 100
+        times when the picked port was already returned by this node.
+        """
+        for _ in range(100):
+            with closing(socket.socket(socket.AF_INET,
+                                       socket.SOCK_STREAM)) as s:
+                s.bind(('', 0))
+                port = s.getsockname()[1]
+            if port not in self._allocated_ports:
+                self._allocated_ports.add(port)
+                return port
+        # Best effort: every attempt collided, so return the last port
+        # even though this node has already handed it out.
+        return port
 
     @classmethod
     def is_server_ready(self, ip, port):
diff --git a/python/paddle/distributed/launch/main.py b/python/paddle/distributed/launch/main.py
index 83a5e18714dff8c75768e4bd4f46d898983b70f0..dd7edba35a474a255a62443b1c2d7ff91d3550d9 100644
--- a/python/paddle/distributed/launch/main.py
+++ b/python/paddle/distributed/launch/main.py
@@ -36,7 +36,7 @@ def launch():
 
     Base Parameters:
 
-        - ``--master``: The master/rendezvous server, support http:// and etcd://, default with http://. e.g., ``--master=127.0.0.1:8080``. Default ``--log_dir=None``.
+        - ``--master``: The master/rendezvous server, support http:// and etcd://, default with http://. e.g., ``--master=127.0.0.1:8080``. Default ``--master=None``.
 
         - ``--rank``: The rank of the node, can be auto assigned by master. Default ``--rank=-1``.
 