From cc12d27f3cca4e19a8d7014ea5c59842aa4e9739 Mon Sep 17 00:00:00 2001 From: kuizhiqing Date: Wed, 17 Aug 2022 11:05:39 +0800 Subject: [PATCH] fix tcp LINGER (#45190) --- python/paddle/distributed/launch/context/node.py | 2 ++ python/paddle/distributed/launch/controllers/collective.py | 2 +- python/paddle/distributed/launch/controllers/master.py | 2 +- python/paddle/distributed/launch/controllers/ps.py | 2 +- 4 files changed, 5 insertions(+), 3 deletions(-) diff --git a/python/paddle/distributed/launch/context/node.py b/python/paddle/distributed/launch/context/node.py index 39f42d0210..6ee8fa6d10 100644 --- a/python/paddle/distributed/launch/context/node.py +++ b/python/paddle/distributed/launch/context/node.py @@ -49,6 +49,8 @@ class Node(object): for _ in range(100): with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: + s.setsockopt(socket.SOL_SOCKET, socket.SO_LINGER, + struct.pack('ii', 1, 0)) s.bind(('', 0)) port = s.getsockname()[1] if port in self._allocated_ports: diff --git a/python/paddle/distributed/launch/controllers/collective.py b/python/paddle/distributed/launch/controllers/collective.py index e155d31b45..302070e55c 100644 --- a/python/paddle/distributed/launch/controllers/collective.py +++ b/python/paddle/distributed/launch/controllers/collective.py @@ -93,7 +93,7 @@ class CollectiveController(Controller): self.pod.replicas = self.pod_replicas() # rank will be reset when restart - self.pod.rank = self.ctx.args.rank + self.pod.rank = int(self.ctx.args.rank) port = self.ctx.node.get_free_port() diff --git a/python/paddle/distributed/launch/controllers/master.py b/python/paddle/distributed/launch/controllers/master.py index 825be9c368..c71d0890f1 100644 --- a/python/paddle/distributed/launch/controllers/master.py +++ b/python/paddle/distributed/launch/controllers/master.py @@ -102,7 +102,7 @@ class HTTPMaster(Master): print(" ".join(cmd)) print("-" * 80) - if self.ctx.args.rank >= 0: + if int(self.ctx.args.rank) >= 0: self.ctx.logger.warning( "--rank set in the command may not compatible in auto mode") diff --git a/python/paddle/distributed/launch/controllers/ps.py b/python/paddle/distributed/launch/controllers/ps.py index 573f578d24..f785311a52 100644 --- a/python/paddle/distributed/launch/controllers/ps.py +++ b/python/paddle/distributed/launch/controllers/ps.py @@ -111,7 +111,7 @@ class PSController(Controller): def _build_pod_with_master(self): - self.pod.rank = self.ctx.args.rank + self.pod.rank = int(self.ctx.args.rank) server_num = self.ctx.args.server_num or 1 servers = [ -- GitLab