From 446a62e84fb1c7d38fe03b12f7ed1544d6895d0d Mon Sep 17 00:00:00 2001 From: WangXi Date: Fri, 17 Dec 2021 09:59:22 +0800 Subject: [PATCH] fix bind failed with Address already in use (#38174) --- .../paddle/distributed/fleet/base/private_helper_function.py | 4 ++++ .../fluid/tests/unittests/npu/test_collective_base_npu.py | 5 +++++ .../tests/unittests/npu/test_sync_batch_norm_base_npu.py | 5 +++++ python/paddle/fluid/tests/unittests/test_collective_base.py | 5 +++++ python/paddle/fluid/transpiler/details/checkport.py | 4 ++++ 5 files changed, 23 insertions(+) diff --git a/python/paddle/distributed/fleet/base/private_helper_function.py b/python/paddle/distributed/fleet/base/private_helper_function.py index c7ddd33d5d0..8e2871272a9 100644 --- a/python/paddle/distributed/fleet/base/private_helper_function.py +++ b/python/paddle/distributed/fleet/base/private_helper_function.py @@ -43,6 +43,10 @@ def wait_server_ready(endpoints): with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: sock.settimeout(2) + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + if hasattr(socket, 'SO_REUSEPORT'): + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1) + result = sock.connect_ex((ip_port[0], int(ip_port[1]))) if result != 0: all_ok = False diff --git a/python/paddle/fluid/tests/unittests/npu/test_collective_base_npu.py b/python/paddle/fluid/tests/unittests/npu/test_collective_base_npu.py index 6372e1ab85f..774423a8be1 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_collective_base_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_collective_base_npu.py @@ -46,6 +46,11 @@ class TestCollectiveRunnerBase(object): socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: sock.settimeout(2) + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + if hasattr(socket, 'SO_REUSEPORT'): + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, + 1) + result = sock.connect_ex((ip_port[0], int(ip_port[1]))) if result != 0: all_ok = False diff --git a/python/paddle/fluid/tests/unittests/npu/test_sync_batch_norm_base_npu.py b/python/paddle/fluid/tests/unittests/npu/test_sync_batch_norm_base_npu.py index 9df216d9737..dfd8680c442 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_sync_batch_norm_base_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_sync_batch_norm_base_npu.py @@ -65,6 +65,11 @@ class TestSyncBatchNormRunnerBase(object): socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: sock.settimeout(2) + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + if hasattr(socket, 'SO_REUSEPORT'): + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, + 1) + result = sock.connect_ex((ip_port[0], int(ip_port[1]))) if result != 0: all_ok = False diff --git a/python/paddle/fluid/tests/unittests/test_collective_base.py b/python/paddle/fluid/tests/unittests/test_collective_base.py index 31b8bafd16d..1b55395ede5 100644 --- a/python/paddle/fluid/tests/unittests/test_collective_base.py +++ b/python/paddle/fluid/tests/unittests/test_collective_base.py @@ -44,6 +44,11 @@ class TestCollectiveRunnerBase(object): socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: sock.settimeout(2) + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + if hasattr(socket, 'SO_REUSEPORT'): + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, + 1) + result = sock.connect_ex((ip_port[0], int(ip_port[1]))) if result != 0: all_ok = False diff --git a/python/paddle/fluid/transpiler/details/checkport.py b/python/paddle/fluid/transpiler/details/checkport.py index 89dd4dd50b0..1341bdaedf9 100644 --- a/python/paddle/fluid/transpiler/details/checkport.py +++ b/python/paddle/fluid/transpiler/details/checkport.py @@ -42,6 +42,10 @@ def wait_server_ready(endpoints): with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: sock.settimeout(2) + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + if hasattr(socket, 'SO_REUSEPORT'): + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1) + result = sock.connect_ex((ip_port[0], int(ip_port[1]))) if result != 0: all_ok = False -- GitLab