From 8088395a844af0a9379b5e237e24a3387658e991 Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Tue, 22 Oct 2019 10:14:29 +0800
Subject: [PATCH] Set a unique port for every distributed test to avoid potential port conflicts (#20759)

---
 .../fluid/tests/unittests/CMakeLists.txt      |  9 +++++-
 .../paddle/fluid/tests/unittests/dist_test.sh |  5 +--
 .../fluid/tests/unittests/test_dist_base.py   | 31 ++++++++++++++++---
 .../tests/unittests/test_dist_fleet_base.py   | 17 ++++++++--
 4 files changed, 52 insertions(+), 10 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 5bf406cc06..36de163050 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -239,6 +239,10 @@ if(WITH_DISTRIBUTE)
     list(REMOVE_ITEM DIST_TEST_OPS "test_dist_transformer")
     list(REMOVE_ITEM DIST_TEST_OPS "test_dist_transpiler")
 
+    # not needed: these are base-class modules, not standalone distributed tests
+    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_base")
+    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_base")
+
     py_test_modules(test_lookup_remote_table_op MODULES test_lookup_remote_table_op ENVS ${dist_ENVS})
     py_test_modules(test_hsigmoid_remote_table_op MODULES test_hsigmoid_remote_table_op ENVS ${dist_ENVS})
     py_test_modules(test_nce_remote_table_op MODULES test_nce_remote_table_op ENVS ${dist_ENVS})
@@ -249,8 +253,11 @@ if(WITH_DISTRIBUTE)
         bash_test_modules(test_listen_and_serv_op MODULES test_listen_and_serv.sh)
         bash_test_modules(test_launch MODULES test_launch.sh)
 
+        set(dist_ut_port 1000)
         foreach(TEST_OP ${DIST_TEST_OPS})
-            bash_test_modules(${TEST_OP} MODULES dist_test.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE")
+            message(STATUS "set dist_ut_port=${dist_ut_port} on ${TEST_OP}")
+            bash_test_modules(${TEST_OP} MODULES dist_test.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}")
+            MATH(EXPR dist_ut_port "${dist_ut_port}+50")
         endforeach(TEST_OP)
     endif(NOT APPLE)
 endif()
diff --git a/python/paddle/fluid/tests/unittests/dist_test.sh b/python/paddle/fluid/tests/unittests/dist_test.sh
index 79eb8b4f26..b185ab54a9 100644
--- a/python/paddle/fluid/tests/unittests/dist_test.sh
+++ b/python/paddle/fluid/tests/unittests/dist_test.sh
@@ -44,8 +44,9 @@
 done
 
 #display system context
 for i in {1..2}; do
-    sleep 2
-    ps -ef | grep -E "(test_|_test)"
+    sleep 3
+    ps -aux
+    netstat -anlp
 
     if hash "nvidia-smi" > /dev/null; then
         nvidia-smi
diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py
index 97598755c5..531342ce26 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -36,6 +36,7 @@ import paddle.fluid.incubate.fleet.base.role_maker as role_maker
 
 RUN_STEP = 5
 DEFAULT_BATCH_SIZE = 2
+DIST_UT_PORT = 0
 
 
 def print_to_out(out_losses):
@@ -486,8 +487,6 @@ class TestDistBase(unittest.TestCase):
         self._trainers = 2
         self._pservers = 2
         self._port_set = set()
-        self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
-            self._find_free_port(), self._find_free_port())
         self._python_interp = sys.executable
         self._sync_mode = True
         self._hogwild_mode = False
@@ -512,6 +511,20 @@
         self._ut4grad_allreduce = False
         self._use_hallreduce = False
         self._setup_config()
+
+        global DIST_UT_PORT
+        if DIST_UT_PORT == 0 and os.getenv("PADDLE_DIST_UT_PORT"):
+            DIST_UT_PORT = int(os.getenv("PADDLE_DIST_UT_PORT"))
+
+        if DIST_UT_PORT == 0:
+            self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
+                self._find_free_port(), self._find_free_port())
+        else:
+            print("set begin_port:", DIST_UT_PORT)
+            self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
+                DIST_UT_PORT, DIST_UT_PORT + 1)
+            DIST_UT_PORT += 2
+
         self._after_setup_config()
 
     def _find_free_port(self):
@@ -790,8 +803,16 @@ class TestDistBase(unittest.TestCase):
                            check_error_log, log_name):
         if self._use_hallreduce:
             self._ps_endpoints = ""
-            for i in range(0, 4):
-                self._ps_endpoints += "127.0.0.1:%s," % (self._find_free_port())
+
+            global DIST_UT_PORT
+            if DIST_UT_PORT == 0:
+                for i in range(0, 4):
+                    self._ps_endpoints += "127.0.0.1:%s," % (
+                        self._find_free_port())
+            else:
+                for i in range(0, 4):
+                    self._ps_endpoints += "127.0.0.1:%s," % (DIST_UT_PORT + i)
+                DIST_UT_PORT += 4
             self._ps_endpoints = self._ps_endpoints[:-1]
 
             # NOTE: we reuse ps_endpoints as nccl2 worker endpoints
@@ -858,7 +879,7 @@ class TestDistBase(unittest.TestCase):
         required_envs["GLOG_vmodule"] = \
             "fused_all_reduce_op_handle=10,all_reduce_op_handle=10,alloc_continuous_space_op=10,fuse_all_reduce_op_pass=10," \
             "alloc_continuous_space_for_grad_pass=10,fast_threaded_ssa_graph_executor=10,executor=10,operator=10," \
-            "sparse_all_reduce_op_handle=10,gen_nccl_id_op=10"
+            "sparse_all_reduce_op_handle=10,gen_nccl_id_op=10,nccl_helper=10,grpc_client=10,grpc_server=10"
         required_envs["GLOG_logtostderr"] = "1"
 
         required_envs.update(need_envs)
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
index 0774f618c8..3733d4cfad 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
@@ -40,6 +40,7 @@ from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerCo
 
 RUN_STEP = 5
 LEARNING_RATE = 0.01
+DIST_UT_PORT = 0
 
 
 class FleetDistRunnerBase(object):
@@ -123,8 +124,20 @@ class TestFleetBase(unittest.TestCase):
         self._trainers = 2
         self._pservers = 2
         self._port_set = set()
-        self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
-            self._find_free_port(), self._find_free_port())
+
+        global DIST_UT_PORT
+        if DIST_UT_PORT == 0 and os.getenv("PADDLE_DIST_UT_PORT"):
+            DIST_UT_PORT = int(os.getenv("PADDLE_DIST_UT_PORT"))
+
+        if DIST_UT_PORT:
+            print("set begin_port:", DIST_UT_PORT)
+            self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
+                DIST_UT_PORT, DIST_UT_PORT + 1)
+            DIST_UT_PORT += 2
+        else:
+            self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
+                self._find_free_port(), self._find_free_port())
+
         self._python_interp = sys.executable
         self._geo_sgd = False
         self._geo_sgd_need_push_nums = 5
-- 
GitLab
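
For reference, below is a minimal standalone Python sketch of the port-assignment scheme this patch introduces. The helper name next_ports is hypothetical (the patch inlines this logic in each setUp), and the bind-to-port-0 body of _find_free_port is an assumption about the existing helper, which the patch calls but does not show. With the CMake change, PADDLE_DIST_UT_PORT starts at 1000 and advances by 50 per test target, so each test draws from its own window of 50 ports.

    # Sketch only, under the assumptions stated above.
    import os
    import socket
    from contextlib import closing

    DIST_UT_PORT = 0  # module-level cursor, as in test_dist_base.py


    def _find_free_port():
        """Assumed behavior of the existing helper: let the OS pick a free port."""
        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
            s.bind(("127.0.0.1", 0))
            return s.getsockname()[1]


    def next_ports(n):
        """Hypothetical helper: n consecutive ports from PADDLE_DIST_UT_PORT if
        set, otherwise n OS-chosen free ports (the pre-patch behavior)."""
        global DIST_UT_PORT
        if DIST_UT_PORT == 0 and os.getenv("PADDLE_DIST_UT_PORT"):
            DIST_UT_PORT = int(os.getenv("PADDLE_DIST_UT_PORT"))
        if DIST_UT_PORT == 0:
            return [_find_free_port() for _ in range(n)]
        ports = list(range(DIST_UT_PORT, DIST_UT_PORT + n))
        DIST_UT_PORT += n  # advance the cursor so later calls stay disjoint
        return ports


    if __name__ == "__main__":
        # e.g. PADDLE_DIST_UT_PORT=1000 -> 127.0.0.1:1000,127.0.0.1:1001
        print(",".join("127.0.0.1:%d" % p for p in next_ports(2)))

The design point: a port that _find_free_port reports as free can be grabbed by a concurrently launching test between discovery and bind, whereas the disjoint per-test windows handed out by CMake cannot collide; the free-port path remains only as a fallback when the environment variable is absent.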