diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 5bf406cc061f339d1791402057aad399cf301b48..36de1630504e593da2374e54af0df6b1ba0fd350 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -239,6 +239,10 @@ if(WITH_DISTRIBUTE)
     list(REMOVE_ITEM DIST_TEST_OPS "test_dist_transformer")
     list(REMOVE_ITEM DIST_TEST_OPS "test_dist_transpiler")
 
+    # not needed: base-class modules, not standalone tests
+    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_base")
+    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_base")
+
     py_test_modules(test_lookup_remote_table_op MODULES test_lookup_remote_table_op ENVS ${dist_ENVS})
     py_test_modules(test_hsigmoid_remote_table_op MODULES test_hsigmoid_remote_table_op ENVS ${dist_ENVS})
     py_test_modules(test_nce_remote_table_op MODULES test_nce_remote_table_op ENVS ${dist_ENVS})
@@ -249,8 +253,11 @@ if(WITH_DISTRIBUTE)
         bash_test_modules(test_listen_and_serv_op MODULES test_listen_and_serv.sh)
         bash_test_modules(test_launch MODULES test_launch.sh)
 
+        set(dist_ut_port 1000)
         foreach(TEST_OP ${DIST_TEST_OPS})
-            bash_test_modules(${TEST_OP} MODULES dist_test.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE")
+            message(STATUS "set dist_ut_port=${dist_ut_port} on ${TEST_OP}")
+            bash_test_modules(${TEST_OP} MODULES dist_test.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}")
+            math(EXPR dist_ut_port "${dist_ut_port}+50")
         endforeach(TEST_OP)
     endif(NOT APPLE)
 endif()
diff --git a/python/paddle/fluid/tests/unittests/dist_test.sh b/python/paddle/fluid/tests/unittests/dist_test.sh
index 79eb8b4f26c8c606197aae6b583b1f26e53c36ea..b185ab54a95170c817f1186500831e0f16a6befe 100644
--- a/python/paddle/fluid/tests/unittests/dist_test.sh
+++ b/python/paddle/fluid/tests/unittests/dist_test.sh
@@ -44,8 +44,9 @@ done
 
 #display system context
 for i in {1..2}; do
-    sleep 2
-    ps -ef | grep -E "(test_|_test)"
+    sleep 3
+    ps aux
+    netstat -anlp
 
     if hash "nvidia-smi" > /dev/null; then
         nvidia-smi
diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py
index 97598755c546f9b0699da9b937950a9b2139db02..531342ce262aaa259ada60eb5a2ad7d4595d17c2 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -36,6 +36,7 @@ import paddle.fluid.incubate.fleet.base.role_maker as role_maker
 
 RUN_STEP = 5
 DEFAULT_BATCH_SIZE = 2
+DIST_UT_PORT = 0
 
 
 def print_to_out(out_losses):
@@ -486,8 +487,6 @@ class TestDistBase(unittest.TestCase):
         self._trainers = 2
         self._pservers = 2
         self._port_set = set()
-        self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
-            self._find_free_port(), self._find_free_port())
         self._python_interp = sys.executable
         self._sync_mode = True
         self._hogwild_mode = False
@@ -512,6 +511,20 @@ class TestDistBase(unittest.TestCase):
         self._ut4grad_allreduce = False
         self._use_hallreduce = False
         self._setup_config()
+
+        global DIST_UT_PORT
+        if DIST_UT_PORT == 0 and os.getenv("PADDLE_DIST_UT_PORT"):
+            DIST_UT_PORT = int(os.getenv("PADDLE_DIST_UT_PORT"))
+
+        if DIST_UT_PORT == 0:
+            self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
+                self._find_free_port(), self._find_free_port())
+        else:
+            print("set begin_port:", DIST_UT_PORT)
+            self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
+                DIST_UT_PORT, DIST_UT_PORT + 1)
+            DIST_UT_PORT += 2
+
         self._after_setup_config()
 
     def _find_free_port(self):
@@ -790,8 +803,16 @@ class TestDistBase(unittest.TestCase):
                              check_error_log, log_name):
         if self._use_hallreduce:
             self._ps_endpoints = ""
-            for i in range(0, 4):
-                self._ps_endpoints += "127.0.0.1:%s," % (self._find_free_port())
+
+            global DIST_UT_PORT
+            if DIST_UT_PORT == 0:
+                for i in range(0, 4):
+                    self._ps_endpoints += "127.0.0.1:%s," % (
+                        self._find_free_port())
+            else:
+                for i in range(0, 4):
+                    self._ps_endpoints += "127.0.0.1:%s," % (DIST_UT_PORT + i)
+                DIST_UT_PORT += 4
             self._ps_endpoints = self._ps_endpoints[:-1]
 
         # NOTE: we reuse ps_endpoints as nccl2 worker endpoints
@@ -858,7 +879,7 @@ class TestDistBase(unittest.TestCase):
             required_envs["GLOG_vmodule"] = \
                 "fused_all_reduce_op_handle=10,all_reduce_op_handle=10,alloc_continuous_space_op=10,fuse_all_reduce_op_pass=10," \
                 "alloc_continuous_space_for_grad_pass=10,fast_threaded_ssa_graph_executor=10,executor=10,operator=10," \
-                "sparse_all_reduce_op_handle=10,gen_nccl_id_op=10"
+                "sparse_all_reduce_op_handle=10,gen_nccl_id_op=10,nccl_helper=10,grpc_client=10,grpc_server=10"
             required_envs["GLOG_logtostderr"] = "1"
 
             required_envs.update(need_envs)
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
index 0774f618c8b36a6e0a91c3decba3f5def4e1b3af..3733d4cfad0dffa8bd38602774286661dd022709 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
@@ -40,6 +40,7 @@ from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig
 
 RUN_STEP = 5
 LEARNING_RATE = 0.01
+DIST_UT_PORT = 0
 
 
 class FleetDistRunnerBase(object):
@@ -123,8 +124,20 @@ class TestFleetBase(unittest.TestCase):
         self._trainers = 2
         self._pservers = 2
         self._port_set = set()
-        self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
-            self._find_free_port(), self._find_free_port())
+
+        global DIST_UT_PORT
+        if DIST_UT_PORT == 0 and os.getenv("PADDLE_DIST_UT_PORT"):
+            DIST_UT_PORT = int(os.getenv("PADDLE_DIST_UT_PORT"))
+
+        if DIST_UT_PORT:
+            print("set begin_port:", DIST_UT_PORT)
+            self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
+                DIST_UT_PORT, DIST_UT_PORT + 1)
+            DIST_UT_PORT += 2
+        else:
+            self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
+                self._find_free_port(), self._find_free_port())
+
         self._python_interp = sys.executable
         self._geo_sgd = False
         self._geo_sgd_need_push_nums = 5
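
For reviewers, a minimal standalone sketch of the port-allocation scheme this patch introduces (the next_ports helper is hypothetical, for illustration only, and is not part of the patch): CMake hands every distributed test its own PADDLE_DIST_UT_PORT base, stepped by 50, and the Python test harnesses then consume consecutive ports from that base instead of probing for free ones.

import os

DIST_UT_PORT = 0  # module-level cursor, mirroring test_dist_base.py


def next_ports(n):
    """Return n consecutive ports starting at the CMake-provided base.

    Advances the cursor so later calls in the same test process never
    hand out a port twice; returns None when PADDLE_DIST_UT_PORT is
    unset, in which case callers fall back to free-port probing.
    """
    global DIST_UT_PORT
    if DIST_UT_PORT == 0 and os.getenv("PADDLE_DIST_UT_PORT"):
        DIST_UT_PORT = int(os.getenv("PADDLE_DIST_UT_PORT"))
    if DIST_UT_PORT == 0:
        return None  # no base set: caller probes with _find_free_port()
    ports = list(range(DIST_UT_PORT, DIST_UT_PORT + n))
    DIST_UT_PORT += n
    return ports


if __name__ == "__main__":
    os.environ["PADDLE_DIST_UT_PORT"] = "1000"
    print(next_ports(2))  # pserver endpoints: [1000, 1001]
    print(next_ports(4))  # hallreduce endpoints: [1002, 1003, 1004, 1005]

The step of 50 leaves headroom: per the patch, a single test consumes only a few ports from its base (2 per setUp plus 4 per hallreduce run), so bases 50 apart cannot overlap even when one test binary runs many cases.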