diff --git a/python/paddle/fluid/tests/unittests/dist_test.sh b/python/paddle/fluid/tests/unittests/dist_test.sh index f1f6788ce7b272b7692585a299a85e4de45a01b7..f8d464598ce3d6e0e091a2428fedb41adfbc19d6 100644 --- a/python/paddle/fluid/tests/unittests/dist_test.sh +++ b/python/paddle/fluid/tests/unittests/dist_test.sh @@ -15,7 +15,7 @@ if [[ ${TEST_TIMEOUT}"x" == "x" ]]; then fi # rm flag file -rm -f ${name}*.log +rm -f ${name}_*.log # start the unit test run_time=$(( $TEST_TIMEOUT - 10 )) @@ -28,9 +28,15 @@ fi echo "${name} faild with ${exit_code}" +netstat -an + # paddle log echo "${name} log" -cat -n ${name}*.log +for log in `ls ${name}_*.log` +do + printf "\ncat ${log}\n" + cat -n ${log} +done #display system context for i in {1..2}; do diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 49cf07d67b27bd3b81a6db405e497e2e87ffd08f..9708e53ba1f947371f0be7b6197547cc9f287d19 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -525,7 +525,11 @@ class TestDistBase(unittest.TestCase): self._port_set.add(port) return port - def start_pserver(self, model_file, check_error_log, required_envs): + def start_pserver(self, + model_file, + check_error_log, + required_envs, + log_name=""): ps0_ep, ps1_ep = self._ps_endpoints.split(",") ps_cmd = "%s" @@ -548,8 +552,8 @@ class TestDistBase(unittest.TestCase): print(ps0_cmd) print(ps1_cmd) - ps0_pipe = open("/tmp/ps0_err.log", "wb") - ps1_pipe = open("/tmp/ps1_err.log", "wb") + ps0_pipe = open(log_name + "_ps0_err.log", "wb") + ps1_pipe = open(log_name + "_ps1_err.log", "wb") print_to_err(type(self).__name__, "going to start pserver process 0") ps0_proc = subprocess.Popen( @@ -628,8 +632,8 @@ class TestDistBase(unittest.TestCase): def _run_cluster(self, model, envs, check_error_log, log_name): # Run dist train to compare with local results - ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model, - check_error_log, envs) + ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver( + model, check_error_log, envs, log_name=log_name) ps0_ep, ps1_ep = self._ps_endpoints.split(",") @@ -848,7 +852,7 @@ class TestDistBase(unittest.TestCase): if check_error_log: required_envs["GLOG_vmodule"] = \ - "fused_all_reduce_op_handle=10,all_reduce_op_handle=10,alloc_continuous_space_op=10,fuse_all_reduce_op_pass=10,alloc_continuous_space_for_grad_pass=10,fast_threaded_ssa_graph_executor=10" + "fused_all_reduce_op_handle=10,all_reduce_op_handle=10,alloc_continuous_space_op=10,fuse_all_reduce_op_pass=10,alloc_continuous_space_for_grad_pass=10,fast_threaded_ssa_graph_executor=10,executor=10,operator=10" required_envs["GLOG_logtostderr"] = "1" local_losses \ diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext_async.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext_async.py index 0d99728965abf7a90377d39bfb91cad752cfe303..3a34f744b2d2fa2bdb49888463520392091b7be0 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext_async.py +++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext_async.py @@ -15,7 +15,9 @@ from __future__ import print_function import unittest from test_dist_base import TestDistBase + import os +flag_name = os.path.splitext(__file__)[0] def skip_ci(func): @@ -36,7 +38,11 @@ class TestDistSeResneXt2x2Async(TestDistBase): @skip_ci def test_dist_train(self): - self.check_with_place("dist_se_resnext.py", delta=100) + self.check_with_place( + "dist_se_resnext.py", + delta=100, + check_error_log=True, + log_name=flag_name) if __name__ == "__main__":