未验证 提交 bf6470c7 编写于 作者: G gongweibao 提交者: GitHub

Add detailed logs to the resnet unit test (#20558)

 Add detailed logs to the resnet unit test
上级 36c85ef4
...@@ -15,7 +15,7 @@ if [[ ${TEST_TIMEOUT}"x" == "x" ]]; then ...@@ -15,7 +15,7 @@ if [[ ${TEST_TIMEOUT}"x" == "x" ]]; then
fi fi
# rm flag file # rm flag file
rm -f ${name}*.log rm -f ${name}_*.log
# start the unit test # start the unit test
run_time=$(( $TEST_TIMEOUT - 10 )) run_time=$(( $TEST_TIMEOUT - 10 ))
...@@ -28,9 +28,15 @@ fi ...@@ -28,9 +28,15 @@ fi
echo "${name} faild with ${exit_code}" echo "${name} faild with ${exit_code}"
netstat -an
# paddle log # paddle log
echo "${name} log" echo "${name} log"
cat -n ${name}*.log for log in `ls ${name}_*.log`
do
printf "\ncat ${log}\n"
cat -n ${log}
done
#display system context #display system context
for i in {1..2}; do for i in {1..2}; do
......
...@@ -525,7 +525,11 @@ class TestDistBase(unittest.TestCase): ...@@ -525,7 +525,11 @@ class TestDistBase(unittest.TestCase):
self._port_set.add(port) self._port_set.add(port)
return port return port
def start_pserver(self, model_file, check_error_log, required_envs): def start_pserver(self,
model_file,
check_error_log,
required_envs,
log_name=""):
ps0_ep, ps1_ep = self._ps_endpoints.split(",") ps0_ep, ps1_ep = self._ps_endpoints.split(",")
ps_cmd = "%s" ps_cmd = "%s"
...@@ -548,8 +552,8 @@ class TestDistBase(unittest.TestCase): ...@@ -548,8 +552,8 @@ class TestDistBase(unittest.TestCase):
print(ps0_cmd) print(ps0_cmd)
print(ps1_cmd) print(ps1_cmd)
ps0_pipe = open("/tmp/ps0_err.log", "wb") ps0_pipe = open(log_name + "_ps0_err.log", "wb")
ps1_pipe = open("/tmp/ps1_err.log", "wb") ps1_pipe = open(log_name + "_ps1_err.log", "wb")
print_to_err(type(self).__name__, "going to start pserver process 0") print_to_err(type(self).__name__, "going to start pserver process 0")
ps0_proc = subprocess.Popen( ps0_proc = subprocess.Popen(
...@@ -628,8 +632,8 @@ class TestDistBase(unittest.TestCase): ...@@ -628,8 +632,8 @@ class TestDistBase(unittest.TestCase):
def _run_cluster(self, model, envs, check_error_log, log_name): def _run_cluster(self, model, envs, check_error_log, log_name):
# Run dist train to compare with local results # Run dist train to compare with local results
ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model, ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(
check_error_log, envs) model, check_error_log, envs, log_name=log_name)
ps0_ep, ps1_ep = self._ps_endpoints.split(",") ps0_ep, ps1_ep = self._ps_endpoints.split(",")
...@@ -848,7 +852,7 @@ class TestDistBase(unittest.TestCase): ...@@ -848,7 +852,7 @@ class TestDistBase(unittest.TestCase):
if check_error_log: if check_error_log:
required_envs["GLOG_vmodule"] = \ required_envs["GLOG_vmodule"] = \
"fused_all_reduce_op_handle=10,all_reduce_op_handle=10,alloc_continuous_space_op=10,fuse_all_reduce_op_pass=10,alloc_continuous_space_for_grad_pass=10,fast_threaded_ssa_graph_executor=10" "fused_all_reduce_op_handle=10,all_reduce_op_handle=10,alloc_continuous_space_op=10,fuse_all_reduce_op_pass=10,alloc_continuous_space_for_grad_pass=10,fast_threaded_ssa_graph_executor=10,executor=10,operator=10"
required_envs["GLOG_logtostderr"] = "1" required_envs["GLOG_logtostderr"] = "1"
local_losses \ local_losses \
......
...@@ -15,7 +15,9 @@ ...@@ -15,7 +15,9 @@
from __future__ import print_function from __future__ import print_function
import unittest import unittest
from test_dist_base import TestDistBase from test_dist_base import TestDistBase
import os import os
flag_name = os.path.splitext(__file__)[0]
def skip_ci(func): def skip_ci(func):
...@@ -36,7 +38,11 @@ class TestDistSeResneXt2x2Async(TestDistBase): ...@@ -36,7 +38,11 @@ class TestDistSeResneXt2x2Async(TestDistBase):
@skip_ci @skip_ci
def test_dist_train(self): def test_dist_train(self):
self.check_with_place("dist_se_resnext.py", delta=100) self.check_with_place(
"dist_se_resnext.py",
delta=100,
check_error_log=True,
log_name=flag_name)
if __name__ == "__main__": if __name__ == "__main__":
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册