未验证 提交 bf6470c7 编写于 作者: G gongweibao 提交者: GitHub

Add detailed logs to the resnet unit test (#20558)

 Add detailed logs to the resnet unit test
上级 36c85ef4
......@@ -15,7 +15,7 @@ if [[ ${TEST_TIMEOUT}"x" == "x" ]]; then
fi
# rm flag file
rm -f ${name}*.log
rm -f ${name}_*.log
# start the unit test
run_time=$(( $TEST_TIMEOUT - 10 ))
......@@ -28,9 +28,15 @@ fi
echo "${name} faild with ${exit_code}"
netstat -an
# paddle log
echo "${name} log"
cat -n ${name}*.log
for log in `ls ${name}_*.log`
do
printf "\ncat ${log}\n"
cat -n ${log}
done
#display system context
for i in {1..2}; do
......
......@@ -525,7 +525,11 @@ class TestDistBase(unittest.TestCase):
self._port_set.add(port)
return port
def start_pserver(self, model_file, check_error_log, required_envs):
def start_pserver(self,
model_file,
check_error_log,
required_envs,
log_name=""):
ps0_ep, ps1_ep = self._ps_endpoints.split(",")
ps_cmd = "%s"
......@@ -548,8 +552,8 @@ class TestDistBase(unittest.TestCase):
print(ps0_cmd)
print(ps1_cmd)
ps0_pipe = open("/tmp/ps0_err.log", "wb")
ps1_pipe = open("/tmp/ps1_err.log", "wb")
ps0_pipe = open(log_name + "_ps0_err.log", "wb")
ps1_pipe = open(log_name + "_ps1_err.log", "wb")
print_to_err(type(self).__name__, "going to start pserver process 0")
ps0_proc = subprocess.Popen(
......@@ -628,8 +632,8 @@ class TestDistBase(unittest.TestCase):
def _run_cluster(self, model, envs, check_error_log, log_name):
# Run dist train to compare with local results
ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model,
check_error_log, envs)
ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(
model, check_error_log, envs, log_name=log_name)
ps0_ep, ps1_ep = self._ps_endpoints.split(",")
......@@ -848,7 +852,7 @@ class TestDistBase(unittest.TestCase):
if check_error_log:
required_envs["GLOG_vmodule"] = \
"fused_all_reduce_op_handle=10,all_reduce_op_handle=10,alloc_continuous_space_op=10,fuse_all_reduce_op_pass=10,alloc_continuous_space_for_grad_pass=10,fast_threaded_ssa_graph_executor=10"
"fused_all_reduce_op_handle=10,all_reduce_op_handle=10,alloc_continuous_space_op=10,fuse_all_reduce_op_pass=10,alloc_continuous_space_for_grad_pass=10,fast_threaded_ssa_graph_executor=10,executor=10,operator=10"
required_envs["GLOG_logtostderr"] = "1"
local_losses \
......
......@@ -15,7 +15,9 @@
from __future__ import print_function
import unittest
from test_dist_base import TestDistBase
import os
flag_name = os.path.splitext(__file__)[0]
def skip_ci(func):
......@@ -36,7 +38,11 @@ class TestDistSeResneXt2x2Async(TestDistBase):
@skip_ci
def test_dist_train(self):
self.check_with_place("dist_se_resnext.py", delta=100)
self.check_with_place(
"dist_se_resnext.py",
delta=100,
check_error_log=True,
log_name=flag_name)
if __name__ == "__main__":
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册