未验证 提交 554bcdbd 编写于 作者: W Wu Yi 提交者: GitHub

add more log for dist test for ci test=develop (#14813)

* add more log for dist test for ci test=develop

* increase deadline test=develop
上级 9623b45f
@@ -378,6 +378,18 @@ class TestDistBase(unittest.TestCase):
stderr=tr1_pipe,
env=env1)
# Wait until trainer process terminate
while True:
stat0 = tr0_proc.poll()
time.sleep(0.1)
if stat0 is not None:
break
while True:
stat1 = tr1_proc.poll()
time.sleep(0.1)
if stat1 is not None:
break
tr0_out, tr0_err = tr0_proc.communicate()
tr1_out, tr1_err = tr1_proc.communicate()
@@ -390,11 +402,21 @@ class TestDistBase(unittest.TestCase):
ps0.terminate()
ps1.terminate()
# print server log
with open("/tmp/ps0_err.log", "r") as fn:
sys.stderr.write("ps0 stderr: %s\n" % fn.read())
with open("/tmp/ps1_err.log", "r") as fn:
sys.stderr.write("ps1 stderr: %s\n" % fn.read())
# print log
-sys.stderr.write('trainer 0 stdout: %s\n' % pickle.loads(tr0_out))
-sys.stderr.write('trainer 0 stderr: %s\n' % tr0_err)
-sys.stderr.write('trainer 1 stdout: %s\n' % pickle.loads(tr1_out))
-sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err)
+if stat0 == 0:
+sys.stderr.write('trainer 0 stdout: %s\n' % pickle.loads(tr0_out))
+with open("/tmp/tr0_err.log", "r") as fn:
+sys.stderr.write('trainer 0 stderr: %s\n' % fn.read())
+if stat1 == 0:
+sys.stderr.write('trainer 1 stdout: %s\n' % pickle.loads(tr1_out))
+with open("/tmp/tr1_err.log", "r") as fn:
+sys.stderr.write('trainer 1 stderr: %s\n' % fn.read())
return pickle.loads(tr0_out), pickle.loads(tr1_out)
@@ -474,6 +496,7 @@ class TestDistBase(unittest.TestCase):
"PYTHONPATH": os.getenv("PYTHONPATH", ""),
"LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
"FLAGS_fraction_of_gpu_memory_to_use": "0.15",
"FLAGS_rpc_deadline": "5000", # 5sec to fail fast
"FLAGS_cudnn_deterministic": "1",
"http_proxy": "",
"NCCL_P2P_DISABLE": "1"
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册