未验证 提交 89c4b3dd 编写于 作者: G gongweibao 提交者: GitHub

Add bash_test_modules function to capture the timeout or failed context. (#20197)

上级 73cf08d6
......@@ -3,30 +3,20 @@ string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
set(GC_ENVS FLAGS_eager_delete_tensor_gb=0.0 FLAGS_fast_eager_deletion_mode=1 FLAGS_memory_fraction_of_eager_deletion=1.0)
set(dist_ENVS http_proxy="" https_proxy="")
if(NOT WITH_DISTRIBUTE)
list(REMOVE_ITEM TEST_OPS test_recv_op)
list(REMOVE_ITEM TEST_OPS test_dist_transpiler)
list(REMOVE_ITEM TEST_OPS test_simple_dist_transpiler)
list(REMOVE_ITEM TEST_OPS test_listen_and_serv_op)
LIST(REMOVE_ITEM TEST_OPS test_dist_mnist)
LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_fleetapi)
LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_dgc_nccl)
LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_hallreduce)
LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_multi_comm)
LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_ring_allreduce)
LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_backward_deps)
LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_lars)
LIST(REMOVE_ITEM TEST_OPS test_dist_word2vec)
LIST(REMOVE_ITEM TEST_OPS test_dist_ctr)
LIST(REMOVE_ITEM TEST_OPS test_dist_simnet_bow)
LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_batch_merge)
LIST(REMOVE_ITEM TEST_OPS test_dist_text_classification)
LIST(REMOVE_ITEM TEST_OPS test_nce_remote_table_op)
LIST(REMOVE_ITEM TEST_OPS test_hsigmoid_remote_table_op)
LIST(REMOVE_ITEM TEST_OPS test_dist_fleet_ctr)
LIST(REMOVE_ITEM TEST_OPS test_dist_fleet_geo)
endif(NOT WITH_DISTRIBUTE)
file(GLOB DIST_TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_dist_*.py")
string(REPLACE ".py" "" DIST_TEST_OPS "${DIST_TEST_OPS}")
set(MIXED_DIST_TEST_OPS ${DIST_TEST_OPS})
#remove distribute unittests.
list(APPEND MIXED_DIST_TEST_OPS test_dgc_op)
list(APPEND MIXED_DIST_TEST_OPS test_simple_dist_transpiler)
list(APPEND MIXED_DIST_TEST_OPS test_listen_and_serv_op)
list(APPEND MIXED_DIST_TEST_OPS test_nce_remote_table_op)
list(APPEND MIXED_DIST_TEST_OPS test_hsigmoid_remote_table_op)
list(APPEND MIXED_DIST_TEST_OPS test_lookup_remote_table_op)
list(APPEND MIXED_DIST_TEST_OPS test_launch)
foreach(TEST_OP ${MIXED_DIST_TEST_OPS})
list(REMOVE_ITEM TEST_OPS ${TEST_OP})
endforeach()
if(NOT WITH_GPU OR WIN32)
LIST(REMOVE_ITEM TEST_OPS test_c_comm_init_all_op)
......@@ -43,8 +33,6 @@ if(WIN32)
LIST(REMOVE_ITEM TEST_OPS test_avoid_twice_initialization)
endif()
LIST(REMOVE_ITEM TEST_OPS test_launch)
if (NOT ${WITH_GPU})
LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op)
LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_mnist) # TODO(Yancey1989): parallel dygraph support CPU device in future
......@@ -70,14 +58,7 @@ if(APPLE)
message(WARNING "These tests has been disabled in OSX before being fixed:\n test_fuse_elewise_add_act_pass \n test_detection_map_op \n test_dist_se_resnext_*")
# this op is not support on mac
list(REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op)
# TODO: add the unittest back when it is fixed
list(REMOVE_ITEM TEST_OPS test_detection_map_op)
list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_dgc)
list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_sync)
list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_async)
list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_sync_with_memopt)
# TODO(tangwei12): add the unittest back when it is fixed
list(REMOVE_ITEM TEST_OPS test_dist_word2vec)
list(REMOVE_ITEM TEST_OPS test_fuse_elewise_add_act_pass)
endif()
if(NOT WITH_MKLML)
......@@ -130,39 +111,36 @@ function(bash_test_modules TARGET_NAME)
set(options SERIAL)
set(oneValueArgs "")
set(multiValueArgs MODULES DEPS ENVS)
set(multiValueArgs MODULES DEPS ENVS LABELS)
cmake_parse_arguments(bash_test_modules "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
message(STATUS "CMAKE_CURRENT_BINARY_DIR:" ${CMAKE_CURRENT_BINARY_DIR})
set(timeout 350)
if(${bash_test_modules_TIMEOUT})
set(timeout ${bash_test_modules_TIMEOUT})
endif()
add_test(NAME ${TARGET_NAME}
COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${bash_test_modules_ENVS}
COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python
TEST_TARGET_NAME=${TARGET_NAME} TEST_TIMEOUT=${timeout} ${bash_test_modules_ENVS}
bash ${CMAKE_CURRENT_BINARY_DIR}/${bash_test_modules_MODULES}
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
if (bash_test_modules_SERIAL)
set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
endif()
set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
if(bash_test_modules_LABELS)
set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT ${timeout} LABELS ${bash_test_modules_LABELS})
else()
set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT ${timeout})
endif()
endfunction()
list(REMOVE_ITEM TEST_OPS test_warpctc_op)
LIST(REMOVE_ITEM TEST_OPS test_lookup_remote_table_op)
LIST(REMOVE_ITEM TEST_OPS test_hsigmoid_remote_table_op)
LIST(REMOVE_ITEM TEST_OPS test_nce_remote_table_op)
LIST(REMOVE_ITEM TEST_OPS test_dist_train)
LIST(REMOVE_ITEM TEST_OPS test_listen_and_serv_op)
list(REMOVE_ITEM TEST_OPS test_dist_transpiler)
list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf)
list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf_auto_growth)
list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed)
list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_dgc)
list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_sync)
list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_async)
list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_sync_with_memopt)
list(REMOVE_ITEM TEST_OPS test_dgc_op)
list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_nccl)
list(REMOVE_ITEM TEST_OPS test_dist_transformer)
list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer)
list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer_auto_growth)
list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op)
......@@ -255,49 +233,24 @@ py_test_modules(test_install_check MODULES test_install_check ENVS
set_tests_properties(test_install_check PROPERTIES LABELS "RUN_TYPE=DIST")
py_test_modules(test_imperative_debug_string MODULES test_imperative_debug_string ENVS FLAGS_dygraph_debug=1)
if(WITH_DISTRIBUTE)
py_test_modules(test_dist_train MODULES test_dist_train ENVS ${dist_ENVS})
# FIXME(typhoonzero): add these tests back
list(REMOVE_ITEM DIST_TEST_OPS "test_dist_transformer")
list(REMOVE_ITEM DIST_TEST_OPS "test_dist_transpiler")
py_test_modules(test_lookup_remote_table_op MODULES test_lookup_remote_table_op ENVS ${dist_ENVS})
py_test_modules(test_hsigmoid_remote_table_op MODULES test_hsigmoid_remote_table_op ENVS ${dist_ENVS})
py_test_modules(test_nce_remote_table_op MODULES test_nce_remote_table_op ENVS ${dist_ENVS})
#py_test_modules(test_listen_and_serv_op MODULES test_listen_and_serv_op ENVS ${dist_ENVS})
if(WITH_DGC)
py_test_modules(test_dgc_op MODULES test_dgc_op)
endif()
if(NOT APPLE)
bash_test_modules(test_listen_and_serv_op MODULES test_listen_and_serv.sh)
set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 100 LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE)
set_tests_properties(test_nce_remote_table_op test_hsigmoid_remote_table_op test_dist_ctr test_dist_fleet_ctr test_dist_mnist_batch_merge PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE)
set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE)
set_tests_properties(test_dist_mnist_dgc_nccl PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE)
set_tests_properties(test_dist_mnist_hallreduce PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE)
set_tests_properties(test_dist_mnist_multi_comm PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE)
set_tests_properties(test_dist_mnist_ring_allreduce PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE)
set_tests_properties(test_dist_mnist_backward_deps PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE)
set_tests_properties(test_dist_mnist_fleetapi PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE)
set_tests_properties(test_dist_mnist_lars PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE)
set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE )
set_tests_properties(test_dist_simnet_bow PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE)
set_tests_properties(test_dist_text_classification PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE)
list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_dgc)
list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_sync)
list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_async)
list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_sync_with_memopt)
py_test_modules(test_dist_se_resnext_dgc MODULES test_dist_se_resnext_dgc)
py_test_modules(test_dist_se_resnext_sync MODULES test_dist_se_resnext_sync)
py_test_modules(test_dist_se_resnext_nccl MODULES test_dist_se_resnext_nccl)
bash_test_modules(test_launch MODULES test_launch.sh)
# FIXME(typhoonzero): add these tests back
# py_test_modules(test_dist_transformer MODULES test_dist_transformer)
# set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000)
set_tests_properties(test_dist_se_resnext_dgc PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE)
set_tests_properties(test_dist_se_resnext_sync PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE)
set_tests_properties(test_dist_se_resnext_nccl PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE)
foreach(TEST_OP ${DIST_TEST_OPS})
bash_test_modules(${TEST_OP} MODULES dist_test.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE")
endforeach(TEST_OP)
endif(NOT APPLE)
# py_test_modules(test_dist_transpiler MODULES test_dist_transpiler)
endif()
py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf)
......
#!/bin/bash

# Runs a single python unit test (${TEST_TARGET_NAME}.py) under a timeout
# and, on failure, dumps the test logs and system context for debugging.
#
# Required environment variables:
#   TEST_TARGET_NAME - basename of the python test module to run
#   TEST_TIMEOUT     - overall time budget in seconds; the test itself gets
#                      TEST_TIMEOUT - 10 so this wrapper can report first

unset https_proxy http_proxy

name=${TEST_TARGET_NAME}

if [[ "${name}x" == "x" ]]; then
    # Fix: print the literal variable name — the original expanded the
    # (empty) variables, producing "can't find , please set  first".
    echo "can't find TEST_TARGET_NAME, please set TEST_TARGET_NAME first"
    exit 1
fi

if [[ "${TEST_TIMEOUT}x" == "x" ]]; then
    echo "can't find TEST_TIMEOUT, please set TEST_TIMEOUT first"
    exit 1
fi

# Remove stale log files from a previous run.
rm -f ${name}*.log

# Reserve 10 seconds of the budget for the diagnostics below.
run_time=$(( TEST_TIMEOUT - 10 ))
echo "run_time: ${run_time}"

# Start the unit test; SIGKILL it if it exceeds its time budget.
timeout -s SIGKILL "${run_time}" python -u "${name}.py" > "${name}_run.log" 2>&1
exit_code=$?
if [[ ${exit_code} -eq 0 ]]; then
    exit 0
fi

echo "${name} failed with ${exit_code}"

# Dump the captured paddle logs.
echo "${name} log"
cat -n ${name}*.log

# Display system context (twice, 2s apart, to catch lingering processes).
for i in {1..2}; do
    sleep 2
    ps -ef | grep -E "(test_|_test)"
    # Fix: 'hash' reports lookup failure on stderr, so silence stderr.
    if hash nvidia-smi 2> /dev/null; then
        nvidia-smi
    fi
done

# Display leftover temp files.
ls -l /tmp/paddle.*
exit 1
......@@ -571,7 +571,8 @@ class TestDistBase(unittest.TestCase):
envs,
check_error_log=False,
batch_size=DEFAULT_BATCH_SIZE,
batch_merge_repeat=1):
batch_merge_repeat=1,
log_name=""):
cmd = self._python_interp
......@@ -602,7 +603,7 @@ class TestDistBase(unittest.TestCase):
print("local_cmd: {}, env: {}".format(cmd, env_local))
if check_error_log:
err_log = open("/tmp/trainer.err.log", "wb")
err_log = open(log_name + "_local.log", "wb")
local_proc = subprocess.Popen(
cmd.split(" "),
stdout=subprocess.PIPE,
......@@ -625,7 +626,7 @@ class TestDistBase(unittest.TestCase):
return pickle.loads(local_out)
def _run_cluster(self, model, envs, check_error_log):
def _run_cluster(self, model, envs, check_error_log, log_name):
# Run dist train to compare with local results
ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model,
check_error_log, envs)
......@@ -673,8 +674,8 @@ class TestDistBase(unittest.TestCase):
print("tr0_cmd: {}, env: {}".format(tr0_cmd, env0))
print("tr1_cmd: {}, env: {}".format(tr1_cmd, env1))
tr0_pipe = open("/tmp/tr0_err.log", "wb")
tr1_pipe = open("/tmp/tr1_err.log", "wb")
tr0_pipe = open(log_name + "_tr0_err.log", "wb")
tr1_pipe = open(log_name + "_tr1_err.log", "wb")
print_to_err(type(self).__name__, "going to start trainer process 0")
tr0_proc = subprocess.Popen(
......@@ -773,7 +774,7 @@ class TestDistBase(unittest.TestCase):
return tr_cmd, env
def _run_cluster_nccl2(self, model, envs, nccl2_reduce_layer,
check_error_log):
check_error_log, log_name):
if self._use_hallreduce:
self._ps_endpoints = ""
for i in range(0, 4):
......@@ -798,7 +799,7 @@ class TestDistBase(unittest.TestCase):
print("use_hallreduce:{} tr_cmd:{}, env: {}".format(
self._use_hallreduce, tr_cmd, tr_env))
tr_pipe = open("/tmp/tr{}_err.log".format(i), "wb")
tr_pipe = open(log_name + "_tr{}_err.log".format(i), "wb")
print_to_err(
type(self).__name__,
......@@ -828,7 +829,8 @@ class TestDistBase(unittest.TestCase):
model_file,
delta=1e-3,
check_error_log=False,
need_envs={}):
need_envs={},
log_name=""):
# TODO(typhoonzero): should auto adapt GPU count on the machine.
required_envs = {
"PATH": os.getenv("PATH", ""),
......@@ -845,22 +847,32 @@ class TestDistBase(unittest.TestCase):
required_envs.update(need_envs)
if check_error_log:
required_envs["GLOG_v"] = "10"
required_envs["GLOG_vmodule"] = \
"fused_all_reduce_op_handle=10,all_reduce_op_handle=10,alloc_continuous_space_op=10,fuse_all_reduce_op_pass=10,alloc_continuous_space_for_grad_pass=10,fast_threaded_ssa_graph_executor=10"
required_envs["GLOG_logtostderr"] = "1"
local_losses \
= self._run_local(model_file, required_envs,
check_error_log)
check_error_log, log_name=log_name)
if self._nccl2_mode:
if self._nccl2_reduce_layer:
tr0_losses, tr1_losses = self._run_cluster_nccl2(
model_file, required_envs, True, check_error_log)
model_file,
required_envs,
True,
check_error_log,
log_name=log_name)
else:
tr0_losses, tr1_losses = self._run_cluster_nccl2(
model_file, required_envs, False, check_error_log)
model_file,
required_envs,
False,
check_error_log,
log_name=log_name)
else:
tr0_losses, tr1_losses = self._run_cluster(
model_file, required_envs, check_error_log)
model_file, required_envs, check_error_log, log_name=log_name)
for step_id in range(RUN_STEP):
local_loss = local_losses[step_id]
......
......@@ -18,6 +18,9 @@ import os
import unittest
from test_dist_base import TestDistBase
import os
flag_name = os.path.splitext(__file__)[0]
def skip_ci(func):
on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0')))
......@@ -36,7 +39,8 @@ class TestDistCTR2x2(TestDistBase):
self._enforce_place = "CPU"
def test_dist_ctr(self):
self.check_with_place("dist_ctr.py", delta=1e-2, check_error_log=False)
self.check_with_place(
"dist_ctr.py", delta=1e-2, check_error_log=True, log_name=flag_name)
@skip_ci
......@@ -51,7 +55,8 @@ class TestDistCTRWithL2Decay2x2(TestDistBase):
"dist_ctr.py",
delta=1e-7,
check_error_log=True,
need_envs=need_envs)
need_envs=need_envs,
log_name=flag_name)
class TestDistCTR2x2_ASYNC(TestDistBase):
......@@ -68,7 +73,11 @@ class TestDistCTR2x2_ASYNC(TestDistBase):
}
self.check_with_place(
"dist_ctr.py", delta=100, check_error_log=True, need_envs=need_envs)
"dist_ctr.py",
delta=100,
check_error_log=True,
need_envs=need_envs,
log_name=flag_name)
class TestDistCTR2x2_ASYNC2(TestDistBase):
......@@ -86,7 +95,11 @@ class TestDistCTR2x2_ASYNC2(TestDistBase):
}
self.check_with_place(
"dist_ctr.py", delta=100, check_error_log=True, need_envs=need_envs)
"dist_ctr.py",
delta=100,
check_error_log=True,
need_envs=need_envs,
log_name=flag_name)
if __name__ == "__main__":
......
......@@ -16,6 +16,9 @@ from __future__ import print_function
import unittest
from test_dist_base import TestDistBase
import os
flag_name = os.path.splitext(__file__)[0]
class TestDistMnist2x2(TestDistBase):
def _setup_config(self):
......@@ -23,7 +26,11 @@ class TestDistMnist2x2(TestDistBase):
self._use_reduce = False
def test_dist_train(self):
self.check_with_place("dist_mnist.py", delta=1e-5)
self.check_with_place(
"dist_mnist.py",
delta=1e-5,
check_error_log=True,
log_name=flag_name)
class TestDistMnist2x2WithMemopt(TestDistBase):
......@@ -32,7 +39,11 @@ class TestDistMnist2x2WithMemopt(TestDistBase):
self._mem_opt = True
def test_dist_train(self):
self.check_with_place("dist_mnist.py", delta=1e-5)
self.check_with_place(
"dist_mnist.py",
delta=1e-5,
check_error_log=True,
log_name=flag_name)
class TestDistMnistAsync(TestDistBase):
......@@ -41,7 +52,11 @@ class TestDistMnistAsync(TestDistBase):
self._use_reduce = False
def test_dist_train(self):
self.check_with_place("dist_mnist.py", delta=200)
self.check_with_place(
"dist_mnist.py",
delta=200,
check_error_log=True,
log_name=flag_name)
class TestDistMnistDcAsgd(TestDistBase):
......@@ -50,7 +65,11 @@ class TestDistMnistDcAsgd(TestDistBase):
self._dc_asgd = True
def test_se_resnext(self):
self.check_with_place("dist_mnist.py", delta=200)
self.check_with_place(
"dist_mnist.py",
delta=200,
check_error_log=True,
log_name=flag_name)
# FIXME(typhoonzero): enable these tests once we have 4
......
......@@ -17,6 +17,8 @@ import unittest
from test_dist_base import TestDistBase
import os
flag_name = os.path.splitext(__file__)[0]
class TestDistMnist2x2(TestDistBase):
def _setup_config(self):
......@@ -43,21 +45,24 @@ class TestDistMnist2x2(TestDistBase):
required_envs.update(need_envs)
if check_error_log:
required_envs["GLOG_v"] = "7"
required_envs["GLOG_vmodule"] = \
"fused_all_reduce_op_handle=10,all_reduce_op_handle=10,alloc_continuous_space_op=10,fuse_all_reduce_op_pass=10,alloc_continuous_space_for_grad_pass=10,fast_threaded_ssa_graph_executor=10"
required_envs["GLOG_logtostderr"] = "1"
no_merge_losses = self._run_local(
model_file,
required_envs,
check_error_log=check_error_log,
batch_size=4)
batch_size=4,
log_name=flag_name)
batch_merge_losses = self._run_local(
model_file,
required_envs,
check_error_log=check_error_log,
batch_size=2,
batch_merge_repeat=2)
batch_merge_repeat=2,
log_name=flag_name)
# Ensure both result have values.
self.assertGreater(len(no_merge_losses), 1)
self.assertEqual(len(no_merge_losses), len(batch_merge_losses))
......
......@@ -16,6 +16,9 @@ from __future__ import print_function
import unittest
from test_dist_base import TestDistBase
import os
flag_name = os.path.splitext(__file__)[0]
class TestDistMnistNCCL2DGC(TestDistBase):
def _setup_config(self):
......@@ -28,7 +31,11 @@ class TestDistMnistNCCL2DGC(TestDistBase):
def test_dist_train(self):
import paddle.fluid as fluid
if fluid.core.is_compiled_with_cuda():
self.check_with_place("dist_mnist.py", delta=1e-5)
self.check_with_place(
"dist_mnist.py",
delta=1e-5,
check_error_log=True,
log_name=flag_name)
if __name__ == "__main__":
......
......@@ -16,6 +16,9 @@ from __future__ import print_function
import unittest
from test_dist_base import TestDistBase
import os
flag_name = os.path.splitext(__file__)[0]
class TestDistMnistNCCL2HAllreduce(TestDistBase):
def _setup_config(self):
......@@ -28,7 +31,11 @@ class TestDistMnistNCCL2HAllreduce(TestDistBase):
def test_dist_train(self):
import paddle.fluid as fluid
if fluid.core.is_compiled_with_cuda():
self.check_with_place("dist_mnist.py", delta=1e-5)
self.check_with_place(
"dist_mnist.py",
delta=1e-5,
check_error_log=True,
log_name=flag_name)
if __name__ == "__main__":
......
......@@ -16,6 +16,9 @@ from __future__ import print_function
import unittest
from test_dist_base import TestDistBase
import os
flag_name = os.path.splitext(__file__)[0]
class TestDistMnistNCCL2MultiNCCLComm(TestDistBase):
def _setup_config(self):
......@@ -28,7 +31,11 @@ class TestDistMnistNCCL2MultiNCCLComm(TestDistBase):
def test_dist_train(self):
import paddle.fluid as fluid
if fluid.core.is_compiled_with_cuda():
self.check_with_place("dist_mnist.py", delta=1e-5)
self.check_with_place(
"dist_mnist.py",
delta=1e-5,
check_error_log=True,
log_name=flag_name)
if __name__ == "__main__":
......
......@@ -22,6 +22,9 @@ import numpy as np
from test_dist_base import TestDistBase, RUN_STEP
import os
flag_name = os.path.splitext(__file__)[0]
class TestDistSaveLoadDense2x2(TestDistBase):
def _setup_config(self):
......@@ -32,7 +35,8 @@ class TestDistSaveLoadDense2x2(TestDistBase):
model_file,
delta=1e-3,
check_error_log=False,
need_envs={}):
need_envs={},
log_name=""):
required_envs = {
"PATH": os.getenv("PATH", ""),
"PYTHONPATH": os.getenv("PYTHONPATH", ""),
......@@ -43,7 +47,8 @@ class TestDistSaveLoadDense2x2(TestDistBase):
required_envs.update(need_envs)
if check_error_log:
required_envs["GLOG_v"] = "3"
required_envs["GLOG_vmodule"] = \
"fused_all_reduce_op_handle=10,all_reduce_op_handle=10,alloc_continuous_space_op=10,fuse_all_reduce_op_pass=10,alloc_continuous_space_for_grad_pass=10,fast_threaded_ssa_graph_executor=10"
required_envs["GLOG_logtostderr"] = "1"
model_dir = tempfile.mkdtemp()
......@@ -59,8 +64,8 @@ class TestDistSaveLoadDense2x2(TestDistBase):
cluster_env.update(required_envs)
local_var = self._run_local(model_file, local_env, check_error_log)
tr0_var, tr1_var = self._run_cluster(model_file, cluster_env,
check_error_log)
tr0_var, tr1_var = self._run_cluster(
model_file, cluster_env, check_error_log, log_name=flag_name)
shutil.rmtree(model_dir)
......@@ -95,7 +100,8 @@ class TestDistSaveLoadWithPServerStateDense2x2(TestDistBase):
model_file,
delta=1e-3,
check_error_log=False,
need_envs={}):
need_envs={},
log_name=""):
required_envs = {
"PATH": os.getenv("PATH", ""),
"PYTHONPATH": os.getenv("PYTHONPATH", ""),
......@@ -106,7 +112,8 @@ class TestDistSaveLoadWithPServerStateDense2x2(TestDistBase):
required_envs.update(need_envs)
if check_error_log:
required_envs["GLOG_v"] = "3"
required_envs["GLOG_vmodule"] = \
"fused_all_reduce_op_handle=10,all_reduce_op_handle=10,alloc_continuous_space_op=10,fuse_all_reduce_op_pass=10,alloc_continuous_space_for_grad_pass=10,fast_threaded_ssa_graph_executor=10"
required_envs["GLOG_logtostderr"] = "1"
model_dir = tempfile.mkdtemp()
......@@ -117,15 +124,15 @@ class TestDistSaveLoadWithPServerStateDense2x2(TestDistBase):
save_env["MODEL_DIR"] = model_dir
save_env.update(required_envs)
tr0_var_1, tr1_var_1 = self._run_cluster(model_file, save_env,
check_error_log)
tr0_var_1, tr1_var_1 = self._run_cluster(
model_file, save_env, check_error_log, log_name=flag_name)
load_env = {}
load_env["LOAD"] = "1"
load_env["MODEL_DIR"] = model_dir
load_env.update(required_envs)
tr0_var_2, tr1_var_2 = self._run_cluster(model_file, load_env,
check_error_log)
tr0_var_2, tr1_var_2 = self._run_cluster(
model_file, load_env, check_error_log, log_name=flag_name)
shutil.rmtree(model_dir)
......@@ -149,8 +156,9 @@ class TestDistSaveLoadWithPServerStateDense2x2(TestDistBase):
self.check_with_place(
"dist_save_load.py",
delta=0,
check_error_log=False,
need_envs=need_envs)
check_error_log=True,
need_envs=need_envs,
log_name=flag_name)
if __name__ == "__main__":
......
......@@ -17,6 +17,9 @@ import unittest
from test_dist_base import TestDistBase
import os
import os
flag_name = os.path.splitext(__file__)[0]
def skip_ci(func):
on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0')))
......@@ -41,7 +44,11 @@ class TestDistSeResnetNCCL2DGC(TestDistBase):
def test_dist_train(self):
import paddle.fluid as fluid
if fluid.core.is_compiled_with_cuda():
self.check_with_place("dist_se_resnext.py", delta=30)
self.check_with_place(
"dist_se_resnext.py",
delta=30,
check_error_log=True,
log_name=flag_name)
if __name__ == "__main__":
......
......@@ -17,6 +17,9 @@ import unittest
from test_dist_base import TestDistBase
import os
import os
flag_name = os.path.splitext(__file__)[0]
def skip_ci(func):
on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0')))
......@@ -39,7 +42,11 @@ class TestDistSeResneXtNCCL(TestDistBase):
def test_dist_train(self):
import paddle.fluid as fluid
if fluid.core.is_compiled_with_cuda():
self.check_with_place("dist_se_resnext.py", delta=1e-5)
self.check_with_place(
"dist_se_resnext.py",
delta=1e-5,
check_error_log=True,
log_name=flag_name)
class TestDistSeResneXtNCCLMP(TestDistBase):
......@@ -57,7 +64,8 @@ class TestDistSeResneXtNCCLMP(TestDistBase):
"dist_se_resnext.py",
delta=1e-5,
check_error_log=True,
need_envs={"NCCL_P2P_DISABLE": "1"})
need_envs={"NCCL_P2P_DISABLE": "1"},
log_name=flag_name)
if __name__ == "__main__":
......
......@@ -17,6 +17,9 @@ import unittest
from test_dist_base import TestDistBase
import os
import os
flag_name = os.path.splitext(__file__)[0]
def skip_ci(func):
on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0')))
......@@ -36,7 +39,11 @@ class TestDistSeResneXt2x2(TestDistBase):
@skip_ci
def test_dist_train(self):
self.check_with_place("dist_se_resnext.py", delta=1e-7)
self.check_with_place(
"dist_se_resnext.py",
delta=1e-7,
check_error_log=True,
log_name=flag_name)
if __name__ == "__main__":
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
from test_dist_base import TestDistBase
import os
def skip_ci(func):
    """Decorator that turns ``func`` into a no-op on unstable CI runs.

    When the environment variable ``SKIP_UNSTABLE_CI`` is set to a non-zero
    integer at decoration time, the wrapped function returns ``None`` without
    calling ``func``; otherwise it delegates to ``func`` unchanged.

    Args:
        func: the test function (or any callable) to guard.

    Returns:
        A wrapper callable with the same signature as ``func``.
    """
    # Local import keeps this helper self-contained.
    import functools

    # Evaluated once, when the decorator is applied — not on each call.
    on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0')))

    # Fix: preserve the wrapped function's metadata (__name__, __doc__),
    # which the original wrapper silently discarded.
    @functools.wraps(func)
    def __func__(*args, **kwargs):
        if on_ci:
            return
        return func(*args, **kwargs)

    return __func__
class TestDistseResnXt2x2WithMemopt(TestDistBase):
    """Distributed SE-ResNeXt 2x2 test with memory optimization enabled."""

    def _setup_config(self):
        # Synchronous training with memory optimization; the reader
        # allocation strategy is disabled for this configuration.
        self._use_reader_alloc = False
        self._mem_opt = True
        self._sync_mode = True

    @skip_ci
    def test_dist_train(self):
        # Compare distributed vs. local losses for the SE-ResNeXt model.
        model = "dist_se_resnext.py"
        self.check_with_place(model, delta=1e-7)
# Entry point: run the distributed SE-ResNeXt tests when executed directly.
if __name__ == "__main__":
    unittest.main()
......@@ -18,6 +18,9 @@ import unittest
from test_dist_base import TestDistBase
import os
flag_name = os.path.splitext(__file__)[0]
class TestDistSimnetBowDense2x2(TestDistBase):
def _setup_config(self):
......@@ -34,7 +37,8 @@ class TestDistSimnetBowDense2x2(TestDistBase):
"dist_simnet_bow.py",
delta=1e-5,
check_error_log=True,
need_envs=need_envs)
need_envs=need_envs,
log_name=flag_name)
class TestDistSimnetBow2x2DenseAsync(TestDistBase):
......@@ -52,8 +56,9 @@ class TestDistSimnetBow2x2DenseAsync(TestDistBase):
self.check_with_place(
"dist_simnet_bow.py",
delta=100,
check_error_log=False,
need_envs=need_envs)
check_error_log=True,
need_envs=need_envs,
log_name=flag_name)
class TestDistSimnetBowSparse2x2(TestDistBase):
......@@ -70,8 +75,9 @@ class TestDistSimnetBowSparse2x2(TestDistBase):
self.check_with_place(
"dist_simnet_bow.py",
delta=1e-5,
check_error_log=False,
need_envs=need_envs)
check_error_log=True,
need_envs=need_envs,
log_name=flag_name)
class TestDistSimnetBow2x2SparseAsync(TestDistBase):
......@@ -88,8 +94,9 @@ class TestDistSimnetBow2x2SparseAsync(TestDistBase):
self.check_with_place(
"dist_simnet_bow.py",
delta=100,
check_error_log=False,
need_envs=need_envs)
check_error_log=True,
need_envs=need_envs,
log_name=flag_name)
# FIXME(tangwei): Learningrate variable is not created on pserver.
......@@ -108,7 +115,8 @@ class TestDistSimnetBow2x2LookupTableSync(TestDistBase):
"dist_simnet_bow.py",
delta=1e-5,
check_error_log=True,
need_envs=need_envs)
need_envs=need_envs,
log_name=flag_name)
class TestDistSimnetBow2x2LookupTableAsync(TestDistBase):
......@@ -125,8 +133,9 @@ class TestDistSimnetBow2x2LookupTableAsync(TestDistBase):
self.check_with_place(
"dist_simnet_bow.py",
delta=100,
check_error_log=False,
need_envs=need_envs)
check_error_log=True,
need_envs=need_envs,
log_name=flag_name)
class TestDistSimnetBow2x2LookupTableNotContainLRSync(TestDistBase):
......@@ -143,8 +152,9 @@ class TestDistSimnetBow2x2LookupTableNotContainLRSync(TestDistBase):
self.check_with_place(
"dist_simnet_bow.py",
delta=1e-5,
check_error_log=False,
need_envs=need_envs)
check_error_log=True,
need_envs=need_envs,
log_name=flag_name)
if __name__ == "__main__":
......
......@@ -17,6 +17,9 @@ import os
import unittest
from test_dist_base import TestDistBase
import os
flag_name = os.path.splitext(__file__)[0]
class TestDistTextClassification2x2(TestDistBase):
def _setup_config(self):
......@@ -24,7 +27,11 @@ class TestDistTextClassification2x2(TestDistBase):
self._enforce_place = "CPU"
def test_text_classification(self):
self.check_with_place("dist_text_classification.py", delta=1e-6)
self.check_with_place(
"dist_text_classification.py",
delta=1e-6,
check_error_log=True,
log_name=flag_name)
class TestDistTextClassification2x2Async(TestDistBase):
......@@ -33,7 +40,11 @@ class TestDistTextClassification2x2Async(TestDistBase):
self._enforce_place = "CPU"
def test_se_resnext(self):
self.check_with_place("dist_text_classification.py", delta=100)
self.check_with_place(
"dist_text_classification.py",
delta=100,
check_error_log=True,
log_name=flag_name)
if __name__ == "__main__":
......
......@@ -16,6 +16,9 @@ from __future__ import print_function
import unittest
from test_dist_base import TestDistBase
import os
flag_name = os.path.splitext(__file__)[0]
class TestDistW2V2x2(TestDistBase):
def _setup_config(self):
......@@ -23,7 +26,11 @@ class TestDistW2V2x2(TestDistBase):
self._enforce_place = "CPU"
def test_dist_train(self):
self.check_with_place("dist_word2vec.py", delta=1e-4)
self.check_with_place(
"dist_word2vec.py",
delta=1e-4,
check_error_log=True,
log_name=flag_name)
class TestDistW2V2x2WithMemOpt(TestDistBase):
......@@ -33,7 +40,11 @@ class TestDistW2V2x2WithMemOpt(TestDistBase):
self._enforce_place = "CPU"
def test_dist_train(self):
self.check_with_place("dist_word2vec.py", delta=1e-4)
self.check_with_place(
"dist_word2vec.py",
delta=1e-4,
check_error_log=True,
log_name=flag_name)
class TestDistW2V2x2Async(TestDistBase):
......@@ -42,7 +53,11 @@ class TestDistW2V2x2Async(TestDistBase):
self._enforce_place = "CPU"
def test_dist_train(self):
self.check_with_place("dist_word2vec.py", delta=100)
self.check_with_place(
"dist_word2vec.py",
delta=100,
check_error_log=True,
log_name=flag_name)
if __name__ == "__main__":
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册