Commit d4f11bd5 authored by gongweibao, committed by Dong Daxiang

Add bash_test_modules function to capture the timeout or failed context. (#20355)

* cherry pick test=develop

* cleanup test=develop test=release/1.6

* cleanup test=develop test=release/1.6
Parent 7ee5b8bc
@@ -3,30 +3,20 @@ string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
set(GC_ENVS FLAGS_eager_delete_tensor_gb=0.0 FLAGS_fast_eager_deletion_mode=1 FLAGS_memory_fraction_of_eager_deletion=1.0)
set(dist_ENVS http_proxy="" https_proxy="")
-if(NOT WITH_DISTRIBUTE)
-    list(REMOVE_ITEM TEST_OPS test_recv_op)
-    list(REMOVE_ITEM TEST_OPS test_dist_transpiler)
-    list(REMOVE_ITEM TEST_OPS test_simple_dist_transpiler)
-    list(REMOVE_ITEM TEST_OPS test_listen_and_serv_op)
-    LIST(REMOVE_ITEM TEST_OPS test_dist_mnist)
-    LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_fleetapi)
-    LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_dgc_nccl)
-    LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_hallreduce)
-    LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_multi_comm)
-    LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_ring_allreduce)
-    LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_backward_deps)
-    LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_lars)
-    LIST(REMOVE_ITEM TEST_OPS test_dist_word2vec)
-    LIST(REMOVE_ITEM TEST_OPS test_dist_ctr)
-    LIST(REMOVE_ITEM TEST_OPS test_dist_simnet_bow)
-    LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_batch_merge)
-    LIST(REMOVE_ITEM TEST_OPS test_dist_text_classification)
-    LIST(REMOVE_ITEM TEST_OPS test_nce_remote_table_op)
-    LIST(REMOVE_ITEM TEST_OPS test_hsigmoid_remote_table_op)
-    LIST(REMOVE_ITEM TEST_OPS test_dist_fleet_ctr)
-    LIST(REMOVE_ITEM TEST_OPS test_dist_fleet_geo)
-endif(NOT WITH_DISTRIBUTE)
+file(GLOB DIST_TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_dist_*.py")
+string(REPLACE ".py" "" DIST_TEST_OPS "${DIST_TEST_OPS}")
+set(MIXED_DIST_TEST_OPS ${DIST_TEST_OPS})
+#remove distribute unittests.
+list(APPEND MIXED_DIST_TEST_OPS test_dgc_op)
+list(APPEND MIXED_DIST_TEST_OPS test_simple_dist_transpiler)
+list(APPEND MIXED_DIST_TEST_OPS test_listen_and_serv_op)
+list(APPEND MIXED_DIST_TEST_OPS test_nce_remote_table_op)
+list(APPEND MIXED_DIST_TEST_OPS test_hsigmoid_remote_table_op)
+list(APPEND MIXED_DIST_TEST_OPS test_lookup_remote_table_op)
+list(APPEND MIXED_DIST_TEST_OPS test_launch)
+foreach(TEST_OP ${MIXED_DIST_TEST_OPS})
+    list(REMOVE_ITEM TEST_OPS ${TEST_OP})
+endforeach()
if(NOT WITH_GPU OR WIN32)
    LIST(REMOVE_ITEM TEST_OPS test_c_comm_init_all_op)
@@ -43,8 +33,6 @@ if(WIN32)
    LIST(REMOVE_ITEM TEST_OPS test_avoid_twice_initialization)
endif()
-LIST(REMOVE_ITEM TEST_OPS test_launch)
if (NOT ${WITH_GPU})
    LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op)
    LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_mnist) # TODO(Yancey1989): parallel dygraph support CPU device in future
@@ -70,14 +58,7 @@ if(APPLE)
    message(WARNING "These tests has been disabled in OSX before being fixed:\n test_fuse_elewise_add_act_pass \n test_detection_map_op \n test_dist_se_resnext_*")
    # this op is not support on mac
    list(REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op)
-    # TODO: add the unitest back when it fixed
    list(REMOVE_ITEM TEST_OPS test_detection_map_op)
-    list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_dgc)
-    list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_sync)
-    list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_async)
-    list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_sync_with_memopt)
-    # TODO(tangwei12): add the unitest back when it fixed
-    list(REMOVE_ITEM TEST_OPS test_dist_word2vec)
    list(REMOVE_ITEM TEST_OPS test_fuse_elewise_add_act_pass)
endif()
if(NOT WITH_MKLML)
@@ -130,39 +111,36 @@ function(bash_test_modules TARGET_NAME)
    set(options SERIAL)
    set(oneValueArgs "")
-    set(multiValueArgs MODULES DEPS ENVS)
+    set(multiValueArgs MODULES DEPS ENVS LABELS)
    cmake_parse_arguments(bash_test_modules "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    message(STATUS "CMAKE_CURRENT_BINARY_DIR:" ${CMAKE_CURRENT_BINARY_DIR})
+    set(timeout 350)
+    if(${bash_test_modules_TIMEOUT})
+        set(timeout ${bash_test_modules_TIMEOUT})
+    endif()
    add_test(NAME ${TARGET_NAME}
-             COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${bash_test_modules_ENVS}
+             COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python
+                     TEST_TARGET_NAME=${TARGET_NAME} TEST_TIMEOUT=${timeout} ${bash_test_modules_ENVS}
             bash ${CMAKE_CURRENT_BINARY_DIR}/${bash_test_modules_MODULES}
             WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
    if (bash_test_modules_SERIAL)
        set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
    endif()
-    set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
+    if(bash_test_modules_LABELS)
+        set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT ${timeout} LABELS ${bash_test_modules_LABELS})
+    else()
+        set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT ${timeout})
+    endif()
endfunction()
list(REMOVE_ITEM TEST_OPS test_warpctc_op)
-LIST(REMOVE_ITEM TEST_OPS test_lookup_remote_table_op)
-LIST(REMOVE_ITEM TEST_OPS test_hsigmoid_remote_table_op)
-LIST(REMOVE_ITEM TEST_OPS test_nce_remote_table_op)
-LIST(REMOVE_ITEM TEST_OPS test_dist_train)
-LIST(REMOVE_ITEM TEST_OPS test_listen_and_serv_op)
-list(REMOVE_ITEM TEST_OPS test_dist_transpiler)
list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf)
list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf_auto_growth)
list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed)
-list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_dgc)
-list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_sync)
-list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_async)
-list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_sync_with_memopt)
-list(REMOVE_ITEM TEST_OPS test_dgc_op)
-list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_nccl)
-list(REMOVE_ITEM TEST_OPS test_dist_transformer)
list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer)
list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer_auto_growth)
list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op)
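For orientation, a test registered through the bash_test_modules function above ends up with a CTest command of roughly the following shape; the target name and build-tree paths are placeholders, while the environment variable names and the 350-second default come from the function itself:

    # Rough shape of the generated test command for a hypothetical target
    cmake -E env PYTHONPATH=<PADDLE_BINARY_DIR>/python \
        TEST_TARGET_NAME=test_dist_mnist TEST_TIMEOUT=350 \
        bash <CMAKE_CURRENT_BINARY_DIR>/dist_test.sh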
@@ -255,49 +233,24 @@ py_test_modules(test_install_check MODULES test_install_check ENVS
set_tests_properties(test_install_check PROPERTIES LABELS "RUN_TYPE=DIST")
py_test_modules(test_imperative_debug_string MODULES test_imperative_debug_string ENVS FLAGS_dygraph_debug=1)
if(WITH_DISTRIBUTE)
-    py_test_modules(test_dist_train MODULES test_dist_train ENVS ${dist_ENVS})
+    # FIXME(typhoonzero): add these tests back
+    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_transformer")
+    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_transpiler")
    py_test_modules(test_lookup_remote_table_op MODULES test_lookup_remote_table_op ENVS ${dist_ENVS})
    py_test_modules(test_hsigmoid_remote_table_op MODULES test_hsigmoid_remote_table_op ENVS ${dist_ENVS})
    py_test_modules(test_nce_remote_table_op MODULES test_nce_remote_table_op ENVS ${dist_ENVS})
-    #py_test_modules(test_listen_and_serv_op MODULES test_listen_and_serv_op ENVS ${dist_ENVS})
    if(WITH_DGC)
        py_test_modules(test_dgc_op MODULES test_dgc_op)
    endif()
    if(NOT APPLE)
        bash_test_modules(test_listen_and_serv_op MODULES test_listen_and_serv.sh)
-        set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 100 LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE)
-        set_tests_properties(test_nce_remote_table_op test_hsigmoid_remote_table_op test_dist_ctr test_dist_fleet_ctr test_dist_mnist_batch_merge PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE)
-        set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE)
-        set_tests_properties(test_dist_mnist_dgc_nccl PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE)
-        set_tests_properties(test_dist_mnist_hallreduce PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE)
-        set_tests_properties(test_dist_mnist_multi_comm PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE)
-        set_tests_properties(test_dist_mnist_ring_allreduce PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE)
-        set_tests_properties(test_dist_mnist_backward_deps PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE)
-        set_tests_properties(test_dist_mnist_fleetapi PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE)
-        set_tests_properties(test_dist_mnist_lars PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE)
-        set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE)
-        set_tests_properties(test_dist_simnet_bow PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE)
-        set_tests_properties(test_dist_text_classification PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE)
-        list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_dgc)
-        list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_sync)
-        list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_async)
-        list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_sync_with_memopt)
-        py_test_modules(test_dist_se_resnext_dgc MODULES test_dist_se_resnext_dgc)
-        py_test_modules(test_dist_se_resnext_sync MODULES test_dist_se_resnext_sync)
-        py_test_modules(test_dist_se_resnext_nccl MODULES test_dist_se_resnext_nccl)
        bash_test_modules(test_launch MODULES test_launch.sh)
-        # FIXME(typhoonzero): add these tests back
-        # py_test_modules(test_dist_transformer MODULES test_dist_transformer)
-        # set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000)
-        set_tests_properties(test_dist_se_resnext_dgc PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE)
-        set_tests_properties(test_dist_se_resnext_sync PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE)
-        set_tests_properties(test_dist_se_resnext_nccl PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE)
+        foreach(TEST_OP ${DIST_TEST_OPS})
+            bash_test_modules(${TEST_OP} MODULES dist_test.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE")
+        endforeach(TEST_OP)
    endif(NOT APPLE)
-    # py_test_modules(test_dist_transpiler MODULES test_dist_transpiler)
endif()
py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf)
...
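With the glob-based registration above, any of the discovered test_dist_* targets can be driven through CTest; a usage sketch, assuming the working directory is the unittests directory of the build tree:

    ctest -R test_dist_mnist --output-on-failure    # run one registered distributed test
    cat test_dist_mnist_run.log                     # log left behind by dist_test.sh in the working directory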
#!/bin/bash
unset https_proxy http_proxy
name=${TEST_TARGET_NAME}
TEST_TIMEOUT=${TEST_TIMEOUT}
if [[ ${name}"x" == "x" ]]; then
echo "can't find ${name}, please set ${TEST_TARGET_NAME} first"
exit 1
fi
if [[ ${TEST_TIMEOUT}"x" == "x" ]]; then
echo "can't find ${TEST_TIMEOUT}, please set ${TEST_TIMEOUT} first"
exit 1
fi
# rm flag file
rm -f ${name}*.log
# start the unit test
run_time=$(( $TEST_TIMEOUT - 10 ))
echo "run_time: ${run_time}"
timeout -s SIGKILL ${run_time} python -u ${name}.py > ${name}_run.log 2>&1
exit_code=$?
if [[ $exit_code -eq 0 ]]; then
exit 0
fi
echo "${name} faild with ${exit_code}"
# paddle log
echo "${name} log"
cat -n ${name}*.log
#display system context
for i in {1..2}; do
sleep 2
ps -ef | grep -E "(test_|_test)"
if hash "nvidia-smi" > /dev/null; then
nvidia-smi
fi
done
#display /tmp/files
ls -l /tmp/paddle.*
exit 1
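Since the wrapper only relies on the two variables exported by bash_test_modules, a single target can also be exercised by hand, roughly as follows (the target name is illustrative):

    export TEST_TARGET_NAME=test_dist_mnist    # illustrative target name
    export TEST_TIMEOUT=350                    # the python process is killed 10 seconds earlier
    bash dist_test.sh                          # on failure, dumps ${name}*.log and ps/nvidia-smi state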
@@ -571,7 +571,8 @@ class TestDistBase(unittest.TestCase):
                   envs,
                   check_error_log=False,
                   batch_size=DEFAULT_BATCH_SIZE,
-                   batch_merge_repeat=1):
+                   batch_merge_repeat=1,
+                   log_name=""):
        cmd = self._python_interp
@@ -602,7 +603,7 @@ class TestDistBase(unittest.TestCase):
        print("local_cmd: {}, env: {}".format(cmd, env_local))
        if check_error_log:
-            err_log = open("/tmp/trainer.err.log", "wb")
+            err_log = open(log_name + "_local.log", "wb")
            local_proc = subprocess.Popen(
                cmd.split(" "),
                stdout=subprocess.PIPE,
@@ -625,7 +626,7 @@ class TestDistBase(unittest.TestCase):
        return pickle.loads(local_out)
-    def _run_cluster(self, model, envs, check_error_log):
+    def _run_cluster(self, model, envs, check_error_log, log_name):
        # Run dist train to compare with local results
        ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model,
                                                          check_error_log, envs)
@@ -673,8 +674,8 @@ class TestDistBase(unittest.TestCase):
        print("tr0_cmd: {}, env: {}".format(tr0_cmd, env0))
        print("tr1_cmd: {}, env: {}".format(tr1_cmd, env1))
-        tr0_pipe = open("/tmp/tr0_err.log", "wb")
-        tr1_pipe = open("/tmp/tr1_err.log", "wb")
+        tr0_pipe = open(log_name + "_tr0_err.log", "wb")
+        tr1_pipe = open(log_name + "_tr1_err.log", "wb")
        print_to_err(type(self).__name__, "going to start trainer process 0")
        tr0_proc = subprocess.Popen(
@@ -773,7 +774,7 @@ class TestDistBase(unittest.TestCase):
        return tr_cmd, env
    def _run_cluster_nccl2(self, model, envs, nccl2_reduce_layer,
-                           check_error_log):
+                           check_error_log, log_name):
        if self._use_hallreduce:
            self._ps_endpoints = ""
            for i in range(0, 4):
@@ -798,7 +799,7 @@ class TestDistBase(unittest.TestCase):
            print("use_hallreduce:{} tr_cmd:{}, env: {}".format(
                self._use_hallreduce, tr_cmd, tr_env))
-            tr_pipe = open("/tmp/tr{}_err.log".format(i), "wb")
+            tr_pipe = open(log_name + "_tr{}_err.log".format(i), "wb")
            print_to_err(
                type(self).__name__,
@@ -828,7 +829,8 @@ class TestDistBase(unittest.TestCase):
                         model_file,
                         delta=1e-3,
                         check_error_log=False,
-                         need_envs={}):
+                         need_envs={},
+                         log_name=""):
        # TODO(typhoonzero): should auto adapt GPU count on the machine.
        required_envs = {
            "PATH": os.getenv("PATH", ""),
@@ -845,22 +847,32 @@ class TestDistBase(unittest.TestCase):
        required_envs.update(need_envs)
        if check_error_log:
-            required_envs["GLOG_v"] = "10"
+            required_envs["GLOG_vmodule"] = \
+                "fused_all_reduce_op_handle=10,all_reduce_op_handle=10,alloc_continuous_space_op=10,fuse_all_reduce_op_pass=10,alloc_continuous_space_for_grad_pass=10,fast_threaded_ssa_graph_executor=10"
            required_envs["GLOG_logtostderr"] = "1"
        local_losses \
            = self._run_local(model_file, required_envs,
-                              check_error_log)
+                              check_error_log, log_name=log_name)
        if self._nccl2_mode:
            if self._nccl2_reduce_layer:
                tr0_losses, tr1_losses = self._run_cluster_nccl2(
-                    model_file, required_envs, True, check_error_log)
+                    model_file,
+                    required_envs,
+                    True,
+                    check_error_log,
+                    log_name=log_name)
            else:
                tr0_losses, tr1_losses = self._run_cluster_nccl2(
-                    model_file, required_envs, False, check_error_log)
+                    model_file,
+                    required_envs,
+                    False,
+                    check_error_log,
+                    log_name=log_name)
        else:
            tr0_losses, tr1_losses = self._run_cluster(
-                model_file, required_envs, check_error_log)
+                model_file, required_envs, check_error_log, log_name=log_name)
        for step_id in range(RUN_STEP):
            local_loss = local_losses[step_id]
...
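The practical effect of threading log_name through the runners above is that trainer and pserver logs are prefixed with the calling test file instead of going to fixed /tmp paths, so a failed run can be inspected with something like this (file names illustrative, following the open() calls above):

    # assuming flag_name resolved to ".../test_dist_mnist" in the failing test
    ls test_dist_mnist_local.log test_dist_mnist_tr0_err.log test_dist_mnist_tr1_err.log
    tail -n 50 test_dist_mnist_tr0_err.log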
@@ -18,6 +18,9 @@ import os
import unittest
from test_dist_base import TestDistBase
+import os
+flag_name = os.path.splitext(__file__)[0]
def skip_ci(func):
    on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0')))
@@ -36,7 +39,8 @@ class TestDistCTR2x2(TestDistBase):
        self._enforce_place = "CPU"
    def test_dist_ctr(self):
-        self.check_with_place("dist_ctr.py", delta=1e-2, check_error_log=False)
+        self.check_with_place(
+            "dist_ctr.py", delta=1e-2, check_error_log=True, log_name=flag_name)
    @skip_ci
@@ -51,7 +55,8 @@ class TestDistCTRWithL2Decay2x2(TestDistBase):
            "dist_ctr.py",
            delta=1e-7,
            check_error_log=True,
-            need_envs=need_envs)
+            need_envs=need_envs,
+            log_name=flag_name)
class TestDistCTR2x2_ASYNC(TestDistBase):
@@ -68,7 +73,11 @@ class TestDistCTR2x2_ASYNC(TestDistBase):
        }
        self.check_with_place(
-            "dist_ctr.py", delta=100, check_error_log=True, need_envs=need_envs)
+            "dist_ctr.py",
+            delta=100,
+            check_error_log=True,
+            need_envs=need_envs,
+            log_name=flag_name)
class TestDistCTR2x2_ASYNC2(TestDistBase):
@@ -86,7 +95,11 @@ class TestDistCTR2x2_ASYNC2(TestDistBase):
        }
        self.check_with_place(
-            "dist_ctr.py", delta=100, check_error_log=True, need_envs=need_envs)
+            "dist_ctr.py",
+            delta=100,
+            check_error_log=True,
+            need_envs=need_envs,
+            log_name=flag_name)
if __name__ == "__main__":
...
@@ -16,6 +16,9 @@ from __future__ import print_function
import unittest
from test_dist_base import TestDistBase
+import os
+flag_name = os.path.splitext(__file__)[0]
class TestDistMnist2x2(TestDistBase):
    def _setup_config(self):
@@ -23,7 +26,11 @@ class TestDistMnist2x2(TestDistBase):
        self._use_reduce = False
    def test_dist_train(self):
-        self.check_with_place("dist_mnist.py", delta=1e-5)
+        self.check_with_place(
+            "dist_mnist.py",
+            delta=1e-5,
+            check_error_log=True,
+            log_name=flag_name)
class TestDistMnist2x2WithMemopt(TestDistBase):
@@ -32,7 +39,11 @@ class TestDistMnist2x2WithMemopt(TestDistBase):
        self._mem_opt = True
    def test_dist_train(self):
-        self.check_with_place("dist_mnist.py", delta=1e-5)
+        self.check_with_place(
+            "dist_mnist.py",
+            delta=1e-5,
+            check_error_log=True,
+            log_name=flag_name)
class TestDistMnistAsync(TestDistBase):
@@ -41,7 +52,11 @@ class TestDistMnistAsync(TestDistBase):
        self._use_reduce = False
    def test_dist_train(self):
-        self.check_with_place("dist_mnist.py", delta=200)
+        self.check_with_place(
+            "dist_mnist.py",
+            delta=200,
+            check_error_log=True,
+            log_name=flag_name)
class TestDistMnistDcAsgd(TestDistBase):
@@ -50,7 +65,11 @@ class TestDistMnistDcAsgd(TestDistBase):
        self._dc_asgd = True
    def test_se_resnext(self):
-        self.check_with_place("dist_mnist.py", delta=200)
+        self.check_with_place(
+            "dist_mnist.py",
+            delta=200,
+            check_error_log=True,
+            log_name=flag_name)
# FIXME(typhoonzero): enable these tests once we have 4
...
@@ -17,6 +17,8 @@ import unittest
from test_dist_base import TestDistBase
import os
+flag_name = os.path.splitext(__file__)[0]
class TestDistMnist2x2(TestDistBase):
    def _setup_config(self):
@@ -43,21 +45,24 @@ class TestDistMnist2x2(TestDistBase):
        required_envs.update(need_envs)
        if check_error_log:
-            required_envs["GLOG_v"] = "7"
+            required_envs["GLOG_vmodule"] = \
+                "fused_all_reduce_op_handle=10,all_reduce_op_handle=10,alloc_continuous_space_op=10,fuse_all_reduce_op_pass=10,alloc_continuous_space_for_grad_pass=10,fast_threaded_ssa_graph_executor=10"
            required_envs["GLOG_logtostderr"] = "1"
        no_merge_losses = self._run_local(
            model_file,
            required_envs,
            check_error_log=check_error_log,
-            batch_size=4)
+            batch_size=4,
+            log_name=flag_name)
        batch_merge_losses = self._run_local(
            model_file,
            required_envs,
            check_error_log=check_error_log,
            batch_size=2,
-            batch_merge_repeat=2)
+            batch_merge_repeat=2,
+            log_name=flag_name)
        # Ensure both result have values.
        self.assertGreater(len(no_merge_losses), 1)
        self.assertEqual(len(no_merge_losses), len(batch_merge_losses))
...
@@ -16,6 +16,9 @@ from __future__ import print_function
import unittest
from test_dist_base import TestDistBase
+import os
+flag_name = os.path.splitext(__file__)[0]
class TestDistMnistNCCL2DGC(TestDistBase):
    def _setup_config(self):
@@ -28,7 +31,11 @@ class TestDistMnistNCCL2DGC(TestDistBase):
    def test_dist_train(self):
        import paddle.fluid as fluid
        if fluid.core.is_compiled_with_cuda():
-            self.check_with_place("dist_mnist.py", delta=1e-5)
+            self.check_with_place(
+                "dist_mnist.py",
+                delta=1e-5,
+                check_error_log=True,
+                log_name=flag_name)
if __name__ == "__main__":
...
@@ -16,6 +16,9 @@ from __future__ import print_function
import unittest
from test_dist_base import TestDistBase
+import os
+flag_name = os.path.splitext(__file__)[0]
class TestDistMnistNCCL2HAllreduce(TestDistBase):
    def _setup_config(self):
@@ -28,7 +31,11 @@ class TestDistMnistNCCL2HAllreduce(TestDistBase):
    def test_dist_train(self):
        import paddle.fluid as fluid
        if fluid.core.is_compiled_with_cuda():
-            self.check_with_place("dist_mnist.py", delta=1e-5)
+            self.check_with_place(
+                "dist_mnist.py",
+                delta=1e-5,
+                check_error_log=True,
+                log_name=flag_name)
if __name__ == "__main__":
...
@@ -16,6 +16,9 @@ from __future__ import print_function
import unittest
from test_dist_base import TestDistBase
+import os
+flag_name = os.path.splitext(__file__)[0]
class TestDistMnistNCCL2MultiNCCLComm(TestDistBase):
    def _setup_config(self):
@@ -28,7 +31,11 @@ class TestDistMnistNCCL2MultiNCCLComm(TestDistBase):
    def test_dist_train(self):
        import paddle.fluid as fluid
        if fluid.core.is_compiled_with_cuda():
-            self.check_with_place("dist_mnist.py", delta=1e-5)
+            self.check_with_place(
+                "dist_mnist.py",
+                delta=1e-5,
+                check_error_log=True,
+                log_name=flag_name)
if __name__ == "__main__":
...
@@ -22,6 +22,9 @@ import numpy as np
from test_dist_base import TestDistBase, RUN_STEP
+import os
+flag_name = os.path.splitext(__file__)[0]
class TestDistSaveLoadDense2x2(TestDistBase):
    def _setup_config(self):
@@ -32,7 +35,8 @@ class TestDistSaveLoadDense2x2(TestDistBase):
                         model_file,
                         delta=1e-3,
                         check_error_log=False,
-                         need_envs={}):
+                         need_envs={},
+                         log_name=""):
        required_envs = {
            "PATH": os.getenv("PATH", ""),
            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
@@ -43,7 +47,8 @@ class TestDistSaveLoadDense2x2(TestDistBase):
        required_envs.update(need_envs)
        if check_error_log:
-            required_envs["GLOG_v"] = "3"
+            required_envs["GLOG_vmodule"] = \
+                "fused_all_reduce_op_handle=10,all_reduce_op_handle=10,alloc_continuous_space_op=10,fuse_all_reduce_op_pass=10,alloc_continuous_space_for_grad_pass=10,fast_threaded_ssa_graph_executor=10"
            required_envs["GLOG_logtostderr"] = "1"
        model_dir = tempfile.mkdtemp()
@@ -59,8 +64,8 @@ class TestDistSaveLoadDense2x2(TestDistBase):
        cluster_env.update(required_envs)
        local_var = self._run_local(model_file, local_env, check_error_log)
-        tr0_var, tr1_var = self._run_cluster(model_file, cluster_env,
-                                             check_error_log)
+        tr0_var, tr1_var = self._run_cluster(
+            model_file, cluster_env, check_error_log, log_name=flag_name)
        shutil.rmtree(model_dir)
@@ -95,7 +100,8 @@ class TestDistSaveLoadWithPServerStateDense2x2(TestDistBase):
                         model_file,
                         delta=1e-3,
                         check_error_log=False,
-                         need_envs={}):
+                         need_envs={},
+                         log_name=""):
        required_envs = {
            "PATH": os.getenv("PATH", ""),
            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
@@ -106,7 +112,8 @@ class TestDistSaveLoadWithPServerStateDense2x2(TestDistBase):
        required_envs.update(need_envs)
        if check_error_log:
-            required_envs["GLOG_v"] = "3"
+            required_envs["GLOG_vmodule"] = \
+                "fused_all_reduce_op_handle=10,all_reduce_op_handle=10,alloc_continuous_space_op=10,fuse_all_reduce_op_pass=10,alloc_continuous_space_for_grad_pass=10,fast_threaded_ssa_graph_executor=10"
            required_envs["GLOG_logtostderr"] = "1"
        model_dir = tempfile.mkdtemp()
@@ -117,15 +124,15 @@ class TestDistSaveLoadWithPServerStateDense2x2(TestDistBase):
        save_env["MODEL_DIR"] = model_dir
        save_env.update(required_envs)
-        tr0_var_1, tr1_var_1 = self._run_cluster(model_file, save_env,
-                                                 check_error_log)
+        tr0_var_1, tr1_var_1 = self._run_cluster(
+            model_file, save_env, check_error_log, log_name=flag_name)
        load_env = {}
        load_env["LOAD"] = "1"
        load_env["MODEL_DIR"] = model_dir
        load_env.update(required_envs)
-        tr0_var_2, tr1_var_2 = self._run_cluster(model_file, load_env,
-                                                 check_error_log)
+        tr0_var_2, tr1_var_2 = self._run_cluster(
+            model_file, load_env, check_error_log, log_name=flag_name)
        shutil.rmtree(model_dir)
@@ -149,8 +156,9 @@ class TestDistSaveLoadWithPServerStateDense2x2(TestDistBase):
        self.check_with_place(
            "dist_save_load.py",
            delta=0,
-            check_error_log=False,
-            need_envs=need_envs)
+            check_error_log=True,
+            need_envs=need_envs,
+            log_name=flag_name)
if __name__ == "__main__":
...
@@ -17,6 +17,9 @@ import unittest
from test_dist_base import TestDistBase
import os
+import os
+flag_name = os.path.splitext(__file__)[0]
def skip_ci(func):
    on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0')))
@@ -41,7 +44,11 @@ class TestDistSeResnetNCCL2DGC(TestDistBase):
    def test_dist_train(self):
        import paddle.fluid as fluid
        if fluid.core.is_compiled_with_cuda():
-            self.check_with_place("dist_se_resnext.py", delta=30)
+            self.check_with_place(
+                "dist_se_resnext.py",
+                delta=30,
+                check_error_log=True,
+                log_name=flag_name)
if __name__ == "__main__":
...
@@ -17,6 +17,9 @@ import unittest
from test_dist_base import TestDistBase
import os
+import os
+flag_name = os.path.splitext(__file__)[0]
def skip_ci(func):
    on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0')))
@@ -39,7 +42,11 @@ class TestDistSeResneXtNCCL(TestDistBase):
    def test_dist_train(self):
        import paddle.fluid as fluid
        if fluid.core.is_compiled_with_cuda():
-            self.check_with_place("dist_se_resnext.py", delta=1e-5)
+            self.check_with_place(
+                "dist_se_resnext.py",
+                delta=1e-5,
+                check_error_log=True,
+                log_name=flag_name)
class TestDistSeResneXtNCCLMP(TestDistBase):
@@ -57,7 +64,8 @@ class TestDistSeResneXtNCCLMP(TestDistBase):
            "dist_se_resnext.py",
            delta=1e-5,
            check_error_log=True,
-            need_envs={"NCCL_P2P_DISABLE": "1"})
+            need_envs={"NCCL_P2P_DISABLE": "1"},
+            log_name=flag_name)
if __name__ == "__main__":
...
@@ -17,6 +17,9 @@ import unittest
from test_dist_base import TestDistBase
import os
+import os
+flag_name = os.path.splitext(__file__)[0]
def skip_ci(func):
    on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0')))
@@ -36,7 +39,11 @@ class TestDistSeResneXt2x2(TestDistBase):
    @skip_ci
    def test_dist_train(self):
-        self.check_with_place("dist_se_resnext.py", delta=1e-7)
+        self.check_with_place(
+            "dist_se_resnext.py",
+            delta=1e-7,
+            check_error_log=True,
+            log_name=flag_name)
if __name__ == "__main__":
...
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
from test_dist_base import TestDistBase
import os
def skip_ci(func):
    on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0')))

    def __func__(*args, **kwargs):
        if on_ci:
            return
        return func(*args, **kwargs)

    return __func__


class TestDistseResnXt2x2WithMemopt(TestDistBase):
    def _setup_config(self):
        self._sync_mode = True
        self._mem_opt = True
        self._use_reader_alloc = False

    @skip_ci
    def test_dist_train(self):
        self.check_with_place("dist_se_resnext.py", delta=1e-7)


if __name__ == "__main__":
    unittest.main()
@@ -18,6 +18,9 @@ import unittest
from test_dist_base import TestDistBase
+import os
+flag_name = os.path.splitext(__file__)[0]
class TestDistSimnetBowDense2x2(TestDistBase):
    def _setup_config(self):
@@ -34,7 +37,8 @@ class TestDistSimnetBowDense2x2(TestDistBase):
            "dist_simnet_bow.py",
            delta=1e-5,
            check_error_log=True,
-            need_envs=need_envs)
+            need_envs=need_envs,
+            log_name=flag_name)
class TestDistSimnetBow2x2DenseAsync(TestDistBase):
@@ -52,8 +56,9 @@ class TestDistSimnetBow2x2DenseAsync(TestDistBase):
        self.check_with_place(
            "dist_simnet_bow.py",
            delta=100,
-            check_error_log=False,
-            need_envs=need_envs)
+            check_error_log=True,
+            need_envs=need_envs,
+            log_name=flag_name)
class TestDistSimnetBowSparse2x2(TestDistBase):
@@ -70,8 +75,9 @@ class TestDistSimnetBowSparse2x2(TestDistBase):
        self.check_with_place(
            "dist_simnet_bow.py",
            delta=1e-5,
-            check_error_log=False,
-            need_envs=need_envs)
+            check_error_log=True,
+            need_envs=need_envs,
+            log_name=flag_name)
class TestDistSimnetBow2x2SparseAsync(TestDistBase):
@@ -88,8 +94,9 @@ class TestDistSimnetBow2x2SparseAsync(TestDistBase):
        self.check_with_place(
            "dist_simnet_bow.py",
            delta=100,
-            check_error_log=False,
-            need_envs=need_envs)
+            check_error_log=True,
+            need_envs=need_envs,
+            log_name=flag_name)
# FIXME(tangwei): Learningrate variable is not created on pserver.
@@ -108,7 +115,8 @@ class TestDistSimnetBow2x2LookupTableSync(TestDistBase):
            "dist_simnet_bow.py",
            delta=1e-5,
            check_error_log=True,
-            need_envs=need_envs)
+            need_envs=need_envs,
+            log_name=flag_name)
class TestDistSimnetBow2x2LookupTableAsync(TestDistBase):
@@ -125,8 +133,9 @@ class TestDistSimnetBow2x2LookupTableAsync(TestDistBase):
        self.check_with_place(
            "dist_simnet_bow.py",
            delta=100,
-            check_error_log=False,
-            need_envs=need_envs)
+            check_error_log=True,
+            need_envs=need_envs,
+            log_name=flag_name)
class TestDistSimnetBow2x2LookupTableNotContainLRSync(TestDistBase):
@@ -143,8 +152,9 @@ class TestDistSimnetBow2x2LookupTableNotContainLRSync(TestDistBase):
        self.check_with_place(
            "dist_simnet_bow.py",
            delta=1e-5,
-            check_error_log=False,
-            need_envs=need_envs)
+            check_error_log=True,
+            need_envs=need_envs,
+            log_name=flag_name)
if __name__ == "__main__":
...
@@ -17,6 +17,9 @@ import os
import unittest
from test_dist_base import TestDistBase
+import os
+flag_name = os.path.splitext(__file__)[0]
class TestDistTextClassification2x2(TestDistBase):
    def _setup_config(self):
@@ -24,7 +27,11 @@ class TestDistTextClassification2x2(TestDistBase):
        self._enforce_place = "CPU"
    def test_text_classification(self):
-        self.check_with_place("dist_text_classification.py", delta=1e-6)
+        self.check_with_place(
+            "dist_text_classification.py",
+            delta=1e-6,
+            check_error_log=True,
+            log_name=flag_name)
class TestDistTextClassification2x2Async(TestDistBase):
@@ -33,7 +40,11 @@ class TestDistTextClassification2x2Async(TestDistBase):
        self._enforce_place = "CPU"
    def test_se_resnext(self):
-        self.check_with_place("dist_text_classification.py", delta=100)
+        self.check_with_place(
+            "dist_text_classification.py",
+            delta=100,
+            check_error_log=True,
+            log_name=flag_name)
if __name__ == "__main__":
...
@@ -16,6 +16,9 @@ from __future__ import print_function
import unittest
from test_dist_base import TestDistBase
+import os
+flag_name = os.path.splitext(__file__)[0]
class TestDistW2V2x2(TestDistBase):
    def _setup_config(self):
@@ -23,7 +26,11 @@ class TestDistW2V2x2(TestDistBase):
        self._enforce_place = "CPU"
    def test_dist_train(self):
-        self.check_with_place("dist_word2vec.py", delta=1e-4)
+        self.check_with_place(
+            "dist_word2vec.py",
+            delta=1e-4,
+            check_error_log=True,
+            log_name=flag_name)
class TestDistW2V2x2WithMemOpt(TestDistBase):
@@ -33,7 +40,11 @@ class TestDistW2V2x2WithMemOpt(TestDistBase):
        self._enforce_place = "CPU"
    def test_dist_train(self):
-        self.check_with_place("dist_word2vec.py", delta=1e-4)
+        self.check_with_place(
+            "dist_word2vec.py",
+            delta=1e-4,
+            check_error_log=True,
+            log_name=flag_name)
class TestDistW2V2x2Async(TestDistBase):
@@ -42,7 +53,11 @@ class TestDistW2V2x2Async(TestDistBase):
        self._enforce_place = "CPU"
    def test_dist_train(self):
-        self.check_with_place("dist_word2vec.py", delta=100)
+        self.check_with_place(
+            "dist_word2vec.py",
+            delta=100,
+            check_error_log=True,
+            log_name=flag_name)
if __name__ == "__main__":
...