From d4f11bd585404201469f12bcf9c4d39796878ac6 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Fri, 11 Oct 2019 09:34:01 +0800 Subject: [PATCH] Add bash_test_modules function to capture the timeout or failed context. (#20355) * cherry pick test=develop * cleanup test=develop test=release/1.6 * cleanup test=develop test=release/1.6 --- .../fluid/tests/unittests/CMakeLists.txt | 117 ++++++------------ .../paddle/fluid/tests/unittests/dist_test.sh | 48 +++++++ .../fluid/tests/unittests/test_dist_base.py | 38 ++++-- .../fluid/tests/unittests/test_dist_ctr.py | 21 +++- .../fluid/tests/unittests/test_dist_mnist.py | 27 +++- .../unittests/test_dist_mnist_batch_merge.py | 11 +- .../unittests/test_dist_mnist_dgc_nccl.py | 9 +- .../unittests/test_dist_mnist_hallreduce.py | 9 +- .../unittests/test_dist_mnist_multi_comm.py | 9 +- .../tests/unittests/test_dist_save_load.py | 32 +++-- .../unittests/test_dist_se_resnext_dgc.py | 9 +- .../unittests/test_dist_se_resnext_nccl.py | 12 +- .../unittests/test_dist_se_resnext_sync.py | 9 +- .../test_dist_se_resnext_sync_with_memopt.py | 44 ------- .../tests/unittests/test_dist_simnet_bow.py | 34 +++-- .../test_dist_text_classification.py | 15 ++- .../tests/unittests/test_dist_word2vec.py | 21 +++- 17 files changed, 279 insertions(+), 186 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/dist_test.sh delete mode 100644 python/paddle/fluid/tests/unittests/test_dist_se_resnext_sync_with_memopt.py diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 8b7375edeb5..d63a575b9ef 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -3,30 +3,20 @@ string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") set(GC_ENVS FLAGS_eager_delete_tensor_gb=0.0 FLAGS_fast_eager_deletion_mode=1 FLAGS_memory_fraction_of_eager_deletion=1.0) set(dist_ENVS http_proxy="" https_proxy="") -if(NOT WITH_DISTRIBUTE) - list(REMOVE_ITEM TEST_OPS test_recv_op) - list(REMOVE_ITEM TEST_OPS test_dist_transpiler) - list(REMOVE_ITEM TEST_OPS test_simple_dist_transpiler) - list(REMOVE_ITEM TEST_OPS test_listen_and_serv_op) - LIST(REMOVE_ITEM TEST_OPS test_dist_mnist) - LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_fleetapi) - LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_dgc_nccl) - LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_hallreduce) - LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_multi_comm) - LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_ring_allreduce) - LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_backward_deps) - LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_lars) - LIST(REMOVE_ITEM TEST_OPS test_dist_word2vec) - LIST(REMOVE_ITEM TEST_OPS test_dist_ctr) - LIST(REMOVE_ITEM TEST_OPS test_dist_simnet_bow) - LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_batch_merge) - LIST(REMOVE_ITEM TEST_OPS test_dist_text_classification) - LIST(REMOVE_ITEM TEST_OPS test_nce_remote_table_op) - LIST(REMOVE_ITEM TEST_OPS test_hsigmoid_remote_table_op) - LIST(REMOVE_ITEM TEST_OPS test_dist_fleet_ctr) - LIST(REMOVE_ITEM TEST_OPS test_dist_fleet_geo) -endif(NOT WITH_DISTRIBUTE) - +file(GLOB DIST_TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_dist_*.py") +string(REPLACE ".py" "" DIST_TEST_OPS "${DIST_TEST_OPS}") +set(MIXED_DIST_TEST_OPS ${DIST_TEST_OPS}) +#remove distribute unittests. +list(APPEND MIXED_DIST_TEST_OPS test_dgc_op) +list(APPEND MIXED_DIST_TEST_OPS test_simple_dist_transpiler) +list(APPEND MIXED_DIST_TEST_OPS test_listen_and_serv_op) +list(APPEND MIXED_DIST_TEST_OPS test_nce_remote_table_op) +list(APPEND MIXED_DIST_TEST_OPS test_hsigmoid_remote_table_op) +list(APPEND MIXED_DIST_TEST_OPS test_lookup_remote_table_op) +list(APPEND MIXED_DIST_TEST_OPS test_launch) +foreach(TEST_OP ${MIXED_DIST_TEST_OPS}) + list(REMOVE_ITEM TEST_OPS ${TEST_OP}) +endforeach() if(NOT WITH_GPU OR WIN32) LIST(REMOVE_ITEM TEST_OPS test_c_comm_init_all_op) @@ -43,8 +33,6 @@ if(WIN32) LIST(REMOVE_ITEM TEST_OPS test_avoid_twice_initialization) endif() -LIST(REMOVE_ITEM TEST_OPS test_launch) - if (NOT ${WITH_GPU}) LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op) LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_mnist) # TODO(Yancey1989): parallel dygraph support CPU device in future @@ -70,14 +58,7 @@ if(APPLE) message(WARNING "These tests has been disabled in OSX before being fixed:\n test_fuse_elewise_add_act_pass \n test_detection_map_op \n test_dist_se_resnext_*") # this op is not support on mac list(REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op) - # TODO: add the unitest back when it fixed list(REMOVE_ITEM TEST_OPS test_detection_map_op) - list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_dgc) - list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_sync) - list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_async) - list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_sync_with_memopt) - # TODO(tangwei12): add the unitest back when it fixed - list(REMOVE_ITEM TEST_OPS test_dist_word2vec) list(REMOVE_ITEM TEST_OPS test_fuse_elewise_add_act_pass) endif() if(NOT WITH_MKLML) @@ -130,39 +111,36 @@ function(bash_test_modules TARGET_NAME) set(options SERIAL) set(oneValueArgs "") - set(multiValueArgs MODULES DEPS ENVS) + set(multiValueArgs MODULES DEPS ENVS LABELS) cmake_parse_arguments(bash_test_modules "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - message(STATUS "CMAKE_CURRENT_BINARY_DIR:" ${CMAKE_CURRENT_BINARY_DIR}) + set(timeout 350) + if(${bash_test_modules_TIMEOUT}) + set(timeout ${bash_test_modules_TIMEOUT}) + endif() add_test(NAME ${TARGET_NAME} - COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${bash_test_modules_ENVS} + COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python + TEST_TARGET_NAME=${TARGET_NAME} TEST_TIMEOUT=${timeout} ${bash_test_modules_ENVS} bash ${CMAKE_CURRENT_BINARY_DIR}/${bash_test_modules_MODULES} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + if (bash_test_modules_SERIAL) set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1) endif() - set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600) + + if(bash_test_modules_LABELS) + set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT ${timeout} LABELS ${bash_test_modules_LABELS}) + else() + set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT ${timeout}) + endif() endfunction() list(REMOVE_ITEM TEST_OPS test_warpctc_op) -LIST(REMOVE_ITEM TEST_OPS test_lookup_remote_table_op) -LIST(REMOVE_ITEM TEST_OPS test_hsigmoid_remote_table_op) -LIST(REMOVE_ITEM TEST_OPS test_nce_remote_table_op) -LIST(REMOVE_ITEM TEST_OPS test_dist_train) -LIST(REMOVE_ITEM TEST_OPS test_listen_and_serv_op) -list(REMOVE_ITEM TEST_OPS test_dist_transpiler) list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf) list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf_auto_growth) list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed) -list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_dgc) -list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_sync) -list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_async) -list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_sync_with_memopt) -list(REMOVE_ITEM TEST_OPS test_dgc_op) -list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_nccl) -list(REMOVE_ITEM TEST_OPS test_dist_transformer) list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer) list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer_auto_growth) list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op) @@ -255,49 +233,24 @@ py_test_modules(test_install_check MODULES test_install_check ENVS set_tests_properties(test_install_check PROPERTIES LABELS "RUN_TYPE=DIST") py_test_modules(test_imperative_debug_string MODULES test_imperative_debug_string ENVS FLAGS_dygraph_debug=1) if(WITH_DISTRIBUTE) - py_test_modules(test_dist_train MODULES test_dist_train ENVS ${dist_ENVS}) + # FIXME(typhoonzero): add these tests back + list(REMOVE_ITEM DIST_TEST_OPS "test_dist_transformer") + list(REMOVE_ITEM DIST_TEST_OPS "test_dist_transpiler") + py_test_modules(test_lookup_remote_table_op MODULES test_lookup_remote_table_op ENVS ${dist_ENVS}) py_test_modules(test_hsigmoid_remote_table_op MODULES test_hsigmoid_remote_table_op ENVS ${dist_ENVS}) py_test_modules(test_nce_remote_table_op MODULES test_nce_remote_table_op ENVS ${dist_ENVS}) - #py_test_modules(test_listen_and_serv_op MODULES test_listen_and_serv_op ENVS ${dist_ENVS}) if(WITH_DGC) py_test_modules(test_dgc_op MODULES test_dgc_op) endif() if(NOT APPLE) bash_test_modules(test_listen_and_serv_op MODULES test_listen_and_serv.sh) - set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 100 LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE) - set_tests_properties(test_nce_remote_table_op test_hsigmoid_remote_table_op test_dist_ctr test_dist_fleet_ctr test_dist_mnist_batch_merge PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE) - - set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE) - set_tests_properties(test_dist_mnist_dgc_nccl PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE) - set_tests_properties(test_dist_mnist_hallreduce PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE) - set_tests_properties(test_dist_mnist_multi_comm PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE) - set_tests_properties(test_dist_mnist_ring_allreduce PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE) - set_tests_properties(test_dist_mnist_backward_deps PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE) - set_tests_properties(test_dist_mnist_fleetapi PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE) - set_tests_properties(test_dist_mnist_lars PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE) - set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE ) - set_tests_properties(test_dist_simnet_bow PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE) - set_tests_properties(test_dist_text_classification PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE) - - list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_dgc) - list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_sync) - list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_async) - list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_sync_with_memopt) - - py_test_modules(test_dist_se_resnext_dgc MODULES test_dist_se_resnext_dgc) - py_test_modules(test_dist_se_resnext_sync MODULES test_dist_se_resnext_sync) - py_test_modules(test_dist_se_resnext_nccl MODULES test_dist_se_resnext_nccl) bash_test_modules(test_launch MODULES test_launch.sh) - # FIXME(typhoonzero): add these tests back - # py_test_modules(test_dist_transformer MODULES test_dist_transformer) - # set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000) - set_tests_properties(test_dist_se_resnext_dgc PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE) - set_tests_properties(test_dist_se_resnext_sync PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE) - set_tests_properties(test_dist_se_resnext_nccl PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE) + foreach(TEST_OP ${DIST_TEST_OPS}) + bash_test_modules(${TEST_OP} MODULES dist_test.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE") + endforeach(TEST_OP) endif(NOT APPLE) - # py_test_modules(test_dist_transpiler MODULES test_dist_transpiler) endif() py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf) diff --git a/python/paddle/fluid/tests/unittests/dist_test.sh b/python/paddle/fluid/tests/unittests/dist_test.sh new file mode 100644 index 00000000000..f1f6788ce7b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dist_test.sh @@ -0,0 +1,48 @@ +#!/bin/bash +unset https_proxy http_proxy + +name=${TEST_TARGET_NAME} +TEST_TIMEOUT=${TEST_TIMEOUT} + +if [[ ${name}"x" == "x" ]]; then + echo "can't find ${name}, please set ${TEST_TARGET_NAME} first" + exit 1 +fi + +if [[ ${TEST_TIMEOUT}"x" == "x" ]]; then + echo "can't find ${TEST_TIMEOUT}, please set ${TEST_TIMEOUT} first" + exit 1 +fi + +# rm flag file +rm -f ${name}*.log + +# start the unit test +run_time=$(( $TEST_TIMEOUT - 10 )) +echo "run_time: ${run_time}" +timeout -s SIGKILL ${run_time} python -u ${name}.py > ${name}_run.log 2>&1 +exit_code=$? +if [[ $exit_code -eq 0 ]]; then + exit 0 +fi + +echo "${name} faild with ${exit_code}" + +# paddle log +echo "${name} log" +cat -n ${name}*.log + +#display system context +for i in {1..2}; do + sleep 2 + ps -ef | grep -E "(test_|_test)" + + if hash "nvidia-smi" > /dev/null; then + nvidia-smi + fi +done + +#display /tmp/files +ls -l /tmp/paddle.* + +exit 1 diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 1c697e4e66d..49cf07d67b2 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -571,7 +571,8 @@ class TestDistBase(unittest.TestCase): envs, check_error_log=False, batch_size=DEFAULT_BATCH_SIZE, - batch_merge_repeat=1): + batch_merge_repeat=1, + log_name=""): cmd = self._python_interp @@ -602,7 +603,7 @@ class TestDistBase(unittest.TestCase): print("local_cmd: {}, env: {}".format(cmd, env_local)) if check_error_log: - err_log = open("/tmp/trainer.err.log", "wb") + err_log = open(log_name + "_local.log", "wb") local_proc = subprocess.Popen( cmd.split(" "), stdout=subprocess.PIPE, @@ -625,7 +626,7 @@ class TestDistBase(unittest.TestCase): return pickle.loads(local_out) - def _run_cluster(self, model, envs, check_error_log): + def _run_cluster(self, model, envs, check_error_log, log_name): # Run dist train to compare with local results ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model, check_error_log, envs) @@ -673,8 +674,8 @@ class TestDistBase(unittest.TestCase): print("tr0_cmd: {}, env: {}".format(tr0_cmd, env0)) print("tr1_cmd: {}, env: {}".format(tr1_cmd, env1)) - tr0_pipe = open("/tmp/tr0_err.log", "wb") - tr1_pipe = open("/tmp/tr1_err.log", "wb") + tr0_pipe = open(log_name + "_tr0_err.log", "wb") + tr1_pipe = open(log_name + "_tr1_err.log", "wb") print_to_err(type(self).__name__, "going to start trainer process 0") tr0_proc = subprocess.Popen( @@ -773,7 +774,7 @@ class TestDistBase(unittest.TestCase): return tr_cmd, env def _run_cluster_nccl2(self, model, envs, nccl2_reduce_layer, - check_error_log): + check_error_log, log_name): if self._use_hallreduce: self._ps_endpoints = "" for i in range(0, 4): @@ -798,7 +799,7 @@ class TestDistBase(unittest.TestCase): print("use_hallreduce:{} tr_cmd:{}, env: {}".format( self._use_hallreduce, tr_cmd, tr_env)) - tr_pipe = open("/tmp/tr{}_err.log".format(i), "wb") + tr_pipe = open(log_name + "_tr{}_err.log".format(i), "wb") print_to_err( type(self).__name__, @@ -828,7 +829,8 @@ class TestDistBase(unittest.TestCase): model_file, delta=1e-3, check_error_log=False, - need_envs={}): + need_envs={}, + log_name=""): # TODO(typhoonzero): should auto adapt GPU count on the machine. required_envs = { "PATH": os.getenv("PATH", ""), @@ -845,22 +847,32 @@ class TestDistBase(unittest.TestCase): required_envs.update(need_envs) if check_error_log: - required_envs["GLOG_v"] = "10" + required_envs["GLOG_vmodule"] = \ + "fused_all_reduce_op_handle=10,all_reduce_op_handle=10,alloc_continuous_space_op=10,fuse_all_reduce_op_pass=10,alloc_continuous_space_for_grad_pass=10,fast_threaded_ssa_graph_executor=10" required_envs["GLOG_logtostderr"] = "1" local_losses \ = self._run_local(model_file, required_envs, - check_error_log) + check_error_log, log_name=log_name) + if self._nccl2_mode: if self._nccl2_reduce_layer: tr0_losses, tr1_losses = self._run_cluster_nccl2( - model_file, required_envs, True, check_error_log) + model_file, + required_envs, + True, + check_error_log, + log_name=log_name) else: tr0_losses, tr1_losses = self._run_cluster_nccl2( - model_file, required_envs, False, check_error_log) + model_file, + required_envs, + False, + check_error_log, + log_name=log_name) else: tr0_losses, tr1_losses = self._run_cluster( - model_file, required_envs, check_error_log) + model_file, required_envs, check_error_log, log_name=log_name) for step_id in range(RUN_STEP): local_loss = local_losses[step_id] diff --git a/python/paddle/fluid/tests/unittests/test_dist_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_ctr.py index a108631df70..3bdf28bf9ac 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_ctr.py +++ b/python/paddle/fluid/tests/unittests/test_dist_ctr.py @@ -18,6 +18,9 @@ import os import unittest from test_dist_base import TestDistBase +import os +flag_name = os.path.splitext(__file__)[0] + def skip_ci(func): on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0'))) @@ -36,7 +39,8 @@ class TestDistCTR2x2(TestDistBase): self._enforce_place = "CPU" def test_dist_ctr(self): - self.check_with_place("dist_ctr.py", delta=1e-2, check_error_log=False) + self.check_with_place( + "dist_ctr.py", delta=1e-2, check_error_log=True, log_name=flag_name) @skip_ci @@ -51,7 +55,8 @@ class TestDistCTRWithL2Decay2x2(TestDistBase): "dist_ctr.py", delta=1e-7, check_error_log=True, - need_envs=need_envs) + need_envs=need_envs, + log_name=flag_name) class TestDistCTR2x2_ASYNC(TestDistBase): @@ -68,7 +73,11 @@ class TestDistCTR2x2_ASYNC(TestDistBase): } self.check_with_place( - "dist_ctr.py", delta=100, check_error_log=True, need_envs=need_envs) + "dist_ctr.py", + delta=100, + check_error_log=True, + need_envs=need_envs, + log_name=flag_name) class TestDistCTR2x2_ASYNC2(TestDistBase): @@ -86,7 +95,11 @@ class TestDistCTR2x2_ASYNC2(TestDistBase): } self.check_with_place( - "dist_ctr.py", delta=100, check_error_log=True, need_envs=need_envs) + "dist_ctr.py", + delta=100, + check_error_log=True, + need_envs=need_envs, + log_name=flag_name) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist.py b/python/paddle/fluid/tests/unittests/test_dist_mnist.py index 89bbc69fa88..6042dfa4efd 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py @@ -16,6 +16,9 @@ from __future__ import print_function import unittest from test_dist_base import TestDistBase +import os +flag_name = os.path.splitext(__file__)[0] + class TestDistMnist2x2(TestDistBase): def _setup_config(self): @@ -23,7 +26,11 @@ class TestDistMnist2x2(TestDistBase): self._use_reduce = False def test_dist_train(self): - self.check_with_place("dist_mnist.py", delta=1e-5) + self.check_with_place( + "dist_mnist.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) class TestDistMnist2x2WithMemopt(TestDistBase): @@ -32,7 +39,11 @@ class TestDistMnist2x2WithMemopt(TestDistBase): self._mem_opt = True def test_dist_train(self): - self.check_with_place("dist_mnist.py", delta=1e-5) + self.check_with_place( + "dist_mnist.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) class TestDistMnistAsync(TestDistBase): @@ -41,7 +52,11 @@ class TestDistMnistAsync(TestDistBase): self._use_reduce = False def test_dist_train(self): - self.check_with_place("dist_mnist.py", delta=200) + self.check_with_place( + "dist_mnist.py", + delta=200, + check_error_log=True, + log_name=flag_name) class TestDistMnistDcAsgd(TestDistBase): @@ -50,7 +65,11 @@ class TestDistMnistDcAsgd(TestDistBase): self._dc_asgd = True def test_se_resnext(self): - self.check_with_place("dist_mnist.py", delta=200) + self.check_with_place( + "dist_mnist.py", + delta=200, + check_error_log=True, + log_name=flag_name) # FIXME(typhoonzero): enable these tests once we have 4 diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_batch_merge.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_batch_merge.py index 22d4b792903..24c9b9a1397 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist_batch_merge.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_batch_merge.py @@ -17,6 +17,8 @@ import unittest from test_dist_base import TestDistBase import os +flag_name = os.path.splitext(__file__)[0] + class TestDistMnist2x2(TestDistBase): def _setup_config(self): @@ -43,21 +45,24 @@ class TestDistMnist2x2(TestDistBase): required_envs.update(need_envs) if check_error_log: - required_envs["GLOG_v"] = "7" + required_envs["GLOG_vmodule"] = \ + "fused_all_reduce_op_handle=10,all_reduce_op_handle=10,alloc_continuous_space_op=10,fuse_all_reduce_op_pass=10,alloc_continuous_space_for_grad_pass=10,fast_threaded_ssa_graph_executor=10" required_envs["GLOG_logtostderr"] = "1" no_merge_losses = self._run_local( model_file, required_envs, check_error_log=check_error_log, - batch_size=4) + batch_size=4, + log_name=flag_name) batch_merge_losses = self._run_local( model_file, required_envs, check_error_log=check_error_log, batch_size=2, - batch_merge_repeat=2) + batch_merge_repeat=2, + log_name=flag_name) # Ensure both result have values. self.assertGreater(len(no_merge_losses), 1) self.assertEqual(len(no_merge_losses), len(batch_merge_losses)) diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py index 529bd330ac9..aaa43ec10bd 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py @@ -16,6 +16,9 @@ from __future__ import print_function import unittest from test_dist_base import TestDistBase +import os +flag_name = os.path.splitext(__file__)[0] + class TestDistMnistNCCL2DGC(TestDistBase): def _setup_config(self): @@ -28,7 +31,11 @@ class TestDistMnistNCCL2DGC(TestDistBase): def test_dist_train(self): import paddle.fluid as fluid if fluid.core.is_compiled_with_cuda(): - self.check_with_place("dist_mnist.py", delta=1e-5) + self.check_with_place( + "dist_mnist.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_hallreduce.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_hallreduce.py index 247e4c0500f..cc002582371 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist_hallreduce.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_hallreduce.py @@ -16,6 +16,9 @@ from __future__ import print_function import unittest from test_dist_base import TestDistBase +import os +flag_name = os.path.splitext(__file__)[0] + class TestDistMnistNCCL2HAllreduce(TestDistBase): def _setup_config(self): @@ -28,7 +31,11 @@ class TestDistMnistNCCL2HAllreduce(TestDistBase): def test_dist_train(self): import paddle.fluid as fluid if fluid.core.is_compiled_with_cuda(): - self.check_with_place("dist_mnist.py", delta=1e-5) + self.check_with_place( + "dist_mnist.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_multi_comm.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_multi_comm.py index d0a21fe0dca..f43ccc8becb 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist_multi_comm.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_multi_comm.py @@ -16,6 +16,9 @@ from __future__ import print_function import unittest from test_dist_base import TestDistBase +import os +flag_name = os.path.splitext(__file__)[0] + class TestDistMnistNCCL2MultiNCCLComm(TestDistBase): def _setup_config(self): @@ -28,7 +31,11 @@ class TestDistMnistNCCL2MultiNCCLComm(TestDistBase): def test_dist_train(self): import paddle.fluid as fluid if fluid.core.is_compiled_with_cuda(): - self.check_with_place("dist_mnist.py", delta=1e-5) + self.check_with_place( + "dist_mnist.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_dist_save_load.py b/python/paddle/fluid/tests/unittests/test_dist_save_load.py index 8c2d6d9b4dc..ed71a389756 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_dist_save_load.py @@ -22,6 +22,9 @@ import numpy as np from test_dist_base import TestDistBase, RUN_STEP +import os +flag_name = os.path.splitext(__file__)[0] + class TestDistSaveLoadDense2x2(TestDistBase): def _setup_config(self): @@ -32,7 +35,8 @@ class TestDistSaveLoadDense2x2(TestDistBase): model_file, delta=1e-3, check_error_log=False, - need_envs={}): + need_envs={}, + log_name=""): required_envs = { "PATH": os.getenv("PATH", ""), "PYTHONPATH": os.getenv("PYTHONPATH", ""), @@ -43,7 +47,8 @@ class TestDistSaveLoadDense2x2(TestDistBase): required_envs.update(need_envs) if check_error_log: - required_envs["GLOG_v"] = "3" + required_envs["GLOG_vmodule"] = \ + "fused_all_reduce_op_handle=10,all_reduce_op_handle=10,alloc_continuous_space_op=10,fuse_all_reduce_op_pass=10,alloc_continuous_space_for_grad_pass=10,fast_threaded_ssa_graph_executor=10" required_envs["GLOG_logtostderr"] = "1" model_dir = tempfile.mkdtemp() @@ -59,8 +64,8 @@ class TestDistSaveLoadDense2x2(TestDistBase): cluster_env.update(required_envs) local_var = self._run_local(model_file, local_env, check_error_log) - tr0_var, tr1_var = self._run_cluster(model_file, cluster_env, - check_error_log) + tr0_var, tr1_var = self._run_cluster( + model_file, cluster_env, check_error_log, log_name=flag_name) shutil.rmtree(model_dir) @@ -95,7 +100,8 @@ class TestDistSaveLoadWithPServerStateDense2x2(TestDistBase): model_file, delta=1e-3, check_error_log=False, - need_envs={}): + need_envs={}, + log_name=""): required_envs = { "PATH": os.getenv("PATH", ""), "PYTHONPATH": os.getenv("PYTHONPATH", ""), @@ -106,7 +112,8 @@ class TestDistSaveLoadWithPServerStateDense2x2(TestDistBase): required_envs.update(need_envs) if check_error_log: - required_envs["GLOG_v"] = "3" + required_envs["GLOG_vmodule"] = \ + "fused_all_reduce_op_handle=10,all_reduce_op_handle=10,alloc_continuous_space_op=10,fuse_all_reduce_op_pass=10,alloc_continuous_space_for_grad_pass=10,fast_threaded_ssa_graph_executor=10" required_envs["GLOG_logtostderr"] = "1" model_dir = tempfile.mkdtemp() @@ -117,15 +124,15 @@ class TestDistSaveLoadWithPServerStateDense2x2(TestDistBase): save_env["MODEL_DIR"] = model_dir save_env.update(required_envs) - tr0_var_1, tr1_var_1 = self._run_cluster(model_file, save_env, - check_error_log) + tr0_var_1, tr1_var_1 = self._run_cluster( + model_file, save_env, check_error_log, log_name=flag_name) load_env = {} load_env["LOAD"] = "1" load_env["MODEL_DIR"] = model_dir load_env.update(required_envs) - tr0_var_2, tr1_var_2 = self._run_cluster(model_file, load_env, - check_error_log) + tr0_var_2, tr1_var_2 = self._run_cluster( + model_file, load_env, check_error_log, log_name=flag_name) shutil.rmtree(model_dir) @@ -149,8 +156,9 @@ class TestDistSaveLoadWithPServerStateDense2x2(TestDistBase): self.check_with_place( "dist_save_load.py", delta=0, - check_error_log=False, - need_envs=need_envs) + check_error_log=True, + need_envs=need_envs, + log_name=flag_name) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext_dgc.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext_dgc.py index 24ef0736a01..d929ccea648 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext_dgc.py +++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext_dgc.py @@ -17,6 +17,9 @@ import unittest from test_dist_base import TestDistBase import os +import os +flag_name = os.path.splitext(__file__)[0] + def skip_ci(func): on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0'))) @@ -41,7 +44,11 @@ class TestDistSeResnetNCCL2DGC(TestDistBase): def test_dist_train(self): import paddle.fluid as fluid if fluid.core.is_compiled_with_cuda(): - self.check_with_place("dist_se_resnext.py", delta=30) + self.check_with_place( + "dist_se_resnext.py", + delta=30, + check_error_log=True, + log_name=flag_name) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext_nccl.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext_nccl.py index 3e55efb633d..f557bcb09f7 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext_nccl.py +++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext_nccl.py @@ -17,6 +17,9 @@ import unittest from test_dist_base import TestDistBase import os +import os +flag_name = os.path.splitext(__file__)[0] + def skip_ci(func): on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0'))) @@ -39,7 +42,11 @@ class TestDistSeResneXtNCCL(TestDistBase): def test_dist_train(self): import paddle.fluid as fluid if fluid.core.is_compiled_with_cuda(): - self.check_with_place("dist_se_resnext.py", delta=1e-5) + self.check_with_place( + "dist_se_resnext.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) class TestDistSeResneXtNCCLMP(TestDistBase): @@ -57,7 +64,8 @@ class TestDistSeResneXtNCCLMP(TestDistBase): "dist_se_resnext.py", delta=1e-5, check_error_log=True, - need_envs={"NCCL_P2P_DISABLE": "1"}) + need_envs={"NCCL_P2P_DISABLE": "1"}, + log_name=flag_name) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext_sync.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext_sync.py index 23987f4eff4..044a3706c64 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext_sync.py +++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext_sync.py @@ -17,6 +17,9 @@ import unittest from test_dist_base import TestDistBase import os +import os +flag_name = os.path.splitext(__file__)[0] + def skip_ci(func): on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0'))) @@ -36,7 +39,11 @@ class TestDistSeResneXt2x2(TestDistBase): @skip_ci def test_dist_train(self): - self.check_with_place("dist_se_resnext.py", delta=1e-7) + self.check_with_place( + "dist_se_resnext.py", + delta=1e-7, + check_error_log=True, + log_name=flag_name) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext_sync_with_memopt.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext_sync_with_memopt.py deleted file mode 100644 index e39e07a58e8..00000000000 --- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext_sync_with_memopt.py +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function -import unittest -from test_dist_base import TestDistBase -import os - - -def skip_ci(func): - on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0'))) - - def __func__(*args, **kwargs): - if on_ci: - return - return func(*args, **kwargs) - - return __func__ - - -class TestDistseResnXt2x2WithMemopt(TestDistBase): - def _setup_config(self): - self._sync_mode = True - self._mem_opt = True - self._use_reader_alloc = False - - @skip_ci - def test_dist_train(self): - self.check_with_place("dist_se_resnext.py", delta=1e-7) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py b/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py index 821974914b1..a872b5ce4db 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py +++ b/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py @@ -18,6 +18,9 @@ import unittest from test_dist_base import TestDistBase +import os +flag_name = os.path.splitext(__file__)[0] + class TestDistSimnetBowDense2x2(TestDistBase): def _setup_config(self): @@ -34,7 +37,8 @@ class TestDistSimnetBowDense2x2(TestDistBase): "dist_simnet_bow.py", delta=1e-5, check_error_log=True, - need_envs=need_envs) + need_envs=need_envs, + log_name=flag_name) class TestDistSimnetBow2x2DenseAsync(TestDistBase): @@ -52,8 +56,9 @@ class TestDistSimnetBow2x2DenseAsync(TestDistBase): self.check_with_place( "dist_simnet_bow.py", delta=100, - check_error_log=False, - need_envs=need_envs) + check_error_log=True, + need_envs=need_envs, + log_name=flag_name) class TestDistSimnetBowSparse2x2(TestDistBase): @@ -70,8 +75,9 @@ class TestDistSimnetBowSparse2x2(TestDistBase): self.check_with_place( "dist_simnet_bow.py", delta=1e-5, - check_error_log=False, - need_envs=need_envs) + check_error_log=True, + need_envs=need_envs, + log_name=flag_name) class TestDistSimnetBow2x2SparseAsync(TestDistBase): @@ -88,8 +94,9 @@ class TestDistSimnetBow2x2SparseAsync(TestDistBase): self.check_with_place( "dist_simnet_bow.py", delta=100, - check_error_log=False, - need_envs=need_envs) + check_error_log=True, + need_envs=need_envs, + log_name=flag_name) # FIXME(tangwei): Learningrate variable is not created on pserver. @@ -108,7 +115,8 @@ class TestDistSimnetBow2x2LookupTableSync(TestDistBase): "dist_simnet_bow.py", delta=1e-5, check_error_log=True, - need_envs=need_envs) + need_envs=need_envs, + log_name=flag_name) class TestDistSimnetBow2x2LookupTableAsync(TestDistBase): @@ -125,8 +133,9 @@ class TestDistSimnetBow2x2LookupTableAsync(TestDistBase): self.check_with_place( "dist_simnet_bow.py", delta=100, - check_error_log=False, - need_envs=need_envs) + check_error_log=True, + need_envs=need_envs, + log_name=flag_name) class TestDistSimnetBow2x2LookupTableNotContainLRSync(TestDistBase): @@ -143,8 +152,9 @@ class TestDistSimnetBow2x2LookupTableNotContainLRSync(TestDistBase): self.check_with_place( "dist_simnet_bow.py", delta=1e-5, - check_error_log=False, - need_envs=need_envs) + check_error_log=True, + need_envs=need_envs, + log_name=flag_name) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_dist_text_classification.py b/python/paddle/fluid/tests/unittests/test_dist_text_classification.py index 0c1680359e2..d49ea3372e5 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_text_classification.py +++ b/python/paddle/fluid/tests/unittests/test_dist_text_classification.py @@ -17,6 +17,9 @@ import os import unittest from test_dist_base import TestDistBase +import os +flag_name = os.path.splitext(__file__)[0] + class TestDistTextClassification2x2(TestDistBase): def _setup_config(self): @@ -24,7 +27,11 @@ class TestDistTextClassification2x2(TestDistBase): self._enforce_place = "CPU" def test_text_classification(self): - self.check_with_place("dist_text_classification.py", delta=1e-6) + self.check_with_place( + "dist_text_classification.py", + delta=1e-6, + check_error_log=True, + log_name=flag_name) class TestDistTextClassification2x2Async(TestDistBase): @@ -33,7 +40,11 @@ class TestDistTextClassification2x2Async(TestDistBase): self._enforce_place = "CPU" def test_se_resnext(self): - self.check_with_place("dist_text_classification.py", delta=100) + self.check_with_place( + "dist_text_classification.py", + delta=100, + check_error_log=True, + log_name=flag_name) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py index 4553cb0ffd7..9385d42c559 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py +++ b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py @@ -16,6 +16,9 @@ from __future__ import print_function import unittest from test_dist_base import TestDistBase +import os +flag_name = os.path.splitext(__file__)[0] + class TestDistW2V2x2(TestDistBase): def _setup_config(self): @@ -23,7 +26,11 @@ class TestDistW2V2x2(TestDistBase): self._enforce_place = "CPU" def test_dist_train(self): - self.check_with_place("dist_word2vec.py", delta=1e-4) + self.check_with_place( + "dist_word2vec.py", + delta=1e-4, + check_error_log=True, + log_name=flag_name) class TestDistW2V2x2WithMemOpt(TestDistBase): @@ -33,7 +40,11 @@ class TestDistW2V2x2WithMemOpt(TestDistBase): self._enforce_place = "CPU" def test_dist_train(self): - self.check_with_place("dist_word2vec.py", delta=1e-4) + self.check_with_place( + "dist_word2vec.py", + delta=1e-4, + check_error_log=True, + log_name=flag_name) class TestDistW2V2x2Async(TestDistBase): @@ -42,7 +53,11 @@ class TestDistW2V2x2Async(TestDistBase): self._enforce_place = "CPU" def test_dist_train(self): - self.check_with_place("dist_word2vec.py", delta=100) + self.check_with_place( + "dist_word2vec.py", + delta=100, + check_error_log=True, + log_name=flag_name) if __name__ == "__main__": -- GitLab