From 9d985ca145198c2f744ebc60863b603ecfff3f0b Mon Sep 17 00:00:00 2001 From: Qi Li Date: Thu, 29 Jul 2021 20:50:12 +0800 Subject: [PATCH] [NPU] add unit test retry for NPU UT, test=develop (#34443) --- paddle/scripts/paddle_build.sh | 70 +++++++++++++++++++++++++++++++--- 1 file changed, 65 insertions(+), 5 deletions(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 2dd6cbafac..0ef6b45c51 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -1627,7 +1627,6 @@ function parallel_test_base_npu() { EOF set +x - ut_startTime_s=`date +%s` test_cases=$(ctest -N -V) # get all test cases get_quickly_disable_ut||disable_ut_quickly='' # indicate whether the case was in quickly disable list while read -r line; do @@ -1643,12 +1642,73 @@ set +x done <<< "$test_cases"; card_test "$single_card_tests" 1 collect_failed_tests -set -x - ut_endTime_s=`date +%s` - echo "NPU testCase Time: $[ $ut_endTime_s - $ut_startTime_s ]s" + # add unit test retry for NPU + rm -f $tmp_dir/* + exec_times=0 + retry_unittests_record='' + retry_time=3 + exec_time_array=('first' 'second' 'third') + exec_retry_threshold=10 + is_retry_execuate=0 + if [ -n "$failed_test_lists" ];then + if [ ${TIMEOUT_DEBUG_HELP:-OFF} == "ON" ];then + bash $PADDLE_ROOT/tools/timeout_debug_help.sh "$failed_test_lists" # cat logs for tiemout uts which killed by ctest + fi + read need_retry_ut_str <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' ) + need_retry_ut_arr=(${need_retry_ut_str}) + need_retry_ut_count=${#need_retry_ut_arr[@]} + read retry_unittests <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' ) + if [ $need_retry_ut_count -lt $exec_retry_threshold ];then + while ( [ $exec_times -lt $retry_time ] ) + do + set +e + retry_unittests_record="$retry_unittests_record$failed_test_lists" + failed_test_lists_ult=`echo "${failed_test_lists}" |grep -Po '[^ ].*$'` + set -e + if [[ "${exec_times}" == "1" ]];then + if [[ "${failed_test_lists}" == "" ]];then + break + else + read retry_unittests <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' ) + fi + fi + echo "=========================================" + echo "This is the ${exec_time_array[$exec_times]} time to re-run" + echo "=========================================" + echo "The following unittest will be re-run:" + echo "${retry_unittests}" + + for line in ${retry_unittests[@]} ; + do + read tmp_one_tmp <<< "$( echo $single_card_tests | grep -oEi $line )" + if [[ "$tmp_one_tmp" != "" ]]; then + if [[ "$one_card_retry" == "" ]]; then + one_card_retry="^$line$" + else + one_card_retry="$one_card_retry|^$line$" + fi + fi + done + + if [[ "$one_card_retry" != "" ]]; then + card_test "$one_card_retry" 1 + fi + + exec_times=$[$exec_times+1] + failed_test_lists='' + collect_failed_tests + rm -f $tmp_dir/* + one_card_retry='' + done + else + # There are more than 10 failed unit tests, so no unit test retry + is_retry_execuate=1 + fi + fi if [[ "$EXIT_CODE" != "0" ]]; then - exit 8; + show_ut_retry_result fi +set -ex fi } -- GitLab