From a332352ac071c8956912ed0f7f458b97346d1ef3 Mon Sep 17 00:00:00 2001
From: Qi Li
Date: Tue, 24 Aug 2021 17:06:57 +0800
Subject: [PATCH] [NPU] fix NPU ci scripts, test=develop (#35095)

---
 paddle/scripts/paddle_build.sh    | 59 ++++++++++++++++++++++---------
 tools/coverage/paddle_coverage.sh | 24 +++++++++++++
 2 files changed, 66 insertions(+), 17 deletions(-)

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 0ace9568d49..069d77fc36a 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -1059,7 +1059,7 @@ function get_quickly_disable_ut() {
 
 function card_test() {
     set -m
-    CTEST_PARALLEL_LEVEL=2
+
     case_count $1 $2
     ut_startTime_s=`date +%s`
 
@@ -1725,16 +1725,22 @@ set +x
                 single_card_tests="$single_card_tests|^$testcase$"
             fi
         done <<< "$test_cases";
-        card_test "$single_card_tests" 1
+
+        ut_actual_total_startTime_s=`date +%s`
+
+        card_test "$single_card_tests" 1 # run cases 1 job each time with single GPU
         collect_failed_tests
 
+        # add unit test retry for NPU
         rm -f $tmp_dir/*
         exec_times=0
         retry_unittests_record=''
-        retry_time=3
-        exec_time_array=('first' 'second' 'third')
+        retry_time=4
+        exec_time_array=('first' 'second' 'third' 'fourth')
+        parallel_failed_tests_exec_retry_threshold=80
         exec_retry_threshold=10
         is_retry_execuate=0
+        rerun_ut_startTime_s=`date +%s`
         if [ -n "$failed_test_lists" ];then
             if [ ${TIMEOUT_DEBUG_HELP:-OFF} == "ON" ];then
                 bash $PADDLE_ROOT/tools/timeout_debug_help.sh "$failed_test_lists" # cat logs for tiemout uts which killed by ctest
@@ -1743,14 +1749,30 @@ set +x
             need_retry_ut_arr=(${need_retry_ut_str})
             need_retry_ut_count=${#need_retry_ut_arr[@]}
             read retry_unittests <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' )
-            if [ $need_retry_ut_count -lt $exec_retry_threshold ];then
-                while ( [ $exec_times -lt $retry_time ] )
-                    do
+            while ( [ $exec_times -lt $retry_time ] )
+                do
+                    if [[ "${exec_times}" == "0" ]] ;then
+                        if [ $need_retry_ut_count -lt $parallel_failed_tests_exec_retry_threshold ];then
+                            is_retry_execuate=0
+                        else
+                            is_retry_execuate=1
+                        fi
+                    elif [[ "${exec_times}" == "1" ]] ;then
+                        read need_retry_ut_str <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' )
+                        need_retry_ut_arr=(${need_retry_ut_str})
+                        need_retry_ut_count=${#need_retry_ut_arr[@]}
+                        if [ $need_retry_ut_count -lt $exec_retry_threshold ];then
+                            is_retry_execuate=0
+                        else
+                            is_retry_execuate=1
+                        fi
+                    fi
+                    if [[ "$is_retry_execuate" == "0" ]];then
                         set +e
                         retry_unittests_record="$retry_unittests_record$failed_test_lists"
                         failed_test_lists_ult=`echo "${failed_test_lists}" |grep -Po '[^ ].*$'`
                         set -e
-                        if [[ "${exec_times}" == "1" ]];then
+                        if [[ "${exec_times}" == "1" ]] || [[ "${exec_times}" == "3" ]];then
                             if [[ "${failed_test_lists}" == "" ]];then
                                 break
                             else
@@ -1761,11 +1783,11 @@ set +x
                         echo "This is the ${exec_time_array[$exec_times]} time to re-run"
                         echo "========================================="
                         echo "The following unittest will be re-run:"
-                        echo "${retry_unittests}"
-
+                        echo "${retry_unittests}"
                         for line in ${retry_unittests[@]} ;
                             do
                                 read tmp_one_tmp <<< "$( echo $single_card_tests | grep -oEi $line )"
+
                                 if [[ "$tmp_one_tmp" != "" ]]; then
                                     if [[ "$one_card_retry" == "" ]]; then
                                         one_card_retry="^$line$"
@@ -1773,23 +1795,26 @@ set +x
                                         one_card_retry="$one_card_retry|^$line$"
                                     fi
                                 fi
+
                             done
 
                         if [[ "$one_card_retry" != "" ]]; then
-                            card_test "$one_card_retry" 1
+                            card_test "$one_card_retry" 1 # run cases 1 job each time with single GPU
                         fi
-
                         exec_times=$[$exec_times+1]
                         failed_test_lists=''
                         collect_failed_tests
                         rm -f $tmp_dir/*
                         one_card_retry=''
-                    done
-            else
-                # There are more than 10 failed unit tests, so no unit test retry
-                is_retry_execuate=1
-            fi
+                    fi
+                done
         fi
+
+        rerun_ut_endTime_s=`date +%s`
+
+        echo "ipipe_log_param_Rerun_TestCases_Total_Time: $[ $rerun_ut_endTime_s - $rerun_ut_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt
+        ut_actual_total_endTime_s=`date +%s`
+        echo "ipipe_log_param_actual_TestCases_Total_Time: $[ $ut_actual_total_endTime_s - $ut_actual_total_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt
         if [[ "$EXIT_CODE" != "0" ]]; then
             show_ut_retry_result
         fi
diff --git a/tools/coverage/paddle_coverage.sh b/tools/coverage/paddle_coverage.sh
index c89926ebf96..54d08967b7f 100644
--- a/tools/coverage/paddle_coverage.sh
+++ b/tools/coverage/paddle_coverage.sh
@@ -85,8 +85,30 @@ function gen_full_html_report_xpu() {
     mv -f coverage-full.tmp coverage-full.info
 }
 
+function gen_full_html_report_npu() {
+    lcov --extract coverage.info \
+        '/paddle/paddle/fluid/operators/*npu*' \
+        -o coverage-full.tmp \
+        --rc lcov_branch_coverage=0
+
+    mv -f coverage-full.tmp coverage-full.info
+
+    lcov --remove coverage-full.info \
+        '/paddle/paddle/fluid/framework/*_test*' \
+        '/paddle/paddle/fluid/*/*test*' \
+        '/paddle/paddle/fluid/*/*/*test*' \
+        '/paddle/paddle/fluid/inference/tests/*' \
+        '/paddle/paddle/fluid/inference/api/demo_ci/*' \
+        -o coverage-full.tmp \
+        --rc lcov_branch_coverage=0
+
+    mv -f coverage-full.tmp coverage-full.info
+}
+
 if [ ${WITH_XPU:-OFF} == "ON" ]; then
     gen_full_html_report_xpu || true
+elif [ ${WITH_ASCEND_CL:-OFF} == "ON" ]; then
+    gen_full_html_report_npu || true
 else
     gen_full_html_report || true
 fi
@@ -183,6 +205,8 @@ echo "Assert Python Diff Coverage"
 
 if [ ${WITH_XPU:-OFF} == "ON" ]; then
     echo "XPU has no python coverage!"
+elif [ ${WITH_ASCEND_CL:-OFF} == "ON" ]; then
+    echo "NPU has no python coverage!"
 else
     if [[ "${NO_PYTHON_COVERAGE_DATA}" != "1" ]];then
         python3.7 ${PADDLE_ROOT}/tools/coverage/coverage_lines.py python-coverage-diff.info 0.9 || PYTHON_COVERAGE_LINES_ASSERT=1
-- 
GitLab