[NPU] fix NPU ci scripts, test=develop (#35095)

a332352a · Qi Li · GitHub · de645153 · a332352a · a332352a
隐藏空白更改
内联并排

Showing with 66 additions and 17 deletions

paddle/scripts/paddle_build.sh paddle/scripts/paddle_build.sh +42 -17

tools/coverage/paddle_coverage.sh tools/coverage/paddle_coverage.sh +24 -0

未找到文件。
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -1059,7 +1059,7 @@ function get_quickly_disable_ut() {
 function card_test() {
    set -m
-    CTEST_PARALLEL_LEVEL=2
    case_count $1 $2
    ut_startTime_s=`date +%s` 
@@ -1725,16 +1725,22 @@ set +x
                single_card_tests="$single_card_tests|^$testcase$"
            fi
        done <<< "$test_cases";
-        card_test "$single_card_tests" 1
+        ut_actual_total_startTime_s=`date +%s`
+        card_test "$single_card_tests" 1 # run cases 1 job each time with single GPU
        collect_failed_tests
        # add unit test retry for NPU
        rm -f $tmp_dir/*
        exec_times=0
        retry_unittests_record=''
-        retry_time=3
+        retry_time=4
-        exec_time_array=('first' 'second' 'third')
+        exec_time_array=('first' 'second' 'third' 'fourth')
+        parallel_failed_tests_exec_retry_threshold=80
        exec_retry_threshold=10
        is_retry_execuate=0
+        rerun_ut_startTime_s=`date +%s`
        if [ -n "$failed_test_lists" ];then
            if [ ${TIMEOUT_DEBUG_HELP:-OFF} == "ON" ];then
                bash $PADDLE_ROOT/tools/timeout_debug_help.sh "$failed_test_lists"    # cat logs for tiemout uts which killed by ctest
@@ -1743,14 +1749,30 @@ set +x
            need_retry_ut_arr=(${need_retry_ut_str})
            need_retry_ut_count=${#need_retry_ut_arr[@]}
            read retry_unittests <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' )
-            if [ $need_retry_ut_count -lt $exec_retry_threshold ];then
+            while ( [ $exec_times -lt $retry_time ] )
-                while ( [ $exec_times -lt $retry_time ] )
+                do
-                    do
+                    if [[ "${exec_times}" == "0" ]] ;then
+                        if [ $need_retry_ut_count -lt $parallel_failed_tests_exec_retry_threshold ];then
+                            is_retry_execuate=0
+                        else
+                            is_retry_execuate=1
+                        fi
+                    elif [[ "${exec_times}" == "1" ]] ;then
+                        read need_retry_ut_str <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' )
+                        need_retry_ut_arr=(${need_retry_ut_str})
+                        need_retry_ut_count=${#need_retry_ut_arr[@]} 
+                        if [ $need_retry_ut_count -lt $exec_retry_threshold ];then
+                            is_retry_execuate=0
+                        else
+                            is_retry_execuate=1
+                        fi
+                    fi
+                    if [[ "$is_retry_execuate" == "0" ]];then
                        set +e
                        retry_unittests_record="$retry_unittests_record$failed_test_lists"
                        failed_test_lists_ult=`echo "${failed_test_lists}" |grep -Po '[^ ].*$'`
                        set -e
-                        if [[ "${exec_times}" == "1" ]];then
+                        if [[ "${exec_times}" == "1" ]] || [[ "${exec_times}" == "3" ]];then
                            if [[ "${failed_test_lists}" == "" ]];then
                                break
                            else
@@ -1761,11 +1783,11 @@ set +x
                        echo "This is the ${exec_time_array[$exec_times]} time to re-run"
                        echo "========================================="
                        echo "The following unittest will be re-run:"
-                        echo "${retry_unittests}"
+                        echo "${retry_unittests}"                    
                        for line in ${retry_unittests[@]} ;
                            do
                                read tmp_one_tmp <<< "$( echo $single_card_tests | grep -oEi $line )"
                                if [[ "$tmp_one_tmp" != ""  ]]; then
                                    if [[ "$one_card_retry" == "" ]]; then
                                        one_card_retry="^$line$"
@@ -1773,23 +1795,26 @@ set +x
                                        one_card_retry="$one_card_retry|^$line$"
                                    fi
                                fi
                            done
                        if [[ "$one_card_retry" != "" ]]; then
-                            card_test "$one_card_retry" 1
+                            card_test "$one_card_retry" 1 # run cases 1 job each time with single GPU
                        fi
                        exec_times=$[$exec_times+1]
                        failed_test_lists=''
                        collect_failed_tests
                        rm -f $tmp_dir/*
                        one_card_retry=''
-                    done
+                    fi 
-            else 
+                done
-                # There are more than 10 failed unit tests, so no unit test retry
-                is_retry_execuate=1
-            fi
        fi
+        rerun_ut_endTime_s=`date +%s`
+        echo "ipipe_log_param_Rerun_TestCases_Total_Time: $[ $rerun_ut_endTime_s - $rerun_ut_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt
+        ut_actual_total_endTime_s=`date +%s`
+        echo "ipipe_log_param_actual_TestCases_Total_Time: $[ $ut_actual_total_endTime_s - $ut_actual_total_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt
        if [[ "$EXIT_CODE" != "0" ]]; then
            show_ut_retry_result
        fi

--- a/tools/coverage/paddle_coverage.sh
+++ b/tools/coverage/paddle_coverage.sh
@@ -85,8 +85,30 @@ function gen_full_html_report_xpu() {
    mv -f coverage-full.tmp coverage-full.info
 }
+function gen_full_html_report_npu() {
+    lcov --extract coverage.info \
+        '/paddle/paddle/fluid/operators/*npu*' \
+        -o coverage-full.tmp \
+        --rc lcov_branch_coverage=0
+    mv -f coverage-full.tmp coverage-full.info
+    lcov --remove coverage-full.info \
+        '/paddle/paddle/fluid/framework/*_test*' \
+        '/paddle/paddle/fluid/*/*test*' \
+        '/paddle/paddle/fluid/*/*/*test*' \
+        '/paddle/paddle/fluid/inference/tests/*' \
+        '/paddle/paddle/fluid/inference/api/demo_ci/*' \
+        -o coverage-full.tmp \
+        --rc lcov_branch_coverage=0
+    mv -f coverage-full.tmp coverage-full.info
+}
 if [ ${WITH_XPU:-OFF} == "ON" ]; then
    gen_full_html_report_xpu || true
+elif [ ${WITH_ASCEND_CL:-OFF} == "ON" ]; then
+    gen_full_html_report_npu || true
 else
    gen_full_html_report || true
 fi
@@ -183,6 +205,8 @@ echo "Assert Python Diff Coverage"
 if [ ${WITH_XPU:-OFF} == "ON" ]; then
    echo "XPU has no python coverage!"
+elif [ ${WITH_ASCEND_CL:-OFF} == "ON" ]; then
+    echo "NPU has no python coverage!"
 else
    if [[ "${NO_PYTHON_COVERAGE_DATA}" != "1" ]];then
        python3.7 ${PADDLE_ROOT}/tools/coverage/coverage_lines.py python-coverage-diff.info 0.9 || PYTHON_COVERAGE_LINES_ASSERT=1