未验证 提交 ed6624ab 编写于 作者: Z zhangchunle 提交者: GitHub

concurrent (#34908)

上级 35ef4180
......@@ -1059,6 +1059,7 @@ function get_quickly_disable_ut() {
function card_test() {
set -m
CTEST_PARALLEL_LEVEL=2
case_count $1 $2
ut_startTime_s=`date +%s`
......@@ -1127,10 +1128,8 @@ function card_test() {
ut_endTime_s=`date +%s`
if (( $2 == -1 )); then
echo "exclusive TestCases Total Time: $[ $ut_endTime_s - $ut_startTime_s ]s"
echo "ipipe_log_param_Exclusive_TestCases_Total_Time: $[ $ut_endTime_s - $ut_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt
else
echo "$2 card TestCases Total Time: $[ $ut_endTime_s - $ut_startTime_s ]s"
echo "ipipe_log_param_${2}_Cards_TestCases_Total_Time: $[ $ut_endTime_s - $ut_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt
fi
set +m
}
......@@ -1181,16 +1180,19 @@ set +x
EXIT_CODE=0;
test_cases=$(ctest -N -V) # get all test cases
# Note(zhouwei): Parallel job counts are relative to 'CTEST_PARALLEL_LEVEL', e.g. '4 jobs each time' means 4*CTEST_PARALLEL_LEVEL
single_card_tests_high_parallel='^job$' # cases list which would run the most job each time with single GPU
single_card_tests_two_parallel='^job$' # cases list which would run 2 job each time with single GPU
single_card_tests_non_parallel='^job$' # cases list which would run 1 job each time with single GPU
single_card_tests='^job$' # all cases list which would take single GPU
single_card_tests_high_parallel='^job$' # cases list which would run 24 job each time with single GPU
single_card_tests_secondary_high_parallel='^job$' # cases list which would run 15 job each time with single GPU
single_card_tests_third_high_parallel='^job$' # cases list which would run 12 job each time with single GPU
single_card_tests_medium_parallel='^job$' # cases list which would run 7 job each time with single GPU
single_card_tests_non_parallel='^job$' # cases list which would run 2 job each time with single GPU
single_card_tests='^job$' # all cases list which would take single GPU
multiple_card_tests_two_parallel='^job$' # cases list which would run 2 job each time with multiple GPUs, most cases would be two GPUs
multiple_card_tests_non_parallel='^job$' # cases list which would run 1 job each time with multiple GPUs, most cases would be two GPUs
multiple_card_tests_medium_parallel='^job$' # cases list which would run 4 job each time with multiple GPUs, most cases would be two GPUs
multiple_card_tests_non_parallel='^job$' # cases list which would run 2 job each time with multiple GPUs, most cases would be two GPUs
exclusive_tests_two_parallel='^job$' # cases list which would run 2 job exclusively(with all GPUs)
exclusive_tests_non_parallel='^job$' # cases list which would run 1 job exclusively(with all GPUs)
exclusive_tests_high_parallel='^job$' # cases list which would run 5 job exclusively(with all GPUs)
exclusive_tests_medium_parallel='^job$' # cases list which would run 3 job exclusively(with all GPUs)
exclusive_tests_non_parallel='^job$' # cases list which would run 2 job exclusively(with all GPUs)
is_exclusive='' # indicate whether the case is exclusive type
is_multicard='' # indicate whether the case is multiple GPUs type
......@@ -1200,9 +1202,11 @@ set +x
UT_list=$(ctest -N | awk -F ': ' '{print $2}' | sed '/^$/d' | sed '$d')
output=$(python ${PADDLE_ROOT}/tools/parallel_UT_rule.py "${UT_list}")
cpu_parallel_job=$(echo $output | cut -d ";" -f 1)
tetrad_parallel_job=$(echo $output | cut -d ";" -f 2)
two_parallel_job=$(echo $output | cut -d ";" -f 3)
non_parallel_job=$(echo $output | cut -d ";" -f 4)
secondary_cpu_parallel_job=$(echo $output | cut -d ";" -f 2)
third_cpu_parallel_job=$(echo $output | cut -d ";" -f 3)
tetrad_parallel_job=$(echo $output | cut -d ";" -f 4)
two_parallel_job=$(echo $output | cut -d ";" -f 5)
non_parallel_job=$(echo $output | cut -d ";" -f 6)
while read -r line; do
if [[ "$line" == "" ]]; then
continue
......@@ -1244,22 +1248,28 @@ set +x
fi
if [[ "$is_exclusive" != "" ]]; then
if [[ $(echo $cpu_parallel_job$tetrad_parallel_job$two_parallel_job | grep -o "\^$testcase\\$") != "" ]]; then
exclusive_tests_two_parallel="$exclusive_tests_two_parallel|^$testcase$"
if [[ $(echo $cpu_parallel_job | grep -o "\^$testcase\\$") != "" ]]; then
exclusive_tests_high_parallel="$exclusive_tests_high_parallel|^$testcase$"
elif [[ $(echo $tetrad_parallel_job$two_parallel_job | grep -o "\^$testcase\\$") != "" ]]; then
exclusive_tests_medium_parallel="$exclusive_tests_medium_parallel|^$testcase$"
else
exclusive_tests_non_parallel="$exclusive_tests_non_parallel|^$testcase$"
fi
elif [[ "$is_multicard" != "" ]]; then
if [[ $(echo $cpu_parallel_job$tetrad_parallel_job$two_parallel_job | grep -o "\^$testcase\\$") != "" ]]; then
multiple_card_tests_two_parallel="$multiple_card_tests_two_parallel|^$testcase$"
if [[ $(echo $cpu_parallel_job$tetrad_parallel_job | grep -o "\^$testcase\\$") != "" ]]; then
multiple_card_tests_medium_parallel="$multiple_card_tests_medium_parallel|^$testcase$"
else
multiple_card_tests_non_parallel="$multiple_card_tests_non_parallel|^$testcase$"
fi
else
if [[ $(echo $cpu_parallel_job | grep -o "\^$testcase\\$") != "" ]]; then
single_card_tests_high_parallel="$single_card_tests_high_parallel|^$testcase$"
elif [[ $(echo $secondary_cpu_parallel_job | grep -o "\^$testcase\\$") != "" ]]; then
single_card_tests_secondary_high_parallel="$single_card_tests_secondary_high_parallel|^$testcase$"
elif [[ $(echo $third_cpu_parallel_job | grep -o "\^$testcase\\$") != "" ]]; then
single_card_tests_third_high_parallel="$single_card_tests_third_high_parallel|^$testcase$"
elif [[ $(echo $tetrad_parallel_job$two_parallel_job | grep -o "\^$testcase\\$") != "" ]]; then
single_card_tests_two_parallel="$single_card_tests_two_parallel|^$testcase$"
single_card_tests_medium_parallel="$single_card_tests_medium_parallel|^$testcase$"
else
single_card_tests_non_parallel="$single_card_tests_non_parallel|^$testcase$"
fi
......@@ -1271,24 +1281,42 @@ set +x
matchstr=''
testcase=''
done <<< "$test_cases";
card_test "$single_card_tests_high_parallel" 1 6 # run cases the most each time with single GPU
card_test "$single_card_tests_two_parallel" 1 2 # run cases 2 job each time with single GPU
card_test "$single_card_tests_non_parallel" 1 # run cases 1 job each time with single GPU
card_test "$multiple_card_tests_two_parallel" 2 2 # run cases 2 job each time with two GPUs
card_test "$multiple_card_tests_non_parallel" 2 # run cases 1 job each time with two GPUs
card_test "$exclusive_tests_two_parallel" -1 2 # run cases exclusively, in this cases would be run with 2/4/8 GPUs
card_test "$exclusive_tests_non_parallel" -1 # run cases exclusively, in this cases would be run with 2/4/8 GPUs
ut_actual_total_startTime_s=`date +%s`
single_ut_startTime_s=`date +%s`
card_test "$single_card_tests_high_parallel" 1 24 # run cases 24 job each time with single GPU
card_test "$single_card_tests_secondary_high_parallel" 1 15 # run cases 15 job each time with single GPU
card_test "$single_card_tests_third_high_parallel" 1 12 # run cases 12 job each time with single GPU
card_test "$single_card_tests_medium_parallel" 1 7 # run cases 7 job each time with single GPU
card_test "$single_card_tests_non_parallel" 1 2 # run cases 2 job each time with single GPU
single_ut_endTime_s=`date +%s`
multi_ut_startTime_s=`date +%s`
card_test "$multiple_card_tests_medium_parallel" 2 4 # run cases 4 job each time with two GPUs
card_test "$multiple_card_tests_non_parallel" 2 2 # run cases 2 job each time with two GPUs
multi_ut_endTime_s=`date +%s`
exclu_ut_startTime_s=`date +%s`
card_test "$exclusive_tests_high_parallel" -1 5 # run cases exclusively; in this case they would be run with 2/4/8 GPUs
card_test "$exclusive_tests_medium_parallel" -1 3 # run cases exclusively; in this case they would be run with 2/4/8 GPUs
card_test "$exclusive_tests_non_parallel" -1 2 # run cases exclusively; in this case they would be run with 2/4/8 GPUs
exclu_ut_endTime_s=`date +%s`
echo "ipipe_log_param_1_TestCases_Total_Time: $[ $single_ut_endTime_s - $single_ut_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt
echo "ipipe_log_param_2_TestCases_Total_Time: $[ $multi_ut_endTime_s - $multi_ut_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt
echo "ipipe_log_param_Exclusive_TestCases_Total_Time: $[ $exclu_ut_endTime_s - $exclu_ut_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt
collect_failed_tests
rm -f $tmp_dir/*
exec_times=0
retry_unittests_record=''
retry_time=3
exec_time_array=('first' 'second' 'third')
retry_time=4
exec_time_array=('first' 'second' 'third' 'fourth')
parallel_failed_tests_exec_retry_threshold=80
exec_retry_threshold=10
is_retry_execuate=0
rerun_ut_startTime_s=`date +%s`
if [ -n "$failed_test_lists" ];then
if [ ${TIMEOUT_DEBUG_HELP:-OFF} == "ON" ];then
bash $PADDLE_ROOT/tools/timeout_debug_help.sh "$failed_test_lists" # cat logs for timeout uts which were killed by ctest
......@@ -1297,14 +1325,30 @@ set +x
need_retry_ut_arr=(${need_retry_ut_str})
need_retry_ut_count=${#need_retry_ut_arr[@]}
read retry_unittests <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' )
if [ $need_retry_ut_count -lt $exec_retry_threshold ];then
while ( [ $exec_times -lt $retry_time ] )
do
while ( [ $exec_times -lt $retry_time ] )
do
if [[ "${exec_times}" == "0" ]] ;then
if [ $need_retry_ut_count -lt $parallel_failed_tests_exec_retry_threshold ];then
is_retry_execuate=0
else
is_retry_execuate=1
fi
elif [[ "${exec_times}" == "1" ]] ;then
read need_retry_ut_str <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' )
need_retry_ut_arr=(${need_retry_ut_str})
need_retry_ut_count=${#need_retry_ut_arr[@]}
if [ $need_retry_ut_count -lt $exec_retry_threshold ];then
is_retry_execuate=0
else
is_retry_execuate=1
fi
fi
if [[ "$is_retry_execuate" == "0" ]];then
set +e
retry_unittests_record="$retry_unittests_record$failed_test_lists"
failed_test_lists_ult=`echo "${failed_test_lists}" |grep -Po '[^ ].*$'`
set -e
if [[ "${exec_times}" == "1" ]];then
if [[ "${exec_times}" == "1" ]] || [[ "${exec_times}" == "3" ]];then
if [[ "${failed_test_lists}" == "" ]];then
break
else
......@@ -1315,11 +1359,9 @@ set +x
echo "This is the ${exec_time_array[$exec_times]} time to re-run"
echo "========================================="
echo "The following unittest will be re-run:"
echo "${retry_unittests}"
echo "${retry_unittests}"
for line in ${retry_unittests[@]} ;
do
read tmp_one_tmp <<< "$( echo $single_card_tests | grep -oEi $line )"
read tmp_mul_tmp <<< "$( echo $multiple_card_tests | grep -oEi $line )"
read exclusive_tmp <<< "$( echo $exclusive_tests | grep -oEi $line )"
......@@ -1347,7 +1389,7 @@ set +x
done
if [[ "$one_card_retry" != "" ]]; then
card_test "$one_card_retry" 1
card_test "$one_card_retry" 1 4
fi
if [[ "$multiple_card_retry" != "" ]]; then
......@@ -1357,21 +1399,22 @@ set +x
if [[ "$exclusive_retry" != "" ]]; then
card_test "$exclusive_retry" -1
fi
exec_times=$[$exec_times+1]
failed_test_lists=''
collect_failed_tests
rm -f $tmp_dir/*
one_card_retry=''
multiple_card_retry=''
exclusive_retry=''
done
else
# There are more than 10 failed unit tests, so no unit test retry
is_retry_execuate=1
fi
exclusive_retry=''
fi
done
fi
rerun_ut_endTime_s=`date +%s`
echo "ipipe_log_param_Rerun_TestCases_Total_Time: $[ $rerun_ut_endTime_s - $rerun_ut_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt
ut_actual_total_endTime_s=`date +%s`
echo "ipipe_log_param_actual_TestCases_Total_Time: $[ $ut_actual_total_endTime_s - $ut_actual_total_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt
if [[ "$EXIT_CODE" != "0" ]]; then
show_ut_retry_result
fi
......@@ -1380,7 +1423,20 @@ set -ex
}
function show_ut_retry_result() {
if [[ "$is_retry_execuate" != "0" ]];then
if [ "$SYSTEM" == "Darwin" ]; then
exec_retry_threshold_count=10
else
exec_retry_threshold_count=80
fi
if [[ "$is_retry_execuate" != "0" ]] && [[ "${exec_times}" == "0" ]] ;then
failed_test_lists_ult=`echo "${failed_test_lists}" | grep -Po '[^ ].*$'`
echo "========================================="
echo "There are more than ${exec_retry_threshold_count} failed unit tests in parallel test, so no unit test retry!!!"
echo "========================================="
echo "The following tests FAILED: "
echo "${failed_test_lists_ult}"
exit 8;
elif [[ "$is_retry_execuate" != "0" ]] && [[ "${exec_times}" == "1" ]];then
failed_test_lists_ult=`echo "${failed_test_lists}" | grep -Po '[^ ].*$'`
echo "========================================="
echo "There are more than 10 failed unit tests, so no unit test retry!!!"
......
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册