未验证 提交 6d14659f 编写于 作者: W wuhuanzhou 提交者: GitHub

op benchmark ci auto retry (#30143)

上级 42a6442a
...@@ -121,7 +121,29 @@ def compare_benchmark_result(case_name, develop_result, pr_result, ...@@ -121,7 +121,29 @@ def compare_benchmark_result(case_name, develop_result, pr_result,
check_results["accuracy"].append(case_name) check_results["accuracy"].append(case_name)
def update_api_info_file(fail_case_list, api_info_file):
    """Rewrite *api_info_file* so it only lists the APIs whose speed check
    failed, allowing the benchmark to be re-run automatically on just those.

    Args:
        fail_case_list: case names of performance-check failures; each looks
            like "<api>_<index>", so the api name is the part before "_".
        api_info_file: path to the api info file consumed by the benchmark
            driver; every line is expected to start with "<api>,".
    """
    check_path_exists(api_info_file)
    # Reduce failing case names to their api prefix for fast membership tests.
    failed_apis = {name.split('_')[0] for name in fail_case_list}
    # Keep only the info lines belonging to a failing api.
    with open(api_info_file) as src:
        retained_lines = [row for row in src if row.split(',')[0] in failed_apis]
    # Overwrite the file in place with the filtered line set.
    with open(api_info_file, 'w') as dst:
        dst.writelines(retained_lines)
def summary_results(check_results, api_info_file):
    """Log every failed check and return the CI exit code.

    Args:
        check_results: dict with "speed" and "accuracy" lists of failed case
            names.
        api_info_file: optional path; when given and speed checks failed, the
            file is trimmed to the failing apis so the benchmark can retry.

    Returns:
        int: 8 when any check failed, otherwise 0.
    """
    failed_speed = check_results["speed"]
    failed_accuracy = check_results["accuracy"]
    for name in failed_speed:
        logging.error("Check speed result with case \"%s\" failed." % name)
    for name in failed_accuracy:
        logging.error("Check accuracy result with case \"%s\" failed." % name)
    # Shrink the api info file so a retry only re-runs the failing cases.
    if len(failed_speed) and api_info_file:
        update_api_info_file(failed_speed, api_info_file)
    # Non-zero exit code (8) signals the CI wrapper that a check failed.
    return 8 if failed_speed or failed_accuracy else 0
...@@ -155,6 +180,11 @@ if __name__ == "__main__": ...@@ -155,6 +180,11 @@ if __name__ == "__main__":
type=str, type=str,
required=True, required=True,
help="Specify the benchmark result directory of PR branch.") help="Specify the benchmark result directory of PR branch.")
parser.add_argument(
"--api_info_file",
type=str,
required=False,
help="Specify the api info to run benchmark test.")
args = parser.parse_args() args = parser.parse_args()
check_results = dict(accuracy=list(), speed=list()) check_results = dict(accuracy=list(), speed=list())
...@@ -172,4 +202,4 @@ if __name__ == "__main__": ...@@ -172,4 +202,4 @@ if __name__ == "__main__":
compare_benchmark_result(case_name, develop_result, pr_result, compare_benchmark_result(case_name, develop_result, pr_result,
check_results) check_results)
exit(summary_results(check_results)) exit(summary_results(check_results, args.api_info_file))
...@@ -208,15 +208,48 @@ function run_op_benchmark_test { ...@@ -208,15 +208,48 @@ function run_op_benchmark_test {
done done
} }
# Check the op benchmark result, retrying the speed test up to RETRY_TIMES
# times on failure. Relies on logs-develop/ and logs-test_pr/ produced by an
# earlier full benchmark run; only the PR-side speed test is re-executed, so
# paddle does not need to be recompiled between retries.
function check_op_benchmark_result {
    local api_info_file check_status_code
    # Allow the caller to override the retry budget; default to 3 retries.
    [ -z "${RETRY_TIMES}" ] && RETRY_TIMES=3
    # NOTE(review): assumes api_info.txt already exists in the current
    # directory from the initial benchmark run — confirm against the caller.
    api_info_file=$(pwd)/api_info.txt
    # Iteration 0 is the initial check; iterations 1..RETRY_TIMES are retries.
    for retry_time in $(seq 0 ${RETRY_TIMES})
    do
        if [ $retry_time -gt 0 ]; then
            # Re-run only the op benchmark speed test for the PR branch;
            # there is no need to recompile and install paddle.
            LOG "[INFO] retry ${retry_time} times ..."
            pushd benchmark/api > /dev/null
            bash deploy/main_control.sh tests_v2 \
                                        tests_v2/configs \
                                        $(pwd)/logs-test_pr \
                                        $VISIBLE_DEVICES \
                                        "gpu" \
                                        "speed" \
                                        ${api_info_file} \
                                        "paddle"
            popd > /dev/null
        fi
        # Compare develop vs PR logs; the script also rewrites api_info_file
        # down to the failing cases so the next retry only re-runs those.
        python ${PADDLE_ROOT}/tools/check_op_benchmark_result.py \
                --develop_logs_dir $(pwd)/logs-develop \
                --pr_logs_dir $(pwd)/logs-test_pr \
                --api_info_file ${api_info_file}
        check_status_code=$?
        # TODO(Avin0323): retry only if the performance check fails
        # (currently any non-zero status, including accuracy failure, retries).
        [ $check_status_code -eq 0 ] && break
    done
    # Propagate the last check's exit status to the caller.
    return $check_status_code
}
# diff benchmakr result and miss op # diff benchmakr result and miss op
function summary_problems { function summary_problems {
local op_name exit_code local op_name exit_code
exit_code=0 exit_code=0
if [ ${#BENCHMARK_OP_MAP[*]} -ne 0 ] if [ ${#BENCHMARK_OP_MAP[*]} -ne 0 ]
then then
python ${PADDLE_ROOT}/tools/check_op_benchmark_result.py \ check_op_benchmark_result
--develop_logs_dir $(pwd)/logs-develop \
--pr_logs_dir $(pwd)/logs-test_pr
exit_code=$? exit_code=$?
fi fi
for op_name in ${!CHANGE_OP_MAP[@]} for op_name in ${!CHANGE_OP_MAP[@]}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册