From 6d14659f9765af0c281b68cbeeead457c9562721 Mon Sep 17 00:00:00 2001
From: wuhuanzhou
Date: Mon, 11 Jan 2021 15:51:27 +0800
Subject: [PATCH] op benchmark ci auto retry (#30143)

---
 tools/check_op_benchmark_result.py | 34 ++++++++++++++++++++++++--
 tools/test_op_benchmark.sh         | 39 +++++++++++++++++++++++++++---
 2 files changed, 68 insertions(+), 5 deletions(-)

diff --git a/tools/check_op_benchmark_result.py b/tools/check_op_benchmark_result.py
index 7d6e1205bb..cef55f5ba0 100644
--- a/tools/check_op_benchmark_result.py
+++ b/tools/check_op_benchmark_result.py
@@ -121,7 +121,29 @@ def compare_benchmark_result(case_name, develop_result, pr_result,
             check_results["accuracy"].append(case_name)
 
 
-def summary_results(check_results):
+def update_api_info_file(fail_case_list, api_info_file):
+    """Update api info file to auto retry benchmark test.
+    """
+    check_path_exists(api_info_file)
+
+    # set of case names for performance check failures
+    fail_case_set = set(map(lambda x: x.split('_')[0], fail_case_list))
+
+    # list of api infos for performance check failures
+    api_info_list = list()
+    with open(api_info_file) as f:
+        for line in f:
+            case = line.split(',')[0]
+            if case in fail_case_set:
+                api_info_list.append(line)
+
+    # update api info file
+    with open(api_info_file, 'w') as f:
+        for api_info_line in api_info_list:
+            f.write(api_info_line)
+
+
+def summary_results(check_results, api_info_file):
     """Summary results and return exit code.
     """
     for case_name in check_results["speed"]:
@@ -131,6 +153,9 @@ def summary_results(check_results):
         logging.error("Check accuracy result with case \"%s\" failed."
                       % case_name)
 
+    if len(check_results["speed"]) and api_info_file:
+        update_api_info_file(check_results["speed"], api_info_file)
+
     if len(check_results["speed"]) or len(check_results["accuracy"]):
         return 8
     else:
@@ -155,6 +180,11 @@ if __name__ == "__main__":
         type=str,
         required=True,
         help="Specify the benchmark result directory of PR branch.")
+    parser.add_argument(
+        "--api_info_file",
+        type=str,
+        required=False,
+        help="Specify the api info to run benchmark test.")
     args = parser.parse_args()
 
     check_results = dict(accuracy=list(), speed=list())
@@ -172,4 +202,4 @@ if __name__ == "__main__":
         compare_benchmark_result(case_name, develop_result, pr_result,
                                  check_results)
 
-    exit(summary_results(check_results))
+    exit(summary_results(check_results, args.api_info_file))
diff --git a/tools/test_op_benchmark.sh b/tools/test_op_benchmark.sh
index 0932e37879..2789c0f702 100644
--- a/tools/test_op_benchmark.sh
+++ b/tools/test_op_benchmark.sh
@@ -208,15 +208,48 @@ function run_op_benchmark_test {
   done
 }
 
+# check benchmark result
+function check_op_benchmark_result {
+  local api_info_file check_status_code
+  # default 3 times
+  [ -z "${RETRY_TIMES}" ] && RETRY_TIMES=3
+  api_info_file=$(pwd)/api_info.txt
+
+  for retry_time in $(seq 0 ${RETRY_TIMES})
+  do
+    if [ $retry_time -gt 0 ]; then
+      # run op benchmark speed test
+      # there is no need to recompile and install paddle
+      LOG "[INFO] retry ${retry_time} times ..."
+      pushd benchmark/api > /dev/null
+      bash deploy/main_control.sh tests_v2 \
+                                  tests_v2/configs \
+                                  $(pwd)/logs-test_pr \
+                                  $VISIBLE_DEVICES \
+                                  "gpu" \
+                                  "speed" \
+                                  ${api_info_file} \
+                                  "paddle"
+      popd > /dev/null
+    fi
+    # check current result and update the file to benchmark test
+    python ${PADDLE_ROOT}/tools/check_op_benchmark_result.py \
+        --develop_logs_dir $(pwd)/logs-develop \
+        --pr_logs_dir $(pwd)/logs-test_pr \
+        --api_info_file ${api_info_file}
+    check_status_code=$?
+    # TODO(Avin0323): retry only if the performance check fails
+    [ $check_status_code -eq 0 ] && break
+  done
+  return $check_status_code
+}
+
 # diff benchmakr result and miss op
 function summary_problems {
   local op_name exit_code
   exit_code=0
   if [ ${#BENCHMARK_OP_MAP[*]} -ne 0 ]
   then
-    python ${PADDLE_ROOT}/tools/check_op_benchmark_result.py \
-      --develop_logs_dir $(pwd)/logs-develop \
-      --pr_logs_dir $(pwd)/logs-test_pr
+    check_op_benchmark_result
     exit_code=$?
   fi
   for op_name in ${!CHANGE_OP_MAP[@]}
-- 
GitLab