未验证 提交 4f0aa7ef 编写于 作者: S shangliang Xu 提交者: GitHub

[benchmark] fix nan in training (#4345)

上级 8474ac99
......@@ -2,6 +2,7 @@
```
├── benchmark
│ ├── analysis_log.py
│ ├── prepare.sh
│ ├── README.md
│ ├── run_all.sh
......@@ -18,7 +19,7 @@
单模型运行脚本,可完成指定模型的测试方案
## Docker 运行环境
* docker image: paddlepaddle/paddle:latest-gpu-cuda10.1-cudnn7
* docker image: registry.baidubce.com/paddlepaddle/paddle:2.1.2-gpu-cuda10.2-cudnn7
* paddle = 2.1.2
* python = 3.7
......
from __future__ import print_function
import argparse
import json
import os
import sys
def parse_args():
    """Build and run the CLI parser for the benchmark-log analysis tool.

    Returns:
        argparse.Namespace: parsed command-line options.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    # One (flags, kwargs) entry per option keeps the registration uniform
    # and easy to scan; strings/defaults are identical to the CLI contract.
    option_specs = [
        (["--filename"],
         dict(type=str, help="The name of log which need to analysis.")),
        (["--jsonname"],
         dict(type=str, help="The name of dumped json where to output.")),
        (["--keyword"],
         dict(type=str, default="ips:", help="Keyword to specify analysis data")),
        (["--model_name"],
         dict(type=str, default="faster_rcnn",
              help='training model_name, transformer_base')),
        (["--mission_name"],
         dict(type=str, default="目标检测", help='training mission name')),
        (["--direction_id"],
         dict(type=int, default=0, help='training direction_id')),
        (["--run_mode"],
         dict(type=str, default="sp", help='multi process or single process')),
        (["--index"],
         dict(type=int, default=1,
              help='{1: speed, 2:mem, 3:profiler, 6:max_batch_size}')),
        (["--gpu_num"],
         dict(type=int, default=1, help='nums of training gpus')),
        (["--batch_size"],
         dict(type=int, default=1, help='batch size of training samples')),
    ]
    for flags, kwargs in option_specs:
        parser.add_argument(*flags, **kwargs)
    return parser.parse_args()
def parse_text_from_file(file_path: str):
    """Read *file_path* and return its content as a list of lines.

    Trailing newline characters are stripped by ``str.splitlines``.
    """
    with open(file_path, "r") as log_file:
        content = log_file.read()
    return content.splitlines()
def parse_avg_from_text(text: list, keyword: str, skip_line=4):
    """Average the numeric token that directly follows *keyword*.

    Scans every line of *text*; on each line containing *keyword*, the
    space-separated token right after the keyword token is parsed as a
    float. The first *skip_line* matches are discarded (warm-up steps).

    Returns:
        float: mean of the remaining samples, or 0.0 if none remain.
    """
    samples = []
    for line in text:
        if keyword not in line:
            continue
        tokens = line.split(" ")
        for idx, token in enumerate(tokens):
            if token == keyword:
                samples.append(float(tokens[idx + 1]))
                break
    # Drop warm-up measurements before averaging.
    samples = samples[skip_line:]
    if not samples:
        return 0.0
    return sum(samples) / len(samples)
if __name__ == '__main__':
    # Entry point: parse the training log, compute the average throughput,
    # and dump a JSON summary for the benchmark collector.
    args = parse_args()
    run_info = dict()
    run_info["log_file"] = args.filename
    res_log_file = args.jsonname
    run_info["model_name"] = args.model_name
    run_info["mission_name"] = args.mission_name
    run_info["direction_id"] = args.direction_id
    run_info["run_mode"] = args.run_mode
    run_info["index"] = args.index
    run_info["gpu_num"] = args.gpu_num
    run_info["FINAL_RESULT"] = 0
    run_info["JOB_FAIL_FLAG"] = 0

    text = parse_text_from_file(args.filename)
    avg_ips = parse_avg_from_text(text, args.keyword)
    # Total throughput scales with the GPU count in data-parallel runs.
    run_info["FINAL_RESULT"] = avg_ips * args.gpu_num
    if avg_ips == 0.0:
        # No usable samples were found in the log: flag the job as failed.
        # BUG FIX: the script previously called sys.exit() here BEFORE the
        # JSON dump below, so JOB_FAIL_FLAG=1 was never persisted and no
        # result file was produced on failure. Now the summary (including
        # the failure flag) is always written.
        run_info["JOB_FAIL_FLAG"] = 1
        print("Failed at get info from training's output log, please check.")
    json_info = json.dumps(run_info)
    with open(res_log_file, "w") as of:
        of.write(json_info)
......@@ -20,12 +20,18 @@ function _train(){
echo "Train on ${num_gpu_devices} GPUs"
echo "current CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=$num_gpu_devices, batch_size=$batch_size"
# set runtime params
set_optimizer_lr_sp=" "
set_optimizer_lr_mp=" "
# parse model_name
case ${model_name} in
faster_rcnn) model_yml="configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.yml" ;;
fcos) model_yml="configs/fcos/fcos_r50_fpn_1x_coco.yml" ;;
faster_rcnn) model_yml="configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.yml"
set_optimizer_lr_sp="LearningRate.base_lr=0.001" ;;
fcos) model_yml="configs/fcos/fcos_r50_fpn_1x_coco.yml"
set_optimizer_lr_sp="LearningRate.base_lr=0.001" ;;
deformable_detr) model_yml="configs/deformable_detr/deformable_detr_r50_1x_coco.yml" ;;
gfl) model_yml="configs/gfl/gfl_r50_fpn_1x_coco.yml" ;;
gfl) model_yml="configs/gfl/gfl_r50_fpn_1x_coco.yml"
set_optimizer_lr_sp="LearningRate.base_lr=0.001" ;;
hrnet) model_yml="configs/keypoint/hrnet/hrnet_w32_256x192.yml" ;;
higherhrnet) model_yml="configs/keypoint/higherhrnet/higherhrnet_hrnet_w32_512.yml" ;;
solov2) model_yml="configs/solov2/solov2_r50_fpn_1x_coco.yml" ;;
......@@ -45,10 +51,10 @@ function _train(){
case ${run_mode} in
sp) train_cmd="${python} -u tools/train.py -c ${model_yml} ${set_fp_item} \
-o ${set_batch_size} ${set_max_epoch} ${set_log_iter}" ;;
-o ${set_batch_size} ${set_max_epoch} ${set_log_iter} ${set_optimizer_lr_sp}" ;;
mp) train_cmd="${python} -m paddle.distributed.launch --log_dir=./mylog \
--gpus=${CUDA_VISIBLE_DEVICES} tools/train.py -c ${model_yml} ${set_fp_item} \
-o ${set_batch_size} ${set_max_epoch} ${set_log_iter}"
-o ${set_batch_size} ${set_max_epoch} ${set_log_iter} ${set_optimizer_lr_mp}"
log_parse_file="mylog/workerlog.0" ;;
*) echo "choose run_mode(sp or mp)"; exit 1;
esac
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册