未验证 提交 4f0aa7ef 编写于 作者: S shangliang Xu 提交者: GitHub

[benchmark] fix nan in training (#4345)

上级 8474ac99
......@@ -2,6 +2,7 @@
```
├── benchmark
│ ├── analysis_log.py
│ ├── prepare.sh
│ ├── README.md
│ ├── run_all.sh
......@@ -18,7 +19,7 @@
单模型运行脚本,可完成指定模型的测试方案
## Docker 运行环境
* docker image: paddlepaddle/paddle:latest-gpu-cuda10.1-cudnn7
* docker image: registry.baidubce.com/paddlepaddle/paddle:2.1.2-gpu-cuda10.2-cudnn7
* paddle = 2.1.2
* python = 3.7
......
from __future__ import print_function
import argparse
import json
import os
import sys
def parse_args():
    """Build and run the CLI parser for the benchmark-log analysis tool.

    Returns:
        argparse.Namespace: parsed command-line options.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    # One (flags, kwargs) entry per option keeps the registration uniform
    # and easy to scan; strings/defaults are identical to the CLI contract.
    option_specs = [
        (["--filename"],
         dict(type=str, help="The name of log which need to analysis.")),
        (["--jsonname"],
         dict(type=str, help="The name of dumped json where to output.")),
        (["--keyword"],
         dict(type=str, default="ips:", help="Keyword to specify analysis data")),
        (["--model_name"],
         dict(type=str, default="faster_rcnn",
              help='training model_name, transformer_base')),
        (["--mission_name"],
         dict(type=str, default="目标检测", help='training mission name')),
        (["--direction_id"],
         dict(type=int, default=0, help='training direction_id')),
        (["--run_mode"],
         dict(type=str, default="sp", help='multi process or single process')),
        (["--index"],
         dict(type=int, default=1,
              help='{1: speed, 2:mem, 3:profiler, 6:max_batch_size}')),
        (["--gpu_num"],
         dict(type=int, default=1, help='nums of training gpus')),
        (["--batch_size"],
         dict(type=int, default=1, help='batch size of training samples')),
    ]
    for flags, kwargs in option_specs:
        parser.add_argument(*flags, **kwargs)
    return parser.parse_args()
def parse_text_from_file(file_path: str):
    """Read *file_path* and return its content as a list of lines.

    Trailing newline characters are stripped by ``str.splitlines``.
    """
    with open(file_path, "r") as log_file:
        content = log_file.read()
    return content.splitlines()
def parse_avg_from_text(text: list, keyword: str, skip_line=4):
    """Average the numeric token that directly follows *keyword*.

    Scans every line of *text*; on each line containing *keyword*, the
    space-separated token right after the keyword token is parsed as a
    float. The first *skip_line* matches are discarded (warm-up steps).

    Returns:
        float: mean of the remaining samples, or 0.0 if none remain.
    """
    samples = []
    for line in text:
        if keyword not in line:
            continue
        tokens = line.split(" ")
        for idx, token in enumerate(tokens):
            if token == keyword:
                samples.append(float(tokens[idx + 1]))
                break
    # Drop warm-up measurements before averaging.
    samples = samples[skip_line:]
    if not samples:
        return 0.0
    return sum(samples) / len(samples)
if __name__ == '__main__':
    # Entry point: parse the training log, compute the average throughput,
    # and dump a JSON summary for the benchmark collector.
    args = parse_args()
    run_info = dict()
    run_info["log_file"] = args.filename
    res_log_file = args.jsonname
    run_info["model_name"] = args.model_name
    run_info["mission_name"] = args.mission_name
    run_info["direction_id"] = args.direction_id
    run_info["run_mode"] = args.run_mode
    run_info["index"] = args.index
    run_info["gpu_num"] = args.gpu_num
    run_info["FINAL_RESULT"] = 0
    run_info["JOB_FAIL_FLAG"] = 0

    text = parse_text_from_file(args.filename)
    avg_ips = parse_avg_from_text(text, args.keyword)
    # Total throughput scales with the GPU count in data-parallel runs.
    run_info["FINAL_RESULT"] = avg_ips * args.gpu_num
    if avg_ips == 0.0:
        # No usable samples were found in the log: flag the job as failed.
        # BUG FIX: the script previously called sys.exit() here BEFORE the
        # JSON dump below, so JOB_FAIL_FLAG=1 was never persisted and no
        # result file was produced on failure. Now the summary (including
        # the failure flag) is always written.
        run_info["JOB_FAIL_FLAG"] = 1
        print("Failed at get info from training's output log, please check.")
    json_info = json.dumps(run_info)
    with open(res_log_file, "w") as of:
        of.write(json_info)
......@@ -20,12 +20,18 @@ function _train(){
echo "Train on ${num_gpu_devices} GPUs"
echo "current CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=$num_gpu_devices, batch_size=$batch_size"
# set runtime params
set_optimizer_lr_sp=" "
set_optimizer_lr_mp=" "
# parse model_name
case ${model_name} in
faster_rcnn) model_yml="configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.yml" ;;
fcos) model_yml="configs/fcos/fcos_r50_fpn_1x_coco.yml" ;;
faster_rcnn) model_yml="configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.yml"
set_optimizer_lr_sp="LearningRate.base_lr=0.001" ;;
fcos) model_yml="configs/fcos/fcos_r50_fpn_1x_coco.yml"
set_optimizer_lr_sp="LearningRate.base_lr=0.001" ;;
deformable_detr) model_yml="configs/deformable_detr/deformable_detr_r50_1x_coco.yml" ;;
gfl) model_yml="configs/gfl/gfl_r50_fpn_1x_coco.yml" ;;
gfl) model_yml="configs/gfl/gfl_r50_fpn_1x_coco.yml"
set_optimizer_lr_sp="LearningRate.base_lr=0.001" ;;
hrnet) model_yml="configs/keypoint/hrnet/hrnet_w32_256x192.yml" ;;
higherhrnet) model_yml="configs/keypoint/higherhrnet/higherhrnet_hrnet_w32_512.yml" ;;
solov2) model_yml="configs/solov2/solov2_r50_fpn_1x_coco.yml" ;;
......@@ -45,10 +51,10 @@ function _train(){
case ${run_mode} in
sp) train_cmd="${python} -u tools/train.py -c ${model_yml} ${set_fp_item} \
-o ${set_batch_size} ${set_max_epoch} ${set_log_iter}" ;;
-o ${set_batch_size} ${set_max_epoch} ${set_log_iter} ${set_optimizer_lr_sp}" ;;
mp) train_cmd="${python} -m paddle.distributed.launch --log_dir=./mylog \
--gpus=${CUDA_VISIBLE_DEVICES} tools/train.py -c ${model_yml} ${set_fp_item} \
-o ${set_batch_size} ${set_max_epoch} ${set_log_iter}"
-o ${set_batch_size} ${set_max_epoch} ${set_log_iter} ${set_optimizer_lr_mp}"
log_parse_file="mylog/workerlog.0" ;;
*) echo "choose run_mode(sp or mp)"; exit 1;
esac
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册