From 161d5bbec5b8430afdeffdae716fd5dba3d7a251 Mon Sep 17 00:00:00 2001 From: shangliang Xu Date: Fri, 4 Mar 2022 11:15:42 +0800 Subject: [PATCH] [TIPC] fix benchmark static shell, test=document_fix (#5293) --- .../N1C1/mask_rcnn_r50_1x_coco_bs2_fp32_SingleP_DP.sh | 4 ++-- .../N1C8/mask_rcnn_r50_1x_coco_bs2_fp32_MultiP_DP.sh | 2 +- .../mask_rcnn_r50_1x_coco/benchmark_common/prepare.sh | 11 +++++------ .../benchmark_common/run_benchmark.sh | 8 ++++---- .../mask_rcnn_r50_fpn_1x_coco_bs2_fp32_SingleP_DP.sh | 4 ++-- .../mask_rcnn_r50_fpn_1x_coco_bs2_fp32_MultiP_DP.sh | 2 +- .../benchmark_common/prepare.sh | 11 +++++------ .../benchmark_common/run_benchmark.sh | 8 ++++---- .../yolov3_darknet53_270e_coco_bs8_fp32_SingleP_DP.sh | 4 ++-- .../yolov3_darknet53_270e_coco_bs8_fp32_MultiP_DP.sh | 2 +- .../benchmark_common/prepare.sh | 11 +++++------ .../benchmark_common/run_benchmark.sh | 8 ++++---- 12 files changed, 36 insertions(+), 39 deletions(-) diff --git a/test_tipc/static/mask_rcnn_r50_1x_coco/N1C1/mask_rcnn_r50_1x_coco_bs2_fp32_SingleP_DP.sh b/test_tipc/static/mask_rcnn_r50_1x_coco/N1C1/mask_rcnn_r50_1x_coco_bs2_fp32_SingleP_DP.sh index 698b4f7c3..66cd2adbc 100644 --- a/test_tipc/static/mask_rcnn_r50_1x_coco/N1C1/mask_rcnn_r50_1x_coco_bs2_fp32_SingleP_DP.sh +++ b/test_tipc/static/mask_rcnn_r50_1x_coco/N1C1/mask_rcnn_r50_1x_coco_bs2_fp32_SingleP_DP.sh @@ -4,7 +4,7 @@ fp_item=fp32 run_process_type=SingleP run_mode=DP device_num=N1C1 -max_iter=500 +max_iter=100 num_workers=2 # get data @@ -14,4 +14,4 @@ bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_it # run profiling sleep 10; export PROFILING=true -bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1; +bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} 11 ${num_workers} 2>&1; diff --git a/test_tipc/static/mask_rcnn_r50_1x_coco/N1C8/mask_rcnn_r50_1x_coco_bs2_fp32_MultiP_DP.sh b/test_tipc/static/mask_rcnn_r50_1x_coco/N1C8/mask_rcnn_r50_1x_coco_bs2_fp32_MultiP_DP.sh index ee3a9ddd0..1dabcecc8 100644 --- a/test_tipc/static/mask_rcnn_r50_1x_coco/N1C8/mask_rcnn_r50_1x_coco_bs2_fp32_MultiP_DP.sh +++ b/test_tipc/static/mask_rcnn_r50_1x_coco/N1C8/mask_rcnn_r50_1x_coco_bs2_fp32_MultiP_DP.sh @@ -4,7 +4,7 @@ fp_item=fp32 run_process_type=MultiP run_mode=DP device_num=N1C8 -max_iter=500 +max_iter=100 num_workers=2 # get data diff --git a/test_tipc/static/mask_rcnn_r50_1x_coco/benchmark_common/prepare.sh b/test_tipc/static/mask_rcnn_r50_1x_coco/benchmark_common/prepare.sh index e5a7024d4..f2325a107 100644 --- a/test_tipc/static/mask_rcnn_r50_1x_coco/benchmark_common/prepare.sh +++ b/test_tipc/static/mask_rcnn_r50_1x_coco/benchmark_common/prepare.sh @@ -2,14 +2,13 @@ # 执行路径在模型库的根目录下 ################################# 安装框架 如: echo "*******prepare benchmark start ***********" -pip install -U pip -i https://pypi.tuna.tsinghua.edu.cn/simple +pip install -U pip echo `pip --version` -pip install Cython -i https://pypi.tuna.tsinghua.edu.cn/simple -python -m pip install paddlepaddle-gpu==2.2.2.post112 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html -pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple +pip install Cython +pip install -r requirements.txt ################################# 准备训练数据 如: -wget -nc -P static/data/coco/ 
https://paddledet.bj.bcebos.com/data/coco_benchmark.tar -cd ./static/data/coco/ && tar -xf coco_benchmark.tar && mv -u coco_benchmark/* . +wget -nc -P static/dataset/coco/ https://paddledet.bj.bcebos.com/data/coco_benchmark.tar +cd ./static/dataset/coco/ && tar -xf coco_benchmark.tar && mv -u coco_benchmark/* . rm -rf coco_benchmark/ && cd ../../../ echo "*******prepare benchmark end***********" diff --git a/test_tipc/static/mask_rcnn_r50_1x_coco/benchmark_common/run_benchmark.sh b/test_tipc/static/mask_rcnn_r50_1x_coco/benchmark_common/run_benchmark.sh index d8d5b44cb..57e685d53 100644 --- a/test_tipc/static/mask_rcnn_r50_1x_coco/benchmark_common/run_benchmark.sh +++ b/test_tipc/static/mask_rcnn_r50_1x_coco/benchmark_common/run_benchmark.sh @@ -14,7 +14,7 @@ function _set_params(){ skip_steps=10 # (必选)解析日志,跳过模型前几个性能不稳定的step keyword="ips:" # (必选)解析日志,筛选出性能数据所在行的关键字 convergence_key="loss:" # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:" - max_iter=${7:-"500"} # (可选)需保证模型执行时间在5分钟内,需要修改代码提前中断的直接提PR 合入套件;或使用max_epoch参数 + max_iter=${7:-"100"} # (可选)需保证模型执行时间在5分钟内,需要修改代码提前中断的直接提PR 合入套件;或使用max_epoch参数 num_workers=${8:-"8"} # (可选) # 以下为通用执行命令,无特殊可不用修改 model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_process_type}_${run_mode} # (必填) 且格式不要改动,与竞品名称对齐 @@ -48,9 +48,9 @@ function _train(){ fi train_cmd="-c configs/mask_rcnn_r50_1x.yml -o LearningRate.base_lr=0.001 snapshot_iter=100000 \ - TrainReader.batch_size==${batch_size} \ + TrainReader.batch_size=${batch_size} \ max_iters=${max_iter} log_iter=1 \ - TrainReader.worker_num==${num_workers} ${use_fp16_cmd} \ + TrainReader.worker_num=${num_workers} ${use_fp16_cmd} \ --is_profiler=${is_profiler} " # 以下为通用执行命令,无特殊可不用修改 case ${run_mode} in @@ -84,5 +84,5 @@ function _train(){ } source ${BENCHMARK_ROOT}/scripts/run_model.sh # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;如果不联调只想要产出训练log可以注掉本行,提交时需打开 _set_params $@ -_train # 如果只产出训练log,不解析,可取消注释 +#_train # 如果只产出训练log,不解析,可取消注释 _run # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只产出训练log可以注掉本行,提交时需打开 diff --git a/test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C1/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_SingleP_DP.sh b/test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C1/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_SingleP_DP.sh index 499468244..266a5a756 100644 --- a/test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C1/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_SingleP_DP.sh +++ b/test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C1/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_SingleP_DP.sh @@ -4,7 +4,7 @@ fp_item=fp32 run_process_type=SingleP run_mode=DP device_num=N1C1 -max_iter=500 +max_iter=100 num_workers=2 # get data @@ -14,4 +14,4 @@ bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_it # run profiling sleep 10; export PROFILING=true -bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1; +bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} 11 ${num_workers} 2>&1; diff --git a/test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C8/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_MultiP_DP.sh b/test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C8/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_MultiP_DP.sh index 7781efff8..b8000b8eb 100644 --- a/test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C8/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_MultiP_DP.sh +++ 
b/test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C8/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_MultiP_DP.sh @@ -4,7 +4,7 @@ fp_item=fp32 run_process_type=MultiP run_mode=DP device_num=N1C8 -max_iter=500 +max_iter=100 num_workers=2 # get data diff --git a/test_tipc/static/mask_rcnn_r50_fpn_1x_coco/benchmark_common/prepare.sh b/test_tipc/static/mask_rcnn_r50_fpn_1x_coco/benchmark_common/prepare.sh index e5a7024d4..f2325a107 100644 --- a/test_tipc/static/mask_rcnn_r50_fpn_1x_coco/benchmark_common/prepare.sh +++ b/test_tipc/static/mask_rcnn_r50_fpn_1x_coco/benchmark_common/prepare.sh @@ -2,14 +2,13 @@ # 执行路径在模型库的根目录下 ################################# 安装框架 如: echo "*******prepare benchmark start ***********" -pip install -U pip -i https://pypi.tuna.tsinghua.edu.cn/simple +pip install -U pip echo `pip --version` -pip install Cython -i https://pypi.tuna.tsinghua.edu.cn/simple -python -m pip install paddlepaddle-gpu==2.2.2.post112 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html -pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple +pip install Cython +pip install -r requirements.txt ################################# 准备训练数据 如: -wget -nc -P static/data/coco/ https://paddledet.bj.bcebos.com/data/coco_benchmark.tar -cd ./static/data/coco/ && tar -xf coco_benchmark.tar && mv -u coco_benchmark/* . +wget -nc -P static/dataset/coco/ https://paddledet.bj.bcebos.com/data/coco_benchmark.tar +cd ./static/dataset/coco/ && tar -xf coco_benchmark.tar && mv -u coco_benchmark/* . rm -rf coco_benchmark/ && cd ../../../ echo "*******prepare benchmark end***********" diff --git a/test_tipc/static/mask_rcnn_r50_fpn_1x_coco/benchmark_common/run_benchmark.sh b/test_tipc/static/mask_rcnn_r50_fpn_1x_coco/benchmark_common/run_benchmark.sh index 520af4d4c..d3a5bbb99 100644 --- a/test_tipc/static/mask_rcnn_r50_fpn_1x_coco/benchmark_common/run_benchmark.sh +++ b/test_tipc/static/mask_rcnn_r50_fpn_1x_coco/benchmark_common/run_benchmark.sh @@ -14,7 +14,7 @@ function _set_params(){ skip_steps=10 # (必选)解析日志,跳过模型前几个性能不稳定的step keyword="ips:" # (必选)解析日志,筛选出性能数据所在行的关键字 convergence_key="loss:" # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:" - max_iter=${7:-"500"} # (可选)需保证模型执行时间在5分钟内,需要修改代码提前中断的直接提PR 合入套件;或使用max_epoch参数 + max_iter=${7:-"100"} # (可选)需保证模型执行时间在5分钟内,需要修改代码提前中断的直接提PR 合入套件;或使用max_epoch参数 num_workers=${8:-"8"} # (可选) # 以下为通用执行命令,无特殊可不用修改 model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_process_type}_${run_mode} # (必填) 且格式不要改动,与竞品名称对齐 @@ -48,9 +48,9 @@ function _train(){ fi train_cmd="-c configs/mask_rcnn_r50_fpn_1x.yml -o LearningRate.base_lr=0.001 snapshot_iter=100000 \ - TrainReader.batch_size==${batch_size} \ + TrainReader.batch_size=${batch_size} \ max_iters=${max_iter} log_iter=1 \ - TrainReader.worker_num==${num_workers} ${use_fp16_cmd} \ + TrainReader.worker_num=${num_workers} ${use_fp16_cmd} \ --is_profiler=${is_profiler} " # 以下为通用执行命令,无特殊可不用修改 case ${run_mode} in @@ -84,5 +84,5 @@ function _train(){ } source ${BENCHMARK_ROOT}/scripts/run_model.sh # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;如果不联调只想要产出训练log可以注掉本行,提交时需打开 _set_params $@ -_train # 如果只产出训练log,不解析,可取消注释 +#_train # 如果只产出训练log,不解析,可取消注释 _run # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只产出训练log可以注掉本行,提交时需打开 diff --git a/test_tipc/static/yolov3_darknet53_270e_coco/N1C1/yolov3_darknet53_270e_coco_bs8_fp32_SingleP_DP.sh b/test_tipc/static/yolov3_darknet53_270e_coco/N1C1/yolov3_darknet53_270e_coco_bs8_fp32_SingleP_DP.sh index db222b973..d30a34bcb 100644 --- 
a/test_tipc/static/yolov3_darknet53_270e_coco/N1C1/yolov3_darknet53_270e_coco_bs8_fp32_SingleP_DP.sh +++ b/test_tipc/static/yolov3_darknet53_270e_coco/N1C1/yolov3_darknet53_270e_coco_bs8_fp32_SingleP_DP.sh @@ -4,7 +4,7 @@ fp_item=fp32 run_process_type=SingleP run_mode=DP device_num=N1C1 -max_iter=500 +max_iter=100 num_workers=8 # get data @@ -14,4 +14,4 @@ bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_it # run profiling sleep 10; export PROFILING=true -bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1; +bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} 11 ${num_workers} 2>&1; diff --git a/test_tipc/static/yolov3_darknet53_270e_coco/N1C8/yolov3_darknet53_270e_coco_bs8_fp32_MultiP_DP.sh b/test_tipc/static/yolov3_darknet53_270e_coco/N1C8/yolov3_darknet53_270e_coco_bs8_fp32_MultiP_DP.sh index 80c80fe97..808199025 100644 --- a/test_tipc/static/yolov3_darknet53_270e_coco/N1C8/yolov3_darknet53_270e_coco_bs8_fp32_MultiP_DP.sh +++ b/test_tipc/static/yolov3_darknet53_270e_coco/N1C8/yolov3_darknet53_270e_coco_bs8_fp32_MultiP_DP.sh @@ -4,7 +4,7 @@ fp_item=fp32 run_process_type=MultiP run_mode=DP device_num=N1C8 -max_iter=500 +max_iter=100 num_workers=8 # get data diff --git a/test_tipc/static/yolov3_darknet53_270e_coco/benchmark_common/prepare.sh b/test_tipc/static/yolov3_darknet53_270e_coco/benchmark_common/prepare.sh index e5a7024d4..f2325a107 100644 --- a/test_tipc/static/yolov3_darknet53_270e_coco/benchmark_common/prepare.sh +++ b/test_tipc/static/yolov3_darknet53_270e_coco/benchmark_common/prepare.sh @@ -2,14 +2,13 @@ # 执行路径在模型库的根目录下 ################################# 安装框架 如: echo "*******prepare benchmark start ***********" -pip install -U pip -i https://pypi.tuna.tsinghua.edu.cn/simple +pip install -U pip echo `pip --version` -pip install Cython -i https://pypi.tuna.tsinghua.edu.cn/simple -python -m pip install paddlepaddle-gpu==2.2.2.post112 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html -pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple +pip install Cython +pip install -r requirements.txt ################################# 准备训练数据 如: -wget -nc -P static/data/coco/ https://paddledet.bj.bcebos.com/data/coco_benchmark.tar -cd ./static/data/coco/ && tar -xf coco_benchmark.tar && mv -u coco_benchmark/* . +wget -nc -P static/dataset/coco/ https://paddledet.bj.bcebos.com/data/coco_benchmark.tar +cd ./static/dataset/coco/ && tar -xf coco_benchmark.tar && mv -u coco_benchmark/* . 
rm -rf coco_benchmark/ && cd ../../../ echo "*******prepare benchmark end***********" diff --git a/test_tipc/static/yolov3_darknet53_270e_coco/benchmark_common/run_benchmark.sh b/test_tipc/static/yolov3_darknet53_270e_coco/benchmark_common/run_benchmark.sh index fcd55431b..8a75a75b8 100644 --- a/test_tipc/static/yolov3_darknet53_270e_coco/benchmark_common/run_benchmark.sh +++ b/test_tipc/static/yolov3_darknet53_270e_coco/benchmark_common/run_benchmark.sh @@ -14,7 +14,7 @@ function _set_params(){ skip_steps=10 # (必选)解析日志,跳过模型前几个性能不稳定的step keyword="ips:" # (必选)解析日志,筛选出性能数据所在行的关键字 convergence_key="loss:" # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:" - max_iter=${7:-"500"} # (可选)需保证模型执行时间在5分钟内,需要修改代码提前中断的直接提PR 合入套件;或使用max_epoch参数 + max_iter=${7:-"100"} # (可选)需保证模型执行时间在5分钟内,需要修改代码提前中断的直接提PR 合入套件;或使用max_epoch参数 num_workers=${8:-"8"} # (可选) # 以下为通用执行命令,无特殊可不用修改 model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_process_type}_${run_mode} # (必填) 且格式不要改动,与竞品名称对齐 @@ -48,9 +48,9 @@ function _train(){ fi train_cmd="-c configs/yolov3_darknet.yml -o LearningRate.base_lr=0.002 snapshot_iter=100000 \ - TrainReader.batch_size==${batch_size} \ + TrainReader.batch_size=${batch_size} \ max_iters=${max_iter} log_iter=1 \ - TrainReader.worker_num==${num_workers} ${use_fp16_cmd} \ + TrainReader.worker_num=${num_workers} ${use_fp16_cmd} \ --is_profiler=${is_profiler} " # 以下为通用执行命令,无特殊可不用修改 case ${run_mode} in @@ -84,5 +84,5 @@ function _train(){ } source ${BENCHMARK_ROOT}/scripts/run_model.sh # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;如果不联调只想要产出训练log可以注掉本行,提交时需打开 _set_params $@ -_train # 如果只产出训练log,不解析,可取消注释 +#_train # 如果只产出训练log,不解析,可取消注释 _run # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只产出训练log可以注掉本行,提交时需打开 -- GitLab
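
A minimal invocation sketch for the single-card Mask R-CNN entry script touched above, assuming the commands run from the PaddleDetection repository root, that BENCHMARK_ROOT points at the external benchmark tooling providing scripts/run_model.sh, and that one GPU is visible; none of these values are set by this patch.

# Assumed environment; adjust to your setup (not defined by this patch).
export BENCHMARK_ROOT=/path/to/benchmark      # must provide scripts/run_model.sh, which run_benchmark.sh sources
export CUDA_VISIBLE_DEVICES=0                 # N1C1 case: a single GPU

# Install dependencies and download the COCO benchmark subset into static/dataset/coco/.
bash test_tipc/static/mask_rcnn_r50_1x_coco/benchmark_common/prepare.sh

# Entry script: after this patch it calls run_benchmark.sh with max_iter=100 for the timed
# run, then repeats the call with PROFILING=true and a literal 11 in the 7th argument
# position (max_iter) so the profiling pass stays short.
bash test_tipc/static/mask_rcnn_r50_1x_coco/N1C1/mask_rcnn_r50_1x_coco_bs2_fp32_SingleP_DP.sh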