PaddlePaddle / PaddleDetection
Commit 7cb9fc59 (unverified)
Authored by gmm on Mar 09, 2022; committed via GitHub on Mar 09, 2022
fix benchmark (#5335)

* fix benchmark, delete run_process_type
* fix
* fix benchmark
Parent: 9f9df6f7

Showing 10 changed files with 61 additions and 69 deletions (+61 / -69):
    test_tipc/benchmark_train.sh                                                                  +7  -12
    test_tipc/static/mask_rcnn_r50_1x_coco/N1C1/mask_rcnn_r50_1x_coco_bs2_fp32_DP.sh              +2  -3
    test_tipc/static/mask_rcnn_r50_1x_coco/N1C8/mask_rcnn_r50_1x_coco_bs2_fp32_DP.sh              +1  -2
    test_tipc/static/mask_rcnn_r50_1x_coco/benchmark_common/run_benchmark.sh                      +15 -14
    test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C1/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_DP.sh      +2  -3
    test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C8/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_DP.sh      +1  -2
    test_tipc/static/mask_rcnn_r50_fpn_1x_coco/benchmark_common/run_benchmark.sh                  +15 -14
    test_tipc/static/yolov3_darknet53_270e_coco/N1C1/yolov3_darknet53_270e_coco_bs8_fp32_DP.sh    +2  -3
    test_tipc/static/yolov3_darknet53_270e_coco/N1C8/yolov3_darknet53_270e_coco_bs8_fp32_DP.sh    +1  -2
    test_tipc/static/yolov3_darknet53_270e_coco/benchmark_common/run_benchmark.sh                 +15 -14
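The thread running through all ten files is that the run_process_type argument (SingleP/MultiP) is dropped from the TIPC benchmark chain; whether a run is single- or multi-process is now derived from device_num. A minimal before/after sketch of the launcher call, taken from the N1C1 config diffs below (the comments are editorial, not part of the diff):

    # before: 8 positional arguments, the 4th being SingleP or MultiP
    bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
    # after: 7 positional arguments; run_mode, device_num, max_iter, num_workers shift to $4-$7,
    # and run_benchmark.sh decides single vs. multi process from device_num (N1C1 means one card, one process)
    bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;

The per-file diffs follow.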
test_tipc/benchmark_train.sh

@@ -137,7 +137,6 @@ else
     batch_size=${params_list[1]}
     batch_size=`echo ${batch_size} | tr -cd "[0-9]" `
     precision=${params_list[2]}
-    # run_process_type=${params_list[3]}
     run_mode=${params_list[3]}
     device_num=${params_list[4]}
     IFS=";"
@@ -162,10 +161,9 @@ for batch_size in ${batch_size_list[*]}; do
     gpu_id=$(set_gpu_id $device_num)
     if [ ${#gpu_id} -le 1 ]; then
-        run_process_type="SingleP"
         log_path="$SAVE_LOG/profiling_log"
         mkdir -p $log_path
-        log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_profiling"
+        log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_profiling"
         func_sed_params "$FILENAME" "${line_gpuid}" "0"  # sed used gpu_id
         # set profile_option params
         tmp=`sed -i "${line_profile}s/.*/${profile_option}/" "${FILENAME}"`
@@ -181,8 +179,8 @@ for batch_size in ${batch_size_list[*]}; do
         speed_log_path="$SAVE_LOG/index"
         mkdir -p $log_path
         mkdir -p $speed_log_path
-        log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_log"
-        speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_speed"
+        log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_log"
+        speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_speed"
         func_sed_params "$FILENAME" "${line_profile}" "null"  # sed profile_id as null
         cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 "
         echo $cmd
@@ -193,13 +191,12 @@ for batch_size in ${batch_size_list[*]}; do
         eval "cat ${log_path}/${log_name}"
         # parser log
-        _model_name="${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}"
+        _model_name="${model_name}_bs${batch_size}_${precision}_${run_mode}"
         cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \
                 --speed_log_file '${speed_log_path}/${speed_log_name}' \
                 --model_name ${_model_name} \
                 --base_batch_size ${batch_size} \
                 --run_mode ${run_mode} \
-                --run_process_type ${run_process_type} \
                 --fp_item ${precision} \
                 --keyword ips: \
                 --skip_steps 2 \
@@ -213,13 +210,12 @@ for batch_size in ${batch_size_list[*]}; do
     else
         IFS=";"
         unset_env=`unset CUDA_VISIBLE_DEVICES`
-        run_process_type="MultiP"
         log_path="$SAVE_LOG/train_log"
         speed_log_path="$SAVE_LOG/index"
         mkdir -p $log_path
         mkdir -p $speed_log_path
-        log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_log"
-        speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_speed"
+        log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_log"
+        speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_speed"
         func_sed_params "$FILENAME" "${line_gpuid}" "$gpu_id"  # sed used gpu_id
         func_sed_params "$FILENAME" "${line_profile}" "null"  # sed --profile_option as null
         cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 "
@@ -230,14 +226,13 @@ for batch_size in ${batch_size_list[*]}; do
         export model_run_time=$((${job_et}-${job_bt}))
         eval "cat ${log_path}/${log_name}"
         # parser log
-        _model_name="${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}"
+        _model_name="${model_name}_bs${batch_size}_${precision}_${run_mode}"
         cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \
                 --speed_log_file '${speed_log_path}/${speed_log_name}' \
                 --model_name ${_model_name} \
                 --base_batch_size ${batch_size} \
                 --run_mode ${run_mode} \
-                --run_process_type ${run_process_type} \
                 --fp_item ${precision} \
                 --keyword ips: \
                 --skip_steps 2 \
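For reference, after this change the analysis command assembled by benchmark_train.sh no longer carries --run_process_type. A hedged sketch of how it would expand for the mask_rcnn_r50_1x_coco N1C1 case (the concrete values are illustrative; the trailing arguments are truncated in the diff view above and left out here):

    ${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \
            --speed_log_file '${speed_log_path}/${speed_log_name}' \
            --model_name mask_rcnn_r50_1x_coco_bs2_fp32_DP \
            --base_batch_size 2 \
            --run_mode DP \
            --fp_item fp32 \
            --keyword ips: \
            --skip_steps 2 \
            ...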
test_tipc/static/mask_rcnn_r50_1x_coco/N1C1/mask_rcnn_r50_1x_coco_bs2_fp32_SingleP_DP.sh → test_tipc/static/mask_rcnn_r50_1x_coco/N1C1/mask_rcnn_r50_1x_coco_bs2_fp32_DP.sh

 model_item=mask_rcnn_r50_1x_coco
 bs_item=2
 fp_item=fp32
-run_process_type=SingleP
 run_mode=DP
 device_num=N1C1
 max_iter=100
@@ -10,8 +9,8 @@ num_workers=2
 # get data
 bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
 # run
-bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
+bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
 # run profiling
 sleep 10;
 export PROFILING=true
-bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} 11 ${num_workers} 2>&1;
+bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} 11 ${num_workers} 2>&1;
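Assuming the renamed entry script above, a single-card benchmark for this model is now started from the PaddleDetection repository root with (a usage sketch, not part of the diff):

    bash test_tipc/static/mask_rcnn_r50_1x_coco/N1C1/mask_rcnn_r50_1x_coco_bs2_fp32_DP.sh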
test_tipc/static/mask_rcnn_r50_1x_coco/N1C8/mask_rcnn_r50_1x_coco_bs2_fp32_MultiP_DP.sh → test_tipc/static/mask_rcnn_r50_1x_coco/N1C8/mask_rcnn_r50_1x_coco_bs2_fp32_DP.sh

 model_item=mask_rcnn_r50_1x_coco
 bs_item=2
 fp_item=fp32
-run_process_type=MultiP
 run_mode=DP
 device_num=N1C8
 max_iter=100
@@ -10,4 +9,4 @@ num_workers=2
 # get data
 bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
 # run
-bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
+bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
test_tipc/static/mask_rcnn_r50_1x_coco/benchmark_common/run_benchmark.sh

 #!/usr/bin/env bash
 # Test training benchmark for a model.
-# Usage:bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num}
+# Usage:bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num}
 function _set_params(){
     model_item=${1:-"model_item"}          # (必选) 模型 item
     base_batch_size=${2:-"2"}              # (必选) 如果是静态图单进程,则表示每张卡上的BS,需在训练时*卡数
     fp_item=${3:-"fp32"}                   # (必选) fp32|fp16
-    run_process_type=${4:-"SingleP"}       # (必选) 单进程 SingleP|多进程 MultiP
-    run_mode=${5:-"DP"}                    # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP1-MP4-PP1
-    device_num=${6:-"N1C1"}                # (必选) 使用的卡数量,N1C1|N1C8|N4C32 (4机32卡)
+    run_mode=${4:-"DP"}                    # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP1-MP4-PP1
+    device_num=${5:-"N1C1"}                # (必选) 使用的卡数量,N1C1|N1C8|N4C32 (4机32卡)
     profiling=${PROFILING:-"false"}        # (必选) Profiling 开关,默认关闭,通过全局变量传递
     model_repo="PaddleDetection"           # (必选) 模型套件的名字
     speed_unit="samples/sec"               # (必选)速度指标单位
     skip_steps=10                          # (必选)解析日志,跳过模型前几个性能不稳定的step
     keyword="ips:"                         # (必选)解析日志,筛选出性能数据所在行的关键字
     convergence_key="loss:"                # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:"
-    max_iter=${7:-"100"}                   # (可选)需保证模型执行时间在5分钟内,需要修改代码提前中断的直接提PR 合入套件;或使用max_epoch参数
-    num_workers=${8:-"8"}                  # (可选)
+    max_iter=${6:-"100"}                   # (可选)需保证模型执行时间在5分钟内,需要修改代码提前中断的直接提PR 合入套件;或使用max_epoch参数
+    num_workers=${7:-"8"}                  # (可选)
     # 以下为通用执行命令,无特殊可不用修改
-    model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_process_type}_${run_mode}    # (必填) 且格式不要改动,与竞品名称对齐
+    model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_mode}    # (必填) 且格式不要改动,与竞品名称对齐
     device=${CUDA_VISIBLE_DEVICES//,/ }
     arr=(${device})
     num_gpu_devices=${#arr[*]}
@@ -30,6 +29,9 @@ function _set_params(){
     speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed
 }

 function _train(){
+    export FLAGS_eager_delete_tensor_gb=0.0
+    export FLAGS_fraction_of_gpu_memory_to_use=0.98
+    export FLAGS_memory_fraction_of_eager_deletion=1.0
     cd ./static
     batch_size=${base_batch_size}    # 如果模型跑多卡单进程时,请在_train函数中计算出多卡需要的bs
     echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}"
@@ -54,20 +56,19 @@ function _train(){
                --is_profiler=${is_profiler}"
     # 以下为通用执行命令,无特殊可不用修改
     case ${run_mode} in
-    DP) if [[ ${run_process_type} = "SingleP" ]];then
-        echo "run ${run_mode} ${run_process_type}"
+    DP) if [[ ${device_num} = "N1C1" ]];then
+        echo "run ${run_mode} ${device_num}"
         train_cmd="python -u tools/train.py ${train_cmd}"
-        elif [[ ${run_process_type} = "MultiP" ]];then
+        else
         rm -rf ./mylog
         train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=$CUDA_VISIBLE_DEVICES \
            tools/train.py ${train_cmd}"
-        else
-        echo "run ${run_mode} ${run_process_type} error", exit 1
         fi
         ;;
     DP1-MP1-PP1) echo "run run_mode: DP1-MP1-PP1" ;;
     *) echo "choose run_mode "; exit 1;
     esac
     echo "train_cmd: ${train_cmd}  log_file: ${log_file}"
     timeout 15m ${train_cmd} > ${log_file} 2>&1
     if [ $? -ne 0 ];then
@@ -75,8 +76,8 @@ function _train(){
     else
         echo -e "${model_name}, SUCCESS"
     fi
-    kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
-    if [ ${run_process_type} = "MultiP" -a -d mylog ]; then
+    # kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
+    if [ ${device_num} != "N1C1" -a -d mylog ]; then
         rm ${log_file}
         cp mylog/workerlog.0 ${log_file}
     fi
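With the new _set_params signature, calling the common runner directly looks like the following sketch; the argument values mirror the N1C1/N1C8 configs above (model_item, bs_item, fp_item, run_mode, device_num, max_iter, num_workers, in that order):

    # one card, one process: device_num=N1C1 selects "python -u tools/train.py"
    bash test_tipc/static/mask_rcnn_r50_1x_coco/benchmark_common/run_benchmark.sh mask_rcnn_r50_1x_coco 2 fp32 DP N1C1 100 2
    # eight cards: any device_num other than N1C1 selects paddle.distributed.launch with --log_dir=./mylog
    bash test_tipc/static/mask_rcnn_r50_1x_coco/benchmark_common/run_benchmark.sh mask_rcnn_r50_1x_coco 2 fp32 DP N1C8 100 2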
test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C1/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_SingleP_DP.sh → test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C1/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_DP.sh

 model_item=mask_rcnn_r50_fpn_1x_coco
 bs_item=2
 fp_item=fp32
-run_process_type=SingleP
 run_mode=DP
 device_num=N1C1
 max_iter=100
@@ -10,8 +9,8 @@ num_workers=2
 # get data
 bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
 # run
-bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
+bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
 # run profiling
 sleep 10;
 export PROFILING=true
-bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} 11 ${num_workers} 2>&1;
+bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} 11 ${num_workers} 2>&1;
test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C8/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_MultiP_DP.sh → test_tipc/static/mask_rcnn_r50_fpn_1x_coco/N1C8/mask_rcnn_r50_fpn_1x_coco_bs2_fp32_DP.sh

 model_item=mask_rcnn_r50_fpn_1x_coco
 bs_item=2
 fp_item=fp32
-run_process_type=MultiP
 run_mode=DP
 device_num=N1C8
 max_iter=100
@@ -10,4 +9,4 @@ num_workers=2
 # get data
 bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
 # run
-bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
+bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
test_tipc/static/mask_rcnn_r50_fpn_1x_coco/benchmark_common/run_benchmark.sh

The changes are identical to those shown above for test_tipc/static/mask_rcnn_r50_1x_coco/benchmark_common/run_benchmark.sh: the Usage comment and _set_params drop run_process_type and renumber the positional arguments (run_mode=$4, device_num=$5, max_iter=$6, num_workers=$7), model_name loses the ${run_process_type} segment, _train gains the three FLAGS_* exports, the DP case branches on [[ ${device_num} = "N1C1" ]] instead of ${run_process_type}, and the trailing workerlog-copy block tests [ ${device_num} != "N1C1" -a -d mylog ] with the kill command commented out.
test_tipc/static/yolov3_darknet53_270e_coco/N1C1/yolov3_darknet53_270e_coco_bs8_fp32_SingleP_DP.sh → test_tipc/static/yolov3_darknet53_270e_coco/N1C1/yolov3_darknet53_270e_coco_bs8_fp32_DP.sh

 model_item=yolov3_darknet53_270e_coco
 bs_item=8
 fp_item=fp32
-run_process_type=SingleP
 run_mode=DP
 device_num=N1C1
 max_iter=100
@@ -10,8 +9,8 @@ num_workers=8
 # get data
 bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
 # run
-bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
+bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
 # run profiling
 sleep 10;
 export PROFILING=true
-bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} 11 ${num_workers} 2>&1;
+bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} 11 ${num_workers} 2>&1;
test_tipc/static/yolov3_darknet53_270e_coco/N1C8/yolov3_darknet53_270e_coco_bs8_fp32_MultiP_DP.sh → test_tipc/static/yolov3_darknet53_270e_coco/N1C8/yolov3_darknet53_270e_coco_bs8_fp32_DP.sh

 model_item=yolov3_darknet53_270e_coco
 bs_item=8
 fp_item=fp32
-run_process_type=MultiP
 run_mode=DP
 device_num=N1C8
 max_iter=100
@@ -10,4 +9,4 @@ num_workers=8
 # get data
 bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
 # run
-bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
+bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
test_tipc/static/yolov3_darknet53_270e_coco/benchmark_common/run_benchmark.sh

The changes are identical to those shown above for test_tipc/static/mask_rcnn_r50_1x_coco/benchmark_common/run_benchmark.sh: the Usage comment and _set_params drop run_process_type and renumber the positional arguments (run_mode=$4, device_num=$5, max_iter=$6, num_workers=$7), model_name loses the ${run_process_type} segment, _train gains the three FLAGS_* exports, the DP case branches on [[ ${device_num} = "N1C1" ]] instead of ${run_process_type}, and the trailing workerlog-copy block tests [ ${device_num} != "N1C1" -a -d mylog ] with the kill command commented out.