Unverified commit 7cb9fc59 authored by gmm, committed by GitHub

fix benchmark (#5335)

* fix benchmark, delete run_process_type

* fix

* fix benchmark
Parent 9f9df6f7
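
For orientation, here is a minimal sketch of the launcher call after run_process_type is removed. The variable values are taken from the N1C1 mask_rcnn launcher in the diff below; inside run_benchmark.sh, single- versus multi-process launch is now decided from device_num instead of run_process_type.

model_item=mask_rcnn_r50_1x_coco
bs_item=2
fp_item=fp32
run_mode=DP       # data parallel
device_num=N1C1   # 1 machine, 1 card
max_iter=100
num_workers=2
# the old call passed run_process_type as the 4th positional argument; the new call omits it
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} ${num_workers}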
@@ -137,7 +136,6 @@ else
batch_size=${params_list[1]}
batch_size=`echo ${batch_size} | tr -cd "[0-9]" `
precision=${params_list[2]}
# run_process_type=${params_list[3]}
run_mode=${params_list[3]}
device_num=${params_list[4]}
IFS=";"
@@ -162,10 +161,9 @@ for batch_size in ${batch_size_list[*]}; do
gpu_id=$(set_gpu_id $device_num)
if [ ${#gpu_id} -le 1 ];then
run_process_type="SingleP"
log_path="$SAVE_LOG/profiling_log"
mkdir -p $log_path
log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_profiling"
log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_profiling"
func_sed_params "$FILENAME" "${line_gpuid}" "0" # sed used gpu_id
# set profile_option params
tmp=`sed -i "${line_profile}s/.*/${profile_option}/" "${FILENAME}"`
@@ -181,8 +179,8 @@ for batch_size in ${batch_size_list[*]}; do
speed_log_path="$SAVE_LOG/index"
mkdir -p $log_path
mkdir -p $speed_log_path
log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_log"
speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_speed"
log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_log"
speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_speed"
func_sed_params "$FILENAME" "${line_profile}" "null" # sed profile_id as null
cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 "
echo $cmd
@@ -193,13 +191,12 @@ for batch_size in ${batch_size_list[*]}; do
eval "cat ${log_path}/${log_name}"
# parse log
_model_name="${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}"
_model_name="${model_name}_bs${batch_size}_${precision}_${run_mode}"
cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \
--speed_log_file '${speed_log_path}/${speed_log_name}' \
--model_name ${_model_name} \
--base_batch_size ${batch_size} \
--run_mode ${run_mode} \
--run_process_type ${run_process_type} \
--fp_item ${precision} \
--keyword ips: \
--skip_steps 2 \
@@ -213,13 +210,12 @@ for batch_size in ${batch_size_list[*]}; do
else
IFS=";"
unset_env=`unset CUDA_VISIBLE_DEVICES`
run_process_type="MultiP"
log_path="$SAVE_LOG/train_log"
speed_log_path="$SAVE_LOG/index"
mkdir -p $log_path
mkdir -p $speed_log_path
log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_log"
speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_speed"
log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_log"
speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_speed"
func_sed_params "$FILENAME" "${line_gpuid}" "$gpu_id" # sed used gpu_id
func_sed_params "$FILENAME" "${line_profile}" "null" # sed --profile_option as null
cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 "
@@ -230,14 +226,13 @@ for batch_size in ${batch_size_list[*]}; do
export model_run_time=$((${job_et}-${job_bt}))
eval "cat ${log_path}/${log_name}"
# parse log
_model_name="${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}"
_model_name="${model_name}_bs${batch_size}_${precision}_${run_mode}"
cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \
--speed_log_file '${speed_log_path}/${speed_log_name}' \
--model_name ${_model_name} \
--base_batch_size ${batch_size} \
--run_mode ${run_mode} \
--run_process_type ${run_process_type} \
--fp_item ${precision} \
--keyword ips: \
--skip_steps 2 \
......
model_item=mask_rcnn_r50_1x_coco
bs_item=2
fp_item=fp32
run_process_type=SingleP
run_mode=DP
device_num=N1C1
max_iter=100
@@ -10,8 +9,8 @@ num_workers=2
# get data
bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
# run
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
# run profiling
sleep 10;
export PROFILING=true
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} 11 ${num_workers} 2>&1;
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} 11 ${num_workers} 2>&1;
model_item=mask_rcnn_r50_1x_coco
bs_item=2
fp_item=fp32
run_process_type=MultiP
run_mode=DP
device_num=N1C8
max_iter=100
@@ -10,4 +9,4 @@ num_workers=2
# get data
bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
# run
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
#!/usr/bin/env bash
# Test training benchmark for a model.
# Usage:bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num}
# Usage:bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num}
function _set_params(){
model_item=${1:-"model_item"} # (required) model item
base_batch_size=${2:-"2"} # (required) for static-graph single-process runs this is the per-card batch size; multiply by the card count at training time
fp_item=${3:-"fp32"} # (required) fp32|fp16
run_process_type=${4:-"SingleP"} # (required) single process SingleP | multi process MultiP
run_mode=${5:-"DP"} # (required) MP model parallel | DP data parallel | PP pipeline parallel | hybrid parallel DP1-MP1-PP1 | DP1-MP4-PP1
device_num=${6:-"N1C1"} # (required) number of cards used, N1C1|N1C8|N4C32 (4 machines, 32 cards)
run_mode=${4:-"DP"} # (required) MP model parallel | DP data parallel | PP pipeline parallel | hybrid parallel DP1-MP1-PP1 | DP1-MP4-PP1
device_num=${5:-"N1C1"} # (required) number of cards used, N1C1|N1C8|N4C32 (4 machines, 32 cards)
profiling=${PROFILING:-"false"} # (required) profiling switch, off by default, passed in via a global variable
model_repo="PaddleDetection" # (required) name of the model suite
speed_unit="samples/sec" # (required) unit of the speed metric
skip_steps=10 # (required) log parsing: number of initial unstable steps to skip
keyword="ips:" # (required) log parsing: keyword that marks lines containing speed data
convergence_key="loss:" # (optional) log parsing: keyword that marks lines containing convergence data, e.g. convergence_key="loss:"
max_iter=${7:-"100"} # (optional) keep the model run time within 5 minutes; if code changes are needed to stop early, submit a PR to the suite, or use a max_epoch parameter
num_workers=${8:-"8"} # (optional)
max_iter=${6:-"100"} # (optional) keep the model run time within 5 minutes; if code changes are needed to stop early, submit a PR to the suite, or use a max_epoch parameter
num_workers=${7:-"8"} # (optional)
# The following are generic commands; no changes needed unless your case is special
model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_process_type}_${run_mode} # (required) keep this format unchanged so names stay aligned with competitor benchmarks
model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_mode} # (required) keep this format unchanged so names stay aligned with competitor benchmarks
device=${CUDA_VISIBLE_DEVICES//,/ }
arr=(${device})
num_gpu_devices=${#arr[*]}
@@ -30,6 +29,9 @@ function _set_params(){
speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed
}
function _train(){
export FLAGS_eager_delete_tensor_gb=0.0
export FLAGS_fraction_of_gpu_memory_to_use=0.98
export FLAGS_memory_fraction_of_eager_deletion=1.0
cd ./static
batch_size=${base_batch_size} # if the model runs multi-card in a single process, compute the multi-card batch size inside _train
echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}"
@@ -54,20 +56,19 @@ function _train(){
--is_profiler=${is_profiler} "
# The following are generic commands; no changes needed unless your case is special
case ${run_mode} in
DP) if [[ ${run_process_type} = "SingleP" ]];then
echo "run ${run_mode} ${run_process_type}"
DP) if [[ ${device_num} = "N1C1" ]];then
echo "run ${run_mode} ${device_num}"
train_cmd="python -u tools/train.py ${train_cmd}"
elif [[ ${run_process_type} = "MultiP" ]];then
else
rm -rf ./mylog
train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=$CUDA_VISIBLE_DEVICES \
tools/train.py ${train_cmd}"
else
echo "run ${run_mode} ${run_process_type} error", exit 1
fi
;;
DP1-MP1-PP1) echo "run run_mode: DP1-MP1-PP1" ;;
*) echo "choose run_mode "; exit 1;
esac
echo "train_cmd: ${train_cmd} log_file: ${log_file}"
timeout 15m ${train_cmd} > ${log_file} 2>&1
if [ $? -ne 0 ];then
@@ -75,8 +76,8 @@ function _train(){
else
echo -e "${model_name}, SUCCESS"
fi
kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
if [ ${run_process_type} = "MultiP" -a -d mylog ]; then
# kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
if [ ${device_num} != "N1C1" -a -d mylog ]; then
rm ${log_file}
cp mylog/workerlog.0 ${log_file}
fi
......
model_item=mask_rcnn_r50_fpn_1x_coco
bs_item=2
fp_item=fp32
run_process_type=SingleP
run_mode=DP
device_num=N1C1
max_iter=100
@@ -10,8 +9,8 @@ num_workers=2
# get data
bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
# run
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
# run profiling
sleep 10;
export PROFILING=true
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} 11 ${num_workers} 2>&1;
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} 11 ${num_workers} 2>&1;
model_item=mask_rcnn_r50_fpn_1x_coco
bs_item=2
fp_item=fp32
run_process_type=MultiP
run_mode=DP
device_num=N1C8
max_iter=100
@@ -10,4 +9,4 @@ num_workers=2
# get data
bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
# run
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
#!/usr/bin/env bash
# Test training benchmark for a model.
# Usage:bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num}
# Usage:bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num}
function _set_params(){
model_item=${1:-"model_item"} # (required) model item
base_batch_size=${2:-"2"} # (required) for static-graph single-process runs this is the per-card batch size; multiply by the card count at training time
fp_item=${3:-"fp32"} # (required) fp32|fp16
run_process_type=${4:-"SingleP"} # (required) single process SingleP | multi process MultiP
run_mode=${5:-"DP"} # (required) MP model parallel | DP data parallel | PP pipeline parallel | hybrid parallel DP1-MP1-PP1 | DP1-MP4-PP1
device_num=${6:-"N1C1"} # (required) number of cards used, N1C1|N1C8|N4C32 (4 machines, 32 cards)
run_mode=${4:-"DP"} # (required) MP model parallel | DP data parallel | PP pipeline parallel | hybrid parallel DP1-MP1-PP1 | DP1-MP4-PP1
device_num=${5:-"N1C1"} # (required) number of cards used, N1C1|N1C8|N4C32 (4 machines, 32 cards)
profiling=${PROFILING:-"false"} # (required) profiling switch, off by default, passed in via a global variable
model_repo="PaddleDetection" # (required) name of the model suite
speed_unit="samples/sec" # (required) unit of the speed metric
skip_steps=10 # (required) log parsing: number of initial unstable steps to skip
keyword="ips:" # (required) log parsing: keyword that marks lines containing speed data
convergence_key="loss:" # (optional) log parsing: keyword that marks lines containing convergence data, e.g. convergence_key="loss:"
max_iter=${7:-"100"} # (optional) keep the model run time within 5 minutes; if code changes are needed to stop early, submit a PR to the suite, or use a max_epoch parameter
num_workers=${8:-"8"} # (optional)
max_iter=${6:-"100"} # (optional) keep the model run time within 5 minutes; if code changes are needed to stop early, submit a PR to the suite, or use a max_epoch parameter
num_workers=${7:-"8"} # (optional)
# The following are generic commands; no changes needed unless your case is special
model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_process_type}_${run_mode} # (required) keep this format unchanged so names stay aligned with competitor benchmarks
model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_mode} # (required) keep this format unchanged so names stay aligned with competitor benchmarks
device=${CUDA_VISIBLE_DEVICES//,/ }
arr=(${device})
num_gpu_devices=${#arr[*]}
@@ -30,6 +29,9 @@ function _set_params(){
speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed
}
function _train(){
export FLAGS_eager_delete_tensor_gb=0.0
export FLAGS_fraction_of_gpu_memory_to_use=0.98
export FLAGS_memory_fraction_of_eager_deletion=1.0
cd ./static
batch_size=${base_batch_size} # if the model runs multi-card in a single process, compute the multi-card batch size inside _train
echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}"
@@ -54,20 +56,19 @@ function _train(){
--is_profiler=${is_profiler} "
# The following are generic commands; no changes needed unless your case is special
case ${run_mode} in
DP) if [[ ${run_process_type} = "SingleP" ]];then
echo "run ${run_mode} ${run_process_type}"
DP) if [[ ${device_num} = "N1C1" ]];then
echo "run ${run_mode} ${device_num}"
train_cmd="python -u tools/train.py ${train_cmd}"
elif [[ ${run_process_type} = "MultiP" ]];then
else
rm -rf ./mylog
train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=$CUDA_VISIBLE_DEVICES \
tools/train.py ${train_cmd}"
else
echo "run ${run_mode} ${run_process_type} error", exit 1
fi
;;
DP1-MP1-PP1) echo "run run_mode: DP1-MP1-PP1" ;;
*) echo "choose run_mode "; exit 1;
esac
echo "train_cmd: ${train_cmd} log_file: ${log_file}"
timeout 15m ${train_cmd} > ${log_file} 2>&1
if [ $? -ne 0 ];then
@@ -75,8 +76,8 @@ function _train(){
else
echo -e "${model_name}, SUCCESS"
fi
kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
if [ ${run_process_type} = "MultiP" -a -d mylog ]; then
# kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
if [ ${device_num} != "N1C1" -a -d mylog ]; then
rm ${log_file}
cp mylog/workerlog.0 ${log_file}
fi
......
model_item=yolov3_darknet53_270e_coco
bs_item=8
fp_item=fp32
run_process_type=SingleP
run_mode=DP
device_num=N1C1
max_iter=100
@@ -10,8 +9,8 @@ num_workers=8
# get data
bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
# run
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
# run profiling
sleep 10;
export PROFILING=true
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} 11 ${num_workers} 2>&1;
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} 11 ${num_workers} 2>&1;
model_item=yolov3_darknet53_270e_coco
bs_item=8
fp_item=fp32
run_process_type=MultiP
run_mode=DP
device_num=N1C8
max_iter=100
@@ -10,4 +9,4 @@ num_workers=8
# get data
bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
# run
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1;
#!/usr/bin/env bash
# Test training benchmark for a model.
# Usage:bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num}
# Usage:bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num}
function _set_params(){
model_item=${1:-"model_item"} # (required) model item
base_batch_size=${2:-"2"} # (required) for static-graph single-process runs this is the per-card batch size; multiply by the card count at training time
fp_item=${3:-"fp32"} # (required) fp32|fp16
run_process_type=${4:-"SingleP"} # (required) single process SingleP | multi process MultiP
run_mode=${5:-"DP"} # (required) MP model parallel | DP data parallel | PP pipeline parallel | hybrid parallel DP1-MP1-PP1 | DP1-MP4-PP1
device_num=${6:-"N1C1"} # (required) number of cards used, N1C1|N1C8|N4C32 (4 machines, 32 cards)
run_mode=${4:-"DP"} # (required) MP model parallel | DP data parallel | PP pipeline parallel | hybrid parallel DP1-MP1-PP1 | DP1-MP4-PP1
device_num=${5:-"N1C1"} # (required) number of cards used, N1C1|N1C8|N4C32 (4 machines, 32 cards)
profiling=${PROFILING:-"false"} # (required) profiling switch, off by default, passed in via a global variable
model_repo="PaddleDetection" # (required) name of the model suite
speed_unit="samples/sec" # (required) unit of the speed metric
skip_steps=10 # (required) log parsing: number of initial unstable steps to skip
keyword="ips:" # (required) log parsing: keyword that marks lines containing speed data
convergence_key="loss:" # (optional) log parsing: keyword that marks lines containing convergence data, e.g. convergence_key="loss:"
max_iter=${7:-"100"} # (optional) keep the model run time within 5 minutes; if code changes are needed to stop early, submit a PR to the suite, or use a max_epoch parameter
num_workers=${8:-"8"} # (optional)
max_iter=${6:-"100"} # (optional) keep the model run time within 5 minutes; if code changes are needed to stop early, submit a PR to the suite, or use a max_epoch parameter
num_workers=${7:-"8"} # (optional)
# The following are generic commands; no changes needed unless your case is special
model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_process_type}_${run_mode} # (required) keep this format unchanged so names stay aligned with competitor benchmarks
model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_mode} # (required) keep this format unchanged so names stay aligned with competitor benchmarks
device=${CUDA_VISIBLE_DEVICES//,/ }
arr=(${device})
num_gpu_devices=${#arr[*]}
@@ -30,6 +29,9 @@ function _set_params(){
speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed
}
function _train(){
export FLAGS_eager_delete_tensor_gb=0.0
export FLAGS_fraction_of_gpu_memory_to_use=0.98
export FLAGS_memory_fraction_of_eager_deletion=1.0
cd ./static
batch_size=${base_batch_size} # if the model runs multi-card in a single process, compute the multi-card batch size inside _train
echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}"
@@ -54,20 +56,19 @@ function _train(){
--is_profiler=${is_profiler} "
# The following are generic commands; no changes needed unless your case is special
case ${run_mode} in
DP) if [[ ${run_process_type} = "SingleP" ]];then
echo "run ${run_mode} ${run_process_type}"
DP) if [[ ${device_num} = "N1C1" ]];then
echo "run ${run_mode} ${device_num}"
train_cmd="python -u tools/train.py ${train_cmd}"
elif [[ ${run_process_type} = "MultiP" ]];then
else
rm -rf ./mylog
train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=$CUDA_VISIBLE_DEVICES \
tools/train.py ${train_cmd}"
else
echo "run ${run_mode} ${run_process_type} error", exit 1
fi
;;
DP1-MP1-PP1) echo "run run_mode: DP1-MP1-PP1" ;;
*) echo "choose run_mode "; exit 1;
esac
echo "train_cmd: ${train_cmd} log_file: ${log_file}"
timeout 15m ${train_cmd} > ${log_file} 2>&1
if [ $? -ne 0 ];then
@@ -75,8 +76,8 @@ function _train(){
else
echo -e "${model_name}, SUCCESS"
fi
kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
if [ ${run_process_type} = "MultiP" -a -d mylog ]; then
# kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
if [ ${device_num} != "N1C1" -a -d mylog ]; then
rm ${log_file}
cp mylog/workerlog.0 ${log_file}
fi
......