未验证 提交 6150c2dd 编写于 作者: S Snow 提交者: GitHub

Dev sx insightface (#131)

* add insightface pr template

* update readme and add fp32 results

* update readme

* update readme and modify scripts

* update insightface readme and add reports

* add pics in chinese report

* add pics in english

* add en pics

* refine readme.md

* update test pics

* update report

* Update dlperf_insightface_test_report_v1.md

test pic

* Create dlperf_insightface_test_report_v1.md

test pics again

* fix pic bug

* Update dlperf_insightface_test_report_v1.md

* fix data

* add new path of insightface and update as reviewed

* refine scripts and update readme

* update data and pics

* modify as reviewed

* update report as reviewed

* modify OOM into - and rm invalid pics' links

* fix mxnet max num_classes

* update changelog

* update change log

* add insightface report links and introduction

* modify as reviewed

* modify as reviewed

* fix part num and dataset zoo

* rm data preprocess part

* update link

* update data of 2 nodes

* add multi node scripts

* update results of 4 nodes

* update report

* update pics

* update mxnet data

* update multi-node scripts and rm different insightface_train.py

* rm insightface_train.py and mv config into scripts with sed

* update gnuplot img

* reduce scripts and update

* update
Co-authored-by: Liang Depeng <liangdepeng@gmail.com>
Co-authored-by: Flowingsun007 <flowingsun007@163.com>
Co-authored-by: MARD1NO <359521840@qq.com>
上级 1be38a77
......@@ -2,7 +2,9 @@
set -ex
workdir=/home/leinao/sx
workdir=/workdir
host_num=${1:-4}
network=${2:-"r100"}
dataset=${3:-"emore"}
......@@ -22,9 +24,11 @@ use_synthetic_data=${16:-False}
port=22
scripts_path=${workdir}/oneflow_face
test_scripts=${scripts_path}/scripts
LOCAL_RUN=${scripts_path}/scripts/train_insightface.sh
SCRIPTS_PATH=${workdir}/oneflow_face
TEST_SCRIPTS=${SCRIPTS_PATH}/scripts
LOCAL_RUN=${SCRIPTS_PATH}/scripts/train_insightface.sh
##############################################
#0 prepare the host list for training
......@@ -53,6 +57,16 @@ fi
hosts=("${host_list[@]:0:${host_num}}")
echo "Working on hosts:${hosts[@]}"
# Rewrite node_ips in sample_config.py to match the number of hosts in this run.
# Only the 2-host and 4-host layouts have a canned address list; for any other
# host_num the config file is left untouched and the operator is told to edit
# it by hand. Expansions are quoted so an empty host_num or a SCRIPTS_PATH
# containing whitespace cannot break the comparison or the sed invocation.
# The sed scripts are single-quoted: the replacement text needs no escaping
# for brackets or double quotes, and the resulting line is the same as before.
case "${host_num}" in
  2)
    sed -i 's/node_ips = \[.*\]/node_ips = ["10.11.0.2", "10.11.0.3"]/g' \
      "${SCRIPTS_PATH}/sample_config.py"
    ;;
  4)
    sed -i 's/node_ips = \[.*\]/node_ips = ["10.11.0.2", "10.11.0.3", "10.11.0.4", "10.11.0.5"]/g' \
      "${SCRIPTS_PATH}/sample_config.py"
    ;;
  *)
    echo "Please modify parameters in oneflow_face/sample_config.py, run_multi_nodes.sh manually! "
    ;;
esac
test_case=${host_num}n${gpu_num_per_node}g_b${bz_per_device}_${network}_${dataset}_${loss}
log_file=${test_case}.log
......@@ -73,7 +87,8 @@ for host in "${hosts[@]:1}"
do
echo "start training on ${host}"
ssh -p ${port} $host "rm -rf ~/oneflow_temp/*"
scp -P ${port} -r $scripts_path $LOCAL_RUN $host:~/oneflow_temp
scp -P ${port} -r $SCRIPTS_PATH $LOCAL_RUN $host:~/oneflow_temp
ssh -p ${port} $host "cd ~/oneflow_temp; nohup bash train_insightface.sh ~/oneflow_temp/oneflow_face ${network} ${dataset} ${loss} ${num_nodes} $bz_per_device $train_unit $train_iter ${gpu_num_per_node} $precision $model_parallel $partial_fc $test_times $sample_ratio $num_classes 1>${log_file} 2>&1 </dev/null &"
done
......@@ -81,7 +96,7 @@ done
host=${hosts[0]}
echo "start training on ${host}"
ssh -p ${port} $host "rm -rf ~/oneflow_temp/*"
scp -P ${port} -r $scripts_path $LOCAL_RUN $host:~/oneflow_temp
scp -P ${port} -r $SCRIPTS_PATH $LOCAL_RUN $host:~/oneflow_temp
ssh -p ${port} $host "cd ~/oneflow_temp; bash train_insightface.sh ~/oneflow_temp/oneflow_face ${network} ${dataset} ${loss} ${num_nodes} $bz_per_device $train_unit $train_iter ${gpu_num_per_node} $precision $model_parallel $partial_fc $test_times $sample_ratio $num_classes 1>${log_file}"
echo "done"
......
# !/bin/bash
# !/bin/bash
export PYTHONUNBUFFERED=1
workspace=${1:-"/home/leinao/sx/test_face"}
workspace=${1:-"/oneflow_face"}
network=${2:-"r100"}
dataset=${3:-"emore"}
loss=${4:-"arcface"}
bz_per_device=${5:-128}
train_unit=${6:-"batch"}
iter_num=${7:-150}
precision=${8:-fp32}
model_parallel=${9:-True}
partila_fc=${10:-True}
sample_ratio=${11:-0.1}
num_classes=${12:-1500000}
use_synthetic_data=${13:-False}
num_nodes=${5:-1}
bz_per_device=${6:-64}
train_unit=${7:-"batch"}
train_iter=${8:-150}
gpu_num_per_node=${9:-8}
precision=${10:-fp32}
model_parallel=${11:-1}
partial_fc=${12:-1}
test_times=${13:-1}
sample_ratio=${14:-0.1}
num_classes=${15:-85744}
use_synthetic_data=${16:-False}
i=1
while [ $i -le 5 ]
do
bash ${workspace}/scripts/train_insightface.sh ${workspace} ${network} ${dataset} ${loss} 1 ${bz_per_device} ${train_unit} ${iter_num} 1 ${precision} ${model_parallel} ${partila_fc} $i ${sample_ratio} ${num_classes} ${use_synthetic_data}
while [ $i -le 5 ]; do
bash ${workspace}/scripts/train_insightface.sh ${workspace} ${network} ${dataset} ${loss} 1 ${bz_per_device} ${train_unit} ${train_iter} 1 ${precision} ${model_parallel} ${partial_fc} $i ${sample_ratio} ${num_classes} ${use_synthetic_data}
echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Finished Test Case ${i}! <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"
let i++
sleep 20
done
i=1
while [ $i -le 5 ]
do
bash ${workspace}/scripts/train_insightface.sh ${workspace} ${network} ${dataset} ${loss} 1 ${bz_per_device} ${train_unit} ${iter_num} 1 ${precision} ${model_parallel} ${partila_fc} $i ${sample_ratio} ${num_classes} ${use_synthetic_data}
while [ $i -le 5 ]; do
bash ${workspace}/scripts/train_insightface.sh ${workspace} ${network} ${dataset} ${loss} 1 ${bz_per_device} ${train_unit} ${train_iter} 4 ${precision} ${model_parallel} ${partial_fc} $i ${sample_ratio} ${num_classes} ${use_synthetic_data}
echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Finished Test Case ${i}! <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"
let i++
sleep 20
done
i=1
while [ $i -le 5 ]
do
bash ${workspace}/scripts/train_insightface.sh ${workspace} ${network} ${dataset} ${loss} 1 ${bz_per_device} ${train_unit} ${iter_num} 1 ${precision} ${model_parallel} ${partila_fc} $i ${sample_ratio} ${num_classes} ${use_synthetic_data}
while [ $i -le 5 ]; do
bash ${workspace}/scripts/train_insightface.sh ${workspace} ${network} ${dataset} ${loss} 1 ${bz_per_device} ${train_unit} ${train_iter} 8 ${precision} ${model_parallel} ${partial_fc} $i ${sample_ratio} ${num_classes} ${use_synthetic_data}
echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Finished Test Case ${i}! <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"
let i++
sleep 20
......
#!/bin/bash
#export ONEFLOW_DEBUG_MODE=True
export PYTHONUNBUFFERED=1
workspace=${1:-"/data/oneflow_temp/oneflow_face"}
workspace=${1:-"/oneflow_face"}
network=${2:-"r100"}
dataset=${3:-"emore"}
loss=${4:-"arcface"}
num_nodes=${5:-4}
batch_size_per_device=${6:-64}
train_unit=${7:-"batch"}
train_iter=${8:-150}
train_iter=${8:-150}
gpu_num_per_node=${9:-8}
precision=${10:-fp32}
model_parallel=${11:-1}
......@@ -22,24 +23,24 @@ MODEL_SAVE_DIR=${num_classes}_${precision}_b${batch_size_per_device}_oneflow_mod
LOG_DIR=$MODEL_SAVE_DIR
if [ $gpu_num_per_node -gt 1 ]; then
if [ $network = "r100" ]; then
data_part_num=32
elif [ $network = "r100_glint360k" ]; then
data_part_num=200
else
echo "Please modify exact data part num in sample_config.py!"
if [ $network = "r100" ]; then
data_part_num=32
elif [ $network = "r100_glint360k" ]; then
data_part_num=200
else
echo "Please modify exact data part num in sample_config.py!"
fi
else
data_part_num=1
data_part_num=1
fi
sed -i "s/emore.train_data_part_num = 32/emore.train_data_part_num = $data_part_num/g" $workspace/sample_config.py
sed -i "s/emore.num_classes = 85744/emore.num_classes = $num_classes/g" $workspace/sample_config.py
sed -i "s/${dataset}.train_data_part_num = [[:digit:]]*/${dataset}.train_data_part_num = $data_part_num/g" $workspace/sample_config.py
sed -i "s/${dataset}.num_classes = [[:digit:]]*/${dataset}.num_classes = $num_classes/g" $workspace/sample_config.py
PREC=""
if [ "$precision" = "fp16" ] ; then
if [ "$precision" = "fp16" ]; then
PREC=" --use_fp16=True"
elif [ "$precision" = "fp32" ] ; then
elif [ "$precision" = "fp32" ]; then
PREC=" --use_fp16=False"
else
echo "Unknown <precision> argument"
......@@ -57,6 +58,7 @@ CMD="$workspace/insightface_train.py"
CMD+=" --network=${network}"
CMD+=" --dataset=${dataset}"
CMD+=" --loss=${loss}"
CMD+=" --num_nodes=${num_nodes}"
CMD+=" --train_batch_size=$(expr $num_nodes '*' $gpu_num_per_node '*' $batch_size_per_device)"
CMD+=" --train_unit=${train_unit}"
CMD+=" --train_iter=${train_iter}"
......@@ -68,16 +70,16 @@ CMD+=" --log_dir=${LOG_DIR}"
CMD+=" $PREC"
CMD+=" --sample_ratio=${sample_ratio}"
CMD+=" --use_synthetic_data=${use_synthetic_data}"
CMD+=" --num_classes=${num_classes}"
CMD+=" --data_part_num=${data_part_num}"
CMD+=" --iter_num_in_snapshot=5000"
CMD+=" --validation_interval=5000"
CMD="/home/leinao/anaconda3/envs/insightface/bin/python3 $CMD "
set -x
if [ -z "$LOG_FILE" ] ; then
if [ -z "$LOG_FILE" ]; then
$CMD
else
(
$CMD
$CMD
) |& tee $LOG_FILE
fi
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册