使用DGC训练ResNet50出现NAN
Created by: listenlink
- paddle版本:1.5.1
- models版本:release/1.5
- 训练脚本:models/PaddleCV/image_classification/dist_train。使用 nccl2 在单机 8 卡 V100 上训练 ResNet50:fp32 训练 90 个 epoch 精度正常(top1@76.3);增加 enable_dgc=True 参数后进行训练,多次实验均在第 38 个 epoch 左右 loss 出现 NaN,后续精度全部错误。log 如下:
训练脚本如下:
#!/bin/bash
# Command-line handling for the single-node nccl2 training launcher.
#
# Usage: bash <this script> [-enable_dgc True|False]
#
# Sets the global `enable_dgc` (default: True), which the rest of the script
# forwards to dist_train.py via --enable_dgc.
set -e

enable_dgc=True

# Print a diagnostic to stderr and abort the script with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Parse the launcher's own options.
# Globals:   enable_dgc (written)
# Arguments: the script's positional parameters
# Returns:   0 on success; exits 1 on an unknown or incomplete option
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      -enable_dgc)
        # Without this check `shift 2` would fail and, under `set -e`,
        # terminate the script without any message.
        (( $# >= 2 )) || die "missing value for -enable_dgc"
        enable_dgc="$2"
        shift 2
        ;;
      *)
        # An empty argument ends option parsing; anything else is unknown.
        if [[ -n "$1" ]]; then
          die "not supported arguments ${1}"
        fi
        break
        ;;
    esac
  done
}

# Accept only the two literal values True and False.
# Arguments: $1 - candidate value for -enable_dgc
# Returns:   0 when valid; exits 1 (echoing the offending value) otherwise
validate_enable_dgc() {
  case "$1" in
    True | False) ;;
    *) die "not support argument -enable_dgc: $1" ;;
  esac
}

parse_args "$@"
validate_enable_dgc "${enable_dgc}"
# Environment setup and launch of one dist_train.py trainer per GPU.
# Reads the global `enable_dgc` and forwards it to every trainer.
export MODEL="DistResNet"

# One trainer process per GPU; trainer i listens on 127.0.0.1:(base_port + i)
# and is pinned to GPU i.
num_trainers=8
base_port=7160

# Comma-separated list of every trainer endpoint
# (127.0.0.1:7160 ... 127.0.0.1:7167 with the values above).
endpoints=""
for (( i = 0; i < num_trainers; i++ )); do
  endpoints+="${endpoints:+,}127.0.0.1:$(( base_port + i ))"
done
export PADDLE_TRAINER_ENDPOINTS="${endpoints}"
# PADDLE_TRAINERS_NUM is used only for reader when nccl2 mode
export PADDLE_TRAINERS_NUM="${num_trainers}"

mkdir -p logs

# NOTE: NCCL_P2P_DISABLE is set so that nccl2 distributed training can run
# on a single node.
# To get more detailed logs, enable vlog:
# export GLOG_v=1
# export GLOG_logtostderr=1

# Every trainer is started in the background with stdout and stderr captured
# in logs/tr<i>.log. No `wait` is issued here, so this section does not block
# on, or check the exit status of, the trainers.
for (( i = 0; i < num_trainers; i++ )); do
  PADDLE_TRAINING_ROLE="TRAINER" \
  PADDLE_CURRENT_ENDPOINT="127.0.0.1:$(( base_port + i ))" \
  PADDLE_TRAINER_ID="${i}" \
  CUDA_VISIBLE_DEVICES="${i}" \
  NCCL_P2P_DISABLE="1" \
  python -u dist_train.py \
    --enable_dgc "${enable_dgc}" \
    --lr 0.0125 \
    --model "${MODEL}" \
    --data_dir /data/imagenet-jpeg/ \
    --update_method nccl2 \
    --batch_size 32 &> "logs/tr${i}.log" &
done