From 802e45291117c6c97a327a87f2ca0457148ad6c7 Mon Sep 17 00:00:00 2001
From: Flowingsun007
Date: Wed, 28 Oct 2020 23:15:00 +0800
Subject: [PATCH] add scripts for testing bert

---
Reviewer note: the scripts below were revised after the blob ids on the
"index" lines were recorded, so those ids no longer match the new contents.

 DeepSpeed/bert/scripts/multi_node_train.sh | 84 ++++++++++++++++++++++
 DeepSpeed/bert/scripts/run_multi_node.sh   | 21 ++++++
 DeepSpeed/bert/scripts/run_single_node.sh  | 21 ++++++
 DeepSpeed/bert/scripts/run_two_node.sh     | 19 +++++
 4 files changed, 145 insertions(+)
 create mode 100644 DeepSpeed/bert/scripts/multi_node_train.sh
 create mode 100644 DeepSpeed/bert/scripts/run_multi_node.sh
 create mode 100644 DeepSpeed/bert/scripts/run_single_node.sh
 create mode 100644 DeepSpeed/bert/scripts/run_two_node.sh

diff --git a/DeepSpeed/bert/scripts/multi_node_train.sh b/DeepSpeed/bert/scripts/multi_node_train.sh
new file mode 100644
index 0000000..f67ac26
--- /dev/null
+++ b/DeepSpeed/bert/scripts/multi_node_train.sh
@@ -0,0 +1,84 @@
+#!/bin/bash
+# Run one DeepSpeed BERT pre-training benchmark and tee its output to a log file.
+#
+# Usage:
+#   multi_node_train.sh [MODEL] [BATCH_SIZE] [GPUS] [NODES] [TEST_NUM] [DTYPE]
+#
+#   MODEL       model name; ${MODEL}.json is passed via --cf (default: bert_base)
+#   BATCH_SIZE  micro batch size per GPU (default: 32)
+#   GPUS        comma-separated GPU ids; only their count is used (default: 0)
+#   NODES       comma-separated node addresses; only their count is used
+#               (default: $NODE1 from the environment)
+#   TEST_NUM    index of this repetition, used in the log file name (default: 1)
+#   DTYPE       "fp16" sets the config's "enabled" key to true, any other value
+#               sets it to false (default: fp32)
+#
+# Every path below is relative to the current working directory.
+
+# Make the exit status reflect a failed deepspeed run instead of tee's status.
+set -o pipefail
+
+# Where should we save checkpoints and tensorboard events?
+OUTPUT_DIR=../output
+rm -rf -- "${OUTPUT_DIR:?}"
+mkdir -p -- "$OUTPUT_DIR"
+
+MODEL=${1:-"bert_base"}
+BATCH_SIZE=${2:-32}
+gpus=${3:-"0"}
+nodes=${4:-$NODE1}
+TEST_NUM=${5:-1}
+DTYPE=${6:-"fp32"}
+
+# Count list entries as "number of commas + 1". Unlike deriving the count from
+# the string length, this also works for multi-digit ids such as "8,9,10,11".
+gpu_commas=${gpus//[^,]/}
+num_gpus=$(( ${#gpu_commas} + 1 ))
+node_commas=${nodes//[^,]/}
+num_nodes=$(( ${#node_commas} + 1 ))
+train_batch_size=$(( BATCH_SIZE * 1024 ))
+
+LOG_FOLDER=../logs-${DTYPE}/deepspeed/bert/bz${BATCH_SIZE}/${num_nodes}n${num_gpus}g
+mkdir -p -- "$LOG_FOLDER"
+LOGFILE=${LOG_FOLDER}/bert_b${BATCH_SIZE}_${DTYPE}_$TEST_NUM.log
+
+job_name=adam_nvidia_data_${MODEL}
+config=${MODEL}.json
+deepspeed_config=deepspeed_bsz64k_adam_config_seq128.json
+# deepspeed_config=deepspeed_bsz4k_onebit_config_seq128.json
+
+if [[ "$DTYPE" == "fp16" ]]; then
+  enabled=true
+else
+  enabled=false
+fi
+
+# Patch the DeepSpeed config in place for this run.
+# NOTE(review): the last expression rewrites every line containing an "enabled"
+# key, not only the fp16 one, and always appends a trailing comma -- confirm
+# that this suits the config file.
+sed -i "s/\"train_batch_size\":.*$/\"train_batch_size\": $train_batch_size,/" "$deepspeed_config"
+sed -i "s/\"train_micro_batch_size_per_gpu\":.*$/\"train_micro_batch_size_per_gpu\": $BATCH_SIZE,/" "$deepspeed_config"
+sed -i "s/\"enabled\":.*$/\"enabled\":$enabled,/" "$deepspeed_config"
+
+DATA_PATH_PREFIX=/datasets/bert/deepspeed/data/test
+
+# The hostfile is only passed when more than one node takes part.
+launcher_args=(--num_nodes="$num_nodes" --num_gpus="$num_gpus")
+if (( num_nodes >= 2 )); then
+  launcher_args=(--hostfile=deepspeed_hosts "${launcher_args[@]}")
+fi
+
+NCCL_TREE_THRESHOLD=0 deepspeed "${launcher_args[@]}" deepspeed_train.py \
+  --cf "$config" \
+  --max_seq_length 128 \
+  --output_dir "$OUTPUT_DIR" \
+  --deepspeed \
+  --print_steps 1 \
+  --lr_schedule "EP" \
+  --max_steps_per_epoch 120 \
+  --lr_offset 10e-4 \
+  --job_name "$job_name" \
+  --deepspeed_config "$deepspeed_config" \
+  --data_path_prefix "$DATA_PATH_PREFIX" \
+  --use_nvidia_dataset 2>&1 | tee "$LOGFILE"
diff --git a/DeepSpeed/bert/scripts/run_multi_node.sh b/DeepSpeed/bert/scripts/run_multi_node.sh
new file mode 100644
index 0000000..660313b
--- /dev/null
+++ b/DeepSpeed/bert/scripts/run_multi_node.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+# Repeat the 4-node BERT benchmark (GPU ids 0-7) five times.
+#
+# Usage: run_multi_node.sh [BATCH_SIZE] [DTYPE]
+#   BATCH_SIZE  forwarded to multi_node_train.sh (default: 32)
+#   DTYPE       forwarded to multi_node_train.sh (default: fp32)
+
+SHELL_FOLDER=$(dirname "$(readlink -f "$0")")
+BATCH_SIZE=${1:-32}
+DTYPE=${2:-"fp32"}
+NODE1='10.11.0.2'
+NODE2='10.11.0.3'
+NODE3='10.11.0.4'
+NODE4='10.11.0.5'
+nodes=$NODE1,$NODE2,$NODE3,$NODE4
+
+for (( i = 1; i <= 5; i++ )); do
+  bash "$SHELL_FOLDER/multi_node_train.sh" "bert_base" "$BATCH_SIZE" 0,1,2,3,4,5,6,7 "$nodes" "$i" "$DTYPE"
+  echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"
+  sleep 20
+done
diff --git a/DeepSpeed/bert/scripts/run_single_node.sh b/DeepSpeed/bert/scripts/run_single_node.sh
new file mode 100644
index 0000000..929fbd4
--- /dev/null
+++ b/DeepSpeed/bert/scripts/run_single_node.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+# Run the single-node BERT benchmark with 1, 4 and 8 GPUs, five times each.
+#
+# Usage: run_single_node.sh [BATCH_SIZE] [DTYPE]
+#   BATCH_SIZE  forwarded to multi_node_train.sh (default: 32)
+#   DTYPE       forwarded to multi_node_train.sh (default: fp32)
+
+SHELL_FOLDER=$(dirname "$(readlink -f "$0")")
+BATCH_SIZE=${1:-32}
+DTYPE=${2:-"fp32"}
+NODE1='10.11.0.2'
+nodes=$NODE1
+
+# One GPU list per configuration; each is repeated five times before moving on.
+for gpus in 0 0,1,2,3 0,1,2,3,4,5,6,7; do
+  for (( i = 1; i <= 5; i++ )); do
+    bash "$SHELL_FOLDER/multi_node_train.sh" "bert_base" "$BATCH_SIZE" "$gpus" "$nodes" "$i" "$DTYPE"
+    echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"
+    sleep 20
+  done
+done
diff --git a/DeepSpeed/bert/scripts/run_two_node.sh b/DeepSpeed/bert/scripts/run_two_node.sh
new file mode 100644
index 0000000..45fa1f0
--- /dev/null
+++ b/DeepSpeed/bert/scripts/run_two_node.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+# Repeat the 2-node BERT benchmark (GPU ids 0-7) five times.
+#
+# Usage: run_two_node.sh [BATCH_SIZE] [DTYPE]
+#   BATCH_SIZE  forwarded to multi_node_train.sh (default: 32)
+#   DTYPE       forwarded to multi_node_train.sh (default: fp32)
+
+SHELL_FOLDER=$(dirname "$(readlink -f "$0")")
+BATCH_SIZE=${1:-32}
+DTYPE=${2:-"fp32"}
+NODE1='10.11.0.2'
+NODE2='10.11.0.3'
+nodes=$NODE1,$NODE2
+
+for (( i = 1; i <= 5; i++ )); do
+  bash "$SHELL_FOLDER/multi_node_train.sh" "bert_base" "$BATCH_SIZE" 0,1,2,3,4,5,6,7 "$nodes" "$i" "$DTYPE"
+  echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"
+  sleep 20
+done
-- 
GitLab