diff --git a/model_zoo/official/nlp/bert_thor/README.md b/model_zoo/official/nlp/bert_thor/README.md
index c89956996533582bb36aab129b64312528566143..80ed17b42c0aa11bbb58748b0da19a5ba3a73b90 100644
--- a/model_zoo/official/nlp/bert_thor/README.md
+++ b/model_zoo/official/nlp/bert_thor/README.md
@@ -128,12 +128,12 @@ Parameters for both training and inference can be set in config.py.
 ```
 sh run_distribute_pretrain.sh [DEVICE_NUM] [EPOCH_SIZE] [DATA_DIR] [SCHEMA_DIR] [RANK_TABLE_FILE]
 ```
-We need three parameters for this scripts.
+We need five parameters for this script.
 - `DEVICE_NUM`: the device number for distributed train.
 - `EPOCH_SIZE`: Epoch size used in the model
 - `DATA_DIR`:Data path, it is better to use absolute path.
 - `SCHEMA_DIR `:Schema path, it is better to use absolute path
-- `RANK_TABLE_FILE`: the path of rank_table.json
+- `RANK_TABLE_FILE`: the rank table file in JSON format
 
 Training result will be stored in the current path, whose folder name begins with the file name that the user defines. Under this, you can find checkpoint file together with result like the followings in log.
 ```
diff --git a/model_zoo/official/nlp/bert_thor/pretrain_eval.py b/model_zoo/official/nlp/bert_thor/pretrain_eval.py
index 0e64c61700e603377f2121e53704b2bfbf76db43..4cb501a4a62171ae06b4534d9576c7db7baf76c1 100644
--- a/model_zoo/official/nlp/bert_thor/pretrain_eval.py
+++ b/model_zoo/official/nlp/bert_thor/pretrain_eval.py
@@ -153,10 +153,8 @@ def MLM_eval():
     net = Model(net_for_pretraining, eval_network=net_for_pretraining, eval_indexes=[0, 1, 2],
                 metrics={'name': myMetric()})
     res = net.eval(dataset, dataset_sink_mode=False)
-    print("==============================================================")
     for _, v in res.items():
         print("Accuracy is: ", v)
-    print("==============================================================")
 
 
 if __name__ == "__main__":
diff --git a/model_zoo/official/nlp/bert_thor/scripts/run_distribute_pretrain.sh b/model_zoo/official/nlp/bert_thor/scripts/run_distribute_pretrain.sh
index 3ac2db0206d6506290440767445ff6dab78b31e5..1b695fd28c430a62a986b6874e6b098f312974bc 100644
--- a/model_zoo/official/nlp/bert_thor/scripts/run_distribute_pretrain.sh
+++ b/model_zoo/official/nlp/bert_thor/scripts/run_distribute_pretrain.sh
@@ -25,6 +25,9 @@ EPOCH_SIZE=$2
 DATA_DIR=$3
 SCHEMA_DIR=$4
 
+BASE_PATH=$(cd "`dirname $0`" || exit; pwd)
+cd $BASE_PATH/ || exit
+
 ulimit -u unlimited
 export RANK_TABLE_FILE=$5
 export RANK_SIZE=$1
@@ -55,6 +58,7 @@ do
     --load_checkpoint_path="" \
     --save_checkpoint_path='./' \
     --save_checkpoint_steps=1000 \
+    --train_steps=3000 \
     --save_checkpoint_num=30 \
     --data_dir=$DATA_DIR \
     --schema_dir=$SCHEMA_DIR > log.txt 2>&1 &
diff --git a/model_zoo/official/nlp/bert_thor/scripts/run_standalone_pretrain.sh b/model_zoo/official/nlp/bert_thor/scripts/run_standalone_pretrain.sh
index 35d18c2ad00d3778cc30545a2c4e629f590e3f44..87098430f0286ff8003d8d924386c1547d2b5a6a 100644
--- a/model_zoo/official/nlp/bert_thor/scripts/run_standalone_pretrain.sh
+++ b/model_zoo/official/nlp/bert_thor/scripts/run_standalone_pretrain.sh
@@ -24,6 +24,9 @@ EPOCH_SIZE=$2
 DATA_DIR=$3
 SCHEMA_DIR=$4
 
+BASE_PATH=$(cd "`dirname $0`" || exit; pwd)
+cd $BASE_PATH/ || exit
+
 ulimit -u unlimited
 export DEVICE_ID=$1
 export RANK_SIZE=1
@@ -51,6 +54,7 @@ python run_pretrain.py \
 --load_checkpoint_path="" \
 --save_checkpoint_path='./' \
 --save_checkpoint_steps=5000 \
+--train_steps=-1 \
 --save_checkpoint_num=20 \
 --data_dir=$DATA_DIR \
 --schema_dir=$SCHEMA_DIR > log.txt 2>&1 &
diff --git a/model_zoo/official/nlp/bert_thor/src/lr_generator.py b/model_zoo/official/nlp/bert_thor/src/lr_generator.py
index d3ca9f458a2ea28861bbef896ef4e0e8828dcbae..cb761cdccb23ff0a3046c48a983717972187fbec 100644
--- a/model_zoo/official/nlp/bert_thor/src/lr_generator.py
+++ b/model_zoo/official/nlp/bert_thor/src/lr_generator.py
@@ -55,7 +55,7 @@ def get_poly_lr(global_step, lr_init, lr_end, lr_max, warmup_steps, total_steps,
     return learning_rate
 
 
-# bert kfac hyperparam setting
+# bert thor hyperparam setting
def get_bert_lr():
     learning_rate = Tensor(
         get_poly_lr(global_step=0, lr_init=0.0, lr_end=1e-6, lr_max=3.1e-3, warmup_steps=0, total_steps=30000,
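
For reference, here is a minimal sketch of how the five documented parameters line up with a launch of the updated scripts. The device numbers, epoch count, and paths below are placeholders rather than values taken from this change, and the reading of `--train_steps=-1` as "no fixed step cap, train for the full `EPOCH_SIZE`" is an assumption about how `run_pretrain.py` treats non-positive step counts, not something this diff states:

```
# Hypothetical distributed launch: DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR RANK_TABLE_FILE.
# With this change the script also caps training via --train_steps=3000.
sh run_distribute_pretrain.sh 8 40 /abs/path/to/dataset /abs/path/to/schema.json /abs/path/to/rank_table.json

# Hypothetical standalone launch: DEVICE_ID EPOCH_SIZE DATA_DIR SCHEMA_DIR
# (parameter order taken from the $1..$4 assignments in the script).
# The script now passes --train_steps=-1 (assumed: no fixed step cap).
sh run_standalone_pretrain.sh 0 40 /abs/path/to/dataset /abs/path/to/schema.json
```

Note that the added `cd $BASE_PATH/ || exit` lines let both scripts be launched from any working directory: relative paths such as the `./` checkpoint directory and `log.txt` now resolve against the script's own location instead of the caller's.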