Commit fa10a4e4 authored by chenhaozhe

optimize print of bert scripts

Parent a3b8b4c2
@@ -14,17 +14,30 @@ This example implements pre-training, fine-tuning and evaluation of [BERT-base](
 ### Pre-Training
 - Set options in `config.py`, including lossscale, optimizer and network. Click [here](https://www.mindspore.cn/tutorial/zh-CN/master/use/data_preparation/loading_the_datasets.html#tfrecord) for more information about dataset and the json schema file.
-- Run `run_standalone_pretrain.sh` for non-distributed pre-training of BERT-base and BERT-NEZHA model.
+- Run `run_standalone_pretrain.sh` for non-distributed pre-training of BERT-base and BERT-NEZHA model on `Ascend`.
 ``` bash
-sh scripts/run_standalone_pretrain.sh DEVICE_ID EPOCH_SIZE DATA_DIR SCHEMA_DIR
+bash scripts/run_standalone_pretrain.sh DEVICE_ID EPOCH_SIZE DATA_DIR SCHEMA_DIR
 ```
-- Run `run_distribute_pretrain.sh` for distributed pre-training of BERT-base and BERT-NEZHA model.
+- Run `run_standalone_pretrain_for_gpu.sh` for non-distributed pre-training of BERT-base and BERT-NEZHA model on `GPU`.
+``` bash
+bash scripts/run_standalone_pretrain_for_gpu.sh DEVICE_ID EPOCH_SIZE DATA_DIR SCHEMA_DIR
+```
+- Run `run_distribute_pretrain.sh` for distributed pre-training of BERT-base and BERT-NEZHA model on `Ascend`.
 ``` bash
-sh scripts/run_distribute_pretrain.sh DATA_DIR RANK_TABLE_FILE
+bash scripts/run_distribute_pretrain.sh DATA_DIR RANK_TABLE_FILE
 ```
+- Run `run_distribute_pretrain_for_gpu.sh` for distributed pre-training of BERT-base and BERT-NEZHA model on `GPU`.
+```bash
+bash scripts/run_distribute_pretrain_for_gpu.sh RANK_SIZE EPOCH_SIZE DATA_DIR SCHEMA_DIR
+```
 ### Fine-Tuning and Evaluation
 - Including three kinds of task: Classification, NER(Named Entity Recognition) and SQuAD(Stanford Question Answering Dataset)
......
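For context, the `config.py` options referenced in the README hunk above cover the loss-scale settings, the optimizer and the network variant. The sketch below is purely illustrative: the key names and default values are assumptions, not the file's actual contents; only the supported optimizer names (Lamb, Momentum, AdamWeightDecay) are confirmed by the error message in `run_pretrain` further down.

```python
# Illustrative sketch only: hypothetical pre-training options in config.py.
# Key names and values are assumptions; the optimizer choices
# (Lamb, Momentum, AdamWeightDecay) come from run_pretrain's error message.
from easydict import EasyDict as edict

cfg = edict({
    "bert_network": "base",       # "base" for BERT-base, "nezha" for BERT-NEZHA
    "optimizer": "Lamb",          # one of: Lamb, Momentum, AdamWeightDecay
    "loss_scale_value": 65536,    # initial loss scale for mixed-precision training
    "scale_factor": 2,
    "scale_window": 1000,
})
```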
@@ -141,7 +141,7 @@ def run_pretrain():
     else:
         raise ValueError("Don't support optimizer {}, only support [Lamb, Momentum, AdamWeightDecay]".
                          format(cfg.optimizer))
-    callback = [TimeMonitor(args_opt.data_sink_steps), LossCallBack()]
+    callback = [TimeMonitor(args_opt.data_sink_steps), LossCallBack(ds.get_dataset_size())]
     if args_opt.enable_save_ckpt == "true" and args_opt.device_id % min(8, device_num) == 0:
         config_ck = CheckpointConfig(save_checkpoint_steps=args_opt.save_checkpoint_steps,
                                      keep_checkpoint_max=args_opt.save_checkpoint_num)
......
@@ -125,7 +125,7 @@ def distribute_pretrain():
     print("log_file_dir: " + cur_dir + "/LOG" + str(device_id) + "/log.txt")
     os.chdir(cur_dir + "/LOG" + str(device_id))
-    cmd = 'taskset -c ' + cmdopt + ' python ' + run_script + " "
+    cmd = 'taskset -c ' + cmdopt + ' nohup python ' + run_script + " "
     opt = " ".join(["--" + key + "=" + str(cfg[key]) for key in cfg.keys()])
    if ('device_id' in opt) or ('device_num' in opt) or ('data_dir' in opt):
        raise ValueError("hyper_parameter_config.ini can not setting 'device_id',"
......
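In the hunk above, the launch command now runs the training script under `nohup` and appends one `--key=value` option per entry parsed from `hyper_parameter_config.ini`. A standalone sketch of that string assembly, using made-up `cfg`, `cmdopt` and `run_script` values purely for illustration:

```python
# Standalone sketch; cfg, cmdopt and run_script are made-up example values,
# not the launcher's real inputs.
cfg = {"epoch_size": 40, "data_sink_steps": 100}  # parsed from hyper_parameter_config.ini
cmdopt = "0-11"                                   # CPU cores to bind with taskset
run_script = "./run_pretrain.py"

cmd = 'taskset -c ' + cmdopt + ' nohup python ' + run_script + " "
opt = " ".join(["--" + key + "=" + str(cfg[key]) for key in cfg.keys()])
if ('device_id' in opt) or ('device_num' in opt) or ('data_dir' in opt):
    # these three are filled in by the launcher itself, so the .ini must not set them
    raise ValueError("hyper_parameter_config.ini can not setting 'device_id', 'device_num' or 'data_dir'")
print(cmd + opt)
# -> taskset -c 0-11 nohup python ./run_pretrain.py --epoch_size=40 --data_sink_steps=100
```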
@@ -18,6 +18,7 @@ Functional Cells used in Bert finetune and evaluation.
 """
 import os
+import math
 import numpy as np
 import mindspore.nn as nn
 from mindspore import log as logger
@@ -90,15 +91,14 @@ class LossCallBack(Callback):
     Args:
         per_print_times (int): Print loss every times. Default: 1.
     """
-    def __init__(self, per_print_times=1):
+    def __init__(self, dataset_size=1):
         super(LossCallBack, self).__init__()
-        if not isinstance(per_print_times, int) or per_print_times < 0:
-            raise ValueError("print_step must be int and >= 0")
-        self._per_print_times = per_print_times
+        self._dataset_size = dataset_size
     def step_end(self, run_context):
         cb_params = run_context.original_args()
-        print("epoch: {}, step: {}, outputs are {}".format(cb_params.cur_epoch_num, cb_params.cur_step_num,
-                                                           str(cb_params.net_outputs)))
+        percent, epoch_num = math.modf(cb_params.cur_step_num / self._dataset_size)
+        print("epoch: {}, current epoch percent: {}, step: {}, outputs are {}"
+              .format(epoch_num, "%.3f" % percent, cb_params.cur_step_num, str(cb_params.net_outputs)))
 def LoadNewestCkpt(load_finetune_checkpoint_dir, steps_per_epoch, epoch_num, prefix):
     """
......
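The reworked `step_end` above uses `math.modf` to turn the global step counter into a completed-epoch count plus the fraction of the current epoch already done. A quick standalone illustration with made-up numbers:

```python
import math

# Made-up values: 2687 global steps completed, 1000 batches per epoch.
dataset_size = 1000
cur_step_num = 2687

# math.modf returns (fractional part, integral part), so `percent` is the
# progress within the current epoch and `epoch_num` the completed epochs (as a float).
percent, epoch_num = math.modf(cur_step_num / dataset_size)
print("epoch: {}, current epoch percent: {}, step: {}".format(epoch_num, "%.3f" % percent, cur_step_num))
# -> epoch: 2.0, current epoch percent: 0.687, step: 2687
```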
@@ -7,7 +7,7 @@ The number of D chips can be automatically allocated based on the device_num set
 ## how to use
 For example, if we want to run the distributed training of Bert model on D chip, we can in `/bert/` dir:
 ```
-python model_zoo/utils/ascend_distributed_launcher/run_distribute_pretrain.py --run_script_dir ./run_pretrain.py --hyper_parameter_config_dir model_zoo/utils/ascend_distributed_launcher/hyper_parameter_config.ini --data_dir /path/dataset/ --hccl_config_dir model_zoo/utils/hccl_tools/hccl_2p_56_x.x.x.x.json
+python model_zoo/utils/ascend_distributed_launcher/run_distributed.py --run_script_dir ./run_pretrain.py --hyper_parameter_config_dir model_zoo/utils/ascend_distributed_launcher/hyper_parameter_config.ini --data_dir /path/dataset/ --hccl_config_dir model_zoo/utils/hccl_tools/hccl_2p_56_x.x.x.x.json
 ```
 output:
......
@@ -124,7 +124,7 @@ def distribute_pretrain():
     print("data_dir:", data_dir)
     print("log_file_dir: ./LOG" + str(device_id) + "/log.txt")
-    cmd = 'taskset -c ' + cmdopt + ' python ' + run_script + " "
+    cmd = 'taskset -c ' + cmdopt + ' nohup python ' + run_script + " "
     opt = " ".join(["--" + key + "=" + str(cfg[key]) for key in cfg.keys()])
    if ('device_id' in opt) or ('device_num' in opt) or ('data_dir' in opt):
        raise ValueError("hyper_parameter_config.ini can not setting 'device_id',"
......