diff --git a/example/Bert_NEZHA_cnwiki/config.py b/example/Bert_NEZHA_cnwiki/config.py
deleted file mode 100644
index a704d9a2642412c49ed18bf67a6ca3a0cf0e1754..0000000000000000000000000000000000000000
--- a/example/Bert_NEZHA_cnwiki/config.py
+++ /dev/null
@@ -1,57 +0,0 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-
-"""
-network config setting, will be used in train.py
-"""
-
-from easydict import EasyDict as edict
-import mindspore.common.dtype as mstype
-from mindspore.model_zoo.Bert_NEZHA import BertConfig
-bert_train_cfg = edict({
-    'epoch_size': 10,
-    'num_warmup_steps': 0,
-    'start_learning_rate': 1e-4,
-    'end_learning_rate': 0.0,
-    'decay_steps': 1000,
-    'power': 10.0,
-    'save_checkpoint_steps': 2000,
-    'keep_checkpoint_max': 10,
-    'checkpoint_prefix': "checkpoint_bert",
-    # please add your own dataset path
-    'DATA_DIR': "/your/path/examples.tfrecord",
-    # please add your own dataset schema path
-    'SCHEMA_DIR': "/your/path/datasetSchema.json"
-})
-bert_net_cfg = BertConfig(
-    batch_size=16,
-    seq_length=128,
-    vocab_size=21136,
-    hidden_size=1024,
-    num_hidden_layers=24,
-    num_attention_heads=16,
-    intermediate_size=4096,
-    hidden_act="gelu",
-    hidden_dropout_prob=0.0,
-    attention_probs_dropout_prob=0.0,
-    max_position_embeddings=512,
-    type_vocab_size=2,
-    initializer_range=0.02,
-    use_relative_positions=True,
-    input_mask_from_dataset=True,
-    token_type_ids_from_dataset=True,
-    dtype=mstype.float32,
-    compute_type=mstype.float16,
-)
diff --git a/example/Bert_NEZHA_cnwiki/train.py b/example/Bert_NEZHA_cnwiki/train.py
deleted file mode 100644
index 2610542a9aaaab4fdc865f904cda09b808f96dc4..0000000000000000000000000000000000000000
--- a/example/Bert_NEZHA_cnwiki/train.py
+++ /dev/null
@@ -1,96 +0,0 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-
-"""
-NEZHA (NEural contextualiZed representation for CHinese lAnguage understanding) is the Chinese pretrained language
-model currently based on BERT developed by Huawei.
-1. Prepare data
-Following the data preparation as in BERT, run command as below to get dataset for training:
-    python ./create_pretraining_data.py \
-        --input_file=./sample_text.txt \
-        --output_file=./examples.tfrecord \
-        --vocab_file=./your/path/vocab.txt \
-        --do_lower_case=True \
-        --max_seq_length=128 \
-        --max_predictions_per_seq=20 \
-        --masked_lm_prob=0.15 \
-        --random_seed=12345 \
-        --dupe_factor=5
-2. Pretrain
-First, prepare the distributed training environment, then adjust configurations in config.py, finally run train.py.
-"""
-
-import os
-import numpy as np
-from config import bert_train_cfg, bert_net_cfg
-import mindspore.dataset.engine.datasets as de
-import mindspore.dataset.transforms.c_transforms as C
-from mindspore import context
-from mindspore.common.tensor import Tensor
-import mindspore.common.dtype as mstype
-from mindspore.train.model import Model
-from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor
-from mindspore.model_zoo.Bert_NEZHA import BertNetworkWithLoss, BertTrainOneStepCell
-from mindspore.nn.optim import Lamb
-_current_dir = os.path.dirname(os.path.realpath(__file__))
-
-def create_train_dataset(batch_size):
-    """create train dataset"""
-    # apply repeat operations
-    repeat_count = bert_train_cfg.epoch_size
-    ds = de.TFRecordDataset([bert_train_cfg.DATA_DIR], bert_train_cfg.SCHEMA_DIR,
-                            columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels",
-                                          "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"])
-    type_cast_op = C.TypeCast(mstype.int32)
-    ds = ds.map(input_columns="masked_lm_ids", operations=type_cast_op)
-    ds = ds.map(input_columns="masked_lm_positions", operations=type_cast_op)
-    ds = ds.map(input_columns="next_sentence_labels", operations=type_cast_op)
-    ds = ds.map(input_columns="segment_ids", operations=type_cast_op)
-    ds = ds.map(input_columns="input_mask", operations=type_cast_op)
-    ds = ds.map(input_columns="input_ids", operations=type_cast_op)
-    # apply batch operations
-    ds = ds.batch(batch_size, drop_remainder=True)
-    ds = ds.repeat(repeat_count)
-    return ds
-
-def weight_variable(shape):
-    """weight variable"""
-    np.random.seed(1)
-    ones = np.random.uniform(-0.1, 0.1, size=shape).astype(np.float32)
-    return Tensor(ones)
-
-def train_bert():
-    """train bert"""
-    context.set_context(mode=context.GRAPH_MODE)
-    context.set_context(device_target="Ascend")
-    context.set_context(enable_task_sink=True)
-    context.set_context(enable_loop_sink=True)
-    context.set_context(enable_mem_reuse=True)
-    ds = create_train_dataset(bert_net_cfg.batch_size)
-    netwithloss = BertNetworkWithLoss(bert_net_cfg, True)
-    optimizer = Lamb(netwithloss.trainable_params(), decay_steps=bert_train_cfg.decay_steps,
-                     start_learning_rate=bert_train_cfg.start_learning_rate,
-                     end_learning_rate=bert_train_cfg.end_learning_rate, power=bert_train_cfg.power,
-                     warmup_steps=bert_train_cfg.num_warmup_steps, decay_filter=lambda x: False)
-    netwithgrads = BertTrainOneStepCell(netwithloss, optimizer=optimizer)
-    netwithgrads.set_train(True)
-    model = Model(netwithgrads)
-    config_ck = CheckpointConfig(save_checkpoint_steps=bert_train_cfg.save_checkpoint_steps,
-                                 keep_checkpoint_max=bert_train_cfg.keep_checkpoint_max)
-    ckpoint_cb = ModelCheckpoint(prefix=bert_train_cfg.checkpoint_prefix, config=config_ck)
-    model.train(ds.get_repeat_count(), ds, callbacks=[LossMonitor(), ckpoint_cb], dataset_sink_mode=False)
-
-if __name__ == '__main__':
-    train_bert()
diff --git a/example/Bert_NEZHA_cnwiki/README.md b/example/bert_clue/README.md
similarity index 69%
rename from example/Bert_NEZHA_cnwiki/README.md
rename to example/bert_clue/README.md
index cd86b3bdd68dc9cd52881e0e3fd08bfd7ef44098..55b200e941af19d8baceccaac3dc37d65b4d84aa 100644
--- a/example/Bert_NEZHA_cnwiki/README.md
+++ b/example/bert_clue/README.md
@@ -4,20 +4,26 @@ This example implements pre-training, fine-tuning and evaluation of [BERT-base](
 ## Requirements
 - Install [MindSpore](https://www.mindspore.cn/install/en).
-- Download the zhwiki dataset from <https://dumps.wikimedia.org/zhwiki/> for pre-training. Extract and clean text in the dataset with [WikiExtractor](https://github.com/attardi/wiliextractor). Convert the dataset to TFRecord format and move the files to a specified path.
+- Download the zhwiki dataset from <https://dumps.wikimedia.org/zhwiki/> for pre-training. Extract and clean text in the dataset with [WikiExtractor](https://github.com/attardi/wikiextractor). Convert the dataset to TFRecord format and move the files to a specified path.
 - Download the CLUE dataset from <https://github.com/CLUEbenchmark/CLUE> for fine-tuning and evaluation.
 > Notes: If you are running a fine-tuning or evaluation task, prepare the corresponding checkpoint file.
 ## Running the Example
 ### Pre-Training
-- Set options in `config.py`. Make sure the 'DATA_DIR'(path to the dataset) and 'SCHEMA_DIR'(path to the json schema file) are set to your own path. Click [here](https://www.mindspore.cn/tutorial/zh-CN/master/use/data_preparation/loading_the_datasets.html#tfrecord) for more information about dataset and the json schema file.
+- Set options in `config.py`, including loss scale, optimizer and network. Click [here](https://www.mindspore.cn/tutorial/zh-CN/master/use/data_preparation/loading_the_datasets.html#tfrecord) for more information about the dataset and the JSON schema file.
 
-- Run `run_pretrain.py` for pre-training of BERT-base and BERT-NEZHA model.
+- Run `run_standalone_pretrain.sh` for non-distributed pre-training of BERT-base and BERT-NEZHA model.
 
-    ``` bash
-    python run_pretrain.py --backend=ms
+    ``` bash
+    sh run_standalone_pretrain.sh DEVICE_ID EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_PATH
     ```
+- Run `run_distribute_pretrain.sh` for distributed pre-training of BERT-base and BERT-NEZHA model.
+
+    ``` bash
+    sh run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_HCCL_CONFIG_PATH MINDSPORE_PATH
+    ```
 ### Fine-Tuning
 - Set options in `finetune_config.py`. Make sure the 'data_file', 'schema_file' and 'ckpt_file' are set to your own path, set the 'pre_training_ckpt' to save the checkpoint files generated.
@@ -40,30 +46,42 @@ This example implements pre-training, fine-tuning and evaluation of [BERT-base](
 ## Usage
 ### Pre-Training
 ```
-usage: run_pretrain.py [--backend BACKEND]
-
-optional parameters:
-    --backend, BACKEND      MindSpore backend: ms
+usage: run_pretrain.py  [--distribute DISTRIBUTE] [--epoch_size N] [--device_num N] [--device_id N]
+                        [--enable_task_sink ENABLE_TASK_SINK] [--enable_loop_sink ENABLE_LOOP_SINK]
+                        [--enable_mem_reuse ENABLE_MEM_REUSE] [--enable_save_ckpt ENABLE_SAVE_CKPT]
+                        [--enable_lossscale ENABLE_LOSSSCALE] [--do_shuffle DO_SHUFFLE]
+                        [--enable_data_sink ENABLE_DATA_SINK] [--data_sink_steps N] [--checkpoint_path CHECKPOINT_PATH]
+                        [--save_checkpoint_steps N] [--save_checkpoint_num N]
+                        [--data_dir DATA_DIR] [--schema_dir SCHEMA_DIR]
+
+options:
+    --distribute               pre_training by several devices: "true"(training by more than 1 device) | "false", default is "false"
+    --epoch_size               epoch size: N, default is 1
+    --device_num               number of used devices: N, default is 1
+    --device_id                device id: N, default is 0
+    --enable_task_sink         enable task sink: "true" | "false", default is "true"
+    --enable_loop_sink         enable loop sink: "true" | "false", default is "true"
+    --enable_mem_reuse         enable memory reuse: "true" | "false", default is "true"
+    --enable_save_ckpt         enable save checkpoint: "true" | "false", default is "true"
+    --enable_lossscale         enable lossscale: "true" | "false", default is "true"
+    --do_shuffle               enable shuffle: "true" | "false", default is "true"
+    --enable_data_sink         enable data sink: "true" | "false", default is "true"
+    --data_sink_steps          set data sink steps: N, default is 1
+    --checkpoint_path          path to save checkpoint files: PATH, default is ""
+    --save_checkpoint_steps    steps for saving checkpoint files: N, default is 1000
+    --save_checkpoint_num      number for saving checkpoint files: N, default is 1
+    --data_dir                 path to dataset directory: PATH, default is ""
+    --schema_dir               path to schema.json file, PATH, default is ""
 ```
-
 ## Options and Parameters
 It contains parameters of the BERT model and options for training, which are set in `config.py`, `finetune_config.py` and `evaluation_config.py` respectively.
 ### Options:
 ```
 Pre-Training:
     bert_network                   version of BERT model: base | large, default is base
-    epoch_size                     repeat counts of training: N, default is 40
-    dataset_sink_mode              use dataset sink mode or not: True | False, default is True
-    do_shuffle                     shuffle the dataset or not: True | False, default is True
-    do_train_with_lossscale        use lossscale or not: True | False, default is True
     loss_scale_value               initial value of loss scale: N, default is 2^32
     scale_factor                   factor used to update loss scale: N, default is 2
-    scale_window                   steps for once updatation of loss scale: N, default is 1000
-    save_checkpoint_steps          steps to save a checkpoint: N, default is 2000
-    keep_checkpoint_max            numbers to save checkpoint: N, default is 1
-    init_ckpt                      checkpoint file to load: PATH, default is ""
-    data_dir                       dataset file to load: PATH, default is "/your/path/cn-wiki-128"
-    schema_dir                     dataset schema file to load: PATH, default is "your/path/datasetSchema.json"
+    scale_window                   steps for one update of loss scale: N, default is 1000
     optimizer                      optimizer used in the network: AdamWeightDecayDynamicLR | Lamb | Momentum, default is "Lamb"
 Fine-Tuning:
diff --git a/example/bert_clue/config.py b/example/bert_clue/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d49121c50ae6838da6e02a7864e1e59e128be9f
--- /dev/null
+++ b/example/bert_clue/config.py
@@ -0,0 +1,89 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""
+network config setting, will be used in dataset.py, run_pretrain.py
+"""
+from easydict import EasyDict as edict
+import mindspore.common.dtype as mstype
+from mindspore.model_zoo.Bert_NEZHA import BertConfig
+cfg = edict({
+    'bert_network': 'base',
+    'loss_scale_value': 2**32,
+    'scale_factor': 2,
+    'scale_window': 1000,
+    'optimizer': 'Lamb',
+    'AdamWeightDecayDynamicLR': edict({
+        'learning_rate': 3e-5,
+        'end_learning_rate': 0.0,
+        'power': 5.0,
+        'weight_decay': 1e-5,
+        'eps': 1e-6,
+    }),
+    'Lamb': edict({
+        'start_learning_rate': 3e-5,
+        'end_learning_rate': 0.0,
+        'power': 10.0,
+        'warmup_steps': 10000,
+        'weight_decay': 0.01,
+        'eps': 1e-6,
+        'decay_filter': lambda x: False,
+    }),
+    'Momentum': edict({
+        'learning_rate': 2e-5,
+        'momentum': 0.9,
+    }),
+})
+if cfg.bert_network == 'base':
+    bert_net_cfg = BertConfig(
+        batch_size=16,
+        seq_length=128,
+        vocab_size=21136,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        type_vocab_size=2,
+        initializer_range=0.02,
+        use_relative_positions=False,
+        input_mask_from_dataset=True,
+        token_type_ids_from_dataset=True,
+        dtype=mstype.float32,
+        compute_type=mstype.float16,
+    )
+else:
+    bert_net_cfg = BertConfig(
+        batch_size=16,
+        seq_length=128,
+        vocab_size=21136,
+        hidden_size=1024,
+        num_hidden_layers=12,
+        num_attention_heads=16,
+        intermediate_size=4096,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        type_vocab_size=2,
+        initializer_range=0.02,
+        use_relative_positions=True,
+        input_mask_from_dataset=True,
+        token_type_ids_from_dataset=True,
+        dtype=mstype.float32,
+        compute_type=mstype.float16,
+    )
diff --git a/example/bert_clue/dataset.py b/example/bert_clue/dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..671f0dca0f08436974fbcd7871f9ea0d1fe7ed62
--- /dev/null
+++ b/example/bert_clue/dataset.py
@@ -0,0 +1,58 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""
+Data operations, will be used in run_pretrain.py
+"""
+import os
+import mindspore.common.dtype as mstype
+import mindspore.dataset.engine.datasets as de
+import mindspore.dataset.transforms.c_transforms as C
+from mindspore import log as logger
+from config import bert_net_cfg
+
+
+def create_bert_dataset(epoch_size=1, device_num=1, rank=0, do_shuffle="true", enable_data_sink="true",
+                        data_sink_steps=1, data_dir=None, schema_dir=None):
+    """create train dataset"""
+    # apply repeat operations
+    repeat_count = epoch_size
+    files = os.listdir(data_dir)
+    data_files = []
+    for file_name in files:
+        data_files.append(data_dir+file_name)
+    ds = de.TFRecordDataset(data_files, schema_dir,
+                            columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels",
+                                          "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"],
+                            shuffle=(do_shuffle == "true"), num_shards=device_num, shard_id=rank,
+                            shard_equal_rows=True)
+    ori_dataset_size = ds.get_dataset_size()
+    new_size = ori_dataset_size
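+    # When data sink is enabled, each "epoch" passed to model.train() covers only
+    # data_sink_steps batches, so the dataset size is overridden accordingly and the
+    # repeat count is rescaled to keep the total amount of training data unchanged.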
+    if enable_data_sink == "true":
+        new_size = data_sink_steps * bert_net_cfg.batch_size
+    ds.set_dataset_size(new_size)
+    repeat_count = int(repeat_count * ori_dataset_size // ds.get_dataset_size())
+    type_cast_op = C.TypeCast(mstype.int32)
+    ds = ds.map(input_columns="masked_lm_ids", operations=type_cast_op)
+    ds = ds.map(input_columns="masked_lm_positions", operations=type_cast_op)
+    ds = ds.map(input_columns="next_sentence_labels", operations=type_cast_op)
+    ds = ds.map(input_columns="segment_ids", operations=type_cast_op)
+    ds = ds.map(input_columns="input_mask", operations=type_cast_op)
+    ds = ds.map(input_columns="input_ids", operations=type_cast_op)
+    # apply batch operations
+    ds = ds.batch(bert_net_cfg.batch_size, drop_remainder=True)
+    ds = ds.repeat(repeat_count)
+    logger.info("data size: {}".format(ds.get_dataset_size()))
+    logger.info("repeat count: {}".format(ds.get_repeat_count()))
+    return ds
diff --git a/example/bert_clue/run_distribute_pretrain.sh b/example/bert_clue/run_distribute_pretrain.sh
new file mode 100644
index 0000000000000000000000000000000000000000..93d68d8e9d44d9aa27c7efc99e891e39d50008f1
--- /dev/null
+++ b/example/bert_clue/run_distribute_pretrain.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+echo "=============================================================================================================="
+echo "Please run the script as: "
+echo "sh run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_HCCL_CONFIG_PATH MINDSPORE_PATH"
+echo "for example: sh run_distribute_pretrain.sh 8 40 /path/zh-wiki/ /path/Schema.json /path/hccl.json /path/mindspore"
+echo "It is better to use absolute path."
+echo "=============================================================================================================="
+
+EPOCH_SIZE=$2
+DATA_DIR=$3
+SCHEMA_DIR=$4
+MINDSPORE_PATH=$6
+
+export PYTHONPATH=$MINDSPORE_PATH/build/package:$PYTHONPATH
+export MINDSPORE_HCCL_CONFIG_PATH=$5
+export RANK_SIZE=$1
+
+for((i=0;i<RANK_SIZE;i++))
+do
+    # bind each rank to its own slice of CPU cores
+    cores=`cat /proc/cpuinfo | grep "processor" | wc -l`
+    avg=`expr $cores / $RANK_SIZE`
+    gap=`expr $avg - 1`
+    start=`expr $i \* $avg`
+    end=`expr $start + $gap`
+    cmdopt=$start"-"$end
+    export DEVICE_ID=$i
+    export RANK_ID=$i
+    rm -rf LOG$i
+    mkdir ./LOG$i
+    cp *.py ./LOG$i
+    cd ./LOG$i
+    echo "start training for rank $i, device $DEVICE_ID"
+    env > env.log
+    taskset -c $cmdopt python ../run_pretrain.py \
+    --distribute="true" \
+    --epoch_size=$EPOCH_SIZE \
+    --device_id=$DEVICE_ID \
+    --device_num=$RANK_SIZE \
+    --enable_task_sink="true" \
+    --enable_loop_sink="true" \
+    --enable_mem_reuse="true" \
+    --enable_save_ckpt="true" \
+    --enable_lossscale="true" \
+    --do_shuffle="true" \
+    --enable_data_sink="true" \
+    --data_sink_steps=1 \
+    --checkpoint_path="" \
+    --save_checkpoint_steps=1000 \
+    --save_checkpoint_num=1 \
+    --data_dir=$DATA_DIR \
+    --schema_dir=$SCHEMA_DIR > log.txt 2>&1 &
+    cd ../
+done
diff --git a/example/bert_clue/run_pretrain.py b/example/bert_clue/run_pretrain.py
new file mode 100644
index 0000000000000000000000000000000000000000..25c78e08d82d8ef77a727b49ea86c40df2f2dd0b
--- /dev/null
+++ b/example/bert_clue/run_pretrain.py
@@ -0,0 +1,144 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""
+#################pre_train bert example on zh-wiki########################
+python run_pretrain.py
+"""
+
+import os
+import argparse
+import mindspore.communication.management as D
+from mindspore import context
+from mindspore.train.model import Model
+from mindspore.train.parallel_utils import ParallelMode
+from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell
+from mindspore.train.callback import Callback, ModelCheckpoint, CheckpointConfig
+from mindspore.train.serialization import load_checkpoint, load_param_into_net
+from mindspore.model_zoo.Bert_NEZHA import BertNetworkWithLoss, BertTrainOneStepCell, BertTrainOneStepWithLossScaleCell
+from mindspore.nn.optim import Lamb, Momentum, AdamWeightDecayDynamicLR
+from dataset import create_bert_dataset
+from config import cfg, bert_net_cfg
+_current_dir = os.path.dirname(os.path.realpath(__file__))
+
+class LossCallBack(Callback):
+    """
+    Monitor the loss in training.
+    If the loss is NAN or INF, terminate training.
+    Note:
+        If per_print_times is 0, do not print loss.
+    Args:
+        per_print_times (int): Print the loss every per_print_times steps. Default: 1.
+    """
+    def __init__(self, per_print_times=1):
+        super(LossCallBack, self).__init__()
+        if not isinstance(per_print_times, int) or per_print_times < 0:
+            raise ValueError("print_step must be int and >= 0")
+        self._per_print_times = per_print_times
+    def step_end(self, run_context):
+        cb_params = run_context.original_args()
+        with open("./loss.log", "a+") as f:
+            f.write("epoch: {}, step: {}, outputs are {}".format(cb_params.cur_epoch_num, cb_params.cur_step_num,
+                                                                 str(cb_params.net_outputs)))
+            f.write('\n')
+
+def run_pretrain():
+    """pre-train bert_clue"""
+    parser = argparse.ArgumentParser(description='bert pre_training')
+    parser.add_argument("--distribute", type=str, default="false", help="Run distribute, default is false.")
+    parser.add_argument("--epoch_size", type=int, default="1", help="Epoch size, default is 1.")
+    parser.add_argument("--device_id", type=int, default=0, help="Device id, default is 0.")
+    parser.add_argument("--device_num", type=int, default=1, help="Use device nums, default is 1.")
+    parser.add_argument("--enable_task_sink", type=str, default="true", help="Enable task sink, default is true.")
+    parser.add_argument("--enable_loop_sink", type=str, default="true", help="Enable loop sink, default is true.")
+    parser.add_argument("--enable_mem_reuse", type=str, default="true", help="Enable mem reuse, default is true.")
+    parser.add_argument("--enable_save_ckpt", type=str, default="true", help="Enable save checkpoint, default is true.")
+    parser.add_argument("--enable_lossscale", type=str, default="true", help="Use lossscale or not, default is true.")
+    parser.add_argument("--do_shuffle", type=str, default="true", help="Enable shuffle for dataset, default is true.")
+    parser.add_argument("--enable_data_sink", type=str, default="true", help="Enable data sink, default is true.")
+    parser.add_argument("--data_sink_steps", type=int, default="1", help="Sink steps for each epoch, default is 1.")
+    parser.add_argument("--checkpoint_path", type=str, default="", help="Checkpoint file path")
+    parser.add_argument("--save_checkpoint_steps", type=int, default=1000, help="Save checkpoint steps, "
+                                                                                "default is 1000.")
+    parser.add_argument("--save_checkpoint_num", type=int, default=1, help="Save checkpoint numbers, default is 1.")
+    parser.add_argument("--data_dir", type=str, default="", help="Data path, it is better to use absolute path")
+    parser.add_argument("--schema_dir", type=str, default="", help="Schema path, it is better to use absolute path")
+
+    args_opt = parser.parse_args()
+    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args_opt.device_id)
+    context.set_context(enable_task_sink=(args_opt.enable_task_sink == "true"),
+                        enable_loop_sink=(args_opt.enable_loop_sink == "true"),
+                        enable_mem_reuse=(args_opt.enable_mem_reuse == "true"))
+    context.set_context(reserve_class_name_in_scope=False)
+
+    if args_opt.distribute == "true":
+        device_num = args_opt.device_num
+        context.reset_auto_parallel_context()
+        context.set_context(enable_hccl=True)
+        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True,
+                                          device_num=device_num)
+        D.init()
+        rank = args_opt.device_id % device_num
+    else:
+        context.set_context(enable_hccl=False)
+        rank = 0
+        device_num = 1
+
+    ds = create_bert_dataset(args_opt.epoch_size, device_num, rank, args_opt.do_shuffle, args_opt.enable_data_sink,
+                             args_opt.data_sink_steps, args_opt.data_dir, args_opt.schema_dir)
+
+    netwithloss = BertNetworkWithLoss(bert_net_cfg, True)
+
+    # Choose the optimizer according to cfg.optimizer in config.py.
+    if cfg.optimizer == 'Lamb':
+        optimizer = Lamb(netwithloss.trainable_params(), decay_steps=ds.get_dataset_size() * ds.get_repeat_count(),
+                         start_learning_rate=cfg.Lamb.start_learning_rate, end_learning_rate=cfg.Lamb.end_learning_rate,
+                         power=cfg.Lamb.power, warmup_steps=cfg.Lamb.warmup_steps, weight_decay=cfg.Lamb.weight_decay,
+                         eps=cfg.Lamb.eps, decay_filter=cfg.Lamb.decay_filter)
+    elif cfg.optimizer == 'Momentum':
+        optimizer = Momentum(netwithloss.trainable_params(), learning_rate=cfg.Momentum.learning_rate,
+                             momentum=cfg.Momentum.momentum)
+    elif cfg.optimizer == 'AdamWeightDecayDynamicLR':
+        optimizer = AdamWeightDecayDynamicLR(netwithloss.trainable_params(),
+                                             decay_steps=ds.get_dataset_size() * ds.get_repeat_count(),
+                                             learning_rate=cfg.AdamWeightDecayDynamicLR.learning_rate,
+                                             end_learning_rate=cfg.AdamWeightDecayDynamicLR.end_learning_rate,
+                                             power=cfg.AdamWeightDecayDynamicLR.power,
+                                             weight_decay=cfg.AdamWeightDecayDynamicLR.weight_decay,
+                                             eps=cfg.AdamWeightDecayDynamicLR.eps)
+    else:
+        raise ValueError("Don't support optimizer {}, only support [Lamb, Momentum, AdamWeightDecayDynamicLR]".
+                         format(cfg.optimizer))
+    callback = [LossCallBack()]
+    if args_opt.enable_save_ckpt == "true":
+        config_ck = CheckpointConfig(save_checkpoint_steps=args_opt.save_checkpoint_steps,
+                                     keep_checkpoint_max=args_opt.save_checkpoint_num)
+        ckpoint_cb = ModelCheckpoint(prefix='checkpoint_bert', config=config_ck)
+        callback.append(ckpoint_cb)
+
+    if args_opt.checkpoint_path:
+        param_dict = load_checkpoint(args_opt.checkpoint_path)
+        load_param_into_net(netwithloss, param_dict)
+
+    if args_opt.enable_lossscale == "true":
+        update_cell = DynamicLossScaleUpdateCell(loss_scale_value=cfg.loss_scale_value,
+                                                 scale_factor=cfg.scale_factor,
+                                                 scale_window=cfg.scale_window)
+        netwithgrads = BertTrainOneStepWithLossScaleCell(netwithloss, optimizer=optimizer,
+                                                         scale_update_cell=update_cell)
+    else:
+        netwithgrads = BertTrainOneStepCell(netwithloss, optimizer=optimizer)
+
+    model = Model(netwithgrads)
+    model.train(ds.get_repeat_count(), ds, callbacks=callback, dataset_sink_mode=(args_opt.enable_data_sink == "true"))
+if __name__ == '__main__':
+    run_pretrain()
diff --git a/example/bert_clue/run_standalone_pretrain.sh b/example/bert_clue/run_standalone_pretrain.sh
new file mode 100644
index 0000000000000000000000000000000000000000..aeffca7b046d237027e096840f615497600c0953
--- /dev/null
+++ b/example/bert_clue/run_standalone_pretrain.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+echo "=============================================================================================================="
+echo "Please run the script as: "
+echo "sh run_standalone_pretrain.sh DEVICE_ID EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_PATH"
+echo "for example: sh run_standalone_pretrain.sh 0 40 /path/zh-wiki/ /path/Schema.json /path/mindspore"
+echo "=============================================================================================================="
+
+DEVICE_ID=$1
+EPOCH_SIZE=$2
+DATA_DIR=$3
+SCHEMA_DIR=$4
+MINDSPORE_PATH=$5
+export PYTHONPATH=$MINDSPORE_PATH/build/package:$PYTHONPATH
+
+python run_pretrain.py \
+    --distribute="false" \
+    --epoch_size=$EPOCH_SIZE \
+    --device_id=$DEVICE_ID \
+    --enable_task_sink="true" \
+    --enable_loop_sink="true" \
+    --enable_mem_reuse="true" \
+    --enable_save_ckpt="true" \
+    --enable_lossscale="true" \
+    --do_shuffle="true" \
+    --enable_data_sink="true" \
+    --data_sink_steps=1 \
+    --checkpoint_path="" \
+    --save_checkpoint_steps=1000 \
+    --save_checkpoint_num=1 \
+    --data_dir=$DATA_DIR \
+    --schema_dir=$SCHEMA_DIR > log.txt 2>&1 &
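+# Training runs in the background: stdout/stderr go to log.txt, and the LossCallBack
+# in run_pretrain.py appends the per-step loss to ./loss.log.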