From e00a7f38d241c1b2abb0c4b1420b86d49bcb7844 Mon Sep 17 00:00:00 2001
From: Zhong Hui
Date: Wed, 3 Feb 2021 22:34:08 +0800
Subject: [PATCH] Add checkpoint support for gpt2 model (#5257)

* fix checkpoint problems.
---
 .../examples/language_model/gpt2/README.md    |  2 +-
 .../language_model/gpt2/generate_sample.py    |  1 -
 .../language_model/gpt2/run_pretrain.py       | 42 +++++++++++++------
 .../language_model/gpt2/scripts/run.sh        |  4 +-
 .../language_model/gpt2/scripts/run_multi.sh  |  5 ++-
 .../paddlenlp/transformers/gpt2/tokenizer.py  | 26 +++++++++++-
 6 files changed, 63 insertions(+), 17 deletions(-)

diff --git a/PaddleNLP/examples/language_model/gpt2/README.md b/PaddleNLP/examples/language_model/gpt2/README.md
index dc63a531..1151d99a 100644
--- a/PaddleNLP/examples/language_model/gpt2/README.md
+++ b/PaddleNLP/examples/language_model/gpt2/README.md
@@ -23,7 +23,7 @@
 1. Install paddle

-   This project requires PaddlePaddle 2.0rc1 or later, or a suitable develop build; see the [installation guide](https://www.paddlepaddle.org.cn/install/quick) for setup instructions
+   This project requires PaddlePaddle 2.0 or later, or a suitable develop build; see the [installation guide](https://www.paddlepaddle.org.cn/install/quick) for setup instructions

 2. Download the code

diff --git a/PaddleNLP/examples/language_model/gpt2/generate_sample.py b/PaddleNLP/examples/language_model/gpt2/generate_sample.py
index dabfc302..9f897ec2 100644
--- a/PaddleNLP/examples/language_model/gpt2/generate_sample.py
+++ b/PaddleNLP/examples/language_model/gpt2/generate_sample.py
@@ -20,7 +20,6 @@ import argparse

 import numpy as np
 import paddle
-from paddlenlp.utils.tools import loadz
 from paddlenlp.transformers import GPT2Model, GPT2ForPretraining
 from paddlenlp.transformers import GPT2ChineseTokenizer, GPT2Tokenizer
 from paddlenlp.utils.log import logger

diff --git a/PaddleNLP/examples/language_model/gpt2/run_pretrain.py b/PaddleNLP/examples/language_model/gpt2/run_pretrain.py
index 33373a28..67972fe0 100644
--- a/PaddleNLP/examples/language_model/gpt2/run_pretrain.py
+++ b/PaddleNLP/examples/language_model/gpt2/run_pretrain.py
@@ -30,15 +30,18 @@ from paddlenlp.utils.log import logger
 from data import GPT2Dataset
 import lr

-MODEL_CLASSES = {
-    "gpt2-small-en": (GPT2ForPretraining, GPT2Tokenizer),
-    "gpt2-medium-en": (GPT2ForPretraining, GPT2Tokenizer),
-    "gpt2-large-en": (GPT2ForPretraining, GPT2Tokenizer),
-}
+MODEL_CLASSES = {"gpt2": (GPT2ForPretraining, GPT2Tokenizer)}


 def parse_args():
     parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--model_type",
+        default=None,
+        type=str,
+        required=True,
+        help="Model type selected in the list: " +
+        ", ".join(MODEL_CLASSES.keys()), )
     parser.add_argument(
         "--model_name_or_path",
         default=None,
@@ -190,15 +193,18 @@ def do_train(args):
     worker_num = paddle.distributed.get_world_size()
     set_seed(args)
     worker_init = WorkerInitObj(args.seed + paddle.distributed.get_rank())
-    model_class, tokenizer_class = MODEL_CLASSES[args.model_name_or_path]
+    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
     tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
     eod_id = tokenizer.command_name_map["eod"].Id

-    model = GPT2ForPretraining(
-        GPT2Model(**model_class.pretrained_init_configuration[
-            args.model_name_or_path]))
-    # creat the critrion for the gpt model
-    criterion = GPT2PretrainingCriterion()
+    pretrained_models_list = list(
+        model_class.pretrained_init_configuration.keys())
+    if args.model_name_or_path in pretrained_models_list:
+        model = GPT2ForPretraining(
+            GPT2Model(**model_class.pretrained_init_configuration[
+                args.model_name_or_path]))
+    else:
+        model = GPT2ForPretraining.from_pretrained(args.model_name_or_path)

     if args.decay_steps is None:
         args.decay_steps = args.max_steps
@@ -223,6 +229,13 @@ def do_train(args):
         p.name for n, p in model.named_parameters()
         if not any(nd in n for nd in ["bias", "norm"])
     ])
+    if args.model_name_or_path not in pretrained_models_list:
+        opt_dict = paddle.load(
+            os.path.join(args.model_name_or_path, "model_state.pdopt"))
+        optimizer.set_state_dict(opt_dict)
+
+    # create the criterion for the gpt model
+    criterion = GPT2PretrainingCriterion()

     global_step = 0
     tic_train = time.time()
@@ -259,7 +272,7 @@ def do_train(args):
             loss.backward()
             optimizer.step()
             lr_scheduler.step()
-            optimizer.clear_gradients()
+            optimizer.clear_grad()
             if global_step % args.save_steps == 0:
                 if worker_index == 0:
                     output_dir = os.path.join(args.output_dir,
@@ -270,9 +283,14 @@ def do_train(args):
                     model_to_save = model._layers if isinstance(
                         model, paddle.DataParallel) else model
                     model_to_save.save_pretrained(output_dir)
+                    tokenizer.save_pretrained(output_dir)
+                    paddle.save(
+                        optimizer.state_dict(),
+                        os.path.join(output_dir, "model_state.pdopt"))
             if global_step >= args.max_steps:
                 del train_data_loader
                 return
+    del train_data_loader
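Reviewer note: taken together, the run_pretrain.py hunks implement a simple resume protocol. If --model_name_or_path names a built-in configuration, training starts from randomly initialized weights; any other value is treated as a checkpoint directory, from which both the model weights (from_pretrained) and the optimizer state (model_state.pdopt) are restored. Saving is the mirror image: every save_steps steps, worker 0 writes weights, tokenizer resources, and optimizer state into a step directory. The sketch below condenses that lifecycle; the helper names (build_model, restore_optimizer, save_checkpoint) are illustrative and not part of the patch, while the paddle/paddlenlp calls are the ones the patch itself uses.

    import os

    import paddle
    from paddlenlp.transformers import GPT2ForPretraining, GPT2Model

    def build_model(model_name_or_path):
        configs = GPT2ForPretraining.pretrained_init_configuration
        if model_name_or_path in configs:
            # A known config name: pretrain from randomly initialized weights.
            return GPT2ForPretraining(GPT2Model(**configs[model_name_or_path]))
        # Otherwise assume a checkpoint directory from a previous run.
        return GPT2ForPretraining.from_pretrained(model_name_or_path)

    def restore_optimizer(optimizer, checkpoint_dir):
        # Optimizer state is stored separately from the model weights.
        opt_state = paddle.load(
            os.path.join(checkpoint_dir, "model_state.pdopt"))
        optimizer.set_state_dict(opt_state)

    def save_checkpoint(model, tokenizer, optimizer, output_dir):
        os.makedirs(output_dir, exist_ok=True)
        model.save_pretrained(output_dir)      # weights + model config
        tokenizer.save_pretrained(output_dir)  # resource files, via save_resources
        paddle.save(optimizer.state_dict(),
                    os.path.join(output_dir, "model_state.pdopt"))

Restoring the optimizer state matters for training dynamics: without it, AdamW's moment estimates would restart cold even though the weights resumed.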
diff --git a/PaddleNLP/examples/language_model/gpt2/scripts/run.sh b/PaddleNLP/examples/language_model/gpt2/scripts/run.sh
index 74ea8b1f..55e9492e 100644
--- a/PaddleNLP/examples/language_model/gpt2/scripts/run.sh
+++ b/PaddleNLP/examples/language_model/gpt2/scripts/run.sh
@@ -1,5 +1,7 @@
 export CUDA_VISIBLE_DEVICES=0
-python run_pretrain.py --model_name_or_path gpt2-small-en --input_dir "./data"\
+python run_pretrain.py --model_type gpt2\
+    --model_name_or_path gpt2-small-en\
+    --input_dir "./data"\
     --output_dir "output"\
     --max_lr 0.00015\
     --min_lr 0.00001\

diff --git a/PaddleNLP/examples/language_model/gpt2/scripts/run_multi.sh b/PaddleNLP/examples/language_model/gpt2/scripts/run_multi.sh
index bfd2cde7..e4136040 100644
--- a/PaddleNLP/examples/language_model/gpt2/scripts/run_multi.sh
+++ b/PaddleNLP/examples/language_model/gpt2/scripts/run_multi.sh
@@ -1,5 +1,8 @@
 unset CUDA_VISIBLE_DEVICES
-python -m paddle.distributed.launch --gpus "0,1" run_pretrain.py --model_name_or_path gpt2-small-en --input_dir "./data"\
+python -m paddle.distributed.launch --gpus "0,1" run_pretrain.py \
+    --model_type gpt2\
+    --model_name_or_path gpt2-small-en\
+    --input_dir "./data"\
     --output_dir "output"\
     --max_lr 0.00015\
     --min_lr 0.00001\
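Reviewer note: with the new required --model_type flag, the scripts pin the model family while --model_name_or_path stays free to point at either a built-in name (as above) or a checkpoint directory saved by a previous run. A hypothetical resume invocation, written in Python for consistency with the other sketches (the step directory name is illustrative; editing the flags in run.sh achieves the same thing):

    import subprocess

    # Resume pretraining from a previously saved checkpoint directory
    # instead of the built-in gpt2-small-en configuration.
    subprocess.run(
        [
            "python", "run_pretrain.py",
            "--model_type", "gpt2",
            "--model_name_or_path", "output/step_20000",  # hypothetical dir
            "--input_dir", "./data",
            "--output_dir", "output",
        ],
        check=True)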
+ """ + for name, file_name in self.resource_files_names.items(): + save_path = os.path.join(save_directory, file_name) + shutil.copyfile(getattr(self, "_%s" % name), save_path) + class GPT2Tokenizer(PretrainedTokenizer): resource_files_names = { @@ -192,6 +204,8 @@ class GPT2Tokenizer(PretrainedTokenizer): special_tokens=None, max_len=None, do_lower_case=True): + self._vocab_file = vocab_file + self._merges_file = merges_file self.max_len = int(1e12) self.num_command_tokens = 2 self.num_type_tokens = 2 @@ -346,3 +360,13 @@ class GPT2Tokenizer(PretrainedTokenizer): text = bytearray([self.byte_decoder[c] for c in text]).decode( 'utf-8', errors=self.errors) return text + + def save_resources(self, save_directory): + """ + Save tokenizer related resources to files under `save_directory`. + Args: + save_directory (str): Directory to save files into. + """ + for name, file_name in self.resource_files_names.items(): + save_path = os.path.join(save_directory, file_name) + shutil.copyfile(getattr(self, "_%s" % name), save_path) -- GitLab