# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT fine-tuning in Paddle Dygraph Mode."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import six
import sys

if six.PY2:
    reload(sys)
    sys.setdefaultencoding('utf8')

import ast
import time
import argparse
import numpy as np
import multiprocessing

import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import to_variable, Layer

from .reader.cls import *
from .model.bert import BertConfig
from .model.cls import ClsModelLayer
from .optimization import Optimizer
from .utils.init import init_from_static_model

__all__ = ["BERTClassifier"]


def create_data(batch):
    """Convert a reader batch into dygraph variables."""
    src_ids = to_variable(batch[0], "src_ids")
    position_ids = to_variable(batch[1], "position_ids")
    sentence_ids = to_variable(batch[2], "sentence_ids")
    input_mask = to_variable(batch[3], "input_mask")
    labels = to_variable(batch[4], "labels")
    labels.stop_gradient = True
    return src_ids, position_ids, sentence_ids, input_mask, labels


class BERTClassifier(Layer):
    """BERT-based sentence classifier running in Paddle dygraph mode."""

    def __init__(self,
                 num_labels,
                 task_name="mnli",
                 model_path=None,
                 use_cuda=True,
                 return_pooled_out=True):
        super(BERTClassifier, self).__init__()
        self.task_name = task_name.lower()
        BERT_BASE_PATH = "./data/pretrained_models/uncased_L-12_H-768_A-12/"
        bert_config_path = BERT_BASE_PATH + "/bert_config.json"
        self.vocab_path = BERT_BASE_PATH + "/vocab.txt"
        self.init_pretraining_params = BERT_BASE_PATH + "/dygraph_params/"
        self.do_lower_case = True
        self.bert_config = BertConfig(bert_config_path)

        if use_cuda:
            self.dev_count = fluid.core.get_cuda_device_count()
        else:
            self.dev_count = int(
                os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

        self.trainer_count = fluid.dygraph.parallel.Env().nranks

        self.processors = {
            'xnli': XnliProcessor,
            'cola': ColaProcessor,
            'mrpc': MrpcProcessor,
            'mnli': MnliProcessor,
        }

        self.cls_model = ClsModelLayer(
            self.bert_config, num_labels, return_pooled_out=return_pooled_out)

        if model_path is not None:
            # Restore parameters from a previously saved dygraph checkpoint.
            print("Load params from %s" % model_path)
            model_dict, _ = fluid.load_dygraph(model_path)
            self.cls_model.load_dict(model_dict)
        elif self.init_pretraining_params:
            print("Load pre-trained model from %s" %
                  self.init_pretraining_params)
            init_from_static_model(self.init_pretraining_params,
                                   self.cls_model, self.bert_config)
        else:
            raise Exception(
                "You should load a pretrained model to train this teacher model."
            )

    def emb_names(self):
        return self.cls_model.emb_names()

    def forward(self, input):
        return self.cls_model(input)

    def test(self, data_dir, batch_size=64, max_seq_len=512):
        """Evaluate the classifier on the dev split of the task."""
        processor = self.processors[self.task_name](
            data_dir=data_dir,
            vocab_path=self.vocab_path,
            max_seq_len=max_seq_len,
            do_lower_case=self.do_lower_case,
            in_tokens=False)

        test_data_generator = processor.data_generator(
            batch_size=batch_size, phase='dev', epoch=1, shuffle=False)

        self.cls_model.eval()
        total_cost, final_acc, avg_acc, total_num_seqs = [], [], [], []
        for batch in test_data_generator():
            data_ids = create_data(batch)
            total_loss, _, _, np_accs, np_num_seqs = self.cls_model(data_ids)
            np_loss = total_loss.numpy()
            np_acc = np_accs[-1].numpy()
            np_avg_acc = np.mean([acc.numpy() for acc in np_accs])
            np_num_seqs = np_num_seqs.numpy()
            total_cost.extend(np_loss * np_num_seqs)
            final_acc.extend(np_acc * np_num_seqs)
            avg_acc.extend(np_avg_acc * np_num_seqs)
            total_num_seqs.extend(np_num_seqs)

        print("[evaluation] classifier[-1] average acc: %f; average acc: %f" %
              (np.sum(final_acc) / np.sum(total_num_seqs),
               np.sum(avg_acc) / np.sum(total_num_seqs)))
        self.cls_model.train()

    def fit(self,
            data_dir,
            epoch,
            batch_size=64,
            use_cuda=True,
            max_seq_len=512,
            warmup_proportion=0.1,
            use_data_parallel=False,
            learning_rate=0.00005,
            weight_decay=0.01,
            lr_scheduler="linear_warmup_decay",
            skip_steps=10,
            save_steps=1000,
            checkpoints="checkpoints"):
        """Fine-tune the classifier on the train split of the task."""
        processor = self.processors[self.task_name](
            data_dir=data_dir,
            vocab_path=self.vocab_path,
            max_seq_len=max_seq_len,
            do_lower_case=self.do_lower_case,
            in_tokens=False,
            random_seed=5512)

        shuffle_seed = 1 if self.trainer_count > 1 else None
        train_data_generator = processor.data_generator(
            batch_size=batch_size,
            phase='train',
            epoch=epoch,
            dev_count=self.trainer_count,
            shuffle=True,
            shuffle_seed=shuffle_seed)

        num_train_examples = processor.get_num_examples(phase='train')
        max_train_steps = epoch * num_train_examples // batch_size // self.trainer_count
        warmup_steps = int(max_train_steps * warmup_proportion)

        print("Device count: %d" % self.dev_count)
        print("Trainer count: %d" % self.trainer_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        if use_data_parallel:
            strategy = fluid.dygraph.parallel.prepare_context()

        optimizer = Optimizer(
            warmup_steps=warmup_steps,
            num_train_steps=max_train_steps,
            learning_rate=learning_rate,
            model_cls=self.cls_model,
            weight_decay=weight_decay,
            scheduler=lr_scheduler,
            loss_scaling=1.0,
            parameter_list=self.cls_model.parameters())

        if use_data_parallel:
            self.cls_model = fluid.dygraph.parallel.DataParallel(
                self.cls_model, strategy)
            train_data_generator = fluid.contrib.reader.distributed_batch_reader(
                train_data_generator)

        steps = 0
        time_begin = time.time()
        for batch in train_data_generator():
            data_ids = create_data(batch)
            total_loss, logits, losses, accuracies, num_seqs = self.cls_model(
                data_ids)

            optimizer.optimization(
                total_loss,
                use_data_parallel=use_data_parallel,
                model=self.cls_model)
            self.cls_model.clear_gradients()

            if steps != 0 and steps % skip_steps == 0:
                time_end = time.time()
                used_time = time_end - time_begin
                current_example, current_epoch = processor.get_train_progress()
                localtime = time.asctime(time.localtime(time.time()))
                print(
                    "%s, epoch: %s, steps: %s, dy_graph loss: %f, acc: %f, speed: %f steps/s"
                    % (localtime, current_epoch, steps, total_loss.numpy(),
                       accuracies[-1].numpy(), skip_steps / used_time))
                time_begin = time.time()

            if (steps != 0 and steps % save_steps == 0 and
                    fluid.dygraph.parallel.Env().local_rank == 0):
                self.test(data_dir, batch_size=64, max_seq_len=512)
                save_path = os.path.join(checkpoints,
                                         "steps" + "_" + str(steps))
                fluid.save_dygraph(self.cls_model.state_dict(), save_path)
                fluid.save_dygraph(optimizer.optimizer.state_dict(), save_path)
                print("Save model parameters and optimizer status at %s" %
                      save_path)

            steps += 1

        if fluid.dygraph.parallel.Env().local_rank == 0:
            save_path = os.path.join(checkpoints, "final")
            fluid.save_dygraph(self.cls_model.state_dict(), save_path)
            fluid.save_dygraph(optimizer.optimizer.state_dict(), save_path)
            print("Save model parameters and optimizer status at %s" %
                  save_path)
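

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the original module).
# The data directory, device place, label count, and hyperparameters below are
# assumptions: GLUE-style MNLI data is assumed to live under ./data/glue/MNLI,
# and MNLI is treated as a 3-way classification task. Because this file uses
# relative imports, run it as a module, e.g. `python -m <package>.<module>`.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() \
        else fluid.CPUPlace()
    with fluid.dygraph.guard(place):
        # Build the teacher classifier from the bundled pretrained BERT params.
        classifier = BERTClassifier(num_labels=3, task_name="mnli")
        # Fine-tune, then evaluate on the dev split.
        classifier.fit(
            data_dir="./data/glue/MNLI",  # hypothetical data location
            epoch=3,
            batch_size=32,
            max_seq_len=128,
            learning_rate=5e-5,
            checkpoints="./checkpoints")
        classifier.test(
            data_dir="./data/glue/MNLI", batch_size=32, max_seq_len=128)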